- Wrap pd.to_datetime() in try-except in transformer.py to handle non-standard strings like '*M 199901 Interpolate' that pandas 3.x raises on even with errors="coerce" - Fix 'int object has no attribute lower' in analyzer.py by using str(col).lower() for numeric column names from Excel files Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
51 lines
1.8 KiB
Python
51 lines
1.8 KiB
Python
"""Data summarization for AI prompt context."""
|
|
|
|
from __future__ import annotations
|
|
import pandas as pd
|
|
|
|
|
|
def summarize_data(sheets: dict[str, pd.DataFrame], max_tokens: int = 1500) -> str:
    """Produce a concise text summary of loaded data for the AI prompt.

    For every sheet the summary gives its shape and the names of its first
    20 columns. Each of those columns then contributes at most one line:
    a date range for date columns (any datetime64 dtype, or a column named
    ``date``/``dates``/``time``/``period``, case-insensitive), or a min/max
    line for numeric columns.

    Args:
        sheets: Mapping of sheet name to the DataFrame loaded from it.
        max_tokens: Approximate token budget. The text is cut after
            ``max_tokens * 4`` characters and a truncation marker is
            appended; negative values are treated as zero.

    Returns:
        The summary text, beginning with ``"DATA SUMMARY:"``.
    """
    max_listed_cols = 20
    date_like_names = ("date", "dates", "time", "period")

    parts = []

    for sheet_name, df in sheets.items():
        cols = df.columns.tolist()
        shown = cols[:max_listed_cols]

        part = f"Sheet '{sheet_name}': {len(df)} rows x {len(cols)} columns\n"
        part += f" Columns: {', '.join(str(c) for c in shown)}"
        if len(cols) > max_listed_cols:
            part += f" ... (+{len(cols) - max_listed_cols} more)"
        part += "\n"

        for col in shown:
            # Identify date columns
            try:
                # is_datetime64_any_dtype also matches timezone-aware and
                # non-nanosecond datetime columns, which an exact comparison
                # against "datetime64[ns]" silently misses. Column labels may
                # be non-strings (e.g. ints from Excel), hence str(col).
                is_date_col = (
                    pd.api.types.is_datetime64_any_dtype(df[col])
                    or str(col).lower() in date_like_names
                )
                if is_date_col:
                    dates = pd.to_datetime(df[col], errors="coerce").dropna()
                    if not dates.empty:
                        first = dates.min().strftime("%Y-%m-%d")
                        last = dates.max().strftime("%Y-%m-%d")
                        part += f" Date range ({col}): {first} to {last}\n"
                    continue
            except Exception:
                # Deliberate best-effort: a column that cannot be parsed as
                # dates must not abort the whole summary. Fall through to the
                # numeric check instead.
                pass

            # Numeric summary
            if pd.api.types.is_numeric_dtype(df[col]):
                valid = df[col].dropna()
                if not valid.empty:
                    part += f" {col}: min={valid.min():.4g}, max={valid.max():.4g}\n"

        parts.append(part)

    summary = "DATA SUMMARY:\n" + "\n".join(parts)

    # Truncate if too long (rough estimate: ~4 chars per token). Clamp at zero
    # so a negative budget cannot turn the slice into "drop from the end".
    max_chars = max(max_tokens, 0) * 4
    if len(summary) > max_chars:
        summary = summary[:max_chars] + "\n... (truncated)"

    return summary
|