pimco-charts/app/data/analyzer.py
Vadym Samoilenko 1b36609e5d Fix datetime parsing error and int column name crash
- Wrap pd.to_datetime() in try-except in transformer.py to handle
  non-standard strings like '*M 199901 Interpolate' that pandas 3.x
  raises on even with errors="coerce"
- Fix 'int object has no attribute lower' in analyzer.py by using
  str(col).lower() for numeric column names from Excel files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-19 21:57:26 +00:00

51 lines
1.8 KiB
Python

"""Data summarization for AI prompt context."""
from __future__ import annotations
import pandas as pd
# Column names (compared case-insensitively) that are treated as date-like
# even when the column's dtype is not a datetime dtype.
_DATE_NAME_HINTS = frozenset({"date", "dates", "time", "period"})
# Only the first N columns of each sheet are listed and summarized.
_MAX_COLUMNS_SHOWN = 20
# Rough estimate used to turn the token budget into a character budget.
_CHARS_PER_TOKEN = 4


def _describe_column(label: object, series: pd.Series) -> str:
    """Return the one-line summary for a single column, or "" if none applies.

    A date range is reported for datetime columns and for non-numeric columns
    whose name is in ``_DATE_NAME_HINTS``; a min/max line is reported for
    numeric columns. Every returned line ends with a newline.
    """
    # Covers tz-aware and non-nanosecond datetime dtypes, which an exact
    # comparison against "datetime64[ns]" would miss.
    is_datetime = pd.api.types.is_datetime64_any_dtype(series)
    is_numeric = pd.api.types.is_numeric_dtype(series)

    # The name hint is ignored for numeric columns: pd.to_datetime would read
    # the numbers as nanoseconds since the epoch and report a bogus 1970 range.
    name_suggests_date = not is_numeric and str(label).lower() in _DATE_NAME_HINTS

    if is_datetime or name_suggests_date:
        try:
            parsed = series if is_datetime else pd.to_datetime(series, errors="coerce")
            dates = parsed.dropna()
            if not dates.empty:
                first = dates.min().strftime("%Y-%m-%d")
                last = dates.max().strftime("%Y-%m-%d")
                return f" Date range ({label}): {first} to {last}\n"
        except (ValueError, TypeError, OverflowError):
            # Best effort: the date range is optional context, so a column
            # that cannot be parsed simply gets no date line.
            pass

    if is_numeric:
        valid = series.dropna()
        if not valid.empty:
            return f" {label}: min={valid.min():.4g}, max={valid.max():.4g}\n"

    return ""


def summarize_data(sheets: dict[str, pd.DataFrame], max_tokens: int = 1500) -> str:
    """Produce a concise text summary of loaded data for the AI prompt.

    For every sheet the summary lists its size, the names of its first
    ``_MAX_COLUMNS_SHOWN`` columns, and, for each of those columns, either a
    date range or a numeric min/max where one applies.

    Args:
        sheets: Mapping of sheet name to the DataFrame loaded from it.
        max_tokens: Approximate token budget. The summary is cut to
            ``max_tokens * 4`` characters, after which a truncation marker
            is appended. Negative values are treated as 0.

    Returns:
        The summary text, starting with ``"DATA SUMMARY:"``.
    """
    parts = []
    for sheet_name, df in sheets.items():
        cols = df.columns.tolist()
        shown = cols[:_MAX_COLUMNS_SHOWN]
        hidden = len(cols) - len(shown)

        columns_line = f" Columns: {', '.join(str(c) for c in shown)}"
        if hidden > 0:
            columns_line += f" ... (+{hidden} more)"

        lines = [
            f"Sheet '{sheet_name}': {len(df)} rows x {len(cols)} columns\n",
            columns_line + "\n",
        ]
        # Select by position so duplicate column labels still yield a Series
        # (label-based selection would return a DataFrame for them).
        lines.extend(
            _describe_column(label, df.iloc[:, position])
            for position, label in enumerate(shown)
        )
        parts.append("".join(lines))

    summary = "DATA SUMMARY:\n" + "\n".join(parts)

    # Truncate if too long; clamp so a negative budget cannot turn the slice
    # bound negative and trim from the end instead.
    max_chars = max(max_tokens, 0) * _CHARS_PER_TOKEN
    if len(summary) > max_chars:
        summary = summary[:max_chars] + "\n... (truncated)"
    return summary