# pimco-charts/app/data/analyzer.py

"""Data summarization for AI prompt context."""

from __future__ import annotations
import pandas as pd


# Only the first N columns of each sheet are listed and summarized.
_MAX_SUMMARY_COLUMNS = 20
# Column names (compared case-insensitively) that are treated as date columns.
_DATE_COLUMN_NAMES = frozenset({"date", "dates", "time", "period"})
# Rough estimate used to convert the token budget into a character budget.
_CHARS_PER_TOKEN = 4


def _describe_column(col: object, series: pd.Series) -> str | None:
    """Return a one-line summary for a column, or None if nothing is reported.

    Datetime columns (any resolution, timezone-aware or naive) and non-numeric
    columns whose name is in ``_DATE_COLUMN_NAMES`` yield a date-range line.
    Numeric columns yield a min/max line. Everything else yields None.
    """
    is_numeric = pd.api.types.is_numeric_dtype(series)
    is_datetime = pd.api.types.is_datetime64_any_dtype(series)
    named_like_date = str(col).lower() in _DATE_COLUMN_NAMES

    # A numeric column that merely has a date-like name is NOT parsed as dates:
    # pd.to_datetime would read the numbers as nanoseconds since the epoch and
    # report a meaningless 1970 range. It gets the numeric summary instead.
    if is_datetime or (named_like_date and not is_numeric):
        try:
            dates = pd.to_datetime(series, errors="coerce").dropna()
            if dates.empty:
                return None
            first = dates.min().strftime("%Y-%m-%d")
            last = dates.max().strftime("%Y-%m-%d")
        except (ValueError, TypeError, OverflowError):
            # Best effort: a column that cannot be parsed or formatted is
            # left out of the summary rather than aborting the whole summary.
            return None
        return f"  Date range ({col}): {first} to {last}"

    if is_numeric:
        valid = series.dropna()
        if not valid.empty:
            return f"  {col}: min={valid.min():.4g}, max={valid.max():.4g}"

    return None


def summarize_data(sheets: dict[str, pd.DataFrame], max_tokens: int = 1500) -> str:
    """Produce a concise text summary of loaded data for the AI prompt.

    For every sheet the summary lists its shape and its first 20 column
    names, followed by one line per summarized column among those 20:
    a date range for date columns and a min/max for numeric columns.

    Args:
        sheets: Mapping of sheet name to the DataFrame loaded from it.
        max_tokens: Approximate token budget. The text is cut at
            ``max_tokens * 4`` characters and a truncation marker is appended.
            Negative values are treated as zero.

    Returns:
        The summary text, starting with ``"DATA SUMMARY:"``.
    """
    parts = []

    for sheet_name, df in sheets.items():
        cols = df.columns.tolist()
        shown = cols[:_MAX_SUMMARY_COLUMNS]

        columns_line = f"  Columns: {', '.join(str(c) for c in shown)}"
        if len(cols) > len(shown):
            columns_line += f" ... (+{len(cols) - len(shown)} more)"

        lines = [
            f"Sheet '{sheet_name}': {len(df)} rows x {len(cols)} columns",
            columns_line,
        ]

        # Access columns by position: df[col] returns a DataFrame when labels
        # are duplicated, whereas iloc always yields a single Series.
        for position, col in enumerate(shown):
            line = _describe_column(col, df.iloc[:, position])
            if line is not None:
                lines.append(line)

        parts.append("\n".join(lines) + "\n")

    summary = "DATA SUMMARY:\n" + "\n".join(parts)

    # Truncate if too long (rough estimate: ~4 chars per token).
    max_chars = max(max_tokens, 0) * _CHARS_PER_TOKEN
    if len(summary) > max_chars:
        summary = summary[:max_chars] + "\n... (truncated)"

    return summary