# pimco-charts/app/data/transformer.py

"""Data transformation: date parsing, column selection, resampling."""

from __future__ import annotations
import pandas as pd


def prepare_dataframe(df: pd.DataFrame, date_column: str | None = None) -> pd.DataFrame:
    """Prepare a DataFrame for charting: parse dates, ensure numeric columns.

    The input is never modified; all work happens on a copy.

    Date handling: the date column (given or auto-detected) is parsed with
    ``pd.to_datetime``; rows whose date cannot be parsed are dropped and the
    frame is sorted by date with a fresh 0..n-1 index. If *no* value in the
    column parses as a date, the column is not treated as a date column at
    all, so a bad guess cannot discard every row.

    Numeric handling: every other string/object column is converted with
    ``pd.to_numeric(errors="coerce")``. Percentage strings such as
    ``"-0.42%"`` or ``"1,200%"`` lose the ``%`` sign and thousands commas
    first (the value is not divided by 100). Columns in which nothing
    converts to a number (labels, categories) are left untouched.

    Args:
        df: Raw DataFrame
        date_column: Optional name of the date column. Auto-detected if None.

    Returns:
        Cleaned DataFrame with parsed dates and numeric columns.
    """
    df = df.copy()

    # Auto-detect date column
    if date_column is None:
        date_column = _detect_date_column(df)

    # Compare against None rather than truthiness so a column labelled 0
    # (integer headers) is still honoured.
    if date_column is not None and date_column in df.columns:
        raw_dates = df[date_column]
        try:
            parsed_dates = pd.to_datetime(raw_dates, errors="coerce")
        except (TypeError, ValueError, OverflowError):
            # Not parseable as dates at all: treat it as an ordinary column.
            date_column = None
        else:
            if raw_dates.notna().any() and not parsed_dates.notna().any():
                # Nothing parsed; dropping NaT rows would empty the frame,
                # so fall back to treating this as an ordinary column.
                date_column = None
            else:
                df[date_column] = parsed_dates
                # Drop rows where date is NaT, then sort chronologically
                df = df.dropna(subset=[date_column])
                df = df.sort_values(date_column).reset_index(drop=True)

    # Convert numeric-looking columns
    for col in df.columns:
        if col == date_column:
            continue
        column = df[col]
        # Handle object and string dtypes (pandas 2.x may use StringDtype)
        if pd.api.types.is_string_dtype(column) or column.dtype == object:
            df[col] = _coerce_numeric_column(column)

    return df


def _coerce_numeric_column(series: pd.Series) -> pd.Series:
    """Return *series* converted to numbers, or unchanged if it holds no numbers."""
    try:
        candidate = series
        # Check for percentage strings like "-0.42%" before calling
        # to_numeric, which would otherwise coerce them to NaN.
        sample = series.dropna().head(20).astype(str)
        pct_count = sample.str.match(r'^\s*-?\d[\d,.]*\s*%\s*$').sum()
        if pct_count > len(sample) * 0.5:
            # Strip the % sign and thousands separators; the value stays
            # as written (e.g. "-0.42%" -> -0.42).
            candidate = (
                series.astype(str)
                .str.replace(r'\s*%\s*$', '', regex=True)
                .str.replace(',', '', regex=False)
                .str.strip()
            )
        converted = pd.to_numeric(candidate, errors="coerce")
    except (TypeError, ValueError, OverflowError):
        # Best effort: values pandas cannot handle leave the column as-is.
        return series

    if series.notna().any() and not converted.notna().any():
        # Pure text column: coercing would wipe every value to NaN.
        return series
    return converted


# Lower-cased column names that are treated as the date column outright.
_DATE_EXACT_NAMES = {
    "date", "dates", "time", "timestamp", "period", "month", "quarter",
    "year", "as of", "as_of", "report_date", "reporting_date",
    "fiscal_year", "observation_date", "obs_date",
}


def _detect_date_column(df: pd.DataFrame) -> str | None:
    """Auto-detect the date column in a DataFrame.

    Heuristics are tried in order and the first hit wins:

    1. a column whose lower-cased name is in ``_DATE_EXACT_NAMES``;
    2. a column whose name contains "date" or "time";
    3. a column that already has a datetime dtype (any resolution,
       timezone-aware or naive);
    4. the first column, if it is non-numeric and at least 5 of its first
       10 values parse as dates.

    Returns:
        The matching column label, or None when nothing looks like a date
        (including when the frame has no columns).
    """
    columns = list(df.columns)
    if not columns:
        # Nothing to inspect; indexing the first column would raise.
        return None

    # Exact name match
    for col in columns:
        if str(col).lower() in _DATE_EXACT_NAMES:
            return col

    # Substring match: any column whose name contains "date" or "time"
    for col in columns:
        col_lower = str(col).lower()
        if "date" in col_lower or "time" in col_lower:
            return col

    # Check by dtype; the helper also recognises tz-aware and non-nanosecond
    # datetime columns, which a literal "datetime64[ns]" comparison misses.
    for col in columns:
        if pd.api.types.is_datetime64_any_dtype(df[col]):
            return col

    # Try parsing first column
    first_col = columns[0]
    try:
        head = df[first_col].head(10)
        if pd.api.types.is_numeric_dtype(head):
            # to_datetime reads plain numbers as epoch offsets, so any
            # numeric series would "parse"; do not mistake it for dates.
            return None
        parsed = pd.to_datetime(head, errors="coerce")
        if parsed.notna().sum() >= 5:
            return first_col
    except (TypeError, ValueError, OverflowError):
        # Values pandas cannot interpret as dates: not a date column.
        return None

    return None