- Add RobotoCondensed-Bold.ttf and RobotoCondensed-Light.ttf; embed all three weights (300/normal/bold) in SVG @font-face for correct browser preview - Dockerfile: install fontconfig and register fonts as system fonts so CairoSVG/Pango resolves Roboto Condensed in PNG/PDF exports (not a fallback) - Export PNG/PDF at dpi=150 for higher quality output - Add SVG <clipPath> per panel so data series cannot overflow axis boundaries - Replace np.arange tick generation with safe while-loop (no float artifacts) - Legend: multi-row wrapping when items exceed chart width; improved per-char text width estimation; legend_row_count() helper for dynamic layout - Dynamic pad_top in compute_layout() based on legend rows + subtitle presence, preventing title/subtitle/legend overlap regardless of series count - Y-axis: skip overlapping labels (gridlines always drawn); adaptive tick precision so -0.0042% shows correctly instead of -0.00% - transformer.py: strip % signs from string-formatted percentages before pd.to_numeric; handle pandas 2.x StringDtype; expand date column detection to cover Year, Quarter, report_date, substring matches (start_datetime etc.) - Sync _find_date_column() in engine.py with transformer.py detection logic - prompts.py: enforce Roboto Condensed for all text; add axis range, label length, and decimal-percentage guidance rules Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
93 lines
3.1 KiB
Python
93 lines
3.1 KiB
Python
"""Data transformation: date parsing, column selection, resampling."""
|
|
|
|
from __future__ import annotations
|
|
import pandas as pd
|
|
|
|
|
|
# Percentage-formatted text such as "-0.42%", "+1,250.5 %" or " 12% ".
_PERCENT_PATTERN = r"^\s*[-+]?\d[\d,.]*\s*%\s*$"

# Calendar years that are guaranteed to fit in a nanosecond-resolution Timestamp.
_MIN_YEAR, _MAX_YEAR = 1678, 2261


def _parse_date_values(values: pd.Series) -> pd.Series:
    """Parse *values* as datetimes, reading whole numbers in year range as years.

    Without this special case ``pd.to_datetime`` interprets integers as
    nanoseconds since the epoch, so a "Year" column holding 2019, 2020, ...
    would collapse to timestamps a few microseconds after 1970-01-01.
    Unparseable entries become NaT.
    """
    if pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values):
        present = values.dropna()
        looks_like_years = not present.empty and bool(
            ((present % 1 == 0) & present.between(_MIN_YEAR, _MAX_YEAR)).all()
        )
        if looks_like_years:
            year_text = pd.Series(
                [None if pd.isna(v) else str(int(v)) for v in values],
                index=values.index,
                dtype=object,
            )
            return pd.to_datetime(year_text, format="%Y", errors="coerce")
    return pd.to_datetime(values, errors="coerce")


def prepare_dataframe(df: pd.DataFrame, date_column: str | None = None) -> pd.DataFrame:
    """Prepare a DataFrame for charting: parse dates, ensure numeric columns.

    The input frame is never modified; all work happens on a copy.

    Args:
        df: Raw DataFrame
        date_column: Optional name of the date column. Auto-detected if None.

    Returns:
        Cleaned DataFrame with parsed dates and numeric columns. When a date
        column is found and at least one of its values parses, rows with
        unparseable dates are dropped and the frame is sorted by date with a
        fresh index. If the date column cannot be parsed at all it is treated
        like any other column instead of emptying the frame. Every other
        text column is coerced to numbers (non-numeric entries become NaN);
        percentage strings such as "-0.42%" keep their face value (-0.42).
    """
    df = df.copy()

    # Auto-detect date column
    if date_column is None:
        date_column = _detect_date_column(df)

    # Compare against None explicitly so a falsy label such as 0 still counts.
    if date_column is not None and date_column in df.columns:
        try:
            parsed = _parse_date_values(df[date_column])
        except (ValueError, TypeError, OverflowError):
            date_column = None
        else:
            if len(parsed) and not parsed.notna().any():
                # Nothing parsed: dropping NaT rows would discard the whole
                # frame, so fall back to "no date column" instead.
                date_column = None
            else:
                df[date_column] = parsed
                # Drop rows where date is NaT, then sort by date
                df = df.dropna(subset=[date_column])
                df = df.sort_values(date_column).reset_index(drop=True)

    # Convert numeric-looking columns
    for col in df.columns:
        if col == date_column:
            continue
        column = df[col]
        # Handle object and string dtypes (pandas 2.x may use StringDtype)
        if not (pd.api.types.is_string_dtype(column) or column.dtype == object):
            continue
        try:
            # Detect percentage strings like "-0.42%" before calling
            # to_numeric, which would silently coerce them to NaN.
            sample = column.dropna().head(20).astype(str)
            pct_count = sample.str.match(_PERCENT_PATTERN).sum()
            if pct_count > len(sample) * 0.5:
                # Strip the % sign and thousands separators; the value stays
                # as written (e.g. "-0.42%" -> -0.42, not -0.0042).
                column = (
                    column.astype(str)
                    .str.replace(r"%\s*$", "", regex=True)
                    .str.replace(",", "", regex=False)
                    .str.strip()
                )
            df[col] = pd.to_numeric(column, errors="coerce")
        except (ValueError, TypeError, OverflowError):
            # Best effort: values that cannot be converted at all (e.g. nested
            # lists) leave the column exactly as it was.
            continue

    return df
|
|
|
|
|
|
# Lower-cased column names that are always treated as the date column.
_DATE_EXACT_NAMES = {
    "date", "dates", "time", "timestamp", "period", "month", "quarter",
    "year", "as of", "as_of", "report_date", "reporting_date",
    "fiscal_year", "observation_date", "obs_date",
}


def _detect_date_column(df: pd.DataFrame) -> str | None:
    """Auto-detect the date column in a DataFrame.

    Checks are tried in priority order and the first hit wins:

    1. column name (case-insensitive) is one of ``_DATE_EXACT_NAMES``;
    2. column name contains "date" or "time";
    3. column already has a datetime dtype (any resolution, tz-aware or not);
    4. the first column holds text and at least 5 of its first 10 values
       parse as dates.

    Returns:
        The label of the detected column, or None when nothing matches
        (including when the frame has no columns).
    """
    columns = list(df.columns)
    if not columns:
        return None

    # Exact name match
    for col in columns:
        if str(col).lower() in _DATE_EXACT_NAMES:
            return col

    # Substring match: any column whose name contains "date" or "time"
    for col in columns:
        col_lower = str(col).lower()
        if "date" in col_lower or "time" in col_lower:
            return col

    # Check by dtype. Going through df.dtypes covers tz-aware and
    # non-nanosecond datetimes, which a comparison with "datetime64[ns]" misses.
    for col, dtype in zip(columns, df.dtypes):
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return col

    # Try parsing first column. Numeric data is skipped: to_datetime reads
    # numbers as epoch offsets, so every numeric column would "parse".
    first_col = columns[0]
    first_values = df.iloc[:, 0].head(10)
    if pd.api.types.is_numeric_dtype(first_values):
        return None
    try:
        parsed = pd.to_datetime(first_values, errors="coerce")
    except (ValueError, TypeError, OverflowError):
        return None
    if parsed.notna().sum() >= 5:
        return first_col

    return None
|