fix: improve TM parser EN/TX split and fix report SQL errors

The compact TM format parser was storing the combined EN+TX text in both fields, causing the LLM retrieval agent to fail at matching source lines against TM entries — resulting in all-low confidence tiers. Added _split_en_tx() heuristic that detects the language boundary at the first non-ASCII sentence. Also includes raw _text in LLM prompt for context. Fixed get_jobs_over_time GroupingError by using literal_column for date_trunc, added date filters to status_breakdown, and fixed Decimal serialization in locale stats. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-10 17:47:53 -04:00 · 2026-04-10 17:47:53 -04:00 · e97d4f81b7
commit e97d4f81b7
parent 52bc499272
2 changed files with 54 additions and 12 deletions
--- a/backend/app/pipeline/agents/agent_2_tm_retrieval.py
+++ b/backend/app/pipeline/agents/agent_2_tm_retrieval.py
@ -83,11 +83,16 @@ def _format_tm_entries_for_prompt(entries: list[TMEntry]) -> str:
    """Format TM entries into a numbered list for the LLM prompt."""
    lines: list[str] = []
    for i, entry in enumerate(entries, start=1):
-        lines.append(
+        line = (
            f"[TM-{i}] seg_key={entry.seg_key} | en={entry.en} | "
            f"tx={entry.tx} | channel={entry.channel} | "
            f"sub_channel={entry.sub_channel} | date={entry.date}"
        )
+        # Include the raw text for compact-format entries so the LLM
+        # can see the original unsplit content for better matching
+        if entry._text and entry._text != entry.en:
+            line += f" | raw={entry._text}"
+        lines.append(line)
    return "\n".join(lines)


--- a/backend/app/pipeline/modules/tm_file_loader.py
+++ b/backend/app/pipeline/modules/tm_file_loader.py
@ -112,6 +112,51 @@ import re
 _LOCALE_RE = re.compile(r"\b([a-z]{2}-[a-z]{2})\b")


+def _split_en_tx(text: str) -> tuple[str, str]:
+    """Split the combined EN/TX text of a compact TM entry into (en, tx).
+
+    The compact format packs the English source and its translation into
+    one string with no delimiter, so the boundary is guessed:
+      1. Cut the text into sentences wherever a period is followed by
+         whitespace and a capital letter.
+      2. The first sentence after the opening one that contains a
+         non-ASCII *letter* (ö, ü, é, ñ, etc.) starts the TX portion.
+         Non-ASCII punctuation and symbols (’ “ ” — ™ ®) are ignored: they
+         are common in English copy and would move the boundary too early.
+      3. If no sentence qualifies, split at the middle sentence (by
+         sentence count, not by characters).
+
+    Returns (en, tx).  Empty input yields ("", "").  Text without any
+    sentence boundary cannot be split and is returned in BOTH slots.
+    Example: "Save now. Jetzt sparen für dich." yields
+    ("Save now.", "Jetzt sparen für dich.").
+    """
+    if not text:
+        return ("", "")
+
+    # Sentence boundary = period, whitespace, then a capital letter.  The
+    # lookarounds keep the period and the capital inside their sentences.
+    parts = re.split(r"(?<=\.)\s+(?=[A-ZÄÖÜÉÈÊÀÁÂÃÇÑ])", text)
+    if len(parts) <= 1:
+        # NOTE(review): unsplittable text is duplicated into both fields,
+        # as before the split existed; confirm no caller expects tx == "".
+        return (text, text)
+
+    # Default to the midpoint; len(parts) >= 2 here, so it is at least 1
+    # and EN is never left empty.
+    cut = len(parts) // 2
+    # The opening sentence is always EN, so candidates start at index 1.
+    for i, part in enumerate(parts[1:], start=1):
+        # Only letters count: "Don’t" or "Echo™" must not look like TX.
+        if any(ch.isalpha() and not ch.isascii() for ch in part):
+            cut = i
+            break
+
+    en = " ".join(parts[:cut]).strip()
+    tx = " ".join(parts[cut:]).strip()
+    return (en, tx)
+
+
 def _parse_compact_entry(t_value: str) -> TMEntry | None:
    """Parse the compact 't' field format used in real TM files.

@ -120,8 +165,7 @@ def _parse_compact_entry(t_value: str) -> TMEntry | None:

    The locale code (xx-xx) is the reliable split point:
    - Everything BEFORE the locale code is the seg_key + note_type metadata
-    - Everything AFTER needs to be split into EN source and TX translation
-      (we store the full post-locale text and let the LLM handle matching)
+    - Everything AFTER is split into EN source and TX translation
    """
    match = _LOCALE_RE.search(t_value)
    if not match:
@ -131,16 +175,12 @@ def _parse_compact_entry(t_value: str) -> TMEntry | None:
    before_locale = t_value[: match.start()].strip()
    after_locale = t_value[match.end() :].strip()

-    # Extract seg_key: everything up to the sequence number
-    # e.g., "Value Q1 24 Radio 001 VO" -> seg_key="Value Q1 24 Radio 001"
-    # note_type would be "VO", "Headline", "BVO", "Super", etc.
    seg_key = before_locale

    # Extract channel info from seg_key
    channel = ""
    sub_channel = ""
    seg_parts = before_locale.split()
-    # Try to find channel indicators in the seg_key
    channel_keywords = {
        "mass", "value", "onsite", "outbound", "radio",
        "tv_olv", "display", "ooh", "dooh", "social", "print",
@ -162,11 +202,8 @@ def _parse_compact_entry(t_value: str) -> TMEntry | None:
            note_type = part
            break

-    # For EN/TX split: the boundary is where the language switches
-    # We store the full text and let the retrieval agent handle it
-    # Simple heuristic: store everything after locale as combined en+tx
-    en_text = after_locale
-    tx_text = after_locale
+    # Split EN and TX from the combined text
+    en_text, tx_text = _split_en_tx(after_locale)

    # Try to extract year from seg_key for the date field
    date = ""