From e97d4f81b71918f0010fa1a02057cf850afb4277 Mon Sep 17 00:00:00 2001 From: DJP Date: Fri, 10 Apr 2026 17:47:53 -0400 Subject: [PATCH] fix: improve TM parser EN/TX split and fix report SQL errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compact TM format parser was storing the combined EN+TX text in both fields, causing the LLM retrieval agent to fail at matching source lines against TM entries — resulting in all-low confidence tiers. Added _split_en_tx() heuristic that detects the language boundary at the first non-ASCII sentence. Also includes raw _text in LLM prompt for context. Fixed get_jobs_over_time GroupingError by using literal_column for date_trunc, added date filters to status_breakdown, and fixed Decimal serialization in locale stats. Co-Authored-By: Claude Opus 4.6 --- .../pipeline/agents/agent_2_tm_retrieval.py | 7 ++- .../app/pipeline/modules/tm_file_loader.py | 59 +++++++++++++++---- 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/backend/app/pipeline/agents/agent_2_tm_retrieval.py b/backend/app/pipeline/agents/agent_2_tm_retrieval.py index 147eae4..193b8eb 100644 --- a/backend/app/pipeline/agents/agent_2_tm_retrieval.py +++ b/backend/app/pipeline/agents/agent_2_tm_retrieval.py @@ -83,11 +83,16 @@ def _format_tm_entries_for_prompt(entries: list[TMEntry]) -> str: """Format TM entries into a numbered list for the LLM prompt.""" lines: list[str] = [] for i, entry in enumerate(entries, start=1): - lines.append( + line = ( f"[TM-{i}] seg_key={entry.seg_key} | en={entry.en} | " f"tx={entry.tx} | channel={entry.channel} | " f"sub_channel={entry.sub_channel} | date={entry.date}" ) + # Include the raw text for compact-format entries so the LLM + # can see the original unsplit content for better matching + if entry._text and entry._text != entry.en: + line += f" | raw={entry._text}" + lines.append(line) return "\n".join(lines) diff --git a/backend/app/pipeline/modules/tm_file_loader.py 
b/backend/app/pipeline/modules/tm_file_loader.py index 2322187..e93357b 100644 --- a/backend/app/pipeline/modules/tm_file_loader.py +++ b/backend/app/pipeline/modules/tm_file_loader.py @@ -112,6 +112,51 @@ import re _LOCALE_RE = re.compile(r"\b([a-z]{2}-[a-z]{2})\b") +def _split_en_tx(text: str) -> tuple[str, str]: + """Attempt to split combined EN/TX text from compact TM entries. + + The compact format packs both the English source and the target-language + translation into a single string after the locale code. There is no + explicit delimiter, but in practice the boundary is almost always at a + sentence-ending period followed by a capital letter that starts the + target-language text. + + Heuristic (works for the vast majority of real Amazon TM files): + 1. Split on ". " boundaries. + 2. Walk forward and assume the first sentence that contains non-ASCII + characters (ö, ü, é, ñ, etc.) marks the start of the TX portion. + 3. If all sentences are ASCII-only (e.g. short entries), fall back to + a 50/50 split at the nearest sentence boundary. + + Returns (en, tx). + """ + if not text: + return ("", "") + + # Split on sentence boundaries (period followed by space + capital) + parts = re.split(r"(?<=\.)\s+(?=[A-ZÄÖÜÉÈÊÀÁÂÃÇÑ])", text) + if len(parts) <= 1: + # No clear sentence boundary — cannot split; return the full text for both EN and TX + return (text, text) + + # Walk forward: first part with non-ASCII = start of TX + for i, part in enumerate(parts): + if i == 0: + continue # first part is always EN + if re.search(r"[^\x00-\x7F]", part): + en = " ".join(parts[:i]).strip() + tx = " ".join(parts[i:]).strip() + return (en, tx) + + # All ASCII — split at midpoint sentence boundary + mid = len(parts) // 2 + if mid == 0: + mid = 1 + en = " ".join(parts[:mid]).strip() + tx = " ".join(parts[mid:]).strip() + return (en, tx) + + def _parse_compact_entry(t_value: str) -> TMEntry | None: """Parse the compact 't' field format used in real TM files. 
@@ -120,8 +165,7 @@ def _parse_compact_entry(t_value: str) -> TMEntry | None: The locale code (xx-xx) is the reliable split point: - Everything BEFORE the locale code is the seg_key + note_type metadata - - Everything AFTER needs to be split into EN source and TX translation - (we store the full post-locale text and let the LLM handle matching) + - Everything AFTER is split into EN source and TX translation """ match = _LOCALE_RE.search(t_value) if not match: @@ -131,16 +175,12 @@ def _parse_compact_entry(t_value: str) -> TMEntry | None: before_locale = t_value[: match.start()].strip() after_locale = t_value[match.end() :].strip() - # Extract seg_key: everything up to the sequence number - # e.g., "Value Q1 24 Radio 001 VO" -> seg_key="Value Q1 24 Radio 001" - # note_type would be "VO", "Headline", "BVO", "Super", etc. seg_key = before_locale # Extract channel info from seg_key channel = "" sub_channel = "" seg_parts = before_locale.split() - # Try to find channel indicators in the seg_key channel_keywords = { "mass", "value", "onsite", "outbound", "radio", "tv_olv", "display", "ooh", "dooh", "social", "print", @@ -162,11 +202,8 @@ def _parse_compact_entry(t_value: str) -> TMEntry | None: note_type = part break - # For EN/TX split: the boundary is where the language switches - # We store the full text and let the retrieval agent handle it - # Simple heuristic: store everything after locale as combined en+tx - en_text = after_locale - tx_text = after_locale + # Split EN and TX from the combined text + en_text, tx_text = _split_en_tx(after_locale) # Try to extract year from seg_key for the date field date = ""