fix: improve TM parser EN/TX split and fix report SQL errors

The compact TM format parser was storing the combined EN+TX text in both
fields, so the LLM retrieval agent could not match source lines against TM
entries and every result landed in the low confidence tier. Added a
_split_en_tx() heuristic that places the language boundary at the first
sentence containing non-ASCII characters. The raw _text is now also included
in the LLM prompt for context.

Fixed get_jobs_over_time GroupingError by using literal_column for
date_trunc, added date filters to status_breakdown, and fixed Decimal
serialization in locale stats.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
DJP 2026-04-10 17:47:53 -04:00
parent 52bc499272
commit e97d4f81b7
2 changed files with 54 additions and 12 deletions

View file

def _format_tm_entries_for_prompt(entries: list[TMEntry]) -> str:
    """Format TM entries into a numbered list for the LLM prompt.

    Each entry becomes one line of the form::

        [TM-<n>] seg_key=... | en=... | tx=... | channel=... | sub_channel=... | date=...

    where ``<n>`` counts from 1. When an entry carries a non-empty ``_text``
    that differs from its ``en`` field, `` | raw=<_text>`` is appended.

    Args:
        entries: TM entries to render, in prompt order.

    Returns:
        The rendered lines joined with newlines; an empty string when
        ``entries`` is empty.
    """
    lines: list[str] = []
    for i, entry in enumerate(entries, start=1):
        line = (
            f"[TM-{i}] seg_key={entry.seg_key} | en={entry.en} | "
            f"tx={entry.tx} | channel={entry.channel} | "
            f"sub_channel={entry.sub_channel} | date={entry.date}"
        )
        # Include the raw text for compact-format entries so the LLM
        # can see the original unsplit content for better matching.
        # Skipped when it would merely repeat the en field.
        if entry._text and entry._text != entry.en:
            line += f" | raw={entry._text}"
        lines.append(line)
    return "\n".join(lines)

View file

@ -112,6 +112,51 @@ import re
# Matches a lowercase locale code of the form xx-xx (two letters, a hyphen,
# two letters) delimited by word boundaries, e.g. "de-de".
_LOCALE_RE = re.compile(r"\b([a-z]{2}-[a-z]{2})\b")
def _split_en_tx(text: str) -> tuple[str, str]:
    """Split combined EN/TX text from a compact TM entry into ``(en, tx)``.

    The compact format packs the English source and the target-language
    translation into one string with no explicit delimiter, so the boundary
    is guessed heuristically:

    1. Split the text into sentences at whitespace that follows a period
       and precedes a capital letter (ASCII or a common accented capital).
    2. The first sentence is always treated as EN. The first later sentence
       containing a non-ASCII character (ö, ü, é, ñ, etc.) is taken as the
       start of TX.
    3. If every later sentence is ASCII-only, fall back to splitting at the
       middle sentence boundary.

    Sentences are re-joined with single spaces, so the original whitespace
    between sentences is not preserved.

    Args:
        text: The combined EN+TX string (everything after the locale code).

    Returns:
        ``(en, tx)``. Empty input yields ``("", "")``. When no sentence
        boundary is found the whole text is returned as EN with an empty TX,
        so the combined string is never duplicated into both fields.
    """
    if not text:
        return ("", "")

    # Sentence boundary: whitespace preceded by a period and followed by an
    # upper-case letter that starts the next sentence.
    parts = re.split(r"(?<=\.)\s+(?=[A-ZÄÖÜÉÈÊÀÁÂÃÇÑ])", text)
    if len(parts) <= 1:
        # No clear sentence boundary — return full text as EN, empty TX.
        return (text, "")

    # Skip index 0 (always EN); the first later part with a non-ASCII
    # character starts TX. With at least two parts the midpoint fallback is
    # always >= 1, so EN is never empty.
    split_at = next(
        (i for i, part in enumerate(parts[1:], start=1) if not part.isascii()),
        len(parts) // 2,
    )
    en = " ".join(parts[:split_at]).strip()
    tx = " ".join(parts[split_at:]).strip()
    return (en, tx)
def _parse_compact_entry(t_value: str) -> TMEntry | None:
"""Parse the compact 't' field format used in real TM files.
@ -120,8 +165,7 @@ def _parse_compact_entry(t_value: str) -> TMEntry | None:
The locale code (xx-xx) is the reliable split point:
- Everything BEFORE the locale code is the seg_key + note_type metadata
- Everything AFTER needs to be split into EN source and TX translation
(we store the full post-locale text and let the LLM handle matching)
- Everything AFTER is split into EN source and TX translation
"""
match = _LOCALE_RE.search(t_value)
if not match:
@ -131,16 +175,12 @@ def _parse_compact_entry(t_value: str) -> TMEntry | None:
before_locale = t_value[: match.start()].strip()
after_locale = t_value[match.end() :].strip()
# Extract seg_key: everything up to the sequence number
# e.g., "Value Q1 24 Radio 001 VO" -> seg_key="Value Q1 24 Radio 001"
# note_type would be "VO", "Headline", "BVO", "Super", etc.
seg_key = before_locale
# Extract channel info from seg_key
channel = ""
sub_channel = ""
seg_parts = before_locale.split()
# Try to find channel indicators in the seg_key
channel_keywords = {
"mass", "value", "onsite", "outbound", "radio",
"tv_olv", "display", "ooh", "dooh", "social", "print",
@ -162,11 +202,8 @@ def _parse_compact_entry(t_value: str) -> TMEntry | None:
note_type = part
break
# For EN/TX split: the boundary is where the language switches
# We store the full text and let the retrieval agent handle it
# Simple heuristic: store everything after locale as combined en+tx
en_text = after_locale
tx_text = after_locale
# Split EN and TX from the combined text
en_text, tx_text = _split_en_tx(after_locale)
# Try to extract year from seg_key for the date field
date = ""