From bb8ed2a004ef3b5cf1e6b8ecb9daffe36dca95ac Mon Sep 17 00:00:00 2001 From: DJP Date: Mon, 11 May 2026 10:57:21 -0400 Subject: [PATCH] =?UTF-8?q?Round=202.7:=20three=20broken=20promises=20?= =?UTF-8?q?=E2=80=94=20empty=20TM,=20supplementary=20files,=20new-TM=20cas?= =?UTF-8?q?ing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug 1: Empty tm_channels was silently re-defaulted to [campaign channel] in both agent_single.py and job_tasks.py via `or [channel]`. Python's `or` treats [] as falsy, so the frontend's empty-list intent was lost. Fixed by replacing `or` with an explicit `is not None` check at both sites. Empty list now means "load no TMs"; None still falls back. Bug 2: Supplementary files dropped by Agent1Validator. The validator built FileManifest(...) with explicit kwargs but forgot supplementary_files, so the raw field from _resolve_file_manifest never reached agent_single.run(). Files were uploaded to disk but never inlined into the LLM context. Fixed by adding supplementary_files=raw.get("supplementary_files", []) to the validator's FileManifest construction. Bug 3: New TM channels lowercased in StepReview.tsx, breaking case-sensitive file lookup. On Linux, "flat_primecbmt_nl-be.json" ≠ "flat_PrimeCBMT_nl-be.json", so the file was silently skipped and zero TM entries loaded. Legacy channels worked only because the hardcoded CHANNEL_FILE_MAP has lowercase keys mapping to canonically-cased filenames; auto-discovered channels (PrimeCBM, PrimeCBMT, etc.) had no such safety net. Two-part fix: 3a. StepReview.tsx no longer lowercases tm_channels — preserves case end-to-end from registry → frontend → backend → disk. 3b. _resolve_all_tm_paths builds a case-insensitive index of the locale's TM directory once per call and resolves filenames against it. Forgives any historical case-drift between registry and disk. Verified end-to-end with a standalone test script run inside the backend container: 8/8 assertions pass covering empty tm_channels, supplementary file passthrough, exact-case lookups, lowercase fallback, missing channels, legacy MASS in both cases, and empty tm_channels yielding no TM paths. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../app/pipeline/agents/agent_1_validator.py | 1 + backend/app/pipeline/agents/agent_single.py | 36 ++++++++++++++++--- backend/app/tasks/job_tasks.py | 9 ++++- .../components/jobs/JobWizard/StepReview.tsx | 7 ++-- 4 files changed, 46 insertions(+), 7 deletions(-) diff --git a/backend/app/pipeline/agents/agent_1_validator.py b/backend/app/pipeline/agents/agent_1_validator.py index d7c1cca..433f5d7 100644 --- a/backend/app/pipeline/agents/agent_1_validator.py +++ b/backend/app/pipeline/agents/agent_1_validator.py @@ -85,6 +85,7 @@ class Agent1Validator(BaseAgent): "locale_considerations_file" ), date_pct_formats_file=self.file_manifest_raw.get("date_pct_formats_file"), + supplementary_files=self.file_manifest_raw.get("supplementary_files", []), ) # Load reference files (for validation) diff --git a/backend/app/pipeline/agents/agent_single.py b/backend/app/pipeline/agents/agent_single.py index 39dde3b..4f9abd3 100644 --- a/backend/app/pipeline/agents/agent_single.py +++ b/backend/app/pipeline/agents/agent_single.py @@ -57,15 +57,27 @@ def _resolve_all_tm_paths( """Resolve TM file paths for multiple channels. Tries the legacy hardcoded CHANNEL_FILE_MAP first (which has the - canonical lowercase patterns for the original channels). For any + canonical lowercase keys for the original channels). For any channel not in the map (e.g. newly registered like PrimeCBM), falls back to the generic `flat__.json` pattern. - Missing files at the resolved path are silently skipped at load time. + + If the canonical path doesn't exist on disk, scan the locale's TM + directory for any filename that matches case-insensitively and use + that path instead. Protects against drift between the registry's + stored channel casing and the on-disk filename casing — both code + paths historically have produced mixed-case outputs. """ lc = locale_code.lower() tm_dir = os.path.join(settings.STORAGE_ROOT, "amazon", "tm", locale_code) paths: list[str] = [] + # Build a one-time case-insensitive index of the locale's TM directory. + # Empty/missing dir is fine — the resolved paths just won't exist. + dir_index: dict[str, str] = {} + if os.path.isdir(tm_dir): + for entry in os.listdir(tm_dir): + dir_index[entry.lower()] = entry + for ch in tm_channels: pattern = CHANNEL_FILE_MAP.get(ch.lower()) if pattern: @@ -74,7 +86,14 @@ def _resolve_all_tm_paths( # Auto-discovered channel — use the generic pattern with the # channel name as-cased in the registry. filename = TM_FILENAME_PATTERN.format(channel=ch, lc=lc) - path = os.path.join(tm_dir, filename) + + # Case-insensitive lookup: if the canonical filename doesn't + # exist exactly, resolve against the dir index. + actual = dir_index.get(filename.lower()) + if actual is not None: + path = os.path.join(tm_dir, actual) + else: + path = os.path.join(tm_dir, filename) # leave as-is; will be skipped at load if path not in paths: paths.append(path) @@ -461,7 +480,16 @@ class AgentSingle(BaseAgent): channel = context.job_params.channel programme = context.job_params.programme campaign_name = context.job_params.campaign_name - tm_channels = context.job_params.tm_channels or [channel] + # Distinguish "user explicitly sent empty list" (load no TMs) from + # "tm_channels was None" (legacy job → fall back to campaign channel). + # Bare `or` treats [] as falsy and would re-default to [channel] — + # which silently reintroduces the campaign TM after the user asked + # for none. + tm_channels = ( + context.job_params.tm_channels + if context.job_params.tm_channels is not None + else [channel] + ) logger.info( "AgentSingle starting: locale=%s channel=%s tm_channels=%s lines=%d", diff --git a/backend/app/tasks/job_tasks.py b/backend/app/tasks/job_tasks.py index 283987f..6937efd 100644 --- a/backend/app/tasks/job_tasks.py +++ b/backend/app/tasks/job_tasks.py @@ -282,7 +282,14 @@ def process_locale_instance(self, job_id: str, locale_code: str) -> dict: "programme": job.programme.value, "campaign_name": job.campaign_name, "context_prompt": job.context_prompt, - "tm_channels": job.tm_channels or [job.channel], + # Preserve an explicit empty list ("no TMs") from + # the user; only fall back to the campaign channel + # when the field was never set (legacy jobs). + "tm_channels": ( + job.tm_channels + if job.tm_channels is not None + else [job.channel] + ), "llm_model": job.llm_model, } diff --git a/frontend/src/components/jobs/JobWizard/StepReview.tsx b/frontend/src/components/jobs/JobWizard/StepReview.tsx index 6ac9d37..b86c585 100644 --- a/frontend/src/components/jobs/JobWizard/StepReview.tsx +++ b/frontend/src/components/jobs/JobWizard/StepReview.tsx @@ -51,8 +51,11 @@ export function StepReview({ data, onBack }: StepReviewProps) { programme: data.programme.toLowerCase(), channel: data.channel.toLowerCase(), sub_channel: data.sub_channel ? data.sub_channel.toLowerCase() : null, - // Always send an array (even empty) so the backend respects "no TMs" vs "default to channel" - tm_channels: data.tm_channels.map(c => c.toLowerCase()), + // Always send an array (even empty) so the backend respects "no TMs" vs "default to channel". + // Do NOT lowercase the channel names — the pipeline resolves TM filenames using the + // exact case from the registry (e.g. flat_PrimeCBMT_nl-be.json). Lowercasing breaks + // case-sensitive filesystem lookups for any channel not in the legacy CHANNEL_FILE_MAP. + tm_channels: data.tm_channels, llm_model: data.llm_model || undefined, locale_codes: data.locales, context_prompt: data.context_override || undefined,