From 76bee82119e9ecd0bd96d413a8b099b3a805c1f1 Mon Sep 17 00:00:00 2001 From: Vadym Samoilenko Date: Fri, 8 May 2026 18:36:59 +0100 Subject: [PATCH] =?UTF-8?q?fix(pipeline):=20fix=205=20QA=20tickets=20?= =?UTF-8?q?=E2=80=94=20caption=20alignment,=20glossary,=20source=5Fhas=5Fa?= =?UTF-8?q?d=20render,=20filler=20words,=20NL=20error=20surfacing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - caption_aligner: lower match ratio 0.5→0.35, widen search window 60→150, add time-based cursor fallback on miss - gemini.py: explicit 'MUST use glossary terms' requirement in translate_vtt prompt; source_has_ad prompt now instructs not to include AD narration in captions - ingest_and_ai: load glossary for source language and pass to extract_accessibility - render_accessible_video: handle source_has_ad=True via caption-embed path (ffmpeg subtitle inject, no AD pipeline) - translate_and_synthesize: track failed languages, write translation_errors to DB, add exc_info to error log - vtt.py: expand _FILLER_PATTERNS to nl/pt/pl/uk/ru, widen EN/ES/FR/DE/IT lists - gemini_ingestion.md: strengthen line:0% placement rule, expand disfluency examples per language Co-Authored-By: Claude Sonnet 4.6 --- backend/app/lib/vtt.py | 15 ++-- backend/app/prompts/gemini_ingestion.md | 11 ++- backend/app/services/caption_aligner.py | 24 ++++-- backend/app/services/gemini.py | 13 ++- backend/app/tasks/ingest_and_ai.py | 6 ++ backend/app/tasks/render_accessible_video.py | 86 +++++++++++++++++++ backend/app/tasks/translate_and_synthesize.py | 18 ++-- 7 files changed, 148 insertions(+), 25 deletions(-) diff --git a/backend/app/lib/vtt.py b/backend/app/lib/vtt.py index 4dc7435..1c2b4fd 100644 --- a/backend/app/lib/vtt.py +++ b/backend/app/lib/vtt.py @@ -288,11 +288,16 @@ class VTTEditor: # DCMP §6.01 filler patterns per language (whole-word, case-insensitive) _FILLER_PATTERNS: dict[str, str] = { - "en": r'\b(um+|uh+|ah+|er+|hmm+|like|you know|i mean)\b', - "es": 
r'\b(eh+|este|o sea|pues)\b', - "fr": r'\b(euh+|beh|ben|donc|quoi)\b', - "de": r'\b(äh+|ähm+|halt|ne)\b', - "it": r'\b(ehm+|allora|cioè|tipo)\b', + "en": r'\b(um+|uh+|ah+|er+|hmm+|you know|i mean|sort of|kind of|basically|literally|honestly|actually|right\?|so yeah)(?:\b|(?<=\?))', + "es": r'\b(eh+|este|o sea que|o sea|pues|bueno|mmm+)\b', + "fr": r'\b(euh+|beh|ben|donc|quoi|enfin|voilà|genre)\b', + "de": r'\b(äh+|ähm+|halt|ne|also|naja|sozusagen|quasi)\b', + "it": r'\b(ehm+|allora|cioè|tipo|praticamente|insomma|ecco)\b', + "nl": r'\b(eh+|nou|zeg|eigenlijk|gewoon|toch|zo van|hè)\b', + "pt": r'\b(ahn+|hã+|né|sabe|tipo|então|assim)\b', + "pl": r'\b(no|że|bo|znaczy|właśnie|jakby|wiesz)\b', + "uk": r'\b(ну+|ем+|типу|знаєш|значить|власне|от)\b', + "ru": r'\b(ну+|эм+|типа|знаешь|значит|вот|собственно)\b', } @staticmethod diff --git a/backend/app/prompts/gemini_ingestion.md b/backend/app/prompts/gemini_ingestion.md index 876317b..3e1555d 100644 --- a/backend/app/prompts/gemini_ingestion.md +++ b/backend/app/prompts/gemini_ingestion.md @@ -59,11 +59,12 @@ CAPTION FORMATTING (DCMP standard): - Use mixed case. 
Use ALL CAPS only for screaming or shouting DISFLUENCY REMOVAL (DCMP §6.01): -- Do NOT include filler words, false starts, or hesitations in captions -- Remove: "um", "uh", "ah", "er", "hmm", "like" (as filler), "you know" (as filler), "I mean" (as filler) -- Also remove language-specific fillers (e.g., "euh"/"beh" in French, "äh"/"ähm" in German, "eh"/"este" in Spanish, "ehm"/"allora" in Italian) +- MANDATORY: Never include filler words, false starts, or hesitations in captions — remove them silently +- English fillers to remove: "um", "uh", "ah", "er", "hmm", "you know", "I mean", "sort of", "kind of", "basically", "literally", "honestly" +- Language-specific fillers: French "euh"/"beh"/"ben"/"genre", German "äh"/"ähm"/"halt"/"also", Spanish "eh"/"este"/"o sea"/"pues", Italian "ehm"/"allora"/"cioè"/"tipo", Dutch "eh"/"nou"/"zeg"/"eigenlijk", Portuguese "ahn"/"né"/"sabe"/"tipo" - Remove false starts when the speaker self-corrects immediately (e.g., "I was — I went to the store" → "I went to the store") - Do NOT remove meaningful repetition, emphasis, or intentional stylistic choices +- When in doubt whether a word is a filler or content: omit it — clean captions are preferred over over-inclusive ones SOUND AND MUSIC FORMATTING (DCMP standard): - Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching] @@ -77,7 +78,9 @@ SOUND AND MUSIC FORMATTING (DCMP standard): CAPTION PLACEMENT: - Captions are normally positioned at the bottom of the screen -- When visible text, graphics, logos, or on-screen information appear at the bottom of the frame during a caption cue, add the VTT cue setting "line:0%" to move that caption to the top — format: "00:00:01.000 --> 00:00:03.000 line:0%" +- CRITICAL: When ANY of the following are visible at the BOTTOM of the frame during a caption cue — on-screen text, lower-thirds, name plates, location titles, graphics, logos, product labels, URLs, or any visual information — you MUST add the VTT cue setting 
"line:0%" to move that cue to the top of the screen. Format: "00:00:01.000 --> 00:00:03.000 line:0%" +- When in doubt whether bottom content conflicts with captions, use "line:0%" — it is better to be at the top than to obstruct important on-screen information +- Example: if a lower-third name plate is visible at seconds 0:05–0:08, all caption cues overlapping that range must have "line:0%" ETHICAL GUIDELINES FOR DESCRIBING PEOPLE (DCMP standard): - Consistently identify people/characters by name. When a name is not yet known, identify by the most obvious visible attribute (e.g., "the person in the red jacket") until the name is established, then switch to the name and use it consistently diff --git a/backend/app/services/caption_aligner.py b/backend/app/services/caption_aligner.py index 419b364..f734c7d 100644 --- a/backend/app/services/caption_aligner.py +++ b/backend/app/services/caption_aligner.py @@ -10,6 +10,7 @@ Algorithm: graceful fallbacks where Whisper didn't capture the audio. """ +import bisect import re from dataclasses import dataclass @@ -23,10 +24,12 @@ logger = get_logger(__name__) _PUNCT = re.compile(r"[^\w']", re.UNICODE) # Tokens shorter than this are considered stop-words and excluded from matching _MIN_TOKEN_LEN = 2 -# Minimum fraction of cue tokens that must match Whisper words for alignment -_MIN_MATCH_RATIO = 0.5 -# How many Whisper words ahead of the cursor to search for a cue's tokens -_SEARCH_WINDOW = 60 +# Minimum fraction of cue tokens that must match Whisper words for alignment. +# Lowered from 0.5 → 0.35 to handle Gemini paraphrasing and short cues. +_MIN_MATCH_RATIO = 0.35 +# How many Whisper words ahead of the cursor to search for a cue's tokens. +# Widened from 60 → 150 so the window stays valid even after several failed cues. 
+_SEARCH_WINDOW = 150 def _tokenise(text: str) -> list[str]: @@ -80,6 +83,13 @@ def _find_match( return best +def _cursor_for_time(whisper_words: list[WordTimestamp], t: float, from_idx: int) -> int: + """Return the index (never below from_idx) of the first Whisper word starting at or after time t, clamped to the last word.""" + starts = [w.start for w in whisper_words] + idx = bisect.bisect_left(starts, t, from_idx) + return max(from_idx, min(idx, len(whisper_words) - 1)) + + def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str: """Replace VTT cue timings with Whisper-accurate timestamps where possible. @@ -97,23 +107,23 @@ def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str: for cue in cues: tokens = _tokenise(cue.text) if not tokens: - # Sound-effect or music cue — nothing to align continue match = _find_match(tokens, whisper_words, cursor) if match is None: + # Advance cursor to the Whisper word closest to this cue's start time + # so subsequent cues don't search from a stale position. + cursor = _cursor_for_time(whisper_words, cue.start_time, cursor) continue new_start = whisper_words[match.first_word_idx].start new_end = whisper_words[match.last_word_idx].end - # Sanity: don't create zero-duration or backwards cues if new_end > new_start: cue.start_time = new_start cue.end_time = new_end aligned += 1 - # Advance cursor to just past the last matched word cursor = match.last_word_idx + 1 logger.info( diff --git a/backend/app/services/gemini.py b/backend/app/services/gemini.py index 9db7c72..ac3d878 100644 --- a/backend/app/services/gemini.py +++ b/backend/app/services/gemini.py @@ -146,8 +146,11 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w if source_has_ad: return ( "SOURCE AUDIO DESCRIPTION NOTICE: This video already has professional audio descriptions " - "embedded in its audio track. Return an empty audio_description_vtt containing only " - "the WEBVTT header (\"WEBVTT\\n\") — do NOT generate new audio descriptions." 
+ "embedded in its audio track. " + "1) Return an empty audio_description_vtt containing only the WEBVTT header (\"WEBVTT\\n\") — do NOT generate new audio descriptions. " + "2) For captions_vtt: transcribe ONLY the original program dialogue and relevant sound effects. " + "Do NOT caption the audio description narration — AD narration is spoken during natural pauses " + "and describes visual scenes rather than being part of the original dialogue." ) return "" @@ -891,6 +894,10 @@ JSON: _tgt_label = locale_lib.get_gemini_label(target_language) _glossary_section = self._build_glossary_block(glossary_block) _glossary_line = f"\n\n{_glossary_section}" if _glossary_section else "" + _glossary_req = ( + "\n- MUST use the exact approved terms from the glossary below — these override natural translation choices, even for English terms" + if _glossary_section else "" + ) _adapt_line = _style_instruction.format(tgt=_tgt_label) if style == "transcreate" else "" prompt = f"""Translate the following {cue_count} numbered text segments from {_src_label} to {_tgt_label}. @@ -899,7 +906,7 @@ REQUIREMENTS: - Format: "1. translated text", "2. translated text", etc. 
- Preserve speaker labels like [Speaker 1]: unchanged - {_adapt_line}Use natural, idiomatic {_tgt_label} -- Do NOT add any explanation, preamble, or extra lines{extra_instruction}{_glossary_line} +- Do NOT add any explanation, preamble, or extra lines{extra_instruction}{_glossary_req}{_glossary_line} Segments to translate: {numbered_texts}""" diff --git a/backend/app/tasks/ingest_and_ai.py b/backend/app/tasks/ingest_and_ai.py index fb6e9bb..1c7fd05 100644 --- a/backend/app/tasks/ingest_and_ai.py +++ b/backend/app/tasks/ingest_and_ai.py @@ -169,11 +169,17 @@ async def ingest_and_ai_task_impl(job_id: str): user_external_id=_cost_ctx["user_id"], project_id=_cost_ctx["project_id"], ) + # Load glossary for source language — use brand context as vocabulary hint + from ..services.glossary_service import get_glossary_block_for_job + _source_lang = job_doc.get("source", {}).get("language", "en") + _job_for_glossary = {**job_doc, "_glossary_source_text": brand_context or ""} + glossary_block = await get_glossary_block_for_job(_job_for_glossary, _source_lang, db) ai_result = await gemini_service.extract_accessibility( temp_path, brand_context=brand_context, sdh_requested=sdh_requested, source_has_ad=source_has_ad, + glossary_block=glossary_block, _cost_ctx=_cost_ctx, ) # Enforce: if source already has AD, discard any AI-generated AD diff --git a/backend/app/tasks/render_accessible_video.py b/backend/app/tasks/render_accessible_video.py index 5725f12..9ec012c 100644 --- a/backend/app/tasks/render_accessible_video.py +++ b/backend/app/tasks/render_accessible_video.py @@ -135,6 +135,15 @@ async def _async_render_accessible_video(job_id: str, language: str): if not lang_output: raise ValueError(f"No outputs found for language {language}") + # When source already has professional AD, render captions-only accessible video + source_has_ad = job_doc.get("source", {}).get("source_has_ad", False) + if source_has_ad: + await _render_source_has_ad_video( + job_id, job_doc, language, 
lang_output, + source_video_path, temp_dir, db, job_title + ) + return + # 3. Download AD VTT content ad_vtt_gcs = lang_output.get("ad_vtt_gcs") if not ad_vtt_gcs: @@ -367,6 +376,83 @@ async def _async_render_accessible_video(job_id: str, language: str): client.close() +async def _render_source_has_ad_video( + job_id: str, + job_doc: dict, + language: str, + lang_output: dict, + source_video_path: str, + temp_dir: str, + db, + job_title: str, +) -> None: + """Render accessible video for jobs where the source already has professional AD. + + Embeds the captions VTT as a soft subtitle track — no AD audio injection needed + since the original audio track already contains the AD narration. + """ + captions_vtt_gcs = lang_output.get("captions_vtt_gcs") + if not captions_vtt_gcs: + raise ValueError(f"No captions VTT found for language {language}") + + # Download captions VTT + captions_blob_path = captions_vtt_gcs.replace(f"gs://{settings.gcs_bucket}/", "") + captions_vtt_content = gcs_service.bucket.blob(captions_blob_path).download_as_text() + + # Write VTT to temp file + vtt_path = os.path.join(temp_dir, "captions.vtt") + with open(vtt_path, "w", encoding="utf-8") as f: + f.write(captions_vtt_content) + + # Embed captions as a soft subtitle track: audio/video are stream-copied; the MP4 muxer cannot carry WebVTT, so cues are converted to mov_text + output_video_path = os.path.join(temp_dir, "accessible_video.mp4") + cmd = [ + "ffmpeg", "-y", + "-i", source_video_path, + "-i", vtt_path, "-map", "0:v", "-map", "0:a?", "-map", "1:0", + "-c", "copy", + "-c:s", "mov_text", + "-metadata:s:s:0", f"language={language}",  # NOTE(review): MP4 language tags are ISO 639-2 (3-letter) — confirm 2-letter codes survive muxing + output_video_path, + ] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + raise RuntimeError(f"ffmpeg caption embed failed: {result.stderr[-500:]}") + + # Upload rendered video + video_blob_path = gcs_path(job_doc, language, "accessible_video.mp4") + video_blob = gcs_service.bucket.blob(video_blob_path) + video_blob.content_type = "video/mp4" + video_blob.upload_from_filename(output_video_path) + video_gcs_uri = 
f"gs://{settings.gcs_bucket}/{video_blob_path}" + logger.info(f"Uploaded source-has-ad accessible video to {video_gcs_uri}") + + # Update job document + await db.jobs.update_one( + {"_id": job_id}, + { + "$set": { + f"outputs.{language}.accessible_video_gcs": video_gcs_uri, + f"outputs.{language}.accessible_video_method": "caption_embed", + f"accessible_video_progress.{language}": { + "status": "completed", + "method": "caption_embed", + "started_at": job_doc.get("accessible_video_progress", {}).get(language, {}).get("started_at"), + "completed_at": datetime.utcnow(), + }, + "updated_at": datetime.utcnow(), + } + }, + ) + broadcast_status_update( + job_id, + "asset_ready", + job_title=job_title, + message=f"Accessible video ready for {language.upper()} (caption embed)", + ) + await _check_accessible_video_completion(job_id, db) + + def _build_placements_from_ad_vtt(ad_vtt_content: str, cue_durations: list[float]) -> list[dict]: """ Build placement instructions from AD VTT cues and TTS durations. 
diff --git a/backend/app/tasks/translate_and_synthesize.py b/backend/app/tasks/translate_and_synthesize.py index 222f36a..f94e903 100644 --- a/backend/app/tasks/translate_and_synthesize.py +++ b/backend/app/tasks/translate_and_synthesize.py @@ -189,6 +189,7 @@ async def _async_translate_and_synthesize(job_id: str, languages: list[str] | No updated_outputs = job_doc.get("outputs", {}) _source_text_for_glossary = " ".join(filter(None, [source_captions_vtt, source_ad_vtt])) + _failed_languages: list[str] = [] try: target_languages = [lang for lang in requested_languages if lang != source_language] @@ -268,7 +269,8 @@ async def _async_translate_and_synthesize(job_id: str, languages: list[str] | No logger.info(f"Processed language: {language} (origin: {origin})") except Exception as e: - logger.error(f"Failed to process language {language}: {e}") + logger.error(f"Failed to process language {language}: {e}", exc_info=True) + _failed_languages.append(language) # Preserve existing GCS URIs and origin so retranslation failure # doesn't destroy captions the user can still view existing = updated_outputs.get(language, {}) @@ -288,14 +290,18 @@ async def _async_translate_and_synthesize(job_id: str, languages: list[str] | No for lang in target_languages if lang in updated_outputs } + _status_update: dict = { + "status": JobStatus.TTS_GENERATING.value, + "updated_at": datetime.utcnow(), + **per_lang_updates, + } + if _failed_languages: + _status_update["translation_errors"] = _failed_languages + logger.warning(f"Job {job_id}: translation failed for languages: {_failed_languages}") await db.jobs.update_one( {"_id": job_id}, { - "$set": { - "status": JobStatus.TTS_GENERATING.value, - "updated_at": datetime.utcnow(), - **per_lang_updates, - }, + "$set": _status_update, "$push": { "review.history": { "at": datetime.utcnow(),