fix(pipeline): fix 5 QA tickets — caption alignment, glossary, source_has_ad render, filler words, NL error surfacing

- caption_aligner: lower match ratio 0.5→0.35, widen search window 60→150, add time-based cursor fallback on miss
- gemini.py: explicit 'MUST use glossary terms' requirement in translate_vtt prompt; source_has_ad prompt now instructs not to include AD narration in captions
- ingest_and_ai: load glossary for source language and pass to extract_accessibility
- render_accessible_video: handle source_has_ad=True via caption-embed path (ffmpeg subtitle inject, no AD pipeline)
- translate_and_synthesize: track failed languages, write translation_errors to DB, add exc_info to error log
- vtt.py: expand _FILLER_PATTERNS to nl/pt/pl/uk/ru, widen EN/ES/FR/DE/IT lists
- gemini_ingestion.md: strengthen line:0% placement rule, expand disfluency examples per language

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-05-08 18:36:59 +01:00
parent f7708f0214
commit 76bee82119
7 changed files with 148 additions and 25 deletions

View file

@ -288,11 +288,16 @@ class VTTEditor:
# DCMP §6.01 filler patterns per language (whole-word, case-insensitive).
# Each language key appears exactly once: a repeated key in a dict literal is
# silently overridden by the later entry, which hides dead patterns.
# Within an alternation, longer phrases must precede their own prefixes
# (e.g. "o sea que" before "o sea"), because the regex engine takes the first
# alternative that satisfies the trailing \b.
# NOTE(review): several entries (e.g. pl "że"/"bo", de "also", fr "donc") can
# also be ordinary content words — confirm that stripping them unconditionally
# is intended.
_FILLER_PATTERNS: dict[str, str] = {
    # "right?" is kept outside the trailing \b: a word boundary after "?"
    # only exists when a word character follows, so "right\?\b" could never
    # match at the end of a sentence.
    "en": r'\b((?:um+|uh+|ah+|er+|hmm+|you know|i mean|sort of|kind of|basically|literally|honestly|actually|so yeah)\b|right\?)',
    "es": r'\b(eh+|este|o sea que|o sea|pues|bueno|mmm+)\b',
    "fr": r'\b(euh+|beh|ben|donc|quoi|enfin|voilà|genre)\b',
    "de": r'\b(äh+|ähm+|halt|ne|also|naja|sozusagen|quasi)\b',
    "it": r'\b(ehm+|allora|cioè|tipo|praticamente|insomma|ecco)\b',
    "nl": r'\b(eh+|nou|zeg|eigenlijk|gewoon|toch|zo van|hè)\b',
    "pt": r'\b(ahn+|hã+|né|sabe|tipo|então|assim)\b',
    "pl": r'\b(no|że|bo|znaczy|właśnie|jakby|wiesz)\b',
    "uk": r'\b(ну+|ем+|типу|знаєш|значить|власне|от)\b',
    "ru": r'\b(ну+|эм+|типа|знаешь|значит|вот|собственно)\b',
}
@staticmethod

View file

@ -59,11 +59,12 @@ CAPTION FORMATTING (DCMP standard):
- Use mixed case. Use ALL CAPS only for screaming or shouting
DISFLUENCY REMOVAL (DCMP §6.01):
- Do NOT include filler words, false starts, or hesitations in captions
- Remove: "um", "uh", "ah", "er", "hmm", "like" (as filler), "you know" (as filler), "I mean" (as filler)
- Also remove language-specific fillers (e.g., "euh"/"beh" in French, "äh"/"ähm" in German, "eh"/"este" in Spanish, "ehm"/"allora" in Italian)
- MANDATORY: Never include filler words, false starts, or hesitations in captions — remove them silently
- English fillers to remove: "um", "uh", "ah", "er", "hmm", "you know", "I mean", "sort of", "kind of", "basically", "literally", "honestly"
- Language-specific fillers: French "euh"/"beh"/"ben"/"genre", German "äh"/"ähm"/"halt"/"also", Spanish "eh"/"este"/"o sea"/"pues", Italian "ehm"/"allora"/"cioè"/"tipo", Dutch "eh"/"nou"/"zeg"/"eigenlijk", Portuguese "ahn"/"né"/"sabe"/"tipo"
- Remove false starts when the speaker self-corrects immediately (e.g., "I was — I went to the store" → "I went to the store")
- Do NOT remove meaningful repetition, emphasis, or intentional stylistic choices
- When in doubt whether a word is a filler or content: omit it — clean captions are preferred over over-inclusive ones
SOUND AND MUSIC FORMATTING (DCMP standard):
- Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching]
@ -77,7 +78,9 @@ SOUND AND MUSIC FORMATTING (DCMP standard):
CAPTION PLACEMENT:
- Captions are normally positioned at the bottom of the screen
- When visible text, graphics, logos, or on-screen information appear at the bottom of the frame during a caption cue, add the VTT cue setting "line:0%" to move that caption to the top — format: "00:00:01.000 --> 00:00:03.000 line:0%"
- CRITICAL: When ANY of the following are visible at the BOTTOM of the frame during a caption cue — on-screen text, lower-thirds, name plates, location titles, graphics, logos, product labels, URLs, or any visual information — you MUST add the VTT cue setting "line:0%" to move that cue to the top of the screen. Format: "00:00:01.000 --> 00:00:03.000 line:0%"
- When in doubt whether bottom content conflicts with captions, use "line:0%" — it is better to be at the top than to obstruct important on-screen information
- Example: if a lower-third name plate is visible at seconds 0:05–0:08, all caption cues overlapping that range must have "line:0%"
ETHICAL GUIDELINES FOR DESCRIBING PEOPLE (DCMP standard):
- Consistently identify people/characters by name. When a name is not yet known, identify by the most obvious visible attribute (e.g., "the person in the red jacket") until the name is established, then switch to the name and use it consistently

View file

@ -10,6 +10,7 @@ Algorithm:
graceful fallbacks where Whisper didn't capture the audio.
"""
import bisect
import re
from dataclasses import dataclass
@ -23,10 +24,12 @@ logger = get_logger(__name__)
_PUNCT = re.compile(r"[^\w']", re.UNICODE)
# Tokens shorter than this are considered stop-words and excluded from matching
_MIN_TOKEN_LEN = 2
# Minimum fraction of cue tokens that must match Whisper words for alignment
_MIN_MATCH_RATIO = 0.5
# How many Whisper words ahead of the cursor to search for a cue's tokens
_SEARCH_WINDOW = 60
# Minimum fraction of cue tokens that must match Whisper words for alignment.
# Lowered from 0.5 → 0.35 to handle Gemini paraphrasing and short cues.
_MIN_MATCH_RATIO = 0.35
# How many Whisper words ahead of the cursor to search for a cue's tokens.
# Widened from 60 → 150 so the window stays valid even after several failed cues.
_SEARCH_WINDOW = 150
def _tokenise(text: str) -> list[str]:
@ -80,6 +83,13 @@ def _find_match(
return best
def _cursor_for_time(whisper_words: list[WordTimestamp], t: float, from_idx: int) -> int:
    """Return the index of the first Whisper word starting at or after time t.

    The search begins at from_idx and the result is never smaller than
    from_idx, so a caller using it to advance a cursor cannot be moved
    backwards.

    Args:
        whisper_words: Word timestamps; the binary search assumes their
            ``start`` values are in ascending order.
        t: Target time, in the same unit as ``WordTimestamp.start``.
        from_idx: Index at which to start searching.

    Returns:
        The matching index, clamped to the last word when every remaining
        word starts before t. When there is nothing left to search (empty
        list, or from_idx already past the last word) from_idx is returned
        unchanged.
    """
    last_idx = len(whisper_words) - 1
    if from_idx > last_idx:
        # Covers the empty list (last_idx == -1) and an exhausted cursor;
        # clamping here would yield -1 or step the cursor backwards.
        return from_idx
    # Only the tail is searched, so only the tail's start times are collected.
    starts = [w.start for w in whisper_words[from_idx:]]
    idx = from_idx + bisect.bisect_left(starts, t)
    return min(idx, last_idx)
def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
"""Replace VTT cue timings with Whisper-accurate timestamps where possible.
@ -97,23 +107,23 @@ def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
for cue in cues:
tokens = _tokenise(cue.text)
if not tokens:
# Sound-effect or music cue — nothing to align
continue
match = _find_match(tokens, whisper_words, cursor)
if match is None:
# Advance cursor to the Whisper word closest to this cue's start time
# so subsequent cues don't search from a stale position.
cursor = _cursor_for_time(whisper_words, cue.start_time, cursor)
continue
new_start = whisper_words[match.first_word_idx].start
new_end = whisper_words[match.last_word_idx].end
# Sanity: don't create zero-duration or backwards cues
if new_end > new_start:
cue.start_time = new_start
cue.end_time = new_end
aligned += 1
# Advance cursor to just past the last matched word
cursor = match.last_word_idx + 1
logger.info(

View file

@ -146,8 +146,11 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
if source_has_ad:
return (
"SOURCE AUDIO DESCRIPTION NOTICE: This video already has professional audio descriptions "
"embedded in its audio track. Return an empty audio_description_vtt containing only "
"the WEBVTT header (\"WEBVTT\\n\") — do NOT generate new audio descriptions."
"embedded in its audio track. "
"1) Return an empty audio_description_vtt containing only the WEBVTT header (\"WEBVTT\\n\") — do NOT generate new audio descriptions. "
"2) For captions_vtt: transcribe ONLY the original program dialogue and relevant sound effects. "
"Do NOT caption the audio description narration — AD narration is spoken during natural pauses "
"and describes visual scenes rather than being part of the original dialogue."
)
return ""
@ -891,6 +894,10 @@ JSON:
_tgt_label = locale_lib.get_gemini_label(target_language)
_glossary_section = self._build_glossary_block(glossary_block)
_glossary_line = f"\n\n{_glossary_section}" if _glossary_section else ""
_glossary_req = (
"\n- MUST use the exact approved terms from the glossary below — these override natural translation choices, even for English terms"
if _glossary_section else ""
)
_adapt_line = _style_instruction.format(tgt=_tgt_label) if style == "transcreate" else ""
prompt = f"""Translate the following {cue_count} numbered text segments from {_src_label} to {_tgt_label}.
@ -899,7 +906,7 @@ REQUIREMENTS:
- Format: "1. translated text", "2. translated text", etc.
- Preserve speaker labels like [Speaker 1]: unchanged
- {_adapt_line}Use natural, idiomatic {_tgt_label}
- Do NOT add any explanation, preamble, or extra lines{extra_instruction}{_glossary_line}
- Do NOT add any explanation, preamble, or extra lines{extra_instruction}{_glossary_req}{_glossary_line}
Segments to translate:
{numbered_texts}"""

View file

@ -169,11 +169,17 @@ async def ingest_and_ai_task_impl(job_id: str):
user_external_id=_cost_ctx["user_id"],
project_id=_cost_ctx["project_id"],
)
# Load glossary for source language — use brand context as vocabulary hint
from ..services.glossary_service import get_glossary_block_for_job
_source_lang = job_doc.get("source", {}).get("language", "en")
_job_for_glossary = {**job_doc, "_glossary_source_text": brand_context or ""}
glossary_block = await get_glossary_block_for_job(_job_for_glossary, _source_lang, db)
ai_result = await gemini_service.extract_accessibility(
temp_path,
brand_context=brand_context,
sdh_requested=sdh_requested,
source_has_ad=source_has_ad,
glossary_block=glossary_block,
_cost_ctx=_cost_ctx,
)
# Enforce: if source already has AD, discard any AI-generated AD

View file

@ -135,6 +135,15 @@ async def _async_render_accessible_video(job_id: str, language: str):
if not lang_output:
raise ValueError(f"No outputs found for language {language}")
# When source already has professional AD, render captions-only accessible video
source_has_ad = job_doc.get("source", {}).get("source_has_ad", False)
if source_has_ad:
await _render_source_has_ad_video(
job_id, job_doc, language, lang_output,
source_video_path, temp_dir, db, job_title
)
return
# 3. Download AD VTT content
ad_vtt_gcs = lang_output.get("ad_vtt_gcs")
if not ad_vtt_gcs:
@ -367,6 +376,83 @@ async def _async_render_accessible_video(job_id: str, language: str):
client.close()
async def _render_source_has_ad_video(
    job_id: str,
    job_doc: dict,
    language: str,
    lang_output: dict,
    source_video_path: str,
    temp_dir: str,
    db,
    job_title: str,
) -> None:
    """Render the accessible video for a job whose source already has professional AD.

    Embeds the captions VTT as a soft subtitle track — no AD audio injection
    is needed since the original audio track already contains the AD narration.
    The result is uploaded to GCS, recorded on the job document, announced via
    a status broadcast, and the overall completion check is triggered.

    Args:
        job_id: Identifier of the job document to update.
        job_doc: The job document; used for the output path and for the
            previously recorded ``started_at`` progress timestamp.
        language: Language code of the output being rendered.
        lang_output: The job's outputs entry for ``language``; must contain
            ``captions_vtt_gcs``.
        source_video_path: Local path of the source video.
        temp_dir: Scratch directory for the captions file and rendered video.
        db: Database handle exposing ``jobs.update_one``.
        job_title: Title passed through to the status broadcast.

    Raises:
        ValueError: If no captions VTT is recorded for ``language``.
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    captions_vtt_gcs = lang_output.get("captions_vtt_gcs")
    if not captions_vtt_gcs:
        raise ValueError(f"No captions VTT found for language {language}")

    # Download captions VTT
    captions_blob_path = captions_vtt_gcs.replace(f"gs://{settings.gcs_bucket}/", "")
    captions_vtt_content = gcs_service.bucket.blob(captions_blob_path).download_as_text()

    # Write VTT to temp file
    vtt_path = os.path.join(temp_dir, "captions.vtt")
    with open(vtt_path, "w", encoding="utf-8") as f:
        f.write(captions_vtt_content)

    # Embed captions as a soft subtitle track. Audio and video are
    # stream-copied (no re-encode); only the small text track is converted.
    output_video_path = os.path.join(temp_dir, "accessible_video.mp4")
    cmd = [
        "ffmpeg", "-y",
        "-i", source_video_path,
        "-i", vtt_path,
        # Map explicitly: with default stream selection a subtitle stream
        # already present in the source could be chosen instead of the
        # captions file, and only one audio stream would be kept.
        "-map", "0:v",
        "-map", "0:a?",
        "-map", "1:0",
        "-c", "copy",
        # The MP4 muxer cannot store WebVTT; text subtitles in MP4 must be
        # written as mov_text (tx3g).
        "-c:s", "mov_text",
        # NOTE(review): MP4 language tags are ISO 639-2 (three-letter) codes;
        # confirm what form `language` takes here.
        "-metadata:s:s:0", f"language={language}",
        output_video_path,
    ]
    # NOTE(review): subprocess.run and the GCS calls are blocking inside an
    # async function — confirm this coroutine does not share an event loop
    # with latency-sensitive work.
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        raise RuntimeError(f"ffmpeg caption embed failed: {result.stderr[-500:]}")

    # Upload rendered video
    video_blob_path = gcs_path(job_doc, language, "accessible_video.mp4")
    video_blob = gcs_service.bucket.blob(video_blob_path)
    video_blob.content_type = "video/mp4"
    video_blob.upload_from_filename(output_video_path)
    video_gcs_uri = f"gs://{settings.gcs_bucket}/{video_blob_path}"
    logger.info(f"Uploaded source-has-ad accessible video to {video_gcs_uri}")

    # Update job document; one timestamp keeps completed_at and updated_at equal.
    now = datetime.utcnow()
    started_at = job_doc.get("accessible_video_progress", {}).get(language, {}).get("started_at")
    await db.jobs.update_one(
        {"_id": job_id},
        {
            "$set": {
                f"outputs.{language}.accessible_video_gcs": video_gcs_uri,
                f"outputs.{language}.accessible_video_method": "caption_embed",
                f"accessible_video_progress.{language}": {
                    "status": "completed",
                    "method": "caption_embed",
                    "started_at": started_at,
                    "completed_at": now,
                },
                "updated_at": now,
            }
        },
    )

    broadcast_status_update(
        job_id,
        "asset_ready",
        job_title=job_title,
        message=f"Accessible video ready for {language.upper()} (caption embed)",
    )
    await _check_accessible_video_completion(job_id, db)
def _build_placements_from_ad_vtt(ad_vtt_content: str, cue_durations: list[float]) -> list[dict]:
"""
Build placement instructions from AD VTT cues and TTS durations.

View file

@ -189,6 +189,7 @@ async def _async_translate_and_synthesize(job_id: str, languages: list[str] | No
updated_outputs = job_doc.get("outputs", {})
_source_text_for_glossary = " ".join(filter(None, [source_captions_vtt, source_ad_vtt]))
_failed_languages: list[str] = []
try:
target_languages = [lang for lang in requested_languages if lang != source_language]
@ -268,7 +269,8 @@ async def _async_translate_and_synthesize(job_id: str, languages: list[str] | No
logger.info(f"Processed language: {language} (origin: {origin})")
except Exception as e:
logger.error(f"Failed to process language {language}: {e}")
logger.error(f"Failed to process language {language}: {e}", exc_info=True)
_failed_languages.append(language)
# Preserve existing GCS URIs and origin so retranslation failure
# doesn't destroy captions the user can still view
existing = updated_outputs.get(language, {})
@ -288,14 +290,18 @@ async def _async_translate_and_synthesize(job_id: str, languages: list[str] | No
for lang in target_languages
if lang in updated_outputs
}
_status_update: dict = {
"status": JobStatus.TTS_GENERATING.value,
"updated_at": datetime.utcnow(),
**per_lang_updates,
}
if _failed_languages:
_status_update["translation_errors"] = _failed_languages
logger.warning(f"Job {job_id}: translation failed for languages: {_failed_languages}")
await db.jobs.update_one(
{"_id": job_id},
{
"$set": {
"status": JobStatus.TTS_GENERATING.value,
"updated_at": datetime.utcnow(),
**per_lang_updates,
},
"$set": _status_update,
"$push": {
"review.history": {
"at": datetime.utcnow(),