fix: 7 caption/AD quality bugs + retranslation error handling
Bug fixes:
- Bug 1a: source_has_ad flag prevents AI generating AD over existing professional AD; JobBrief/Job models, gemini service prompt conditional, NewBrief UI checkbox
- Bug 1b: disable native textTracks on video element to prevent double captions
- Bug 2: caption ALL audible speech including off-screen narrators (prompt fix)
- Bug 3: DCMP §6.01 disfluency removal for EN/ES/FR/DE/IT (prompt + post-pass)
- Bug 4: VTT cue settings (line:0%, position:) preserved through parser round-trip
- Bug 5: Whisper word-level timestamp alignment via new caption_aligner service
- Bug 6: assert_cue_alignment used .start/.end; renamed to .start_time/.end_time
- New migration: backfill source_has_ad=False on existing jobs and job_briefs

Also fix retranslation error handling to preserve existing GCS URIs on failure so video_native captions remain accessible if retranslation fails.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
00dd1643f5
commit
290d5e32e6
13 changed files with 325 additions and 16 deletions
|
|
@ -8,6 +8,7 @@ class VTTCue:
|
|||
end_time: float # seconds
|
||||
text: str
|
||||
identifier: str | None = None
|
||||
settings: str = ""
|
||||
|
||||
|
||||
class VTTParser:
|
||||
|
|
@ -37,10 +38,11 @@ class VTTParser:
|
|||
|
||||
# Parse timing line
|
||||
if " --> " in line:
|
||||
timing_match = re.match(r'([\d:.,]+)\s+-->\s+([\d:.,]+)', line)
|
||||
timing_match = re.match(r'([\d:.,]+)\s+-->\s+([\d:.,]+)\s*(.*)', line)
|
||||
if timing_match:
|
||||
start_time = VTTParser._parse_timestamp(timing_match.group(1))
|
||||
end_time = VTTParser._parse_timestamp(timing_match.group(2))
|
||||
settings = timing_match.group(3).strip()
|
||||
|
||||
# Collect text lines until empty line or next cue
|
||||
i += 1
|
||||
|
|
@ -53,7 +55,8 @@ class VTTParser:
|
|||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
text="\n".join(text_lines),
|
||||
identifier=identifier
|
||||
identifier=identifier,
|
||||
settings=settings,
|
||||
))
|
||||
else:
|
||||
i += 1
|
||||
|
|
@ -70,10 +73,13 @@ class VTTParser:
|
|||
if cue.identifier:
|
||||
lines.append(cue.identifier)
|
||||
|
||||
# Add timing line
|
||||
# Add timing line (preserve cue settings like line:0%)
|
||||
start_timestamp = VTTParser._format_timestamp(cue.start_time)
|
||||
end_timestamp = VTTParser._format_timestamp(cue.end_time)
|
||||
lines.append(f"{start_timestamp} --> {end_timestamp}")
|
||||
timing_line = f"{start_timestamp} --> {end_timestamp}"
|
||||
if cue.settings:
|
||||
timing_line += f" {cue.settings}"
|
||||
lines.append(timing_line)
|
||||
|
||||
# Add text (can be multi-line)
|
||||
lines.append(cue.text)
|
||||
|
|
@ -156,11 +162,11 @@ class VTTEditor:
|
|||
raise ValueError(
|
||||
f"Cue count mismatch for {lang}: EN has {len(en_cues)}, target has {len(tgt_cues)}"
|
||||
)
|
||||
for i, (en, tgt) in enumerate(zip(en_cues, tgt_cues)):
|
||||
if en.start != tgt.start or en.end != tgt.end:
|
||||
for i, (en, tgt) in enumerate(zip(en_cues, tgt_cues, strict=True)):
|
||||
if en.start_time != tgt.start_time or en.end_time != tgt.end_time:
|
||||
raise ValueError(
|
||||
f"Timestamp mismatch for {lang} cue {i}: "
|
||||
f"EN {en.start}-->{en.end}, target {tgt.start}-->{tgt.end}"
|
||||
f"EN {en.start_time}-->{en.end_time}, target {tgt.start_time}-->{tgt.end_time}"
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
|
|
@ -236,7 +242,7 @@ class VTTEditor:
|
|||
)
|
||||
return False, errors
|
||||
|
||||
for i, (src, tgt) in enumerate(zip(source_cues, translated_cues)):
|
||||
for i, (src, tgt) in enumerate(zip(source_cues, translated_cues, strict=False)):
|
||||
if abs(src.start_time - tgt.start_time) > 0.001:
|
||||
errors.append(
|
||||
f"Cue {i + 1}: start time changed "
|
||||
|
|
@ -266,3 +272,28 @@ class VTTEditor:
|
|||
|
||||
return VTTParser.build(cues)
|
||||
|
||||
# DCMP §6.01 filler patterns per language (whole-word, case-insensitive).
# Only non-lexical hesitation sounds are listed. Real words that merely *can*
# act as fillers ("like", "you know", "donc", "halt", "este", "tipo", ...) are
# deliberately excluded: a regex cannot tell filler use from meaningful use, so
# stripping them would corrupt captions such as "I like pizza". Those cases are
# left to the transcription prompt, which asks for removal only "as filler".
_FILLER_PATTERNS: dict[str, str] = {
    "en": r'\b(um+|uh+|ah+|er+|hmm+)\b',
    "es": r'\b(eh+)\b',
    "fr": r'\b(euh+|beh)\b',
    "de": r'\b(äh+|ähm+)\b',
    "it": r'\b(ehm+)\b',
}

@staticmethod
def clean_disfluencies(vtt_content: str, lang: str) -> str:
    """Remove hesitation sounds per DCMP §6.01 for supported languages.

    Args:
        vtt_content: WebVTT document to clean.
        lang: Language tag; only the primary subtag is used ("en-US" -> "en").

    Returns:
        The VTT rebuilt from the cleaned cues, or ``vtt_content`` unchanged when
        the language has no filler pattern. The cue count is always preserved:
        a cue consisting only of fillers keeps its original text.
    """
    pattern = VTTEditor._FILLER_PATTERNS.get((lang or "").split("-")[0].lower())
    if not pattern:
        return vtt_content

    # A filler takes its trailing comma with it ("I, um, think" -> "I, think").
    filler = re.compile(rf'(?:{pattern})(?:[ \t]*,)?', re.IGNORECASE)

    def _clean_line(line: str) -> str:
        # Lines without a filler are returned untouched so that legitimate
        # punctuation (e.g. a trailing comma on a continued sentence) survives.
        if not filler.search(line):
            return line
        cleaned = filler.sub("", line)
        cleaned = re.sub(r'[ \t]{2,}', ' ', cleaned)
        # Drop the gap left in front of "," / "." but leave " ..." alone.
        cleaned = re.sub(r'[ \t]+([,.])(?!\.)', r'\1', cleaned)
        cleaned = re.sub(r',(?:\s*,)+', ',', cleaned)
        cleaned = re.sub(r',\s*([.!?])', r'\1', cleaned)
        cleaned = cleaned.strip().lstrip(',').lstrip()
        if not re.search(r'\w', cleaned):
            # Only punctuation is left ("Um." -> "."): treat as empty.
            return ""
        # Keep sentence case when the removed filler carried the capital.
        if line.lstrip()[:1].isupper() and cleaned[:1].islower():
            cleaned = cleaned[0].upper() + cleaned[1:]
        return cleaned

    cues = VTTParser.parse(vtt_content)
    for cue in cues:
        kept_lines = [
            cleaned
            for cleaned in (_clean_line(raw) for raw in cue.text.split("\n"))
            if cleaned
        ]
        if kept_lines:
            cue.text = "\n".join(kept_lines)
    return VTTParser.build(cues)
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,26 @@
|
|||
"""Backfill source_has_ad=False on existing jobs and job_briefs."""
|
||||
from app.migrations.migrator import Migration
|
||||
|
||||
|
||||
class Migration(Migration):
    """Backfill ``source_has_ad=False`` on existing jobs and job_briefs."""

    version = "2026-05-08-000000"
    description = "Add source_has_ad field to jobs.source and job_briefs"

    async def up(self) -> None:
        """Set the flag to ``False`` on every document that lacks it."""
        database = self.db
        jobs_field = "source.source_has_ad"
        briefs_field = "source_has_ad"

        # Jobs keep the flag nested under ``source``.
        jobs_result = await database.jobs.update_many(
            {jobs_field: {"$exists": False}},
            {"$set": {jobs_field: False}},
        )
        # Job briefs keep it at the top level.
        briefs_result = await database.job_briefs.update_many(
            {briefs_field: {"$exists": False}},
            {"$set": {briefs_field: False}},
        )

        print(f"✅ Backfilled source_has_ad on {jobs_result.modified_count} jobs, {briefs_result.modified_count} job_briefs")

    async def down(self) -> None:
        """Remove the flag from every job and job_brief document."""
        database = self.db
        match_all: dict = {}
        await database.jobs.update_many(match_all, {"$unset": {"source.source_has_ad": ""}})
        await database.job_briefs.update_many(match_all, {"$unset": {"source_has_ad": ""}})
|
||||
|
|
@ -50,6 +50,7 @@ class Source(BaseModel):
|
|||
language: constr(min_length=2, max_length=10) = "en" # Final source language (from detection or explicit)
|
||||
language_hint: str | None = None # User-provided hint for non-English videos
|
||||
detected_language: str | None = None # AI-detected language from Gemini
|
||||
source_has_ad: bool = False # Source video already contains professional audio descriptions
|
||||
|
||||
|
||||
class TTSPreferences(BaseModel):
|
||||
|
|
@ -281,6 +282,7 @@ class JobCreate(BaseModel):
|
|||
language_hint: str | None = None # Optional hint when source_is_english=False
|
||||
requested_outputs: RequestedOutputs
|
||||
brand_context: str | None = None # Comma-separated brand names present in the video (e.g. "Sellotape, Coca-Cola")
|
||||
source_has_ad: bool = False # Source video already contains professional audio descriptions
|
||||
|
||||
|
||||
class JobUpdate(BaseModel):
|
||||
|
|
|
|||
|
|
@ -45,6 +45,7 @@ class JobBriefCreate(BaseModel):
|
|||
deadline: datetime | None = None
|
||||
project_id: str | None = None
|
||||
assignee_id: str | None = None
|
||||
source_has_ad: bool = False # Source video already contains professional audio descriptions
|
||||
|
||||
|
||||
class JobBriefUpdate(BaseModel):
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ You are given a video. Return a JSON object with:
|
|||
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (in the detected language)
|
||||
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (MUST be written in the detected language)
|
||||
{SDH_FIELD}
|
||||
{SOURCE_HAS_AD}
|
||||
|
||||
CRITICAL LANGUAGE REQUIREMENT:
|
||||
- First, detect the language spoken in the video
|
||||
|
|
@ -36,7 +37,7 @@ CRITICAL TIMING REQUIREMENTS:
|
|||
- Each caption cue should end exactly when the speaker finishes that phrase/sentence
|
||||
- Listen carefully to detect natural speech pauses and word boundaries
|
||||
- Avoid starting captions too early or ending them too late
|
||||
- Ensure captions align with lip movement and speech rhythm
|
||||
- Caption ALL audible speech — include off-screen narrators, voiceover, and any speaker not visible on screen. Do NOT omit speech because the speaker is not visible or because it plays over non-dialogue segments.
|
||||
- For audio descriptions, time them during natural speech gaps or over non-dialogue audio
|
||||
- Validate that all timestamps are monotonically increasing (each cue starts after the previous one ends)
|
||||
|
||||
|
|
@ -57,6 +58,13 @@ CAPTION FORMATTING (DCMP standard):
|
|||
- Minimum caption duration: approximately 1.3 seconds. Maximum: 6 seconds
|
||||
- Use mixed case. Use ALL CAPS only for screaming or shouting
|
||||
|
||||
DISFLUENCY REMOVAL (DCMP §6.01):
|
||||
- Do NOT include filler words, false starts, or hesitations in captions
|
||||
- Remove: "um", "uh", "ah", "er", "hmm", "like" (as filler), "you know" (as filler), "I mean" (as filler)
|
||||
- Also remove language-specific fillers (e.g., "euh"/"beh" in French, "äh"/"ähm" in German, "eh"/"este" in Spanish, "ehm"/"allora" in Italian)
|
||||
- Remove false starts when the speaker self-corrects immediately (e.g., "I was — I went to the store" → "I went to the store")
|
||||
- Do NOT remove meaningful repetition, emphasis, or intentional stylistic choices
|
||||
|
||||
SOUND AND MUSIC FORMATTING (DCMP standard):
|
||||
- Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching]
|
||||
- Use present participle for sustained sounds: [dog barking]; use third person for abrupt sounds: [dog barks]
|
||||
|
|
|
|||
|
|
@ -10,6 +10,7 @@ You are given a video. Return a JSON object with:
|
|||
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (written in {TARGET_LANGUAGE})
|
||||
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (written in {TARGET_LANGUAGE})
|
||||
{SDH_FIELD}
|
||||
{SOURCE_HAS_AD}
|
||||
|
||||
TARGET LANGUAGE: {TARGET_LANGUAGE}
|
||||
|
||||
|
|
@ -40,7 +41,7 @@ CRITICAL TIMING REQUIREMENTS:
|
|||
- Each caption cue should end exactly when the speaker finishes that phrase/sentence
|
||||
- Listen carefully to detect natural speech pauses and word boundaries
|
||||
- Avoid starting captions too early or ending them too late
|
||||
- Ensure captions align with lip movement and speech rhythm
|
||||
- Caption ALL audible speech — include off-screen narrators, voiceover, and any speaker not visible on screen. Do NOT omit speech because the speaker is not visible or because it plays over non-dialogue segments.
|
||||
- For audio descriptions, time them during natural speech gaps or over non-dialogue audio
|
||||
- Validate that all timestamps are monotonically increasing (each cue starts after the previous one ends)
|
||||
|
||||
|
|
@ -61,6 +62,13 @@ CAPTION FORMATTING (DCMP standard):
|
|||
- Minimum caption duration: approximately 1.3 seconds. Maximum: 6 seconds
|
||||
- Use mixed case. Use ALL CAPS only for screaming or shouting
|
||||
|
||||
DISFLUENCY REMOVAL (DCMP §6.01):
|
||||
- Do NOT include filler words, false starts, or hesitations in captions
|
||||
- Remove: "um", "uh", "ah", "er", "hmm", "like" (as filler), "you know" (as filler), "I mean" (as filler)
|
||||
- Also remove language-specific fillers (e.g., "euh"/"beh" in French, "äh"/"ähm" in German, "eh"/"este" in Spanish, "ehm"/"allora" in Italian)
|
||||
- Remove false starts when the speaker self-corrects immediately (e.g., "I was — I went to the store" → "I went to the store")
|
||||
- Do NOT remove meaningful repetition, emphasis, or intentional stylistic choices
|
||||
|
||||
SOUND AND MUSIC FORMATTING (DCMP standard):
|
||||
- Sound effects: lowercase in square brackets — e.g., [door slams], [footsteps approaching]
|
||||
- Use present participle for sustained sounds: [dog barking]; use third person for abrupt sounds: [dog barks]
|
||||
|
|
|
|||
125
backend/app/services/caption_aligner.py
Normal file
125
backend/app/services/caption_aligner.py
Normal file
|
|
@ -0,0 +1,125 @@
|
|||
"""Align Gemini caption VTT timings against Whisper word-level timestamps.
|
||||
|
||||
Algorithm:
|
||||
For each VTT cue, tokenise its text and search for the token sequence in the
|
||||
Whisper word stream starting from the cursor position (with a look-ahead window).
|
||||
When a match of sufficient confidence is found the cue's start/end timestamps
|
||||
are replaced with the matched Whisper words' start/end. Cues that cannot be
|
||||
matched (music notation, sound effects, empty cues) keep their original Gemini
|
||||
timestamps. The result has Whisper-accurate timings early in the video and
|
||||
graceful fallbacks where Whisper didn't capture the audio.
|
||||
"""
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
from ..core.logging import get_logger
|
||||
from ..lib.vtt import VTTEditor, VTTParser
|
||||
from ..services.whisper_service import WordTimestamp
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
# Characters to strip when comparing tokens
|
||||
_PUNCT = re.compile(r"[^\w']", re.UNICODE)
|
||||
# Tokens shorter than this are considered stop-words and excluded from matching
|
||||
_MIN_TOKEN_LEN = 2
|
||||
# Minimum fraction of cue tokens that must match Whisper words for alignment
|
||||
_MIN_MATCH_RATIO = 0.5
|
||||
# How many Whisper words ahead of the cursor to search for a cue's tokens
|
||||
_SEARCH_WINDOW = 60
|
||||
|
||||
|
||||
def _tokenise(text: str) -> list[str]:
|
||||
"""Lower-case, strip punctuation, drop short tokens."""
|
||||
return [
|
||||
t for t in (_PUNCT.sub("", w).lower() for w in text.split())
|
||||
if len(t) >= _MIN_TOKEN_LEN
|
||||
]
|
||||
|
||||
|
||||
@dataclass
|
||||
class _Match:
|
||||
first_word_idx: int
|
||||
last_word_idx: int
|
||||
ratio: float # matched_tokens / cue_tokens
|
||||
|
||||
|
||||
def _find_match(
|
||||
cue_tokens: list[str],
|
||||
whisper_words: list[WordTimestamp],
|
||||
cursor: int,
|
||||
) -> _Match | None:
|
||||
"""Return the best match for cue_tokens starting at cursor ± SEARCH_WINDOW."""
|
||||
if not cue_tokens:
|
||||
return None
|
||||
|
||||
best: _Match | None = None
|
||||
end = min(cursor + _SEARCH_WINDOW, len(whisper_words))
|
||||
|
||||
for start_idx in range(cursor, end):
|
||||
matched = 0
|
||||
last_idx = start_idx
|
||||
token_pos = 0
|
||||
|
||||
for w_idx in range(start_idx, end):
|
||||
if token_pos >= len(cue_tokens):
|
||||
break
|
||||
w_tok = _PUNCT.sub("", whisper_words[w_idx].word).lower()
|
||||
if w_tok == cue_tokens[token_pos]:
|
||||
matched += 1
|
||||
last_idx = w_idx
|
||||
token_pos += 1
|
||||
|
||||
ratio = matched / len(cue_tokens)
|
||||
if ratio >= _MIN_MATCH_RATIO:
|
||||
if best is None or ratio > best.ratio:
|
||||
best = _Match(start_idx, last_idx, ratio)
|
||||
if ratio == 1.0:
|
||||
break # perfect match — no need to search further
|
||||
|
||||
return best
|
||||
|
||||
|
||||
def align(captions_vtt: str, whisper_words: list[WordTimestamp]) -> str:
    """Replace VTT cue timings with Whisper-accurate timestamps where possible.

    Args:
        captions_vtt: Caption WebVTT document.
        whisper_words: Whisper word-level timestamps.

    Returns:
        A VTT string with the same cue count as the input, with improved
        timing accuracy on cues that could be matched to Whisper word output.
        The input is returned unchanged when there are no Whisper words or no
        cue could be aligned.
    """
    if not whisper_words:
        logger.warning("caption_aligner: no Whisper words supplied — returning original VTT")
        return captions_vtt

    cues = VTTParser.parse(captions_vtt)
    cursor = 0
    aligned = 0
    prev_end = 0.0

    for cue in cues:
        tokens = _tokenise(cue.text)
        # Sound-effect or music cues have no tokens — nothing to align.
        match = _find_match(tokens, whisper_words, cursor) if tokens else None

        if match is not None:
            # Never start before the previous cue ends; cues must stay in order.
            new_start = max(whisper_words[match.first_word_idx].start, prev_end)
            new_end = whisper_words[match.last_word_idx].end

            # Sanity: don't create zero-duration or backwards cues
            if new_end > new_start:
                cue.start_time = new_start
                cue.end_time = new_end
                aligned += 1

            # Advance cursor to just past the last matched word
            cursor = match.last_word_idx + 1

        prev_end = cue.end_time

    logger.info(
        f"caption_aligner: aligned {aligned}/{len(cues)} cues "
        f"against {len(whisper_words)} Whisper words"
    )

    if aligned == 0:
        # Nothing changed: hand back the input rather than a re-serialised copy.
        return captions_vtt
    return VTTParser.build(cues)
|
||||
|
|
@ -113,6 +113,15 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
|
|||
return glossary_block.strip()
|
||||
return ""
|
||||
|
||||
def _build_source_has_ad_block(self, source_has_ad: bool) -> str:
    """Return the prompt notice that suppresses AD generation, or "" when not needed."""
    if not source_has_ad:
        return ""
    notice_parts = (
        "SOURCE AUDIO DESCRIPTION NOTICE: This video already has professional audio descriptions ",
        "embedded in its audio track. Return an empty audio_description_vtt containing only ",
        "the WEBVTT header (\"WEBVTT\\n\") — do NOT generate new audio descriptions.",
    )
    return "".join(notice_parts)
|
||||
|
||||
def _build_brand_context_block(self, brand_context: str | None) -> str:
|
||||
"""Build the brand context instruction block for injection into prompts."""
|
||||
if brand_context and brand_context.strip():
|
||||
|
|
@ -125,7 +134,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
|
|||
)
|
||||
return "No specific brand names have been provided for this video."
|
||||
|
||||
async def extract_accessibility(self, video_file_path: str, brand_context: str | None = None, sdh_requested: bool = False, glossary_block: str | None = None, _cost_ctx: dict | None = None) -> dict[str, Any]:
|
||||
async def extract_accessibility(self, video_file_path: str, brand_context: str | None = None, sdh_requested: bool = False, glossary_block: str | None = None, source_has_ad: bool = False, _cost_ctx: dict | None = None) -> dict[str, Any]:
|
||||
"""
|
||||
Extract captions and audio descriptions from video using Gemini 2.0
|
||||
Returns structured JSON with transcript, captions VTT, and audio description VTT
|
||||
|
|
@ -137,6 +146,7 @@ Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched w
|
|||
.replace("{GLOSSARY}", self._build_glossary_block(glossary_block))
|
||||
.replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
|
||||
.replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
|
||||
.replace("{SOURCE_HAS_AD}", self._build_source_has_ad_block(source_has_ad))
|
||||
)
|
||||
uploaded_file = None
|
||||
|
||||
|
|
|
|||
|
|
@ -1,20 +1,25 @@
|
|||
import asyncio
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
|
||||
import ffmpeg
|
||||
from celery import Task
|
||||
from celery.result import allow_join_result
|
||||
from motor.motor_asyncio import AsyncIOMotorClient
|
||||
|
||||
from ..core.config import settings
|
||||
from ..core.logging import get_logger
|
||||
from ..lib.vtt import VTTEditor
|
||||
from ..models.job import JobStatus
|
||||
from ..services import cost_tracker
|
||||
from ..services import caption_aligner, cost_tracker
|
||||
from ..services.gcs import gcs_path, gcs_service, upload_vtt_to_gcs
|
||||
from ..services.gemini import gemini_service
|
||||
from ..services.whisper_service import WordTimestamp
|
||||
from . import celery_app
|
||||
from ._websocket_bridge import broadcast_status_update
|
||||
from .whisper_transcribe import transcribe_video_audio_task
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
|
@ -153,6 +158,7 @@ async def ingest_and_ai_task_impl(job_id: str):
|
|||
# Process with Gemini
|
||||
brand_context = job_doc.get("brand_context")
|
||||
sdh_requested = job_doc.get("requested_outputs", {}).get("sdh_vtt", False)
|
||||
source_has_ad = job_doc.get("source", {}).get("source_has_ad", False)
|
||||
_cost_ctx = {
|
||||
"user_id": job_doc.get("client_id", "system"),
|
||||
"job_id": job_id,
|
||||
|
|
@ -167,8 +173,13 @@ async def ingest_and_ai_task_impl(job_id: str):
|
|||
temp_path,
|
||||
brand_context=brand_context,
|
||||
sdh_requested=sdh_requested,
|
||||
source_has_ad=source_has_ad,
|
||||
_cost_ctx=_cost_ctx,
|
||||
)
|
||||
# Enforce: if source already has AD, discard any AI-generated AD
|
||||
if source_has_ad:
|
||||
ai_result["audio_description_vtt"] = "WEBVTT\n"
|
||||
logger.info(f"source_has_ad=True for job {job_id}: skipping AD generation")
|
||||
|
||||
# Final safety check for required fields
|
||||
required_fields = ["captions_vtt", "audio_description_vtt"]
|
||||
|
|
@ -202,6 +213,13 @@ async def ingest_and_ai_task_impl(job_id: str):
|
|||
source_language = detected_language
|
||||
logger.info(f"Using detected language '{source_language}' for job {job_id}")
|
||||
|
||||
# Post-process: remove filler words per DCMP §6.01
|
||||
captions_vtt = VTTEditor.clean_disfluencies(ai_result["captions_vtt"], source_language)
|
||||
|
||||
# Align caption timings with Whisper word-level timestamps (Bug 5)
|
||||
captions_vtt = await _align_captions_with_whisper(captions_vtt, temp_path, job_id)
|
||||
ai_result["captions_vtt"] = captions_vtt
|
||||
|
||||
# Upload VTT files to GCS using detected language
|
||||
captions_gcs_uri = await upload_vtt_to_gcs(
|
||||
ai_result["captions_vtt"],
|
||||
|
|
@ -333,3 +351,47 @@ async def _get_video_duration(video_path: str) -> float:
|
|||
except Exception as e:
|
||||
logger.warning(f"Could not determine video duration: {e}")
|
||||
return 0.0
|
||||
|
||||
|
||||
async def _align_captions_with_whisper(captions_vtt: str, video_path: str, job_id: str) -> str:
    """Align caption VTT timings with Whisper word timestamps.

    Extracts a mono 16 kHz MP3 next to the video, sends it to the Whisper
    Celery task on the "whisper" queue, and feeds the returned words to
    ``caption_aligner.align``.

    Args:
        captions_vtt: Caption WebVTT document to re-time.
        video_path: Path of the source video on local disk.
        job_id: Job identifier, passed to the Whisper task and used in logs.

    Returns:
        The aligned VTT, or ``captions_vtt`` unchanged on timeout or any failure.
    """
    # Build the scratch path from the stem rather than str.replace(".mp4", ...):
    # for a non-.mp4 input, replace() returns the video path itself, so ffmpeg
    # would target its own input and the cleanup below would delete the video.
    audio_path = os.path.splitext(video_path)[0] + "_captions_align.mp3"
    try:
        # Extract audio at 16kHz mono (optimal for Whisper)
        def _extract() -> None:
            result = subprocess.run(
                ["ffmpeg", "-y", "-i", video_path, "-vn", "-acodec", "libmp3lame",
                 "-ar", "16000", "-ac", "1", "-q:a", "5", audio_path],
                capture_output=True, text=True
            )
            if result.returncode != 0:
                raise RuntimeError(f"FFmpeg failed: {result.stderr}")

        await asyncio.to_thread(_extract)

        task_result = transcribe_video_audio_task.apply_async(
            args=[job_id, audio_path], queue="whisper"
        )
        poll_count = 0
        while not task_result.ready():
            await asyncio.sleep(1.0)
            poll_count += 1
            if poll_count > 600:
                logger.warning(f"Whisper timeout for job {job_id}, skipping alignment")
                # The audio file is removed below; stop a still-queued task
                # from starting against a file that no longer exists.
                task_result.revoke()
                return captions_vtt

        with allow_join_result():
            result_data = task_result.get(timeout=10)

        words = [
            WordTimestamp(word=w["word"], start=w["start"], end=w["end"])
            for w in (result_data or {}).get("words", [])
        ]
        return caption_aligner.align(captions_vtt, words)

    except Exception as e:
        # Best effort: alignment is an enhancement, so fall back to the input.
        logger.warning(f"Whisper caption alignment failed for job {job_id}: {e} — using Gemini timestamps")
        return captions_vtt
    finally:
        if os.path.exists(audio_path):
            os.unlink(audio_path)
|
||||
|
|
|
|||
|
|
@ -269,9 +269,12 @@ async def _async_translate_and_synthesize(job_id: str, languages: list[str] | No
|
|||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process language {language}: {e}")
|
||||
# Preserve existing GCS URIs and origin so retranslation failure
|
||||
# doesn't destroy captions the user can still view
|
||||
existing = updated_outputs.get(language, {})
|
||||
updated_outputs[language] = {
|
||||
"origin": "transcreate" if _style == "transcreate" else "gemini_translate",
|
||||
"qa_notes": f"Translation failed: {str(e)}",
|
||||
**existing,
|
||||
"qa_notes": f"Translation failed: {str(e)[:200]}",
|
||||
}
|
||||
|
||||
finally:
|
||||
|
|
|
|||
|
|
@ -53,6 +53,20 @@ export function VideoReviewPlayer({ job, downloads }: VideoReviewPlayerProps) {
|
|||
}
|
||||
}, [assetTabs, activeTabKey]);
|
||||
|
||||
// Disable browser-native text tracks so they don't compete with our React overlay
|
||||
useEffect(() => {
|
||||
const video = videoRef.current;
|
||||
if (!video) return;
|
||||
const disableTracks = () => {
|
||||
for (let i = 0; i < video.textTracks.length; i++) {
|
||||
video.textTracks[i].mode = 'disabled';
|
||||
}
|
||||
};
|
||||
disableTracks();
|
||||
video.addEventListener('loadedmetadata', disableTracks);
|
||||
return () => video.removeEventListener('loadedmetadata', disableTracks);
|
||||
}, [videoRef.current]);
|
||||
|
||||
// Get current tab
|
||||
const activeTab = assetTabs.find((t) => t.key === activeTabKey);
|
||||
|
||||
|
|
@ -305,9 +319,9 @@ export function VideoReviewPlayer({ job, downloads }: VideoReviewPlayerProps) {
|
|||
</div>
|
||||
)}
|
||||
|
||||
{/* Caption Overlay — always at the bottom, above native controls */}
|
||||
{/* Caption Overlay — position at top when cue has line:0% setting */}
|
||||
{showCaptions && currentCaption && (
|
||||
<div className="absolute bottom-14 left-1/2 transform -translate-x-1/2 bg-black bg-opacity-80 text-white px-4 py-2 rounded max-w-[90%]">
|
||||
<div className={`absolute ${currentCaption.positionTop ? 'top-4' : 'bottom-14'} left-1/2 transform -translate-x-1/2 bg-black bg-opacity-80 text-white px-4 py-2 rounded max-w-[90%]`}>
|
||||
<div className="text-center whitespace-pre-wrap">
|
||||
{currentCaption.text}
|
||||
</div>
|
||||
|
|
|
|||
|
|
@ -77,6 +77,7 @@ export function NewBrief() {
|
|||
const [accessibleMethod, setAccessibleMethod] = useState<'overlay' | 'pause_insert'>('pause_insert');
|
||||
const [sdhVtt, setSdhVtt] = useState(false);
|
||||
const [descriptiveTranscript, setDescriptiveTranscript] = useState(false);
|
||||
const [sourceHasAd, setSourceHasAd] = useState(false);
|
||||
|
||||
const { data: projects = [] } = useAllProjects();
|
||||
const { data: assignees = [] } = useBriefAssignees();
|
||||
|
|
@ -113,6 +114,7 @@ export function NewBrief() {
|
|||
deadline: deadline || undefined,
|
||||
project_id: projectId || undefined,
|
||||
assignee_id: assigneeId || undefined,
|
||||
source_has_ad: sourceHasAd,
|
||||
});
|
||||
toast.toastOnly.success('Brief created');
|
||||
navigate(`/briefs/${brief.id}`);
|
||||
|
|
@ -240,6 +242,22 @@ export function NewBrief() {
|
|||
</div>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<label className="block text-sm font-medium text-gray-700 mb-2">Source Video</label>
|
||||
<label className="flex items-start gap-2 text-sm text-gray-700 cursor-pointer">
|
||||
<input
|
||||
type="checkbox"
|
||||
checked={sourceHasAd}
|
||||
onChange={e => setSourceHasAd(e.target.checked)}
|
||||
className="rounded mt-0.5 flex-shrink-0"
|
||||
/>
|
||||
<span>
|
||||
<span className="font-medium">Source video already contains audio descriptions</span>
|
||||
<span className="text-gray-400 ml-1">— AI will not generate new AD for this job</span>
|
||||
</span>
|
||||
</label>
|
||||
</div>
|
||||
|
||||
<div>
|
||||
<label className="block text-sm font-medium text-gray-700 mb-2">
|
||||
Languages
|
||||
|
|
|
|||
|
|
@ -880,4 +880,5 @@ export interface JobBriefCreate {
|
|||
deadline?: string;
|
||||
project_id?: string;
|
||||
assignee_id?: string;
|
||||
source_has_ad?: boolean;
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue