From c413fcb747baf39f70495b7a5ebdd6e79b438099 Mon Sep 17 00:00:00 2001 From: Vadym Samoilenko Date: Wed, 18 Mar 2026 15:02:18 +0000 Subject: [PATCH] feat: add SDH (Subtitles for Deaf and Hard of Hearing) caption output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SDH captions extend standard VTT with speaker identification labels, sound effects [PHONE RINGS], music notation ♪, and off-screen indicators. - Add sdh_vtt flag to RequestedOutputs model and frontend form - Add sdh_captions_vtt_gcs field to LangOutput model - Inject SDH generation instructions into both Gemini prompts via {SDH_FIELD} and {SDH_GUIDELINES} placeholders when requested - Upload sdh_captions.vtt to GCS in ingest task - Pass SDH through video_native translation (Gemini generates it directly) and traditional translation (translate source SDH VTT via Gemini) - Expose sdh_captions_vtt in downloads endpoint and bulk zip export Co-Authored-By: Claude Sonnet 4.6 --- backend/app/api/v1/routes_jobs.py | 9 ++++ backend/app/models/job.py | 2 + backend/app/prompts/gemini_ingestion.md | 3 ++ .../app/prompts/gemini_ingestion_targeted.md | 3 ++ backend/app/services/gemini.py | 38 +++++++++++++-- backend/app/services/zip_download.py | 1 + backend/app/tasks/ingest_and_ai.py | 27 +++++++++-- backend/app/tasks/translate_and_synthesize.py | 48 ++++++++++++++++--- frontend/src/routes/jobs/NewJob.tsx | 13 +++++ frontend/src/types/api.ts | 1 + 10 files changed, 128 insertions(+), 17 deletions(-) diff --git a/backend/app/api/v1/routes_jobs.py b/backend/app/api/v1/routes_jobs.py index dce4018..fd93523 100644 --- a/backend/app/api/v1/routes_jobs.py +++ b/backend/app/api/v1/routes_jobs.py @@ -1073,6 +1073,15 @@ async def get_job_downloads( except Exception as e: logger.warning(f"Failed to generate signed URL for accessible video {language}: {e}") + # SDH Captions VTT + if "sdh_captions_vtt_gcs" in lang_output: + blob_path = lang_output["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "") + try: + signed_url = await get_signed_download_url(blob_path, 24) + lang_downloads["sdh_captions_vtt"] = signed_url + except Exception as e: + logger.warning(f"Failed to generate signed URL for SDH captions {language}: {e}") + # Re-timed Captions VTT (for pause-insert accessible videos) if "retimed_captions_vtt_gcs" in lang_output: blob_path = lang_output["retimed_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "") diff --git a/backend/app/models/job.py b/backend/app/models/job.py index 8431b1a..822e6ed 100644 --- a/backend/app/models/job.py +++ b/backend/app/models/job.py @@ -62,6 +62,7 @@ class RequestedOutputs(BaseModel): audio_description_mp3: bool = True accessible_video_mp4: bool = False # Rendered video with embedded audio descriptions accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None # User-selected method + sdh_vtt: bool = False # SDH (Subtitles for Deaf and Hard of Hearing) captions with speaker labels, sound effects, music notation languages: list[str] = [] transcreation: list[str] = [] tts_preferences: Optional[TTSPreferences] = None @@ -109,6 +110,7 @@ class AccessibleVideoEditState(BaseModel): class LangOutput(BaseModel): captions_vtt_gcs: Optional[str] = None + sdh_captions_vtt_gcs: Optional[str] = None # SDH-format captions (speaker labels, sound effects, music) ad_vtt_gcs: Optional[str] = None ad_mp3_gcs: Optional[str] = None # Accessible video outputs diff --git a/backend/app/prompts/gemini_ingestion.md b/backend/app/prompts/gemini_ingestion.md index 3fe62bf..4d504c0 100644 --- a/backend/app/prompts/gemini_ingestion.md +++ b/backend/app/prompts/gemini_ingestion.md @@ -9,6 +9,7 @@ You are given a video. Return a JSON object with: - transcript_plaintext: full spoken words, punctuated (in the detected language) - captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (in the detected language) - audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (MUST be written in the detected language) +{SDH_FIELD} CRITICAL LANGUAGE REQUIREMENT: - First, detect the language spoken in the video @@ -69,6 +70,8 @@ Rules: - Be succinct — omit redundant or self-evident details - Do NOT duplicate information already in the spoken dialogue +{SDH_GUIDELINES} + CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text. Example output format: diff --git a/backend/app/prompts/gemini_ingestion_targeted.md b/backend/app/prompts/gemini_ingestion_targeted.md index 39a47d5..361adf6 100644 --- a/backend/app/prompts/gemini_ingestion_targeted.md +++ b/backend/app/prompts/gemini_ingestion_targeted.md @@ -9,6 +9,7 @@ You are given a video. Return a JSON object with: - transcript_plaintext: full spoken words, punctuated, translated/written in {TARGET_LANGUAGE} - captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (written in {TARGET_LANGUAGE}) - audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (written in {TARGET_LANGUAGE}) +{SDH_FIELD} TARGET LANGUAGE: {TARGET_LANGUAGE} @@ -74,6 +75,8 @@ Rules: - Do NOT duplicate information already in the spoken dialogue - Write all descriptions in natural, fluent {TARGET_LANGUAGE} +{SDH_GUIDELINES} + CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text. Example output format (if TARGET_LANGUAGE were Spanish): diff --git a/backend/app/services/gemini.py b/backend/app/services/gemini.py index 85a85d7..5b1067c 100644 --- a/backend/app/services/gemini.py +++ b/backend/app/services/gemini.py @@ -59,6 +59,24 @@ class GeminiService: logger.error(f"File {file_name} did not become ACTIVE within {max_wait_seconds}s") return False + def _build_sdh_field(self, sdh_requested: bool) -> str: + if sdh_requested: + return "- sdh_captions_vtt: a valid WebVTT file as a single string, containing SDH-format captions (same timing as captions_vtt, but enriched with speaker labels, sound effects, and music notation)" + return "" + + def _build_sdh_guidelines(self, sdh_requested: bool) -> str: + if not sdh_requested: + return "" + return """SDH (SUBTITLES FOR THE DEAF AND HARD OF HEARING) GUIDELINES: +Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched with: +- Speaker identification when multiple speakers are present: use "NAME:" prefix (e.g., "JOHN: Hello there") or "[NARRATOR]" for narration +- Non-speech sounds that are plot-relevant, in square brackets: [DOOR SLAMS], [PHONE RINGS], [CROWD CHEERING], [THUNDER] +- Music: use ♪ for background music cues (e.g., "♪ tense music ♪") or ♪ around sung lyrics +- Off-screen or voice-over speakers: indicate with "(off-screen)" or "[V.O.]" where relevant +- Non-speech vocalisations when relevant: [SIGHS], [LAUGHS], [SCREAMS] +- Maintain the same timestamp format as captions_vtt (HH:MM:SS.mmm --> HH:MM:SS.mmm) +- Only add sound effect cues where they add meaningful context; do not annotate every minor sound""" + def _build_brand_context_block(self, brand_context: Optional[str]) -> str: """Build the brand context instruction block for injection into prompts.""" if brand_context and brand_context.strip(): @@ -71,13 +89,18 @@ class GeminiService: ) return "No specific brand names have been provided for this video." - async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None) -> dict[str, Any]: + async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False) -> dict[str, Any]: """ Extract captions and audio descriptions from video using Gemini 2.0 Returns structured JSON with transcript, captions VTT, and audio description VTT """ prompt_template = self._load_prompt("gemini_ingestion.md") - prompt = prompt_template.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context)) + prompt = ( + prompt_template + .replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context)) + .replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested)) + .replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested)) + ) uploaded_file = None try: @@ -258,7 +281,8 @@ Fix the JSON and return it: self, video_file_path: str, target_language: str, - brand_context: Optional[str] = None + brand_context: Optional[str] = None, + sdh_requested: bool = False ) -> dict[str, Any]: """ Extract captions and audio descriptions from video using Gemini, @@ -279,8 +303,12 @@ Fix the JSON and return it: all in the target language """ prompt_template = self._load_prompt("gemini_ingestion_targeted.md") - prompt = prompt_template.replace("{TARGET_LANGUAGE}", target_language).replace( - "{BRAND_CONTEXT}", self._build_brand_context_block(brand_context) + prompt = ( + prompt_template + .replace("{TARGET_LANGUAGE}", target_language) + .replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context)) + .replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested)) + .replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested)) ) uploaded_file = None diff --git a/backend/app/services/zip_download.py b/backend/app/services/zip_download.py index 9b32e36..fa373e9 100644 --- a/backend/app/services/zip_download.py +++ b/backend/app/services/zip_download.py @@ -43,6 +43,7 @@ def sanitize_filename(name: str, max_length: int = 50) -> str: # Mapping from LangOutput field names to output filenames FILE_TYPE_MAPPING = { "captions_vtt_gcs": "captions.vtt", + "sdh_captions_vtt_gcs": "sdh_captions.vtt", "ad_vtt_gcs": "ad.vtt", "ad_mp3_gcs": "ad.mp3", "accessible_video_gcs": "accessible_video.mp4", diff --git a/backend/app/tasks/ingest_and_ai.py b/backend/app/tasks/ingest_and_ai.py index 2691116..a728eb6 100644 --- a/backend/app/tasks/ingest_and_ai.py +++ b/backend/app/tasks/ingest_and_ai.py @@ -204,7 +204,12 @@ async def ingest_and_ai_task_impl(job_id: str): # Process with Gemini brand_context = job_doc.get("brand_context") - ai_result = await gemini_service.extract_accessibility(temp_path, brand_context=brand_context) + sdh_requested = job_doc.get("requested_outputs", {}).get("sdh_vtt", False) + ai_result = await gemini_service.extract_accessibility( + temp_path, + brand_context=brand_context, + sdh_requested=sdh_requested + ) # Final safety check for required fields required_fields = ["captions_vtt", "audio_description_vtt"] @@ -249,6 +254,21 @@ async def ingest_and_ai_task_impl(job_id: str): f"{job_id}/{source_language}/ad.vtt" ) + # Upload SDH VTT if generated + sdh_gcs_uri = None + if sdh_requested and ai_result.get("sdh_captions_vtt"): + sdh_gcs_uri = await upload_vtt_to_gcs( + ai_result["sdh_captions_vtt"], + f"{job_id}/{source_language}/sdh_captions.vtt" + ) + + source_lang_output = { + "captions_vtt_gcs": captions_gcs_uri, + "ad_vtt_gcs": ad_gcs_uri, + } + if sdh_gcs_uri: + source_lang_output["sdh_captions_vtt_gcs"] = sdh_gcs_uri + # Update job with AI results, detected language, and outputs # Set status to TRANSLATING to trigger translation pipeline before QC await db.jobs.update_one( @@ -260,10 +280,7 @@ async def ingest_and_ai_task_impl(job_id: str): "source.detected_language": detected_language, "ai.ingestion_json": ai_result, "ai.confidence": ai_result["confidence"], - f"outputs.{source_language}": { - "captions_vtt_gcs": captions_gcs_uri, - "ad_vtt_gcs": ad_gcs_uri - }, + f"outputs.{source_language}": source_lang_output, "updated_at": datetime.utcnow() }, "$push": { diff --git a/backend/app/tasks/translate_and_synthesize.py b/backend/app/tasks/translate_and_synthesize.py index eb9bd44..c94330c 100644 --- a/backend/app/tasks/translate_and_synthesize.py +++ b/backend/app/tasks/translate_and_synthesize.py @@ -177,6 +177,8 @@ async def _async_translate_and_synthesize(job_id: str): translation_mode = job_doc["requested_outputs"].get("translation_mode", "traditional") logger.info(f"Translation mode for job {job_id}: {translation_mode}") + sdh_requested = job_doc["requested_outputs"].get("sdh_vtt", False) + # Get source language VTT content (needed for traditional mode) source_outputs = job_doc["outputs"].get(source_language) if not source_outputs: @@ -215,6 +217,12 @@ async def _async_translate_and_synthesize(job_id: str): source_captions_vtt = captions_blob.download_as_text() source_ad_vtt = ad_blob.download_as_text() + # Download source SDH VTT for traditional-mode translation + source_sdh_vtt = None + if sdh_requested and source_outputs.get("sdh_captions_vtt_gcs"): + sdh_blob_path = source_outputs["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "") + source_sdh_vtt = gcs_service.bucket.blob(sdh_blob_path).download_as_text() + try: # Get target languages (exclude source) target_languages = [lang for lang in requested_languages if lang != source_language] @@ -239,7 +247,8 @@ async def _async_translate_and_synthesize(job_id: str): return await gemini_service.extract_accessibility_targeted( video_local_path, lang, - brand_context=job_brand_context + brand_context=job_brand_context, + sdh_requested=sdh_requested ) result = await retry_with_backoff(extract_targeted, max_retries=3) @@ -257,12 +266,20 @@ async def _async_translate_and_synthesize(job_id: str): f"{job_id}/{lang}/ad.vtt" ) + # Upload SDH VTT if generated + sdh_gcs_uri = None + if sdh_requested and result.get("sdh_captions_vtt"): + sdh_gcs_uri = await upload_vtt_to_gcs( + result["sdh_captions_vtt"], + f"{job_id}/{lang}/sdh_captions.vtt" + ) + logger.info(f"Completed video-native translation for {lang}") - return (lang, captions_gcs_uri, ad_gcs_uri, None) + return (lang, captions_gcs_uri, ad_gcs_uri, sdh_gcs_uri, None) except Exception as e: logger.error(f"Video-native translation failed for {lang}: {e}") - return (lang, None, None, str(e)) + return (lang, None, None, None, str(e)) # Run all translations in parallel (limited by semaphore) if target_languages: @@ -281,18 +298,21 @@ async def _async_translate_and_synthesize(job_id: str): "qa_notes": f"Translation failed: {str(result)}" } else: - lang, captions_uri, ad_uri, error_msg = result + lang, captions_uri, ad_uri, sdh_uri, error_msg = result if error_msg: updated_outputs[lang] = { "origin": "video_native", "qa_notes": f"Translation failed: {error_msg}" } else: - updated_outputs[lang] = { + lang_out = { "captions_vtt_gcs": captions_uri, "ad_vtt_gcs": ad_uri, "origin": "video_native" } + if sdh_uri: + lang_out["sdh_captions_vtt_gcs"] = sdh_uri + updated_outputs[lang] = lang_out logger.info(f"Successfully processed VTT files for language: {lang} (origin: video_native)") else: @@ -343,12 +363,26 @@ async def _async_translate_and_synthesize(job_id: str): f"{job_id}/{language}/ad.vtt" ) - # Store language outputs - updated_outputs[language] = { + # Translate and upload SDH VTT if requested + lang_out: dict = { "captions_vtt_gcs": captions_gcs_uri, "ad_vtt_gcs": ad_gcs_uri, "origin": origin } + if sdh_requested and source_sdh_vtt: + async def translate_sdh(): + return await gemini_service.translate_vtt( + source_sdh_vtt, language, source_language=source_language + ) + translated_sdh = await retry_with_backoff(translate_sdh, max_retries=3) + sdh_gcs_uri = await upload_vtt_to_gcs( + translated_sdh, + f"{job_id}/{language}/sdh_captions.vtt" + ) + lang_out["sdh_captions_vtt_gcs"] = sdh_gcs_uri + + # Store language outputs + updated_outputs[language] = lang_out logger.info(f"Successfully processed VTT files for language: {language} (origin: {origin})") diff --git a/frontend/src/routes/jobs/NewJob.tsx b/frontend/src/routes/jobs/NewJob.tsx index a8ca812..8844f9e 100644 --- a/frontend/src/routes/jobs/NewJob.tsx +++ b/frontend/src/routes/jobs/NewJob.tsx @@ -20,6 +20,7 @@ const jobSchema = z.object({ audio_description_vtt: z.boolean(), audio_description_mp3: z.boolean(), accessible_video_mp4: z.boolean(), + sdh_vtt: z.boolean(), languages: z.array(z.string()), translation_mode: z.enum(['traditional', 'video_native']), }); @@ -74,6 +75,7 @@ export function NewJob() { audio_description_vtt: true, audio_description_mp3: true, accessible_video_mp4: false, + sdh_vtt: false, languages: [], translation_mode: 'video_native', } @@ -128,6 +130,7 @@ export function NewJob() { accessible_video_mp4: data.accessible_video_mp4, accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined, languages: data.languages, + sdh_vtt: data.sdh_vtt, transcreation: [], // Transcreation replaced by video_native translation mode tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined, translation_mode: data.translation_mode, @@ -207,6 +210,7 @@ export function NewJob() { accessible_video_mp4: data.accessible_video_mp4, accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined, languages: data.languages, + sdh_vtt: data.sdh_vtt, transcreation: [], // Transcreation replaced by video_native translation mode tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined, translation_mode: data.translation_mode, @@ -252,6 +256,7 @@ export function NewJob() { accessible_video_mp4: data.accessible_video_mp4, accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined, languages: data.languages, + sdh_vtt: data.sdh_vtt, transcreation: [], // Transcreation replaced by video_native translation mode tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined, translation_mode: data.translation_mode, @@ -542,6 +547,14 @@ export function NewJob() { /> Accessible Video (MP4 with embedded audio descriptions) + {/* Accessible Video Method Selector - shown when accessible_video_mp4 is checked */} {accessibleVideoMp4 && ( diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts index 9680aa5..e8d11f2 100644 --- a/frontend/src/types/api.ts +++ b/frontend/src/types/api.ts @@ -66,6 +66,7 @@ export interface RequestedOutputs { audio_description_mp3: boolean; accessible_video_mp4: boolean; // Rendered video with embedded audio descriptions accessible_video_method?: AccessibleVideoMethod; // User-selected method for accessible video + sdh_vtt?: boolean; // SDH captions with speaker labels, sound effects, music notation languages: string[]; transcreation: string[]; tts_preferences?: TTSPreferences;