feat: add SDH (Subtitles for Deaf and Hard of Hearing) caption output

SDH captions extend standard VTT with speaker identification labels, sound effects (e.g. [PHONE RINGS]), music notation (♪), and off-screen indicators.

- Add sdh_vtt flag to RequestedOutputs model and frontend form
- Add sdh_captions_vtt_gcs field to LangOutput model
- Inject SDH generation instructions into both Gemini prompts via {SDH_FIELD} and {SDH_GUIDELINES} placeholders when requested
- Upload sdh_captions.vtt to GCS in ingest task
- Pass SDH through video_native translation (Gemini generates it directly) and traditional translation (translate source SDH VTT via Gemini)
- Expose sdh_captions_vtt in downloads endpoint and bulk zip export

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-18 15:02:18 +00:00 · 2026-03-18 15:02:18 +00:00 · c413fcb747
commit c413fcb747
parent 2e8a8dc287
10 changed files with 128 additions and 17 deletions
--- a/backend/app/api/v1/routes_jobs.py
+++ b/backend/app/api/v1/routes_jobs.py
@ -1073,6 +1073,15 @@ async def get_job_downloads(
            except Exception as e:
                logger.warning(f"Failed to generate signed URL for accessible video {language}: {e}")

+        # SDH Captions VTT
+        if "sdh_captions_vtt_gcs" in lang_output:
+            blob_path = lang_output["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
+            try:
+                signed_url = await get_signed_download_url(blob_path, 24)
+                lang_downloads["sdh_captions_vtt"] = signed_url
+            except Exception as e:
+                logger.warning(f"Failed to generate signed URL for SDH captions {language}: {e}")
+
        # Re-timed Captions VTT (for pause-insert accessible videos)
        if "retimed_captions_vtt_gcs" in lang_output:
            blob_path = lang_output["retimed_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
--- a/backend/app/models/job.py
+++ b/backend/app/models/job.py
@ -62,6 +62,7 @@ class RequestedOutputs(BaseModel):
    audio_description_mp3: bool = True
    accessible_video_mp4: bool = False  # Rendered video with embedded audio descriptions
    accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None  # User-selected method
+    sdh_vtt: bool = False  # SDH (Subtitles for Deaf and Hard of Hearing) captions with speaker labels, sound effects, music notation
    languages: list[str] = []
    transcreation: list[str] = []
    tts_preferences: Optional[TTSPreferences] = None
@ -109,6 +110,7 @@ class AccessibleVideoEditState(BaseModel):

 class LangOutput(BaseModel):
    captions_vtt_gcs: Optional[str] = None
+    sdh_captions_vtt_gcs: Optional[str] = None  # SDH-format captions (speaker labels, sound effects, music)
    ad_vtt_gcs: Optional[str] = None
    ad_mp3_gcs: Optional[str] = None
    # Accessible video outputs
--- a/backend/app/prompts/gemini_ingestion.md
+++ b/backend/app/prompts/gemini_ingestion.md
@ -9,6 +9,7 @@ You are given a video. Return a JSON object with:
 - transcript_plaintext: full spoken words, punctuated (in the detected language)
 - captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (in the detected language)
 - audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (MUST be written in the detected language)
+{SDH_FIELD}

 CRITICAL LANGUAGE REQUIREMENT:
 - First, detect the language spoken in the video
@ -69,6 +70,8 @@ Rules:
 - Be succinct — omit redundant or self-evident details
 - Do NOT duplicate information already in the spoken dialogue

+{SDH_GUIDELINES}
+
 CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.

 Example output format:
--- a/backend/app/prompts/gemini_ingestion_targeted.md
+++ b/backend/app/prompts/gemini_ingestion_targeted.md
@ -9,6 +9,7 @@ You are given a video. Return a JSON object with:
 - transcript_plaintext: full spoken words, punctuated, translated/written in {TARGET_LANGUAGE}
 - captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (written in {TARGET_LANGUAGE})
 - audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (written in {TARGET_LANGUAGE})
+{SDH_FIELD}

 TARGET LANGUAGE: {TARGET_LANGUAGE}

@ -74,6 +75,8 @@ Rules:
 - Do NOT duplicate information already in the spoken dialogue
 - Write all descriptions in natural, fluent {TARGET_LANGUAGE}

+{SDH_GUIDELINES}
+
 CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.

 Example output format (if TARGET_LANGUAGE were Spanish):
--- a/backend/app/services/gemini.py
+++ b/backend/app/services/gemini.py
@ -59,6 +59,24 @@ class GeminiService:
        logger.error(f"File {file_name} did not become ACTIVE within {max_wait_seconds}s")
        return False

+    def _build_sdh_field(self, sdh_requested: bool) -> str:
+        if sdh_requested:
+            return "- sdh_captions_vtt: a valid WebVTT file as a single string, containing SDH-format captions (same timing as captions_vtt, but enriched with speaker labels, sound effects, and music notation)"
+        return ""
+
+    def _build_sdh_guidelines(self, sdh_requested: bool) -> str:
+        if not sdh_requested:
+            return ""
+        return """SDH (SUBTITLES FOR THE DEAF AND HARD OF HEARING) GUIDELINES:
+Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched with:
+- Speaker identification when multiple speakers are present: use "NAME:" prefix (e.g., "JOHN: Hello there") or "[NARRATOR]" for narration
+- Non-speech sounds that are plot-relevant, in square brackets: [DOOR SLAMS], [PHONE RINGS], [CROWD CHEERING], [THUNDER]
+- Music: use ♪ for background music cues (e.g., "♪ tense music ♪") or ♪ around sung lyrics
+- Off-screen or voice-over speakers: indicate with "(off-screen)" or "[V.O.]" where relevant
+- Non-speech vocalisations when relevant: [SIGHS], [LAUGHS], [SCREAMS]
+- Maintain the same timestamp format as captions_vtt (HH:MM:SS.mmm --> HH:MM:SS.mmm)
+- Only add sound effect cues where they add meaningful context; do not annotate every minor sound"""
+
    def _build_brand_context_block(self, brand_context: Optional[str]) -> str:
        """Build the brand context instruction block for injection into prompts."""
        if brand_context and brand_context.strip():
@ -71,13 +89,18 @@ class GeminiService:
                )
        return "No specific brand names have been provided for this video."

-    async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None) -> dict[str, Any]:
+    async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False) -> dict[str, Any]:
        """
        Extract captions and audio descriptions from video using Gemini 2.0
        Returns structured JSON with transcript, captions VTT, and audio description VTT
        """
        prompt_template = self._load_prompt("gemini_ingestion.md")
-        prompt = prompt_template.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
+        prompt = (
+            prompt_template
+            .replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
+            .replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
+            .replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
+        )
        uploaded_file = None

        try:
@ -258,7 +281,8 @@ Fix the JSON and return it:
        self,
        video_file_path: str,
        target_language: str,
-        brand_context: Optional[str] = None
+        brand_context: Optional[str] = None,
+        sdh_requested: bool = False
    ) -> dict[str, Any]:
        """
        Extract captions and audio descriptions from video using Gemini,
@ -279,8 +303,12 @@ Fix the JSON and return it:
            all in the target language
        """
        prompt_template = self._load_prompt("gemini_ingestion_targeted.md")
-        prompt = prompt_template.replace("{TARGET_LANGUAGE}", target_language).replace(
-            "{BRAND_CONTEXT}", self._build_brand_context_block(brand_context)
+        prompt = (
+            prompt_template
+            .replace("{TARGET_LANGUAGE}", target_language)
+            .replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
+            .replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
+            .replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
        )
        uploaded_file = None

--- a/backend/app/services/zip_download.py
+++ b/backend/app/services/zip_download.py
@ -43,6 +43,7 @@ def sanitize_filename(name: str, max_length: int = 50) -> str:
 # Mapping from LangOutput field names to output filenames
 FILE_TYPE_MAPPING = {
    "captions_vtt_gcs": "captions.vtt",
+    "sdh_captions_vtt_gcs": "sdh_captions.vtt",
    "ad_vtt_gcs": "ad.vtt",
    "ad_mp3_gcs": "ad.mp3",
    "accessible_video_gcs": "accessible_video.mp4",
--- a/backend/app/tasks/ingest_and_ai.py
+++ b/backend/app/tasks/ingest_and_ai.py
@ -204,7 +204,12 @@ async def ingest_and_ai_task_impl(job_id: str):

                # Process with Gemini
                brand_context = job_doc.get("brand_context")
-                ai_result = await gemini_service.extract_accessibility(temp_path, brand_context=brand_context)
+                sdh_requested = job_doc.get("requested_outputs", {}).get("sdh_vtt", False)
+                ai_result = await gemini_service.extract_accessibility(
+                    temp_path,
+                    brand_context=brand_context,
+                    sdh_requested=sdh_requested
+                )

                # Final safety check for required fields
                required_fields = ["captions_vtt", "audio_description_vtt"]
@ -249,6 +254,21 @@ async def ingest_and_ai_task_impl(job_id: str):
                    f"{job_id}/{source_language}/ad.vtt"
                )

+                # Upload SDH VTT if generated
+                sdh_gcs_uri = None
+                if sdh_requested and ai_result.get("sdh_captions_vtt"):
+                    sdh_gcs_uri = await upload_vtt_to_gcs(
+                        ai_result["sdh_captions_vtt"],
+                        f"{job_id}/{source_language}/sdh_captions.vtt"
+                    )
+
+                source_lang_output = {
+                    "captions_vtt_gcs": captions_gcs_uri,
+                    "ad_vtt_gcs": ad_gcs_uri,
+                }
+                if sdh_gcs_uri:
+                    source_lang_output["sdh_captions_vtt_gcs"] = sdh_gcs_uri
+
                # Update job with AI results, detected language, and outputs
                # Set status to TRANSLATING to trigger translation pipeline before QC
                await db.jobs.update_one(
@ -260,10 +280,7 @@ async def ingest_and_ai_task_impl(job_id: str):
                            "source.detected_language": detected_language,
                            "ai.ingestion_json": ai_result,
                            "ai.confidence": ai_result["confidence"],
-                            f"outputs.{source_language}": {
-                                "captions_vtt_gcs": captions_gcs_uri,
-                                "ad_vtt_gcs": ad_gcs_uri
-                            },
+                            f"outputs.{source_language}": source_lang_output,
                            "updated_at": datetime.utcnow()
                        },
                        "$push": {
--- a/backend/app/tasks/translate_and_synthesize.py
+++ b/backend/app/tasks/translate_and_synthesize.py
@ -177,6 +177,8 @@ async def _async_translate_and_synthesize(job_id: str):
        translation_mode = job_doc["requested_outputs"].get("translation_mode", "traditional")
        logger.info(f"Translation mode for job {job_id}: {translation_mode}")

+        sdh_requested = job_doc["requested_outputs"].get("sdh_vtt", False)
+
        # Get source language VTT content (needed for traditional mode)
        source_outputs = job_doc["outputs"].get(source_language)
        if not source_outputs:
@ -215,6 +217,12 @@ async def _async_translate_and_synthesize(job_id: str):
            source_captions_vtt = captions_blob.download_as_text()
            source_ad_vtt = ad_blob.download_as_text()

+            # Download source SDH VTT for traditional-mode translation
+            source_sdh_vtt = None
+            if sdh_requested and source_outputs.get("sdh_captions_vtt_gcs"):
+                sdh_blob_path = source_outputs["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
+                source_sdh_vtt = gcs_service.bucket.blob(sdh_blob_path).download_as_text()
+
        try:
            # Get target languages (exclude source)
            target_languages = [lang for lang in requested_languages if lang != source_language]
@ -239,7 +247,8 @@ async def _async_translate_and_synthesize(job_id: str):
                                return await gemini_service.extract_accessibility_targeted(
                                    video_local_path,
                                    lang,
-                                    brand_context=job_brand_context
+                                    brand_context=job_brand_context,
+                                    sdh_requested=sdh_requested
                                )

                            result = await retry_with_backoff(extract_targeted, max_retries=3)
@ -257,12 +266,20 @@ async def _async_translate_and_synthesize(job_id: str):
                                f"{job_id}/{lang}/ad.vtt"
                            )

+                            # Upload SDH VTT if generated
+                            sdh_gcs_uri = None
+                            if sdh_requested and result.get("sdh_captions_vtt"):
+                                sdh_gcs_uri = await upload_vtt_to_gcs(
+                                    result["sdh_captions_vtt"],
+                                    f"{job_id}/{lang}/sdh_captions.vtt"
+                                )
+
                            logger.info(f"Completed video-native translation for {lang}")
-                            return (lang, captions_gcs_uri, ad_gcs_uri, None)
+                            return (lang, captions_gcs_uri, ad_gcs_uri, sdh_gcs_uri, None)

                        except Exception as e:
                            logger.error(f"Video-native translation failed for {lang}: {e}")
-                            return (lang, None, None, str(e))
+                            return (lang, None, None, None, str(e))

                # Run all translations in parallel (limited by semaphore)
                if target_languages:
@ -281,18 +298,21 @@ async def _async_translate_and_synthesize(job_id: str):
                                "qa_notes": f"Translation failed: {str(result)}"
                            }
                        else:
-                            lang, captions_uri, ad_uri, error_msg = result
+                            lang, captions_uri, ad_uri, sdh_uri, error_msg = result
                            if error_msg:
                                updated_outputs[lang] = {
                                    "origin": "video_native",
                                    "qa_notes": f"Translation failed: {error_msg}"
                                }
                            else:
-                                updated_outputs[lang] = {
+                                lang_out = {
                                    "captions_vtt_gcs": captions_uri,
                                    "ad_vtt_gcs": ad_uri,
                                    "origin": "video_native"
                                }
+                                if sdh_uri:
+                                    lang_out["sdh_captions_vtt_gcs"] = sdh_uri
+                                updated_outputs[lang] = lang_out
                                logger.info(f"Successfully processed VTT files for language: {lang} (origin: video_native)")

            else:
@ -343,12 +363,26 @@ async def _async_translate_and_synthesize(job_id: str):
                            f"{job_id}/{language}/ad.vtt"
                        )

-                        # Store language outputs
-                        updated_outputs[language] = {
+                        # Translate and upload SDH VTT if requested
+                        lang_out: dict = {
                            "captions_vtt_gcs": captions_gcs_uri,
                            "ad_vtt_gcs": ad_gcs_uri,
                            "origin": origin
                        }
+                        if sdh_requested and source_sdh_vtt:
+                            async def translate_sdh():
+                                return await gemini_service.translate_vtt(
+                                    source_sdh_vtt, language, source_language=source_language
+                                )
+                            translated_sdh = await retry_with_backoff(translate_sdh, max_retries=3)
+                            sdh_gcs_uri = await upload_vtt_to_gcs(
+                                translated_sdh,
+                                f"{job_id}/{language}/sdh_captions.vtt"
+                            )
+                            lang_out["sdh_captions_vtt_gcs"] = sdh_gcs_uri
+
+                        # Store language outputs
+                        updated_outputs[language] = lang_out

                        logger.info(f"Successfully processed VTT files for language: {language} (origin: {origin})")

--- a/frontend/src/routes/jobs/NewJob.tsx
+++ b/frontend/src/routes/jobs/NewJob.tsx
@ -20,6 +20,7 @@ const jobSchema = z.object({
  audio_description_vtt: z.boolean(),
  audio_description_mp3: z.boolean(),
  accessible_video_mp4: z.boolean(),
+  sdh_vtt: z.boolean(),
  languages: z.array(z.string()),
  translation_mode: z.enum(['traditional', 'video_native']),
 });
@ -74,6 +75,7 @@ export function NewJob() {
      audio_description_vtt: true,
      audio_description_mp3: true,
      accessible_video_mp4: false,
+      sdh_vtt: false,
      languages: [],
      translation_mode: 'video_native',
    }
@ -128,6 +130,7 @@ export function NewJob() {
        accessible_video_mp4: data.accessible_video_mp4,
        accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
        languages: data.languages,
+        sdh_vtt: data.sdh_vtt,
        transcreation: [],  // Transcreation replaced by video_native translation mode
        tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
        translation_mode: data.translation_mode,
@ -207,6 +210,7 @@ export function NewJob() {
        accessible_video_mp4: data.accessible_video_mp4,
        accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
        languages: data.languages,
+        sdh_vtt: data.sdh_vtt,
        transcreation: [],  // Transcreation replaced by video_native translation mode
        tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
        translation_mode: data.translation_mode,
@ -252,6 +256,7 @@ export function NewJob() {
        accessible_video_mp4: data.accessible_video_mp4,
        accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
        languages: data.languages,
+        sdh_vtt: data.sdh_vtt,
        transcreation: [],  // Transcreation replaced by video_native translation mode
        tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
        translation_mode: data.translation_mode,
@ -542,6 +547,14 @@ export function NewJob() {
              />
              <span>Accessible Video (MP4 with embedded audio descriptions)</span>
            </label>
+            <label className="flex items-center">
+              <input
+                type="checkbox"
+                {...register('sdh_vtt')}
+                className="mr-2"
+              />
+              <span>SDH Captions (VTT with speaker labels, sound effects &amp; music notation)</span>
+            </label>

            {/* Accessible Video Method Selector - shown when accessible_video_mp4 is checked */}
            {accessibleVideoMp4 && (
--- a/frontend/src/types/api.ts
+++ b/frontend/src/types/api.ts
@ -66,6 +66,7 @@ export interface RequestedOutputs {
  audio_description_mp3: boolean;
  accessible_video_mp4: boolean;  // Rendered video with embedded audio descriptions
  accessible_video_method?: AccessibleVideoMethod;  // User-selected method for accessible video
+  sdh_vtt?: boolean;  // SDH captions with speaker labels, sound effects, music notation
  languages: string[];
  transcreation: string[];
  tts_preferences?: TTSPreferences;