From c413fcb747baf39f70495b7a5ebdd6e79b438099 Mon Sep 17 00:00:00 2001
From: Vadym Samoilenko <vadymsamoilenko@oliver.agency>
Date: Wed, 18 Mar 2026 15:02:18 +0000
Subject: [PATCH] feat: add SDH (Subtitles for Deaf and Hard of Hearing)
 caption output
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

SDH captions extend standard VTT captions with speaker identification
labels, sound effects (e.g. [PHONE RINGS]), music notation (♪), and
off-screen indicators.

- Add sdh_vtt flag to RequestedOutputs model and frontend form
- Add sdh_captions_vtt_gcs field to LangOutput model
- Inject SDH generation instructions into both Gemini prompts via
  {SDH_FIELD} and {SDH_GUIDELINES} placeholders when requested
- Upload sdh_captions.vtt to GCS in ingest task
- Pass SDH through video_native translation (Gemini generates it directly)
  and traditional translation (translate source SDH VTT via Gemini)
- Expose sdh_captions_vtt in downloads endpoint and bulk zip export

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 backend/app/api/v1/routes_jobs.py             |  9 ++++
 backend/app/models/job.py                     |  2 +
 backend/app/prompts/gemini_ingestion.md       |  3 ++
 .../app/prompts/gemini_ingestion_targeted.md  |  3 ++
 backend/app/services/gemini.py                | 38 +++++++++++++--
 backend/app/services/zip_download.py          |  1 +
 backend/app/tasks/ingest_and_ai.py            | 27 +++++++++--
 backend/app/tasks/translate_and_synthesize.py | 48 ++++++++++++++++---
 frontend/src/routes/jobs/NewJob.tsx           | 13 +++++
 frontend/src/types/api.ts                     |  1 +
 10 files changed, 128 insertions(+), 17 deletions(-)

diff --git a/backend/app/api/v1/routes_jobs.py b/backend/app/api/v1/routes_jobs.py
index dce4018..fd93523 100644
--- a/backend/app/api/v1/routes_jobs.py
+++ b/backend/app/api/v1/routes_jobs.py
@@ -1073,6 +1073,15 @@ async def get_job_downloads(
             except Exception as e:
                 logger.warning(f"Failed to generate signed URL for accessible video {language}: {e}")
 
+        # SDH Captions VTT
+        if "sdh_captions_vtt_gcs" in lang_output:
+            blob_path = lang_output["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
+            try:
+                signed_url = await get_signed_download_url(blob_path, 24)
+                lang_downloads["sdh_captions_vtt"] = signed_url
+            except Exception as e:
+                logger.warning(f"Failed to generate signed URL for SDH captions {language}: {e}")
+
         # Re-timed Captions VTT (for pause-insert accessible videos)
         if "retimed_captions_vtt_gcs" in lang_output:
             blob_path = lang_output["retimed_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
diff --git a/backend/app/models/job.py b/backend/app/models/job.py
index 8431b1a..822e6ed 100644
--- a/backend/app/models/job.py
+++ b/backend/app/models/job.py
@@ -62,6 +62,7 @@ class RequestedOutputs(BaseModel):
     audio_description_mp3: bool = True
     accessible_video_mp4: bool = False  # Rendered video with embedded audio descriptions
     accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None  # User-selected method
+    sdh_vtt: bool = False  # SDH (Subtitles for Deaf and Hard of Hearing) captions with speaker labels, sound effects, music notation
     languages: list[str] = []
     transcreation: list[str] = []
     tts_preferences: Optional[TTSPreferences] = None
@@ -109,6 +110,7 @@ class AccessibleVideoEditState(BaseModel):
 
 class LangOutput(BaseModel):
     captions_vtt_gcs: Optional[str] = None
+    sdh_captions_vtt_gcs: Optional[str] = None  # SDH-format captions (speaker labels, sound effects, music)
     ad_vtt_gcs: Optional[str] = None
     ad_mp3_gcs: Optional[str] = None
     # Accessible video outputs
diff --git a/backend/app/prompts/gemini_ingestion.md b/backend/app/prompts/gemini_ingestion.md
index 3fe62bf..4d504c0 100644
--- a/backend/app/prompts/gemini_ingestion.md
+++ b/backend/app/prompts/gemini_ingestion.md
@@ -9,6 +9,7 @@ You are given a video. Return a JSON object with:
 - transcript_plaintext: full spoken words, punctuated (in the detected language)
 - captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (in the detected language)
 - audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (MUST be written in the detected language)
+{SDH_FIELD}
 
 CRITICAL LANGUAGE REQUIREMENT:
 - First, detect the language spoken in the video
@@ -69,6 +70,8 @@ Rules:
 - Be succinct — omit redundant or self-evident details
 - Do NOT duplicate information already in the spoken dialogue
 
+{SDH_GUIDELINES}
+
 CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.
 
 Example output format:
diff --git a/backend/app/prompts/gemini_ingestion_targeted.md b/backend/app/prompts/gemini_ingestion_targeted.md
index 39a47d5..361adf6 100644
--- a/backend/app/prompts/gemini_ingestion_targeted.md
+++ b/backend/app/prompts/gemini_ingestion_targeted.md
@@ -9,6 +9,7 @@ You are given a video. Return a JSON object with:
 - transcript_plaintext: full spoken words, punctuated, translated/written in {TARGET_LANGUAGE}
 - captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (written in {TARGET_LANGUAGE})
 - audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (written in {TARGET_LANGUAGE})
+{SDH_FIELD}
 
 TARGET LANGUAGE: {TARGET_LANGUAGE}
 
@@ -74,6 +75,8 @@ Rules:
 - Do NOT duplicate information already in the spoken dialogue
 - Write all descriptions in natural, fluent {TARGET_LANGUAGE}
 
+{SDH_GUIDELINES}
+
 CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.
 
 Example output format (if TARGET_LANGUAGE were Spanish):
diff --git a/backend/app/services/gemini.py b/backend/app/services/gemini.py
index 85a85d7..5b1067c 100644
--- a/backend/app/services/gemini.py
+++ b/backend/app/services/gemini.py
@@ -59,6 +59,24 @@ class GeminiService:
         logger.error(f"File {file_name} did not become ACTIVE within {max_wait_seconds}s")
         return False
 
+    def _build_sdh_field(self, sdh_requested: bool) -> str:
+        """Return the prompt bullet that requests sdh_captions_vtt, or "" when SDH is off."""
+        sdh_bullet = "- sdh_captions_vtt: a valid WebVTT file as a single string, containing SDH-format captions (same timing as captions_vtt, but enriched with speaker labels, sound effects, and music notation)"
+        return sdh_bullet if sdh_requested else ""
+
+    def _build_sdh_guidelines(self, sdh_requested: bool) -> str:
+        """Return the SDH guidelines block injected into the prompt, or "" when SDH is off."""
+        guidelines = """SDH (SUBTITLES FOR THE DEAF AND HARD OF HEARING) GUIDELINES:
+Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched with:
+- Speaker identification when multiple speakers are present: use "NAME:" prefix (e.g., "JOHN: Hello there") or "[NARRATOR]" for narration
+- Non-speech sounds that are plot-relevant, in square brackets: [DOOR SLAMS], [PHONE RINGS], [CROWD CHEERING], [THUNDER]
+- Music: use ♪ for background music cues (e.g., "♪ tense music ♪") or ♪ around sung lyrics
+- Off-screen or voice-over speakers: indicate with "(off-screen)" or "[V.O.]" where relevant
+- Non-speech vocalisations when relevant: [SIGHS], [LAUGHS], [SCREAMS]
+- Maintain the same timestamp format as captions_vtt (HH:MM:SS.mmm --> HH:MM:SS.mmm)
+- Only add sound effect cues where they add meaningful context; do not annotate every minor sound"""
+        return guidelines if sdh_requested else ""
+
     def _build_brand_context_block(self, brand_context: Optional[str]) -> str:
         """Build the brand context instruction block for injection into prompts."""
         if brand_context and brand_context.strip():
@@ -71,13 +89,18 @@ class GeminiService:
                 )
         return "No specific brand names have been provided for this video."
 
-    async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None) -> dict[str, Any]:
+    async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False) -> dict[str, Any]:
         """
         Extract captions and audio descriptions from video using Gemini 2.0
         Returns structured JSON with transcript, captions VTT, and audio description VTT
         """
         prompt_template = self._load_prompt("gemini_ingestion.md")
-        prompt = prompt_template.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
+        prompt = (
+            prompt_template
+            .replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
+            .replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
+            .replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
+        )
         uploaded_file = None
 
         try:
@@ -258,7 +281,8 @@ Fix the JSON and return it:
         self,
         video_file_path: str,
         target_language: str,
-        brand_context: Optional[str] = None
+        brand_context: Optional[str] = None,
+        sdh_requested: bool = False
     ) -> dict[str, Any]:
         """
         Extract captions and audio descriptions from video using Gemini,
@@ -279,8 +303,12 @@ Fix the JSON and return it:
             all in the target language
         """
         prompt_template = self._load_prompt("gemini_ingestion_targeted.md")
-        prompt = prompt_template.replace("{TARGET_LANGUAGE}", target_language).replace(
-            "{BRAND_CONTEXT}", self._build_brand_context_block(brand_context)
+        prompt = (
+            prompt_template
+            .replace("{TARGET_LANGUAGE}", target_language)
+            .replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
+            .replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
+            .replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
         )
         uploaded_file = None
 
diff --git a/backend/app/services/zip_download.py b/backend/app/services/zip_download.py
index 9b32e36..fa373e9 100644
--- a/backend/app/services/zip_download.py
+++ b/backend/app/services/zip_download.py
@@ -43,6 +43,7 @@ def sanitize_filename(name: str, max_length: int = 50) -> str:
 # Mapping from LangOutput field names to output filenames
 FILE_TYPE_MAPPING = {
     "captions_vtt_gcs": "captions.vtt",
+    "sdh_captions_vtt_gcs": "sdh_captions.vtt",
     "ad_vtt_gcs": "ad.vtt",
     "ad_mp3_gcs": "ad.mp3",
     "accessible_video_gcs": "accessible_video.mp4",
diff --git a/backend/app/tasks/ingest_and_ai.py b/backend/app/tasks/ingest_and_ai.py
index 2691116..a728eb6 100644
--- a/backend/app/tasks/ingest_and_ai.py
+++ b/backend/app/tasks/ingest_and_ai.py
@@ -204,7 +204,12 @@ async def ingest_and_ai_task_impl(job_id: str):
 
                 # Process with Gemini
                 brand_context = job_doc.get("brand_context")
-                ai_result = await gemini_service.extract_accessibility(temp_path, brand_context=brand_context)
+                sdh_requested = job_doc.get("requested_outputs", {}).get("sdh_vtt", False)
+                ai_result = await gemini_service.extract_accessibility(
+                    temp_path,
+                    brand_context=brand_context,
+                    sdh_requested=sdh_requested
+                )
 
                 # Final safety check for required fields
                 required_fields = ["captions_vtt", "audio_description_vtt"]
@@ -249,6 +254,21 @@ async def ingest_and_ai_task_impl(job_id: str):
                     f"{job_id}/{source_language}/ad.vtt"
                 )
 
+                # Upload SDH VTT if generated
+                sdh_gcs_uri = None
+                if sdh_requested and ai_result.get("sdh_captions_vtt"):
+                    sdh_gcs_uri = await upload_vtt_to_gcs(
+                        ai_result["sdh_captions_vtt"],
+                        f"{job_id}/{source_language}/sdh_captions.vtt"
+                    )
+
+                source_lang_output = {
+                    "captions_vtt_gcs": captions_gcs_uri,
+                    "ad_vtt_gcs": ad_gcs_uri,
+                }
+                if sdh_gcs_uri:
+                    source_lang_output["sdh_captions_vtt_gcs"] = sdh_gcs_uri
+
                 # Update job with AI results, detected language, and outputs
                 # Set status to TRANSLATING to trigger translation pipeline before QC
                 await db.jobs.update_one(
@@ -260,10 +280,7 @@ async def ingest_and_ai_task_impl(job_id: str):
                             "source.detected_language": detected_language,
                             "ai.ingestion_json": ai_result,
                             "ai.confidence": ai_result["confidence"],
-                            f"outputs.{source_language}": {
-                                "captions_vtt_gcs": captions_gcs_uri,
-                                "ad_vtt_gcs": ad_gcs_uri
-                            },
+                            f"outputs.{source_language}": source_lang_output,
                             "updated_at": datetime.utcnow()
                         },
                         "$push": {
diff --git a/backend/app/tasks/translate_and_synthesize.py b/backend/app/tasks/translate_and_synthesize.py
index eb9bd44..c94330c 100644
--- a/backend/app/tasks/translate_and_synthesize.py
+++ b/backend/app/tasks/translate_and_synthesize.py
@@ -177,6 +177,8 @@ async def _async_translate_and_synthesize(job_id: str):
         translation_mode = job_doc["requested_outputs"].get("translation_mode", "traditional")
         logger.info(f"Translation mode for job {job_id}: {translation_mode}")
 
+        sdh_requested = job_doc["requested_outputs"].get("sdh_vtt", False)
+
         # Get source language VTT content (needed for traditional mode)
         source_outputs = job_doc["outputs"].get(source_language)
         if not source_outputs:
@@ -215,6 +217,12 @@ async def _async_translate_and_synthesize(job_id: str):
             source_captions_vtt = captions_blob.download_as_text()
             source_ad_vtt = ad_blob.download_as_text()
 
+            # Download source SDH VTT for traditional-mode translation
+            source_sdh_vtt = None
+            if sdh_requested and source_outputs.get("sdh_captions_vtt_gcs"):
+                sdh_blob_path = source_outputs["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
+                source_sdh_vtt = gcs_service.bucket.blob(sdh_blob_path).download_as_text()
+
         try:
             # Get target languages (exclude source)
             target_languages = [lang for lang in requested_languages if lang != source_language]
@@ -239,7 +247,8 @@ async def _async_translate_and_synthesize(job_id: str):
                                 return await gemini_service.extract_accessibility_targeted(
                                     video_local_path,
                                     lang,
-                                    brand_context=job_brand_context
+                                    brand_context=job_brand_context,
+                                    sdh_requested=sdh_requested
                                 )
 
                             result = await retry_with_backoff(extract_targeted, max_retries=3)
@@ -257,12 +266,20 @@ async def _async_translate_and_synthesize(job_id: str):
                                 f"{job_id}/{lang}/ad.vtt"
                             )
 
+                            # Upload SDH VTT if generated
+                            sdh_gcs_uri = None
+                            if sdh_requested and result.get("sdh_captions_vtt"):
+                                sdh_gcs_uri = await upload_vtt_to_gcs(
+                                    result["sdh_captions_vtt"],
+                                    f"{job_id}/{lang}/sdh_captions.vtt"
+                                )
+
                             logger.info(f"Completed video-native translation for {lang}")
-                            return (lang, captions_gcs_uri, ad_gcs_uri, None)
+                            return (lang, captions_gcs_uri, ad_gcs_uri, sdh_gcs_uri, None)
 
                         except Exception as e:
                             logger.error(f"Video-native translation failed for {lang}: {e}")
-                            return (lang, None, None, str(e))
+                            return (lang, None, None, None, str(e))
 
                 # Run all translations in parallel (limited by semaphore)
                 if target_languages:
@@ -281,18 +298,21 @@ async def _async_translate_and_synthesize(job_id: str):
                                 "qa_notes": f"Translation failed: {str(result)}"
                             }
                         else:
-                            lang, captions_uri, ad_uri, error_msg = result
+                            lang, captions_uri, ad_uri, sdh_uri, error_msg = result
                             if error_msg:
                                 updated_outputs[lang] = {
                                     "origin": "video_native",
                                     "qa_notes": f"Translation failed: {error_msg}"
                                 }
                             else:
-                                updated_outputs[lang] = {
+                                lang_out = {
                                     "captions_vtt_gcs": captions_uri,
                                     "ad_vtt_gcs": ad_uri,
                                     "origin": "video_native"
                                 }
+                                if sdh_uri:
+                                    lang_out["sdh_captions_vtt_gcs"] = sdh_uri
+                                updated_outputs[lang] = lang_out
                                 logger.info(f"Successfully processed VTT files for language: {lang} (origin: video_native)")
 
             else:
@@ -343,12 +363,26 @@ async def _async_translate_and_synthesize(job_id: str):
                             f"{job_id}/{language}/ad.vtt"
                         )
 
-                        # Store language outputs
-                        updated_outputs[language] = {
+                        # Translate and upload SDH VTT if requested
+                        lang_out: dict = {
                             "captions_vtt_gcs": captions_gcs_uri,
                             "ad_vtt_gcs": ad_gcs_uri,
                             "origin": origin
                         }
+                        if sdh_requested and source_sdh_vtt:
+                            async def translate_sdh():
+                                return await gemini_service.translate_vtt(
+                                    source_sdh_vtt, language, source_language=source_language
+                                )
+                            translated_sdh = await retry_with_backoff(translate_sdh, max_retries=3)
+                            sdh_gcs_uri = await upload_vtt_to_gcs(
+                                translated_sdh,
+                                f"{job_id}/{language}/sdh_captions.vtt"
+                            )
+                            lang_out["sdh_captions_vtt_gcs"] = sdh_gcs_uri
+
+                        # Store language outputs
+                        updated_outputs[language] = lang_out
 
                         logger.info(f"Successfully processed VTT files for language: {language} (origin: {origin})")
 
diff --git a/frontend/src/routes/jobs/NewJob.tsx b/frontend/src/routes/jobs/NewJob.tsx
index a8ca812..8844f9e 100644
--- a/frontend/src/routes/jobs/NewJob.tsx
+++ b/frontend/src/routes/jobs/NewJob.tsx
@@ -20,6 +20,7 @@ const jobSchema = z.object({
   audio_description_vtt: z.boolean(),
   audio_description_mp3: z.boolean(),
   accessible_video_mp4: z.boolean(),
+  sdh_vtt: z.boolean(),
   languages: z.array(z.string()),
   translation_mode: z.enum(['traditional', 'video_native']),
 });
@@ -74,6 +75,7 @@ export function NewJob() {
       audio_description_vtt: true,
       audio_description_mp3: true,
       accessible_video_mp4: false,
+      sdh_vtt: false,
       languages: [],
       translation_mode: 'video_native',
     }
@@ -128,6 +130,7 @@ export function NewJob() {
         accessible_video_mp4: data.accessible_video_mp4,
         accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
         languages: data.languages,
+        sdh_vtt: data.sdh_vtt,
         transcreation: [],  // Transcreation replaced by video_native translation mode
         tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
         translation_mode: data.translation_mode,
@@ -207,6 +210,7 @@ export function NewJob() {
         accessible_video_mp4: data.accessible_video_mp4,
         accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
         languages: data.languages,
+        sdh_vtt: data.sdh_vtt,
         transcreation: [],  // Transcreation replaced by video_native translation mode
         tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
         translation_mode: data.translation_mode,
@@ -252,6 +256,7 @@ export function NewJob() {
         accessible_video_mp4: data.accessible_video_mp4,
         accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
         languages: data.languages,
+        sdh_vtt: data.sdh_vtt,
         transcreation: [],  // Transcreation replaced by video_native translation mode
         tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
         translation_mode: data.translation_mode,
@@ -542,6 +547,14 @@ export function NewJob() {
               />
               <span>Accessible Video (MP4 with embedded audio descriptions)</span>
             </label>
+            <label className="flex items-center">
+              <input
+                type="checkbox"
+                {...register('sdh_vtt')}
+                className="mr-2"
+              />
+              <span>SDH Captions (VTT with speaker labels, sound effects &amp; music notation)</span>
+            </label>
 
             {/* Accessible Video Method Selector - shown when accessible_video_mp4 is checked */}
             {accessibleVideoMp4 && (
diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts
index 9680aa5..e8d11f2 100644
--- a/frontend/src/types/api.ts
+++ b/frontend/src/types/api.ts
@@ -66,6 +66,7 @@ export interface RequestedOutputs {
   audio_description_mp3: boolean;
   accessible_video_mp4: boolean;  // Rendered video with embedded audio descriptions
   accessible_video_method?: AccessibleVideoMethod;  // User-selected method for accessible video
+  sdh_vtt?: boolean;  // SDH captions with speaker labels, sound effects, music notation
   languages: string[];
   transcreation: string[];
   tts_preferences?: TTSPreferences;