feat: add SDH (Subtitles for the Deaf and Hard of Hearing) caption output

SDH captions extend standard VTT with speaker identification labels,
sound effects [PHONE RINGS], music notation ♪, and off-screen indicators.

- Add sdh_vtt flag to RequestedOutputs model and frontend form
- Add sdh_captions_vtt_gcs field to LangOutput model
- Inject SDH generation instructions into both Gemini prompts via
  {SDH_FIELD} and {SDH_GUIDELINES} placeholders when requested
- Upload sdh_captions.vtt to GCS in ingest task
- Pass SDH through video_native translation (Gemini generates it directly)
  and traditional translation (translate source SDH VTT via Gemini)
- Expose sdh_captions_vtt in downloads endpoint and bulk zip export

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-03-18 15:02:18 +00:00
parent 2e8a8dc287
commit c413fcb747
10 changed files with 128 additions and 17 deletions

View file

@ -1073,6 +1073,15 @@ async def get_job_downloads(
except Exception as e:
logger.warning(f"Failed to generate signed URL for accessible video {language}: {e}")
# SDH Captions VTT
if "sdh_captions_vtt_gcs" in lang_output:
blob_path = lang_output["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
try:
signed_url = await get_signed_download_url(blob_path, 24)
lang_downloads["sdh_captions_vtt"] = signed_url
except Exception as e:
logger.warning(f"Failed to generate signed URL for SDH captions {language}: {e}")
# Re-timed Captions VTT (for pause-insert accessible videos)
if "retimed_captions_vtt_gcs" in lang_output:
blob_path = lang_output["retimed_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")

View file

@ -62,6 +62,7 @@ class RequestedOutputs(BaseModel):
audio_description_mp3: bool = True
accessible_video_mp4: bool = False # Rendered video with embedded audio descriptions
accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None # User-selected method
sdh_vtt: bool = False # SDH (Subtitles for Deaf and Hard of Hearing) captions with speaker labels, sound effects, music notation
languages: list[str] = []
transcreation: list[str] = []
tts_preferences: Optional[TTSPreferences] = None
@ -109,6 +110,7 @@ class AccessibleVideoEditState(BaseModel):
class LangOutput(BaseModel):
captions_vtt_gcs: Optional[str] = None
sdh_captions_vtt_gcs: Optional[str] = None # SDH-format captions (speaker labels, sound effects, music)
ad_vtt_gcs: Optional[str] = None
ad_mp3_gcs: Optional[str] = None
# Accessible video outputs

View file

@ -9,6 +9,7 @@ You are given a video. Return a JSON object with:
- transcript_plaintext: full spoken words, punctuated (in the detected language)
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (in the detected language)
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (MUST be written in the detected language)
{SDH_FIELD}
CRITICAL LANGUAGE REQUIREMENT:
- First, detect the language spoken in the video
@ -69,6 +70,8 @@ Rules:
- Be succinct — omit redundant or self-evident details
- Do NOT duplicate information already in the spoken dialogue
{SDH_GUIDELINES}
CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.
Example output format:

View file

@ -9,6 +9,7 @@ You are given a video. Return a JSON object with:
- transcript_plaintext: full spoken words, punctuated, translated/written in {TARGET_LANGUAGE}
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (written in {TARGET_LANGUAGE})
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (written in {TARGET_LANGUAGE})
{SDH_FIELD}
TARGET LANGUAGE: {TARGET_LANGUAGE}
@ -74,6 +75,8 @@ Rules:
- Do NOT duplicate information already in the spoken dialogue
- Write all descriptions in natural, fluent {TARGET_LANGUAGE}
{SDH_GUIDELINES}
CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.
Example output format (if TARGET_LANGUAGE were Spanish):

View file

@ -59,6 +59,24 @@ class GeminiService:
logger.error(f"File {file_name} did not become ACTIVE within {max_wait_seconds}s")
return False
def _build_sdh_field(self, sdh_requested: bool) -> str:
if sdh_requested:
return "- sdh_captions_vtt: a valid WebVTT file as a single string, containing SDH-format captions (same timing as captions_vtt, but enriched with speaker labels, sound effects, and music notation)"
return ""
def _build_sdh_guidelines(self, sdh_requested: bool) -> str:
if not sdh_requested:
return ""
return """SDH (SUBTITLES FOR THE DEAF AND HARD OF HEARING) GUIDELINES:
Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched with:
- Speaker identification when multiple speakers are present: use "NAME:" prefix (e.g., "JOHN: Hello there") or "[NARRATOR]" for narration
- Non-speech sounds that are plot-relevant, in square brackets: [DOOR SLAMS], [PHONE RINGS], [CROWD CHEERING], [THUNDER]
- Music: use for background music cues (e.g., "♪ tense music ♪") or around sung lyrics
- Off-screen or voice-over speakers: indicate with "(off-screen)" or "[V.O.]" where relevant
- Non-speech vocalisations when relevant: [SIGHS], [LAUGHS], [SCREAMS]
- Maintain the same timestamp format as captions_vtt (HH:MM:SS.mmm --> HH:MM:SS.mmm)
- Only add sound effect cues where they add meaningful context; do not annotate every minor sound"""
def _build_brand_context_block(self, brand_context: Optional[str]) -> str:
"""Build the brand context instruction block for injection into prompts."""
if brand_context and brand_context.strip():
@ -71,13 +89,18 @@ class GeminiService:
)
return "No specific brand names have been provided for this video."
async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None) -> dict[str, Any]:
async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False) -> dict[str, Any]:
"""
Extract captions and audio descriptions from video using Gemini 2.0
Returns structured JSON with transcript, captions VTT, and audio description VTT
"""
prompt_template = self._load_prompt("gemini_ingestion.md")
prompt = prompt_template.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
prompt = (
prompt_template
.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
.replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
.replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
)
uploaded_file = None
try:
@ -258,7 +281,8 @@ Fix the JSON and return it:
self,
video_file_path: str,
target_language: str,
brand_context: Optional[str] = None
brand_context: Optional[str] = None,
sdh_requested: bool = False
) -> dict[str, Any]:
"""
Extract captions and audio descriptions from video using Gemini,
@ -279,8 +303,12 @@ Fix the JSON and return it:
all in the target language
"""
prompt_template = self._load_prompt("gemini_ingestion_targeted.md")
prompt = prompt_template.replace("{TARGET_LANGUAGE}", target_language).replace(
"{BRAND_CONTEXT}", self._build_brand_context_block(brand_context)
prompt = (
prompt_template
.replace("{TARGET_LANGUAGE}", target_language)
.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
.replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
.replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
)
uploaded_file = None

View file

@ -43,6 +43,7 @@ def sanitize_filename(name: str, max_length: int = 50) -> str:
# Mapping from LangOutput field names to output filenames
FILE_TYPE_MAPPING = {
"captions_vtt_gcs": "captions.vtt",
"sdh_captions_vtt_gcs": "sdh_captions.vtt",
"ad_vtt_gcs": "ad.vtt",
"ad_mp3_gcs": "ad.mp3",
"accessible_video_gcs": "accessible_video.mp4",

View file

@ -204,7 +204,12 @@ async def ingest_and_ai_task_impl(job_id: str):
# Process with Gemini
brand_context = job_doc.get("brand_context")
ai_result = await gemini_service.extract_accessibility(temp_path, brand_context=brand_context)
sdh_requested = job_doc.get("requested_outputs", {}).get("sdh_vtt", False)
ai_result = await gemini_service.extract_accessibility(
temp_path,
brand_context=brand_context,
sdh_requested=sdh_requested
)
# Final safety check for required fields
required_fields = ["captions_vtt", "audio_description_vtt"]
@ -249,6 +254,21 @@ async def ingest_and_ai_task_impl(job_id: str):
f"{job_id}/{source_language}/ad.vtt"
)
# Upload SDH VTT if generated
sdh_gcs_uri = None
if sdh_requested and ai_result.get("sdh_captions_vtt"):
sdh_gcs_uri = await upload_vtt_to_gcs(
ai_result["sdh_captions_vtt"],
f"{job_id}/{source_language}/sdh_captions.vtt"
)
source_lang_output = {
"captions_vtt_gcs": captions_gcs_uri,
"ad_vtt_gcs": ad_gcs_uri,
}
if sdh_gcs_uri:
source_lang_output["sdh_captions_vtt_gcs"] = sdh_gcs_uri
# Update job with AI results, detected language, and outputs
# Set status to TRANSLATING to trigger translation pipeline before QC
await db.jobs.update_one(
@ -260,10 +280,7 @@ async def ingest_and_ai_task_impl(job_id: str):
"source.detected_language": detected_language,
"ai.ingestion_json": ai_result,
"ai.confidence": ai_result["confidence"],
f"outputs.{source_language}": {
"captions_vtt_gcs": captions_gcs_uri,
"ad_vtt_gcs": ad_gcs_uri
},
f"outputs.{source_language}": source_lang_output,
"updated_at": datetime.utcnow()
},
"$push": {

View file

@ -177,6 +177,8 @@ async def _async_translate_and_synthesize(job_id: str):
translation_mode = job_doc["requested_outputs"].get("translation_mode", "traditional")
logger.info(f"Translation mode for job {job_id}: {translation_mode}")
sdh_requested = job_doc["requested_outputs"].get("sdh_vtt", False)
# Get source language VTT content (needed for traditional mode)
source_outputs = job_doc["outputs"].get(source_language)
if not source_outputs:
@ -215,6 +217,12 @@ async def _async_translate_and_synthesize(job_id: str):
source_captions_vtt = captions_blob.download_as_text()
source_ad_vtt = ad_blob.download_as_text()
# Download source SDH VTT for traditional-mode translation
source_sdh_vtt = None
if sdh_requested and source_outputs.get("sdh_captions_vtt_gcs"):
sdh_blob_path = source_outputs["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
source_sdh_vtt = gcs_service.bucket.blob(sdh_blob_path).download_as_text()
try:
# Get target languages (exclude source)
target_languages = [lang for lang in requested_languages if lang != source_language]
@ -239,7 +247,8 @@ async def _async_translate_and_synthesize(job_id: str):
return await gemini_service.extract_accessibility_targeted(
video_local_path,
lang,
brand_context=job_brand_context
brand_context=job_brand_context,
sdh_requested=sdh_requested
)
result = await retry_with_backoff(extract_targeted, max_retries=3)
@ -257,12 +266,20 @@ async def _async_translate_and_synthesize(job_id: str):
f"{job_id}/{lang}/ad.vtt"
)
# Upload SDH VTT if generated
sdh_gcs_uri = None
if sdh_requested and result.get("sdh_captions_vtt"):
sdh_gcs_uri = await upload_vtt_to_gcs(
result["sdh_captions_vtt"],
f"{job_id}/{lang}/sdh_captions.vtt"
)
logger.info(f"Completed video-native translation for {lang}")
return (lang, captions_gcs_uri, ad_gcs_uri, None)
return (lang, captions_gcs_uri, ad_gcs_uri, sdh_gcs_uri, None)
except Exception as e:
logger.error(f"Video-native translation failed for {lang}: {e}")
return (lang, None, None, str(e))
return (lang, None, None, None, str(e))
# Run all translations in parallel (limited by semaphore)
if target_languages:
@ -281,18 +298,21 @@ async def _async_translate_and_synthesize(job_id: str):
"qa_notes": f"Translation failed: {str(result)}"
}
else:
lang, captions_uri, ad_uri, error_msg = result
lang, captions_uri, ad_uri, sdh_uri, error_msg = result
if error_msg:
updated_outputs[lang] = {
"origin": "video_native",
"qa_notes": f"Translation failed: {error_msg}"
}
else:
updated_outputs[lang] = {
lang_out = {
"captions_vtt_gcs": captions_uri,
"ad_vtt_gcs": ad_uri,
"origin": "video_native"
}
if sdh_uri:
lang_out["sdh_captions_vtt_gcs"] = sdh_uri
updated_outputs[lang] = lang_out
logger.info(f"Successfully processed VTT files for language: {lang} (origin: video_native)")
else:
@ -343,12 +363,26 @@ async def _async_translate_and_synthesize(job_id: str):
f"{job_id}/{language}/ad.vtt"
)
# Store language outputs
updated_outputs[language] = {
# Translate and upload SDH VTT if requested
lang_out: dict = {
"captions_vtt_gcs": captions_gcs_uri,
"ad_vtt_gcs": ad_gcs_uri,
"origin": origin
}
if sdh_requested and source_sdh_vtt:
async def translate_sdh():
return await gemini_service.translate_vtt(
source_sdh_vtt, language, source_language=source_language
)
translated_sdh = await retry_with_backoff(translate_sdh, max_retries=3)
sdh_gcs_uri = await upload_vtt_to_gcs(
translated_sdh,
f"{job_id}/{language}/sdh_captions.vtt"
)
lang_out["sdh_captions_vtt_gcs"] = sdh_gcs_uri
# Store language outputs
updated_outputs[language] = lang_out
logger.info(f"Successfully processed VTT files for language: {language} (origin: {origin})")

View file

@ -20,6 +20,7 @@ const jobSchema = z.object({
audio_description_vtt: z.boolean(),
audio_description_mp3: z.boolean(),
accessible_video_mp4: z.boolean(),
sdh_vtt: z.boolean(),
languages: z.array(z.string()),
translation_mode: z.enum(['traditional', 'video_native']),
});
@ -74,6 +75,7 @@ export function NewJob() {
audio_description_vtt: true,
audio_description_mp3: true,
accessible_video_mp4: false,
sdh_vtt: false,
languages: [],
translation_mode: 'video_native',
}
@ -128,6 +130,7 @@ export function NewJob() {
accessible_video_mp4: data.accessible_video_mp4,
accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
languages: data.languages,
sdh_vtt: data.sdh_vtt,
transcreation: [], // Transcreation replaced by video_native translation mode
tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
translation_mode: data.translation_mode,
@ -207,6 +210,7 @@ export function NewJob() {
accessible_video_mp4: data.accessible_video_mp4,
accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
languages: data.languages,
sdh_vtt: data.sdh_vtt,
transcreation: [], // Transcreation replaced by video_native translation mode
tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
translation_mode: data.translation_mode,
@ -252,6 +256,7 @@ export function NewJob() {
accessible_video_mp4: data.accessible_video_mp4,
accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
languages: data.languages,
sdh_vtt: data.sdh_vtt,
transcreation: [], // Transcreation replaced by video_native translation mode
tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
translation_mode: data.translation_mode,
@ -542,6 +547,14 @@ export function NewJob() {
/>
<span>Accessible Video (MP4 with embedded audio descriptions)</span>
</label>
<label className="flex items-center">
<input
type="checkbox"
{...register('sdh_vtt')}
className="mr-2"
/>
<span>SDH Captions (VTT with speaker labels, sound effects &amp; music notation)</span>
</label>
{/* Accessible Video Method Selector - shown when accessible_video_mp4 is checked */}
{accessibleVideoMp4 && (

View file

@ -66,6 +66,7 @@ export interface RequestedOutputs {
audio_description_mp3: boolean;
accessible_video_mp4: boolean; // Rendered video with embedded audio descriptions
accessible_video_method?: AccessibleVideoMethod; // User-selected method for accessible video
sdh_vtt?: boolean; // SDH captions with speaker labels, sound effects, music notation
languages: string[];
transcreation: string[];
tts_preferences?: TTSPreferences;