diff --git a/backend/app/api/v1/routes_jobs.py b/backend/app/api/v1/routes_jobs.py
index dce4018..fd93523 100644
--- a/backend/app/api/v1/routes_jobs.py
+++ b/backend/app/api/v1/routes_jobs.py
@@ -1073,6 +1073,15 @@ async def get_job_downloads(
except Exception as e:
logger.warning(f"Failed to generate signed URL for accessible video {language}: {e}")
+ # SDH Captions VTT
+ if "sdh_captions_vtt_gcs" in lang_output:
+ blob_path = lang_output["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
+ try:
+ signed_url = await get_signed_download_url(blob_path, 24)
+ lang_downloads["sdh_captions_vtt"] = signed_url
+ except Exception as e:
+ logger.warning(f"Failed to generate signed URL for SDH captions {language}: {e}")
+
# Re-timed Captions VTT (for pause-insert accessible videos)
if "retimed_captions_vtt_gcs" in lang_output:
blob_path = lang_output["retimed_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
diff --git a/backend/app/models/job.py b/backend/app/models/job.py
index 8431b1a..822e6ed 100644
--- a/backend/app/models/job.py
+++ b/backend/app/models/job.py
@@ -62,6 +62,7 @@ class RequestedOutputs(BaseModel):
audio_description_mp3: bool = True
accessible_video_mp4: bool = False # Rendered video with embedded audio descriptions
accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None # User-selected method
+ sdh_vtt: bool = False # SDH (Subtitles for Deaf and Hard of Hearing) captions with speaker labels, sound effects, music notation
languages: list[str] = []
transcreation: list[str] = []
tts_preferences: Optional[TTSPreferences] = None
@@ -109,6 +110,7 @@ class AccessibleVideoEditState(BaseModel):
class LangOutput(BaseModel):
captions_vtt_gcs: Optional[str] = None
+ sdh_captions_vtt_gcs: Optional[str] = None # SDH-format captions (speaker labels, sound effects, music)
ad_vtt_gcs: Optional[str] = None
ad_mp3_gcs: Optional[str] = None
# Accessible video outputs
diff --git a/backend/app/prompts/gemini_ingestion.md b/backend/app/prompts/gemini_ingestion.md
index 3fe62bf..4d504c0 100644
--- a/backend/app/prompts/gemini_ingestion.md
+++ b/backend/app/prompts/gemini_ingestion.md
@@ -9,6 +9,7 @@ You are given a video. Return a JSON object with:
- transcript_plaintext: full spoken words, punctuated (in the detected language)
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (in the detected language)
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (MUST be written in the detected language)
+{SDH_FIELD}
CRITICAL LANGUAGE REQUIREMENT:
- First, detect the language spoken in the video
@@ -69,6 +70,8 @@ Rules:
- Be succinct — omit redundant or self-evident details
- Do NOT duplicate information already in the spoken dialogue
+{SDH_GUIDELINES}
+
CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.
Example output format:
diff --git a/backend/app/prompts/gemini_ingestion_targeted.md b/backend/app/prompts/gemini_ingestion_targeted.md
index 39a47d5..361adf6 100644
--- a/backend/app/prompts/gemini_ingestion_targeted.md
+++ b/backend/app/prompts/gemini_ingestion_targeted.md
@@ -9,6 +9,7 @@ You are given a video. Return a JSON object with:
- transcript_plaintext: full spoken words, punctuated, translated/written in {TARGET_LANGUAGE}
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (written in {TARGET_LANGUAGE})
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (written in {TARGET_LANGUAGE})
+{SDH_FIELD}
TARGET LANGUAGE: {TARGET_LANGUAGE}
@@ -74,6 +75,8 @@ Rules:
- Do NOT duplicate information already in the spoken dialogue
- Write all descriptions in natural, fluent {TARGET_LANGUAGE}
+{SDH_GUIDELINES}
+
CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.
Example output format (if TARGET_LANGUAGE were Spanish):
diff --git a/backend/app/services/gemini.py b/backend/app/services/gemini.py
index 85a85d7..5b1067c 100644
--- a/backend/app/services/gemini.py
+++ b/backend/app/services/gemini.py
@@ -59,6 +59,24 @@ class GeminiService:
logger.error(f"File {file_name} did not become ACTIVE within {max_wait_seconds}s")
return False
+    def _build_sdh_field(self, sdh_requested: bool) -> str:
+        """Build the output-field bullet that replaces ``{SDH_FIELD}`` in the prompts.
+
+        Args:
+            sdh_requested: Whether SDH captions were requested for this job.
+
+        Returns:
+            A single-line bullet describing the ``sdh_captions_vtt`` field, or an
+            empty string when SDH captions were not requested.
+        """
+        if not sdh_requested:
+            return ""
+        # The other field bullets in both prompt templates state their output
+        # language; anchoring SDH to captions_vtt keeps this one bullet correct
+        # for the detected-language and the targeted-language prompt alike.
+        return "- sdh_captions_vtt: a valid WebVTT file as a single string, containing SDH-format captions (same timing and same language as captions_vtt, but enriched with speaker labels, sound effects, and music notation)"
+
+    def _build_sdh_guidelines(self, sdh_requested: bool) -> str:
+        """Return the SDH guideline section that replaces ``{SDH_GUIDELINES}``.
+
+        Yields the full guideline text when ``sdh_requested`` is true and an
+        empty string otherwise, so the placeholder disappears from the prompt.
+        """
+        guidelines = """SDH (SUBTITLES FOR THE DEAF AND HARD OF HEARING) GUIDELINES:
+Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched with:
+- Speaker identification when multiple speakers are present: use "NAME:" prefix (e.g., "JOHN: Hello there") or "[NARRATOR]" for narration
+- Non-speech sounds that are plot-relevant, in square brackets: [DOOR SLAMS], [PHONE RINGS], [CROWD CHEERING], [THUNDER]
+- Music: use ♪ for background music cues (e.g., "♪ tense music ♪") or ♪ around sung lyrics
+- Off-screen or voice-over speakers: indicate with "(off-screen)" or "[V.O.]" where relevant
+- Non-speech vocalisations when relevant: [SIGHS], [LAUGHS], [SCREAMS]
+- Maintain the same timestamp format as captions_vtt (HH:MM:SS.mmm --> HH:MM:SS.mmm)
+- Only add sound effect cues where they add meaningful context; do not annotate every minor sound"""
+        return guidelines if sdh_requested else ""
+
def _build_brand_context_block(self, brand_context: Optional[str]) -> str:
"""Build the brand context instruction block for injection into prompts."""
if brand_context and brand_context.strip():
@@ -71,13 +89,18 @@ class GeminiService:
)
return "No specific brand names have been provided for this video."
- async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None) -> dict[str, Any]:
+ async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False) -> dict[str, Any]:
"""
Extract captions and audio descriptions from video using Gemini 2.0
Returns structured JSON with transcript, captions VTT, and audio description VTT
"""
prompt_template = self._load_prompt("gemini_ingestion.md")
- prompt = prompt_template.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
+ prompt = (
+ prompt_template
+ .replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
+ .replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
+ .replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
+ )
uploaded_file = None
try:
@@ -258,7 +281,8 @@ Fix the JSON and return it:
self,
video_file_path: str,
target_language: str,
- brand_context: Optional[str] = None
+ brand_context: Optional[str] = None,
+ sdh_requested: bool = False
) -> dict[str, Any]:
"""
Extract captions and audio descriptions from video using Gemini,
@@ -279,8 +303,12 @@ Fix the JSON and return it:
all in the target language
"""
prompt_template = self._load_prompt("gemini_ingestion_targeted.md")
- prompt = prompt_template.replace("{TARGET_LANGUAGE}", target_language).replace(
- "{BRAND_CONTEXT}", self._build_brand_context_block(brand_context)
+ prompt = (
+ prompt_template
+ .replace("{TARGET_LANGUAGE}", target_language)
+ .replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
+ .replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
+ .replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
)
uploaded_file = None
diff --git a/backend/app/services/zip_download.py b/backend/app/services/zip_download.py
index 9b32e36..fa373e9 100644
--- a/backend/app/services/zip_download.py
+++ b/backend/app/services/zip_download.py
@@ -43,6 +43,7 @@ def sanitize_filename(name: str, max_length: int = 50) -> str:
# Mapping from LangOutput field names to output filenames
FILE_TYPE_MAPPING = {
"captions_vtt_gcs": "captions.vtt",
+ "sdh_captions_vtt_gcs": "sdh_captions.vtt",
"ad_vtt_gcs": "ad.vtt",
"ad_mp3_gcs": "ad.mp3",
"accessible_video_gcs": "accessible_video.mp4",
diff --git a/backend/app/tasks/ingest_and_ai.py b/backend/app/tasks/ingest_and_ai.py
index 2691116..a728eb6 100644
--- a/backend/app/tasks/ingest_and_ai.py
+++ b/backend/app/tasks/ingest_and_ai.py
@@ -204,7 +204,12 @@ async def ingest_and_ai_task_impl(job_id: str):
# Process with Gemini
brand_context = job_doc.get("brand_context")
- ai_result = await gemini_service.extract_accessibility(temp_path, brand_context=brand_context)
+ sdh_requested = job_doc.get("requested_outputs", {}).get("sdh_vtt", False)
+ ai_result = await gemini_service.extract_accessibility(
+ temp_path,
+ brand_context=brand_context,
+ sdh_requested=sdh_requested
+ )
# Final safety check for required fields
required_fields = ["captions_vtt", "audio_description_vtt"]
@@ -249,6 +254,21 @@ async def ingest_and_ai_task_impl(job_id: str):
f"{job_id}/{source_language}/ad.vtt"
)
+ # Upload SDH VTT if generated
+ sdh_gcs_uri = None
+ if sdh_requested and ai_result.get("sdh_captions_vtt"):
+ sdh_gcs_uri = await upload_vtt_to_gcs(
+ ai_result["sdh_captions_vtt"],
+ f"{job_id}/{source_language}/sdh_captions.vtt"
+ )
+
+ source_lang_output = {
+ "captions_vtt_gcs": captions_gcs_uri,
+ "ad_vtt_gcs": ad_gcs_uri,
+ }
+ if sdh_gcs_uri:
+ source_lang_output["sdh_captions_vtt_gcs"] = sdh_gcs_uri
+
# Update job with AI results, detected language, and outputs
# Set status to TRANSLATING to trigger translation pipeline before QC
await db.jobs.update_one(
@@ -260,10 +280,7 @@ async def ingest_and_ai_task_impl(job_id: str):
"source.detected_language": detected_language,
"ai.ingestion_json": ai_result,
"ai.confidence": ai_result["confidence"],
- f"outputs.{source_language}": {
- "captions_vtt_gcs": captions_gcs_uri,
- "ad_vtt_gcs": ad_gcs_uri
- },
+ f"outputs.{source_language}": source_lang_output,
"updated_at": datetime.utcnow()
},
"$push": {
diff --git a/backend/app/tasks/translate_and_synthesize.py b/backend/app/tasks/translate_and_synthesize.py
index eb9bd44..c94330c 100644
--- a/backend/app/tasks/translate_and_synthesize.py
+++ b/backend/app/tasks/translate_and_synthesize.py
@@ -177,6 +177,8 @@ async def _async_translate_and_synthesize(job_id: str):
translation_mode = job_doc["requested_outputs"].get("translation_mode", "traditional")
logger.info(f"Translation mode for job {job_id}: {translation_mode}")
+ sdh_requested = job_doc["requested_outputs"].get("sdh_vtt", False)
+
# Get source language VTT content (needed for traditional mode)
source_outputs = job_doc["outputs"].get(source_language)
if not source_outputs:
@@ -215,6 +217,12 @@ async def _async_translate_and_synthesize(job_id: str):
source_captions_vtt = captions_blob.download_as_text()
source_ad_vtt = ad_blob.download_as_text()
+ # Download source SDH VTT for traditional-mode translation
+ source_sdh_vtt = None
+ if sdh_requested and source_outputs.get("sdh_captions_vtt_gcs"):
+ sdh_blob_path = source_outputs["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
+ source_sdh_vtt = gcs_service.bucket.blob(sdh_blob_path).download_as_text()
+
try:
# Get target languages (exclude source)
target_languages = [lang for lang in requested_languages if lang != source_language]
@@ -239,7 +247,8 @@ async def _async_translate_and_synthesize(job_id: str):
return await gemini_service.extract_accessibility_targeted(
video_local_path,
lang,
- brand_context=job_brand_context
+ brand_context=job_brand_context,
+ sdh_requested=sdh_requested
)
result = await retry_with_backoff(extract_targeted, max_retries=3)
@@ -257,12 +266,20 @@ async def _async_translate_and_synthesize(job_id: str):
f"{job_id}/{lang}/ad.vtt"
)
+ # Upload SDH VTT if generated
+ sdh_gcs_uri = None
+ if sdh_requested and result.get("sdh_captions_vtt"):
+ sdh_gcs_uri = await upload_vtt_to_gcs(
+ result["sdh_captions_vtt"],
+ f"{job_id}/{lang}/sdh_captions.vtt"
+ )
+
logger.info(f"Completed video-native translation for {lang}")
- return (lang, captions_gcs_uri, ad_gcs_uri, None)
+ return (lang, captions_gcs_uri, ad_gcs_uri, sdh_gcs_uri, None)
except Exception as e:
logger.error(f"Video-native translation failed for {lang}: {e}")
- return (lang, None, None, str(e))
+ return (lang, None, None, None, str(e))
# Run all translations in parallel (limited by semaphore)
if target_languages:
@@ -281,18 +298,21 @@ async def _async_translate_and_synthesize(job_id: str):
"qa_notes": f"Translation failed: {str(result)}"
}
else:
- lang, captions_uri, ad_uri, error_msg = result
+ lang, captions_uri, ad_uri, sdh_uri, error_msg = result
if error_msg:
updated_outputs[lang] = {
"origin": "video_native",
"qa_notes": f"Translation failed: {error_msg}"
}
else:
- updated_outputs[lang] = {
+ lang_out = {
"captions_vtt_gcs": captions_uri,
"ad_vtt_gcs": ad_uri,
"origin": "video_native"
}
+ if sdh_uri:
+ lang_out["sdh_captions_vtt_gcs"] = sdh_uri
+ updated_outputs[lang] = lang_out
logger.info(f"Successfully processed VTT files for language: {lang} (origin: video_native)")
else:
@@ -343,12 +363,26 @@ async def _async_translate_and_synthesize(job_id: str):
f"{job_id}/{language}/ad.vtt"
)
- # Store language outputs
- updated_outputs[language] = {
+ # Build the base language outputs; the SDH VTT is translated and added below if requested
+ lang_out: dict = {
"captions_vtt_gcs": captions_gcs_uri,
"ad_vtt_gcs": ad_gcs_uri,
"origin": origin
}
+ if sdh_requested and source_sdh_vtt:
+ async def translate_sdh():
+ return await gemini_service.translate_vtt(
+ source_sdh_vtt, language, source_language=source_language
+ )
+ translated_sdh = await retry_with_backoff(translate_sdh, max_retries=3)
+ sdh_gcs_uri = await upload_vtt_to_gcs(
+ translated_sdh,
+ f"{job_id}/{language}/sdh_captions.vtt"
+ )
+ lang_out["sdh_captions_vtt_gcs"] = sdh_gcs_uri
+
+ # Store language outputs
+ updated_outputs[language] = lang_out
logger.info(f"Successfully processed VTT files for language: {language} (origin: {origin})")
diff --git a/frontend/src/routes/jobs/NewJob.tsx b/frontend/src/routes/jobs/NewJob.tsx
index a8ca812..8844f9e 100644
--- a/frontend/src/routes/jobs/NewJob.tsx
+++ b/frontend/src/routes/jobs/NewJob.tsx
@@ -20,6 +20,7 @@ const jobSchema = z.object({
audio_description_vtt: z.boolean(),
audio_description_mp3: z.boolean(),
accessible_video_mp4: z.boolean(),
+ sdh_vtt: z.boolean(),
languages: z.array(z.string()),
translation_mode: z.enum(['traditional', 'video_native']),
});
@@ -74,6 +75,7 @@ export function NewJob() {
audio_description_vtt: true,
audio_description_mp3: true,
accessible_video_mp4: false,
+ sdh_vtt: false,
languages: [],
translation_mode: 'video_native',
}
@@ -128,6 +130,7 @@ export function NewJob() {
accessible_video_mp4: data.accessible_video_mp4,
accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
languages: data.languages,
+ sdh_vtt: data.sdh_vtt,
transcreation: [], // Transcreation replaced by video_native translation mode
tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
translation_mode: data.translation_mode,
@@ -207,6 +210,7 @@ export function NewJob() {
accessible_video_mp4: data.accessible_video_mp4,
accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
languages: data.languages,
+ sdh_vtt: data.sdh_vtt,
transcreation: [], // Transcreation replaced by video_native translation mode
tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
translation_mode: data.translation_mode,
@@ -252,6 +256,7 @@ export function NewJob() {
accessible_video_mp4: data.accessible_video_mp4,
accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
languages: data.languages,
+ sdh_vtt: data.sdh_vtt,
transcreation: [], // Transcreation replaced by video_native translation mode
tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
translation_mode: data.translation_mode,
@@ -542,6 +547,14 @@ export function NewJob() {
/>
Accessible Video (MP4 with embedded audio descriptions)
+
{/* Accessible Video Method Selector - shown when accessible_video_mp4 is checked */}
{accessibleVideoMp4 && (
diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts
index 9680aa5..e8d11f2 100644
--- a/frontend/src/types/api.ts
+++ b/frontend/src/types/api.ts
@@ -66,6 +66,7 @@ export interface RequestedOutputs {
audio_description_mp3: boolean;
accessible_video_mp4: boolean; // Rendered video with embedded audio descriptions
accessible_video_method?: AccessibleVideoMethod; // User-selected method for accessible video
+ sdh_vtt?: boolean; // SDH captions with speaker labels, sound effects, music notation
languages: string[];
transcreation: string[];
tts_preferences?: TTSPreferences;