feat: add SDH (Subtitles for Deaf and Hard of Hearing) caption output
SDH captions extend standard VTT with speaker identification labels,
sound effects [PHONE RINGS], music notation ♪, and off-screen indicators.
- Add sdh_vtt flag to RequestedOutputs model and frontend form
- Add sdh_captions_vtt_gcs field to LangOutput model
- Inject SDH generation instructions into both Gemini prompts via
{SDH_FIELD} and {SDH_GUIDELINES} placeholders when requested
- Upload sdh_captions.vtt to GCS in ingest task
- Pass SDH through video_native translation (Gemini generates it directly)
and traditional translation (translate source SDH VTT via Gemini)
- Expose sdh_captions_vtt in downloads endpoint and bulk zip export
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2e8a8dc287
commit
c413fcb747
10 changed files with 128 additions and 17 deletions
|
|
@ -1073,6 +1073,15 @@ async def get_job_downloads(
|
|||
except Exception as e:
|
||||
logger.warning(f"Failed to generate signed URL for accessible video {language}: {e}")
|
||||
|
||||
# SDH Captions VTT
|
||||
if "sdh_captions_vtt_gcs" in lang_output:
|
||||
blob_path = lang_output["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
|
||||
try:
|
||||
signed_url = await get_signed_download_url(blob_path, 24)
|
||||
lang_downloads["sdh_captions_vtt"] = signed_url
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to generate signed URL for SDH captions {language}: {e}")
|
||||
|
||||
# Re-timed Captions VTT (for pause-insert accessible videos)
|
||||
if "retimed_captions_vtt_gcs" in lang_output:
|
||||
blob_path = lang_output["retimed_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
|
||||
|
|
|
|||
|
|
@ -62,6 +62,7 @@ class RequestedOutputs(BaseModel):
|
|||
audio_description_mp3: bool = True
|
||||
accessible_video_mp4: bool = False # Rendered video with embedded audio descriptions
|
||||
accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None # User-selected method
|
||||
sdh_vtt: bool = False # SDH (Subtitles for Deaf and Hard of Hearing) captions with speaker labels, sound effects, music notation
|
||||
languages: list[str] = []
|
||||
transcreation: list[str] = []
|
||||
tts_preferences: Optional[TTSPreferences] = None
|
||||
|
|
@ -109,6 +110,7 @@ class AccessibleVideoEditState(BaseModel):
|
|||
|
||||
class LangOutput(BaseModel):
|
||||
captions_vtt_gcs: Optional[str] = None
|
||||
sdh_captions_vtt_gcs: Optional[str] = None # SDH-format captions (speaker labels, sound effects, music)
|
||||
ad_vtt_gcs: Optional[str] = None
|
||||
ad_mp3_gcs: Optional[str] = None
|
||||
# Accessible video outputs
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ You are given a video. Return a JSON object with:
|
|||
- transcript_plaintext: full spoken words, punctuated (in the detected language)
|
||||
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (in the detected language)
|
||||
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (MUST be written in the detected language)
|
||||
{SDH_FIELD}
|
||||
|
||||
CRITICAL LANGUAGE REQUIREMENT:
|
||||
- First, detect the language spoken in the video
|
||||
|
|
@ -69,6 +70,8 @@ Rules:
|
|||
- Be succinct — omit redundant or self-evident details
|
||||
- Do NOT duplicate information already in the spoken dialogue
|
||||
|
||||
{SDH_GUIDELINES}
|
||||
|
||||
CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.
|
||||
|
||||
Example output format:
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ You are given a video. Return a JSON object with:
|
|||
- transcript_plaintext: full spoken words, punctuated, translated/written in {TARGET_LANGUAGE}
|
||||
- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (written in {TARGET_LANGUAGE})
|
||||
- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (written in {TARGET_LANGUAGE})
|
||||
{SDH_FIELD}
|
||||
|
||||
TARGET LANGUAGE: {TARGET_LANGUAGE}
|
||||
|
||||
|
|
@ -74,6 +75,8 @@ Rules:
|
|||
- Do NOT duplicate information already in the spoken dialogue
|
||||
- Write all descriptions in natural, fluent {TARGET_LANGUAGE}
|
||||
|
||||
{SDH_GUIDELINES}
|
||||
|
||||
CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text.
|
||||
|
||||
Example output format (if TARGET_LANGUAGE were Spanish):
|
||||
|
|
|
|||
|
|
@ -59,6 +59,24 @@ class GeminiService:
|
|||
logger.error(f"File {file_name} did not become ACTIVE within {max_wait_seconds}s")
|
||||
return False
|
||||
|
||||
def _build_sdh_field(self, sdh_requested: bool) -> str:
|
||||
if sdh_requested:
|
||||
return "- sdh_captions_vtt: a valid WebVTT file as a single string, containing SDH-format captions (same timing as captions_vtt, but enriched with speaker labels, sound effects, and music notation)"
|
||||
return ""
|
||||
|
||||
def _build_sdh_guidelines(self, sdh_requested: bool) -> str:
|
||||
if not sdh_requested:
|
||||
return ""
|
||||
return """SDH (SUBTITLES FOR THE DEAF AND HARD OF HEARING) GUIDELINES:
|
||||
Generate sdh_captions_vtt using the same cue timings as captions_vtt, enriched with:
|
||||
- Speaker identification when multiple speakers are present: use "NAME:" prefix (e.g., "JOHN: Hello there") or "[NARRATOR]" for narration
|
||||
- Non-speech sounds that are plot-relevant, in square brackets: [DOOR SLAMS], [PHONE RINGS], [CROWD CHEERING], [THUNDER]
|
||||
- Music: use ♪ for background music cues (e.g., "♪ tense music ♪") or ♪ around sung lyrics
|
||||
- Off-screen or voice-over speakers: indicate with "(off-screen)" or "[V.O.]" where relevant
|
||||
- Non-speech vocalisations when relevant: [SIGHS], [LAUGHS], [SCREAMS]
|
||||
- Maintain the same timestamp format as captions_vtt (HH:MM:SS.mmm --> HH:MM:SS.mmm)
|
||||
- Only add sound effect cues where they add meaningful context; do not annotate every minor sound"""
|
||||
|
||||
def _build_brand_context_block(self, brand_context: Optional[str]) -> str:
|
||||
"""Build the brand context instruction block for injection into prompts."""
|
||||
if brand_context and brand_context.strip():
|
||||
|
|
@ -71,13 +89,18 @@ class GeminiService:
|
|||
)
|
||||
return "No specific brand names have been provided for this video."
|
||||
|
||||
async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None) -> dict[str, Any]:
|
||||
async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None, sdh_requested: bool = False) -> dict[str, Any]:
|
||||
"""
|
||||
Extract captions and audio descriptions from video using Gemini 2.0
|
||||
Returns structured JSON with transcript, captions VTT, and audio description VTT
|
||||
"""
|
||||
prompt_template = self._load_prompt("gemini_ingestion.md")
|
||||
prompt = prompt_template.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
|
||||
prompt = (
|
||||
prompt_template
|
||||
.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
|
||||
.replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
|
||||
.replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
|
||||
)
|
||||
uploaded_file = None
|
||||
|
||||
try:
|
||||
|
|
@ -258,7 +281,8 @@ Fix the JSON and return it:
|
|||
self,
|
||||
video_file_path: str,
|
||||
target_language: str,
|
||||
brand_context: Optional[str] = None
|
||||
brand_context: Optional[str] = None,
|
||||
sdh_requested: bool = False
|
||||
) -> dict[str, Any]:
|
||||
"""
|
||||
Extract captions and audio descriptions from video using Gemini,
|
||||
|
|
@ -279,8 +303,12 @@ Fix the JSON and return it:
|
|||
all in the target language
|
||||
"""
|
||||
prompt_template = self._load_prompt("gemini_ingestion_targeted.md")
|
||||
prompt = prompt_template.replace("{TARGET_LANGUAGE}", target_language).replace(
|
||||
"{BRAND_CONTEXT}", self._build_brand_context_block(brand_context)
|
||||
prompt = (
|
||||
prompt_template
|
||||
.replace("{TARGET_LANGUAGE}", target_language)
|
||||
.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context))
|
||||
.replace("{SDH_FIELD}", self._build_sdh_field(sdh_requested))
|
||||
.replace("{SDH_GUIDELINES}", self._build_sdh_guidelines(sdh_requested))
|
||||
)
|
||||
uploaded_file = None
|
||||
|
||||
|
|
|
|||
|
|
@ -43,6 +43,7 @@ def sanitize_filename(name: str, max_length: int = 50) -> str:
|
|||
# Mapping from LangOutput field names to output filenames
|
||||
FILE_TYPE_MAPPING = {
|
||||
"captions_vtt_gcs": "captions.vtt",
|
||||
"sdh_captions_vtt_gcs": "sdh_captions.vtt",
|
||||
"ad_vtt_gcs": "ad.vtt",
|
||||
"ad_mp3_gcs": "ad.mp3",
|
||||
"accessible_video_gcs": "accessible_video.mp4",
|
||||
|
|
|
|||
|
|
@ -204,7 +204,12 @@ async def ingest_and_ai_task_impl(job_id: str):
|
|||
|
||||
# Process with Gemini
|
||||
brand_context = job_doc.get("brand_context")
|
||||
ai_result = await gemini_service.extract_accessibility(temp_path, brand_context=brand_context)
|
||||
sdh_requested = job_doc.get("requested_outputs", {}).get("sdh_vtt", False)
|
||||
ai_result = await gemini_service.extract_accessibility(
|
||||
temp_path,
|
||||
brand_context=brand_context,
|
||||
sdh_requested=sdh_requested
|
||||
)
|
||||
|
||||
# Final safety check for required fields
|
||||
required_fields = ["captions_vtt", "audio_description_vtt"]
|
||||
|
|
@ -249,6 +254,21 @@ async def ingest_and_ai_task_impl(job_id: str):
|
|||
f"{job_id}/{source_language}/ad.vtt"
|
||||
)
|
||||
|
||||
# Upload SDH VTT if generated
|
||||
sdh_gcs_uri = None
|
||||
if sdh_requested and ai_result.get("sdh_captions_vtt"):
|
||||
sdh_gcs_uri = await upload_vtt_to_gcs(
|
||||
ai_result["sdh_captions_vtt"],
|
||||
f"{job_id}/{source_language}/sdh_captions.vtt"
|
||||
)
|
||||
|
||||
source_lang_output = {
|
||||
"captions_vtt_gcs": captions_gcs_uri,
|
||||
"ad_vtt_gcs": ad_gcs_uri,
|
||||
}
|
||||
if sdh_gcs_uri:
|
||||
source_lang_output["sdh_captions_vtt_gcs"] = sdh_gcs_uri
|
||||
|
||||
# Update job with AI results, detected language, and outputs
|
||||
# Set status to TRANSLATING to trigger translation pipeline before QC
|
||||
await db.jobs.update_one(
|
||||
|
|
@ -260,10 +280,7 @@ async def ingest_and_ai_task_impl(job_id: str):
|
|||
"source.detected_language": detected_language,
|
||||
"ai.ingestion_json": ai_result,
|
||||
"ai.confidence": ai_result["confidence"],
|
||||
f"outputs.{source_language}": {
|
||||
"captions_vtt_gcs": captions_gcs_uri,
|
||||
"ad_vtt_gcs": ad_gcs_uri
|
||||
},
|
||||
f"outputs.{source_language}": source_lang_output,
|
||||
"updated_at": datetime.utcnow()
|
||||
},
|
||||
"$push": {
|
||||
|
|
|
|||
|
|
@ -177,6 +177,8 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
translation_mode = job_doc["requested_outputs"].get("translation_mode", "traditional")
|
||||
logger.info(f"Translation mode for job {job_id}: {translation_mode}")
|
||||
|
||||
sdh_requested = job_doc["requested_outputs"].get("sdh_vtt", False)
|
||||
|
||||
# Get source language VTT content (needed for traditional mode)
|
||||
source_outputs = job_doc["outputs"].get(source_language)
|
||||
if not source_outputs:
|
||||
|
|
@ -215,6 +217,12 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
source_captions_vtt = captions_blob.download_as_text()
|
||||
source_ad_vtt = ad_blob.download_as_text()
|
||||
|
||||
# Download source SDH VTT for traditional-mode translation
|
||||
source_sdh_vtt = None
|
||||
if sdh_requested and source_outputs.get("sdh_captions_vtt_gcs"):
|
||||
sdh_blob_path = source_outputs["sdh_captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "")
|
||||
source_sdh_vtt = gcs_service.bucket.blob(sdh_blob_path).download_as_text()
|
||||
|
||||
try:
|
||||
# Get target languages (exclude source)
|
||||
target_languages = [lang for lang in requested_languages if lang != source_language]
|
||||
|
|
@ -239,7 +247,8 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
return await gemini_service.extract_accessibility_targeted(
|
||||
video_local_path,
|
||||
lang,
|
||||
brand_context=job_brand_context
|
||||
brand_context=job_brand_context,
|
||||
sdh_requested=sdh_requested
|
||||
)
|
||||
|
||||
result = await retry_with_backoff(extract_targeted, max_retries=3)
|
||||
|
|
@ -257,12 +266,20 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
f"{job_id}/{lang}/ad.vtt"
|
||||
)
|
||||
|
||||
# Upload SDH VTT if generated
|
||||
sdh_gcs_uri = None
|
||||
if sdh_requested and result.get("sdh_captions_vtt"):
|
||||
sdh_gcs_uri = await upload_vtt_to_gcs(
|
||||
result["sdh_captions_vtt"],
|
||||
f"{job_id}/{lang}/sdh_captions.vtt"
|
||||
)
|
||||
|
||||
logger.info(f"Completed video-native translation for {lang}")
|
||||
return (lang, captions_gcs_uri, ad_gcs_uri, None)
|
||||
return (lang, captions_gcs_uri, ad_gcs_uri, sdh_gcs_uri, None)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Video-native translation failed for {lang}: {e}")
|
||||
return (lang, None, None, str(e))
|
||||
return (lang, None, None, None, str(e))
|
||||
|
||||
# Run all translations in parallel (limited by semaphore)
|
||||
if target_languages:
|
||||
|
|
@ -281,18 +298,21 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
"qa_notes": f"Translation failed: {str(result)}"
|
||||
}
|
||||
else:
|
||||
lang, captions_uri, ad_uri, error_msg = result
|
||||
lang, captions_uri, ad_uri, sdh_uri, error_msg = result
|
||||
if error_msg:
|
||||
updated_outputs[lang] = {
|
||||
"origin": "video_native",
|
||||
"qa_notes": f"Translation failed: {error_msg}"
|
||||
}
|
||||
else:
|
||||
updated_outputs[lang] = {
|
||||
lang_out = {
|
||||
"captions_vtt_gcs": captions_uri,
|
||||
"ad_vtt_gcs": ad_uri,
|
||||
"origin": "video_native"
|
||||
}
|
||||
if sdh_uri:
|
||||
lang_out["sdh_captions_vtt_gcs"] = sdh_uri
|
||||
updated_outputs[lang] = lang_out
|
||||
logger.info(f"Successfully processed VTT files for language: {lang} (origin: video_native)")
|
||||
|
||||
else:
|
||||
|
|
@ -343,12 +363,26 @@ async def _async_translate_and_synthesize(job_id: str):
|
|||
f"{job_id}/{language}/ad.vtt"
|
||||
)
|
||||
|
||||
# Store language outputs
|
||||
updated_outputs[language] = {
|
||||
# Translate and upload SDH VTT if requested
|
||||
lang_out: dict = {
|
||||
"captions_vtt_gcs": captions_gcs_uri,
|
||||
"ad_vtt_gcs": ad_gcs_uri,
|
||||
"origin": origin
|
||||
}
|
||||
if sdh_requested and source_sdh_vtt:
|
||||
async def translate_sdh():
|
||||
return await gemini_service.translate_vtt(
|
||||
source_sdh_vtt, language, source_language=source_language
|
||||
)
|
||||
translated_sdh = await retry_with_backoff(translate_sdh, max_retries=3)
|
||||
sdh_gcs_uri = await upload_vtt_to_gcs(
|
||||
translated_sdh,
|
||||
f"{job_id}/{language}/sdh_captions.vtt"
|
||||
)
|
||||
lang_out["sdh_captions_vtt_gcs"] = sdh_gcs_uri
|
||||
|
||||
# Store language outputs
|
||||
updated_outputs[language] = lang_out
|
||||
|
||||
logger.info(f"Successfully processed VTT files for language: {language} (origin: {origin})")
|
||||
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ const jobSchema = z.object({
|
|||
audio_description_vtt: z.boolean(),
|
||||
audio_description_mp3: z.boolean(),
|
||||
accessible_video_mp4: z.boolean(),
|
||||
sdh_vtt: z.boolean(),
|
||||
languages: z.array(z.string()),
|
||||
translation_mode: z.enum(['traditional', 'video_native']),
|
||||
});
|
||||
|
|
@ -74,6 +75,7 @@ export function NewJob() {
|
|||
audio_description_vtt: true,
|
||||
audio_description_mp3: true,
|
||||
accessible_video_mp4: false,
|
||||
sdh_vtt: false,
|
||||
languages: [],
|
||||
translation_mode: 'video_native',
|
||||
}
|
||||
|
|
@ -128,6 +130,7 @@ export function NewJob() {
|
|||
accessible_video_mp4: data.accessible_video_mp4,
|
||||
accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
|
||||
languages: data.languages,
|
||||
sdh_vtt: data.sdh_vtt,
|
||||
transcreation: [], // Transcreation replaced by video_native translation mode
|
||||
tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
|
||||
translation_mode: data.translation_mode,
|
||||
|
|
@ -207,6 +210,7 @@ export function NewJob() {
|
|||
accessible_video_mp4: data.accessible_video_mp4,
|
||||
accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
|
||||
languages: data.languages,
|
||||
sdh_vtt: data.sdh_vtt,
|
||||
transcreation: [], // Transcreation replaced by video_native translation mode
|
||||
tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
|
||||
translation_mode: data.translation_mode,
|
||||
|
|
@ -252,6 +256,7 @@ export function NewJob() {
|
|||
accessible_video_mp4: data.accessible_video_mp4,
|
||||
accessible_video_method: data.accessible_video_mp4 ? accessibleVideoMethod : undefined,
|
||||
languages: data.languages,
|
||||
sdh_vtt: data.sdh_vtt,
|
||||
transcreation: [], // Transcreation replaced by video_native translation mode
|
||||
tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined,
|
||||
translation_mode: data.translation_mode,
|
||||
|
|
@ -542,6 +547,14 @@ export function NewJob() {
|
|||
/>
|
||||
<span>Accessible Video (MP4 with embedded audio descriptions)</span>
|
||||
</label>
|
||||
<label className="flex items-center">
|
||||
<input
|
||||
type="checkbox"
|
||||
{...register('sdh_vtt')}
|
||||
className="mr-2"
|
||||
/>
|
||||
<span>SDH Captions (VTT with speaker labels, sound effects & music notation)</span>
|
||||
</label>
|
||||
|
||||
{/* Accessible Video Method Selector - shown when accessible_video_mp4 is checked */}
|
||||
{accessibleVideoMp4 && (
|
||||
|
|
|
|||
|
|
@ -66,6 +66,7 @@ export interface RequestedOutputs {
|
|||
audio_description_mp3: boolean;
|
||||
accessible_video_mp4: boolean; // Rendered video with embedded audio descriptions
|
||||
accessible_video_method?: AccessibleVideoMethod; // User-selected method for accessible video
|
||||
sdh_vtt?: boolean; // SDH captions with speaker labels, sound effects, music notation
|
||||
languages: string[];
|
||||
transcreation: string[];
|
||||
tts_preferences?: TTSPreferences;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue