- Rewrote VTT translation to two-step (text-only → Gemini → apply to original timestamps) preventing caption timing desync - Added polling fallback for all processing states and Safari visibilitychange WebSocket reconnect - Added 11 new TTS languages (cs, da, fi, hu, no, sk, sv, es-419, pt-BR, fr-CA) - Updated caption/AD prompts to DCMP Captioning Key & Description Key standards (line splitting, ♪ music notation, italic tags, caption positioning, ethics guidelines) - Added descriptive transcript generation (WCAG 2.1 §1.2.1) combining captions + AD into plain text - Fixed amix normalize=0 to prevent audio loss in rendered videos - Fixed AD re-timing double-count when source_ms is None - Fixed cue block numbering to be 1-based in VttEditor and Timeline Preview Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
89 lines
2.7 KiB
Python
89 lines
2.7 KiB
Python
"""
|
|
Descriptive Transcript Service
|
|
|
|
Generates a WCAG-compliant descriptive transcript by merging captions (speech)
|
|
and audio descriptions (visuals) into a single chronological plain text document.
|
|
|
|
Format:
|
|
(SPEECH)
|
|
[MUSIC PLAYING]
|
|
|
|
(DESCRIPTION)
|
|
Jennifer runs across the stage.
|
|
|
|
(SPEECH)
|
|
In sports, there is no substitute for sweat.
|
|
|
|
Reference: WCAG 2.1 Success Criterion 1.2.1
|
|
"""
|
|
from ..lib.vtt import VTTCue, VTTParser
|
|
from ..core.logging import get_logger
|
|
|
|
# Module-level logger, created via the project's get_logger helper with this module's name.
logger = get_logger(__name__)
|
|
|
|
|
|
def generate_descriptive_transcript(captions_vtt: str, ad_vtt: str) -> str:
    """
    Build a descriptive transcript from a captions track and an AD track.

    Each WebVTT document is parsed with ``VTTParser.parse``. A document that is
    empty, or whose parsing raises, contributes no cues; the failure is logged
    as a warning rather than propagated. Cues from both tracks are interleaved
    by start time, and every run of consecutive cues from the same track is
    emitted under one ``(SPEECH)`` or ``(DESCRIPTION)`` heading.

    Args:
        captions_vtt: WebVTT content for captions (speech + non-speech audio)
        ad_vtt: WebVTT content for audio descriptions (visual descriptions)

    Returns:
        Plain text transcript whose sections are separated by a blank line,
        or an empty string when neither track yields any cues.
    """
    speech_cues = []
    if captions_vtt:
        try:
            speech_cues = VTTParser.parse(captions_vtt)
        except Exception as e:
            logger.warning(f"Failed to parse captions VTT for transcript: {e}")

    description_cues = []
    if ad_vtt:
        try:
            description_cues = VTTParser.parse(ad_vtt)
        except Exception as e:
            logger.warning(f"Failed to parse AD VTT for transcript: {e}")

    # Nothing usable from either track.
    if not (speech_cues or description_cues):
        return ""

    # Label every cue with its track, then order by start time. Rank 0 places a
    # description ahead of a caption that starts at the same instant.
    labelled = [("speech", cue) for cue in speech_cues]
    labelled.extend(("description", cue) for cue in description_cues)
    timeline = sorted(
        labelled,
        key=lambda entry: (entry[1].start_time, 0 if entry[0] == "description" else 1),
    )

    # Collapse runs of same-track cues into sections. Cues whose text is blank
    # are dropped and do not interrupt the run they fall inside.
    sections: list[tuple[str, list[str]]] = []
    for track, cue in timeline:
        body = cue.text.strip()
        if not body:
            continue
        if sections and sections[-1][0] == track:
            sections[-1][1].append(body)
        else:
            sections.append((track, [body]))

    # Render each section as heading, cue texts (one per line), blank separator.
    rendered: list[str] = []
    for track, bodies in sections:
        if track == "speech":
            heading = "(SPEECH)"
        else:
            heading = "(DESCRIPTION)"
        rendered.extend((heading, "\n".join(bodies), ""))

    # strip() removes the separator left after the final section.
    return "\n".join(rendered).strip()
|