video-accessibility/backend/app/services/descriptive_transcript.py
Vadym Samoilenko 31199f8705 chore: push all session changes — backend hardening, tests, apache config, deploy scripts
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 15:52:14 +01:00

89 lines
2.7 KiB
Python

"""
Descriptive Transcript Service
Generates a WCAG-compliant descriptive transcript by merging captions (speech)
and audio descriptions (visuals) into a single chronological plain text document.
Format:
(SPEECH)
[MUSIC PLAYING]
(DESCRIPTION)
Jennifer runs across the stage.
(SPEECH)
In sports, there is no substitute for sweat.
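Reference: WCAG 2.1 Success Criteria 1.2.1 (Audio-only and Video-only, Prerecorded)
and 1.2.3 (Audio Description or Media Alternative, Prerecorded)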
"""
from ..core.logging import get_logger
from ..lib.vtt import VTTCue, VTTParser
logger = get_logger(__name__)
def generate_descriptive_transcript(captions_vtt: str, ad_vtt: str) -> str:
    """
    Build a plain text descriptive transcript from two WebVTT tracks.

    Cues from both tracks are interleaved by start time. Runs of consecutive
    cues from the same track are collected under a single "(SPEECH)" or
    "(DESCRIPTION)" header, and sections are separated by one blank line.

    Args:
        captions_vtt: WebVTT content for captions (speech + non-speech audio)
        ad_vtt: WebVTT content for audio descriptions (visual descriptions)

    Returns:
        The transcript text, or an empty string when neither track yields
        any cues. A track that is empty or fails to parse is treated as
        having no cues (a warning is logged on parse failure).
    """
    # Each track is parsed on its own so that a failure in one of them
    # still lets the other contribute to the transcript.
    caption_cues = []
    if captions_vtt:
        try:
            caption_cues = VTTParser.parse(captions_vtt)
        except Exception as e:
            logger.warning(f"Failed to parse captions VTT for transcript: {e}")

    ad_cues = []
    if ad_vtt:
        try:
            ad_cues = VTTParser.parse(ad_vtt)
        except Exception as e:
            logger.warning(f"Failed to parse AD VTT for transcript: {e}")

    if not (caption_cues or ad_cues):
        return ""

    # Label every cue with the track it came from (captions first, then AD).
    labelled: list[tuple[str, VTTCue]] = []
    labelled.extend(("speech", cue) for cue in caption_cues)
    labelled.extend(("description", cue) for cue in ad_cues)

    # Chronological order; when two cues start together the description is
    # placed ahead of the speech. The sort is stable, so cues of the same
    # track with equal start times keep their original relative order.
    tie_rank = {"description": 0, "speech": 1}
    ordered = sorted(
        labelled,
        key=lambda pair: (pair[1].start_time, tie_rank[pair[0]]),
    )

    # Collapse runs of same-track cues into sections. Cues whose text is
    # blank are dropped and do not interrupt the run they fall inside.
    sections: list[tuple[str, list[str]]] = []
    for kind, cue in ordered:
        body = cue.text.strip()
        if not body:
            continue
        if sections and sections[-1][0] == kind:
            sections[-1][1].append(body)
        else:
            sections.append((kind, [body]))

    # Render each section as "<header>\n<cue texts>", one blank line apart.
    rendered: list[str] = []
    for kind, bodies in sections:
        if kind == "speech":
            header = "(SPEECH)"
        else:
            header = "(DESCRIPTION)"
        rendered.append(header + "\n" + "\n".join(bodies))

    return "\n\n".join(rendered).strip()