89 lines
2.7 KiB
Python
89 lines
2.7 KiB
Python
"""
|
|
Descriptive Transcript Service
|
|
|
|
Generates a WCAG-compliant descriptive transcript by merging captions (speech)
|
|
and audio descriptions (visuals) into a single chronological plain text document.
|
|
|
|
Format:
|
|
(SPEECH)
|
|
[MUSIC PLAYING]
|
|
|
|
(DESCRIPTION)
|
|
Jennifer runs across the stage.
|
|
|
|
(SPEECH)
|
|
In sports, there is no substitute for sweat.
|
|
|
|
Reference: WCAG 2.1 Success Criterion 1.2.1
|
|
"""
|
|
from itertools import groupby

from ..core.logging import get_logger
from ..lib.vtt import VTTCue, VTTParser
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
def _parse_cues(vtt: str, label: str) -> "list[VTTCue]":
    """Parse WebVTT content into cues, falling back to no cues on failure.

    Args:
        vtt: WebVTT content. A falsy value (empty string or ``None``) yields
            an empty list without invoking the parser.
        label: Track name inserted into the warning message (e.g. "captions"
            or "AD").

    Returns:
        Whatever ``VTTParser.parse`` returns for ``vtt``, or ``[]`` when the
        input is empty or the parser raises.
    """
    if not vtt:
        return []
    try:
        return VTTParser.parse(vtt)
    except Exception as e:
        # Deliberate best effort: one unparsable track must not prevent a
        # transcript from being built out of the other track.
        logger.warning(f"Failed to parse {label} VTT for transcript: {e}")
        return []


def generate_descriptive_transcript(captions_vtt: str, ad_vtt: str) -> str:
    """
    Merge captions VTT and audio description VTT into a descriptive transcript.

    Cues from both tracks are ordered by start time; when a description and a
    caption start at the same time, the description comes first. Cues whose
    text is empty after stripping are dropped, and consecutive cues of the
    same type are emitted under a single header.

    Args:
        captions_vtt: WebVTT content for captions (speech + non-speech audio)
        ad_vtt: WebVTT content for audio descriptions (visual descriptions)

    Returns:
        Plain text descriptive transcript with (SPEECH) and (DESCRIPTION)
        sections separated by a blank line. Returns an empty string when
        neither track yields any cues. A track that fails to parse is logged
        as a warning and treated as empty.
    """
    caption_cues = _parse_cues(captions_vtt, "captions")
    ad_cues = _parse_cues(ad_vtt, "AD")

    if not caption_cues and not ad_cues:
        return ""

    # Tag each cue with its type so both tracks can be sorted together.
    tagged: list[tuple[str, VTTCue]] = [
        *(("speech", cue) for cue in caption_cues),
        *(("description", cue) for cue in ad_cues),
    ]

    # Chronological by start time; descriptions before captions on a tie.
    tagged.sort(
        key=lambda item: (item[1].start_time, 0 if item[0] == "description" else 1)
    )

    # Drop blank cues *before* grouping so an empty cue of the other type
    # does not split a run of same-type cues.
    entries = [
        (cue_type, text)
        for cue_type, cue in tagged
        if (text := cue.text.strip())
    ]

    # One block per run of consecutive same-type cues.
    blocks: list[str] = []
    for cue_type, run in groupby(entries, key=lambda entry: entry[0]):
        header = "(SPEECH)" if cue_type == "speech" else "(DESCRIPTION)"
        body = "\n".join(text for _, text in run)
        blocks.append(f"{header}\n{body}")

    # Blank line between sections; no leading/trailing whitespace.
    return "\n\n".join(blocks).strip()
|