""" Descriptive Transcript Service Generates a WCAG-compliant descriptive transcript by merging captions (speech) and audio descriptions (visuals) into a single chronological plain text document. Format: (SPEECH) [MUSIC PLAYING] (DESCRIPTION) Jennifer runs across the stage. (SPEECH) In sports, there is no substitute for sweat. Reference: WCAG 2.1 Success Criterion 1.2.1 """ from ..lib.vtt import VTTCue, VTTParser from ..core.logging import get_logger logger = get_logger(__name__) def generate_descriptive_transcript(captions_vtt: str, ad_vtt: str) -> str: """ Merge captions VTT and audio description VTT into a descriptive transcript. Args: captions_vtt: WebVTT content for captions (speech + non-speech audio) ad_vtt: WebVTT content for audio descriptions (visual descriptions) Returns: Plain text descriptive transcript with (SPEECH) and (DESCRIPTION) sections """ try: caption_cues = VTTParser.parse(captions_vtt) if captions_vtt else [] except Exception as e: logger.warning(f"Failed to parse captions VTT for transcript: {e}") caption_cues = [] try: ad_cues = VTTParser.parse(ad_vtt) if ad_vtt else [] except Exception as e: logger.warning(f"Failed to parse AD VTT for transcript: {e}") ad_cues = [] if not caption_cues and not ad_cues: return "" # Tag each cue with its type tagged: list[tuple[str, VTTCue]] = ( [("speech", cue) for cue in caption_cues] + [("description", cue) for cue in ad_cues] ) # Sort chronologically by start time; descriptions before captions at the same time tagged.sort(key=lambda x: (x[1].start_time, 0 if x[0] == "description" else 1)) # Group consecutive same-type cues together sections: list[tuple[str, list[str]]] = [] current_type: str | None = None current_texts: list[str] = [] for cue_type, cue in tagged: text = cue.text.strip() if not text: continue if cue_type == current_type: current_texts.append(text) else: if current_type is not None and current_texts: sections.append((current_type, current_texts)) current_type = cue_type current_texts = [text] if current_type is not None and current_texts: sections.append((current_type, current_texts)) # Build the output text output_lines: list[str] = [] for section_type, texts in sections: header = "(SPEECH)" if section_type == "speech" else "(DESCRIPTION)" output_lines.append(header) output_lines.append("\n".join(texts)) output_lines.append("") # blank line between sections return "\n".join(output_lines).strip()