# video-accessibility/backend/app/services/descriptive_transcript.py

"""
Descriptive Transcript Service

Generates a WCAG-compliant descriptive transcript by merging captions (speech)
and audio descriptions (visuals) into a single chronological plain text document.

Format:
    (SPEECH)
    [MUSIC PLAYING]

    (DESCRIPTION)
    Jennifer runs across the stage.

    (SPEECH)
    In sports, there is no substitute for sweat.

Reference: WCAG 2.1 Success Criterion 1.2.1
"""
from ..lib.vtt import VTTCue, VTTParser
from ..core.logging import get_logger

logger = get_logger(__name__)


def generate_descriptive_transcript(captions_vtt: str, ad_vtt: str) -> str:
    """
    Interleave caption cues and audio-description cues into one transcript.

    Both VTT documents are parsed independently; a document that is empty or
    fails to parse contributes no cues (a parse failure is logged as a
    warning rather than raised). The surviving cues are ordered by start
    time, runs of consecutive cues from the same track are collapsed into a
    single section, and each section is rendered as a header line followed by
    one line per cue.

    Args:
        captions_vtt: WebVTT content for captions (speech + non-speech audio)
        ad_vtt: WebVTT content for audio descriptions (visual descriptions)

    Returns:
        Plain text made of (SPEECH) and (DESCRIPTION) sections separated by a
        blank line, or an empty string when neither input yields any cues.
    """
    # Each track is parsed on its own so one bad document cannot take the
    # other one down with it.
    try:
        if captions_vtt:
            speech_cues = VTTParser.parse(captions_vtt)
        else:
            speech_cues = []
    except Exception as exc:
        logger.warning(f"Failed to parse captions VTT for transcript: {exc}")
        speech_cues = []

    try:
        if ad_vtt:
            visual_cues = VTTParser.parse(ad_vtt)
        else:
            visual_cues = []
    except Exception as exc:
        logger.warning(f"Failed to parse AD VTT for transcript: {exc}")
        visual_cues = []

    if not (speech_cues or visual_cues):
        return ""

    # One combined timeline of (kind, cue) pairs, captions listed first.
    timeline = [("speech", cue) for cue in speech_cues]
    timeline.extend(("description", cue) for cue in visual_cues)

    # Chronological order; when two cues share a start time the description
    # ranks ahead of the caption. The sort is stable, so cues of the same
    # kind and start time keep their parsed order.
    ordered = sorted(
        timeline,
        key=lambda entry: (entry[1].start_time, int(entry[0] != "description")),
    )

    # Collapse runs of same-kind cues: extend the most recent section when the
    # kind repeats, otherwise open a new one. Cues whose text is blank are
    # dropped and do not interrupt a run.
    sections: list[tuple[str, list[str]]] = []
    for kind, cue in ordered:
        body = cue.text.strip()
        if not body:
            continue
        if sections and sections[-1][0] == kind:
            sections[-1][1].append(body)
        else:
            sections.append((kind, [body]))

    # Render every section as "header + one line per cue"; sections are
    # separated by a single blank line with no leading or trailing whitespace.
    headers = {"speech": "(SPEECH)", "description": "(DESCRIPTION)"}
    rendered = ["\n".join([headers[kind], *lines]) for kind, lines in sections]
    return "\n\n".join(rendered)