#!/usr/bin/env python3
"""
One-off script: Generate a new TTS MP3 for a single AD cue.

Downloads the AD VTT from GCS, shows all cues, lets you pick one,
synthesizes it with a different voice, and saves the MP3 locally.

Usage (from project root on the server):
    PYTHONPATH=backend python scripts/replace_cue_voice.py
"""

import asyncio
import os
import sys

# Add backend to path so we can import app modules
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "backend"))

# Load .env before importing app modules that read settings at import time
from dotenv import load_dotenv
load_dotenv(os.path.join(os.path.dirname(__file__), "..", "backend", ".env"))

from app.core.config import settings
from app.services.gemini_tts import GeminiTTSService
from app.services.gcs import gcs_service
from app.tasks.tts_synthesis import parse_ad_cues


# Voices for quick reference
FEMALE_VOICES = ["Kore", "Leda", "Aoede", "Callirrhoe", "Autonoe", "Erinome",
                 "Laomedeia", "Achernar", "Despina", "Pulcherrima"]
MALE_VOICES = ["Puck", "Charon", "Fenrir", "Orus", "Enceladus", "Iapetus"]

JOB_ID = "1798981-27-littmann-basics-of-auscultation-how-to-video-ms-st-fr-v2fv"
LANGUAGE = "fr"
DEFAULT_VOICE = "Leda"


def format_ts(seconds: float) -> str:
    """Format a duration in seconds as ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds once, up front, and then split
    with integer arithmetic. Formatting ``seconds % 60`` directly with
    ``:06.3f`` could round the seconds field up to ``60.000`` (for example
    59.9996 -> ``00:00:60.000``) without carrying into the minutes field.
    """
    total_ms = round(seconds * 1000)
    hours, remainder_ms = divmod(total_ms, 3_600_000)
    minutes, remainder_ms = divmod(remainder_ms, 60_000)
    secs, millis = divmod(remainder_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"


async def main():
    """Interactively regenerate the audio for one AD cue and save it as an MP3.

    Steps: download ``{JOB_ID}/{LANGUAGE}/ad.vtt`` from the GCS bucket, list
    the parsed cues, prompt for a cue index and a voice name, synthesize the
    cue text, and write the resulting bytes to
    ``cue_<index>_<voice>_<language>.mp3`` in the current working directory.

    Every failure path prints a message and returns early.
    """
    print("=" * 60)
    print(" Generate AD Cue MP3")
    print(f" Job: {JOB_ID}")
    print(f" Language: {LANGUAGE}")
    print("=" * 60)

    # --- Download AD VTT ---
    print("\nDownloading AD VTT from GCS...")
    blob_path = f"{JOB_ID}/{LANGUAGE}/ad.vtt"
    blob = gcs_service.bucket.blob(blob_path)
    if not blob.exists():
        print(f"ERROR: Not found: gs://{settings.gcs_bucket}/{blob_path}")
        return
    vtt_content = blob.download_as_text()

    # --- Show cues ---
    cues = parse_ad_cues(vtt_content)
    if not cues:
        print("No AD cues found.")
        return

    print(f"\n{len(cues)} AD cues:\n")
    for i, cue in enumerate(cues):
        # Keep the listing compact: show at most 90 characters per cue.
        text_preview = cue["text"][:90] + ("..." if len(cue["text"]) > 90 else "")
        print(f" [{i:2d}] {format_ts(cue['start_time'])} → {format_ts(cue['end_time'])}")
        print(f" {text_preview}")

    # --- Select cue ---
    print()
    cue_input = input("Cue index to regenerate: ").strip()
    try:
        cue_index = int(cue_input)
    except ValueError:
        cue_index = -1
    # Explicit range check: plain list indexing would accept negative numbers
    # and silently select a cue counted from the end of the list.
    if not 0 <= cue_index < len(cues):
        print("Invalid index. Exiting.")
        return
    selected = cues[cue_index]

    print(f"\nSelected: [{cue_index}] {selected['text']}")

    # --- Voice ---
    print(f"\nFemale: {', '.join(FEMALE_VOICES)}")
    print(f"Male: {', '.join(MALE_VOICES)}")
    voice = input(f"Voice [{DEFAULT_VOICE}]: ").strip() or DEFAULT_VOICE

    # --- Synthesize ---
    text = selected["text"].strip()
    if not text:
        # Without this guard an empty cue would be sent to TTS as just ".".
        print("ERROR: Selected cue has no text.")
        return
    # Make sure the text ends with sentence punctuation before synthesis.
    if not text.endswith((".", "!", "?")):
        text += "."

    print(f"\nSynthesizing with voice={voice}, language={LANGUAGE}...")
    tts = GeminiTTSService()

    try:
        audio_bytes = await tts.synthesize_text(
            text=text,
            voice_name=voice,
            language=LANGUAGE,
            model="flash",
            speed=1.0,
            style_prompt="",
        )
    except Exception as e:
        # Top-level boundary of a one-off script: report the failure and stop.
        print(f"ERROR: TTS failed: {e}")
        return

    # --- Save ---
    filename = f"cue_{cue_index}_{voice}_{LANGUAGE}.mp3"
    with open(filename, "wb") as f:
        f.write(audio_bytes)

    print(f"\nSaved: {filename} ({len(audio_bytes):,} bytes)")
    print("Done. Download this file and listen to verify.")


if __name__ == "__main__":
    asyncio.run(main())