#!/usr/bin/env python3
"""
One-off script: Generate a new TTS MP3 for a single AD cue.

Downloads the AD VTT from GCS, shows all cues, lets you pick one,
synthesizes it with a different voice, and saves the MP3 locally.

Usage (from project root on the server):
    PYTHONPATH=backend python scripts/replace_cue_voice.py
"""

import asyncio
import os
import sys

# Add backend to path so we can import app modules
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "backend"))

# Load .env before importing app modules that read settings at import time
from dotenv import load_dotenv

load_dotenv(os.path.join(os.path.dirname(__file__), "..", "backend", ".env"))

from app.core.config import settings
from app.services.gemini_tts import GeminiTTSService
from app.services.gcs import gcs_service
from app.tasks.tts_synthesis import parse_ad_cues

# Voices for quick reference
FEMALE_VOICES = [
    "Kore",
    "Leda",
    "Aoede",
    "Callirrhoe",
    "Autonoe",
    "Erinome",
    "Laomedeia",
    "Achernar",
    "Despina",
    "Pulcherrima",
]
MALE_VOICES = ["Puck", "Charon", "Fenrir", "Orus", "Enceladus", "Iapetus"]

JOB_ID = "1798981-27-littmann-basics-of-auscultation-how-to-video-ms-st-fr-v2fv"
LANGUAGE = "fr"
DEFAULT_VOICE = "Leda"


def format_ts(seconds: float) -> str:
    """Format a duration in seconds as ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds *before* it is split into
    fields, so a rounding carry propagates into the seconds/minutes/hours
    fields (59.9996 renders as ``00:01:00.000``, never ``00:00:60.000``).
    """
    total_ms = round(seconds * 1000)
    total_seconds, ms = divmod(total_ms, 1000)
    h, remainder = divmod(total_seconds, 3600)
    m, s = divmod(remainder, 60)
    return f"{h:02d}:{m:02d}:{s:02d}.{ms:03d}"


async def main():
    """Interactively regenerate the audio for one AD cue.

    Downloads ``{JOB_ID}/{LANGUAGE}/ad.vtt`` from the GCS bucket, lists the
    parsed cues, prompts for a cue index and a voice name, synthesizes the
    cue text and writes the result to ``cue_<index>_<voice>_<lang>.mp3`` in
    the current working directory. Every failure path prints a message and
    returns without raising.
    """
    print("=" * 60)
    print(" Generate AD Cue MP3")
    print(f" Job: {JOB_ID}")
    print(f" Language: {LANGUAGE}")
    print("=" * 60)

    # --- Download AD VTT ---
    print("\nDownloading AD VTT from GCS...")
    blob_path = f"{JOB_ID}/{LANGUAGE}/ad.vtt"
    blob = gcs_service.bucket.blob(blob_path)
    if not blob.exists():
        print(f"ERROR: Not found: gs://{settings.gcs_bucket}/{blob_path}")
        return
    vtt_content = blob.download_as_text()

    # --- Show cues ---
    cues = parse_ad_cues(vtt_content)
    if not cues:
        print("No AD cues found.")
        return

    print(f"\n{len(cues)} AD cues:\n")
    for i, cue in enumerate(cues):
        # Keep the listing compact: show at most 90 characters per cue.
        text_preview = cue["text"][:90] + ("..." if len(cue["text"]) > 90 else "")
        print(f" [{i:2d}] {format_ts(cue['start_time'])} → {format_ts(cue['end_time'])}")
        print(f" {text_preview}")

    # --- Select cue ---
    print()
    cue_input = input("Cue index to regenerate: ").strip()
    try:
        cue_index = int(cue_input)
    except ValueError:
        cue_index = -1
    # Explicit range check: a negative number would otherwise be accepted by
    # Python's negative indexing and silently select a cue counted from the
    # end of the list, which does not match the indices shown above.
    if not 0 <= cue_index < len(cues):
        print("Invalid index. Exiting.")
        return
    selected = cues[cue_index]

    print(f"\nSelected: [{cue_index}] {selected['text']}")

    # --- Voice ---
    print(f"\nFemale: {', '.join(FEMALE_VOICES)}")
    print(f"Male: {', '.join(MALE_VOICES)}")
    voice = input(f"Voice [{DEFAULT_VOICE}]: ").strip() or DEFAULT_VOICE

    # --- Synthesize ---
    text = selected["text"].strip()
    if not text:
        # Without this guard a blank cue would be sent to TTS as just ".".
        print("ERROR: Selected cue has no text. Exiting.")
        return
    # Make sure the text ends with sentence-final punctuation.
    if not text.endswith((".", "!", "?")):
        text += "."

    print(f"\nSynthesizing with voice={voice}, language={LANGUAGE}...")
    tts = GeminiTTSService()
    try:
        audio_bytes = await tts.synthesize_text(
            text=text,
            voice_name=voice,
            language=LANGUAGE,
            model="flash",
            speed=1.0,
            style_prompt="",
        )
    except Exception as e:
        # Top-level boundary of a one-off script: report and stop.
        print(f"ERROR: TTS failed: {e}")
        return

    # --- Save ---
    filename = f"cue_{cue_index}_{voice}_{LANGUAGE}.mp3"
    with open(filename, "wb") as f:
        f.write(audio_bytes)

    print(f"\nSaved: {filename} ({len(audio_bytes):,} bytes)")
    print("Done. Download this file and listen to verify.")


if __name__ == "__main__":
    asyncio.run(main())