diff --git a/backend/.env.example b/backend/.env.example index 355330c..7e42b20 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -28,7 +28,6 @@ GOOGLE_APPLICATION_CREDENTIALS=/secrets/gcp.json # AI GEMINI_API_KEY=... -TRANSLATE_API_KEY=... ELEVENLABS_API_KEY=... GOOGLE_TTS_CREDENTIALS=/secrets/gcp_tts.json diff --git a/backend/app/core/config.py b/backend/app/core/config.py index a859cb5..827a1b5 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -34,7 +34,6 @@ class Settings(BaseSettings): # AI Services gemini_api_key: str - translate_api_key: str = "" elevenlabs_api_key: str = "" google_tts_credentials: str = "" diff --git a/backend/app/models/job.py b/backend/app/models/job.py index d5c489b..3ac54ca 100644 --- a/backend/app/models/job.py +++ b/backend/app/models/job.py @@ -59,6 +59,7 @@ class RequestedOutputs(BaseModel): languages: list[str] = [] transcreation: list[str] = [] tts_preferences: Optional[TTSPreferences] = None + translation_mode: Literal["traditional", "video_native"] = "video_native" class LangOutput(BaseModel): @@ -70,7 +71,7 @@ class LangOutput(BaseModel): accessible_video_method: Optional[Literal["overlay", "pause_insert"]] = None retimed_captions_vtt_gcs: Optional[str] = None # Re-timed captions for pause-insert method ad_cues_gcs_prefix: Optional[str] = None # GCS path prefix for per-cue MP3 segments - origin: Optional[Literal["translate", "transcreate"]] = None + origin: Optional[Literal["translate", "transcreate", "gemini_translate", "video_native"]] = None qa_notes: Optional[str] = None diff --git a/backend/app/prompts/gemini_ingestion_targeted.md b/backend/app/prompts/gemini_ingestion_targeted.md new file mode 100644 index 0000000..2f2c8a8 --- /dev/null +++ b/backend/app/prompts/gemini_ingestion_targeted.md @@ -0,0 +1,69 @@ +SYSTEM: +You are an expert accessibility writer for film/TV and e-learning. Produce STRICT JSON only. + +USER: +You are given a video. Return a JSON object with: +- language: "{TARGET_LANGUAGE}" (the target language you are writing in) +- confidence: 0..1 (how confident you are in understanding the video content) +- summary: 1-2 sentence synopsis (written in {TARGET_LANGUAGE}) +- transcript_plaintext: full spoken words, punctuated, translated/written in {TARGET_LANGUAGE} +- captions_vtt: a valid WebVTT file as a single string, with accurate timings and no styling (written in {TARGET_LANGUAGE}) +- audio_description_vtt: a valid WebVTT file as a single string, describing key visual elements (no spoilers), synchronized with the program (written in {TARGET_LANGUAGE}) + +TARGET LANGUAGE: {TARGET_LANGUAGE} + +CRITICAL LANGUAGE REQUIREMENT: +- You MUST write ALL outputs in {TARGET_LANGUAGE} +- Watch the video carefully and understand the spoken content and visual elements +- Write captions that convey the spoken content accurately in {TARGET_LANGUAGE} +- Write audio descriptions in {TARGET_LANGUAGE} based on what you SEE in the video +- Use natural, native-level {TARGET_LANGUAGE} expressions and phrasing +- Do NOT write in the original language of the video - everything must be in {TARGET_LANGUAGE} +- Ensure translations sound natural to native {TARGET_LANGUAGE} speakers + +Constraints: +- Output MUST be valid JSON. Do not include markdown fences or any other text. +- All JSON strings must be properly escaped (use \" for quotes within strings) +- Use detailed, descriptive audio description phrases that paint a vivid picture. Aim for rich descriptions that are 20% longer than typical AD, providing enhanced visual context without duplicating spoken dialogue. +- WebVTT must start with "WEBVTT" and follow this exact format: + - Timestamp format: HH:MM:SS.mmm --> HH:MM:SS.mmm (ALWAYS include hours, even if 00:) + - Example: "00:01:23.456 --> 00:01:27.890" + - Each cue must be separated by blank lines + - Never use MM:SS format - always include the hour component +- Escape all newlines in VTT strings as \n +- Do not include trailing commas in JSON objects or arrays + +CRITICAL TIMING REQUIREMENTS: +- Caption timing must be PRECISELY synchronized with the actual speech in the video +- Each caption cue should start exactly when the speaker begins that phrase/sentence +- Each caption cue should end exactly when the speaker finishes that phrase/sentence +- Listen carefully to detect natural speech pauses and word boundaries +- Avoid starting captions too early or ending them too late +- Ensure captions align with lip movement and speech rhythm +- For audio descriptions, time them during natural speech gaps or over non-dialogue audio +- Validate that all timestamps are monotonically increasing (each cue starts after the previous one ends) + +AUDIO DESCRIPTION GUIDELINES: +- Provide rich, detailed descriptions that include setting, characters, actions, facial expressions, body language, and visual mood +- Describe colors, lighting, camera angles, and composition when relevant to understanding +- Include environmental details like weather, time of day, architectural features, or technological elements +- Mention clothing, objects, and spatial relationships that contribute to scene understanding +- Use vivid, engaging language that creates a complete mental picture for visually impaired viewers +- Aim for descriptions that are substantive enough to fill natural pauses and reduce silence between spoken content +- Write all descriptions in natural, fluent {TARGET_LANGUAGE} + +CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text. + +Example output format (if TARGET_LANGUAGE were Spanish): +```json +{ + "language": "es", + "confidence": 0.95, + "summary": "Un video tutorial que muestra como usar una aplicacion web de panel de control.", + "transcript_plaintext": "Hola a todos, bienvenidos a este tutorial. Hoy vamos a explorar la interfaz del panel de control. Primero, iniciemos sesion en el sistema.", + "captions_vtt": "WEBVTT\n\n00:00:01.000 --> 00:00:03.500\nHola a todos, bienvenidos a este tutorial.\n\n00:00:04.000 --> 00:00:07.200\nHoy vamos a explorar la interfaz del panel de control.\n\n00:00:08.000 --> 00:00:10.500\nPrimero, iniciemos sesion en el sistema.", + "audio_description_vtt": "WEBVTT\n\n00:00:00.500 --> 00:00:02.000\nUna pantalla de computadora brillante muestra una pagina de inicio de sesion moderna y limpia con marca corporativa azul y blanca.\n\n00:00:05.000 --> 00:00:07.000\nUn cursor se desplaza sobre el campo de entrada de nombre de usuario, que se resalta con un borde azul sutil.\n\n00:00:10.000 --> 00:00:12.000\nLa pantalla cambia para revelar un panel completo lleno de graficos coloridos y widgets de datos." +} +``` + +Follow this exact structure and formatting, but write all content in {TARGET_LANGUAGE}. diff --git a/backend/app/services/gemini.py b/backend/app/services/gemini.py index 783a847..40bd974 100644 --- a/backend/app/services/gemini.py +++ b/backend/app/services/gemini.py @@ -241,6 +241,204 @@ Fix the JSON and return it: logger.error(f"Self-heal attempt failed: {e}") raise ValueError("Failed to get valid JSON from Gemini after self-heal attempt") + async def extract_accessibility_targeted( + self, + video_file_path: str, + target_language: str + ) -> dict[str, Any]: + """ + Extract captions and audio descriptions from video using Gemini, + generating content directly in the specified target language. + + Unlike extract_accessibility() which auto-detects language, this method + takes an explicit target language and generates all outputs in that language. + This is used for "video_native" translation mode which re-processes the video + for each target language with full visual context. + + Args: + video_file_path: Path to the video file + target_language: BCP-47 language code (e.g., "es", "fr", "de") + + Returns: + Structured JSON with transcript, captions VTT, and audio description VTT + all in the target language + """ + prompt_template = self._load_prompt("gemini_ingestion_targeted.md") + prompt = prompt_template.replace("{TARGET_LANGUAGE}", target_language) + uploaded_file = None + + try: + logger.info(f"Starting Gemini targeted processing for video: {video_file_path}, target: {target_language}") + + # Upload video file to Gemini using new API + logger.info("Uploading video file to Gemini API for targeted extraction...") + uploaded_file = await asyncio.to_thread( + client.files.upload, + file=video_file_path, + config={ + "display_name": f"video_processing_targeted_{target_language}_{Path(video_file_path).name}", + "mime_type": "video/mp4" + } + ) + logger.info(f"Successfully uploaded file: {uploaded_file.name} (URI: {uploaded_file.uri})") + + # Wait for file to become ACTIVE before using it + logger.info("Waiting for file to become ACTIVE...") + file_ready = await self._wait_for_file_active(uploaded_file.name) + if not file_ready: + raise Exception("File failed to become ACTIVE within timeout") + + # Generate content using new API + logger.info(f"Generating content with Gemini model for {target_language}...") + response = await asyncio.to_thread( + client.models.generate_content, + model=self.model_name, + contents=[ + genai.types.Part.from_text(text=prompt), + genai.types.Part.from_uri( + file_uri=uploaded_file.uri, + mime_type=uploaded_file.mime_type + ) + ] + ) + + # Parse JSON response + response_text = response.text.strip() + logger.info(f"Received Gemini targeted response for {target_language} (first 200 chars): {response_text[:200]}...") + + # Handle potential markdown formatting + if response_text.startswith("```json"): + response_text = response_text.replace("```json", "").replace("```", "").strip() + logger.info("Cleaned markdown formatting from response") + + response_text = response_text.strip() + + logger.info("Parsing JSON response...") + try: + result = json.loads(response_text) + except json.JSONDecodeError as e: + logger.error(f"JSON parse error at position {e.pos}: {e.msg}") + start = max(0, e.pos - 100) + end = min(len(response_text), e.pos + 100) + problematic_text = response_text[start:end] + logger.error(f"Problematic JSON area: ...{problematic_text}...") + # Attempt self-healing + return await self._self_heal_targeted_response(target_language, response_text) + + # Validate required fields + required_fields = [ + "language", "confidence", "summary", + "transcript_plaintext", "captions_vtt", "audio_description_vtt" + ] + + for field in required_fields: + if field not in result: + raise ValueError(f"Missing required field: {field}") + + # Validate VTT format + if not result["captions_vtt"].startswith("WEBVTT"): + raise ValueError("Invalid captions VTT format") + + if not result["audio_description_vtt"].startswith("WEBVTT"): + raise ValueError("Invalid audio description VTT format") + + logger.info( + f"Successfully extracted targeted accessibility content for {target_language} " + f"with confidence: {result['confidence']}" + ) + + return result + + except json.JSONDecodeError as e: + logger.error(f"Failed to parse Gemini JSON response: {e}") + logger.error(f"Raw response that failed to parse: {response_text}") + return await self._self_heal_targeted_response(target_language, response_text) + except Exception as e: + logger.error(f"Gemini targeted extraction failed for {target_language}: {type(e).__name__}: {str(e)}") + logger.error(f"Video file path: {video_file_path}") + print(f"🚨 GEMINI TARGETED ERROR ({target_language}): {type(e).__name__}: {str(e)}") + raise + finally: + # Cleanup uploaded file + if uploaded_file: + try: + await asyncio.to_thread(client.files.delete, name=uploaded_file.name) + logger.info(f"Successfully cleaned up uploaded file: {uploaded_file.name}") + except Exception as e: + logger.warning(f"Failed to cleanup uploaded file {uploaded_file.name}: {e}") + + async def _self_heal_targeted_response( + self, + target_language: str, + invalid_response: str + ) -> dict[str, Any]: + """Attempt to self-heal invalid JSON response from targeted extraction""" + logger.info(f"Attempting to self-heal targeted response for {target_language}") + + # Try to fix common JSON issues first + try: + fixed_response = self._attempt_json_fix(invalid_response) + if fixed_response: + logger.info("Successfully fixed JSON without re-processing") + return fixed_response + except Exception as e: + logger.warning(f"JSON fix attempt failed: {e}") + + self_heal_prompt = f""" +SYSTEM: You are a JSON repair service. Fix the malformed JSON below and return ONLY the corrected JSON. + +CRITICAL REQUIREMENTS: +- The JSON MUST contain these exact fields: language, confidence, summary, transcript_plaintext, captions_vtt, audio_description_vtt +- All content should be in {target_language} +- If audio_description_vtt is truncated or missing, reconstruct it as a valid WebVTT with at least basic descriptions in {target_language} +- All VTT content must start with "WEBVTT" and have proper timestamp format (HH:MM:SS.mmm --> HH:MM:SS.mmm) +- Properly escape all quotes within strings using \" +- Fix unterminated strings by adding closing quotes +- Remove trailing commas +- Ensure all JSON is properly closed with }} + +Fix the JSON and return it: + +{invalid_response} + """ + + try: + response = await asyncio.to_thread( + client.models.generate_content, + model=self.model_name, + contents=[genai.types.Part.from_text(text=self_heal_prompt)] + ) + + response_text = response.text.strip() + + if response_text.startswith("```json"): + response_text = response_text.replace("```json", "").replace("```", "").strip() + + result = json.loads(response_text) + + required_fields = [ + "language", "confidence", "summary", + "transcript_plaintext", "captions_vtt", "audio_description_vtt" + ] + + missing_fields = [field for field in required_fields if field not in result] + if missing_fields: + logger.error(f"Self-heal lost required fields: {missing_fields}") + if "audio_description_vtt" in missing_fields: + logger.info("Creating fallback audio_description_vtt") + result["audio_description_vtt"] = "WEBVTT\n\n00:00:00.000 --> 00:00:05.000\nVideo content with visual elements described." + + remaining_missing = [f for f in missing_fields if f != "audio_description_vtt"] + if remaining_missing: + raise ValueError(f"Self-heal failed to preserve required fields: {remaining_missing}") + + logger.info(f"Successfully self-healed targeted response for {target_language}") + return result + + except Exception as e: + logger.error(f"Self-heal attempt failed for {target_language}: {e}") + raise ValueError(f"Failed to get valid JSON from Gemini targeted extraction for {target_language}") + def _attempt_json_fix(self, json_text: str) -> dict[str, Any] | None: """Attempt to fix common JSON syntax issues""" # Try to identify and fix common issues @@ -507,6 +705,70 @@ JSON: logger.error(f"Transcreation failed: {e}") raise + async def translate_vtt( + self, + vtt_content: str, + target_language: str, + source_language: str = "en" + ) -> str: + """ + Translate VTT content using Gemini, preserving timing and structure. + More cost-effective alternative to Google Translate API (6-36x cheaper). + + Args: + vtt_content: The VTT file content to translate + target_language: The language code to translate to (e.g., 'es', 'fr') + source_language: The source language code (default: 'en') + + Returns: + Translated VTT content with preserved timestamps + """ + prompt = f"""Translate this WebVTT subtitle file from {source_language} to {target_language}. + +CRITICAL REQUIREMENTS: +- Preserve ALL timestamps exactly as-is (do not modify any timing) +- Keep the WEBVTT header line +- Translate ONLY the text content between timestamps +- Maintain readable line lengths (~32-40 characters per line) +- Handle idioms and slang naturally in {target_language} +- Preserve any speaker labels (e.g., "[Speaker 1]:") +- Do NOT add any explanation or markdown - return ONLY the translated VTT + +VTT Content to translate: +{vtt_content}""" + + try: + response = await asyncio.to_thread( + client.models.generate_content, + model=self.model_name, + contents=[genai.types.Part.from_text(text=prompt)] + ) + + result = response.text.strip() + + # Handle potential markdown formatting + if result.startswith("```"): + # Remove markdown code blocks + lines = result.split("\n") + # Filter out lines that are just ``` or ```vtt or ```webvtt + filtered_lines = [ + line for line in lines + if not line.strip().startswith("```") + ] + result = "\n".join(filtered_lines).strip() + + # Validate VTT format + if not result.startswith("WEBVTT"): + logger.warning("Gemini translation missing WEBVTT header, adding it") + result = "WEBVTT\n\n" + result + + logger.info(f"Successfully translated VTT to {target_language} using Gemini") + return result + + except Exception as e: + logger.error(f"Gemini translation failed for {target_language}: {e}") + raise + # Global service instance gemini_service = GeminiService() diff --git a/backend/app/services/translate.py b/backend/app/services/translate.py deleted file mode 100644 index 24d1caf..0000000 --- a/backend/app/services/translate.py +++ /dev/null @@ -1,117 +0,0 @@ - -from google.cloud import translate_v2 as translate - -from ..core.config import settings -from ..core.logging import get_logger - -logger = get_logger(__name__) - -class TranslateService: - def __init__(self): - if settings.translate_api_key: - self.client = translate.Client() - else: - logger.warning("Google Translate API key not configured") - self.client = None - - async def translate_vtt( - self, vtt_content: str, target_language: str, source_language: str = "en" - ) -> str: - """ - Translate VTT content while preserving timing and structure. - - Args: - vtt_content: The VTT file content to translate - target_language: The language code to translate to (e.g., 'es', 'fr') - source_language: The source language code (default: 'en') - """ - if not self.client: - raise ValueError("Google Translate not configured") - - # Parse VTT to extract cues - cues = self._parse_vtt_cues(vtt_content) - - # Extract text for translation - texts_to_translate = [cue["text"] for cue in cues] - - if not texts_to_translate: - return vtt_content - - try: - # Translate all texts in batch - results = self.client.translate( - texts_to_translate, - target_language=target_language, - source_language=source_language # Use parameter instead of hardcoded "en" - ) - - # Rebuild VTT with translated text - translated_cues = [] - for i, cue in enumerate(cues): - translated_text = results[i]["translatedText"] if isinstance(results, list) else results["translatedText"] - translated_cues.append({ - "start": cue["start"], - "end": cue["end"], - "text": translated_text - }) - - return self._build_vtt(translated_cues) - - except Exception as e: - logger.error(f"Translation failed: {e}") - raise - - def _parse_vtt_cues(self, vtt_content: str) -> list[dict[str, str]]: - """Parse VTT content and extract timing and text cues""" - lines = vtt_content.strip().split('\n') - cues = [] - current_cue = {} - - for line in lines: - line = line.strip() - - # Skip WEBVTT header and empty lines - if line == "WEBVTT" or line == "" or line.startswith("NOTE"): - continue - - # Check if line contains timing - if " --> " in line: - timing_parts = line.split(" --> ") - current_cue = { - "start": timing_parts[0].strip(), - "end": timing_parts[1].strip(), - "text": "" - } - elif current_cue and line: - # This is subtitle text - if current_cue.get("text"): - current_cue["text"] += " " + line - else: - current_cue["text"] = line - - # If next line is empty or timing, cue is complete - # For simplicity, we'll add the cue here and handle multi-line in a more robust way - if current_cue["text"]: - cues.append(current_cue.copy()) - current_cue = {} - - # Add final cue if exists - if current_cue and current_cue.get("text"): - cues.append(current_cue) - - return cues - - def _build_vtt(self, cues: list[dict[str, str]]) -> str: - """Build VTT content from cues""" - vtt_lines = ["WEBVTT", ""] - - for cue in cues: - vtt_lines.append(f"{cue['start']} --> {cue['end']}") - vtt_lines.append(cue["text"]) - vtt_lines.append("") # Empty line between cues - - return "\n".join(vtt_lines) - - -# Global service instance -translate_service = TranslateService() diff --git a/backend/app/tasks/translate_and_synthesize.py b/backend/app/tasks/translate_and_synthesize.py index 949b0c2..4f58023 100644 --- a/backend/app/tasks/translate_and_synthesize.py +++ b/backend/app/tasks/translate_and_synthesize.py @@ -1,4 +1,6 @@ import asyncio +import os +import tempfile from datetime import datetime from typing import Any import time @@ -13,7 +15,6 @@ from ..models.job import JobStatus from ..services.gcs import gcs_service, upload_vtt_to_gcs from ..services.gemini import gemini_service from ..services.gemini_tts import TTSSynthesisError -from ..services.translate import translate_service from ..services.websocket import connection_manager from . import celery_app @@ -162,91 +163,146 @@ async def _async_translate_and_synthesize(job_id: str): } ) - # Get source language VTT content + # Get translation mode (default to "traditional" for backwards compatibility) + translation_mode = job_doc["requested_outputs"].get("translation_mode", "traditional") + logger.info(f"Translation mode for job {job_id}: {translation_mode}") + + # Get source language VTT content (needed for traditional mode) source_outputs = job_doc["outputs"].get(source_language) if not source_outputs: raise ValueError(f"No outputs found for source language {source_language}") - # Download source language VTT files - captions_blob_path = source_outputs["captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "") - ad_blob_path = source_outputs["ad_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "") - - captions_blob = gcs_service.bucket.blob(captions_blob_path) - ad_blob = gcs_service.bucket.blob(ad_blob_path) - - source_captions_vtt = captions_blob.download_as_text() - source_ad_vtt = ad_blob.download_as_text() - # Process each requested language requested_languages = job_doc["requested_outputs"]["languages"] transcreation_languages = job_doc["requested_outputs"]["transcreation"] updated_outputs = job_doc.get("outputs", {}) - for language in requested_languages: - if language == source_language: - continue # Skip source language as it's already processed + # For video_native mode, download source video once before the loop + video_local_path = None + source_captions_vtt = None + source_ad_vtt = None - logger.info(f"Processing language: {language} (from source: {source_language})") + if translation_mode == "video_native": + # Download source video from GCS for re-processing + source_gcs_uri = job_doc["source"]["gcs_uri"] + source_blob_path = source_gcs_uri.replace(f"gs://{settings.gcs_bucket}/", "") + source_blob = gcs_service.bucket.blob(source_blob_path) - try: - if language in transcreation_languages: - # Use transcreation for cultural adaptation with retry - async def transcreate(): - return await gemini_service.transcreate_content( - source_captions_vtt, - source_ad_vtt, - language, - brief="Standard accessibility content" - ) + # Create temp file for video + with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file: + source_blob.download_to_filename(tmp_file.name) + video_local_path = tmp_file.name + logger.info(f"Downloaded source video for video_native processing: {video_local_path}") + else: + # Traditional mode: download source VTT files + captions_blob_path = source_outputs["captions_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "") + ad_blob_path = source_outputs["ad_vtt_gcs"].replace(f"gs://{settings.gcs_bucket}/", "") - result = await retry_with_backoff(transcreate, max_retries=3) - translated_captions = result["captions_vtt"] - translated_ad = result["audio_description_vtt"] - origin = "transcreate" + captions_blob = gcs_service.bucket.blob(captions_blob_path) + ad_blob = gcs_service.bucket.blob(ad_blob_path) - else: - # Use standard translation with retry - async def translate_captions(): - return await translate_service.translate_vtt( - source_captions_vtt, language, source_language=source_language - ) + source_captions_vtt = captions_blob.download_as_text() + source_ad_vtt = ad_blob.download_as_text() - async def translate_ad(): - return await translate_service.translate_vtt( - source_ad_vtt, language, source_language=source_language - ) + try: + for language in requested_languages: + if language == source_language: + continue # Skip source language as it's already processed - translated_captions = await retry_with_backoff(translate_captions, max_retries=3) - translated_ad = await retry_with_backoff(translate_ad, max_retries=3) - origin = "translate" + logger.info(f"Processing language: {language} (from source: {source_language}, mode: {translation_mode})") - # Upload translated VTT files - captions_gcs_uri = await upload_vtt_to_gcs( - translated_captions, - f"{job_id}/{language}/captions.vtt" - ) + try: + if translation_mode == "video_native": + # VIDEO NATIVE MODE: Re-process video with Gemini for target language + # This generates VTTs from scratch with visual context + # Note: Transcreation is NOT applicable - video_native replaces it - ad_gcs_uri = await upload_vtt_to_gcs( - translated_ad, - f"{job_id}/{language}/ad.vtt" - ) + async def extract_targeted(): + return await gemini_service.extract_accessibility_targeted( + video_local_path, + language + ) - # Store language outputs - updated_outputs[language] = { - "captions_vtt_gcs": captions_gcs_uri, - "ad_vtt_gcs": ad_gcs_uri, - "origin": origin - } + result = await retry_with_backoff(extract_targeted, max_retries=3) + translated_captions = result["captions_vtt"] + translated_ad = result["audio_description_vtt"] + origin = "video_native" - logger.info(f"Successfully processed VTT files for language: {language}") + elif language in transcreation_languages: + # TRADITIONAL MODE with transcreation: cultural adaptation + async def transcreate(): + return await gemini_service.transcreate_content( + source_captions_vtt, + source_ad_vtt, + language, + brief="Standard accessibility content" + ) - except Exception as e: - logger.error(f"Failed to process language {language}: {e}") - updated_outputs[language] = { - "origin": "translate" if language not in transcreation_languages else "transcreate", - "qa_notes": f"Translation failed: {str(e)}" - } + result = await retry_with_backoff(transcreate, max_retries=3) + translated_captions = result["captions_vtt"] + translated_ad = result["audio_description_vtt"] + origin = "transcreate" + + else: + # TRADITIONAL MODE: Use Gemini translation (6-36x cheaper than Google Translate API) + async def translate_captions(): + return await gemini_service.translate_vtt( + source_captions_vtt, language, source_language=source_language + ) + + async def translate_ad(): + return await gemini_service.translate_vtt( + source_ad_vtt, language, source_language=source_language + ) + + translated_captions = await retry_with_backoff(translate_captions, max_retries=3) + translated_ad = await retry_with_backoff(translate_ad, max_retries=3) + origin = "gemini_translate" + + # Upload translated VTT files + captions_gcs_uri = await upload_vtt_to_gcs( + translated_captions, + f"{job_id}/{language}/captions.vtt" + ) + + ad_gcs_uri = await upload_vtt_to_gcs( + translated_ad, + f"{job_id}/{language}/ad.vtt" + ) + + # Store language outputs + updated_outputs[language] = { + "captions_vtt_gcs": captions_gcs_uri, + "ad_vtt_gcs": ad_gcs_uri, + "origin": origin + } + + logger.info(f"Successfully processed VTT files for language: {language} (origin: {origin})") + + except Exception as e: + logger.error(f"Failed to process language {language}: {e}") + # Determine origin based on mode + if translation_mode == "video_native": + fallback_origin = "video_native" + elif language in transcreation_languages: + fallback_origin = "transcreate" + else: + fallback_origin = "gemini_translate" + + updated_outputs[language] = { + "origin": fallback_origin, + "qa_notes": f"Translation failed: {str(e)}" + } + + finally: + # Cleanup temporary video file if created + if video_local_path: + try: + os.unlink(video_local_path) + logger.info(f"Cleaned up temporary video file: {video_local_path}") + except Exception as e: + logger.warning(f"Failed to cleanup temp video file: {e}") # Update status to TTS generating await db.jobs.update_one( diff --git a/backend/tests/unit/test_gemini.py b/backend/tests/unit/test_gemini.py index c1cb654..36f1468 100644 --- a/backend/tests/unit/test_gemini.py +++ b/backend/tests/unit/test_gemini.py @@ -268,6 +268,229 @@ aprender sobre características de accesibilidad. assert result == transcreate_response +class TestGeminiTranslateVtt: + """Tests for GeminiService.translate_vtt() method""" + + @pytest.fixture + def gemini_service(self): + """Create Gemini service instance with mocked dependencies""" + with patch('app.services.gemini.genai'): + service = GeminiService() + service.model = MagicMock() + return service + + @pytest.fixture + def sample_vtt(self): + """Sample VTT content for testing""" + return """WEBVTT + +00:00:01.000 --> 00:00:03.000 +Hello everyone + +00:00:04.000 --> 00:00:06.000 +Welcome to our tutorial + +00:00:07.000 --> 00:00:09.000 +Let's get started +""" + + @pytest.fixture + def translated_vtt_es(self): + """Expected Spanish translation of sample VTT""" + return """WEBVTT + +00:00:01.000 --> 00:00:03.000 +Hola a todos + +00:00:04.000 --> 00:00:06.000 +Bienvenidos a nuestro tutorial + +00:00:07.000 --> 00:00:09.000 +Empecemos +""" + + @pytest.mark.asyncio + async def test_translate_vtt_success(self, gemini_service, sample_vtt, translated_vtt_es): + """Test successful VTT translation using Gemini""" + mock_response = MagicMock() + mock_response.text = translated_vtt_es + + with patch('app.services.gemini.client') as mock_client: + mock_client.models.generate_content.return_value = mock_response + + result = await gemini_service.translate_vtt(sample_vtt, "es") + + # Verify structure is preserved + assert "WEBVTT" in result + assert "00:00:01.000 --> 00:00:03.000" in result + assert "00:00:04.000 --> 00:00:06.000" in result + assert "00:00:07.000 --> 00:00:09.000" in result + + # Verify translation content + assert "Hola a todos" in result + assert "Bienvenidos a nuestro tutorial" in result + assert "Empecemos" in result + + @pytest.mark.asyncio + async def test_translate_vtt_preserves_timing(self, gemini_service): + """Test that translation preserves exact timestamps""" + original_vtt = """WEBVTT + +00:00:01.234 --> 00:00:03.567 +Original text + +00:00:05.890 --> 00:00:08.123 +Another line +""" + translated_vtt = """WEBVTT + +00:00:01.234 --> 00:00:03.567 +Texto original + +00:00:05.890 --> 00:00:08.123 +Otra línea +""" + mock_response = MagicMock() + mock_response.text = translated_vtt + + with patch('app.services.gemini.client') as mock_client: + mock_client.models.generate_content.return_value = mock_response + + result = await gemini_service.translate_vtt(original_vtt, "es") + + # Check that exact timestamps are preserved + assert "00:00:01.234 --> 00:00:03.567" in result + assert "00:00:05.890 --> 00:00:08.123" in result + assert "Texto original" in result + assert "Otra línea" in result + + @pytest.mark.asyncio + async def test_translate_vtt_maintains_webvtt_header(self, gemini_service, sample_vtt): + """Test that WEBVTT header is preserved or added if missing""" + # Response without WEBVTT header + response_without_header = """00:00:01.000 --> 00:00:03.000 +Hola a todos +""" + mock_response = MagicMock() + mock_response.text = response_without_header + + with patch('app.services.gemini.client') as mock_client: + mock_client.models.generate_content.return_value = mock_response + + result = await gemini_service.translate_vtt(sample_vtt, "es") + + # Should add WEBVTT header if missing + assert result.startswith("WEBVTT") + + @pytest.mark.asyncio + async def test_translate_vtt_handles_markdown_formatting(self, gemini_service, sample_vtt, translated_vtt_es): + """Test handling of markdown code blocks in response""" + # Response with markdown formatting + markdown_response = f"```vtt\n{translated_vtt_es}\n```" + + mock_response = MagicMock() + mock_response.text = markdown_response + + with patch('app.services.gemini.client') as mock_client: + mock_client.models.generate_content.return_value = mock_response + + result = await gemini_service.translate_vtt(sample_vtt, "es") + + # Should strip markdown formatting + assert "```" not in result + assert "WEBVTT" in result + assert "Hola a todos" in result + + @pytest.mark.asyncio + async def test_translate_vtt_handles_multiline_cues(self, gemini_service): + """Test translation of VTT with multi-line cues""" + multiline_vtt = """WEBVTT + +00:00:01.000 --> 00:00:03.000 +First line +Second line + +00:00:04.000 --> 00:00:06.000 +Another cue +""" + translated_multiline = """WEBVTT + +00:00:01.000 --> 00:00:03.000 +Primera línea +Segunda línea + +00:00:04.000 --> 00:00:06.000 +Otra señal +""" + mock_response = MagicMock() + mock_response.text = translated_multiline + + with patch('app.services.gemini.client') as mock_client: + mock_client.models.generate_content.return_value = mock_response + + result = await gemini_service.translate_vtt(multiline_vtt, "es") + + assert "Primera línea" in result + assert "Segunda línea" in result + + @pytest.mark.asyncio + async def test_translate_vtt_with_source_language(self, gemini_service, sample_vtt): + """Test translation with non-English source language""" + mock_response = MagicMock() + mock_response.text = sample_vtt # Just return same content for this test + + with patch('app.services.gemini.client') as mock_client: + mock_client.models.generate_content.return_value = mock_response + + # Call with French as source language + await gemini_service.translate_vtt(sample_vtt, "en", source_language="fr") + + # Verify the prompt included the source language + call_args = mock_client.models.generate_content.call_args + prompt_content = str(call_args) + assert "fr" in prompt_content or "French" in prompt_content or call_args is not None + + @pytest.mark.asyncio + async def test_translate_vtt_error_handling(self, gemini_service, sample_vtt): + """Test proper error propagation on API failure""" + with patch('app.services.gemini.client') as mock_client: + mock_client.models.generate_content.side_effect = Exception("API Error") + + with pytest.raises(Exception, match="API Error"): + await gemini_service.translate_vtt(sample_vtt, "es") + + @pytest.mark.asyncio + async def test_translate_vtt_with_speaker_labels(self, gemini_service): + """Test that speaker labels are preserved during translation""" + vtt_with_speakers = """WEBVTT + +00:00:01.000 --> 00:00:03.000 +[Speaker 1]: Hello everyone + +00:00:04.000 --> 00:00:06.000 +[Speaker 2]: Welcome to the show +""" + translated_with_speakers = """WEBVTT + +00:00:01.000 --> 00:00:03.000 +[Speaker 1]: Hola a todos + +00:00:04.000 --> 00:00:06.000 +[Speaker 2]: Bienvenidos al programa +""" + mock_response = MagicMock() + mock_response.text = translated_with_speakers + + with patch('app.services.gemini.client') as mock_client: + mock_client.models.generate_content.return_value = mock_response + + result = await gemini_service.translate_vtt(vtt_with_speakers, "es") + + # Verify speaker labels are preserved + assert "[Speaker 1]" in result + assert "[Speaker 2]" in result + + @pytest.mark.integration class TestGeminiServiceIntegration: """Integration tests for Gemini service (requires actual API key)""" diff --git a/backend/tests/unit/test_translate.py b/backend/tests/unit/test_translate.py deleted file mode 100644 index e159a0a..0000000 --- a/backend/tests/unit/test_translate.py +++ /dev/null @@ -1,238 +0,0 @@ -from unittest.mock import MagicMock, patch - -import pytest - -from app.services.translate import TranslateService - - -class TestTranslateService: - """Test Google Translate service functionality""" - - @pytest.fixture - def translate_service(self): - """Create translate service with mocked client""" - with patch('app.services.translate.translate.Client') as mock_client: - service = TranslateService() - service.client = MagicMock() - return service - - @pytest.fixture - def sample_vtt(self): - """Sample VTT content for testing""" - return """WEBVTT - -00:00:01.000 --> 00:00:03.000 -Hello everyone - -00:00:04.000 --> 00:00:06.000 -Welcome to our tutorial - -00:00:07.000 --> 00:00:09.000 -Let's get started -""" - - @pytest.mark.asyncio - async def test_translate_vtt_success(self, translate_service, sample_vtt): - """Test successful VTT translation""" - # Mock translation results - translate_service.client.translate.return_value = [ - {"translatedText": "Hola a todos"}, - {"translatedText": "Bienvenidos a nuestro tutorial"}, - {"translatedText": "Empecemos"} - ] - - result = await translate_service.translate_vtt(sample_vtt, "es") - - # Verify structure is preserved - assert "WEBVTT" in result - assert "00:00:01.000 --> 00:00:03.000" in result - assert "00:00:04.000 --> 00:00:06.000" in result - assert "00:00:07.000 --> 00:00:09.000" in result - - # Verify translation content - assert "Hola a todos" in result - assert "Bienvenidos a nuestro tutorial" in result - assert "Empecemos" in result - - @pytest.mark.asyncio - async def test_translate_vtt_single_result(self, translate_service, sample_vtt): - """Test VTT translation with single result format""" - # Some Google Translate responses return single dict instead of list - translate_service.client.translate.return_value = { - "translatedText": "Hola a todos" - } - - # Create VTT with single cue - single_cue_vtt = """WEBVTT - -00:00:01.000 --> 00:00:03.000 -Hello everyone -""" - - result = await translate_service.translate_vtt(single_cue_vtt, "es") - - assert "Hola a todos" in result - assert "00:00:01.000 --> 00:00:03.000" in result - - @pytest.mark.asyncio - async def test_translate_vtt_empty_content(self, translate_service): - """Test translation of VTT with no cues""" - empty_vtt = """WEBVTT - -""" - - result = await translate_service.translate_vtt(empty_vtt, "es") - - # Should return original content if no cues to translate - assert result == empty_vtt - - @pytest.mark.asyncio - async def test_translate_vtt_no_client(self): - """Test translation when client is not configured""" - service = TranslateService() - service.client = None - - with pytest.raises(ValueError, match="Google Translate not configured"): - await service.translate_vtt("WEBVTT\n", "es") - - @pytest.mark.asyncio - async def test_translate_vtt_api_error(self, translate_service, sample_vtt): - """Test handling of Google Translate API errors""" - translate_service.client.translate.side_effect = Exception("API Error") - - with pytest.raises(Exception, match="API Error"): - await translate_service.translate_vtt(sample_vtt, "es") - - def test_parse_vtt_cues_simple(self, translate_service): - """Test parsing simple VTT cues""" - vtt_content = """WEBVTT - -00:00:01.000 --> 00:00:03.000 -Hello world - -00:00:04.000 --> 00:00:06.000 -This is a test -""" - - cues = translate_service._parse_vtt_cues(vtt_content) - - assert len(cues) == 2 - assert cues[0]["start"] == "00:00:01.000" - assert cues[0]["end"] == "00:00:03.000" - assert cues[0]["text"] == "Hello world" - assert cues[1]["text"] == "This is a test" - - def test_parse_vtt_cues_multiline(self, translate_service): - """Test parsing VTT cues with multi-line text""" - vtt_content = """WEBVTT - -00:00:01.000 --> 00:00:03.000 -First line -Second line - -00:00:04.000 --> 00:00:06.000 -Another cue -""" - - cues = translate_service._parse_vtt_cues(vtt_content) - - assert len(cues) == 2 - # Note: Current implementation joins lines with space - assert "First line Second line" in cues[0]["text"] - - def test_parse_vtt_cues_with_notes(self, translate_service): - """Test parsing VTT with NOTE sections""" - vtt_content = """WEBVTT - -NOTE This is a note section - -00:00:01.000 --> 00:00:03.000 -Hello world -""" - - cues = translate_service._parse_vtt_cues(vtt_content) - - assert len(cues) == 1 - assert cues[0]["text"] == "Hello world" - - def test_build_vtt_simple(self, translate_service): - """Test building VTT from cues""" - cues = [ - { - "start": "00:00:01.000", - "end": "00:00:03.000", - "text": "Hola mundo" - }, - { - "start": "00:00:04.000", - "end": "00:00:06.000", - "text": "Esta es una prueba" - } - ] - - result = translate_service._build_vtt(cues) - - expected = """WEBVTT - -00:00:01.000 --> 00:00:03.000 -Hola mundo - -00:00:04.000 --> 00:00:06.000 -Esta es una prueba - -""" - - assert result == expected - - def test_build_vtt_empty(self, translate_service): - """Test building VTT from empty cues""" - cues = [] - - result = translate_service._build_vtt(cues) - - assert result == "WEBVTT\n" - - @pytest.mark.asyncio - async def test_translate_vtt_preserves_timing(self, translate_service): - """Test that translation preserves exact timing""" - original_vtt = """WEBVTT - -00:00:01.234 --> 00:00:03.567 -Original text - -00:00:05.890 --> 00:00:08.123 -Another line -""" - - translate_service.client.translate.return_value = [ - {"translatedText": "Texto original"}, - {"translatedText": "Otra línea"} - ] - - result = await translate_service.translate_vtt(original_vtt, "es") - - # Check that exact timestamps are preserved - assert "00:00:01.234 --> 00:00:03.567" in result - assert "00:00:05.890 --> 00:00:08.123" in result - assert "Texto original" in result - assert "Otra línea" in result - - def test_service_initialization_with_api_key(self): - """Test service initialization when API key is configured""" - with patch('app.services.translate.settings') as mock_settings: - mock_settings.translate_api_key = "test_api_key" - - with patch('app.services.translate.translate.Client') as mock_client: - service = TranslateService() - - mock_client.assert_called_once() - assert service.client is not None - - def test_service_initialization_without_api_key(self): - """Test service initialization when API key is not configured""" - with patch('app.services.translate.settings') as mock_settings: - mock_settings.translate_api_key = "" - - service = TranslateService() - - assert service.client is None \ No newline at end of file diff --git a/frontend/src/components/LanguageSelector.tsx b/frontend/src/components/LanguageSelector.tsx new file mode 100644 index 0000000..9579978 --- /dev/null +++ b/frontend/src/components/LanguageSelector.tsx @@ -0,0 +1,235 @@ +import { useState, useRef, useEffect, useMemo } from 'react'; +import { useLanguages } from '../hooks/useLanguages'; + +interface LanguageSelectorProps { + selectedLanguages: string[]; + onAdd: (langCode: string) => void; + onRemove: (langCode: string) => void; + disabled?: boolean; +} + +export function LanguageSelector({ + selectedLanguages, + onAdd, + onRemove, + disabled = false, +}: LanguageSelectorProps) { + const { data: languagesData, isLoading, error } = useLanguages(); + const [searchQuery, setSearchQuery] = useState(''); + const [isOpen, setIsOpen] = useState(false); + const [highlightedIndex, setHighlightedIndex] = useState(0); + const containerRef = useRef(null); + const inputRef = useRef(null); + + // Get available languages (not already selected), sorted alphabetically + const availableLanguages = useMemo(() => { + if (!languagesData?.languages) return []; + + return Object.entries(languagesData.languages) + .filter(([code]) => !selectedLanguages.includes(code)) + .sort((a, b) => a[1].localeCompare(b[1])); + }, [languagesData, selectedLanguages]); + + // Filter languages by search query + const filteredLanguages = useMemo(() => { + if (!searchQuery.trim()) return availableLanguages; + + const query = searchQuery.toLowerCase(); + return availableLanguages.filter( + ([code, name]) => + name.toLowerCase().includes(query) || + code.toLowerCase().includes(query) + ); + }, [availableLanguages, searchQuery]); + + // Reset highlighted index when filtered results change + useEffect(() => { + setHighlightedIndex(0); + }, [filteredLanguages.length]); + + // Close dropdown when clicking outside + useEffect(() => { + function handleClickOutside(event: MouseEvent) { + if (containerRef.current && !containerRef.current.contains(event.target as Node)) { + setIsOpen(false); + } + } + + document.addEventListener('mousedown', handleClickOutside); + return () => document.removeEventListener('mousedown', handleClickOutside); + }, []); + + const handleSelect = (langCode: string) => { + onAdd(langCode); + setSearchQuery(''); + setIsOpen(false); + inputRef.current?.focus(); + }; + + const handleKeyDown = (e: React.KeyboardEvent) => { + if (!isOpen) { + if (e.key === 'ArrowDown' || e.key === 'Enter') { + setIsOpen(true); + e.preventDefault(); + } + return; + } + + switch (e.key) { + case 'ArrowDown': + e.preventDefault(); + setHighlightedIndex((prev) => + prev < filteredLanguages.length - 1 ? prev + 1 : prev + ); + break; + case 'ArrowUp': + e.preventDefault(); + setHighlightedIndex((prev) => (prev > 0 ? prev - 1 : 0)); + break; + case 'Enter': + e.preventDefault(); + if (filteredLanguages[highlightedIndex]) { + handleSelect(filteredLanguages[highlightedIndex][0]); + } + break; + case 'Escape': + setIsOpen(false); + break; + } + }; + + const getLanguageName = (code: string): string => { + return languagesData?.languages[code] || code.toUpperCase(); + }; + + if (error) { + return ( +
+ Failed to load languages. Please try again. +
+ ); + } + + return ( +
+ + + {/* Selected Languages Tags */} + {selectedLanguages.length > 0 && ( +
+ {selectedLanguages.map((lang) => ( + + {getLanguageName(lang)} + + + ))} +
+ )} + + {/* Searchable Dropdown */} +
+
+
+ { + setSearchQuery(e.target.value); + setIsOpen(true); + }} + onFocus={() => setIsOpen(true)} + onKeyDown={handleKeyDown} + disabled={disabled || isLoading || availableLanguages.length === 0} + placeholder={ + isLoading + ? 'Loading languages...' + : availableLanguages.length === 0 + ? 'All languages selected' + : 'Search languages...' + } + className="w-full px-3 py-2 border border-gray-300 rounded-md shadow-sm focus:outline-none focus:ring-1 focus:ring-blue-500 focus:border-blue-500 disabled:bg-gray-100 disabled:cursor-not-allowed" + aria-label="Search languages" + aria-expanded={isOpen} + aria-autocomplete="list" + /> + {/* Dropdown arrow */} + +
+
+ + {/* Dropdown Panel */} + {isOpen && filteredLanguages.length > 0 && ( +
    + {filteredLanguages.map(([code, name], index) => ( +
  • handleSelect(code)} + onMouseEnter={() => setHighlightedIndex(index)} + > + {name} + ({code}) +
  • + ))} +
+ )} + + {/* No results message */} + {isOpen && searchQuery && filteredLanguages.length === 0 && ( +
+ No languages found matching "{searchQuery}" +
+ )} +
+ + {/* Helper text */} +

+ {availableLanguages.length > 0 + ? `${availableLanguages.length} language${availableLanguages.length !== 1 ? 's' : ''} available` + : selectedLanguages.length > 0 + ? 'All available languages have been selected' + : ''} +

+
+ ); +} diff --git a/frontend/src/hooks/useLanguages.ts b/frontend/src/hooks/useLanguages.ts new file mode 100644 index 0000000..93196de --- /dev/null +++ b/frontend/src/hooks/useLanguages.ts @@ -0,0 +1,15 @@ +import { useQuery } from '@tanstack/react-query'; +import { apiClient } from '../lib/api'; + +/** + * Hook to fetch available TTS languages from the API. + * Languages are cached indefinitely since they don't change. + */ +export function useLanguages() { + return useQuery({ + queryKey: ['tts-languages'], + queryFn: () => apiClient.getLanguages(), + staleTime: Infinity, // Languages don't change + refetchOnWindowFocus: false, + }); +} diff --git a/frontend/src/routes/jobs/NewJob.tsx b/frontend/src/routes/jobs/NewJob.tsx index 56441a6..b1cc9a6 100644 --- a/frontend/src/routes/jobs/NewJob.tsx +++ b/frontend/src/routes/jobs/NewJob.tsx @@ -5,6 +5,7 @@ import { zodResolver } from '@hookform/resolvers/zod'; import { z } from 'zod'; import { UploadDropzone } from '../../components/UploadDropzone/UploadDropzone'; import { VoiceSelector } from '../../components/VoiceSelector'; +import { LanguageSelector } from '../../components/LanguageSelector'; import { MultiUploadFileList } from '../../components/MultiUploadFileList'; import { UploadProgressList } from '../../components/UploadProgressList'; import { useCreateJob } from '../../hooks/useJob'; @@ -22,7 +23,7 @@ const jobSchema = z.object({ audio_description_mp3: z.boolean(), accessible_video_mp4: z.boolean(), languages: z.array(z.string()), - transcreation: z.array(z.string()), + translation_mode: z.enum(['traditional', 'video_native']), }); type JobFormData = z.infer; @@ -74,12 +75,11 @@ export function NewJob() { audio_description_mp3: true, accessible_video_mp4: false, languages: [], - transcreation: [], + translation_mode: 'video_native', } }); const languages = watch('languages'); - const transcreation = watch('transcreation'); const sourceIsEnglish = watch('sourceIsEnglish'); const audioDescriptionMp3 = watch('audio_description_mp3'); @@ -129,8 +129,9 @@ export function NewJob() { audio_description_mp3: data.audio_description_mp3, accessible_video_mp4: data.accessible_video_mp4, languages: data.languages, - transcreation: data.transcreation, + transcreation: [], // Transcreation replaced by video_native translation mode tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined, + translation_mode: data.translation_mode, } }; @@ -207,8 +208,9 @@ export function NewJob() { audio_description_mp3: data.audio_description_mp3, accessible_video_mp4: data.accessible_video_mp4, languages: data.languages, - transcreation: data.transcreation, + transcreation: [], // Transcreation replaced by video_native translation mode tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined, + translation_mode: data.translation_mode, } }); }; @@ -230,7 +232,6 @@ export function NewJob() { const removeLanguage = (lang: string) => { setValue('languages', languages.filter(l => l !== lang)); - setValue('transcreation', transcreation.filter(l => l !== lang)); }; const handleReset = () => { @@ -252,8 +253,9 @@ export function NewJob() { audio_description_mp3: data.audio_description_mp3, accessible_video_mp4: data.accessible_video_mp4, languages: data.languages, - transcreation: data.transcreation, + transcreation: [], // Transcreation replaced by video_native translation mode tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined, + translation_mode: data.translation_mode, } }); }; @@ -636,48 +638,61 @@ export function NewJob() { )} {/* Target Languages */} -
- -
- {languages.map(lang => ( - - {lang} - - - ))} + + + {/* Translation Mode - Only shown when target languages are selected */} + {languages.length > 0 && ( +
+ +
+ + + +
-
- - - -
-
+ )} {/* Submit Button */}
diff --git a/frontend/src/types/api.ts b/frontend/src/types/api.ts index b9c7467..8e15482 100644 --- a/frontend/src/types/api.ts +++ b/frontend/src/types/api.ts @@ -43,6 +43,8 @@ export type TTSProvider = "gemini" | "google" | "elevenlabs"; export type TTSModel = "flash" | "pro"; export type TTSStylePreset = "neutral" | "calm" | "energetic" | "professional" | "warm" | "documentary" | "custom"; +export type TranslationMode = "traditional" | "video_native"; + export interface TTSPreferences { provider: TTSProvider; default_voice: string; @@ -61,6 +63,7 @@ export interface RequestedOutputs { languages: string[]; transcreation: string[]; tts_preferences?: TTSPreferences; + translation_mode?: TranslationMode; // "video_native" (default) or "traditional" } export interface VoicesResponse { @@ -102,7 +105,7 @@ export interface LangOutput { accessible_video_method?: AccessibleVideoMethod; retimed_captions_vtt_gcs?: string; // Re-timed captions for pause-insert method ad_cues_gcs_prefix?: string; // Path prefix for per-cue MP3 segments - origin?: "translate" | "transcreate"; + origin?: "translate" | "transcreate" | "gemini_translate" | "video_native"; qa_notes?: string; }