diff --git a/backend/app/api/v1/routes_jobs.py b/backend/app/api/v1/routes_jobs.py index 4acdc32..dce4018 100644 --- a/backend/app/api/v1/routes_jobs.py +++ b/backend/app/api/v1/routes_jobs.py @@ -68,6 +68,7 @@ async def create_job( title: str = Form(...), requested_outputs: str = Form(...), # JSON string file: UploadFile = File(...), + brand_context: Optional[str] = Form(None), current_user: User = Depends(get_current_user), db: AsyncIOMotorDatabase = Depends(get_database), ): @@ -117,6 +118,7 @@ async def create_job( "by": "system" }] }, + "brand_context": brand_context or None, "created_at": datetime.utcnow(), "updated_at": datetime.utcnow() } diff --git a/backend/app/models/job.py b/backend/app/models/job.py index caea03d..8431b1a 100644 --- a/backend/app/models/job.py +++ b/backend/app/models/job.py @@ -163,6 +163,7 @@ class Job(BaseModel): ai: Optional[AISection] = None error: Optional[dict[str, Any]] = None tts_rewrites: Optional[list[dict[str, Any]]] = None # Track auto-rewritten TTS cues + brand_context: Optional[str] = None # Brand names present in the video for accurate product identification created_at: Optional[datetime] = None updated_at: Optional[datetime] = None @@ -176,6 +177,7 @@ class JobCreate(BaseModel): source_is_english: bool = True # True = English source, False = other language (auto-detect) language_hint: Optional[str] = None # Optional hint when source_is_english=False requested_outputs: RequestedOutputs + brand_context: Optional[str] = None # Comma-separated brand names present in the video (e.g. "Sellotape, Coca-Cola") class JobUpdate(BaseModel): diff --git a/backend/app/prompts/gemini_ingestion.md b/backend/app/prompts/gemini_ingestion.md index a7e9046..3fe62bf 100644 --- a/backend/app/prompts/gemini_ingestion.md +++ b/backend/app/prompts/gemini_ingestion.md @@ -20,7 +20,7 @@ CRITICAL LANGUAGE REQUIREMENT: Constraints: - Output MUST be valid JSON. Do not include markdown fences or any other text. - All JSON strings must be properly escaped (use \" for quotes within strings) -- Use detailed, descriptive audio description phrases that paint a vivid picture. Aim for rich descriptions that are 20% longer than typical AD, providing enhanced visual context without duplicating spoken dialogue. +- Write clear, concise audio descriptions — prioritise accuracy and comprehension over length. Be succinct; omit redundant or obvious details. - WebVTT must start with "WEBVTT" and follow this exact format: - Timestamp format: HH:MM:SS.mmm --> HH:MM:SS.mmm (ALWAYS include hours, even if 00:) - Example: "00:01:23.456 --> 00:01:27.890" @@ -39,13 +39,35 @@ CRITICAL TIMING REQUIREMENTS: - For audio descriptions, time them during natural speech gaps or over non-dialogue audio - Validate that all timestamps are monotonically increasing (each cue starts after the previous one ends) +BRAND NAMES AND PRODUCTS: +{BRAND_CONTEXT} +- When you can clearly identify a product that matches a brand in the provided list, use the brand name rather than a generic descriptor (e.g., "Sellotape" not "sticky tape", "Post-it notes" not "sticky notes") +- Only use brand names when you are confident of the identification from visible labels, logos, or distinctive design +- If a product is not on the list or is unclear, use a generic descriptor — do not guess + +ETHICAL GUIDELINES FOR DESCRIBING PEOPLE: +- Describe people objectively and factually based on what is clearly visible — do not interpret, assume, or editorialize +- Use person-first, inclusive language (e.g., "a person using a wheelchair" not "a wheelchair-bound person"; "officer" not "policeman") +- Describe race, ethnicity, gender, age, or other personal characteristics ONLY when they are relevant to the narrative or plot. When you describe these characteristics for one person, be consistent and describe them for all relevant people in the same scene +- Do NOT guess at racial, ethnic, gender, or religious identity if it is not clearly confirmed by visual context or dialogue — use general descriptors instead (e.g., "a middle-aged person" rather than specifying ethnicity when uncertain) +- For disabilities or medical conditions: describe observable facts only (e.g., "a person with a prosthetic leg" — do not interpret emotional state or capability) +- Avoid language that stereotypes, sensationalises, or assigns motivation based on appearance + AUDIO DESCRIPTION GUIDELINES: -- Provide rich, detailed descriptions that include setting, characters, actions, facial expressions, body language, and visual mood -- Describe colors, lighting, camera angles, and composition when relevant to understanding -- Include environmental details like weather, time of day, architectural features, or technological elements -- Mention clothing, objects, and spatial relationships that contribute to scene understanding -- Use vivid, engaging language that creates a complete mental picture for visually impaired viewers -- Aim for descriptions that are substantive enough to fill natural pauses and reduce silence between spoken content +Priority order for what to describe (use available time wisely): +1. ESSENTIAL: Actions and details critical for following the narrative; information that would cause confusion if omitted; scene context and setting +2. HIGH PRIORITY: Significant character appearance relevant to the story; visual details supporting understanding; scene changes and time passages +3. TIME-PERMITTING: Additional aesthetic or contextual details + +Rules: +- Place descriptions BEFORE the visual content they refer to when possible (pre-teaching), not after +- Use present tense, active voice, and third-person narrative +- Describe actions and observable gestures; do NOT infer or state emotions unless clearly displayed (e.g., "She covers her face with her hands" not "She looks devastated") +- Do NOT use cinematic terminology such as "close-up", "pan", "cut to", "flashback", or "montage" unless absolutely necessary for comprehension +- Describe on-screen text (titles, signs, captions, graphics) that is not already spoken in the audio +- Describe colors, clothing, setting, and spatial relationships when relevant to understanding +- Be succinct — omit redundant or self-evident details +- Do NOT duplicate information already in the spoken dialogue CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text. diff --git a/backend/app/prompts/gemini_ingestion_targeted.md b/backend/app/prompts/gemini_ingestion_targeted.md index 2f2c8a8..39a47d5 100644 --- a/backend/app/prompts/gemini_ingestion_targeted.md +++ b/backend/app/prompts/gemini_ingestion_targeted.md @@ -24,7 +24,7 @@ CRITICAL LANGUAGE REQUIREMENT: Constraints: - Output MUST be valid JSON. Do not include markdown fences or any other text. - All JSON strings must be properly escaped (use \" for quotes within strings) -- Use detailed, descriptive audio description phrases that paint a vivid picture. Aim for rich descriptions that are 20% longer than typical AD, providing enhanced visual context without duplicating spoken dialogue. +- Write clear, concise audio descriptions — prioritise accuracy and comprehension over length. Be succinct; omit redundant or obvious details. - WebVTT must start with "WEBVTT" and follow this exact format: - Timestamp format: HH:MM:SS.mmm --> HH:MM:SS.mmm (ALWAYS include hours, even if 00:) - Example: "00:01:23.456 --> 00:01:27.890" @@ -43,13 +43,35 @@ CRITICAL TIMING REQUIREMENTS: - For audio descriptions, time them during natural speech gaps or over non-dialogue audio - Validate that all timestamps are monotonically increasing (each cue starts after the previous one ends) +BRAND NAMES AND PRODUCTS: +{BRAND_CONTEXT} +- When you can clearly identify a product that matches a brand in the provided list, use the brand name rather than a generic descriptor (e.g., "Sellotape" not "sticky tape", "Post-it notes" not "sticky notes") +- Only use brand names when you are confident of the identification from visible labels, logos, or distinctive design +- If a product is not on the list or is unclear, use a generic descriptor — do not guess + +ETHICAL GUIDELINES FOR DESCRIBING PEOPLE: +- Describe people objectively and factually based on what is clearly visible — do not interpret, assume, or editorialize +- Use person-first, inclusive language (e.g., "a person using a wheelchair" not "a wheelchair-bound person"; "officer" not "policeman") +- Describe race, ethnicity, gender, age, or other personal characteristics ONLY when they are relevant to the narrative or plot. When you describe these characteristics for one person, be consistent and describe them for all relevant people in the same scene +- Do NOT guess at racial, ethnic, gender, or religious identity if it is not clearly confirmed by visual context or dialogue — use general descriptors instead +- For disabilities or medical conditions: describe observable facts only — do not interpret emotional state or capability +- Avoid language that stereotypes, sensationalises, or assigns motivation based on appearance + AUDIO DESCRIPTION GUIDELINES: -- Provide rich, detailed descriptions that include setting, characters, actions, facial expressions, body language, and visual mood -- Describe colors, lighting, camera angles, and composition when relevant to understanding -- Include environmental details like weather, time of day, architectural features, or technological elements -- Mention clothing, objects, and spatial relationships that contribute to scene understanding -- Use vivid, engaging language that creates a complete mental picture for visually impaired viewers -- Aim for descriptions that are substantive enough to fill natural pauses and reduce silence between spoken content +Priority order for what to describe (use available time wisely): +1. ESSENTIAL: Actions and details critical for following the narrative; information that would cause confusion if omitted; scene context and setting +2. HIGH PRIORITY: Significant character appearance relevant to the story; visual details supporting understanding; scene changes and time passages +3. TIME-PERMITTING: Additional aesthetic or contextual details + +Rules: +- Place descriptions BEFORE the visual content they refer to when possible (pre-teaching), not after +- Use present tense, active voice, and third-person narrative +- Describe actions and observable gestures; do NOT infer or state emotions unless clearly displayed (e.g., describe the gesture, not the inferred feeling) +- Do NOT use cinematic terminology such as "close-up", "pan", "cut to", "flashback", or "montage" unless absolutely necessary for comprehension +- Describe on-screen text (titles, signs, captions, graphics) that is not already spoken in the audio +- Describe colors, clothing, setting, and spatial relationships when relevant to understanding +- Be succinct — omit redundant or self-evident details +- Do NOT duplicate information already in the spoken dialogue - Write all descriptions in natural, fluent {TARGET_LANGUAGE} CRITICAL: Return ONLY valid JSON that can be parsed by JSON.parse(). No additional text. diff --git a/backend/app/services/gemini.py b/backend/app/services/gemini.py index 405ba2d..85a85d7 100644 --- a/backend/app/services/gemini.py +++ b/backend/app/services/gemini.py @@ -59,12 +59,25 @@ class GeminiService: logger.error(f"File {file_name} did not become ACTIVE within {max_wait_seconds}s") return False - async def extract_accessibility(self, video_file_path: str) -> dict[str, Any]: + def _build_brand_context_block(self, brand_context: Optional[str]) -> str: + """Build the brand context instruction block for injection into prompts.""" + if brand_context and brand_context.strip(): + brands = [b.strip() for b in brand_context.split(",") if b.strip()] + if brands: + brand_list = ", ".join(f'"{b}"' for b in brands) + return ( + f"The client has confirmed the following brand names appear in this video: {brand_list}. " + f"Use these exact brand names when you identify those products on screen." + ) + return "No specific brand names have been provided for this video." + + async def extract_accessibility(self, video_file_path: str, brand_context: Optional[str] = None) -> dict[str, Any]: """ Extract captions and audio descriptions from video using Gemini 2.0 Returns structured JSON with transcript, captions VTT, and audio description VTT """ - prompt = self._load_prompt("gemini_ingestion.md") + prompt_template = self._load_prompt("gemini_ingestion.md") + prompt = prompt_template.replace("{BRAND_CONTEXT}", self._build_brand_context_block(brand_context)) uploaded_file = None try: @@ -244,7 +257,8 @@ Fix the JSON and return it: async def extract_accessibility_targeted( self, video_file_path: str, - target_language: str + target_language: str, + brand_context: Optional[str] = None ) -> dict[str, Any]: """ Extract captions and audio descriptions from video using Gemini, @@ -258,13 +272,16 @@ Fix the JSON and return it: Args: video_file_path: Path to the video file target_language: BCP-47 language code (e.g., "es", "fr", "de") + brand_context: Optional comma-separated brand names present in the video Returns: Structured JSON with transcript, captions VTT, and audio description VTT all in the target language """ prompt_template = self._load_prompt("gemini_ingestion_targeted.md") - prompt = prompt_template.replace("{TARGET_LANGUAGE}", target_language) + prompt = prompt_template.replace("{TARGET_LANGUAGE}", target_language).replace( + "{BRAND_CONTEXT}", self._build_brand_context_block(brand_context) + ) uploaded_file = None try: diff --git a/backend/app/tasks/ingest_and_ai.py b/backend/app/tasks/ingest_and_ai.py index f0c33dd..2691116 100644 --- a/backend/app/tasks/ingest_and_ai.py +++ b/backend/app/tasks/ingest_and_ai.py @@ -203,7 +203,8 @@ async def ingest_and_ai_task_impl(job_id: str): ) # Process with Gemini - ai_result = await gemini_service.extract_accessibility(temp_path) + brand_context = job_doc.get("brand_context") + ai_result = await gemini_service.extract_accessibility(temp_path, brand_context=brand_context) # Final safety check for required fields required_fields = ["captions_vtt", "audio_description_vtt"] diff --git a/backend/app/tasks/translate_and_synthesize.py b/backend/app/tasks/translate_and_synthesize.py index eada0d0..eb9bd44 100644 --- a/backend/app/tasks/translate_and_synthesize.py +++ b/backend/app/tasks/translate_and_synthesize.py @@ -226,6 +226,8 @@ async def _async_translate_and_synthesize(job_id: str): semaphore = asyncio.Semaphore(MAX_CONCURRENT_VIDEO_NATIVE) + job_brand_context = job_doc.get("brand_context") + async def translate_language_video_native(lang: str) -> tuple[str, str, str, str | None]: """Process a single language with video-native translation. Returns: (language, captions_gcs_uri, ad_gcs_uri, error_message or None) @@ -236,7 +238,8 @@ async def _async_translate_and_synthesize(job_id: str): async def extract_targeted(): return await gemini_service.extract_accessibility_targeted( video_local_path, - lang + lang, + brand_context=job_brand_context ) result = await retry_with_backoff(extract_targeted, max_retries=3) diff --git a/frontend/src/hooks/useMultiUpload.ts b/frontend/src/hooks/useMultiUpload.ts index bc692ee..391bf9b 100644 --- a/frontend/src/hooks/useMultiUpload.ts +++ b/frontend/src/hooks/useMultiUpload.ts @@ -12,6 +12,7 @@ export interface FileListItem { export interface SharedJobSettings { requestedOutputs: RequestedOutputs; + brandContext?: string; } interface UseMultiUploadOptions { @@ -106,6 +107,7 @@ export function useMultiUpload(options: UseMultiUploadOptions = {}): UseMultiUpl { title: item.autoTitle, requested_outputs: settings.requestedOutputs, + brand_context: settings.brandContext, }, item.file, (progressEvent) => { diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index 9a6c516..b8ce0b8 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -159,6 +159,9 @@ class ApiClient { const formData = new FormData(); formData.append('title', data.title); formData.append('requested_outputs', JSON.stringify(data.requested_outputs)); + if (data.brand_context) { + formData.append('brand_context', data.brand_context); + } formData.append('file', file); const response = await this.client.post('/jobs', formData, { diff --git a/frontend/src/routes/jobs/NewJob.tsx b/frontend/src/routes/jobs/NewJob.tsx index 117ba28..a8ca812 100644 --- a/frontend/src/routes/jobs/NewJob.tsx +++ b/frontend/src/routes/jobs/NewJob.tsx @@ -36,6 +36,7 @@ export function NewJob() { const multiUpload = useMultiUpload({ maxConcurrent: 3 }); // Shared state + const [brandContext, setBrandContext] = useState(''); const [showVoiceSettings, setShowVoiceSettings] = useState(false); const [ttsPreferences, setTtsPreferences] = useState({ provider: 'gemini', @@ -130,7 +131,8 @@ export function NewJob() { transcreation: [], // Transcreation replaced by video_native translation mode tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined, translation_mode: data.translation_mode, - } + }, + brand_context: brandContext.trim() || undefined, }; try { @@ -208,7 +210,8 @@ export function NewJob() { transcreation: [], // Transcreation replaced by video_native translation mode tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined, translation_mode: data.translation_mode, - } + }, + brandContext: brandContext.trim() || undefined, }); }; @@ -252,7 +255,8 @@ export function NewJob() { transcreation: [], // Transcreation replaced by video_native translation mode tts_preferences: data.audio_description_mp3 ? ttsPreferences : undefined, translation_mode: data.translation_mode, - } + }, + brandContext: brandContext.trim() || undefined, }); }; @@ -673,6 +677,24 @@ export function NewJob() { )} + {/* Brand Context */} +
+ + setBrandContext(e.target.value)} + className="w-full px-3 py-2 border border-gray-300 rounded-md focus:outline-none focus:ring-2 focus:ring-blue-500" + placeholder="e.g. Sellotape, Coca-Cola, Apple iPhone" + disabled={isUploading} + /> +

+ List brand names visible in the video so the AI uses them instead of generic terms (e.g. "Sellotape" instead of "sticky tape"). +

+
+ {/* Submit Button */}