350 lines
14 KiB
Python
350 lines
14 KiB
Python
import json
|
|
import asyncio
|
|
from pathlib import Path
|
|
from typing import Any, Optional
|
|
|
|
import google.genai as genai
|
|
|
|
from ..core.config import settings
|
|
from ..core.logging import get_logger
|
|
|
|
# Module-level logger obtained from the project's logging helper.
logger = get_logger(__name__)
|
|
|
|
# Gemini API client, constructed once at import time with the API key from settings.
# The GeminiService methods below all go through this single module-level client.
|
|
client = genai.Client(api_key=settings.gemini_api_key)
|
|
|
|
class GeminiService:
|
|
def __init__(self):
|
|
self.model_name = 'gemini-2.5-pro' # Stable production model
|
|
self.prompts_dir = Path(__file__).parent.parent / "prompts"
|
|
|
|
def _load_prompt(self, prompt_file: str) -> str:
|
|
"""Load prompt template from prompts directory"""
|
|
prompt_path = self.prompts_dir / prompt_file
|
|
try:
|
|
return prompt_path.read_text()
|
|
except FileNotFoundError:
|
|
logger.error(f"Prompt file not found: {prompt_file}")
|
|
raise
|
|
|
|
async def _wait_for_file_active(self, file_name: str, max_wait_seconds: int = 300) -> bool:
|
|
"""Wait for uploaded file to become ACTIVE state"""
|
|
wait_time = 1 # Start with 1 second
|
|
total_waited = 0
|
|
|
|
while total_waited < max_wait_seconds:
|
|
try:
|
|
# Get file status
|
|
file_info = client.files.get(name=file_name)
|
|
logger.info(f"File {file_name} status: {file_info.state} (waited {total_waited}s)")
|
|
|
|
if file_info.state == "ACTIVE":
|
|
logger.info(f"File {file_name} is now ACTIVE!")
|
|
return True
|
|
elif file_info.state == "FAILED":
|
|
logger.error(f"File {file_name} processing FAILED")
|
|
return False
|
|
|
|
# Wait with exponential backoff (max 30s)
|
|
logger.info(f"File not ready, waiting {wait_time}s...")
|
|
await asyncio.sleep(wait_time)
|
|
total_waited += wait_time
|
|
wait_time = min(wait_time * 1.5, 30) # Exponential backoff, max 30s
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error checking file status: {e}")
|
|
await asyncio.sleep(5) # Wait 5s on error
|
|
total_waited += 5
|
|
|
|
logger.error(f"File {file_name} did not become ACTIVE within {max_wait_seconds}s")
|
|
return False
|
|
|
|
async def extract_accessibility(self, video_file_path: str) -> dict[str, Any]:
|
|
"""
|
|
Extract captions and audio descriptions from video using Gemini 2.0
|
|
Returns structured JSON with transcript, captions VTT, and audio description VTT
|
|
"""
|
|
prompt = self._load_prompt("gemini_ingestion.md")
|
|
|
|
try:
|
|
logger.info(f"Starting Gemini processing for video: {video_file_path}")
|
|
|
|
# Upload video file to Gemini using new API
|
|
logger.info("Uploading video file to Gemini API...")
|
|
uploaded_file = client.files.upload(
|
|
file=video_file_path,
|
|
config={
|
|
"display_name": f"video_processing_{Path(video_file_path).name}",
|
|
"mime_type": "video/mp4"
|
|
}
|
|
)
|
|
logger.info(f"Successfully uploaded file: {uploaded_file.name} (URI: {uploaded_file.uri})")
|
|
|
|
# Wait for file to become ACTIVE before using it
|
|
logger.info("Waiting for file to become ACTIVE...")
|
|
file_ready = await self._wait_for_file_active(uploaded_file.name)
|
|
if not file_ready:
|
|
raise Exception("File failed to become ACTIVE within timeout")
|
|
|
|
# Generate content using new API
|
|
logger.info("Generating content with Gemini model...")
|
|
response = client.models.generate_content(
|
|
model=self.model_name,
|
|
contents=[
|
|
genai.types.Part.from_text(text=prompt),
|
|
genai.types.Part.from_uri(
|
|
file_uri=uploaded_file.uri,
|
|
mime_type=uploaded_file.mime_type
|
|
)
|
|
]
|
|
)
|
|
|
|
# Parse JSON response
|
|
response_text = response.text.strip()
|
|
logger.info(f"Received Gemini response (first 200 chars): {response_text[:200]}...")
|
|
|
|
# Handle potential markdown formatting
|
|
if response_text.startswith("```json"):
|
|
response_text = response_text.replace("```json", "").replace("```", "").strip()
|
|
logger.info("Cleaned markdown formatting from response")
|
|
|
|
# Additional cleanup for common JSON issues
|
|
response_text = response_text.strip()
|
|
|
|
logger.info("Parsing JSON response...")
|
|
try:
|
|
result = json.loads(response_text)
|
|
except json.JSONDecodeError as e:
|
|
logger.error(f"JSON parse error at position {e.pos}: {e.msg}")
|
|
# Log the problematic area
|
|
start = max(0, e.pos - 100)
|
|
end = min(len(response_text), e.pos + 100)
|
|
problematic_text = response_text[start:end]
|
|
logger.error(f"Problematic JSON area: ...{problematic_text}...")
|
|
raise
|
|
|
|
# Validate required fields
|
|
required_fields = [
|
|
"language", "confidence", "summary",
|
|
"transcript_plaintext", "captions_vtt", "audio_description_vtt"
|
|
]
|
|
|
|
for field in required_fields:
|
|
if field not in result:
|
|
raise ValueError(f"Missing required field: {field}")
|
|
|
|
# Validate VTT format
|
|
if not result["captions_vtt"].startswith("WEBVTT"):
|
|
raise ValueError("Invalid captions VTT format")
|
|
|
|
if not result["audio_description_vtt"].startswith("WEBVTT"):
|
|
raise ValueError("Invalid audio description VTT format")
|
|
|
|
logger.info(
|
|
f"Successfully extracted accessibility content with confidence: {result['confidence']}"
|
|
)
|
|
|
|
# Clean up uploaded file
|
|
try:
|
|
client.files.delete(name=uploaded_file.name)
|
|
except Exception as e:
|
|
logger.warning(f"Failed to cleanup uploaded file: {e}")
|
|
|
|
return result
|
|
|
|
except json.JSONDecodeError as e:
|
|
logger.error(f"Failed to parse Gemini JSON response: {e}")
|
|
logger.error(f"Raw response that failed to parse: {response_text}")
|
|
# Attempt self-healing
|
|
return await self._self_heal_response(video_file_path, response_text)
|
|
except Exception as e:
|
|
logger.error(f"Gemini extraction failed with exception: {type(e).__name__}: {str(e)}")
|
|
logger.error(f"Video file path: {video_file_path}")
|
|
# Print to stdout for immediate visibility
|
|
print(f"🚨 GEMINI ERROR: {type(e).__name__}: {str(e)}")
|
|
raise
|
|
|
|
async def _self_heal_response(self, video_file_path: str, invalid_response: str) -> dict[str, Any]:
|
|
"""Attempt to self-heal invalid JSON response from Gemini"""
|
|
logger.info("Attempting to self-heal JSON response without re-uploading video")
|
|
|
|
# Try to fix common JSON issues first
|
|
try:
|
|
fixed_response = self._attempt_json_fix(invalid_response)
|
|
if fixed_response:
|
|
logger.info("Successfully fixed JSON without re-processing")
|
|
return fixed_response
|
|
except Exception as e:
|
|
logger.warning(f"JSON fix attempt failed: {e}")
|
|
|
|
# If simple fixes don't work, try a text-only self-heal prompt with more context
|
|
self_heal_prompt = f"""
|
|
SYSTEM: You are a JSON repair service. Fix the malformed JSON below and return ONLY the corrected JSON.
|
|
|
|
CRITICAL REQUIREMENTS:
|
|
- The JSON MUST contain these exact fields: language, confidence, summary, transcript_plaintext, captions_vtt, audio_description_vtt
|
|
- If audio_description_vtt is truncated or missing, reconstruct it as a valid WebVTT with at least basic descriptions
|
|
- All VTT content must start with "WEBVTT" and have proper timestamp format (HH:MM:SS.mmm --> HH:MM:SS.mmm)
|
|
- Properly escape all quotes within strings using \"
|
|
- Fix unterminated strings by adding closing quotes
|
|
- Remove trailing commas
|
|
- Ensure all JSON is properly closed with }}
|
|
|
|
Fix the JSON and return it:
|
|
|
|
{invalid_response}
|
|
"""
|
|
|
|
try:
|
|
response = client.models.generate_content(
|
|
model=self.model_name,
|
|
contents=[genai.types.Part.from_text(text=self_heal_prompt)]
|
|
)
|
|
|
|
response_text = response.text.strip()
|
|
|
|
# Handle potential markdown formatting
|
|
if response_text.startswith("```json"):
|
|
response_text = response_text.replace("```json", "").replace("```", "").strip()
|
|
|
|
result = json.loads(response_text)
|
|
|
|
# Validate that all required fields are present after healing
|
|
required_fields = [
|
|
"language", "confidence", "summary",
|
|
"transcript_plaintext", "captions_vtt", "audio_description_vtt"
|
|
]
|
|
|
|
missing_fields = [field for field in required_fields if field not in result]
|
|
if missing_fields:
|
|
logger.error(f"Self-heal lost required fields: {missing_fields}")
|
|
# If audio_description_vtt is missing, create a basic one
|
|
if "audio_description_vtt" in missing_fields:
|
|
logger.info("Creating fallback audio_description_vtt")
|
|
result["audio_description_vtt"] = "WEBVTT\n\n00:00:00.000 --> 00:00:05.000\nVideo content with visual elements described."
|
|
|
|
# If other critical fields are missing, raise an error
|
|
remaining_missing = [f for f in missing_fields if f != "audio_description_vtt"]
|
|
if remaining_missing:
|
|
raise ValueError(f"Self-heal failed to preserve required fields: {remaining_missing}")
|
|
|
|
logger.info("Successfully self-healed Gemini response with all required fields")
|
|
return result
|
|
|
|
except Exception as e:
|
|
logger.error(f"Self-heal attempt failed: {e}")
|
|
raise ValueError("Failed to get valid JSON from Gemini after self-heal attempt")
|
|
|
|
def _attempt_json_fix(self, json_text: str) -> dict[str, Any] | None:
|
|
"""Attempt to fix common JSON syntax issues"""
|
|
# Try to identify and fix common issues
|
|
fixes_tried = []
|
|
fixed_text = json_text
|
|
import re
|
|
|
|
# Fix 1: Remove trailing commas
|
|
fixed_text = re.sub(r',(\s*[}\]])', r'\1', fixed_text)
|
|
fixes_tried.append("removed trailing commas")
|
|
|
|
# Fix 2: Try to fix unterminated strings by adding closing quote and brace
|
|
if fixed_text.count('"') % 2 != 0: # Odd number of quotes suggests unterminated string
|
|
# Find the last quote and see if we need to close the JSON
|
|
last_quote_pos = fixed_text.rfind('"')
|
|
remainder = fixed_text[last_quote_pos + 1:].strip()
|
|
|
|
# If there's no closing brace after the last quote, try to fix it
|
|
if remainder and not remainder.endswith('}'):
|
|
# Try to intelligently close the JSON
|
|
if 'audio_description_vtt' in fixed_text[max(0, last_quote_pos - 100):]:
|
|
# This appears to be in the audio_description_vtt field
|
|
fixed_text += '"\n}'
|
|
fixes_tried.append("closed unterminated audio_description_vtt string")
|
|
else:
|
|
fixed_text += '"'
|
|
fixes_tried.append("closed unterminated string")
|
|
|
|
# Fix 3: Ensure JSON ends with closing brace
|
|
if not fixed_text.rstrip().endswith('}'):
|
|
fixed_text = fixed_text.rstrip() + '\n}'
|
|
fixes_tried.append("added closing brace")
|
|
|
|
try:
|
|
result = json.loads(fixed_text)
|
|
logger.info(f"JSON fixed with: {', '.join(fixes_tried)}")
|
|
|
|
# Validate that we have the required fields
|
|
required_fields = [
|
|
"language", "confidence", "summary",
|
|
"transcript_plaintext", "captions_vtt", "audio_description_vtt"
|
|
]
|
|
|
|
missing_fields = [field for field in required_fields if field not in result]
|
|
if missing_fields:
|
|
logger.warning(f"Fixed JSON is missing required fields: {missing_fields}")
|
|
return None # Let the more advanced self-healing handle this
|
|
|
|
return result
|
|
except json.JSONDecodeError as e:
|
|
logger.debug(f"JSON fix attempt failed: {e}")
|
|
return None
|
|
|
|
async def transcreate_content(
|
|
self,
|
|
captions_vtt: str,
|
|
ad_vtt: str,
|
|
target_language: str,
|
|
brief: Optional[str] = None
|
|
) -> dict[str, str]:
|
|
"""
|
|
Transcreate English VTT content to target language with cultural adaptation
|
|
"""
|
|
prompt_template = self._load_prompt("gemini_transcreation.md")
|
|
|
|
# Format prompt with actual content
|
|
prompt = prompt_template.format(
|
|
TARGET_LANGUAGE=target_language
|
|
)
|
|
|
|
user_prompt = f"""
|
|
Input:
|
|
- captions_vtt_en: {captions_vtt}
|
|
- ad_vtt_en: {ad_vtt}
|
|
- brief: {brief or "No specific brand guidelines provided"}
|
|
|
|
Output:
|
|
JSON:
|
|
"""
|
|
|
|
try:
|
|
response = client.models.generate_content(
|
|
model=self.model_name,
|
|
contents=[
|
|
genai.types.Part.from_text(text=prompt + "\n\n" + user_prompt)
|
|
]
|
|
)
|
|
|
|
response_text = response.text.strip()
|
|
|
|
# Handle potential markdown formatting
|
|
if response_text.startswith("```json"):
|
|
response_text = response_text.replace("```json", "").replace("```", "").strip()
|
|
|
|
result = json.loads(response_text)
|
|
|
|
# Validate required fields
|
|
if "captions_vtt" not in result or "audio_description_vtt" not in result:
|
|
raise ValueError("Missing required VTT fields in transcreation response")
|
|
|
|
logger.info(f"Successfully transcreated content to {target_language}")
|
|
return result
|
|
|
|
except json.JSONDecodeError as e:
|
|
logger.error(f"Failed to parse transcreation JSON response: {e}")
|
|
raise ValueError("Invalid JSON response from transcreation")
|
|
except Exception as e:
|
|
logger.error(f"Transcreation failed: {e}")
|
|
raise
|
|
|
|
|
|
# Global service instance: a single GeminiService created when this module is imported.
|
|
gemini_service = GeminiService()
|