- Reorder workflow: translations now happen BEFORE QC Review step
- Add language tabs to switch between translated languages in QC
- Add video mode tabs (Original Video / Accessible Video)
- Add interactive timeline preview showing video segments and AD cues
- Enable pause point adjustment with millisecond precision
- Add TTS regeneration queue for selective cue re-synthesis
- Add re-render controls with optional Whisper refinement
- Persist video segments and TTS MP3s to GCS for editability
- Add new RENDERING_QC job status for re-render operations
- Create 5 new API endpoints for accessible video editing
- Add rerender_accessible_video.py Celery task

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
497 lines
19 KiB
Python
497 lines
19 KiB
Python
"""Celery task for re-rendering accessible video with QC changes."""
|
|
|
|
import asyncio
|
|
import io
|
|
import os
|
|
import tempfile
|
|
from datetime import datetime
|
|
|
|
from celery.result import allow_join_result
|
|
from motor.motor_asyncio import AsyncIOMotorClient
|
|
from pydub import AudioSegment
|
|
|
|
from ..core.config import settings
|
|
from ..core.logging import get_logger
|
|
from ..lib.vtt import VTTParser
|
|
from ..models.job import AccessibleVideoEditState, JobStatus, PausePointData, VideoSegmentMetadata
|
|
from ..services.gcs import gcs_service
|
|
from ..services.video_renderer import video_renderer_service
|
|
from ..services.vtt_retimer import vtt_retimer_service
|
|
from ..services.whisper_service import WordTimestamp, whisper_service
|
|
from . import celery_app
|
|
from .render_accessible_video import _extract_audio_for_whisper, _dispatch_whisper_transcription
|
|
from .translate_and_synthesize import broadcast_status_update
|
|
from .tts_synthesis import dispatch_language_tts, parse_ad_cues, synthesize_cue_task
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
@celery_app.task(bind=True, time_limit=7200, soft_time_limit=7000)
def rerender_accessible_video_task(
    self,
    job_id: str,
    language: str,
    regenerate_cue_indices: list[int],
    whisper_refine: bool = False
):
    """
    Re-render accessible video during QC review with selective TTS regeneration.

    This task:
    1. If regenerate_cue_indices not empty: synthesize new TTS for those cues
    2. Download source video and existing segments/MP3s
    3. If whisper_refine: run Whisper pause point refinement
    4. Re-render video using updated pause points and new/existing TTS
    5. Update job status back to PENDING_QC

    Args:
        job_id: Job ID
        language: Language being re-rendered
        regenerate_cue_indices: List of cue indices to regenerate TTS for
        whisper_refine: Whether to run Whisper pause point refinement

    Returns:
        Whatever the async implementation returns.

    Raises:
        Exception: Any failure from the async implementation is re-raised
            after the job has been (best-effort) moved back to PENDING_QC
            with the error recorded.
    """
    logger.info(
        f"Starting accessible video re-render for job {job_id}/{language}: "
        f"regenerate={regenerate_cue_indices}, whisper_refine={whisper_refine}"
    )

    try:
        result = asyncio.run(_async_rerender_accessible_video(
            job_id, language, regenerate_cue_indices, whisper_refine
        ))
    except Exception as e:
        logger.error(f"Accessible video re-render failed for job {job_id}/{language}: {e}")
        import traceback
        logger.error(f"Full traceback: {traceback.format_exc()}")

        # Update job status back to PENDING_QC with error.
        # This bookkeeping is best-effort: if it fails too (e.g. the database
        # is unreachable), log it but still surface the ORIGINAL render error
        # to Celery instead of letting the secondary failure replace it.
        try:
            asyncio.run(_mark_rerender_failed(job_id, language, str(e)))
        except Exception as mark_error:
            logger.error(
                f"Could not record re-render failure for job {job_id}/{language}: {mark_error}"
            )
        raise

    logger.info(f"Accessible video re-render completed for job {job_id}/{language}")
    return result
|
|
|
|
|
|
async def _mark_rerender_failed(job_id: str, language: str, error_message: str):
    """
    Record a failed re-render and hand the job back to QC.

    Sets the job status to PENDING_QC, stores the error text on the
    language's accessible-video edit state, appends a system entry to the
    review history, and broadcasts the status change.

    Args:
        job_id: Job ID
        language: Language whose re-render failed
        error_message: Error text to store (truncated in the history note
            and in the broadcast message)
    """
    client = AsyncIOMotorClient(settings.mongodb_uri)
    db = client[settings.mongodb_db]

    try:
        # Build the two halves of the update separately for readability.
        set_fields = {
            "status": JobStatus.PENDING_QC.value,
            f"outputs.{language}.accessible_video_edit_state.last_render_error": error_message,
            "updated_at": datetime.utcnow(),
        }
        history_entry = {
            "at": datetime.utcnow(),
            "status": JobStatus.PENDING_QC.value,
            "by": "system",
            "notes": f"Re-render failed for {language}: {error_message[:200]}",
        }
        await db.jobs.update_one(
            {"_id": job_id},
            {"$set": set_fields, "$push": {"review.history": history_entry}},
        )

        # Re-read the job only to pick up its title for the broadcast.
        job_doc = await db.jobs.find_one({"_id": job_id})
        if job_doc:
            title = job_doc.get("title")
        else:
            title = None

        broadcast_status_update(
            job_id,
            JobStatus.PENDING_QC.value,
            job_title=title,
            message=f"Re-render failed: {error_message[:100]}",
        )
    finally:
        client.close()
|
|
|
|
|
|
async def _async_rerender_accessible_video(
    job_id: str,
    language: str,
    regenerate_cue_indices: list[int],
    whisper_refine: bool
):
    """
    Async implementation of accessible video re-rendering.

    Loads the job, optionally regenerates TTS for selected cues, downloads
    the source video / AD VTT / per-cue MP3s, rebuilds placements from the
    stored pause points, optionally refines them with Whisper, renders and
    uploads the video (plus re-timed captions for pause-insert), and finally
    writes the results back to the job and returns it to PENDING_QC.

    Args:
        job_id: Job ID
        language: Language being re-rendered
        regenerate_cue_indices: Cue indices whose TTS must be re-synthesized
        whisper_refine: Whether to run Whisper pause point refinement

    Raises:
        ValueError: If the job, its language outputs, its edit state, the
            AD VTT or the AD cue prefix is missing.
    """
    logger.info(f"Async re-render started for job {job_id}/{language}")

    def to_object_path(gcs_uri: str) -> str:
        # Strip the "gs://<bucket>/" prefix to get the in-bucket object path.
        return gcs_uri.replace(f"gs://{settings.gcs_bucket}/", "")

    def to_gcs_uri(object_path: str) -> str:
        # Inverse of to_object_path for objects in the configured bucket.
        return f"gs://{settings.gcs_bucket}/{object_path}"

    client = AsyncIOMotorClient(settings.mongodb_uri)
    db = client[settings.mongodb_db]
    output_key = f"outputs.{language}"

    try:
        # Load and validate the job document
        job_doc = await db.jobs.find_one({"_id": job_id})
        if not job_doc:
            raise ValueError(f"Job {job_id} not found")

        job_title = job_doc.get("title", "Untitled Job")

        lang_output = job_doc.get("outputs", {}).get(language)
        if not lang_output:
            raise ValueError(f"No outputs found for language {language}")

        edit_state = lang_output.get("accessible_video_edit_state")
        if not edit_state:
            raise ValueError(f"No edit state found for language {language}")

        # Honour TMPDIR when set; otherwise tempfile picks its default location
        with tempfile.TemporaryDirectory(dir=os.environ.get('TMPDIR', None)) as work_dir:
            # 1. Fetch the source video
            source_object = to_object_path(job_doc["source"]["gcs_uri"])
            source_video_path = os.path.join(work_dir, "source.mp4")

            logger.info(f"Downloading source video from {source_object}")
            gcs_service.bucket.blob(source_object).download_to_filename(source_video_path)

            # 2. Re-synthesize queued cues, then empty the queue
            if regenerate_cue_indices:
                logger.info(f"Regenerating TTS for cues: {regenerate_cue_indices}")
                await _regenerate_tts_cues(
                    job_id, language, regenerate_cue_indices, job_doc, db, work_dir
                )

                queue_reset = {
                    f"{output_key}.accessible_video_edit_state.tts_regeneration_queue": [],
                    "updated_at": datetime.utcnow(),
                }
                await db.jobs.update_one({"_id": job_id}, {"$set": queue_reset})

            # 3. Fetch the AD VTT and the per-cue MP3s
            ad_vtt_gcs = lang_output.get("ad_vtt_gcs")
            if not ad_vtt_gcs:
                raise ValueError(f"No AD VTT found for language {language}")

            ad_vtt_content = gcs_service.bucket.blob(
                to_object_path(ad_vtt_gcs)
            ).download_as_text()

            ad_cues_prefix = lang_output.get("ad_cues_gcs_prefix")
            if not ad_cues_prefix:
                raise ValueError(f"No AD cue segments found for language {language}")

            ad_segments = []
            cue_durations = []

            listed_blobs = list(
                gcs_service.bucket.list_blobs(prefix=to_object_path(ad_cues_prefix))
            )

            # The cue index is the number after the last "_" in "<...>_<N>.mp3"
            indexed_blobs = []
            for candidate in listed_blobs:
                if not candidate.name.endswith(".mp3"):
                    continue
                index_text = candidate.name.split("_")[-1].replace(".mp3", "")
                indexed_blobs.append((int(index_text), candidate))
            indexed_blobs.sort(key=lambda pair: pair[0])

            for cue_index, cue_blob in indexed_blobs:
                cue_path = os.path.join(work_dir, f"cue_{cue_index}.mp3")
                cue_blob.download_to_filename(cue_path)
                ad_segments.append((cue_index, cue_path))

                # pydub reports length in milliseconds
                clip = AudioSegment.from_mp3(cue_path)
                cue_durations.append(len(clip) / 1000.0)

            logger.info(f"Downloaded {len(ad_segments)} AD cue segments")

            # 4. Rebuild placements from the stored pause points
            method = lang_output.get("accessible_video_method", "pause_insert")
            is_pause_insert = method == "pause_insert"
            stored_pause_points = edit_state.get("pause_points", [])

            placements = _build_placements_with_adjustments(
                ad_vtt_content, cue_durations, stored_pause_points
            )
            logger.info(f"Built {len(placements)} placements with adjusted pause points")

            analysis = {
                "method": method,
                "method_rationale": "QC re-render with user adjustments",
                "placements": placements,
                "total_added_duration": sum(cue_durations) if is_pause_insert else 0,
                "warnings": [],
            }

            # 5. Optional Whisper refinement (pause-insert only)
            if whisper_refine and is_pause_insert:
                logger.info("Running Whisper pause point refinement...")
                analysis, whisper_warnings = await _refine_pause_points_for_rerender(
                    job_id, source_video_path, analysis, db, work_dir
                )
                if whisper_warnings:
                    analysis["warnings"] = analysis.get("warnings", []) + whisper_warnings
                logger.info(f"Whisper refinement complete with {len(whisper_warnings)} warnings")

            # 6. Render, persisting segments again so later edits stay possible
            output_video_path = os.path.join(work_dir, "accessible_video.mp4")
            gcs_segment_prefix = f"{job_id}/{language}/segments/"

            logger.info(f"Re-rendering accessible video using {method} method...")
            render_outcome = await video_renderer_service.render_accessible_video(
                source_video_path,
                ad_segments,
                analysis,
                output_video_path,
                persist_segments=True,
                gcs_segment_prefix=gcs_segment_prefix
            )
            _rendered_path, updated_placements, segment_metadata, new_pause_points = render_outcome

            if updated_placements:
                analysis["placements"] = updated_placements

            # 7. Upload the rendered video
            video_blob_path = f"{job_id}/{language}/accessible_video.mp4"
            video_blob = gcs_service.bucket.blob(video_blob_path)
            video_blob.content_type = "video/mp4"
            video_blob.upload_from_filename(output_video_path)

            video_gcs_uri = to_gcs_uri(video_blob_path)
            logger.info(f"Uploaded re-rendered accessible video to {video_gcs_uri}")

            # 8. Re-time captions, which only applies to pause-insert
            retimed_captions_gcs_uri = None
            captions_vtt_gcs = lang_output.get("captions_vtt_gcs") if is_pause_insert else None
            if captions_vtt_gcs:
                original_captions_vtt = gcs_service.bucket.blob(
                    to_object_path(captions_vtt_gcs)
                ).download_as_text()

                retimed_captions = vtt_retimer_service.retime_for_pause_insert(
                    original_captions_vtt, analysis
                )

                retimed_blob_path = f"{job_id}/{language}/accessible_captions.vtt"
                retimed_blob = gcs_service.bucket.blob(retimed_blob_path)
                retimed_blob.content_type = "text/vtt"
                retimed_blob.upload_from_string(retimed_captions, content_type="text/vtt")

                retimed_captions_gcs_uri = to_gcs_uri(retimed_blob_path)
                logger.info(f"Uploaded re-timed captions to {retimed_captions_gcs_uri}")

            # 9. Fresh edit state, only when the renderer returned both pieces
            new_edit_state = None
            if segment_metadata and new_pause_points:
                new_edit_state = AccessibleVideoEditState(
                    pause_points=new_pause_points,
                    video_segments=segment_metadata,
                    tts_regeneration_queue=[],
                    last_render_at=datetime.utcnow(),
                    whisper_refine_enabled=whisper_refine
                )

            # 10. Persist results and return the job to QC
            update_fields = {
                f"{output_key}.accessible_video_gcs": video_gcs_uri,
                f"{output_key}.video_segments_gcs_prefix": to_gcs_uri(gcs_segment_prefix),
                "status": JobStatus.PENDING_QC.value,
                "updated_at": datetime.utcnow(),
            }
            if retimed_captions_gcs_uri:
                update_fields[f"{output_key}.retimed_captions_vtt_gcs"] = retimed_captions_gcs_uri
            if new_edit_state:
                update_fields[f"{output_key}.accessible_video_edit_state"] = new_edit_state.model_dump()

            history_entry = {
                "at": datetime.utcnow(),
                "status": JobStatus.PENDING_QC.value,
                "by": "system",
                "notes": f"Re-render complete for {language}",
            }
            await db.jobs.update_one(
                {"_id": job_id},
                {"$set": update_fields, "$push": {"review.history": history_entry}},
            )

            # Tell listeners the job is back in QC
            broadcast_status_update(
                job_id,
                JobStatus.PENDING_QC.value,
                job_title=job_title,
                message=f"Accessible video re-render complete for {language.upper()}"
            )

            logger.info(f"Accessible video re-render complete for job {job_id}/{language}")

    finally:
        client.close()
|
|
|
|
|
|
async def _regenerate_tts_cues(
    job_id: str,
    language: str,
    cue_indices: list[int],
    job_doc: dict,
    db,
    temp_dir: str
):
    """
    Regenerate TTS for specific cues using current VTT text.

    Downloads the language's AD VTT, parses its cues, and for every requested
    index dispatches ``synthesize_cue_task`` on the "tts" queue, waiting for
    each one to finish before starting the next.

    Args:
        job_id: Job ID
        language: Language whose cues are re-synthesized
        cue_indices: Zero-based indices into the parsed AD cues; indices
            outside ``0 .. len(cues) - 1`` are logged and skipped
        job_doc: Job document (reads ``outputs[language]["ad_vtt_gcs"]`` and
            ``requested_outputs["tts_preferences"]``)
        db: Unused here; accepted for interface compatibility
        temp_dir: Unused here; accepted for interface compatibility

    Raises:
        ValueError: If the job has no outputs or no AD VTT for the language.
        RuntimeError: If a synthesis task reports failure or returns nothing.
    """
    logger.info(f"Regenerating TTS for {len(cue_indices)} cues")

    # Get AD VTT content. Validate explicitly: the caller only checks
    # ad_vtt_gcs after this helper has run, so a missing value would
    # otherwise surface as an opaque AttributeError on None.
    lang_output = job_doc.get("outputs", {}).get(language)
    if not lang_output:
        raise ValueError(f"No outputs found for language {language}")

    ad_vtt_gcs = lang_output.get("ad_vtt_gcs")
    if not ad_vtt_gcs:
        raise ValueError(f"No AD VTT found for language {language}")

    ad_blob_path = ad_vtt_gcs.replace(f"gs://{settings.gcs_bucket}/", "")
    ad_blob = gcs_service.bucket.blob(ad_blob_path)
    ad_vtt_content = ad_blob.download_as_text()

    # Parse cues
    cues = parse_ad_cues(ad_vtt_content)

    # Get TTS preferences ("or {}" tolerates keys stored explicitly as None)
    tts_preferences = job_doc["requested_outputs"].get("tts_preferences") or {}
    voices_per_language = tts_preferences.get("voices_per_language") or {}
    voice_name = voices_per_language.get(language, tts_preferences.get("default_voice"))
    provider = tts_preferences.get("provider", "gemini")
    model = tts_preferences.get("model", "flash")
    speed = tts_preferences.get("speed", 1.0)
    style_preset = tts_preferences.get("style_preset", "neutral")
    custom_style_prompt = tts_preferences.get("custom_style_prompt")

    if style_preset == "custom" and custom_style_prompt:
        style_prompt = custom_style_prompt
    else:
        style_prompt = settings.gemini_tts_style_prompts.get(style_preset, "")

    # Synthesize each cue
    for cue_idx in cue_indices:
        # Reject negatives too: Python would otherwise index from the end of
        # the list and synthesize the wrong cue under a negative cue_index.
        if not 0 <= cue_idx < len(cues):
            logger.warning(f"Cue index {cue_idx} out of range, skipping")
            continue

        cue = cues[cue_idx]

        logger.info(f"Synthesizing TTS for cue {cue_idx}: '{cue['text'][:50]}...'")

        # Dispatch synthesis task
        task_result = synthesize_cue_task.apply_async(
            kwargs={
                "job_id": job_id,
                "language": language,
                "cue_index": cue_idx,
                "text": cue["text"],
                "start_time": cue["start_time"],
                "end_time": cue["end_time"],
                "voice_name": voice_name,
                "provider": provider,
                "model": model,
                "speed": speed,
                "style_prompt": style_prompt
            },
            queue="tts"
        )

        # Wait for completion, yielding to the event loop between polls
        poll_count = 0
        while not task_result.ready():
            await asyncio.sleep(1.0)
            poll_count += 1
            if poll_count % 30 == 0:
                logger.info(f"Still waiting for TTS cue {cue_idx}...")

        # Fetching a result from inside a task must be explicitly allowed
        with allow_join_result():
            result = task_result.get(timeout=120)

        if not (result and result.get("success")):
            error_detail = result.get("error_message") if result else "no result returned"
            raise RuntimeError(f"TTS synthesis failed for cue {cue_idx}: {error_detail}")

        logger.info(f"TTS synthesis complete for cue {cue_idx}")

    logger.info(f"All {len(cue_indices)} TTS cues regenerated")
|
|
|
|
|
|
def _build_placements_with_adjustments(
    ad_vtt_content: str,
    cue_durations: list[float],
    pause_points: list[dict]
) -> list[dict]:
    """
    Build placement instructions using adjusted pause points from QC edits.

    One placement is produced per AD cue, limited to the cues that have a
    matching entry in ``cue_durations`` (matched by position).

    Args:
        ad_vtt_content: AD VTT content
        cue_durations: TTS durations per cue, in the same order as the cues
        pause_points: Pause point dicts with ``cue_index``, ``original_ms``
            and optional ``adjusted_ms``; ``None`` is treated as empty.
            Entries without a cue index or without any millisecond value
            are ignored.

    Returns:
        List of placement dicts
    """
    cues = VTTParser.parse(ad_vtt_content)

    # cue index -> (pause time in seconds, whether the user adjusted it)
    pause_by_cue = {}
    for pp in pause_points or []:
        cue_idx = pp.get("cue_index")
        adjusted = pp.get("adjusted_ms")
        original = pp.get("original_ms")

        # Use adjusted if set, otherwise original
        pause_ms = adjusted if adjusted is not None else original
        if cue_idx is None or pause_ms is None:
            # Nothing usable stored; the cue falls back to its VTT start time
            continue

        pause_by_cue[cue_idx] = (pause_ms / 1000.0, adjusted is not None)

    placements = []
    # zip stops at the shorter input, so cues without a duration are dropped
    for i, (cue, ad_duration) in enumerate(zip(cues, cue_durations)):
        if i in pause_by_cue:
            pause_point, user_adjusted = pause_by_cue[i]
            # Only claim a user adjustment when adjusted_ms was actually set
            if user_adjusted:
                rationale = "User-adjusted during QC"
            else:
                rationale = "Stored pause point (no QC adjustment)"
        else:
            pause_point = cue.start_time
            rationale = "Original from VTT"

        placements.append({
            "ad_cue_index": i,
            "original_start_time": cue.start_time,
            "original_end_time": cue.end_time,
            "target_start_time": cue.start_time,
            "ad_duration": ad_duration,
            "pause_point": pause_point,
            "resume_from": pause_point,
            "pause_point_rationale": rationale
        })

    return placements
|
|
|
|
|
|
async def _refine_pause_points_for_rerender(
    job_id: str,
    video_path: str,
    analysis: dict,
    db,
    temp_dir: str
) -> tuple[dict, list[str]]:
    """
    Run Whisper pause point refinement for re-render.

    Extracts the video's audio, transcribes it, and asks the Whisper service
    to refine every placement in ``analysis``. If transcription fails or
    yields no words, the input analysis is returned unchanged together with
    a single explanatory warning.

    Args:
        job_id: Job ID
        video_path: Local path of the source video
        analysis: Current analysis dict (its "placements" are refined)
        db: Not used by this function
        temp_dir: Directory in which the extracted audio file is written

    Returns:
        Tuple of (analysis, warnings). On success the analysis is a shallow
        copy with new "placements" and ``"whisper_refined": True``.
    """
    logger.info(f"Refining pause points with Whisper for re-render of job {job_id}")

    extracted_audio = os.path.join(temp_dir, "source_audio.mp3")
    await _extract_audio_for_whisper(video_path, extracted_audio)

    try:
        transcript_words = await _dispatch_whisper_transcription(job_id, extracted_audio)
    except Exception as exc:
        # Transcription is best-effort: keep the timestamps we already have
        logger.error(f"Whisper transcription failed: {exc}")
        return analysis, [f"Whisper failed: {str(exc)} - using current timestamps"]

    if not transcript_words:
        return analysis, ["No speech detected - using current timestamps"]

    speech_gaps = whisper_service.identify_speech_gaps(transcript_words)
    current_placements = analysis.get("placements", [])

    refined_placements, refine_warnings = whisper_service.refine_all_pause_points(
        current_placements,
        transcript_words,
        speech_gaps
    )

    # Shallow copy so the caller's dict is left untouched
    refined_analysis = {
        **analysis,
        "placements": refined_placements,
        "whisper_refined": True,
    }
    return refined_analysis, refine_warnings
|