forge/backend/app/services/video_generator.py
DJP 7a804e896d Initial commit - FORGE AI unified platform
Features:
- Image generation (OpenAI, Gemini, Leonardo, Bria, Stability, Flux)
- Nano Banana iterative editing
- Video generation and upscaling
- Audio TTS, STT, sound effects (ElevenLabs)
- Text prompt studio and alt text
- User authentication with JWT/cookies
- Admin panel with voice management
- Job queue with Celery
- PostgreSQL + Redis backend
- Next.js 15 + FastAPI architecture

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 (1M context) <noreply@anthropic.com>
2025-12-09 20:39:00 -05:00

613 lines
21 KiB
Python

"""Video Generator Service - Runway and Google Veo
Runway Models:
- gen3_alpha: High quality, supports Motion Brush, Camera Control
- gen3_alpha_turbo: 7x faster, half cost, good for most use cases
- gen4: Latest model with highest fidelity
Runway Features:
- text_to_video: Generate from text prompt
- image_to_video: Generate from starting image
- camera_control: Pan, tilt, zoom, roll with intensity (-10 to 10)
- motion_brush: Define motion areas with direction
- first_frame/last_frame: Control start and end frames
Google Veo Models (December 2025):
- veo-3.1-generate-preview: Latest with native audio, 720p/1080p, reference images
- veo-3.1-fast-generate-preview: Speed-optimized variant with audio
- veo-3.0-generate-001: Stable Veo 3 with audio
- veo-3.0-fast-generate-001: Fast Veo 3 variant
- veo-2.0-generate-001: Legacy, supports 2 outputs per request
Veo 3/3.1 Features:
- Native audio generation with soundtrack, dialogue, ambient sounds
- first_frame: Starting image for video (image-to-video)
- last_frame: Ending image for video (creates frame interpolation)
- reference_images: Up to 3 images for character/style/asset consistency
- video_extension: Extend existing videos up to 20 times
- negative_prompt: Describe unwanted elements
- aspect_ratio: 16:9, 9:16
- resolution: 720p, 1080p (Veo 3.1 only)
- duration: 4, 6, or 8 seconds
- person_generation: Control adult face generation
Audio Prompt Techniques (Veo 3+):
- Dialogue: Use quotation marks ("She whispered, 'Hello'")
- Sound Effects: Explicit descriptions (tires screeching loudly)
- Ambient Noise: Environmental details (eerie hum in background)
"""
import httpx
import os
import base64
from uuid import uuid4
from datetime import datetime
import asyncio
from typing import Optional, Dict, Any, List, Tuple
from app.database import SessionLocal
from app.models.job import Job
from app.models.asset import Asset
from app.config import settings
# Catalogue of Runway models, keyed by the model id sent to the API.
# Each entry carries a display name, a short description, capability flags,
# the maximum clip length in seconds and the resolutions offered for it.
RUNWAY_MODELS = {
    "gen3_alpha": dict(
        name="Gen-3 Alpha",
        description="High quality with full feature support",
        supports_camera_control=True,
        supports_motion_brush=True,
        max_duration=10,
        resolutions=["1280x768", "768x1280"],
    ),
    "gen3_alpha_turbo": dict(
        name="Gen-3 Alpha Turbo",
        description="7x faster, half the cost",
        supports_camera_control=True,
        supports_motion_brush=False,
        max_duration=10,
        resolutions=["1280x768", "768x1280"],
    ),
    "gen4": dict(
        name="Gen-4",
        description="Latest model with highest fidelity",
        supports_camera_control=True,
        supports_motion_brush=True,
        max_duration=10,
        resolutions=["1280x768", "768x1280", "1920x1080"],
    ),
}
# Catalogue of Google Veo models (December 2025), keyed by the model id.
# Each entry carries a display name, a description, capability flags, the
# resolutions and clip durations (seconds) accepted for the model, and the
# maximum number of reference images it takes.
VEO_MODELS = {
    "veo-3.1-generate-preview": dict(
        name="Veo 3.1",
        description="Latest with native audio, 720p/1080p, reference images",
        supports_audio=True,
        supports_first_last_frame=True,
        supports_reference_images=True,
        supports_extension=True,
        resolutions=["720p", "1080p"],
        durations=[4, 6, 8],
        max_references=3,
    ),
    "veo-3.1-fast-generate-preview": dict(
        name="Veo 3.1 Fast",
        description="Speed-optimized with audio ($0.40/sec)",
        supports_audio=True,
        supports_first_last_frame=True,
        supports_reference_images=True,
        supports_extension=True,
        resolutions=["720p", "1080p"],
        durations=[4, 6, 8],
        max_references=3,
    ),
    "veo-3.0-generate-001": dict(
        name="Veo 3",
        description="Stable Veo 3 with native audio",
        supports_audio=True,
        supports_first_last_frame=True,
        supports_reference_images=False,
        supports_extension=False,
        resolutions=["720p", "1080p"],
        durations=[4, 6, 8],
        max_references=0,
    ),
    "veo-3.0-fast-generate-001": dict(
        name="Veo 3 Fast",
        description="Fast Veo 3 variant with audio",
        supports_audio=True,
        supports_first_last_frame=True,
        supports_reference_images=False,
        supports_extension=False,
        resolutions=["720p"],
        durations=[4, 6, 8],
        max_references=0,
    ),
    "veo-2.0-generate-001": dict(
        name="Veo 2",
        description="Legacy model, supports 2 outputs per request",
        supports_audio=False,
        supports_first_last_frame=True,
        supports_reference_images=False,
        supports_extension=False,
        resolutions=["720p"],
        durations=[5, 6, 8],
        max_references=0,
    ),
}
async def generate(job_id: str):
    """Generate a video for a queued job using Runway or Veo.

    Looks up the job, dispatches to the provider-specific generator, writes
    the returned bytes under ``<settings.storage_path>/videos``, records an
    ``Asset`` row for the file and marks the job "completed". If no job with
    ``job_id`` exists the function returns without doing anything.

    Any exception raised after the job has been loaded is recorded in
    ``job.error_message`` and the job is marked "failed"; this includes the
    case where the provider returns no video data (for example because
    polling ran out of attempts). An exception raised before the job could be
    loaded is re-raised to the caller.

    Input parameters (read from ``job.input_data``):
    - provider: 'runway' (default) or 'veo'
    - prompt: Text description
    - model: Specific model to use
    - duration: Video length in seconds
    - aspect_ratio: '16:9', '9:16', '1:1'
    Runway-specific:
    - camera_control: {pan, tilt, zoom, roll} with values -10 to 10
    - motion_brush: [{area_mask, direction, intensity}]
    - frame_position: 'first' or 'last' for input image
    Veo-specific:
    - first_frame_asset_id: Asset ID for starting frame
    - last_frame_asset_id: Asset ID for ending frame
    - reference_asset_ids: List of asset IDs for reference (max 3)
    """
    db = SessionLocal()
    # Stays None until the lookup succeeds so the error handler can tell
    # whether there is a job row to record the failure on.
    job = None
    try:
        job = db.query(Job).filter(Job.id == job_id).first()
        if not job:
            return

        input_data = job.input_data
        provider = input_data.get("provider", "runway")
        prompt = input_data.get("prompt", "")

        job.progress = 10
        job.api_provider = provider
        db.commit()

        if provider == "runway":
            video_data, filename = await _generate_runway(job, input_data, db)
        elif provider == "veo":
            video_data, filename = await _generate_veo(job, input_data, db)
        else:
            raise ValueError(f"Unknown video provider: {provider}")

        # The provider helpers return (None, None) when polling ends without
        # a video. That is a failure, not a completed job with no output.
        if not video_data:
            raise ValueError(
                f"Video generation with provider '{provider}' returned no video "
                "(timed out or produced no output)"
            )

        # Save video
        storage_path = os.path.join(settings.storage_path, "videos")
        os.makedirs(storage_path, exist_ok=True)
        file_path = os.path.join(storage_path, filename)
        with open(file_path, "wb") as f:
            f.write(video_data)

        # Create asset
        asset = Asset(
            user_id=job.user_id,
            project_id=job.project_id,
            original_filename=filename,
            stored_filename=filename,
            file_path=file_path,
            file_type="video",
            mime_type="video/mp4",
            file_size_bytes=len(video_data),
            # Requested duration, not a value measured from the file.
            duration_seconds=input_data.get("duration", 5),
            source_module="video_generator",
            source_job_id=job.id,
            asset_metadata={
                "prompt": prompt,
                "provider": provider,
                "model": job.api_model
            }
        )
        db.add(asset)
        db.commit()
        db.refresh(asset)

        job.output_asset_ids = [asset.id]
        job.output_data = {"asset_id": str(asset.id), "file_path": file_path}
        job.progress = 100
        job.status = "completed"
        job.completed_at = datetime.utcnow()
        db.commit()
    except Exception as e:
        # Discard any half-finished transaction so the session can be used
        # again to record the failure.
        db.rollback()
        if job is None:
            # Nothing to record the failure on; surface the real error.
            raise
        job.status = "failed"
        job.error_message = str(e)
        db.commit()
    finally:
        db.close()
async def _generate_runway(job, input_data: dict, db) -> Tuple[Optional[bytes], Optional[str]]:
    """Generate a video with Runway and return ``(video_bytes, filename)``.

    Creates a Runway task (image-to-video when the job has an input asset
    whose file exists, otherwise text-to-video), then polls the task every
    2 seconds for up to 180 attempts (about 6 minutes) and downloads the
    first output URL once the task reports ``SUCCEEDED``.

    Keys read from ``input_data``:
    - prompt: Text prompt (default "")
    - model: Runway model id (default "gen3_alpha_turbo")
    - duration: Seconds, capped at 10 (default 5)
    - resolution: "WxH" string, sent as the "W:H" ratio (default "1280x768")
    - frame_position: Only sent (as ``imagePosition``) for gen3_alpha_turbo
      in image-to-video mode
    - camera_control: {pan, tilt, zoom, roll, static}

    Side effects: sets ``job.api_model``, ``job.api_request_id`` and
    ``job.progress`` and commits them through ``db``.

    Returns:
        ``(bytes, filename)`` on success, or ``(None, None)`` when polling
        ends without a downloadable output.

    Raises:
        ValueError: if Runway returns no task id or reports ``FAILED``.
        httpx.HTTPStatusError: if task creation or the video download
            returns a non-success HTTP status.
    """
    prompt = input_data.get("prompt", "")
    model = input_data.get("model", "gen3_alpha_turbo")
    duration = min(input_data.get("duration", 5), 10)
    resolution = input_data.get("resolution", "1280x768")
    frame_position = input_data.get("frame_position", "first")  # first, middle, last

    # Camera control settings; tolerate an explicit null in the job input.
    camera_control = input_data.get("camera_control") or {}
    pan = camera_control.get("pan", 0)  # -10 to 10, horizontal
    tilt = camera_control.get("tilt", 0)  # -10 to 10, vertical
    zoom = camera_control.get("zoom", 0)  # -10 to 10
    roll = camera_control.get("roll", 0)  # -10 to 10, rotation
    static = camera_control.get("static", False)  # Reduce camera motion

    job.api_model = model
    db.commit()

    # Get input image if provided
    image_data = None
    image_mime = "image/png"
    if job.input_asset_ids:
        input_asset = db.query(Asset).filter(Asset.id == job.input_asset_ids[0]).first()
        if input_asset and os.path.exists(input_asset.file_path):
            with open(input_asset.file_path, "rb") as f:
                image_data = base64.b64encode(f.read()).decode()
            # Label the data URI with the asset's own MIME type when known,
            # falling back to PNG as before.
            image_mime = input_asset.mime_type or "image/png"

    auth_headers = {
        "Authorization": f"Bearer {settings.runway_api_key}",
        "X-Runway-Version": "2024-11-06"
    }

    async with httpx.AsyncClient(timeout=600) as client:
        # Fields shared by both request kinds
        payload = {
            "model": model,
            "promptText": prompt,
            "duration": duration,
            "ratio": resolution.replace("x", ":")
        }
        if image_data:
            # Image to video
            payload["promptImage"] = f"data:{image_mime};base64,{image_data}"
            # Frame position is only sent for Gen-3 Alpha Turbo
            if model == "gen3_alpha_turbo":
                payload["imagePosition"] = frame_position
            endpoint = "https://api.runwayml.com/v1/image_to_video"
        else:
            # Text to video
            endpoint = "https://api.runwayml.com/v1/text_to_video"

        # Add camera control if any values are set; "static" wins over moves
        if any([pan, tilt, zoom, roll]) and not static:
            payload["cameraControl"] = {
                "pan": pan,
                "tilt": tilt,
                "zoom": zoom,
                "roll": roll
            }
        elif static:
            payload["cameraControl"] = {"static": True}

        # Create generation task
        response = await client.post(
            endpoint,
            headers={**auth_headers, "Content-Type": "application/json"},
            json=payload
        )
        response.raise_for_status()
        result = response.json()
        task_id = result.get("id")
        if not task_id:
            # Without an id the poll loop would query ".../tasks/None".
            raise ValueError("Runway did not return a task id")

        job.progress = 30
        job.api_request_id = task_id
        db.commit()

        # Poll for completion
        for i in range(180):  # Wait up to 6 minutes
            await asyncio.sleep(2)
            status_response = await client.get(
                f"https://api.runwayml.com/v1/tasks/{task_id}",
                headers=auth_headers
            )
            status_data = status_response.json()
            status = status_data.get("status", "")

            if status == "SUCCEEDED":
                # "output" may be missing, null or an empty list.
                outputs = status_data.get("output") or []
                output_url = outputs[0] if outputs else None
                if output_url:
                    video_response = await client.get(output_url)
                    # Do not store an HTTP error body as if it were an MP4.
                    video_response.raise_for_status()
                    filename = f"runway_{model}_{uuid4()}.mp4"
                    return video_response.content, filename
                break
            elif status == "FAILED":
                raise ValueError(f"Runway generation failed: {status_data.get('error')}")

            # Whole-number progress, matching the Veo path.
            job.progress = int(min(30 + (i * 0.35), 90))
            db.commit()

    return None, None
async def _generate_veo(job, input_data: dict, db) -> Tuple[Optional[bytes], Optional[str]]:
    """Generate a video with Google Veo and return ``(video_bytes, filename)``.

    Starts a ``generate_videos`` operation through the ``google-genai``
    client, polls it every 5 seconds for up to 72 attempts (about 6 minutes)
    and downloads the first generated video once the operation is done.

    Request mode is chosen in this order:
    - Video extension, when ``extend_video_asset_id`` resolves to a file and
      the model's ``VEO_MODELS`` entry sets ``supports_extension``
    - Image-to-video, when ``first_frame_asset_id`` resolves to a file
    - Text-to-video otherwise

    Keys read from ``input_data``:
    - prompt, model (default "veo-3.1-generate-preview")
    - duration: replaced by the model's longest duration if not allowed
    - resolution: replaced by the model's first resolution if not allowed;
      resolution and duration are only sent for "3.0"/"3.1" model ids
    - aspect_ratio (default "16:9"), negative_prompt, person_generation
    - first_frame_asset_id, last_frame_asset_id
    - reference_asset_ids: capped at the model's ``max_references`` and only
      used when the model sets ``supports_reference_images``
    - extend_video_asset_id

    Audio Prompting:
    - Use quotation marks for dialogue: "She said, 'Hello'"
    - Describe sound effects: "tires screeching loudly"
    - Add ambient sounds: "quiet forest with birds chirping"

    Side effects: sets ``job.api_model`` and ``job.progress`` and commits
    them through ``db``.

    Returns:
        ``(bytes, filename)`` on success, or ``(None, None)`` when the
        operation finishes (or polling ends) without a video and without a
        reported error.

    Raises:
        ValueError: if the ``google-genai`` package cannot be imported, if
            any step of the request fails, or if the operation reports an
            error.
    """
    prompt = input_data.get("prompt", "")
    model = input_data.get("model", "veo-3.1-generate-preview")
    duration = input_data.get("duration", 8)
    aspect_ratio = input_data.get("aspect_ratio", "16:9")
    resolution = input_data.get("resolution", "720p")
    negative_prompt = input_data.get("negative_prompt", "")
    person_generation = input_data.get("person_generation")  # "allow_adult" or None

    # Unknown model ids fall back to the Veo 3.1 capability table.
    model_config = VEO_MODELS.get(model, VEO_MODELS["veo-3.1-generate-preview"])

    # Frame control
    first_frame_asset_id = input_data.get("first_frame_asset_id")
    last_frame_asset_id = input_data.get("last_frame_asset_id")
    # Tolerate an explicit null and cap at what the model accepts.
    max_references = model_config.get("max_references", 3)
    reference_asset_ids = (input_data.get("reference_asset_ids") or [])[:max_references]

    # Video extension
    extend_video_asset_id = input_data.get("extend_video_asset_id")

    # Validate duration
    valid_durations = model_config.get("durations", [4, 6, 8])
    if duration not in valid_durations:
        duration = max(valid_durations)

    # Validate resolution
    valid_resolutions = model_config.get("resolutions", ["720p"])
    if resolution not in valid_resolutions:
        resolution = valid_resolutions[0]

    job.api_model = model
    db.commit()

    # Only the SDK import itself should be reported as "not installed".
    try:
        from google import genai
        from google.genai import types
    except ImportError as exc:
        raise ValueError(
            "Google GenAI library not installed. Run: pip install google-genai"
        ) from exc

    operation_error = None
    try:
        # Initialize client
        client = genai.Client(api_key=settings.google_api_key)
        job.progress = 20
        db.commit()

        # Build generation config
        config_kwargs = {
            "aspect_ratio": aspect_ratio,
        }
        # Add negative prompt if provided
        if negative_prompt:
            config_kwargs["negative_prompt"] = negative_prompt
        # Add person generation setting if specified
        if person_generation:
            config_kwargs["person_generation"] = person_generation
        # Resolution and duration are only sent for Veo 3.0 / 3.1 model ids
        if "3.1" in model or "3.0" in model:
            config_kwargs["resolution"] = resolution
            config_kwargs["duration_seconds"] = str(duration)

        # Prepare first frame image
        first_frame_image = None
        if first_frame_asset_id:
            loaded = _read_veo_asset(db, first_frame_asset_id, "image/png")
            if loaded:
                first_frame_image = types.Image.from_bytes(
                    data=loaded[0], mime_type=loaded[1]
                )

        # Prepare last frame for interpolation
        if last_frame_asset_id:
            loaded = _read_veo_asset(db, last_frame_asset_id, "image/png")
            if loaded:
                config_kwargs["last_frame"] = types.Image.from_bytes(
                    data=loaded[0], mime_type=loaded[1]
                )

        # Reference images for character/style consistency
        if reference_asset_ids and model_config.get("supports_reference_images"):
            reference_images = []
            for ref_id in reference_asset_ids:
                loaded = _read_veo_asset(db, ref_id, "image/png")
                if loaded:
                    reference_images.append(
                        types.VideoGenerationReferenceImage(
                            image=types.Image.from_bytes(
                                data=loaded[0], mime_type=loaded[1]
                            ),
                            reference_type="asset"  # or "style" for style reference
                        )
                    )
            if reference_images:
                config_kwargs["reference_images"] = reference_images

        # Video extension source. Named source_video so it does not shadow
        # the module-level extend_video() function.
        source_video = None
        if extend_video_asset_id and model_config.get("supports_extension"):
            loaded = _read_veo_asset(db, extend_video_asset_id, "video/mp4")
            if loaded:
                source_video = types.Video.from_bytes(
                    data=loaded[0], mime_type=loaded[1]
                )

        config = types.GenerateVideosConfig(**config_kwargs)
        job.progress = 40
        db.commit()

        # Start the long-running operation; the SDK call is blocking, so it
        # runs in a worker thread.
        request_kwargs = {"model": model, "prompt": prompt, "config": config}
        if source_video:
            # Video extension mode
            request_kwargs["video"] = source_video
        elif first_frame_image:
            # Image-to-video mode
            request_kwargs["image"] = first_frame_image
        operation = await asyncio.to_thread(
            client.models.generate_videos, **request_kwargs
        )

        # Poll for completion (can take 11 seconds to 6 minutes)
        job.progress = 50
        db.commit()
        max_attempts = 72  # 6 minutes with 5 second intervals
        for attempt in range(max_attempts):
            await asyncio.sleep(5)
            # Check operation status
            operation = await asyncio.to_thread(client.operations.get, operation)
            if operation.done:
                break
            # Update progress
            job.progress = int(min(50 + (attempt * 0.5), 90))
            db.commit()

        job.progress = 90
        db.commit()

        # Extract video from response
        if operation.done and operation.response:
            generated_videos = operation.response.generated_videos
            if generated_videos:
                video = generated_videos[0]
                # Download the video file
                video_data = await asyncio.to_thread(
                    client.files.download,
                    file=video.video
                )
                filename = f"veo_{model.replace('.', '_').replace('-', '_')}_{uuid4()}.mp4"
                return video_data, filename

        # Remember a reported error and raise it after the try block, so it
        # is not re-wrapped by the generic handler below.
        operation_error = operation.error
    except Exception as e:
        raise ValueError(f"Veo generation error: {str(e)}") from e

    if operation_error:
        raise ValueError(f"Veo generation failed: {operation_error}")
    return None, None


def _read_veo_asset(db, asset_id, default_mime: str) -> Optional[Tuple[bytes, str]]:
    """Return ``(file_bytes, mime_type)`` for an asset, or None if the asset
    row or its file on disk is missing. ``default_mime`` is used when the
    asset has no MIME type recorded."""
    asset = db.query(Asset).filter(Asset.id == asset_id).first()
    if not asset or not os.path.exists(asset.file_path):
        return None
    with open(asset.file_path, "rb") as f:
        return f.read(), asset.mime_type or default_mime
async def extend_video(job_id: str):
    """Extend an existing video using Veo scene extension.

    NOTE: this is currently a placeholder. It validates that the job names a
    source asset that exists, tags the job with the Veo provider and model,
    and then marks it "completed" without calling any API or producing an
    output asset.

    Keys read from ``job.input_data``:
    - source_asset_id: Asset ID of the video to extend (required)
    - prompt: Text description (read but not used yet)
    - extension_seconds: Seconds to add, capped at 8 (read but not used yet)

    If no job with ``job_id`` exists the function returns without doing
    anything. Any exception raised after the job has been loaded is recorded
    in ``job.error_message`` and the job is marked "failed"; an exception
    raised before the job could be loaded is re-raised to the caller.
    """
    db = SessionLocal()
    # Stays None until the lookup succeeds so the error handler can tell
    # whether there is a job row to record the failure on.
    job = None
    try:
        job = db.query(Job).filter(Job.id == job_id).first()
        if not job:
            return

        input_data = job.input_data
        source_asset_id = input_data.get("source_asset_id")
        # Read now so malformed input still fails the job; neither value is
        # consumed until the extension call is implemented.
        prompt = input_data.get("prompt", "")
        extension_seconds = min(input_data.get("extension_seconds", 4), 8)

        if not source_asset_id:
            raise ValueError("No source video provided for extension")
        source_asset = db.query(Asset).filter(Asset.id == source_asset_id).first()
        if not source_asset:
            raise ValueError("Source video not found")

        job.progress = 10
        job.api_provider = "veo"
        job.api_model = "veo-3.1-generate-preview"
        db.commit()

        # Implementation would use Veo's scene extension API
        # This extends video by building on the final seconds of the previous clip

        job.progress = 100
        job.status = "completed"
        job.completed_at = datetime.utcnow()
        db.commit()
    except Exception as e:
        # Discard any half-finished transaction so the session can be used
        # again to record the failure.
        db.rollback()
        if job is None:
            # Nothing to record the failure on; surface the real error.
            raise
        job.status = "failed"
        job.error_message = str(e)
        db.commit()
    finally:
        db.close()
def get_available_models() -> Dict[str, Any]:
    """Return the model catalogues for every video provider.

    The result maps the provider name ("runway", "veo") to that provider's
    module-level model table. The tables are returned as-is, not copied.
    """
    catalogue: Dict[str, Any] = {}
    catalogue["runway"] = RUNWAY_MODELS
    catalogue["veo"] = VEO_MODELS
    return catalogue