Update bitrate handling and small/large file size logic

This commit is contained in:
Manish Tanwar 2025-11-27 02:48:15 +05:30
parent 00ca5a4c58
commit ec9f7426e4
3 changed files with 409 additions and 133 deletions

View file

@ -268,16 +268,27 @@ EOF
- Uses `threading.Lock` for thread safety
- Prevents Gemini API rate limit errors (5 RPM free tier)
### Hybrid Upload Strategy (video_processor.py:163-203)
- **Files < 10MB**: Base64 inline data (instant)
- **Files >= 10MB**: File Upload API (slower but reliable)
- Threshold: `SIZE_THRESHOLD_MB = 10`
### Upload Strategy (video_processor.py:388-450)
- **Current approach**: Base64 inline encoding for ALL videos
- **Reason**: File Upload API has known issues in SDK 1.45.0-1.49.0
- **API limit**: 1000MB (1GB) for base64-encoded requests
- **Base64 overhead**: +37% size increase (1.37x multiplier)
- **Effective limit**: ~730MB raw video per chunk (≈1000MB once base64-encoded)
### Video Splitting (video_splitter.py)
- **Chunk duration**: 54 minutes
- **Automatic**: Videos > 54 minutes split automatically
### Video Splitting (video_splitter.py) - Robust Multi-Constraint Algorithm
- **Chunk duration limit**: 53 minutes (Gemini API ~55 min limit)
- **Chunk size limit**: ~560MB safe target (with 30% VBR margin)
- **Hard size limit**: ~730MB maximum (1000MB ÷ 1.37 encoding overhead)
- **Automatic splitting**: Based on BOTH duration AND size constraints
- **VBR handling**: 30% safety margin for Variable Bitrate variance
- **Validation & re-splitting**: Automatic re-split if chunks exceed hard limit
- **Algorithm**: Uses max(chunks_by_size, chunks_by_duration) for safety
- **Processing**: Chunks processed in parallel on backend
- **Maximum single chunk**: 55 minutes (Gemini limit)
**Example:**
- 30min/1.5GB video → Split into 3 chunks of ~10min/500MB each
- 50min/720MB video → Split into 2 chunks of ~25min/360MB each
- 5min/300MB video → No split (under both limits)
### Queue Management (App.js:63-336)
- **Queue states**: queued, processing, completed, failed, cancelled
@ -319,6 +330,10 @@ BATCH_PROCESSING_LOG_SUMMARIES=true
- `GOOGLE_API_KEY`: Your Gemini API key (required)
- `VIDEO_PROCESSOR_MODEL`: Model for individual video processing (default: gemini-2.5-pro)
- `VIDEO_SYNTHESIS_MODEL`: Model for batch synthesis (default: gemini-2.5-pro)
- `CHUNK_DURATION_MINUTES`: Max chunk duration (default: 53 minutes)
- `VBR_SAFETY_MARGIN`: Safety margin for variable bitrate videos (default: 1.30 = 30%)
- `BASE64_API_LIMIT_MB`: API limit for base64 requests (default: 1000MB)
- `MAX_PARALLEL_CHUNKS`: Concurrent chunk processing (default: 4)
- `BATCH_PROCESSING_LOG_PROMPTS`: Enable detailed prompt logging (default: false)
- `BATCH_PROCESSING_LOG_SUMMARIES`: Enable summary preview logging (default: false)
- **backend/run.py**: Hypercorn server config (body size limits, timeouts)

View file

@ -11,6 +11,19 @@ GOOGLE_API_KEY=your_gemini_api_key_here
# Gemini API limits: ~55 min with audio, ~60 min without audio
CHUNK_DURATION_MINUTES=53
# Variable Bitrate (VBR) Safety Margin (default: 1.30)
# Adds 30% safety margin to account for bitrate variance in videos
# Higher values = more conservative splitting, safer for VBR videos
# Lower values = fewer chunks, but risk of oversized chunks
# Recommended: 1.25-1.35 (25%-35% margin)
VBR_SAFETY_MARGIN=1.30
# Base64 API Limit in MB (default: 1000)
# Maximum size of base64-encoded video in API request
# Gemini API limit: 1000MB (1GB) for inline base64 requests
# Do not change unless API limits change
BASE64_API_LIMIT_MB=1000
# Parallel Processing Configuration
# Maximum number of video chunks to process concurrently (default: 4)
# Higher values = faster processing but more API load

View file

@ -3,13 +3,20 @@ Video Splitter Module
This module provides functionality to detect video duration and split long videos
into smaller chunks for processing with APIs that have duration limitations.
Key Features:
- Duration-based splitting (respects Gemini API 53-55 min limit)
- Size-based splitting (respects 1GB base64-encoded API limit)
- Variable Bitrate (VBR) handling with safety margins
- Automatic chunk validation and re-splitting for oversized chunks
"""
import ffmpeg
import os
import tempfile
import logging
from typing import List, Tuple, Optional
import math
from typing import List, Tuple, Optional, Dict, Any
from system_utils import system_utils
from error_reporter import ErrorReporter, ErrorCategory
@ -19,12 +26,42 @@ logger = logging.getLogger('video_query')
class VideoSplitter:
"""
Handles video duration detection and splitting operations.
Supports robust splitting with:
- Duration limits (Gemini API ~53-55 min)
- Size limits (1GB base64-encoded API request)
- VBR (Variable Bitrate) safety margins
- Automatic chunk validation and re-splitting
"""
# Duration constraint
# Default chunk duration in minutes (53 min to stay under 55 min Gemini API limit for videos with audio)
# Google Gemini 2.5 Pro limits: ~55 min with audio, ~60 min without audio
DEFAULT_CHUNK_DURATION = 53
# Size constraints (for base64 inline encoding approach)
# API limit for base64-encoded requests: 1GB (1000MB)
API_LIMIT_AFTER_ENCODING_MB = 1000
BASE64_OVERHEAD = 1.37 # Base64 encoding increases size by ~37%
# VBR (Variable Bitrate) safety margin
# Videos with VBR can have sections with significantly higher bitrate than average
# 30% margin ensures chunks stay within limits even with bitrate spikes
VBR_SAFETY_MARGIN = 1.30
# Calculate safe target chunk size
# Formula: API_LIMIT / BASE64_OVERHEAD / VBR_MARGIN
# 1000MB / 1.37 / 1.30 ≈ 560MB
SAFE_CHUNK_SIZE_MB = API_LIMIT_AFTER_ENCODING_MB / BASE64_OVERHEAD / VBR_SAFETY_MARGIN
# Hard limit (absolute maximum raw chunk size before encoding)
# Formula: API_LIMIT / BASE64_OVERHEAD
# 1000MB / 1.37 ≈ 730MB
HARD_LIMIT_MB = API_LIMIT_AFTER_ENCODING_MB / BASE64_OVERHEAD
# Minimum chunk duration to avoid creating too many tiny chunks
MIN_CHUNK_DURATION_MIN = 3
def __init__(self, chunk_duration_minutes: int = None):
"""
Initialize VideoSplitter with specified chunk duration.
@ -39,7 +76,19 @@ class VideoSplitter:
self.chunk_duration_minutes = chunk_duration_minutes
self.chunk_duration_seconds = chunk_duration_minutes * 60
logger.info(f"VideoSplitter initialized with chunk duration: {chunk_duration_minutes} minutes")
# Load configurable parameters from environment
self.vbr_safety_margin = float(os.getenv("VBR_SAFETY_MARGIN", self.VBR_SAFETY_MARGIN))
self.api_limit_mb = int(os.getenv("BASE64_API_LIMIT_MB", self.API_LIMIT_AFTER_ENCODING_MB))
# Recalculate safe chunk size if custom values provided
self.safe_chunk_size_mb = self.api_limit_mb / self.BASE64_OVERHEAD / self.vbr_safety_margin
self.hard_limit_mb = self.api_limit_mb / self.BASE64_OVERHEAD
logger.info(f"VideoSplitter initialized:")
logger.info(f" - Max chunk duration: {chunk_duration_minutes} minutes")
logger.info(f" - Safe chunk size: {self.safe_chunk_size_mb:.0f}MB (with {(self.vbr_safety_margin-1)*100:.0f}% VBR margin)")
logger.info(f" - Hard chunk limit: {self.hard_limit_mb:.0f}MB (API: {self.api_limit_mb}MB after encoding)")
def get_video_duration(self, video_path: str) -> Optional[float]:
"""
@ -99,23 +148,25 @@ class VideoSplitter:
logger.error(f"Error detecting video duration: {str(e)}")
return None
def needs_splitting(self, video_path: str, max_chunk_size_mb: float = 500) -> bool:
def needs_splitting(self, video_path: str, max_chunk_size_mb: float = None) -> bool:
"""
Check if a video needs to be split based on duration OR file size.
A video needs splitting if:
1. Duration > configured chunk duration (default 53 minutes), OR
2. File size > 500MB (conservative target to handle variable bitrate)
With 30% variance: 500MB × 1.3 = 650MB max
After base64 encoding: 650MB × 1.37 = 891MB (well under 1GB API limit)
Uses robust calculation considering:
- Duration limit (default 53 minutes)
- Size limit with VBR safety margin (default ~560MB)
- Base64 encoding overhead (1.37x)
Args:
video_path: Path to the video file
max_chunk_size_mb: Maximum chunk size in MB (default: 500MB)
max_chunk_size_mb: Maximum chunk size in MB (default: use calculated safe size)
Returns:
True if video needs splitting based on duration or size, False otherwise
"""
if max_chunk_size_mb is None:
max_chunk_size_mb = self.safe_chunk_size_mb
duration = self.get_video_duration(video_path)
if duration is None:
logger.warning("Could not determine video duration for splitting check")
@ -136,170 +187,363 @@ class VideoSplitter:
if needs_split_duration:
reasons.append(f"duration {duration/60:.2f} min > {self.chunk_duration_minutes} min")
if needs_split_size:
reasons.append(f"file size {file_size_mb:.1f}MB > {max_chunk_size_mb}MB")
reasons.append(f"file size {file_size_mb:.1f}MB > {max_chunk_size_mb:.0f}MB")
logger.info(f"Video needs splitting: {' AND '.join(reasons)}")
else:
logger.info(f"Video does not need splitting: duration {duration/60:.2f} min <= {self.chunk_duration_minutes} min, size {file_size_mb:.1f}MB <= {max_chunk_size_mb:.0f}MB")
return needs_split
def calculate_optimal_chunk_duration(self, video_path: str, max_chunk_size_mb: float = 500) -> int:
def calculate_optimal_chunks(self, video_path: str) -> Dict[str, Any]:
    """
    Calculate the number of chunks and the per-chunk duration for a video.

    Two constraints are evaluated independently and the stricter one wins:
    1. Size: file size divided by the safe chunk size (which already
       includes the VBR safety margin and base64 overhead).
    2. Duration: video length divided by the configured chunk duration.

    Args:
        video_path: Path to the video file

    Returns:
        Dictionary with splitting strategy:
        - needs_split: bool
        - num_chunks: int
        - chunk_duration_seconds: float
        - estimated_chunk_size_mb: float
        - estimated_after_encoding_mb: float
        - split_reason: str
        - bitrate_mb_per_min: float
        - constraints: dict with 'by_size' and 'by_duration' chunk counts

    Raises:
        ValueError: If the duration cannot be determined or is not positive.
    """
    # Get video properties
    duration_seconds = self.get_video_duration(video_path)
    if duration_seconds is None:
        raise ValueError("Cannot determine video duration")
    # A zero/negative duration would otherwise surface as ZeroDivisionError
    # in the bitrate calculation below.
    if duration_seconds <= 0:
        raise ValueError(f"Invalid video duration: {duration_seconds} seconds")

    file_size_bytes = os.path.getsize(video_path)
    file_size_mb = file_size_bytes / (1024 * 1024)
    duration_minutes = duration_seconds / 60

    # Average bitrate, expressed as MB of file per minute of video
    bitrate_mb_per_min = file_size_mb / duration_minutes
    logger.info(
        f"Video properties: {duration_minutes:.1f} min, {file_size_mb:.1f} MB, "
        f"{bitrate_mb_per_min:.2f} MB/min bitrate"
    )

    # Chunks needed by SIZE (VBR safety margin is built into the safe size)
    chunks_by_size = math.ceil(file_size_mb / self.safe_chunk_size_mb)
    # Chunks needed by DURATION
    chunks_by_duration = math.ceil(duration_minutes / self.chunk_duration_minutes)
    # Use the MORE restrictive constraint (more chunks = safer)
    num_chunks = max(chunks_by_size, chunks_by_duration, 1)

    # Chunks are equal-length slices of the whole video
    chunk_duration_seconds = duration_seconds / num_chunks
    chunk_duration_minutes = chunk_duration_seconds / 60

    # Very short chunks are only reported, not prevented
    if chunk_duration_minutes < self.MIN_CHUNK_DURATION_MIN and num_chunks > 1:
        logger.warning(
            f"Calculated chunk duration {chunk_duration_minutes:.1f} min is below "
            f"minimum {self.MIN_CHUNK_DURATION_MIN} min. Video has extremely high bitrate."
        )

    # Estimates assume a constant bitrate across the file
    estimated_chunk_size_mb = file_size_mb / num_chunks
    estimated_after_encoding_mb = estimated_chunk_size_mb * self.BASE64_OVERHEAD

    # Determine split reasons
    split_reasons = []
    if chunks_by_size > 1:
        split_reasons.append(
            f"size ({file_size_mb:.1f}MB requires {chunks_by_size} chunks "
            f"with {self.safe_chunk_size_mb:.0f}MB safe target)"
        )
    if chunks_by_duration > 1:
        split_reasons.append(
            f"duration ({duration_minutes:.1f}min requires {chunks_by_duration} chunks "
            f"with {self.chunk_duration_minutes}min limit)"
        )
    split_reason = " AND ".join(split_reasons) if split_reasons else "Within limits, no split needed"
    needs_split = num_chunks > 1

    # Log decision details
    logger.info("Split calculation:")
    logger.info(f" - Chunks by size: {chunks_by_size}")
    logger.info(f" - Chunks by duration: {chunks_by_duration}")
    logger.info(f" - Final chunks: {num_chunks}")
    logger.info(f" - Chunk duration: {chunk_duration_minutes:.1f} min")
    logger.info(
        f" - Est. chunk size: {estimated_chunk_size_mb:.1f}MB raw, "
        f"{estimated_after_encoding_mb:.1f}MB encoded"
    )
    logger.info(f" - Decision: {split_reason}")

    return {
        'needs_split': needs_split,
        'num_chunks': num_chunks,
        'chunk_duration_seconds': chunk_duration_seconds,
        'estimated_chunk_size_mb': estimated_chunk_size_mb,
        'estimated_after_encoding_mb': estimated_after_encoding_mb,
        'split_reason': split_reason,
        'bitrate_mb_per_min': bitrate_mb_per_min,
        'constraints': {
            'by_size': chunks_by_size,
            'by_duration': chunks_by_duration
        }
    }
def calculate_optimal_chunk_duration(self, video_path: str, max_chunk_size_mb: float = None) -> int:
"""
Legacy method for backward compatibility.
Calls calculate_optimal_chunks() and returns just the duration.
Args:
video_path: Path to the video file
max_chunk_size_mb: Ignored (kept for compatibility)
Returns:
Optimal chunk duration in seconds
"""
duration = self.get_video_duration(video_path)
if duration is None:
logger.warning("Could not determine duration, using default chunk duration")
try:
split_info = self.calculate_optimal_chunks(video_path)
return int(split_info['chunk_duration_seconds'])
except Exception as e:
logger.warning(f"Could not calculate optimal chunks: {e}, using default duration")
return self.chunk_duration_seconds
# Get file size
file_size_bytes = os.path.getsize(video_path)
file_size_mb = file_size_bytes / (1024 * 1024)
file_size_gb = file_size_bytes / (1024 * 1024 * 1024)
# Calculate average bitrate (bytes per second)
avg_bitrate = file_size_bytes / duration
# Calculate chunk duration that would result in max_chunk_size_mb
max_chunk_size_bytes = max_chunk_size_mb * 1024 * 1024
optimal_duration = max_chunk_size_bytes / avg_bitrate
# Use the smaller of optimal duration or default chunk duration
final_duration = min(optimal_duration, self.chunk_duration_seconds)
# Ensure minimum chunk duration of 5 minutes (300 seconds)
final_duration = max(final_duration, 300)
logger.info(
f"Calculated optimal chunk duration: {final_duration:.0f}s ({final_duration/60:.1f} min) "
f"based on file size {file_size_mb:.1f}MB ({file_size_gb:.2f}GB) and duration {duration/60:.1f} min. "
f"Target chunk size: {max_chunk_size_mb}MB"
)
return int(final_duration)
def split_video(self, video_path: str, output_dir: Optional[str] = None) -> List[str]:
def _split_by_duration(self, video_path: str, chunk_duration_seconds: float,
output_dir: str) -> List[str]:
"""
Split a video into multiple chunks based on the configured chunk duration.
Automatically adjusts chunk duration if file size would result in chunks > 1.2GB.
Perform the actual ffmpeg splitting by duration.
Internal helper method extracted from split_video for reusability.
Args:
video_path: Path to the video file to split
output_dir: Directory to save chunks (default: system temp directory)
video_path: Path to video file
chunk_duration_seconds: Duration of each chunk in seconds
output_dir: Output directory for chunks
Returns:
List of paths to the generated chunk files
List of chunk file paths
"""
duration = self.get_video_duration(video_path)
if duration is None:
raise ValueError("Could not determine video duration")
# Calculate optimal chunk duration based on file size
chunk_duration = self.calculate_optimal_chunk_duration(video_path)
# Use temp directory if none specified
if output_dir is None:
output_dir = tempfile.mkdtemp(prefix="video_chunks_")
logger.info(f"Using temporary directory for chunks: {output_dir}")
else:
os.makedirs(output_dir, exist_ok=True)
# Calculate number of chunks needed
num_chunks = int(duration / chunk_duration) + (
1 if duration % chunk_duration > 0 else 0
)
logger.info(f"Splitting video into {num_chunks} chunks (chunk duration: {chunk_duration/60:.1f} min)")
num_chunks = math.ceil(duration / chunk_duration_seconds)
chunk_paths = []
video_basename = os.path.splitext(os.path.basename(video_path))[0]
video_extension = os.path.splitext(video_path)[1]
for i in range(num_chunks):
start_time = i * chunk_duration
start_time = i * chunk_duration_seconds
chunk_output = os.path.join(
output_dir,
f"{video_basename}_chunk_{i+1:02d}{video_extension}"
)
logger.info(f"Creating chunk {i+1}/{num_chunks}: start={start_time}s, output={chunk_output}")
logger.info(f"Creating chunk {i+1}/{num_chunks}: start={start_time:.1f}s")
try:
# Split the video using ffmpeg
# Using -t to specify duration of this chunk
# Using -c copy for fast processing (no re-encoding)
stream = ffmpeg.input(video_path, ss=start_time, t=chunk_duration)
stream = ffmpeg.output(
stream,
chunk_output,
c='copy', # Copy streams without re-encoding for speed
map='0', # Include all streams from input
avoid_negative_ts='make_zero' # Handle timestamp issues
)
stream = ffmpeg.input(video_path, ss=start_time, t=chunk_duration_seconds)
stream = ffmpeg.output(stream, chunk_output, c='copy', map='0',
avoid_negative_ts='make_zero')
ffmpeg.run(stream, capture_stdout=True, capture_stderr=True, overwrite_output=True)
chunk_paths.append(chunk_output)
# Log chunk size for monitoring
chunk_size_bytes = os.path.getsize(chunk_output)
chunk_size_mb = chunk_size_bytes / (1024 * 1024)
chunk_size_gb = chunk_size_bytes / (1024 * 1024 * 1024)
logger.info(f"Successfully created chunk {i+1}/{num_chunks} (size: {chunk_size_mb:.1f}MB / {chunk_size_gb:.2f}GB)")
# Warn if chunk is approaching size limits (500MB target due to VBR variance)
if chunk_size_mb > 550:
logger.warning(
f"Chunk {i+1} is {chunk_size_mb:.1f}MB ({chunk_size_gb:.2f}GB), exceeding the 500MB target. "
f"After base64 encoding (~37% overhead), this will be ~{chunk_size_mb * 1.37:.1f}MB. "
f"API limit is 1000MB (1GB). If close to limit, consider reducing video quality."
)
chunk_size_mb = os.path.getsize(chunk_output) / (1024 * 1024)
logger.info(f"Created chunk {i+1}/{num_chunks}: {chunk_size_mb:.1f}MB")
except ffmpeg.Error as e:
error_msg = e.stderr.decode() if e.stderr else str(e)
error_report = ErrorReporter.capture_error(
ErrorReporter.capture_error(
e,
category=ErrorCategory.VIDEO_ERROR,
context={
'video_path': video_path,
'chunk_number': i+1,
'total_chunks': num_chunks,
'operation': 'split_video'
'operation': 'split_by_duration'
}
)
logger.error(f"FFmpeg error creating chunk {i+1}: {error_msg}")
# Clean up any created chunks on error
self.cleanup_chunks(chunk_paths)
raise RuntimeError(f"Failed to create video chunk {i+1}: {error_msg}")
except Exception as e:
error_report = ErrorReporter.capture_error(
e,
category=ErrorCategory.VIDEO_ERROR,
context={
'video_path': video_path,
'chunk_number': i+1,
'total_chunks': num_chunks,
'operation': 'split_video'
}
)
logger.error(f"Error creating chunk {i+1}: {str(e)}")
self.cleanup_chunks(chunk_paths)
raise
logger.info(f"Successfully split video into {len(chunk_paths)} chunks")
return chunk_paths
def _re_split_chunk(self, chunk_path: str, target_size_mb: float) -> List[str]:
    """
    Re-split a single oversized chunk into smaller, equal-duration sub-chunks.

    Sub-chunks are written next to the original chunk and named
    "<chunk name>_sub_NN<ext>". The original chunk file is left in place;
    removing it is the caller's responsibility.

    Args:
        chunk_path: Path to oversized chunk
        target_size_mb: Target size for sub-chunks (must be > 0)

    Returns:
        List of sub-chunk paths

    Raises:
        ValueError: If target_size_mb is not positive, or the chunk's
            duration cannot be determined.
        Exception: Any error raised while writing a sub-chunk is re-raised
            after the sub-chunks created so far have been removed.
    """
    if target_size_mb <= 0:
        raise ValueError(f"target_size_mb must be positive, got {target_size_mb}")

    duration = self.get_video_duration(chunk_path)
    # get_video_duration returns None when probing fails; dividing it
    # below would raise an unhelpful TypeError.
    if duration is None or duration <= 0:
        raise ValueError(f"Could not determine duration of chunk: {chunk_path}")

    file_size_mb = os.path.getsize(chunk_path) / (1024 * 1024)

    # Calculate how many sub-chunks needed (never fewer than one)
    num_sub_chunks = max(math.ceil(file_size_mb / target_size_mb), 1)
    sub_chunk_duration = duration / num_sub_chunks
    logger.info(
        f"Re-splitting oversized chunk: {file_size_mb:.1f}MB into {num_sub_chunks} sub-chunks "
        f"of ~{sub_chunk_duration/60:.1f} min each"
    )

    chunk_dir = os.path.dirname(chunk_path)
    chunk_basename, chunk_extension = os.path.splitext(os.path.basename(chunk_path))

    sub_chunks = []
    try:
        for j in range(num_sub_chunks):
            start_time = j * sub_chunk_duration
            sub_chunk_output = os.path.join(
                chunk_dir,
                f"{chunk_basename}_sub_{j+1:02d}{chunk_extension}"
            )
            # Track the path before running ffmpeg so a partially written
            # file is also removed if this sub-chunk fails.
            sub_chunks.append(sub_chunk_output)
            # Stream copy (no re-encoding) of one time slice of the chunk
            stream = ffmpeg.input(chunk_path, ss=start_time, t=sub_chunk_duration)
            stream = ffmpeg.output(stream, sub_chunk_output, c='copy', map='0',
                                   avoid_negative_ts='make_zero')
            ffmpeg.run(stream, capture_stdout=True, capture_stderr=True, overwrite_output=True)
            sub_size_mb = os.path.getsize(sub_chunk_output) / (1024 * 1024)
            logger.info(f"Created sub-chunk {j+1}/{num_sub_chunks}: {sub_size_mb:.1f}MB")
    except Exception:
        # Do not leave orphaned sub-chunks behind; the caller only knows
        # about the original chunk path.
        for leftover in sub_chunks:
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass
            except OSError as cleanup_error:
                logger.warning(f"Could not remove partial sub-chunk: {cleanup_error}")
        raise

    return sub_chunks
def validate_and_fix_chunks(self, chunk_paths: List[str]) -> List[str]:
    """
    Validate chunk sizes and re-split any chunks exceeding hard limit.

    Each chunk's on-disk size is compared with self.hard_limit_mb. Chunks
    within the limit are kept as-is; larger ones are re-split via
    _re_split_chunk() with a target of 80% of the safe chunk size, and the
    resulting sub-chunks are checked against the hard limit as well.
    Oversized originals are deleted only after every chunk has been
    processed successfully.

    Args:
        chunk_paths: List of chunk file paths

    Returns:
        List of valid chunk paths (may include re-split sub-chunks), in the
        same order as the input

    Raises:
        RuntimeError: If a chunk could not be re-split, or its sub-chunks
            are still above the hard limit. Sub-chunks created during this
            call are removed first; the input chunks are left untouched.
    """
    valid_chunks = []
    chunks_to_cleanup = []
    created_sub_chunks = []  # everything this call wrote, for rollback on failure

    for i, chunk_path in enumerate(chunk_paths, 1):
        chunk_size_mb = os.path.getsize(chunk_path) / (1024 * 1024)
        encoded_size_mb = chunk_size_mb * self.BASE64_OVERHEAD

        if chunk_size_mb <= self.hard_limit_mb:
            # Chunk is valid
            logger.info(
                f"Chunk {i} validated: {chunk_size_mb:.1f}MB raw, "
                f"{encoded_size_mb:.1f}MB encoded ✓"
            )
            valid_chunks.append(chunk_path)
            continue

        logger.warning(
            f"Chunk {i} EXCEEDS hard limit: {chunk_size_mb:.1f}MB > "
            f"{self.hard_limit_mb:.1f}MB (would be {encoded_size_mb:.1f}MB encoded)"
        )
        logger.info(f"Re-splitting oversized chunk {i}...")
        try:
            # Re-split with extra conservative target (80% of safe size)
            sub_chunks = self._re_split_chunk(chunk_path, target_size_mb=self.safe_chunk_size_mb * 0.8)
            created_sub_chunks.extend(sub_chunks)
            # Splitting is by duration, so a bitrate spike can still leave
            # a sub-chunk above the limit; refuse to return such a chunk.
            for sub_chunk in sub_chunks:
                sub_size_mb = os.path.getsize(sub_chunk) / (1024 * 1024)
                if sub_size_mb > self.hard_limit_mb:
                    raise RuntimeError(
                        f"sub-chunk {sub_chunk} is still {sub_size_mb:.1f}MB "
                        f"(limit {self.hard_limit_mb:.1f}MB)"
                    )
        except Exception as e:
            logger.error(f"Failed to re-split chunk {i}: {e}")
            # Roll back sub-chunks from this and earlier re-splits; the
            # caller only knows about the original chunk paths.
            for leftover in created_sub_chunks:
                try:
                    os.remove(leftover)
                except FileNotFoundError:
                    pass
                except OSError as cleanup_error:
                    logger.warning(f"Could not remove sub-chunk: {cleanup_error}")
            raise RuntimeError(
                f"Chunk {i} is too large ({chunk_size_mb:.1f}MB) and could not be re-split. "
                f"Video may have extremely high bitrate. Consider reducing video quality."
            ) from e

        logger.info(f"Successfully re-split chunk {i} into {len(sub_chunks)} sub-chunks")
        valid_chunks.extend(sub_chunks)
        chunks_to_cleanup.append(chunk_path)  # Mark original for cleanup

    # Cleanup original oversized chunks (best effort)
    for chunk_path in chunks_to_cleanup:
        try:
            os.remove(chunk_path)
            logger.debug(f"Cleaned up oversized chunk: {chunk_path}")
        except OSError as e:
            logger.warning(f"Could not remove oversized chunk: {e}")

    return valid_chunks
def split_video(self, video_path: str, output_dir: Optional[str] = None) -> List[str]:
    """
    Split video into safe-sized chunks with validation and automatic re-splitting.

    This method:
    1. Calculates optimal splitting strategy using calculate_optimal_chunks()
    2. Performs initial split using _split_by_duration()
    3. Passes the chunks through validate_and_fix_chunks(), which re-splits
       any chunk above the hard size limit
    4. Returns the resulting list of chunk paths

    When no split is needed the returned list contains the ORIGINAL
    video_path, not a copy.
    NOTE(review): confirm callers do not hand that list to cleanup_chunks(),
    which would delete the source video.

    Args:
        video_path: Path to the video file to split
        output_dir: Directory to save chunks (default: a fresh temporary
            directory, which is removed again if splitting fails)

    Returns:
        List of paths to the validated chunk files

    Raises:
        Exception: Errors from calculate_optimal_chunks(),
            _split_by_duration() or validate_and_fix_chunks() are logged
            and re-raised unchanged.
    """
    import shutil  # local import: only needed for failure cleanup

    # Calculate optimal splitting strategy
    split_info = self.calculate_optimal_chunks(video_path)
    if not split_info['needs_split']:
        logger.info("Video within limits, no splitting required")
        return [video_path]

    # Create output directory, remembering whether it is ours to delete
    created_temp_dir = output_dir is None
    if created_temp_dir:
        output_dir = tempfile.mkdtemp(prefix="video_chunks_")
        logger.info(f"Using temporary directory for chunks: {output_dir}")
    else:
        os.makedirs(output_dir, exist_ok=True)

    logger.info(f"Splitting video into {split_info['num_chunks']} chunks...")
    try:
        # Perform initial split
        try:
            chunk_paths = self._split_by_duration(
                video_path,
                split_info['chunk_duration_seconds'],
                output_dir
            )
        except Exception as e:
            logger.error(f"Failed during initial split: {e}")
            raise
        logger.info(f"Initial split complete: {len(chunk_paths)} chunks created")

        # Validate and fix any oversized chunks
        logger.info("Validating chunk sizes...")
        try:
            valid_chunks = self.validate_and_fix_chunks(chunk_paths)
        except Exception as e:
            logger.error(f"Failed during chunk validation: {e}")
            # Cleanup all chunks on validation failure
            self.cleanup_chunks(chunk_paths)
            raise
    except Exception:
        # The caller never learns the temp directory's path on failure, so
        # remove it here; a caller-supplied directory is left alone.
        if created_temp_dir:
            shutil.rmtree(output_dir, ignore_errors=True)
        raise

    logger.info(f"Validation complete: {len(valid_chunks)} valid chunks ready for processing")
    return valid_chunks
def cleanup_chunks(self, chunk_paths: List[str]) -> None:
"""
Delete temporary chunk files.
@ -332,6 +576,7 @@ class VideoSplitter:
def get_chunk_info(self, video_path: str) -> Tuple[int, float]:
"""
Get information about how a video would be chunked without actually splitting it.
Uses the robust multi-constraint algorithm (size + duration).
Args:
video_path: Path to the video file
@ -339,16 +584,19 @@ class VideoSplitter:
Returns:
Tuple of (number_of_chunks, total_duration_in_minutes)
"""
duration = self.get_video_duration(video_path)
if duration is None:
return (0, 0.0)
duration_minutes = duration / 60
num_chunks = int(duration / self.chunk_duration_seconds) + (
1 if duration % self.chunk_duration_seconds > 0 else 0
)
return (num_chunks, duration_minutes)
try:
split_info = self.calculate_optimal_chunks(video_path)
duration_minutes = self.get_video_duration(video_path) / 60
return (split_info['num_chunks'], duration_minutes)
except Exception as e:
logger.warning(f"Could not calculate chunk info: {e}")
# Fallback to duration-only logic
duration = self.get_video_duration(video_path)
if duration is None:
return (0, 0.0)
duration_minutes = duration / 60
num_chunks = math.ceil(duration / self.chunk_duration_seconds)
return (num_chunks, duration_minutes)
# Convenience functions for direct use