diff --git a/CLAUDE.md b/CLAUDE.md index f84f8a5..544b79d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -268,16 +268,27 @@ EOF - Uses `threading.Lock` for thread safety - Prevents Gemini API rate limit errors (5 RPM free tier) -### Hybrid Upload Strategy (video_processor.py:163-203) -- **Files < 10MB**: Base64 inline data (instant) -- **Files >= 10MB**: File Upload API (slower but reliable) -- Threshold: `SIZE_THRESHOLD_MB = 10` +### Upload Strategy (video_processor.py:388-450) +- **Current approach**: Base64 inline encoding for ALL videos +- **Reason**: File Upload API has known issues in SDK 1.45.0-1.49.0 +- **API limit**: 1000MB (1GB) for base64-encoded requests +- **Base64 overhead**: +37% size increase (1.37x multiplier) +- **Effective limit**: ~730MB raw video per chunk (≈1000MB once base64-encoded) -### Video Splitting (video_splitter.py) -- **Chunk duration**: 54 minutes -- **Automatic**: Videos > 54 minutes split automatically +### Video Splitting (video_splitter.py) - Robust Multi-Constraint Algorithm +- **Chunk duration limit**: 53 minutes (Gemini API ~55 min limit) +- **Chunk size limit**: ~560MB safe target (with 30% VBR margin) +- **Hard size limit**: ~730MB maximum (1000MB ÷ 1.37 encoding overhead) +- **Automatic splitting**: Based on BOTH duration AND size constraints +- **VBR handling**: 30% safety margin for Variable Bitrate variance +- **Validation & re-splitting**: Automatic re-split if chunks exceed hard limit +- **Algorithm**: Uses max(chunks_by_size, chunks_by_duration) for safety - **Processing**: Chunks processed in parallel on backend -- **Maximum single chunk**: 55 minutes (Gemini limit) + +**Example:** +- 30min/1.5GB video → Split into 3 chunks of ~10min/500MB each +- 50min/720MB video → Split into 2 chunks of ~25min/360MB each +- 5min/300MB video → No split (under both limits) ### Queue Management (App.js:63-336) - **Queue states**: queued, processing, completed, failed, cancelled @@ -319,6 +330,10 @@ BATCH_PROCESSING_LOG_SUMMARIES=true - 
`GOOGLE_API_KEY`: Your Gemini API key (required) - `VIDEO_PROCESSOR_MODEL`: Model for individual video processing (default: gemini-2.5-pro) - `VIDEO_SYNTHESIS_MODEL`: Model for batch synthesis (default: gemini-2.5-pro) + - `CHUNK_DURATION_MINUTES`: Max chunk duration (default: 53 minutes) + - `VBR_SAFETY_MARGIN`: Safety margin for variable bitrate videos (default: 1.30 = 30%) + - `BASE64_API_LIMIT_MB`: API limit for base64 requests (default: 1000MB) + - `MAX_PARALLEL_CHUNKS`: Concurrent chunk processing (default: 4) - `BATCH_PROCESSING_LOG_PROMPTS`: Enable detailed prompt logging (default: false) - `BATCH_PROCESSING_LOG_SUMMARIES`: Enable summary preview logging (default: false) - **backend/run.py**: Hypercorn server config (body size limits, timeouts) diff --git a/backend/.env.example b/backend/.env.example index 2b19db7..0e9f478 100644 --- a/backend/.env.example +++ b/backend/.env.example @@ -11,6 +11,19 @@ GOOGLE_API_KEY=AIzaSyBF3Ia1nVS4PLuLpWt-85ct_heJ7FrlvkQ # Gemini API limits: ~55 min with audio, ~60 min without audio CHUNK_DURATION_MINUTES=53 +# Variable Bitrate (VBR) Safety Margin (default: 1.30) +# Adds 30% safety margin to account for bitrate variance in videos +# Higher values = more conservative splitting, safer for VBR videos +# Lower values = fewer chunks, but risk of oversized chunks +# Recommended: 1.25-1.35 (25%-35% margin) +VBR_SAFETY_MARGIN=1.30 + +# Base64 API Limit in MB (default: 1000) +# Maximum size of base64-encoded video in API request +# Gemini API limit: 1000MB (1GB) for inline base64 requests +# Do not change unless API limits change +BASE64_API_LIMIT_MB=1000 + # Parallel Processing Configuration # Maximum number of video chunks to process concurrently (default: 4) # Higher values = faster processing but more API load diff --git a/backend/video_splitter.py b/backend/video_splitter.py index c2ee13c..809b926 100644 --- a/backend/video_splitter.py +++ b/backend/video_splitter.py @@ -3,13 +3,20 @@ Video Splitter Module This module provides 
functionality to detect video duration and split long videos into smaller chunks for processing with APIs that have duration limitations. + +Key Features: +- Duration-based splitting (respects Gemini API 53-55 min limit) +- Size-based splitting (respects 1GB base64-encoded API limit) +- Variable Bitrate (VBR) handling with safety margins +- Automatic chunk validation and re-splitting for oversized chunks """ import ffmpeg import os import tempfile import logging -from typing import List, Tuple, Optional +import math +from typing import List, Tuple, Optional, Dict, Any from system_utils import system_utils from error_reporter import ErrorReporter, ErrorCategory @@ -19,12 +26,42 @@ logger = logging.getLogger('video_query') class VideoSplitter: """ Handles video duration detection and splitting operations. + + Supports robust splitting with: + - Duration limits (Gemini API ~53-55 min) + - Size limits (1GB base64-encoded API request) + - VBR (Variable Bitrate) safety margins + - Automatic chunk validation and re-splitting """ + # Duration constraint # Default chunk duration in minutes (53 min to stay under 55 min Gemini API limit for videos with audio) # Google Gemini 2.5 Pro limits: ~55 min with audio, ~60 min without audio DEFAULT_CHUNK_DURATION = 53 + # Size constraints (for base64 inline encoding approach) + # API limit for base64-encoded requests: 1GB (1000MB) + API_LIMIT_AFTER_ENCODING_MB = 1000 + BASE64_OVERHEAD = 1.37 # Base64 encoding increases size by ~37% + + # VBR (Variable Bitrate) safety margin + # Videos with VBR can have sections with significantly higher bitrate than average + # 30% margin ensures chunks stay within limits even with bitrate spikes + VBR_SAFETY_MARGIN = 1.30 + + # Calculate safe target chunk size + # Formula: API_LIMIT / BASE64_OVERHEAD / VBR_MARGIN + # 1000MB / 1.37 / 1.30 ≈ 560MB + SAFE_CHUNK_SIZE_MB = API_LIMIT_AFTER_ENCODING_MB / BASE64_OVERHEAD / VBR_SAFETY_MARGIN + + # Hard limit (absolute maximum raw chunk size before encoding) + 
# Formula: API_LIMIT / BASE64_OVERHEAD + # 1000MB / 1.37 ≈ 730MB + HARD_LIMIT_MB = API_LIMIT_AFTER_ENCODING_MB / BASE64_OVERHEAD + + # Minimum chunk duration to avoid creating too many tiny chunks + MIN_CHUNK_DURATION_MIN = 3 + def __init__(self, chunk_duration_minutes: int = None): """ Initialize VideoSplitter with specified chunk duration. @@ -39,7 +76,19 @@ class VideoSplitter: self.chunk_duration_minutes = chunk_duration_minutes self.chunk_duration_seconds = chunk_duration_minutes * 60 - logger.info(f"VideoSplitter initialized with chunk duration: {chunk_duration_minutes} minutes") + + # Load configurable parameters from environment + self.vbr_safety_margin = float(os.getenv("VBR_SAFETY_MARGIN", self.VBR_SAFETY_MARGIN)) + self.api_limit_mb = int(os.getenv("BASE64_API_LIMIT_MB", self.API_LIMIT_AFTER_ENCODING_MB)) + + # Recalculate safe chunk size if custom values provided + self.safe_chunk_size_mb = self.api_limit_mb / self.BASE64_OVERHEAD / self.vbr_safety_margin + self.hard_limit_mb = self.api_limit_mb / self.BASE64_OVERHEAD + + logger.info(f"VideoSplitter initialized:") + logger.info(f" - Max chunk duration: {chunk_duration_minutes} minutes") + logger.info(f" - Safe chunk size: {self.safe_chunk_size_mb:.0f}MB (with {(self.vbr_safety_margin-1)*100:.0f}% VBR margin)") + logger.info(f" - Hard chunk limit: {self.hard_limit_mb:.0f}MB (API: {self.api_limit_mb}MB after encoding)") def get_video_duration(self, video_path: str) -> Optional[float]: """ @@ -99,23 +148,25 @@ class VideoSplitter: logger.error(f"Error detecting video duration: {str(e)}") return None - def needs_splitting(self, video_path: str, max_chunk_size_mb: float = 500) -> bool: + def needs_splitting(self, video_path: str, max_chunk_size_mb: float = None) -> bool: """ Check if a video needs to be split based on duration OR file size. - A video needs splitting if: - 1. Duration > configured chunk duration (default 53 minutes), OR - 2. 
File size > 500MB (conservative target to handle variable bitrate) - With 30% variance: 500MB × 1.3 = 650MB max - After base64 encoding: 650MB × 1.37 = 891MB (well under 1GB API limit) + Uses robust calculation considering: + - Duration limit (default 53 minutes) + - Size limit with VBR safety margin (default ~560MB) + - Base64 encoding overhead (1.37x) Args: video_path: Path to the video file - max_chunk_size_mb: Maximum chunk size in MB (default: 500MB) + max_chunk_size_mb: Maximum chunk size in MB (default: use calculated safe size) Returns: True if video needs splitting based on duration or size, False otherwise """ + if max_chunk_size_mb is None: + max_chunk_size_mb = self.safe_chunk_size_mb + duration = self.get_video_duration(video_path) if duration is None: logger.warning("Could not determine video duration for splitting check") @@ -136,170 +187,363 @@ class VideoSplitter: if needs_split_duration: reasons.append(f"duration {duration/60:.2f} min > {self.chunk_duration_minutes} min") if needs_split_size: - reasons.append(f"file size {file_size_mb:.1f}MB > {max_chunk_size_mb}MB") + reasons.append(f"file size {file_size_mb:.1f}MB > {max_chunk_size_mb:.0f}MB") logger.info(f"Video needs splitting: {' AND '.join(reasons)}") else: logger.info(f"Video does not need splitting: duration {duration/60:.2f} min <= {self.chunk_duration_minutes} min, size {file_size_mb:.1f}MB <= {max_chunk_size_mb:.0f}MB") return needs_split - def calculate_optimal_chunk_duration(self, video_path: str, max_chunk_size_mb: float = 500) -> int: + def calculate_optimal_chunks(self, video_path: str) -> Dict[str, Any]: """ - Calculate optimal chunk duration based on file size and video duration - to ensure chunks don't exceed a maximum file size. + Calculate optimal number of chunks and chunk duration considering: + 1. File size with VBR safety margin + 2. Duration limits + 3. Base64 encoding overhead - IMPORTANT: Gemini API has a 1GB request payload limit. 
- Conservative target of 500MB accounts for variable bitrate (VBR). - With 30% VBR variance: 500MB × 1.3 = 650MB max - After base64 encoding: 650MB × 1.37 = 891MB (under 1GB limit) + This is the core robust algorithm that prevents oversized chunks. Args: video_path: Path to the video file - max_chunk_size_mb: Maximum desired chunk size in MB (default: 500MB) + + Returns: + Dictionary with splitting strategy: + - needs_split: bool + - num_chunks: int + - chunk_duration_seconds: float + - estimated_chunk_size_mb: float + - estimated_after_encoding_mb: float + - split_reason: str + - bitrate_mb_per_min: float + """ + # Get video properties + duration_seconds = self.get_video_duration(video_path) + if duration_seconds is None: + raise ValueError("Cannot determine video duration") + + file_size_bytes = os.path.getsize(video_path) + file_size_mb = file_size_bytes / (1024 * 1024) + duration_minutes = duration_seconds / 60 + + # Calculate bitrate + bitrate_mb_per_min = file_size_mb / duration_minutes + + logger.info( + f"Video properties: {duration_minutes:.1f} min, {file_size_mb:.1f} MB, " + f"{bitrate_mb_per_min:.2f} MB/min bitrate" + ) + + # Calculate chunks needed by SIZE (with VBR safety margin built-in) + chunks_by_size = math.ceil(file_size_mb / self.safe_chunk_size_mb) + + # Calculate chunks needed by DURATION + chunks_by_duration = math.ceil(duration_minutes / self.chunk_duration_minutes) + + # Use the MORE restrictive constraint (more chunks = safer) + num_chunks = max(chunks_by_size, chunks_by_duration, 1) + + # Calculate actual chunk duration + chunk_duration_seconds = duration_seconds / num_chunks + chunk_duration_minutes = chunk_duration_seconds / 60 + + # Check if chunk duration is too small + if chunk_duration_minutes < self.MIN_CHUNK_DURATION_MIN and num_chunks > 1: + logger.warning( + f"Calculated chunk duration {chunk_duration_minutes:.1f} min is below " + f"minimum {self.MIN_CHUNK_DURATION_MIN} min. Video has extremely high bitrate." 
+ ) + + # Estimate chunk properties + estimated_chunk_size_mb = file_size_mb / num_chunks + estimated_after_encoding_mb = estimated_chunk_size_mb * self.BASE64_OVERHEAD + + # Determine split reasons + split_reasons = [] + if chunks_by_size > 1: + split_reasons.append( + f"size ({file_size_mb:.1f}MB requires {chunks_by_size} chunks " + f"with {self.safe_chunk_size_mb:.0f}MB safe target)" + ) + if chunks_by_duration > 1: + split_reasons.append( + f"duration ({duration_minutes:.1f}min requires {chunks_by_duration} chunks " + f"with {self.chunk_duration_minutes}min limit)" + ) + + split_reason = " AND ".join(split_reasons) if split_reasons else "Within limits, no split needed" + needs_split = num_chunks > 1 + + # Log decision details + logger.info(f"Split calculation:") + logger.info(f" - Chunks by size: {chunks_by_size}") + logger.info(f" - Chunks by duration: {chunks_by_duration}") + logger.info(f" - Final chunks: {num_chunks}") + logger.info(f" - Chunk duration: {chunk_duration_minutes:.1f} min") + logger.info( + f" - Est. chunk size: {estimated_chunk_size_mb:.1f}MB raw, " + f"{estimated_after_encoding_mb:.1f}MB encoded" + ) + logger.info(f" - Decision: {split_reason}") + + return { + 'needs_split': needs_split, + 'num_chunks': num_chunks, + 'chunk_duration_seconds': chunk_duration_seconds, + 'estimated_chunk_size_mb': estimated_chunk_size_mb, + 'estimated_after_encoding_mb': estimated_after_encoding_mb, + 'split_reason': split_reason, + 'bitrate_mb_per_min': bitrate_mb_per_min, + 'constraints': { + 'by_size': chunks_by_size, + 'by_duration': chunks_by_duration + } + } + + def calculate_optimal_chunk_duration(self, video_path: str, max_chunk_size_mb: float = None) -> int: + """ + Legacy method for backward compatibility. + Calls calculate_optimal_chunks() and returns just the duration. 
+ + Args: + video_path: Path to the video file + max_chunk_size_mb: Ignored (kept for compatibility) Returns: Optimal chunk duration in seconds """ - duration = self.get_video_duration(video_path) - if duration is None: - logger.warning("Could not determine duration, using default chunk duration") + try: + split_info = self.calculate_optimal_chunks(video_path) + return int(split_info['chunk_duration_seconds']) + except Exception as e: + logger.warning(f"Could not calculate optimal chunks: {e}, using default duration") return self.chunk_duration_seconds - # Get file size - file_size_bytes = os.path.getsize(video_path) - file_size_mb = file_size_bytes / (1024 * 1024) - file_size_gb = file_size_bytes / (1024 * 1024 * 1024) - - # Calculate average bitrate (bytes per second) - avg_bitrate = file_size_bytes / duration - - # Calculate chunk duration that would result in max_chunk_size_mb - max_chunk_size_bytes = max_chunk_size_mb * 1024 * 1024 - optimal_duration = max_chunk_size_bytes / avg_bitrate - - # Use the smaller of optimal duration or default chunk duration - final_duration = min(optimal_duration, self.chunk_duration_seconds) - - # Ensure minimum chunk duration of 5 minutes (300 seconds) - final_duration = max(final_duration, 300) - - logger.info( - f"Calculated optimal chunk duration: {final_duration:.0f}s ({final_duration/60:.1f} min) " - f"based on file size {file_size_mb:.1f}MB ({file_size_gb:.2f}GB) and duration {duration/60:.1f} min. " - f"Target chunk size: {max_chunk_size_mb}MB" - ) - - return int(final_duration) - - def split_video(self, video_path: str, output_dir: Optional[str] = None) -> List[str]: + def _split_by_duration(self, video_path: str, chunk_duration_seconds: float, + output_dir: str) -> List[str]: """ - Split a video into multiple chunks based on the configured chunk duration. - Automatically adjusts chunk duration if file size would result in chunks > 1.2GB. + Perform the actual ffmpeg splitting by duration. 
+ Internal helper method extracted from split_video for reusability. Args: - video_path: Path to the video file to split - output_dir: Directory to save chunks (default: system temp directory) + video_path: Path to video file + chunk_duration_seconds: Duration of each chunk in seconds + output_dir: Output directory for chunks Returns: - List of paths to the generated chunk files + List of chunk file paths """ duration = self.get_video_duration(video_path) - if duration is None: - raise ValueError("Could not determine video duration") - - # Calculate optimal chunk duration based on file size - chunk_duration = self.calculate_optimal_chunk_duration(video_path) - - # Use temp directory if none specified - if output_dir is None: - output_dir = tempfile.mkdtemp(prefix="video_chunks_") - logger.info(f"Using temporary directory for chunks: {output_dir}") - else: - os.makedirs(output_dir, exist_ok=True) - - # Calculate number of chunks needed - num_chunks = int(duration / chunk_duration) + ( - 1 if duration % chunk_duration > 0 else 0 - ) - logger.info(f"Splitting video into {num_chunks} chunks (chunk duration: {chunk_duration/60:.1f} min)") + num_chunks = math.ceil(duration / chunk_duration_seconds) chunk_paths = [] video_basename = os.path.splitext(os.path.basename(video_path))[0] video_extension = os.path.splitext(video_path)[1] for i in range(num_chunks): - start_time = i * chunk_duration + start_time = i * chunk_duration_seconds chunk_output = os.path.join( output_dir, f"{video_basename}_chunk_{i+1:02d}{video_extension}" ) - logger.info(f"Creating chunk {i+1}/{num_chunks}: start={start_time}s, output={chunk_output}") + logger.info(f"Creating chunk {i+1}/{num_chunks}: start={start_time:.1f}s") try: - # Split the video using ffmpeg - # Using -t to specify duration of this chunk - # Using -c copy for fast processing (no re-encoding) - stream = ffmpeg.input(video_path, ss=start_time, t=chunk_duration) - stream = ffmpeg.output( - stream, - chunk_output, - c='copy', # Copy 
streams without re-encoding for speed - map='0', # Include all streams from input - avoid_negative_ts='make_zero' # Handle timestamp issues - ) + stream = ffmpeg.input(video_path, ss=start_time, t=chunk_duration_seconds) + stream = ffmpeg.output(stream, chunk_output, c='copy', map='0', + avoid_negative_ts='make_zero') ffmpeg.run(stream, capture_stdout=True, capture_stderr=True, overwrite_output=True) chunk_paths.append(chunk_output) - # Log chunk size for monitoring - chunk_size_bytes = os.path.getsize(chunk_output) - chunk_size_mb = chunk_size_bytes / (1024 * 1024) - chunk_size_gb = chunk_size_bytes / (1024 * 1024 * 1024) - logger.info(f"Successfully created chunk {i+1}/{num_chunks} (size: {chunk_size_mb:.1f}MB / {chunk_size_gb:.2f}GB)") - - # Warn if chunk is approaching size limits (500MB target due to VBR variance) - if chunk_size_mb > 550: - logger.warning( - f"Chunk {i+1} is {chunk_size_mb:.1f}MB ({chunk_size_gb:.2f}GB), exceeding the 500MB target. " - f"After base64 encoding (~37% overhead), this will be ~{chunk_size_mb * 1.37:.1f}MB. " - f"API limit is 1000MB (1GB). If close to limit, consider reducing video quality." 
- ) + chunk_size_mb = os.path.getsize(chunk_output) / (1024 * 1024) + logger.info(f"Created chunk {i+1}/{num_chunks}: {chunk_size_mb:.1f}MB") except ffmpeg.Error as e: error_msg = e.stderr.decode() if e.stderr else str(e) - error_report = ErrorReporter.capture_error( + ErrorReporter.capture_error( e, category=ErrorCategory.VIDEO_ERROR, context={ 'video_path': video_path, 'chunk_number': i+1, 'total_chunks': num_chunks, - 'operation': 'split_video' + 'operation': 'split_by_duration' } ) logger.error(f"FFmpeg error creating chunk {i+1}: {error_msg}") - # Clean up any created chunks on error self.cleanup_chunks(chunk_paths) raise RuntimeError(f"Failed to create video chunk {i+1}: {error_msg}") - except Exception as e: - error_report = ErrorReporter.capture_error( - e, - category=ErrorCategory.VIDEO_ERROR, - context={ - 'video_path': video_path, - 'chunk_number': i+1, - 'total_chunks': num_chunks, - 'operation': 'split_video' - } - ) - logger.error(f"Error creating chunk {i+1}: {str(e)}") - self.cleanup_chunks(chunk_paths) - raise - logger.info(f"Successfully split video into {len(chunk_paths)} chunks") return chunk_paths + def _re_split_chunk(self, chunk_path: str, target_size_mb: float) -> List[str]: + """ + Re-split a single oversized chunk into smaller sub-chunks. 
+ + Args: + chunk_path: Path to oversized chunk + target_size_mb: Target size for sub-chunks + + Returns: + List of sub-chunk paths + """ + duration = self.get_video_duration(chunk_path) + file_size_mb = os.path.getsize(chunk_path) / (1024 * 1024) + + # Calculate how many sub-chunks needed + num_sub_chunks = math.ceil(file_size_mb / target_size_mb) + sub_chunk_duration = duration / num_sub_chunks + + logger.info( + f"Re-splitting oversized chunk: {file_size_mb:.1f}MB into {num_sub_chunks} sub-chunks " + f"of ~{sub_chunk_duration/60:.1f} min each" + ) + + # Split the chunk + chunk_dir = os.path.dirname(chunk_path) + chunk_basename = os.path.splitext(os.path.basename(chunk_path))[0] + chunk_extension = os.path.splitext(chunk_path)[1] + + sub_chunks = [] + for j in range(num_sub_chunks): + start_time = j * sub_chunk_duration + sub_chunk_output = os.path.join( + chunk_dir, + f"{chunk_basename}_sub_{j+1:02d}{chunk_extension}" + ) + + stream = ffmpeg.input(chunk_path, ss=start_time, t=sub_chunk_duration) + stream = ffmpeg.output(stream, sub_chunk_output, c='copy', map='0', + avoid_negative_ts='make_zero') + ffmpeg.run(stream, capture_stdout=True, capture_stderr=True, overwrite_output=True) + + sub_chunks.append(sub_chunk_output) + + sub_size_mb = os.path.getsize(sub_chunk_output) / (1024 * 1024) + logger.info(f"Created sub-chunk {j+1}/{num_sub_chunks}: {sub_size_mb:.1f}MB") + + return sub_chunks + + def validate_and_fix_chunks(self, chunk_paths: List[str]) -> List[str]: + """ + Validate chunk sizes and re-split any chunks exceeding hard limit. + This is the safety net that prevents oversized chunks from reaching the API. 
+ + Args: + chunk_paths: List of chunk file paths + + Returns: + List of valid chunk paths (may include re-split sub-chunks) + """ + valid_chunks = [] + chunks_to_cleanup = [] + + for i, chunk_path in enumerate(chunk_paths, 1): + chunk_size_mb = os.path.getsize(chunk_path) / (1024 * 1024) + encoded_size_mb = chunk_size_mb * self.BASE64_OVERHEAD + + # Check against HARD_LIMIT_MB (~730MB) + if chunk_size_mb > self.hard_limit_mb: + logger.warning( + f"Chunk {i} EXCEEDS hard limit: {chunk_size_mb:.1f}MB > " + f"{self.hard_limit_mb:.1f}MB (would be {encoded_size_mb:.1f}MB encoded)" + ) + logger.info(f"Re-splitting oversized chunk {i}...") + + try: + # Re-split with extra conservative target (80% of safe size) + sub_chunks = self._re_split_chunk(chunk_path, target_size_mb=self.safe_chunk_size_mb * 0.8) + + logger.info(f"Successfully re-split chunk {i} into {len(sub_chunks)} sub-chunks") + valid_chunks.extend(sub_chunks) + chunks_to_cleanup.append(chunk_path) # Mark original for cleanup + + except Exception as e: + logger.error(f"Failed to re-split chunk {i}: {e}") + raise RuntimeError( + f"Chunk {i} is too large ({chunk_size_mb:.1f}MB) and could not be re-split. " + f"Video may have extremely high bitrate. Consider reducing video quality." + ) + else: + # Chunk is valid + logger.info( + f"Chunk {i} validated: {chunk_size_mb:.1f}MB raw, " + f"{encoded_size_mb:.1f}MB encoded ✓" + ) + valid_chunks.append(chunk_path) + + # Cleanup original oversized chunks + for chunk_path in chunks_to_cleanup: + try: + os.remove(chunk_path) + logger.debug(f"Cleaned up oversized chunk: {chunk_path}") + except Exception as e: + logger.warning(f"Could not remove oversized chunk: {e}") + + return valid_chunks + + def split_video(self, video_path: str, output_dir: Optional[str] = None) -> List[str]: + """ + Split video into safe-sized chunks with validation and automatic re-splitting. + + This method: + 1. Calculates optimal splitting strategy using calculate_optimal_chunks() + 2. 
Performs initial split using ffmpeg + 3. Validates all chunks against hard limit + 4. Automatically re-splits any oversized chunks + 5. Returns list of all valid chunks ready for processing + + Args: + video_path: Path to the video file to split + output_dir: Directory to save chunks (default: system temp directory) + + Returns: + List of paths to validated chunk files (all guaranteed < hard limit) + """ + # Calculate optimal splitting strategy + split_info = self.calculate_optimal_chunks(video_path) + + if not split_info['needs_split']: + logger.info("Video within limits, no splitting required") + return [video_path] + + # Create output directory + if output_dir is None: + output_dir = tempfile.mkdtemp(prefix="video_chunks_") + logger.info(f"Using temporary directory for chunks: {output_dir}") + else: + os.makedirs(output_dir, exist_ok=True) + + logger.info(f"Splitting video into {split_info['num_chunks']} chunks...") + + # Perform initial split + try: + chunk_paths = self._split_by_duration( + video_path, + split_info['chunk_duration_seconds'], + output_dir + ) + logger.info(f"Initial split complete: {len(chunk_paths)} chunks created") + + except Exception as e: + logger.error(f"Failed during initial split: {e}") + raise + + # Validate and fix any oversized chunks + logger.info("Validating chunk sizes...") + try: + valid_chunks = self.validate_and_fix_chunks(chunk_paths) + logger.info(f"Validation complete: {len(valid_chunks)} valid chunks ready for processing") + + except Exception as e: + logger.error(f"Failed during chunk validation: {e}") + # Cleanup all chunks on validation failure + self.cleanup_chunks(chunk_paths) + raise + + return valid_chunks + def cleanup_chunks(self, chunk_paths: List[str]) -> None: """ Delete temporary chunk files. @@ -332,6 +576,7 @@ class VideoSplitter: def get_chunk_info(self, video_path: str) -> Tuple[int, float]: """ Get information about how a video would be chunked without actually splitting it. 
+ Uses the robust multi-constraint algorithm (size + duration). Args: video_path: Path to the video file @@ -339,16 +584,19 @@ class VideoSplitter: Returns: Tuple of (number_of_chunks, total_duration_in_minutes) """ - duration = self.get_video_duration(video_path) - if duration is None: - return (0, 0.0) - - duration_minutes = duration / 60 - num_chunks = int(duration / self.chunk_duration_seconds) + ( - 1 if duration % self.chunk_duration_seconds > 0 else 0 - ) - - return (num_chunks, duration_minutes) + try: + split_info = self.calculate_optimal_chunks(video_path) + duration_minutes = self.get_video_duration(video_path) / 60 + return (split_info['num_chunks'], duration_minutes) + except Exception as e: + logger.warning(f"Could not calculate chunk info: {e}") + # Fallback to duration-only logic + duration = self.get_video_duration(video_path) + if duration is None: + return (0, 0.0) + duration_minutes = duration / 60 + num_chunks = math.ceil(duration / self.chunk_duration_seconds) + return (num_chunks, duration_minutes) # Convenience functions for direct use