pdf instructions update check

2025-11-15 04:59:14 +05:30 · 2025-11-15 04:59:14 +05:30 · bec26a02be
commit bec26a02be
parent dc770d65d3
5 changed files with 70 additions and 103 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -17,6 +17,12 @@ A full-stack video query application using Google Gemini 2.0 Flash Exp AI for vi
 cd backend
 python3 -m venv venv
 source venv/bin/activate  # On Windows: venv\Scripts\activate
+
+# For Python 3.10 (use special requirements file)
+pip install -r requirements-py310.txt
+bash fix_jose.sh  # Fix jose module conflict if needed
+
+# For other Python versions
 pip install -r requirements.txt

 # Install system dependencies (Ubuntu/Debian)
@@ -75,8 +81,8 @@ npm run build

 ### Frontend
 - **Development server**: `npm start` (port 3000)
-- **Production build**: `npm run build`
-- **Build with deploy script**: `./build.sh`
+- **Production build**: `./build.sh` (recommended - sets PUBLIC_URL automatically)
+- **Manual production build**: `PUBLIC_URL=/video_query npm run build` (if not using script)

 ### Video Processing
 - **Standalone script**: `python video_query.py <video_path> [--prompt "Your custom prompt"]`
@@ -97,6 +103,12 @@ npm run build
   cd /path/to/video-query/backend
   python3 -m venv venv
   source venv/bin/activate
+
+   # For Python 3.10
+   pip install -r requirements-py310.txt
+   bash fix_jose.sh
+
+   # For other Python versions
   pip install -r requirements.txt
   ```

@@ -150,7 +162,11 @@ EOF
 2. **Build for production**:
   ```bash
   cd frontend
-   npm run build
+   # Option 1: Use build script (recommended - sets PUBLIC_URL automatically)
+   ./build.sh
+
+   # Option 2: Manual build with PUBLIC_URL
+   PUBLIC_URL=/video_query npm run build
   ```

 3. **Deploy to web server**:
@@ -378,6 +394,11 @@ sudo systemctl restart video-query
 # Check ffmpeg installation
 which ffprobe
 ffprobe -version
+
+# Python 3.10: Fix jose module SyntaxError
+# If you see: "SyntaxError: Missing parentheses in call to 'print'"
+cd backend
+bash fix_jose.sh
 ```

 ### Frontend Issues
@@ -396,6 +417,11 @@ tail -f /var/log/apache2/error.log
 tail -f /var/log/nginx/error.log
 ```

+**Static Assets Not Loading** (JS/CSS 404 errors):
+- **Symptom**: `Loading failed for <script> with source "https://domain.com/static/js/main.xxx.js"`
+- **Cause**: Application built without correct PUBLIC_URL
+- **Solution**: Rebuild with `./build.sh` or `PUBLIC_URL=/video_query npm run build`
+
 ### Rate Limiting Issues
 - **Symptom**: 400 INVALID_ARGUMENT after 3-4 videos
 - **Check**: Gemini API rate limits (5 RPM for free tier)
--- a/backend/.env
+++ b/backend/.env
@@ -1,18 +1,18 @@
 GOOGLE_API_KEY=AIzaSyBF3Ia1nVS4PLuLpWt-85ct_heJ7FrlvkQ

-# API Tier Configuration (IMPORTANT!)
-# Set to "free" for free tier (5 RPM) or "paid" for paid tier (60 RPM)
-# This prevents 503 errors by enforcing proper rate limits
-GEMINI_API_TIER=free
+# Video Splitting Configuration
+# Maximum duration per chunk in minutes (default: 53)
+# Videos longer than this will be split into multiple chunks
+CHUNK_DURATION_MINUTES=53

-# Parallel Processing (auto-configured based on tier, uncomment to override)
-# MAX_PARALLEL_CHUNKS=2
+# Parallel Processing Configuration
+# Maximum number of chunks to process concurrently (default: 4)
+MAX_PARALLEL_CHUNKS=4

 # Model Configuration
 VIDEO_PROCESSOR_MODEL=gemini-2.5-pro
 VIDEO_SYNTHESIS_MODEL=gemini-2.5-pro

-
 # Enable logging of prompts sent to AI for each video/chunk
 # Shows exactly what prompt was used for each video in batch
 BATCH_PROCESSING_LOG_PROMPTS=false
--- a/backend/.env.example
+++ b/backend/.env.example
@@ -1,26 +1,21 @@
 # Google Gemini API Key (REQUIRED)
-GOOGLE_API_KEY=your_api_key_here
+GOOGLE_API_KEY=AIzaSyBF3Ia1nVS4PLuLpWt-85ct_heJ7FrlvkQ

 # =============================================================================
-# API TIER AND RATE LIMITING CONFIGURATION (IMPORTANT!)
+# VIDEO PROCESSING CONFIGURATION
 # =============================================================================
-# Set this based on your Gemini API subscription level
-# This prevents 503 UNAVAILABLE errors by enforcing proper rate limits

-# API Tier: "free" or "paid"
-# - free: 5 requests per minute (RPM), 12 seconds between requests, max 2 parallel chunks
-# - paid: 60 requests per minute (RPM), 1 second between requests, max 4-10 parallel chunks
-# Default: free (conservative to prevent 503 overload errors)
-GEMINI_API_TIER=free
+# Video Splitting Configuration
+# Maximum duration per chunk in minutes (default: 53)
+# Videos longer than this will be split into multiple chunks for processing
+# Gemini API limits: ~55 min with audio, ~60 min without audio
+CHUNK_DURATION_MINUTES=53

 # Parallel Processing Configuration
-# Maximum number of video chunks to process simultaneously
-# Recommended values:
-# - Free tier: 1-2 (safe, prevents overload)
-# - Paid tier: 4-10 (faster processing)
-# Default: Auto-configured based on GEMINI_API_TIER (2 for free, 4 for paid)
-# Uncomment to override:
-# MAX_PARALLEL_CHUNKS=2
+# Maximum number of video chunks to process concurrently (default: 4)
+# Higher values = faster processing but more API load
+# Recommended: 4-10 for paid tier with high rate limits
+MAX_PARALLEL_CHUNKS=4

 # =============================================================================
 # MODEL CONFIGURATION (Optional)
@@ -30,7 +25,9 @@ GEMINI_API_TIER=free
 VIDEO_PROCESSOR_MODEL=gemini-2.5-pro
 VIDEO_SYNTHESIS_MODEL=gemini-2.5-pro

-# Batch Processing Logging (Optional)
+# =============================================================================
+# BATCH PROCESSING LOGGING (Optional)
+# =============================================================================
 # Enable detailed logging for batch processing operations
 # Useful for debugging and understanding how videos are processed
 # Default: false (to reduce log volume)
--- a/backend/video_processor.py
+++ b/backend/video_processor.py
@@ -43,18 +43,14 @@ class VideoProcessor:

    # Parallel processing configuration
    # Default max workers for parallel chunk processing
-    # Free tier: 5 RPM (use 1-2 workers to be safe)
-    # Paid tier: 60 RPM (can use 4-10 workers)
-    DEFAULT_MAX_WORKERS = 2  # Conservative default for free tier (reduced from 4 to prevent 503 errors)
+    DEFAULT_MAX_WORKERS = 4  # Default concurrent workers

    # Model configuration
    DEFAULT_PROCESSING_MODEL = "gemini-2.5-pro"  # Model for individual video processing
-    DEFAULT_SYNTHESIS_MODEL = "gemini-2.5-pro"   # Model for batch synthesis (updated for consistency)
+    DEFAULT_SYNTHESIS_MODEL = "gemini-2.5-pro"   # Model for batch synthesis

-    # Rate limiting and retry configuration
-    MIN_REQUEST_INTERVAL_FREE = 12  # seconds (for 5 RPM free tier: 60/5 = 12s)
-    MIN_REQUEST_INTERVAL_PAID = 1   # seconds (for 60 RPM paid tier: 60/60 = 1s)
-    MAX_RETRY_ATTEMPTS = 5          # Maximum retry attempts (increased from 3)
+    # Retry configuration
+    MAX_RETRY_ATTEMPTS = 5          # Maximum retry attempts
    RETRY_DELAYS = [5, 10, 20, 40, 60]  # Exponential backoff delays in seconds

    def __init__(self, api_key: Optional[str] = None, max_parallel_chunks: int = None):
@@ -64,7 +60,7 @@ class VideoProcessor:
        Args:
            api_key: Google API key for Gemini
            max_parallel_chunks: Maximum number of chunks to process in parallel
-                                (default: 4, recommended 3-4 for free tier, up to 10+ for paid tier)
+                                (default: from MAX_PARALLEL_CHUNKS env var or 4)
        """
        self.api_key = api_key or os.getenv("GOOGLE_API_KEY")
        if not self.api_key:
@@ -76,34 +72,22 @@ class VideoProcessor:
        self.client = genai.Client(api_key=self.api_key)
        logger.info("Gemini API client initialized successfully")

-        # Detect API tier (free or paid)
-        self._api_tier = self._detect_api_tier()
-
-        # Set parallel processing configuration based on API tier
+        # Set parallel processing configuration
        if max_parallel_chunks:
            self.max_parallel_chunks = max_parallel_chunks
        else:
-            # Auto-configure based on API tier
+            # Load from environment variable or use default
            env_max_workers = os.getenv("MAX_PARALLEL_CHUNKS")
            if env_max_workers:
                self.max_parallel_chunks = int(env_max_workers)
            else:
-                # Default based on tier
-                if self._api_tier == "paid":
-                    self.max_parallel_chunks = 4
-                else:
-                    self.max_parallel_chunks = 2
+                self.max_parallel_chunks = self.DEFAULT_MAX_WORKERS

-        logger.info(f"Parallel processing: max {self.max_parallel_chunks} concurrent chunks ({self._api_tier} tier)")
+        logger.info(f"Parallel processing: max {self.max_parallel_chunks} concurrent chunks")

        # Initialize video splitter
        self.video_splitter = VideoSplitter()

-        # Thread lock and tracking for rate limiting
-        self._rate_limit_lock = threading.Lock()
-        self._last_request_time = 0
-        self._request_count = 0
-
        # Load configuration from environment variables
        self.processing_model = os.getenv("VIDEO_PROCESSOR_MODEL", self.DEFAULT_PROCESSING_MODEL)
        self.synthesis_model = os.getenv("VIDEO_SYNTHESIS_MODEL", self.DEFAULT_SYNTHESIS_MODEL)
@@ -155,48 +139,6 @@ class VideoProcessor:
            logger.error(f"Error sending usage data to webhook: {str(e)}")
            # Don't raise the exception - webhook failure shouldn't block the main flow

-    def _detect_api_tier(self) -> str:
-        """
-        Detect if using free or paid API tier.
-        Can be overridden with env var: GEMINI_API_TIER=free or GEMINI_API_TIER=paid
-
-        Returns:
-            "free" or "paid"
-        """
-        tier = os.getenv("GEMINI_API_TIER", "free").lower()
-        if tier in ["free", "paid"]:
-            logger.info(f"Using {tier} tier API configuration")
-            return tier
-        logger.warning(f"Unknown API tier '{tier}', defaulting to 'free' for safety")
-        return "free"
-
-    def _wait_for_rate_limit(self) -> None:
-        """
-        Smart rate limiting that respects API tier limits.
-        Free tier: 5 RPM = 12 seconds between requests
-        Paid tier: 60 RPM = 1 second between requests
-
-        This method ensures we don't overwhelm the API with parallel requests.
-        """
-        with self._rate_limit_lock:
-            current_time = time.time()
-            time_since_last = current_time - self._last_request_time
-
-            # Determine minimum interval based on API tier
-            if self._api_tier == "paid":
-                min_interval = self.MIN_REQUEST_INTERVAL_PAID
-            else:
-                min_interval = self.MIN_REQUEST_INTERVAL_FREE
-
-            if time_since_last < min_interval:
-                wait_time = min_interval - time_since_last
-                logger.info(f"Rate limiting: waiting {wait_time:.1f}s before next API call")
-                time.sleep(wait_time)
-
-            self._last_request_time = time.time()
-            self._request_count += 1
-            logger.debug(f"API request #{self._request_count} at {self._last_request_time:.2f}")
-
    def _extract_error_code(self, error_message: str) -> str:
        """
        Extract HTTP error code from error message.
@@ -285,9 +227,6 @@ class VideoProcessor:

        for attempt in range(self.MAX_RETRY_ATTEMPTS):
            try:
-                # Apply rate limiting before each attempt
-                self._wait_for_rate_limit()
-
                # Make the API call
                if attempt == 0:
                    logger.info(f"{context} Sending request to Gemini API")
--- a/backend/video_splitter.py
+++ b/backend/video_splitter.py
@@ -21,17 +21,22 @@ class VideoSplitter:
    Handles video duration detection and splitting operations.
    """

-    # Default chunk duration in minutes (43 min to stay under 45 min Gemini API limit for videos with audio)
-    # Google Gemini 2.5 Pro limits: ~45 min with audio, ~60 min without audio
-    DEFAULT_CHUNK_DURATION = 43
+    # Default chunk duration in minutes (53 min to stay under 55 min Gemini API limit for videos with audio)
+    # Google Gemini 2.5 Pro limits: ~55 min with audio, ~60 min without audio
+    DEFAULT_CHUNK_DURATION = 53

-    def __init__(self, chunk_duration_minutes: int = DEFAULT_CHUNK_DURATION):
+    def __init__(self, chunk_duration_minutes: int = None):
        """
        Initialize VideoSplitter with specified chunk duration.

        Args:
-            chunk_duration_minutes: Duration of each chunk in minutes (default: 54)
+            chunk_duration_minutes: Duration of each chunk in minutes
+                                   (default: from CHUNK_DURATION_MINUTES env var or 53)
        """
+        if chunk_duration_minutes is None:
+            # Load from environment variable or use default
+            chunk_duration_minutes = int(os.getenv("CHUNK_DURATION_MINUTES", self.DEFAULT_CHUNK_DURATION))
+
        self.chunk_duration_minutes = chunk_duration_minutes
        self.chunk_duration_seconds = chunk_duration_minutes * 60
        logger.info(f"VideoSplitter initialized with chunk duration: {chunk_duration_minutes} minutes")
@@ -99,7 +104,7 @@ class VideoSplitter:
        Check if a video needs to be split based on duration OR file size.

        A video needs splitting if:
-        1. Duration > 54 minutes (Gemini API time limit), OR
+        1. Duration > configured chunk duration (default 53 minutes), OR
        2. File size > 500MB (conservative target to handle variable bitrate)
           With 30% variance: 500MB × 1.3 = 650MB max
           After base64 encoding: 650MB × 1.37 = 891MB (well under 1GB API limit)