import os
import tempfile
import uuid
import logging
import sys
import base64
import json
import re
import io
import cairosvg
import pathlib
from PIL import Image, ImageDraw, ImageFont
from flask import Flask, request, jsonify, send_from_directory, send_file
from werkzeug.utils import secure_filename
from werkzeug.exceptions import RequestEntityTooLarge
from dotenv import load_dotenv
from flask_cors import CORS
from chunked_upload import chunked_upload_bp
from auth import require_auth, lenient_auth
import pdfkit
from pdfkit.configuration import Configuration
from bs4 import BeautifulSoup
from system_utils import system_utils
from error_reporter import ErrorReporter, ErrorCategory

# Route every log record to stdout using one timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
)
logger = logging.getLogger('video_query')

# Read the .env file into the environment. This deliberately happens before
# the video_processor import below (presumably that module reads environment
# variables at import time -- confirm before reordering).
load_dotenv()

from video_processor import VideoProcessor

app = Flask(__name__)

# CORS policy shared by every /api/* route (large uploads come from the
# browser front-end, so credentials and the Authorization header are allowed).
_API_CORS_POLICY = {
    "origins": ["https://brandtechsandbox.oliver.solutions", "http://localhost:3000"],
    "supports_credentials": True,
    "methods": ["GET", "POST", "OPTIONS"],
    "allow_headers": ["Content-Type", "X-Requested-With", "Authorization"],
}
CORS(
    app,
    resources={r"/api/*": _API_CORS_POLICY},
    expose_headers=["Content-Disposition", "Authorization"],
)

# Chunked-upload endpoints live in their own blueprint.
app.register_blueprint(chunked_upload_bp)

# --- Upload configuration -------------------------------------------------
_TEMP_ROOT = tempfile.gettempdir()

# Directly uploaded videos are written here before processing.
UPLOAD_FOLDER = os.path.join(_TEMP_ROOT, 'video_query_uploads')
# Largest request body accepted: 5GB.
MAX_CONTENT_LENGTH = 5 * 1024 * 1024 * 1024
os.makedirs(UPLOAD_FOLDER, exist_ok=True)

# Legacy persistent-output settings, kept for reference only (no longer used):
# PERSISTENT_PNG_ROOT_DIR = '/var/www/html/video_query/png_output'  # Filesystem path for PNG files
# PERSISTENT_SVG_ROOT_DIR = '/var/www/html/video_query/svg_output'  # Filesystem path for SVG files
# PERSISTENT_PNG_BASE_URL = 'https://ai-sandbox.oliver.solutions/video_query/png_output'  # Web accessible URL base for PNGs
# PERSISTENT_SVG_BASE_URL = 'https://ai-sandbox.oliver.solutions/video_query/svg_output'  # Web accessible URL base for SVGs

# Scratch directories used while rendering diagrams for PDF generation.
TEMP_PNG_DIR = os.path.join(_TEMP_ROOT, 'video_query_png_temp')
TEMP_SVG_DIR = os.path.join(_TEMP_ROOT, 'video_query_svg_temp')
for _scratch_dir in (TEMP_PNG_DIR, TEMP_SVG_DIR):
    os.makedirs(_scratch_dir, exist_ok=True)

# Push the upload settings into the Flask app.
# NOTE(review): MAX_CONTENT_PATH does not look like a setting Flask itself
# reads; it is kept unchanged in case other project code looks it up.
app.config.update(
    UPLOAD_FOLDER=UPLOAD_FOLDER,
    MAX_CONTENT_LENGTH=MAX_CONTENT_LENGTH,
    MAX_CONTENT_PATH=5 * 1024 * 1024 * 1024,  # 5GB
)

# Single shared processor instance used by the request handlers.
video_processor = VideoProcessor()

# Video file extensions accepted by the direct-upload endpoint.
ALLOWED_EXTENSIONS = {'mp4', 'avi', 'mov', 'wmv', 'mkv', 'webm'}
def allowed_file(filename, allowed_extensions=None):
    """Check if file has an allowed extension.

    Args:
        filename: Name of the uploaded file. ``None`` or an empty string is
            treated as "not allowed" instead of raising.
        allowed_extensions: Optional collection of lower-case extensions
            (without the dot). Defaults to the module-level
            ``ALLOWED_EXTENSIONS``.

    Returns:
        True when the text after the last ``.`` (compared case-insensitively)
        is one of the allowed extensions, otherwise False.
    """
    if allowed_extensions is None:
        # Resolved at call time so the module constant stays the default.
        allowed_extensions = ALLOWED_EXTENSIONS
    if not filename or '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1].lower()
    return extension in allowed_extensions
def _request_user_email():
    """Return the e-mail of the user attached to the current request.

    Falls back to ``preferred_username`` and finally to ``"anonymous"`` when
    ``request.user`` is missing, is not a dict, or holds no usable value.
    """
    user = getattr(request, "user", None)
    if not isinstance(user, dict):
        return "anonymous"
    return user.get("email") or user.get("preferred_username") or "anonymous"


def _remove_uploaded_file(file_path):
    """Best-effort removal of an upload temp file; a failure is only logged."""
    try:
        os.remove(file_path)
        logger.info(f"Cleaned up temporary file: {file_path}")
    except OSError as cleanup_error:
        logger.warning(f"Could not remove temporary file {file_path}: {str(cleanup_error)}")


def _stream_upload_to_disk(file, file_path, chunk_size=1024 * 1024,
                           log_every=50 * 1024 * 1024):
    """Copy an uploaded file object to ``file_path`` in chunks.

    Progress is logged each time another ``log_every`` bytes have been
    written (a threshold comparison, so it also works when reads come back
    shorter than ``chunk_size``). Returns the number of bytes written; any
    error is logged and re-raised.
    """
    total_bytes = 0
    next_log_at = log_every
    try:
        with open(file_path, 'wb') as f:
            while True:
                chunk = file.read(chunk_size)
                if not chunk:
                    break
                f.write(chunk)
                total_bytes += len(chunk)
                if total_bytes >= next_log_at:
                    logger.info(f"Upload progress: {total_bytes / (1024 * 1024):.2f} MB")
                    next_log_at += log_every
    except Exception as chunk_error:
        logger.error(f"Error during chunked upload: {str(chunk_error)}")
        raise
    return total_bytes


def _process_result_response(result):
    """Turn a video-processor result dict into the endpoint's JSON response."""
    if not result['success']:
        logger.error(f"Processing failed: {result['message']}")
        return jsonify({
            'success': False,
            'message': result['message']
        }), 500

    content = result['content']
    content_length = len(content) if content else 0
    logger.info(f"Returning successful response with {content_length} characters")

    response_data = {
        'success': True,
        'content': content
    }
    # Include chunk information if video was processed in chunks
    if result.get('chunks_processed', 0) > 1:
        response_data['chunks_processed'] = result['chunks_processed']
        response_data['total_chunks'] = result.get('total_chunks', result['chunks_processed'])
    return jsonify(response_data)


@app.route('/api/process', methods=['POST'])
@lenient_auth
def process_video():
    """Process an uploaded video with the supplied prompt.

    Two request shapes are accepted:

    * JSON body (``file_path``, ``filename``, ``prompt``): the video is
      already on disk (chunked upload) and is handed to
      ``video_processor.process_video_auto``.
    * multipart form (``video`` file + ``prompt`` field): the file is streamed
      into ``UPLOAD_FOLDER`` and handed to ``video_processor.process_video``.

    Returns:
        A JSON response: 200 with ``content`` (plus chunk counters when the
        video was processed in several chunks), 400 for invalid input, 413
        when the upload exceeds the size limit, 500 when processing fails or
        an unexpected error occurs (reported through ``ErrorReporter``).

    The video file is removed once processing has started, whether it
    succeeds, fails or raises.
    """
    logger.info("API request received: /api/process")
    logger.info(f"Content-Type: {request.content_type}")
    logger.info(f"Content-Length: {request.content_length}")

    # Path this request is responsible for deleting. It is only set once
    # processing is about to start, so validation failures never delete
    # anything.
    cleanup_path = None
    try:
        # Handle chunked upload case
        if request.is_json:
            data = request.get_json(silent=True)
            if not isinstance(data, dict):
                data = {}
            file_path = data.get('file_path')
            filename = data.get('filename')
            prompt = data.get('prompt')

            # NOTE(review): file_path is supplied by the client and is both
            # read and deleted below. Confirm the chunked-upload blueprint
            # only hands out paths inside a trusted directory, or add a
            # containment check here.
            if not isinstance(file_path, str) or not file_path or not os.path.exists(file_path):
                logger.error(f"File path not found: {file_path}")
                return jsonify({'success': False, 'message': 'Uploaded file not found'}), 400

            if not prompt:
                logger.error("No prompt provided")
                return jsonify({'success': False, 'message': 'No prompt provided'}), 400

            user_email = _request_user_email()
            logger.info(f"Processing chunked upload from {file_path} ({filename}) for user: {user_email}")

            cleanup_path = file_path
            # Use auto-processing which handles both short and long videos
            result = video_processor.process_video_auto(file_path, prompt, user_email)
            return _process_result_response(result)

        # Standard direct upload method (for small files).
        # request.files is read inside the try block so that an oversized
        # body is answered by the 413 handler below.
        if 'video' not in request.files:
            logger.error("No video file in request")
            return jsonify({'success': False, 'message': 'No video file provided'}), 400

        file = request.files['video']
        prompt = request.form.get('prompt', '')

        logger.info(f"Received file: {file.filename}")
        logger.info(f"Prompt length: {len(prompt)} characters")

        # Check for empty filename
        if file.filename == '':
            logger.error("Empty filename provided")
            return jsonify({'success': False, 'message': 'No video selected'}), 400

        if not prompt:
            logger.error("No prompt provided")
            return jsonify({'success': False, 'message': 'No prompt provided'}), 400

        # Check file extension
        if not allowed_file(file.filename):
            logger.error(f"Invalid file type: {file.filename}")
            return jsonify({
                'success': False,
                # sorted() keeps the message stable between runs (set order is not).
                'message': f'Invalid file type. Allowed types: {", ".join(sorted(ALLOWED_EXTENSIONS))}'
            }), 400

        # Make sure upload directory exists
        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
        logger.info(f"Upload directory: {app.config['UPLOAD_FOLDER']}")

        # Generate a unique filename to prevent collisions
        original_filename = secure_filename(file.filename)
        unique_filename = f"{uuid.uuid4()}_{original_filename}"
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], unique_filename)
        logger.info(f"Writing to: {file_path}")

        cleanup_path = file_path
        total_bytes = _stream_upload_to_disk(file, file_path)
        logger.info(f"File saved: {file_path} ({total_bytes} bytes)")

        user_email = _request_user_email()

        # Process the video
        logger.info(f"Starting video processing for user: {user_email}...")
        result = video_processor.process_video(file_path, prompt, user_email)
        logger.info(f"Processing result: success={result['success']}")

        # Log if it was processed in chunks
        if result.get('chunks_processed', 0) > 1:
            logger.info(f"Video was processed in {result['chunks_processed']} chunks")

        return _process_result_response(result)

    except RequestEntityTooLarge:
        logger.error(f"File too large: {request.content_length} bytes")
        return jsonify({
            'success': False,
            'message': 'The uploaded file is too large (max 5GB)'
        }), 413
    except Exception as e:
        import traceback
        error_report = ErrorReporter.capture_error(
            e,
            context={
                'endpoint': '/api/process',
                'content_type': request.content_type,
                'is_json': request.is_json
            }
        )
        error_trace = traceback.format_exc()
        logger.error(f"Error processing video: {str(e)}")
        logger.error(error_trace)
        return jsonify({
            'success': False,
            'message': error_report.format_user_message(),
            'error_id': error_report.error_id,
            'error_category': error_report.category.value
        }), 500
    finally:
        # Runs on success, failure and exceptions alike, so a crashed
        # processing run does not leave the video behind in the temp dir.
        if cleanup_path is not None and os.path.exists(cleanup_path):
            _remove_uploaded_file(cleanup_path)
@app.route('/api/process-batch', methods=['POST'])
@lenient_auth
def process_batch():
    """
    Process multiple videos as a continuous batch.

    Videos are treated as sequential segments of one long video.

    Expects a JSON body with:
        videos: list of objects ``{file_path, filename, order}`` (1 to 10
            entries); entries are processed in ascending ``order``.
        prompt: non-empty prompt text.
        batch_id: optional identifier; a UUID is generated when absent.

    Returns:
        A JSON response: 200 with ``content``, ``videos_processed``,
        ``chunks_processed`` and ``processing_time``; 400 for invalid input;
        500 when processing fails or raises (reported via ``ErrorReporter``).

    Once processing has started, the listed files are removed whether it
    succeeds, fails or raises.
    """
    import time
    import traceback

    logger.info("API request received: /api/process-batch")

    if not request.is_json:
        logger.error("Request is not JSON")
        return jsonify({'success': False, 'error': 'JSON request required'}), 400

    # Bound before the try block so the error handler can always read it.
    data = {}
    # Files this request must delete; filled in just before processing so
    # that validation failures never delete anything.
    owned_paths = []
    try:
        payload = request.get_json(silent=True)
        if isinstance(payload, dict):
            data = payload

        videos = data.get('videos') or []  # List of {file_path, filename, order}
        prompt = data.get('prompt') or ''
        batch_id = data.get('batch_id') or str(uuid.uuid4())

        # Get user email if available
        user_email = 'anonymous'
        user = getattr(request, 'user', None)
        if isinstance(user, dict):
            user_email = (user.get('email') or user.get('preferred_username')
                          or user.get('name') or 'anonymous')

        # Validation
        if not isinstance(videos, list) or not videos:
            logger.error("No videos provided")
            return jsonify({
                'success': False,
                'error': 'No videos provided'
            }), 400

        logger.info(f"Batch {batch_id}: Processing {len(videos)} videos for user {user_email}")

        if len(videos) > 10:
            logger.error(f"Too many videos in batch: {len(videos)}")
            return jsonify({
                'success': False,
                'error': 'Maximum 10 videos per batch'
            }), 400

        if not isinstance(prompt, str) or not prompt.strip():
            logger.error("No prompt provided")
            return jsonify({
                'success': False,
                'error': 'Prompt is required'
            }), 400

        if not all(isinstance(video, dict) for video in videos):
            logger.error("Malformed video entry in batch")
            return jsonify({
                'success': False,
                'error': 'Each video must be an object with file_path and filename'
            }), 400

        # Sort by order (a missing or null order sorts as 0)
        videos = sorted(videos, key=lambda video: video.get('order') or 0)

        # Validate all files exist and collect the processor arguments.
        # NOTE(review): file_path values are supplied by the client and are
        # deleted after processing. Confirm the upload endpoints only hand
        # out paths inside a trusted directory, or add a containment check.
        video_paths = []
        filenames = []
        for video in videos:
            file_path = video.get('file_path')
            if not isinstance(file_path, str) or not file_path or not os.path.exists(file_path):
                logger.error(f"File not found: {video.get('filename')}")
                return jsonify({
                    'success': False,
                    'error': f"File not found: {video.get('filename')}"
                }), 400
            video_paths.append(file_path)
            # A missing filename falls back to the path's base name instead
            # of raising KeyError.
            filenames.append(video.get('filename') or os.path.basename(file_path))

        logger.info(f"Batch {batch_id}: Video order: {filenames}")

        # Process batch
        owned_paths = video_paths
        start_time = time.monotonic()
        logger.info(f"Batch {batch_id}: Starting batch processing")

        result = video_processor.process_video_batch(
            video_paths=video_paths,
            filenames=filenames,
            prompt=prompt,
            user_email=user_email,
            batch_id=batch_id
        )

        processing_time = time.monotonic() - start_time
        logger.info(f"Batch {batch_id}: Processing completed in {processing_time:.2f}s")

        # process_video_batch's return contract is not visible here; this
        # assumes it mirrors process_video ('success' / 'message') -- TODO
        # confirm. A result without a 'success' key is still treated as a
        # success, which matches the previous behaviour.
        if not result.get('success', True):
            failure_message = (result.get('message') or result.get('error')
                               or 'Batch processing failed')
            logger.error(f"Batch {batch_id}: Processing failed: {failure_message}")
            return jsonify({
                'success': False,
                'error': failure_message
            }), 500

        return jsonify({
            'success': True,
            'content': result.get('content', ''),
            'videos_processed': len(videos),
            'chunks_processed': result.get('total_chunks', 0),
            'processing_time': f"{processing_time:.2f}s"
        })

    except Exception as e:
        reported_videos = data.get('videos')
        error_report = ErrorReporter.capture_error(
            e,
            context={
                'endpoint': '/api/process-batch',
                'num_videos': len(reported_videos) if isinstance(reported_videos, list) else 0,
                'batch_id': data.get('batch_id', 'unknown')
            }
        )
        error_trace = traceback.format_exc()
        logger.error(f"Batch processing error: {str(e)}")
        logger.error(error_trace)
        return jsonify({
            'success': False,
            'error': error_report.format_user_message(),
            'error_id': error_report.error_id,
            'error_category': error_report.category.value
        }), 500
    finally:
        # Cleanup all temporary files, also when processing raised.
        for temp_path in owned_paths:
            try:
                if os.path.exists(temp_path):
                    os.remove(temp_path)
                    logger.info(f"Cleaned up temporary file: {temp_path}")
            except OSError as cleanup_error:
                logger.warning(f"Failed to delete {temp_path}: {cleanup_error}")
# Test route to verify authentication
@app.route('/api/auth-test', methods=['GET'])
@lenient_auth
def auth_test():
    """Test endpoint to verify authentication is working.

    Returns:
        JSON with ``authenticated``, the user's display name, whether an
        ``Authorization`` header was sent, and the subset of ``name``,
        ``preferred_username`` and ``email`` present on ``request.user``.
    """
    # request.user is only read when it is a dict (the same guard
    # process_video uses); any other value is treated like a missing user
    # instead of raising.
    user = getattr(request, "user", None)
    has_user = isinstance(user, dict)

    if has_user:
        display_name = user.get("name", "Anonymous")
        token_info = {
            key: user.get(key)
            for key in ("name", "preferred_username", "email")
            if key in user
        }
    else:
        display_name = "Unknown"
        token_info = {}

    user_info = {
        # NOTE(review): reported as True unconditionally, even when no user
        # is attached to the request -- confirm this is intended.
        "authenticated": True,
        "user": display_name,
        "token_present": "Authorization" in request.headers,
        "token_info": token_info
    }

    logger.info(f"Auth test: {user_info}")
    return jsonify(user_info)
{}) svg_diagrams = data.get('svgDiagrams', {}) diagram_png_data_urls = data.get('diagramPngs', {}) video_file_name = data.get('videoFileName', '') # Log detailed request information logger.info(f"Request data: HTML content length: {len(html_content) if html_content else 0}") logger.info(f"Text diagrams received: {len(text_diagrams)}") logger.info(f"SVG diagrams received: {len(svg_diagrams)}") logger.info(f"Diagram PNGs received: {len(diagram_png_data_urls)}") logger.info(f"Video filename received: {video_file_name if video_file_name else 'None'}") # Comment out full HTML content logging # logger.info("HTML CONTENT RECEIVED START -------------------") # logger.info(html_content) # logger.info("HTML CONTENT RECEIVED END ---------------------") if text_diagrams: logger.info(f"Text diagram keys: {list(text_diagrams.keys())}") if svg_diagrams: logger.info(f"SVG diagram keys: {list(svg_diagrams.keys())}") for key, value in svg_diagrams.items(): logger.info(f"SVG diagram {key}: starts with data:image/svg+xml;base64: {value.startswith('data:image/svg+xml;base64,') if value else False}") if diagram_png_data_urls: logger.info(f"Diagram PNG keys: {list(diagram_png_data_urls.keys())}") for key, value in diagram_png_data_urls.items(): logger.info(f"Diagram PNG {key}: starts with data:image/png;base64: {value.startswith('data:image/png;base64,') if value else False} (length: {len(value) if value else 0})") if not html_content: logger.error("No HTML content provided") return jsonify({'success': False, 'message': 'No HTML content provided'}), 400 try: # Create a temporary directory for PDF and HTML file, not necessarily for images temp_dir_for_pdf = tempfile.mkdtemp() pdf_path = os.path.join(temp_dir_for_pdf, f"response_{uuid.uuid4()}.pdf") # Process HTML to replace mermaid divs with image tags processed_html = html_content processed_svg_ids = set() # Decide whether to use web URLs or file URIs for pdfkit # Always use file:/// URIs with enable-local-file-access # 
USE_WEB_URLS_FOR_PDFKIT = False # This is no longer needed # We now use temp directories and local file paths for all images # Create a subdirectory for images in the temp dir (for the HTML structure) img_dir = os.path.join(temp_dir_for_pdf, "images") os.makedirs(img_dir, exist_ok=True) logger.info("HTML content before processing:") logger.info(f"HTML contains '.mermaid' class: {'class=mermaid' in html_content}") logger.info(f"HTML contains mermaid code blocks: {'```mermaid' in html_content or 'graph TD' in html_content}") # First approach: Manually look for the mermaid pattern in the HTML before any processing pattern1 = r']*class=.?mermaid.?[^>]*>(.*?)' pattern2 = r'
(graph\s+TD.*?)
' pattern3 = r'graph\s+TD' mermaid_matches1 = re.findall(pattern1, html_content, re.DOTALL) mermaid_matches2 = re.findall(pattern2, html_content, re.DOTALL) mermaid_matches3 = re.findall(pattern3, html_content, re.DOTALL) logger.info(f"Mermaid div matches: {len(mermaid_matches1)}") if mermaid_matches1: for i, m in enumerate(mermaid_matches1): logger.info(f"Mermaid div content {i} (first 100 chars): {m[:100]}") logger.info(f"Mermaid code block matches: {len(mermaid_matches2)}") if mermaid_matches2: for i, m in enumerate(mermaid_matches2): logger.info(f"Mermaid code content {i} (first 100 chars): {m[:100]}") logger.info(f"Mermaid graph TD matches: {len(mermaid_matches3)}") # First, prioritize using the frontend-generated PNGs if available if diagram_png_data_urls: logger.info(f"Processing {len(diagram_png_data_urls)} PNG diagrams provided by frontend.") # Parse the HTML with BeautifulSoup ONCE before the loop soup = BeautifulSoup(processed_html, 'html.parser') for diagram_id, png_data_url in diagram_png_data_urls.items(): unique_png_filename = f"{diagram_id}_{uuid.uuid4()}.png" temp_png_path = os.path.join(TEMP_PNG_DIR, unique_png_filename) image_source_for_pdfkit = None try: if not png_data_url.startswith('data:image/png;base64,'): logger.warning(f"Unsupported PNG data URL format for {diagram_id}") raise ValueError("Unsupported PNG data URL format") base64_png_content = png_data_url.split(',', 1)[1] png_bytes = base64.b64decode(base64_png_content) with open(temp_png_path, 'wb') as f: f.write(png_bytes) if not os.path.exists(temp_png_path) or os.path.getsize(temp_png_path) == 0: logger.error(f"PNG for {diagram_id} (from frontend PNG) was not saved or is empty at {temp_png_path}.") raise ValueError("PNG saving failed or empty") logger.info(f"Saved frontend-generated PNG for {diagram_id} to: {temp_png_path} (size: {os.path.getsize(temp_png_path)} bytes)") # We no longer use web URLs, always use local file path image_source_for_pdfkit = 
pathlib.Path(temp_png_path).as_uri() alt_text = f"Diagram: {text_diagrams.get(diagram_id, diagram_id)[:50].replace('<', '<').replace('>', '>')}..." # --- MODIFIED REPLACEMENT using BeautifulSoup --- target_div = soup.find('div', id=diagram_id) if target_div: # Create the new img tag as a BeautifulSoup object new_img_tag_soup = soup.new_tag('img', src=image_source_for_pdfkit, alt=alt_text) new_img_tag_soup['style'] = "max-width:100%; margin:20px auto; display:block; border:1px solid #eee;" # Replace the target div with our new img tag target_div.replace_with(new_img_tag_soup) logger.info(f"Replaced div with id='{diagram_id}' using its frontend-generated PNG (src: {image_source_for_pdfkit}) via BeautifulSoup.") processed_svg_ids.add(diagram_id) else: logger.warning(f"PNG_WARN: Could not find div with id='{diagram_id}' in the HTML to replace with frontend PNG using BeautifulSoup.") # Fallback to replacing code block if div with ID isn't found original_code_for_png = text_diagrams.get(diagram_id) if original_code_for_png: # Try to find a pre/code block with matching content code_blocks = soup.find_all('pre') for code_block in code_blocks: code_el = code_block.find('code') if code_el and original_code_for_png.strip() in code_el.text.strip(): # Create new img tag new_img_tag_soup_fallback = soup.new_tag('img', src=image_source_for_pdfkit, alt=alt_text) new_img_tag_soup_fallback['style'] = "max-width:100%; margin:20px auto; display:block; border:1px solid #eee;" # Replace the code block with the img tag code_block.replace_with(new_img_tag_soup_fallback) logger.info(f"PNG_WARN_RECOVERY: Replaced a code block matching content of diagram {diagram_id} with its frontend-PNG img tag via BeautifulSoup.") processed_svg_ids.add(diagram_id) break else: logger.warning(f"PNG_WARN_FAIL: Also failed to find a code block for diagram {diagram_id} content for frontend-PNG replacement with BeautifulSoup.") except Exception as e_png_proc: logger.error(f"Error processing provided PNG for 
diagram_id '{diagram_id}': {str(e_png_proc)}") # Create a placeholder image indicating the error for this specific diagram try: img_err = Image.new('RGB', (500, 150), color=(255, 230, 230)) # Light red draw_err = ImageDraw.Draw(img_err) # Consider ImageFont.truetype for specific fonts/sizes if default is too small title_font = ImageFont.load_default() text_font = ImageFont.load_default() draw_err.text((10, 10), f"Error rendering diagram:", fill=(128, 0, 0), font=title_font) draw_err.text((10, 30), f"ID: {diagram_id}", fill=(100, 0, 0), font=text_font) draw_err.text((10, 50), f"Details: {str(e_png_proc)[:80]}", fill=(100, 0, 0), font=text_font) if text_diagrams.get(diagram_id): draw_err.text((10,70), f"Code: {text_diagrams[diagram_id][:60]}...", fill=(100,0,0), font=text_font) with open(temp_png_path, 'wb') as f_err: # Save error image with the same name pattern img_err.save(f_err, 'PNG') logger.info(f"Created error placeholder image for {diagram_id} at {temp_png_path}") # We no longer use web URLs, always use local file path image_source_for_pdfkit = pathlib.Path(temp_png_path).as_uri() # Find and replace the target div with the error image target_div_err = soup.find('div', id=diagram_id) if target_div_err: new_err_img_tag = soup.new_tag('img', src=image_source_for_pdfkit, alt=f"Error rendering diagram {diagram_id}") new_err_img_tag['style'] = "max-width:100%; margin:20px auto; display:block; border: 2px solid red;" target_div_err.replace_with(new_err_img_tag) logger.info(f"Replaced div with id='{diagram_id}' using an error placeholder image via BeautifulSoup.") processed_svg_ids.add(diagram_id) else: logger.error(f"Could not find div with id='{diagram_id}' to replace with error placeholder image.") except Exception as e_placeholder_img: logger.error(f"Failed to create error placeholder image for {diagram_id}: {str(e_placeholder_img)}") # Try to insert a simple error paragraph if div is found target_div_err2 = soup.find('div', id=diagram_id) if target_div_err2: 
error_p = soup.new_tag('p') error_p['style'] = "color:red; border:1px solid red; padding:10px;" error_p.string = f"[Error processing diagram: {diagram_id} - {str(e_png_proc)[:50]}]" target_div_err2.replace_with(error_p) logger.info(f"Replaced div with id='{diagram_id}' with a simple error message via BeautifulSoup.") processed_svg_ids.add(diagram_id) # After processing all PNG diagrams, update processed_html processed_html = str(soup) logger.info("Completed BeautifulSoup processing of all PNG diagrams") # Fallback to using SVG diagrams if provided if svg_diagrams: logger.info(f"Processing {len(svg_diagrams)} SVG diagrams provided by frontend.") # Ensure we're working with a BeautifulSoup object if 'soup' not in locals() or not isinstance(soup, BeautifulSoup): soup = BeautifulSoup(processed_html, 'html.parser') for diagram_id, svg_data_url in svg_diagrams.items(): # Skip if this diagram ID was already processed in the PNG section if diagram_id in processed_svg_ids: logger.info(f"Skipping SVG for diagram_id '{diagram_id}' as it was already processed in PNG section.") continue # Generate a unique filename for the persistent storage to avoid collisions unique_png_filename = f"{diagram_id}_{uuid.uuid4()}.png" temp_png_path = os.path.join(TEMP_PNG_DIR, unique_png_filename) image_source_for_pdfkit = None try: logger.info(f"Processing diagram ID: {diagram_id}") if not svg_data_url.startswith('data:image/svg+xml;base64,'): logger.warning(f"Unsupported SVG data URL format for {diagram_id}: {svg_data_url[:30]}...") raise ValueError("Unsupported SVG data URL format") # Extract base64 content base64_data = svg_data_url.split(',', 1)[1] logger.info(f"Base64 data length: {len(base64_data)}") # Decode the base64 data svg_bytes = base64.b64decode(base64_data) logger.info(f"Decoded SVG data length: {len(svg_bytes)}") # Save the SVG data to the temporary SVG directory temp_svg_filename = f"{diagram_id}_{uuid.uuid4()}.svg" temp_svg_path = os.path.join(TEMP_SVG_DIR, temp_svg_filename) 
with open(temp_svg_path, 'wb') as f: f.write(svg_bytes) logger.info(f"Saved SVG data to {temp_svg_path} (size: {len(svg_bytes)} bytes)") # Convert SVG to PNG using cairosvg with white background png_data = cairosvg.svg2png(bytestring=svg_bytes, scale=2.0, background_color="white") with open(temp_png_path, 'wb') as f: f.write(png_data) if not os.path.exists(temp_png_path) or os.path.getsize(temp_png_path) == 0: logger.error(f"PNG for {diagram_id} (from SVG) was not created or is empty at {temp_png_path}.") raise ValueError("PNG creation failed or empty") logger.info(f"Generated PNG for {diagram_id} from SVG: {temp_png_path} (size: {os.path.getsize(temp_png_path)} bytes)") # We no longer use web URLs, always use local file path image_source_for_pdfkit = pathlib.Path(temp_png_path).as_uri() alt_text = f"Mermaid Diagram: {text_diagrams.get(diagram_id, diagram_id)[:50].replace('<', '<').replace('>', '>')}..." # --- MODIFIED REPLACEMENT using BeautifulSoup --- target_div_svg = soup.find('div', id=diagram_id) if target_div_svg: # Create the new img tag as a BeautifulSoup object new_img_tag_soup_svg = soup.new_tag('img', src=image_source_for_pdfkit, alt=alt_text) new_img_tag_soup_svg['style'] = "max-width:100%; margin:20px auto; display:block; border:1px solid #eee;" # Replace the target div with our new img tag target_div_svg.replace_with(new_img_tag_soup_svg) logger.info(f"Replaced div with id='{diagram_id}' using its SVG-generated PNG via BeautifulSoup.") processed_svg_ids.add(diagram_id) else: logger.warning(f"SVG_WARN: Could not find div with id='{diagram_id}' for SVG replacement using BeautifulSoup.") # Try to find a code block with matching content from textDiagrams original_code_for_svg = text_diagrams.get(diagram_id) if original_code_for_svg and os.path.exists(temp_png_path): # Try to find matching code blocks code_blocks = soup.find_all('pre') for code_block in code_blocks: code_el = code_block.find('code') if code_el and original_code_for_svg.strip() in 
code_el.text.strip(): # Create new img tag new_img_tag_soup_svg_fallback = soup.new_tag('img', src=image_source_for_pdfkit, alt=alt_text) new_img_tag_soup_svg_fallback['style'] = "max-width:100%; margin:20px auto; display:block; border:1px solid #eee;" # Replace the code block with the img tag code_block.replace_with(new_img_tag_soup_svg_fallback) logger.info(f"SVG_WARN_RECOVERY: Replaced a code block matching content for diagram {diagram_id} with its SVG-PNG img tag via BeautifulSoup.") processed_svg_ids.add(diagram_id) break else: logger.warning(f"SVG_WARN_FAIL: Failed to find a matching code block for SVG diagram {diagram_id} with BeautifulSoup.") except Exception as e_svg_proc: logger.error(f"Error processing provided SVG for diagram_id '{diagram_id}': {str(e_svg_proc)}") # Create a placeholder image indicating the error for this specific diagram try: img_err = Image.new('RGB', (500, 150), color=(255, 230, 230)) # Light red draw_err = ImageDraw.Draw(img_err) # Consider ImageFont.truetype for specific fonts/sizes if default is too small title_font = ImageFont.load_default() text_font = ImageFont.load_default() draw_err.text((10, 10), f"Error rendering diagram:", fill=(128, 0, 0), font=title_font) draw_err.text((10, 30), f"ID: {diagram_id}", fill=(100, 0, 0), font=text_font) draw_err.text((10, 50), f"Details: {str(e_svg_proc)[:80]}", fill=(100, 0, 0), font=text_font) if text_diagrams.get(diagram_id): draw_err.text((10,70), f"Code: {text_diagrams[diagram_id][:60]}...", fill=(100,0,0), font=text_font) with open(temp_png_path, 'wb') as f_err: # Save error image with the same name pattern img_err.save(f_err, 'PNG') logger.info(f"Created error placeholder image for SVG diagram {diagram_id} at {temp_png_path}") # We no longer use web URLs, always use local file path image_source_for_pdfkit = pathlib.Path(temp_png_path).as_uri() # Find and replace the target div with the error image target_div_svg_err = soup.find('div', id=diagram_id) if target_div_svg_err: 
new_err_img_tag_svg = soup.new_tag('img', src=image_source_for_pdfkit, alt=f"Error rendering SVG diagram {diagram_id}") new_err_img_tag_svg['style'] = "max-width:100%; margin:20px auto; display:block; border: 2px solid red;" target_div_svg_err.replace_with(new_err_img_tag_svg) logger.info(f"Replaced div with id='{diagram_id}' using an SVG error placeholder image via BeautifulSoup.") processed_svg_ids.add(diagram_id) else: logger.error(f"Could not find div with id='{diagram_id}' to replace with SVG error placeholder image.") except Exception as e_placeholder_img: logger.error(f"Failed to create SVG error placeholder image for {diagram_id}: {str(e_placeholder_img)}") # Try to insert a simple error paragraph if div is found target_div_svg_err2 = soup.find('div', id=diagram_id) if target_div_svg_err2: error_p_svg = soup.new_tag('p') error_p_svg['style'] = "color:red; border:1px solid red; padding:10px;" error_p_svg.string = f"[Error processing SVG diagram: {diagram_id} - {str(e_svg_proc)[:50]}]" target_div_svg_err2.replace_with(error_p_svg) logger.info(f"Replaced div with id='{diagram_id}' with a simple SVG error message via BeautifulSoup.") processed_svg_ids.add(diagram_id) # After processing all SVG diagrams, update processed_html processed_html = str(soup) logger.info("Completed BeautifulSoup processing of all SVG diagrams") # Fallback for any mermaid code blocks/divs *not* covered by processed_svg_ids # This typically means the frontend didn't send an SVG for them, or all replacement attempts above failed. # Fallback for remaining
(that might not have had a corresponding SVG) logger.info("Fallback: Looking for any remaining
not already handled.") temp_processed_html_list = [] last_end = 0 # More specific regex for class="mermaid" and also capturing ID if present div_fallback_pattern = r'(]*class\s*=\s*["\']?[^"\']*mermaid[^"\']*["\']?[^>]*>(?:.*?)
)' for match_obj in re.finditer(div_fallback_pattern, processed_html, flags=re.DOTALL | re.IGNORECASE): start, end = match_obj.span() div_html_segment = match_obj.group(1) # Check if this div has an ID that was already processed id_in_div_match = re.search(r'\bid\s*=\s*["\']?([^"\s\'<>]+)["\']?', div_html_segment, re.IGNORECASE) current_div_id = None if id_in_div_match: current_div_id = id_in_div_match.group(1) if current_div_id in processed_svg_ids: # Skip this div as it was already processed by svgDiagrams temp_processed_html_list.append(processed_html[last_end:end]) last_end = end logger.info(f"Fallback Div: Skipping div with id='{current_div_id}' as it's in processed_svg_ids.") continue # This div was not handled by a provided SVG. Generate text-based placeholder. logger.warning(f"Fallback Div: Processing
at {start}-{end} (ID: {current_div_id}) not in processed_svg_ids. Generating text placeholder.") # Try to extract diagram code from the div soup_div = BeautifulSoup(div_html_segment, 'html.parser') diagram_text_content = soup_div.get_text(separator='\n', strip=True) or "No text in div" # Also check if we have this in textDiagrams if current_div_id and current_div_id in text_diagrams: diagram_text_content = text_diagrams[current_div_id] # Generate a unique ID for this fallback image fallback_uuid = str(uuid.uuid4())[:8] placeholder_img_name = f"fallback_div_{fallback_uuid}.png" placeholder_path = os.path.join(TEMP_PNG_DIR, placeholder_img_name) try: img = Image.new('RGB', (800, 300), color=(240, 240, 240)) draw = ImageDraw.Draw(img) draw.text((10, 10), "Mermaid Diagram (Fallback Render)", fill=(50, 50, 50)) draw.text((10, 30), f"ID: {current_div_id or 'N/A'}", fill=(50, 50, 50)) y_pos = 50 for i, line in enumerate(diagram_text_content.split('\n')[:15]): # Show more lines draw.text((10, y_pos), line[:80], fill=(50, 50, 50)) y_pos += 15 with open(placeholder_path, 'wb') as f: img.save(f, 'PNG') # We no longer use web URLs, always use local file path fallback_image_src = pathlib.Path(placeholder_path).as_uri() img_tag = f'Fallback Mermaid Diagram' except Exception as e_pil: logger.error(f"Fallback Div: Error creating image for {current_div_id}: {e_pil}") img_tag = f"
[Mermaid Diagram Code]:\n{diagram_text_content[:500]}
" temp_processed_html_list.append(processed_html[last_end:start]) temp_processed_html_list.append(img_tag) last_end = end if current_div_id: processed_svg_ids.add(current_div_id) # Mark as handled # Add the remaining content after the last match temp_processed_html_list.append(processed_html[last_end:]) processed_html = "".join(temp_processed_html_list) # Process any remaining mermaid code blocks that weren't already handled logger.info("Fallback: Looking for any remaining mermaid code blocks not explicitly handled by ID.") # More specific pattern for
... or similar structures
        # Avoid overly broad patterns like raw 'graph TD'
        # This pattern tries to capture the code within a language-mermaid block
        code_block_pattern = r'(]*>\s*]*class\s*=\s*["\']?[^"\']*language-mermaid[^"\']*["\']?[^>]*>([\s\S]*?)\s*
)' temp_processed_html_list_codeblocks = [] last_end_codeblocks = 0 for match_obj in re.finditer(code_block_pattern, processed_html, flags=re.DOTALL | re.IGNORECASE): start, end = match_obj.span() full_match_html = match_obj.group(1) # The whole
...
diagram_content = match_obj.group(2).strip() # Just the code logger.info(f"Found potential unhandled mermaid code block. Content starts: {diagram_content[:50]}...") # Try to find if this diagram_content matches any ID in textDiagrams # And if that ID has *already* been processed (i.e., an tag was made) is_already_processed_by_id = False matched_original_id = None for diag_id, original_code_from_textdiagrams in text_diagrams.items(): # Simple check: if the extracted diagram_content is very similar to original_code # This might need a more sophisticated similarity check. if diagram_content == original_code_from_textdiagrams.strip(): matched_original_id = diag_id if diag_id in processed_svg_ids: is_already_processed_by_id = True logger.info(f"Code block content matches diagram ID '{diag_id}' which is in processed_svg_ids. Skipping fallback.") break else: logger.info(f"Code block content matches diagram ID '{diag_id}' which was NOT in processed_svg_ids. Will attempt SVG render if available.") break # Found a match, even if not processed by ID yet temp_processed_html_list_codeblocks.append(processed_html[last_end_codeblocks:start]) # Content before this match if is_already_processed_by_id: # This code block corresponds to an image already inserted. # The original
 block should be removed or replaced by the image if it wasn't already.
                # Since the image replacement by ID targets 
, this
 might still be there.
                # For safety, if it was already processed, we should ensure this 
 block is GONE.
                # However, the primary image replacement should have taken care of the visual aspect.
                # If the pre block is still there, it's a problem with the primary replacement not being thorough.
                # For now, let's assume if is_already_processed_by_id, we don't want to add anything new here.
                # We might actually want to ensure this 'full_match_html' is *removed* if its corresponding img is present.
                # This gets complex. Let's first focus on not *adding* duplicates.
                # If an image was already made, we effectively want to remove this 
 block.
                # So, we append nothing here for this specific match.
                logger.info(f"Skipping rendering for code block of diagram {matched_original_id} as it was already processed by ID.")
                # Effectively, this removes the 
 block if its content was for an already-rendered image.
            else:
                # This code block was NOT processed by ID (or didn't match any known ID).
                # Try to render it now.
                img_tag_for_code_block = None
                # Check if we have an SVG for it (if matched_original_id was found but not in processed_svg_ids)
                if matched_original_id and matched_original_id in svg_diagrams and svg_diagrams[matched_original_id].startswith('data:image/svg+xml;base64,'):
                    # Generate PNG from SVG
                    try:
                        base64_data = svg_diagrams[matched_original_id].split(',')[1]
                        svg_data_decoded = base64.b64decode(base64_data)
                        uuid_value = uuid.uuid4()
                        
                        # Save SVG data
                        temp_svg_filename = f"{matched_original_id}_{uuid_value}.svg"
                        temp_svg_path = os.path.join(TEMP_SVG_DIR, temp_svg_filename)
                        with open(temp_svg_path, 'wb') as f:
                            f.write(svg_data_decoded)
                        logger.info(f"Saved SVG data for code block to {temp_svg_path} (size: {len(svg_data_decoded)} bytes)")
                        
                        # Save PNG data
                        temp_png_filename = f"{matched_original_id}_{uuid_value}.png"
                        temp_png_path = os.path.join(TEMP_PNG_DIR, temp_png_filename)
                        
                        png_data = cairosvg.svg2png(bytestring=svg_data_decoded, scale=2.0, background_color="white")
                        with open(temp_png_path, 'wb') as f:
                            f.write(png_data)
                        
                        # We no longer use web URLs, always use local file path
                        img_src = pathlib.Path(temp_png_path).as_uri()
                            
                        img_tag_for_code_block = f'Mermaid Diagram'
                        logger.info(f"Used SVG render for code block: {matched_original_id}")
                        processed_svg_ids.add(matched_original_id)
                    except Exception as e:
                        logger.error(f"Error converting SVG to PNG: {str(e)}")

                if not img_tag_for_code_block: # No SVG or SVG processing failed
                    logger.info(f"No specific SVG found for this code block, creating PIL fallback image or pre.")
                    # Create a fallback image if no matching SVG was found
                    fallback_uuid_code = str(uuid.uuid4())[:8]
                    placeholder_img_name_code = f"code_block_pil_{fallback_uuid_code}.png"
                    placeholder_path_code = os.path.join(TEMP_PNG_DIR, placeholder_img_name_code)
                    
                    try:
                        # Create an image with the diagram code
                        img = Image.new('RGB', (800, 400), color=(245, 245, 245))
                        draw = ImageDraw.Draw(img)
                        draw.text((10, 10), "Mermaid Diagram (Fallback)", fill=(50, 50, 50))
                        
                        # Add the diagram code content
                        y_pos = 40
                        for line_idx, line in enumerate(diagram_content.split('\n')[:20]):
                            draw.text((10, y_pos), line[:80], fill=(50, 50, 50))
                            y_pos += 15
                        
                        with open(placeholder_path_code, 'wb') as f:
                            img.save(f, 'PNG')
                        
                        # We no longer use web URLs, always use local file path
                        img_src_code = pathlib.Path(placeholder_path_code).as_uri()
                            
                        img_tag_for_code_block = f'Mermaid Diagram (Code Fallback)'
                    except Exception as e_img:
                        logger.error(f"Error creating fallback image: {str(e_img)}")
                        # IMPORTANT: Avoid just dumping the diagram_content here if that's the source of the problem.
                        # Use a more generic placeholder instead
                        img_tag_for_code_block = f'

[Mermaid diagram code could not be rendered here. Content: {diagram_content[:80]}...]

' temp_processed_html_list_codeblocks.append(img_tag_for_code_block or "") # Append the new image/placeholder last_end_codeblocks = end temp_processed_html_list_codeblocks.append(processed_html[last_end_codeblocks:]) processed_html = "".join(temp_processed_html_list_codeblocks) # Configure PDF options options = { 'page-size': 'Letter', 'margin-top': '0.75in', 'margin-right': '0.75in', 'margin-bottom': '0.75in', 'margin-left': '0.75in', 'encoding': 'UTF-8', # 'no-outline': None, # Removed - not supported in unpatched Qt 'enable-local-file-access': True # Still needed for local file access # 'load-error-handling': 'skip', # or 'ignore' - might hide issues but prevent PDF failure # 'load-media-error-handling': 'skip', } # The server has an unpatched version of wkhtmltopdf which doesn't support # the 'enable-remote-images' option. We're using file:/// URIs with enable-local-file-access instead # Add custom CSS for better formatting css = """ body { font-family: Arial, sans-serif; font-size: 12pt; line-height: 1.6; } img { max-width: 100%; height: auto; margin: 20px auto; display: block; } h1, h2, h3, h4, h5, h6 { color: #333; margin-top: 20px; margin-bottom: 10px; } pre { background-color: #f5f5f5; padding: 10px; border-radius: 5px; overflow-x: auto; } code { font-family: 'Courier New', Courier, monospace; font-size: 11pt; } table { border-collapse: collapse; width: 100%; margin: 20px 0; } table, th, td { border: 1px solid #ddd; } th, td { padding: 8px; text-align: left; } th { background-color: #f2f2f2; } /* Special handling for pre containing mermaid code */ pre.mermaid-source { display: none; } """ # Comment out final HTML content logging # logger.info("====================================================") # logger.info("FINAL HTML CONTENT BEING SENT TO PDFKIT:") # logger.info(processed_html) # logger.info("====================================================") # Create an index.html file in the temp directory for PDF generation index_html_path = 
os.path.join(temp_dir_for_pdf, "index.html") with open(index_html_path, 'w', encoding='utf-8') as f: f.write(f""" Video Query Result {processed_html} """) # Log the final processed HTML for debugging logger.info(f"Final HTML length: {len(processed_html)}") logger.info("Final HTML contains image tags: " + str(']*src\s*=\s*["\']([^"\']+)["\']' img_srcs = re.findall(img_src_pattern, processed_html) logger.info(f"Found {len(img_srcs)} image sources in the HTML") for i, src in enumerate(img_srcs): logger.info(f"Image {i+1} src: {src}") # Find wkhtmltopdf on the system try: import subprocess which_result = subprocess.run(['which', 'wkhtmltopdf'], capture_output=True, text=True) if which_result.returncode == 0: wkhtmltopdf_which_path = which_result.stdout.strip() logger.info(f"wkhtmltopdf found at: {wkhtmltopdf_which_path}") else: logger.warning(f"wkhtmltopdf not found in PATH: {which_result.stderr}") # Try another approach with `whereis` whereis_result = subprocess.run(['whereis', 'wkhtmltopdf'], capture_output=True, text=True) logger.info(f"whereis wkhtmltopdf result: {whereis_result.stdout}") except Exception as e: logger.warning(f"Error while trying to locate wkhtmltopdf: {str(e)}") try: # Use cross-platform wkhtmltopdf detection try: wkhtmltopdf_path = system_utils.find_wkhtmltopdf() logger.info(f"Using wkhtmltopdf at: {wkhtmltopdf_path}") except FileNotFoundError as e: logger.error(f"wkhtmltopdf not found: {str(e)}") error_report = ErrorReporter.capture_error( e, category=ErrorCategory.SYSTEM_ERROR, context={'operation': 'pdf_generation'} ) return jsonify({ 'success': False, 'message': error_report.format_user_message() }), 500 # Check if wkhtmltopdf is available at the specified path if os.path.exists(wkhtmltopdf_path): logger.info(f"Using wkhtmltopdf at: {wkhtmltopdf_path}") pdfkit_config = Configuration(wkhtmltopdf=wkhtmltopdf_path) pdfkit.from_file(index_html_path, pdf_path, options=options, configuration=pdfkit_config) else: # Try alternate paths 
alternate_paths = [ '/usr/local/bin/wkhtmltopdf', '/opt/bin/wkhtmltopdf', '/snap/bin/wkhtmltopdf' ] found_path = None for path in alternate_paths: if os.path.exists(path): found_path = path break if found_path: logger.info(f"Using wkhtmltopdf at alternate path: {found_path}") pdfkit_config = Configuration(wkhtmltopdf=found_path) pdfkit.from_file(index_html_path, pdf_path, options=options, configuration=pdfkit_config) else: # Try with default config, which may use PATH environment variable logger.warning("wkhtmltopdf not found at expected paths, trying with default configuration") pdfkit.from_file(index_html_path, pdf_path, options=options) logger.info(f"PDF generated successfully, file size: {os.path.getsize(pdf_path)} bytes") except Exception as pdf_error: logger.error(f"Error generating PDF: {str(pdf_error)}") import traceback logger.error(traceback.format_exc()) # Try with direct HTML content as fallback logger.info("Trying fallback PDF generation directly from HTML string") try: # Strip out any remaining mermaid divs or code blocks that might be causing problems final_html = processed_html problem_patterns = [ r']*class=.?mermaid.?[^>]*>.*?
', r'
graph\s+TD.*?
', r'
.*?
', r'```mermaid\s+[\s\S]*?```', r'graph\s+TD[^;]*;' ] logger.info("Stripping any remaining problematic elements before final fallback") for pattern in problem_patterns: before_len = len(final_html) final_html = re.sub(pattern, '

[Diagram placeholder]

', final_html, flags=re.DOTALL) after_len = len(final_html) if before_len != after_len: logger.info(f"Removed pattern, length before: {before_len}, after: {after_len}") fallback_options = { 'page-size': 'Letter', 'margin-top': '0.75in', 'margin-right': '0.75in', 'margin-bottom': '0.75in', 'margin-left': '0.75in', 'encoding': 'UTF-8', 'enable-local-file-access': True } # Try to locate wkhtmltopdf for fallback method too if os.path.exists(wkhtmltopdf_path): pdfkit_config = Configuration(wkhtmltopdf=wkhtmltopdf_path) pdfkit.from_string(f""" Video Query Result {final_html} """, pdf_path, options=fallback_options, configuration=pdfkit_config) else: logger.warning("Using default configuration for fallback PDF generation") pdfkit.from_string(f""" Video Query Result {final_html} """, pdf_path, options=fallback_options) logger.info("Fallback PDF generation succeeded") except Exception as fallback_error: logger.error(f"Fallback PDF generation also failed: {str(fallback_error)}") # Read the generated PDF file if os.path.exists(pdf_path): with open(pdf_path, 'rb') as file: pdf_data = file.read() # Encode the PDF as base64 pdf_base64 = base64.b64encode(pdf_data).decode('utf-8') logger.info(f"Encoded PDF data length: {len(pdf_base64)}") else: logger.error("PDF file does not exist after generation") return jsonify({'success': False, 'message': 'PDF generation failed'}), 500 # Clean up temporary PDF generation files and temp images try: # Clean up PDF temp directory for root, dirs, files in os.walk(temp_dir_for_pdf, topdown=False): for file in files: os.remove(os.path.join(root, file)) for dir_name in dirs: # dir is a reserved word os.rmdir(os.path.join(root, dir_name)) os.rmdir(temp_dir_for_pdf) logger.info(f"Cleaned up temporary PDF directory: {temp_dir_for_pdf}") # Clean up temporary PNG and SVG files for file in os.listdir(TEMP_PNG_DIR): os.remove(os.path.join(TEMP_PNG_DIR, file)) for file in os.listdir(TEMP_SVG_DIR): os.remove(os.path.join(TEMP_SVG_DIR, file)) 
logger.info("Cleaned up temporary PNG and SVG files") except Exception as cleanup_error: logger.warning(f"Could not remove all temporary files: {str(cleanup_error)}") # Generate PDF filename from video filename if video_file_name: # Remove video extension and add .pdf base_name = os.path.splitext(video_file_name)[0] pdf_filename = f"{base_name}.pdf" logger.info(f"Generated PDF filename from video name: {pdf_filename}") else: pdf_filename = 'video_query_result.pdf' logger.info("No video filename provided, using default PDF name") return jsonify({ 'success': True, 'pdf': pdf_base64, 'filename': pdf_filename }) except Exception as e: import traceback error_report = ErrorReporter.capture_error( e, context={ 'endpoint': '/api/generate-pdf', 'video_file_name': data.get('videoFileName', 'unknown'), 'has_diagrams': bool(data.get('diagramPngs', {})) } ) error_trace = traceback.format_exc() logger.error(f"Error generating PDF: {str(e)}") logger.error(error_trace) return jsonify({ 'success': False, 'message': error_report.format_user_message(), 'error_id': error_report.error_id, 'error_category': error_report.category.value }), 500 # Handle CORS preflight requests for all API routes @app.route('/api/', methods=['OPTIONS']) def handle_options(path): # Get the origin from the request origin = request.headers.get('Origin') allowed_origins = ['https://brandtechsandbox.oliver.solutions', 'http://localhost:3000'] response = jsonify({}) # Allow the origin if it's in our allowed list if origin in allowed_origins: response.headers.add('Access-Control-Allow-Origin', origin) else: # Default to production origin response.headers.add('Access-Control-Allow-Origin', 'https://brandtechsandbox.oliver.solutions') response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization,X-Requested-With') response.headers.add('Access-Control-Allow-Methods', 'GET,POST,OPTIONS') response.headers.add('Access-Control-Max-Age', '86400') # 24 hours 
response.headers.add('Access-Control-Allow-Credentials', 'true') return response # No longer need to serve frontend from the backend # Frontend will be hosted at https://ai-sandbox.oliver.solutions/video_query if __name__ == '__main__': # For development only - use Hypercorn in production app.run(debug=True, port=5000)