# NOTE: file-viewer metadata captured with this source (not code): 1124 lines, no EOL, 59 KiB, Python.
import os
|
|
import tempfile
|
|
import uuid
|
|
import logging
|
|
import sys
|
|
import base64
|
|
import json
|
|
import re
|
|
import io
|
|
import cairosvg
|
|
import pathlib
|
|
from PIL import Image, ImageDraw, ImageFont
|
|
from flask import Flask, request, jsonify, send_from_directory, send_file
|
|
from werkzeug.utils import secure_filename
|
|
from werkzeug.exceptions import RequestEntityTooLarge
|
|
from dotenv import load_dotenv
|
|
from flask_cors import CORS
|
|
from chunked_upload import chunked_upload_bp
|
|
from auth import require_auth, lenient_auth
|
|
import pdfkit
|
|
from pdfkit.configuration import Configuration
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Configure logging: INFO and above go to stdout with a timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout),
    ],
)

# Module-wide logger used by every route handler in this file.
logger = logging.getLogger('video_query')
|
|
|
|
# Load environment variables from .env file.
# NOTE(review): this runs before the video_processor import below, presumably
# so that module can read the variables at import time -- confirm.
load_dotenv()

from video_processor import VideoProcessor  # noqa: E402

app = Flask(__name__)

# Enable CORS for the /api/* routes. Only the sandbox origin is allowed, with
# credentials, for GET/POST/OPTIONS and the headers the frontend sends.
CORS(
    app,
    resources={
        r"/api/*": {
            "origins": ["https://ai-sandbox.oliver.solutions"],
            "supports_credentials": True,
            "methods": ["GET", "POST", "OPTIONS"],
            "allow_headers": ["Content-Type", "X-Requested-With", "Authorization"],
        }
    },
    expose_headers=["Content-Disposition", "Authorization"],
)

# Register the chunked upload blueprint
app.register_blueprint(chunked_upload_bp)
|
|
|
|
# Configuration
# Direct uploads are written under the system temp directory.
UPLOAD_FOLDER = os.path.join(tempfile.gettempdir(), 'video_query_uploads')
# 5GB max upload size
MAX_CONTENT_LENGTH = 5 * 1024 * 1024 * 1024

# Configuration for persistent output - commented out as no longer needed
# PERSISTENT_PNG_ROOT_DIR = '/var/www/html/video_query/png_output'  # Filesystem path for PNG files
# PERSISTENT_SVG_ROOT_DIR = '/var/www/html/video_query/svg_output'  # Filesystem path for SVG files
# PERSISTENT_PNG_BASE_URL = 'https://ai-sandbox.oliver.solutions/video_query/png_output'  # Web accessible URL base for PNGs
# PERSISTENT_SVG_BASE_URL = 'https://ai-sandbox.oliver.solutions/video_query/svg_output'  # Web accessible URL base for SVGs

# Temporary directories used for diagram images during PDF generation instead
TEMP_PNG_DIR = os.path.join(tempfile.gettempdir(), 'video_query_png_temp')
TEMP_SVG_DIR = os.path.join(tempfile.gettempdir(), 'video_query_svg_temp')

# Create the working directories if they don't exist yet.
for _working_dir in (UPLOAD_FOLDER, TEMP_PNG_DIR, TEMP_SVG_DIR):
    os.makedirs(_working_dir, exist_ok=True)
|
|
|
|
# Configure the app
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# Flask rejects request bodies larger than this with RequestEntityTooLarge.
app.config['MAX_CONTENT_LENGTH'] = MAX_CONTENT_LENGTH
# Set larger buffer size for large file uploads (5GB, same as the body limit).
# NOTE(review): MAX_CONTENT_PATH does not appear to be a documented Flask
# config key; kept so any code reading it keeps working -- confirm it is used.
app.config['MAX_CONTENT_PATH'] = MAX_CONTENT_LENGTH

# Initialize video processor (shared by all requests handled by this module)
video_processor = VideoProcessor()
|
|
|
|
# Set allowed extensions for videos (compared case-insensitively, without the dot)
ALLOWED_EXTENSIONS = {'mp4', 'avi', 'mov', 'wmv', 'mkv', 'webm'}


def allowed_file(filename):
    """Return True if *filename* ends in one of ALLOWED_EXTENSIONS.

    Only the text after the last ``.`` is considered, case-insensitively.
    A missing, empty or non-string filename, or one without a dot, is
    rejected instead of raising.
    """
    if not isinstance(filename, str):
        return False
    # rpartition yields an empty separator when there is no '.' at all.
    _, dot, extension = filename.rpartition('.')
    return bool(dot) and extension.lower() in ALLOWED_EXTENSIONS
|
|
|
|
# Log upload progress each time another 50MB has been written to disk.
_UPLOAD_PROGRESS_LOG_INTERVAL = 50 * 1024 * 1024


def _current_user_email():
    """Return the caller's email (or preferred username) from request.user, else "anonymous"."""
    user = getattr(request, "user", None)
    if isinstance(user, dict):
        return user.get("email", user.get("preferred_username", "anonymous"))
    return "anonymous"


def _remove_temp_file(file_path):
    """Best-effort removal of an uploaded file; failures are logged, never raised."""
    try:
        os.remove(file_path)
        logger.info(f"Cleaned up temporary file: {file_path}")
    except OSError as cleanup_error:
        logger.warning(f"Could not remove temporary file {file_path}: {str(cleanup_error)}")


def _save_upload_stream(file, file_path, chunk_size=1024 * 1024):
    """Stream an uploaded file object to *file_path* in chunks and return the byte count."""
    total_bytes = 0
    next_progress_mark = _UPLOAD_PROGRESS_LOG_INTERVAL
    with open(file_path, 'wb') as f:
        for chunk in iter(lambda: file.read(chunk_size), b''):
            f.write(chunk)
            total_bytes += len(chunk)
            # Threshold-based so progress is logged even when reads are short
            # and the running total never lands exactly on a 50MB multiple.
            if total_bytes >= next_progress_mark:
                logger.info(f"Upload progress: {total_bytes / (1024 * 1024):.2f} MB")
                next_progress_mark = (
                    total_bytes // _UPLOAD_PROGRESS_LOG_INTERVAL + 1
                ) * _UPLOAD_PROGRESS_LOG_INTERVAL
    return total_bytes


def _run_processing(file_path, prompt, user_email):
    """Run the video processor and translate its result dict into a JSON response."""
    result = video_processor.process_video(file_path, prompt, user_email)
    logger.info(f"Processing result: success={result['success']}")

    if result['success']:
        content = result['content']
        content_length = len(content) if content else 0
        logger.info(f"Returning successful response with {content_length} characters")
        return jsonify({'success': True, 'content': content})

    logger.error(f"Processing failed: {result['message']}")
    return jsonify({'success': False, 'message': result['message']}), 500


def _unexpected_error_response(error):
    """Log *error* with its traceback and build the generic JSON 500 response."""
    logger.exception(f"Error processing video: {str(error)}")
    return jsonify({
        'success': False,
        'message': f'An unexpected error occurred: {str(error)}'
    }), 500


def _process_chunked_upload_request():
    """Handle the JSON variant: the file was already uploaded in chunks and is referenced by path."""
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        logger.error("Request body is not a JSON object")
        return jsonify({'success': False, 'message': 'Invalid JSON body'}), 400

    file_path = data.get('file_path')
    filename = data.get('filename')
    prompt = data.get('prompt')

    # NOTE(review): file_path is supplied by the client and the file is deleted
    # after processing. It should be restricted to the directory the chunked
    # upload blueprint writes to; that directory is not visible from here.
    if not file_path or not isinstance(file_path, str) or not os.path.exists(file_path):
        logger.error(f"File path not found: {file_path}")
        return jsonify({'success': False, 'message': 'Uploaded file not found'}), 400

    if not prompt:
        logger.error("No prompt provided")
        return jsonify({'success': False, 'message': 'No prompt provided'}), 400

    user_email = _current_user_email()
    logger.info(f"Processing chunked upload from {file_path} ({filename}) for user: {user_email}")
    try:
        return _run_processing(file_path, prompt, user_email)
    except Exception as e:
        return _unexpected_error_response(e)
    finally:
        # Clean up the uploaded file whether or not processing succeeded.
        _remove_temp_file(file_path)


def _process_direct_upload_request():
    """Handle the multipart variant: validate, save, process and clean up a directly uploaded file."""
    file_path = None
    try:
        # Accessing request.files parses the body and is where Flask raises
        # RequestEntityTooLarge, so it must happen inside this try block.
        if 'video' not in request.files:
            logger.error("No video file in request")
            return jsonify({'success': False, 'message': 'No video file provided'}), 400

        file = request.files['video']
        prompt = request.form.get('prompt', '')

        logger.info(f"Received file: {file.filename}")
        logger.info(f"Prompt length: {len(prompt)} characters")

        # Check for empty filename
        if not file.filename:
            logger.error("Empty filename provided")
            return jsonify({'success': False, 'message': 'No video selected'}), 400

        if not prompt:
            logger.error("No prompt provided")
            return jsonify({'success': False, 'message': 'No prompt provided'}), 400

        # Check file extension
        if not allowed_file(file.filename):
            logger.error(f"Invalid file type: {file.filename}")
            return jsonify({
                'success': False,
                'message': f'Invalid file type. Allowed types: {", ".join(sorted(ALLOWED_EXTENSIONS))}'
            }), 400

        # Make sure upload directory exists
        os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
        logger.info(f"Upload directory: {app.config['UPLOAD_FOLDER']}")

        # Generate a unique filename to prevent collisions
        original_filename = secure_filename(file.filename)
        unique_filename = f"{uuid.uuid4()}_{original_filename}"
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], unique_filename)
        logger.info(f"Writing to: {file_path}")

        total_bytes = _save_upload_stream(file, file_path)
        logger.info(f"File saved: {file_path} ({total_bytes} bytes)")

        user_email = _current_user_email()
        logger.info(f"Starting video processing for user: {user_email}...")
        return _run_processing(file_path, prompt, user_email)

    except RequestEntityTooLarge:
        logger.error(f"File too large: {request.content_length} bytes")
        return jsonify({
            'success': False,
            'message': 'The uploaded file is too large (max 5GB)'
        }), 413
    except Exception as e:
        return _unexpected_error_response(e)
    finally:
        # Remove the saved (possibly partial) upload on every exit path.
        if file_path is not None:
            _remove_temp_file(file_path)


@app.route('/api/process', methods=['POST'])
@lenient_auth
def process_video():
    """Process an uploaded video with the supplied prompt.

    Two request shapes are accepted:

    * JSON body with ``file_path``, ``filename`` and ``prompt`` -- the file was
      uploaded beforehand (chunked upload) and is referenced by its path.
    * multipart form with a ``video`` file and a ``prompt`` field -- the file is
      streamed to the upload folder first.

    Returns a JSON response: ``{'success': True, 'content': ...}`` on success,
    or ``{'success': False, 'message': ...}`` with status 400 (bad input),
    413 (body too large) or 500 (processing failure). The uploaded file is
    removed after processing, including when processing raises.
    """
    logger.info("API request received: /api/process")
    logger.info(f"Content-Type: {request.content_type}")
    logger.info(f"Content-Length: {request.content_length}")

    # Handle chunked upload case
    if request.is_json:
        return _process_chunked_upload_request()

    # Standard direct upload method (for small files)
    return _process_direct_upload_request()
|
|
|
|
# Test route to verify authentication
|
|
@app.route('/api/auth-test', methods=['GET'])
@lenient_auth
def auth_test():
    """Test endpoint to verify authentication is working.

    Returns JSON with the user's display name, whether an Authorization
    header was sent, and the name/preferred_username/email claims found on
    ``request.user``. When ``request.user`` is missing or is not a dict the
    user is reported as "Unknown" with empty token info instead of raising.
    """
    user = getattr(request, "user", None)
    # Same dict check as process_video: only a dict carries readable claims.
    has_user_claims = isinstance(user, dict)

    if has_user_claims:
        token_info = {
            key: user.get(key)
            for key in ["name", "preferred_username", "email"]
            if key in user
        }
    else:
        token_info = {}

    user_info = {
        # NOTE(review): always True, even when no user claims are present --
        # confirm whether lenient_auth lets unauthenticated requests through.
        "authenticated": True,
        "user": user.get("name", "Anonymous") if has_user_claims else "Unknown",
        "token_present": "Authorization" in request.headers,
        "token_info": token_info,
    }
    logger.info(f"Auth test: {user_info}")
    return jsonify(user_info)
|
|
|
|
# Handle PDF generation
|
|
@app.route('/api/generate-pdf', methods=['POST'])
|
|
@lenient_auth
|
|
def generate_pdf():
|
|
"""Generate a PDF from HTML content with mermaid diagrams"""
|
|
logger.info("API request received: /api/generate-pdf")
|
|
|
|
if not request.is_json:
|
|
logger.error("Request is not JSON")
|
|
return jsonify({'success': False, 'message': 'JSON request required'}), 400
|
|
|
|
data = request.get_json()
|
|
html_content = data.get('html')
|
|
text_diagrams = data.get('textDiagrams', {})
|
|
svg_diagrams = data.get('svgDiagrams', {})
|
|
diagram_png_data_urls = data.get('diagramPngs', {})
|
|
video_file_name = data.get('videoFileName', '')
|
|
|
|
# Log detailed request information
|
|
logger.info(f"Request data: HTML content length: {len(html_content) if html_content else 0}")
|
|
logger.info(f"Text diagrams received: {len(text_diagrams)}")
|
|
logger.info(f"SVG diagrams received: {len(svg_diagrams)}")
|
|
logger.info(f"Diagram PNGs received: {len(diagram_png_data_urls)}")
|
|
logger.info(f"Video filename received: {video_file_name if video_file_name else 'None'}")
|
|
|
|
# Comment out full HTML content logging
|
|
# logger.info("HTML CONTENT RECEIVED START -------------------")
|
|
# logger.info(html_content)
|
|
# logger.info("HTML CONTENT RECEIVED END ---------------------")
|
|
|
|
if text_diagrams:
|
|
logger.info(f"Text diagram keys: {list(text_diagrams.keys())}")
|
|
|
|
if svg_diagrams:
|
|
logger.info(f"SVG diagram keys: {list(svg_diagrams.keys())}")
|
|
for key, value in svg_diagrams.items():
|
|
logger.info(f"SVG diagram {key}: starts with data:image/svg+xml;base64: {value.startswith('data:image/svg+xml;base64,') if value else False}")
|
|
|
|
if diagram_png_data_urls:
|
|
logger.info(f"Diagram PNG keys: {list(diagram_png_data_urls.keys())}")
|
|
for key, value in diagram_png_data_urls.items():
|
|
logger.info(f"Diagram PNG {key}: starts with data:image/png;base64: {value.startswith('data:image/png;base64,') if value else False} (length: {len(value) if value else 0})")
|
|
|
|
if not html_content:
|
|
logger.error("No HTML content provided")
|
|
return jsonify({'success': False, 'message': 'No HTML content provided'}), 400
|
|
|
|
try:
|
|
# Create a temporary directory for PDF and HTML file, not necessarily for images
|
|
temp_dir_for_pdf = tempfile.mkdtemp()
|
|
pdf_path = os.path.join(temp_dir_for_pdf, f"response_{uuid.uuid4()}.pdf")
|
|
|
|
# Process HTML to replace mermaid divs with image tags
|
|
processed_html = html_content
|
|
processed_svg_ids = set()
|
|
|
|
# Decide whether to use web URLs or file URIs for pdfkit
|
|
# Always use file:/// URIs with enable-local-file-access
|
|
# USE_WEB_URLS_FOR_PDFKIT = False # This is no longer needed
|
|
# We now use temp directories and local file paths for all images
|
|
|
|
# Create a subdirectory for images in the temp dir (for the HTML structure)
|
|
img_dir = os.path.join(temp_dir_for_pdf, "images")
|
|
os.makedirs(img_dir, exist_ok=True)
|
|
|
|
logger.info("HTML content before processing:")
|
|
logger.info(f"HTML contains '.mermaid' class: {'class=mermaid' in html_content}")
|
|
logger.info(f"HTML contains mermaid code blocks: {'```mermaid' in html_content or 'graph TD' in html_content}")
|
|
|
|
# First approach: Manually look for the mermaid pattern in the HTML before any processing
|
|
pattern1 = r'<div[^>]*class=.?mermaid.?[^>]*>(.*?)</div>'
|
|
pattern2 = r'<pre><code.*?>(graph\s+TD.*?)</code></pre>'
|
|
pattern3 = r'graph\s+TD'
|
|
|
|
mermaid_matches1 = re.findall(pattern1, html_content, re.DOTALL)
|
|
mermaid_matches2 = re.findall(pattern2, html_content, re.DOTALL)
|
|
mermaid_matches3 = re.findall(pattern3, html_content, re.DOTALL)
|
|
|
|
logger.info(f"Mermaid div matches: {len(mermaid_matches1)}")
|
|
if mermaid_matches1:
|
|
for i, m in enumerate(mermaid_matches1):
|
|
logger.info(f"Mermaid div content {i} (first 100 chars): {m[:100]}")
|
|
|
|
logger.info(f"Mermaid code block matches: {len(mermaid_matches2)}")
|
|
if mermaid_matches2:
|
|
for i, m in enumerate(mermaid_matches2):
|
|
logger.info(f"Mermaid code content {i} (first 100 chars): {m[:100]}")
|
|
|
|
logger.info(f"Mermaid graph TD matches: {len(mermaid_matches3)}")
|
|
|
|
# First, prioritize using the frontend-generated PNGs if available
|
|
if diagram_png_data_urls:
|
|
logger.info(f"Processing {len(diagram_png_data_urls)} PNG diagrams provided by frontend.")
|
|
|
|
# Parse the HTML with BeautifulSoup ONCE before the loop
|
|
soup = BeautifulSoup(processed_html, 'html.parser')
|
|
|
|
for diagram_id, png_data_url in diagram_png_data_urls.items():
|
|
unique_png_filename = f"{diagram_id}_{uuid.uuid4()}.png"
|
|
temp_png_path = os.path.join(TEMP_PNG_DIR, unique_png_filename)
|
|
|
|
image_source_for_pdfkit = None
|
|
|
|
try:
|
|
if not png_data_url.startswith('data:image/png;base64,'):
|
|
logger.warning(f"Unsupported PNG data URL format for {diagram_id}")
|
|
raise ValueError("Unsupported PNG data URL format")
|
|
|
|
base64_png_content = png_data_url.split(',', 1)[1]
|
|
png_bytes = base64.b64decode(base64_png_content)
|
|
|
|
with open(temp_png_path, 'wb') as f:
|
|
f.write(png_bytes)
|
|
|
|
if not os.path.exists(temp_png_path) or os.path.getsize(temp_png_path) == 0:
|
|
logger.error(f"PNG for {diagram_id} (from frontend PNG) was not saved or is empty at {temp_png_path}.")
|
|
raise ValueError("PNG saving failed or empty")
|
|
|
|
logger.info(f"Saved frontend-generated PNG for {diagram_id} to: {temp_png_path} (size: {os.path.getsize(temp_png_path)} bytes)")
|
|
|
|
# We no longer use web URLs, always use local file path
|
|
image_source_for_pdfkit = pathlib.Path(temp_png_path).as_uri()
|
|
|
|
alt_text = f"Diagram: {text_diagrams.get(diagram_id, diagram_id)[:50].replace('<', '<').replace('>', '>')}..."
|
|
|
|
# --- MODIFIED REPLACEMENT using BeautifulSoup ---
|
|
target_div = soup.find('div', id=diagram_id)
|
|
if target_div:
|
|
# Create the new img tag as a BeautifulSoup object
|
|
new_img_tag_soup = soup.new_tag('img', src=image_source_for_pdfkit, alt=alt_text)
|
|
new_img_tag_soup['style'] = "max-width:100%; margin:20px auto; display:block; border:1px solid #eee;"
|
|
|
|
# Replace the target div with our new img tag
|
|
target_div.replace_with(new_img_tag_soup)
|
|
logger.info(f"Replaced div with id='{diagram_id}' using its frontend-generated PNG (src: {image_source_for_pdfkit}) via BeautifulSoup.")
|
|
processed_svg_ids.add(diagram_id)
|
|
else:
|
|
logger.warning(f"PNG_WARN: Could not find div with id='{diagram_id}' in the HTML to replace with frontend PNG using BeautifulSoup.")
|
|
# Fallback to replacing code block if div with ID isn't found
|
|
original_code_for_png = text_diagrams.get(diagram_id)
|
|
if original_code_for_png:
|
|
# Try to find a pre/code block with matching content
|
|
code_blocks = soup.find_all('pre')
|
|
for code_block in code_blocks:
|
|
code_el = code_block.find('code')
|
|
if code_el and original_code_for_png.strip() in code_el.text.strip():
|
|
# Create new img tag
|
|
new_img_tag_soup_fallback = soup.new_tag('img', src=image_source_for_pdfkit, alt=alt_text)
|
|
new_img_tag_soup_fallback['style'] = "max-width:100%; margin:20px auto; display:block; border:1px solid #eee;"
|
|
|
|
# Replace the code block with the img tag
|
|
code_block.replace_with(new_img_tag_soup_fallback)
|
|
logger.info(f"PNG_WARN_RECOVERY: Replaced a code block matching content of diagram {diagram_id} with its frontend-PNG img tag via BeautifulSoup.")
|
|
processed_svg_ids.add(diagram_id)
|
|
break
|
|
else:
|
|
logger.warning(f"PNG_WARN_FAIL: Also failed to find a code block for diagram {diagram_id} content for frontend-PNG replacement with BeautifulSoup.")
|
|
|
|
except Exception as e_png_proc:
|
|
logger.error(f"Error processing provided PNG for diagram_id '{diagram_id}': {str(e_png_proc)}")
|
|
# Create a placeholder image indicating the error for this specific diagram
|
|
try:
|
|
img_err = Image.new('RGB', (500, 150), color=(255, 230, 230)) # Light red
|
|
draw_err = ImageDraw.Draw(img_err)
|
|
# Consider ImageFont.truetype for specific fonts/sizes if default is too small
|
|
title_font = ImageFont.load_default()
|
|
text_font = ImageFont.load_default()
|
|
draw_err.text((10, 10), f"Error rendering diagram:", fill=(128, 0, 0), font=title_font)
|
|
draw_err.text((10, 30), f"ID: {diagram_id}", fill=(100, 0, 0), font=text_font)
|
|
draw_err.text((10, 50), f"Details: {str(e_png_proc)[:80]}", fill=(100, 0, 0), font=text_font)
|
|
if text_diagrams.get(diagram_id):
|
|
draw_err.text((10,70), f"Code: {text_diagrams[diagram_id][:60]}...", fill=(100,0,0), font=text_font)
|
|
|
|
with open(temp_png_path, 'wb') as f_err: # Save error image with the same name pattern
|
|
img_err.save(f_err, 'PNG')
|
|
logger.info(f"Created error placeholder image for {diagram_id} at {temp_png_path}")
|
|
|
|
# We no longer use web URLs, always use local file path
|
|
image_source_for_pdfkit = pathlib.Path(temp_png_path).as_uri()
|
|
|
|
# Find and replace the target div with the error image
|
|
target_div_err = soup.find('div', id=diagram_id)
|
|
if target_div_err:
|
|
new_err_img_tag = soup.new_tag('img', src=image_source_for_pdfkit, alt=f"Error rendering diagram {diagram_id}")
|
|
new_err_img_tag['style'] = "max-width:100%; margin:20px auto; display:block; border: 2px solid red;"
|
|
target_div_err.replace_with(new_err_img_tag)
|
|
logger.info(f"Replaced div with id='{diagram_id}' using an error placeholder image via BeautifulSoup.")
|
|
processed_svg_ids.add(diagram_id)
|
|
else:
|
|
logger.error(f"Could not find div with id='{diagram_id}' to replace with error placeholder image.")
|
|
except Exception as e_placeholder_img:
|
|
logger.error(f"Failed to create error placeholder image for {diagram_id}: {str(e_placeholder_img)}")
|
|
# Try to insert a simple error paragraph if div is found
|
|
target_div_err2 = soup.find('div', id=diagram_id)
|
|
if target_div_err2:
|
|
error_p = soup.new_tag('p')
|
|
error_p['style'] = "color:red; border:1px solid red; padding:10px;"
|
|
error_p.string = f"[Error processing diagram: {diagram_id} - {str(e_png_proc)[:50]}]"
|
|
target_div_err2.replace_with(error_p)
|
|
logger.info(f"Replaced div with id='{diagram_id}' with a simple error message via BeautifulSoup.")
|
|
processed_svg_ids.add(diagram_id)
|
|
|
|
# After processing all PNG diagrams, update processed_html
|
|
processed_html = str(soup)
|
|
logger.info("Completed BeautifulSoup processing of all PNG diagrams")
|
|
|
|
# Fallback to using SVG diagrams if provided
|
|
if svg_diagrams:
|
|
logger.info(f"Processing {len(svg_diagrams)} SVG diagrams provided by frontend.")
|
|
|
|
# Ensure we're working with a BeautifulSoup object
|
|
if 'soup' not in locals() or not isinstance(soup, BeautifulSoup):
|
|
soup = BeautifulSoup(processed_html, 'html.parser')
|
|
|
|
for diagram_id, svg_data_url in svg_diagrams.items():
|
|
# Skip if this diagram ID was already processed in the PNG section
|
|
if diagram_id in processed_svg_ids:
|
|
logger.info(f"Skipping SVG for diagram_id '{diagram_id}' as it was already processed in PNG section.")
|
|
continue
|
|
|
|
# Generate a unique filename for the persistent storage to avoid collisions
|
|
unique_png_filename = f"{diagram_id}_{uuid.uuid4()}.png"
|
|
temp_png_path = os.path.join(TEMP_PNG_DIR, unique_png_filename)
|
|
|
|
image_source_for_pdfkit = None
|
|
|
|
try:
|
|
logger.info(f"Processing diagram ID: {diagram_id}")
|
|
|
|
if not svg_data_url.startswith('data:image/svg+xml;base64,'):
|
|
logger.warning(f"Unsupported SVG data URL format for {diagram_id}: {svg_data_url[:30]}...")
|
|
raise ValueError("Unsupported SVG data URL format")
|
|
|
|
# Extract base64 content
|
|
base64_data = svg_data_url.split(',', 1)[1]
|
|
logger.info(f"Base64 data length: {len(base64_data)}")
|
|
|
|
# Decode the base64 data
|
|
svg_bytes = base64.b64decode(base64_data)
|
|
logger.info(f"Decoded SVG data length: {len(svg_bytes)}")
|
|
|
|
# Save the SVG data to the temporary SVG directory
|
|
temp_svg_filename = f"{diagram_id}_{uuid.uuid4()}.svg"
|
|
temp_svg_path = os.path.join(TEMP_SVG_DIR, temp_svg_filename)
|
|
with open(temp_svg_path, 'wb') as f:
|
|
f.write(svg_bytes)
|
|
logger.info(f"Saved SVG data to {temp_svg_path} (size: {len(svg_bytes)} bytes)")
|
|
|
|
# Convert SVG to PNG using cairosvg with white background
|
|
png_data = cairosvg.svg2png(bytestring=svg_bytes, scale=2.0, background_color="white")
|
|
with open(temp_png_path, 'wb') as f:
|
|
f.write(png_data)
|
|
|
|
if not os.path.exists(temp_png_path) or os.path.getsize(temp_png_path) == 0:
|
|
logger.error(f"PNG for {diagram_id} (from SVG) was not created or is empty at {temp_png_path}.")
|
|
raise ValueError("PNG creation failed or empty")
|
|
|
|
logger.info(f"Generated PNG for {diagram_id} from SVG: {temp_png_path} (size: {os.path.getsize(temp_png_path)} bytes)")
|
|
|
|
# We no longer use web URLs, always use local file path
|
|
image_source_for_pdfkit = pathlib.Path(temp_png_path).as_uri()
|
|
|
|
alt_text = f"Mermaid Diagram: {text_diagrams.get(diagram_id, diagram_id)[:50].replace('<', '<').replace('>', '>')}..."
|
|
|
|
# --- MODIFIED REPLACEMENT using BeautifulSoup ---
|
|
target_div_svg = soup.find('div', id=diagram_id)
|
|
if target_div_svg:
|
|
# Create the new img tag as a BeautifulSoup object
|
|
new_img_tag_soup_svg = soup.new_tag('img', src=image_source_for_pdfkit, alt=alt_text)
|
|
new_img_tag_soup_svg['style'] = "max-width:100%; margin:20px auto; display:block; border:1px solid #eee;"
|
|
|
|
# Replace the target div with our new img tag
|
|
target_div_svg.replace_with(new_img_tag_soup_svg)
|
|
logger.info(f"Replaced div with id='{diagram_id}' using its SVG-generated PNG via BeautifulSoup.")
|
|
processed_svg_ids.add(diagram_id)
|
|
else:
|
|
logger.warning(f"SVG_WARN: Could not find div with id='{diagram_id}' for SVG replacement using BeautifulSoup.")
|
|
# Try to find a code block with matching content from textDiagrams
|
|
original_code_for_svg = text_diagrams.get(diagram_id)
|
|
if original_code_for_svg and os.path.exists(temp_png_path):
|
|
# Try to find matching code blocks
|
|
code_blocks = soup.find_all('pre')
|
|
for code_block in code_blocks:
|
|
code_el = code_block.find('code')
|
|
if code_el and original_code_for_svg.strip() in code_el.text.strip():
|
|
# Create new img tag
|
|
new_img_tag_soup_svg_fallback = soup.new_tag('img', src=image_source_for_pdfkit, alt=alt_text)
|
|
new_img_tag_soup_svg_fallback['style'] = "max-width:100%; margin:20px auto; display:block; border:1px solid #eee;"
|
|
|
|
# Replace the code block with the img tag
|
|
code_block.replace_with(new_img_tag_soup_svg_fallback)
|
|
logger.info(f"SVG_WARN_RECOVERY: Replaced a code block matching content for diagram {diagram_id} with its SVG-PNG img tag via BeautifulSoup.")
|
|
processed_svg_ids.add(diagram_id)
|
|
break
|
|
else:
|
|
logger.warning(f"SVG_WARN_FAIL: Failed to find a matching code block for SVG diagram {diagram_id} with BeautifulSoup.")
|
|
|
|
except Exception as e_svg_proc:
|
|
logger.error(f"Error processing provided SVG for diagram_id '{diagram_id}': {str(e_svg_proc)}")
|
|
# Create a placeholder image indicating the error for this specific diagram
|
|
try:
|
|
img_err = Image.new('RGB', (500, 150), color=(255, 230, 230)) # Light red
|
|
draw_err = ImageDraw.Draw(img_err)
|
|
# Consider ImageFont.truetype for specific fonts/sizes if default is too small
|
|
title_font = ImageFont.load_default()
|
|
text_font = ImageFont.load_default()
|
|
draw_err.text((10, 10), f"Error rendering diagram:", fill=(128, 0, 0), font=title_font)
|
|
draw_err.text((10, 30), f"ID: {diagram_id}", fill=(100, 0, 0), font=text_font)
|
|
draw_err.text((10, 50), f"Details: {str(e_svg_proc)[:80]}", fill=(100, 0, 0), font=text_font)
|
|
if text_diagrams.get(diagram_id):
|
|
draw_err.text((10,70), f"Code: {text_diagrams[diagram_id][:60]}...", fill=(100,0,0), font=text_font)
|
|
|
|
with open(temp_png_path, 'wb') as f_err: # Save error image with the same name pattern
|
|
img_err.save(f_err, 'PNG')
|
|
logger.info(f"Created error placeholder image for SVG diagram {diagram_id} at {temp_png_path}")
|
|
|
|
# We no longer use web URLs, always use local file path
|
|
image_source_for_pdfkit = pathlib.Path(temp_png_path).as_uri()
|
|
|
|
# Find and replace the target div with the error image
|
|
target_div_svg_err = soup.find('div', id=diagram_id)
|
|
if target_div_svg_err:
|
|
new_err_img_tag_svg = soup.new_tag('img', src=image_source_for_pdfkit, alt=f"Error rendering SVG diagram {diagram_id}")
|
|
new_err_img_tag_svg['style'] = "max-width:100%; margin:20px auto; display:block; border: 2px solid red;"
|
|
target_div_svg_err.replace_with(new_err_img_tag_svg)
|
|
logger.info(f"Replaced div with id='{diagram_id}' using an SVG error placeholder image via BeautifulSoup.")
|
|
processed_svg_ids.add(diagram_id)
|
|
else:
|
|
logger.error(f"Could not find div with id='{diagram_id}' to replace with SVG error placeholder image.")
|
|
except Exception as e_placeholder_img:
|
|
logger.error(f"Failed to create SVG error placeholder image for {diagram_id}: {str(e_placeholder_img)}")
|
|
# Try to insert a simple error paragraph if div is found
|
|
target_div_svg_err2 = soup.find('div', id=diagram_id)
|
|
if target_div_svg_err2:
|
|
error_p_svg = soup.new_tag('p')
|
|
error_p_svg['style'] = "color:red; border:1px solid red; padding:10px;"
|
|
error_p_svg.string = f"[Error processing SVG diagram: {diagram_id} - {str(e_svg_proc)[:50]}]"
|
|
target_div_svg_err2.replace_with(error_p_svg)
|
|
logger.info(f"Replaced div with id='{diagram_id}' with a simple SVG error message via BeautifulSoup.")
|
|
processed_svg_ids.add(diagram_id)
|
|
|
|
# After processing all SVG diagrams, update processed_html
|
|
processed_html = str(soup)
|
|
logger.info("Completed BeautifulSoup processing of all SVG diagrams")
|
|
|
|
# Fallback for any mermaid code blocks/divs *not* covered by processed_svg_ids
|
|
# This typically means the frontend didn't send an SVG for them, or all replacement attempts above failed.
|
|
|
|
# Fallback for remaining <div class="mermaid"> (that might not have had a corresponding SVG)
|
|
logger.info("Fallback: Looking for any remaining <div class='mermaid'> not already handled.")
|
|
temp_processed_html_list = []
|
|
last_end = 0
|
|
# More specific regex for class="mermaid" and also capturing ID if present
|
|
div_fallback_pattern = r'(<div[^>]*class\s*=\s*["\']?[^"\']*mermaid[^"\']*["\']?[^>]*>(?:.*?)</div>)'
|
|
|
|
for match_obj in re.finditer(div_fallback_pattern, processed_html, flags=re.DOTALL | re.IGNORECASE):
|
|
start, end = match_obj.span()
|
|
div_html_segment = match_obj.group(1)
|
|
|
|
# Check if this div has an ID that was already processed
|
|
id_in_div_match = re.search(r'\bid\s*=\s*["\']?([^"\s\'<>]+)["\']?', div_html_segment, re.IGNORECASE)
|
|
current_div_id = None
|
|
if id_in_div_match:
|
|
current_div_id = id_in_div_match.group(1)
|
|
if current_div_id in processed_svg_ids:
|
|
# Skip this div as it was already processed by svgDiagrams
|
|
temp_processed_html_list.append(processed_html[last_end:end])
|
|
last_end = end
|
|
logger.info(f"Fallback Div: Skipping div with id='{current_div_id}' as it's in processed_svg_ids.")
|
|
continue
|
|
|
|
# This div was not handled by a provided SVG. Generate text-based placeholder.
|
|
logger.warning(f"Fallback Div: Processing <div class='mermaid'> at {start}-{end} (ID: {current_div_id}) not in processed_svg_ids. Generating text placeholder.")
|
|
|
|
# Try to extract diagram code from the div
|
|
soup_div = BeautifulSoup(div_html_segment, 'html.parser')
|
|
diagram_text_content = soup_div.get_text(separator='\n', strip=True) or "No text in div"
|
|
|
|
# Also check if we have this in textDiagrams
|
|
if current_div_id and current_div_id in text_diagrams:
|
|
diagram_text_content = text_diagrams[current_div_id]
|
|
|
|
# Generate a unique ID for this fallback image
|
|
fallback_uuid = str(uuid.uuid4())[:8]
|
|
placeholder_img_name = f"fallback_div_{fallback_uuid}.png"
|
|
placeholder_path = os.path.join(TEMP_PNG_DIR, placeholder_img_name)
|
|
|
|
try:
|
|
img = Image.new('RGB', (800, 300), color=(240, 240, 240))
|
|
draw = ImageDraw.Draw(img)
|
|
draw.text((10, 10), "Mermaid Diagram (Fallback Render)", fill=(50, 50, 50))
|
|
draw.text((10, 30), f"ID: {current_div_id or 'N/A'}", fill=(50, 50, 50))
|
|
y_pos = 50
|
|
for i, line in enumerate(diagram_text_content.split('\n')[:15]): # Show more lines
|
|
draw.text((10, y_pos), line[:80], fill=(50, 50, 50))
|
|
y_pos += 15
|
|
with open(placeholder_path, 'wb') as f:
|
|
img.save(f, 'PNG')
|
|
|
|
# We no longer use web URLs, always use local file path
|
|
fallback_image_src = pathlib.Path(placeholder_path).as_uri()
|
|
img_tag = f'<img src="{fallback_image_src}" alt="Fallback Mermaid Diagram" style="max-width:100%; margin:20px auto; display:block; border:1px dashed #ccc;">'
|
|
except Exception as e_pil:
|
|
logger.error(f"Fallback Div: Error creating image for {current_div_id}: {e_pil}")
|
|
img_tag = f"<pre style='color:#333; background:#f5f5f5; padding:10px; border:1px dashed #ccc;'>[Mermaid Diagram Code]:\n{diagram_text_content[:500]}</pre>"
|
|
|
|
temp_processed_html_list.append(processed_html[last_end:start])
|
|
temp_processed_html_list.append(img_tag)
|
|
last_end = end
|
|
if current_div_id:
|
|
processed_svg_ids.add(current_div_id) # Mark as handled
|
|
|
|
# Add the remaining content after the last match
|
|
temp_processed_html_list.append(processed_html[last_end:])
|
|
processed_html = "".join(temp_processed_html_list)
|
|
|
|
# Process any remaining mermaid code blocks that weren't already handled
|
|
logger.info("Fallback: Looking for any remaining mermaid code blocks not explicitly handled by ID.")
|
|
# More specific pattern for <pre><code class="language-mermaid">...</code> or similar structures
|
|
# Avoid overly broad patterns like raw 'graph TD'
|
|
# This pattern tries to capture the code within a language-mermaid block
|
|
code_block_pattern = r'(<pre[^>]*>\s*<code[^>]*class\s*=\s*["\']?[^"\']*language-mermaid[^"\']*["\']?[^>]*>([\s\S]*?)</code>\s*</pre>)'
|
|
|
|
temp_processed_html_list_codeblocks = []
|
|
last_end_codeblocks = 0
|
|
|
|
for match_obj in re.finditer(code_block_pattern, processed_html, flags=re.DOTALL | re.IGNORECASE):
|
|
start, end = match_obj.span()
|
|
full_match_html = match_obj.group(1) # The whole <pre><code>...</code></pre>
|
|
diagram_content = match_obj.group(2).strip() # Just the code
|
|
|
|
logger.info(f"Found potential unhandled mermaid code block. Content starts: {diagram_content[:50]}...")
|
|
|
|
# Try to find if this diagram_content matches any ID in textDiagrams
|
|
# And if that ID has *already* been processed (i.e., an <img> tag was made)
|
|
is_already_processed_by_id = False
|
|
matched_original_id = None
|
|
for diag_id, original_code_from_textdiagrams in text_diagrams.items():
|
|
# Simple check: if the extracted diagram_content is very similar to original_code
|
|
# This might need a more sophisticated similarity check.
|
|
if diagram_content == original_code_from_textdiagrams.strip():
|
|
matched_original_id = diag_id
|
|
if diag_id in processed_svg_ids:
|
|
is_already_processed_by_id = True
|
|
logger.info(f"Code block content matches diagram ID '{diag_id}' which is in processed_svg_ids. Skipping fallback.")
|
|
break
|
|
else:
|
|
logger.info(f"Code block content matches diagram ID '{diag_id}' which was NOT in processed_svg_ids. Will attempt SVG render if available.")
|
|
break # Found a match, even if not processed by ID yet
|
|
|
|
temp_processed_html_list_codeblocks.append(processed_html[last_end_codeblocks:start]) # Content before this match
|
|
|
|
if is_already_processed_by_id:
|
|
# This code block corresponds to an image already inserted.
|
|
# The original <pre><code> block should be removed or replaced by the image if it wasn't already.
|
|
# Since the image replacement by ID targets <div id="...">, this <pre> might still be there.
|
|
# For safety, if it was already processed, we should ensure this <pre> block is GONE.
|
|
# However, the primary image replacement should have taken care of the visual aspect.
|
|
# If the pre block is still there, it's a problem with the primary replacement not being thorough.
|
|
# For now, let's assume if is_already_processed_by_id, we don't want to add anything new here.
|
|
# We might actually want to ensure this 'full_match_html' is *removed* if its corresponding img is present.
|
|
# This gets complex. Let's first focus on not *adding* duplicates.
|
|
# If an image was already made, we effectively want to remove this <pre><code> block.
|
|
# So, we append nothing here for this specific match.
|
|
logger.info(f"Skipping rendering for code block of diagram {matched_original_id} as it was already processed by ID.")
|
|
# Effectively, this removes the <pre><code> block if its content was for an already-rendered image.
|
|
else:
|
|
# This code block was NOT processed by ID (or didn't match any known ID).
|
|
# Try to render it now.
|
|
img_tag_for_code_block = None
|
|
# Check if we have an SVG for it (if matched_original_id was found but not in processed_svg_ids)
|
|
if matched_original_id and matched_original_id in svg_diagrams and svg_diagrams[matched_original_id].startswith('data:image/svg+xml;base64,'):
|
|
# Generate PNG from SVG
|
|
try:
|
|
base64_data = svg_diagrams[matched_original_id].split(',')[1]
|
|
svg_data_decoded = base64.b64decode(base64_data)
|
|
uuid_value = uuid.uuid4()
|
|
|
|
# Save SVG data
|
|
temp_svg_filename = f"{matched_original_id}_{uuid_value}.svg"
|
|
temp_svg_path = os.path.join(TEMP_SVG_DIR, temp_svg_filename)
|
|
with open(temp_svg_path, 'wb') as f:
|
|
f.write(svg_data_decoded)
|
|
logger.info(f"Saved SVG data for code block to {temp_svg_path} (size: {len(svg_data_decoded)} bytes)")
|
|
|
|
# Save PNG data
|
|
temp_png_filename = f"{matched_original_id}_{uuid_value}.png"
|
|
temp_png_path = os.path.join(TEMP_PNG_DIR, temp_png_filename)
|
|
|
|
png_data = cairosvg.svg2png(bytestring=svg_data_decoded, scale=2.0, background_color="white")
|
|
with open(temp_png_path, 'wb') as f:
|
|
f.write(png_data)
|
|
|
|
# We no longer use web URLs, always use local file path
|
|
img_src = pathlib.Path(temp_png_path).as_uri()
|
|
|
|
img_tag_for_code_block = f'<img src="{img_src}" alt="Mermaid Diagram" style="max-width:100%; margin:20px auto; display:block; border:1px solid #eee;">'
|
|
logger.info(f"Used SVG render for code block: {matched_original_id}")
|
|
processed_svg_ids.add(matched_original_id)
|
|
except Exception as e:
|
|
logger.error(f"Error converting SVG to PNG: {str(e)}")
|
|
|
|
if not img_tag_for_code_block: # No SVG or SVG processing failed
|
|
logger.info(f"No specific SVG found for this code block, creating PIL fallback image or pre.")
|
|
# Create a fallback image if no matching SVG was found
|
|
fallback_uuid_code = str(uuid.uuid4())[:8]
|
|
placeholder_img_name_code = f"code_block_pil_{fallback_uuid_code}.png"
|
|
placeholder_path_code = os.path.join(TEMP_PNG_DIR, placeholder_img_name_code)
|
|
|
|
try:
|
|
# Create an image with the diagram code
|
|
img = Image.new('RGB', (800, 400), color=(245, 245, 245))
|
|
draw = ImageDraw.Draw(img)
|
|
draw.text((10, 10), "Mermaid Diagram (Fallback)", fill=(50, 50, 50))
|
|
|
|
# Add the diagram code content
|
|
y_pos = 40
|
|
for line_idx, line in enumerate(diagram_content.split('\n')[:20]):
|
|
draw.text((10, y_pos), line[:80], fill=(50, 50, 50))
|
|
y_pos += 15
|
|
|
|
with open(placeholder_path_code, 'wb') as f:
|
|
img.save(f, 'PNG')
|
|
|
|
# We no longer use web URLs, always use local file path
|
|
img_src_code = pathlib.Path(placeholder_path_code).as_uri()
|
|
|
|
img_tag_for_code_block = f'<img src="{img_src_code}" alt="Mermaid Diagram (Code Fallback)" style="max-width:100%; margin:20px auto; display:block; border:1px dashed #ccc;">'
|
|
except Exception as e_img:
|
|
logger.error(f"Error creating fallback image: {str(e_img)}")
|
|
# IMPORTANT: Avoid just dumping the diagram_content here if that's the source of the problem.
|
|
# Use a more generic placeholder instead
|
|
img_tag_for_code_block = f'<p style="color:red; border:1px solid red; padding:10px;">[Mermaid diagram code could not be rendered here. Content: {diagram_content[:80]}...]</p>'
|
|
|
|
temp_processed_html_list_codeblocks.append(img_tag_for_code_block or "") # Append the new image/placeholder
|
|
|
|
last_end_codeblocks = end
|
|
|
|
temp_processed_html_list_codeblocks.append(processed_html[last_end_codeblocks:])
|
|
processed_html = "".join(temp_processed_html_list_codeblocks)
|
|
|
|
# Configure PDF options
|
|
options = {
|
|
'page-size': 'Letter',
|
|
'margin-top': '0.75in',
|
|
'margin-right': '0.75in',
|
|
'margin-bottom': '0.75in',
|
|
'margin-left': '0.75in',
|
|
'encoding': 'UTF-8',
|
|
# 'no-outline': None, # Removed - not supported in unpatched Qt
|
|
'enable-local-file-access': True # Still needed for local file access
|
|
# 'load-error-handling': 'skip', # or 'ignore' - might hide issues but prevent PDF failure
|
|
# 'load-media-error-handling': 'skip',
|
|
}
|
|
|
|
# The server has an unpatched version of wkhtmltopdf which doesn't support
|
|
# the 'enable-remote-images' option. We're using file:/// URIs with enable-local-file-access instead
|
|
|
|
# Add custom CSS for better formatting
|
|
css = """
|
|
body {
|
|
font-family: Arial, sans-serif;
|
|
font-size: 12pt;
|
|
line-height: 1.6;
|
|
}
|
|
img {
|
|
max-width: 100%;
|
|
height: auto;
|
|
margin: 20px auto;
|
|
display: block;
|
|
}
|
|
h1, h2, h3, h4, h5, h6 {
|
|
color: #333;
|
|
margin-top: 20px;
|
|
margin-bottom: 10px;
|
|
}
|
|
pre {
|
|
background-color: #f5f5f5;
|
|
padding: 10px;
|
|
border-radius: 5px;
|
|
overflow-x: auto;
|
|
}
|
|
code {
|
|
font-family: 'Courier New', Courier, monospace;
|
|
font-size: 11pt;
|
|
}
|
|
table {
|
|
border-collapse: collapse;
|
|
width: 100%;
|
|
margin: 20px 0;
|
|
}
|
|
table, th, td {
|
|
border: 1px solid #ddd;
|
|
}
|
|
th, td {
|
|
padding: 8px;
|
|
text-align: left;
|
|
}
|
|
th {
|
|
background-color: #f2f2f2;
|
|
}
|
|
/* Special handling for pre containing mermaid code */
|
|
pre.mermaid-source {
|
|
display: none;
|
|
}
|
|
"""
|
|
|
|
# Comment out final HTML content logging
|
|
# logger.info("====================================================")
|
|
# logger.info("FINAL HTML CONTENT BEING SENT TO PDFKIT:")
|
|
# logger.info(processed_html)
|
|
# logger.info("====================================================")
|
|
|
|
# Create an index.html file in the temp directory for PDF generation
|
|
index_html_path = os.path.join(temp_dir_for_pdf, "index.html")
|
|
with open(index_html_path, 'w', encoding='utf-8') as f:
|
|
f.write(f"""
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<title>Video Query Result</title>
|
|
<style>{css}</style>
|
|
</head>
|
|
<body>
|
|
{processed_html}
|
|
</body>
|
|
</html>
|
|
""")
|
|
|
|
# Log the final processed HTML for debugging
|
|
logger.info(f"Final HTML length: {len(processed_html)}")
|
|
logger.info("Final HTML contains image tags: " + str('<img src=' in processed_html))
|
|
|
|
# Check if processed HTML still contains mermaid divs or code blocks
|
|
contains_mermaid_div = 'class=mermaid' in processed_html
|
|
contains_mermaid_code = 'graph TD' in processed_html
|
|
logger.info(f"Final HTML still contains mermaid divs: {contains_mermaid_div}")
|
|
logger.info(f"Final HTML still contains mermaid code: {contains_mermaid_code}")
|
|
|
|
# Commented out listing of files as we no longer keep persistent files
|
|
# logger.info("Files in temporary PNG directory:")
|
|
# for file_name in os.listdir(TEMP_PNG_DIR):
|
|
# file_path = os.path.join(TEMP_PNG_DIR, file_name)
|
|
# file_size = os.path.getsize(file_path)
|
|
# logger.info(f" - {file_name}: {file_size} bytes")
|
|
#
|
|
# logger.info("Files in temporary SVG directory:")
|
|
# for file_name in os.listdir(TEMP_SVG_DIR):
|
|
# file_path = os.path.join(TEMP_SVG_DIR, file_name)
|
|
# file_size = os.path.getsize(file_path)
|
|
# logger.info(f" - {file_name}: {file_size} bytes")
|
|
|
|
# Generate PDF from the file to properly handle references to image files
|
|
logger.info(f"Generating PDF to {pdf_path}")
|
|
|
|
# Log the image sources being used in the HTML
|
|
img_src_pattern = r'<img\s+[^>]*src\s*=\s*["\']([^"\']+)["\']'
|
|
img_srcs = re.findall(img_src_pattern, processed_html)
|
|
logger.info(f"Found {len(img_srcs)} image sources in the HTML")
|
|
for i, src in enumerate(img_srcs):
|
|
logger.info(f"Image {i+1} src: {src}")
|
|
|
|
# Find wkhtmltopdf on the system
|
|
try:
|
|
import subprocess
|
|
which_result = subprocess.run(['which', 'wkhtmltopdf'], capture_output=True, text=True)
|
|
if which_result.returncode == 0:
|
|
wkhtmltopdf_which_path = which_result.stdout.strip()
|
|
logger.info(f"wkhtmltopdf found at: {wkhtmltopdf_which_path}")
|
|
else:
|
|
logger.warning(f"wkhtmltopdf not found in PATH: {which_result.stderr}")
|
|
# Try another approach with `whereis`
|
|
whereis_result = subprocess.run(['whereis', 'wkhtmltopdf'], capture_output=True, text=True)
|
|
logger.info(f"whereis wkhtmltopdf result: {whereis_result.stdout}")
|
|
except Exception as e:
|
|
logger.warning(f"Error while trying to locate wkhtmltopdf: {str(e)}")
|
|
|
|
try:
|
|
# Configure pdfkit with the path to wkhtmltopdf
|
|
wkhtmltopdf_path = '/usr/bin/wkhtmltopdf' # Common location on Linux servers
|
|
|
|
# If we found the path with 'which', use that
|
|
if 'wkhtmltopdf_which_path' in locals() and os.path.exists(wkhtmltopdf_which_path):
|
|
wkhtmltopdf_path = wkhtmltopdf_which_path
|
|
logger.info(f"Using wkhtmltopdf path from 'which': {wkhtmltopdf_path}")
|
|
|
|
# Check if wkhtmltopdf is available at the specified path
|
|
if os.path.exists(wkhtmltopdf_path):
|
|
logger.info(f"Using wkhtmltopdf at: {wkhtmltopdf_path}")
|
|
pdfkit_config = Configuration(wkhtmltopdf=wkhtmltopdf_path)
|
|
pdfkit.from_file(index_html_path, pdf_path, options=options, configuration=pdfkit_config)
|
|
else:
|
|
# Try alternate paths
|
|
alternate_paths = [
|
|
'/usr/local/bin/wkhtmltopdf',
|
|
'/opt/bin/wkhtmltopdf',
|
|
'/snap/bin/wkhtmltopdf'
|
|
]
|
|
|
|
found_path = None
|
|
for path in alternate_paths:
|
|
if os.path.exists(path):
|
|
found_path = path
|
|
break
|
|
|
|
if found_path:
|
|
logger.info(f"Using wkhtmltopdf at alternate path: {found_path}")
|
|
pdfkit_config = Configuration(wkhtmltopdf=found_path)
|
|
pdfkit.from_file(index_html_path, pdf_path, options=options, configuration=pdfkit_config)
|
|
else:
|
|
# Try with default config, which may use PATH environment variable
|
|
logger.warning("wkhtmltopdf not found at expected paths, trying with default configuration")
|
|
pdfkit.from_file(index_html_path, pdf_path, options=options)
|
|
|
|
logger.info(f"PDF generated successfully, file size: {os.path.getsize(pdf_path)} bytes")
|
|
except Exception as pdf_error:
|
|
logger.error(f"Error generating PDF: {str(pdf_error)}")
|
|
import traceback
|
|
logger.error(traceback.format_exc())
|
|
|
|
# Try with direct HTML content as fallback
|
|
logger.info("Trying fallback PDF generation directly from HTML string")
|
|
try:
|
|
# Strip out any remaining mermaid divs or code blocks that might be causing problems
|
|
final_html = processed_html
|
|
problem_patterns = [
|
|
r'<div[^>]*class=.?mermaid.?[^>]*>.*?</div>',
|
|
r'<pre><code>graph\s+TD.*?</code></pre>',
|
|
r'<pre><code class=.?language-mermaid.?>.*?</code></pre>',
|
|
r'```mermaid\s+[\s\S]*?```',
|
|
r'graph\s+TD[^;]*;'
|
|
]
|
|
|
|
logger.info("Stripping any remaining problematic elements before final fallback")
|
|
for pattern in problem_patterns:
|
|
before_len = len(final_html)
|
|
final_html = re.sub(pattern, '<p>[Diagram placeholder]</p>', final_html, flags=re.DOTALL)
|
|
after_len = len(final_html)
|
|
if before_len != after_len:
|
|
logger.info(f"Removed pattern, length before: {before_len}, after: {after_len}")
|
|
|
|
fallback_options = {
|
|
'page-size': 'Letter',
|
|
'margin-top': '0.75in',
|
|
'margin-right': '0.75in',
|
|
'margin-bottom': '0.75in',
|
|
'margin-left': '0.75in',
|
|
'encoding': 'UTF-8',
|
|
'enable-local-file-access': True
|
|
}
|
|
|
|
# Try to locate wkhtmltopdf for fallback method too
|
|
if os.path.exists(wkhtmltopdf_path):
|
|
pdfkit_config = Configuration(wkhtmltopdf=wkhtmltopdf_path)
|
|
pdfkit.from_string(f"""
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<title>Video Query Result</title>
|
|
<style>{css}</style>
|
|
</head>
|
|
<body>
|
|
{final_html}
|
|
</body>
|
|
</html>
|
|
""", pdf_path, options=fallback_options, configuration=pdfkit_config)
|
|
else:
|
|
logger.warning("Using default configuration for fallback PDF generation")
|
|
pdfkit.from_string(f"""
|
|
<!DOCTYPE html>
|
|
<html>
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<title>Video Query Result</title>
|
|
<style>{css}</style>
|
|
</head>
|
|
<body>
|
|
{final_html}
|
|
</body>
|
|
</html>
|
|
""", pdf_path, options=fallback_options)
|
|
|
|
logger.info("Fallback PDF generation succeeded")
|
|
except Exception as fallback_error:
|
|
logger.error(f"Fallback PDF generation also failed: {str(fallback_error)}")
|
|
|
|
# Read the generated PDF file
|
|
if os.path.exists(pdf_path):
|
|
with open(pdf_path, 'rb') as file:
|
|
pdf_data = file.read()
|
|
|
|
# Encode the PDF as base64
|
|
pdf_base64 = base64.b64encode(pdf_data).decode('utf-8')
|
|
logger.info(f"Encoded PDF data length: {len(pdf_base64)}")
|
|
else:
|
|
logger.error("PDF file does not exist after generation")
|
|
return jsonify({'success': False, 'message': 'PDF generation failed'}), 500
|
|
|
|
# Clean up temporary PDF generation files and temp images
|
|
try:
|
|
# Clean up PDF temp directory
|
|
for root, dirs, files in os.walk(temp_dir_for_pdf, topdown=False):
|
|
for file in files:
|
|
os.remove(os.path.join(root, file))
|
|
for dir_name in dirs: # dir is a reserved word
|
|
os.rmdir(os.path.join(root, dir_name))
|
|
os.rmdir(temp_dir_for_pdf)
|
|
logger.info(f"Cleaned up temporary PDF directory: {temp_dir_for_pdf}")
|
|
|
|
# Clean up temporary PNG and SVG files
|
|
for file in os.listdir(TEMP_PNG_DIR):
|
|
os.remove(os.path.join(TEMP_PNG_DIR, file))
|
|
for file in os.listdir(TEMP_SVG_DIR):
|
|
os.remove(os.path.join(TEMP_SVG_DIR, file))
|
|
logger.info("Cleaned up temporary PNG and SVG files")
|
|
|
|
except Exception as cleanup_error:
|
|
logger.warning(f"Could not remove all temporary files: {str(cleanup_error)}")
|
|
|
|
# Generate PDF filename from video filename
|
|
if video_file_name:
|
|
# Remove video extension and add .pdf
|
|
base_name = os.path.splitext(video_file_name)[0]
|
|
pdf_filename = f"{base_name}.pdf"
|
|
logger.info(f"Generated PDF filename from video name: {pdf_filename}")
|
|
else:
|
|
pdf_filename = 'video_query_result.pdf'
|
|
logger.info("No video filename provided, using default PDF name")
|
|
|
|
return jsonify({
|
|
'success': True,
|
|
'pdf': pdf_base64,
|
|
'filename': pdf_filename
|
|
})
|
|
|
|
except Exception as e:
|
|
import traceback
|
|
error_trace = traceback.format_exc()
|
|
logger.error(f"Error generating PDF: {str(e)}")
|
|
logger.error(error_trace)
|
|
return jsonify({
|
|
'success': False,
|
|
'message': f'An unexpected error occurred: {str(e)}'
|
|
}), 500
|
|
|
|
# Handle CORS preflight requests for all API routes
|
|
@app.route('/api/<path:path>', methods=['OPTIONS'])
def handle_options(path):
    """Answer a CORS preflight (OPTIONS) request for any URL under /api/.

    Args:
        path: The portion of the URL captured after ``/api/``. It is only
            needed so the route matches; the handler does not read it.

    Returns:
        An empty JSON response with the Access-Control-* headers listed
        below appended to it.
    """
    # Header name/value pairs, kept in the order they are appended.
    preflight_headers = (
        ('Access-Control-Allow-Origin', 'https://ai-sandbox.oliver.solutions'),
        ('Access-Control-Allow-Headers', 'Content-Type,Authorization,X-Requested-With'),
        ('Access-Control-Allow-Methods', 'GET,POST,OPTIONS'),
        ('Access-Control-Max-Age', '86400'),  # 24 hours
        ('Access-Control-Allow-Credentials', 'true'),
    )

    preflight_response = jsonify({})
    for header_name, header_value in preflight_headers:
        # .add() appends the header rather than replacing an existing one.
        preflight_response.headers.add(header_name, header_value)
    return preflight_response
|
|
|
|
# No longer need to serve frontend from the backend
|
|
# Frontend will be hosted at https://ai-sandbox.oliver.solutions/video_query
|
|
|
|
if __name__ == '__main__':
    # Local development entry point only; production deployments are
    # expected to serve the app with Hypercorn instead of this built-in server.
    app.run(port=5000, debug=True)