# ai_qc/visual_qc_apps/utils.py
# 2025-08-12 14:52:49 -05:00
#
# 55 lines
# No EOL
# 2.2 KiB
# Python
# Executable file

import os
import base64
import io
import time
from PIL import Image
import fitz # PyMuPDF
import cv2 # OpenCV for video frames
# Import from centralized LLM configuration
from llm_config import run_visual_qc, pil_image_to_base64, get_model_info
# --- Helper Functions ---
_IMAGE_EXTENSIONS = frozenset(
    {'.png', '.jpg', '.jpeg', '.bmp', '.webp', '.gif', '.tiff'}
)
_VIDEO_EXTENSIONS = frozenset(
    {'.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv'}
)


def _render_pdf_first_page(asset_path, zoom=2.0):
    """Render the first page of a PDF as an RGB PIL image (None if no pages)."""
    doc = fitz.open(asset_path)
    try:
        if doc.page_count == 0:
            return None
        page = doc.load_page(0)
        # PyMuPDF renders at 72 DPI by default, so zoom=2.0 is about 144 DPI.
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        return Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Close the document even when rendering raises.
        doc.close()


def _read_video_first_frame(asset_path):
    """Decode the first frame of a video as an RGB PIL image (None on failure)."""
    cap = cv2.VideoCapture(asset_path)
    try:
        if not cap.isOpened():
            return None
        ret, frame = cap.read()
        if not ret:
            return None
        # OpenCV delivers frames in BGR channel order; PIL expects RGB.
        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture handle even when decoding raises.
        cap.release()


def get_image_from_asset(asset_path, target_size=(1024, 1024)):
    """
    Load a preview image from an image, PDF, or video asset.

    The asset type is chosen from the file extension (case-insensitive).
    For PDFs the first page is rendered; for videos the first frame is
    decoded; images are opened directly. The result is converted to RGB.

    Args:
        asset_path: Path to the asset file.
        target_size: ``(max_width, max_height)`` bound. The image is shrunk
            in place to fit within it while keeping its aspect ratio; images
            already within the bound are left at their size. Pass ``None``
            to skip resizing.

    Returns:
        A PIL ``Image`` in RGB mode, or ``None`` when the extension is not
        supported, the asset yields no page/frame, or any error occurs while
        loading. Failures are reported via ``print`` rather than raised.
    """
    try:
        file_extension = os.path.splitext(asset_path)[1].lower()
        pil_image = None

        if file_extension in _IMAGE_EXTENSIONS:
            # convert() returns an independent, fully loaded image, so the
            # underlying file can be closed as soon as the block exits.
            with Image.open(asset_path) as source_image:
                pil_image = source_image.convert('RGB')
        elif file_extension == '.pdf':
            pil_image = _render_pdf_first_page(asset_path)
        elif file_extension in _VIDEO_EXTENSIONS:
            pil_image = _read_video_first_frame(asset_path)

        if pil_image is None:
            print(f"Unsupported file type or error loading: {asset_path}")
            return None

        if target_size is not None:
            # Shrink images that are too large for the API; thumbnail()
            # preserves aspect ratio and never enlarges.
            pil_image.thumbnail(target_size, Image.Resampling.LANCZOS)
        return pil_image
    except Exception as e:
        # Deliberate best-effort boundary: callers receive None on failure.
        print(f"Error processing asset {asset_path}: {e}")
        return None