hm_ai_qc_report_tool/modules/campaigns/services.py
nickviljoen 392e0e5864 Fix campaign upload: threading context, progress bar, auto-refresh table
- Fix background parsing thread: pass app reference explicitly instead of
  trying to access current_app inside the thread (was silently failing)
- Add progress bar with animated stages during upload and parsing
- Add data-id/data-status attributes to table rows for auto-polling
- On page load, automatically poll any pending/parsing rows and update
  their status badges in-place (fixes stale "Pending" on tab return)
- Immediately inject new row into table after upload so user sees it
  without needing to refresh
- Remove broken _parse_pricing_background function

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 18:03:13 +02:00

179 lines
6 KiB
Python

"""
Campaign PDF Parsing Service.
Handles parsing of campaign presentation PDFs using LlamaParse,
extracting both text content and page images for QC reference.
"""
import os
import logging
from datetime import datetime
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def parse_campaign_pdf(presentation_id: int, app=None):
    """
    Parse a campaign presentation PDF and store extracted content.

    Loads the CampaignPresentation row, marks it 'parsing', runs
    ``_parse_pdf`` on its ``pdf_path`` and then stores the extracted text,
    the page images directory and the parse time with status 'ready'.
    Any failure is logged and recorded on the row (status 'error' plus the
    exception text); the exception is not re-raised.

    This function must run inside an active application context. It does
    not push one itself: the caller is responsible for wrapping the call
    with ``app.app_context()``.

    Args:
        presentation_id: ID of the CampaignPresentation record to parse
        app: Flask app instance. Accepted so callers (e.g. a background
            thread) can pass it along, but it is not used by this function.

    Returns:
        None
    """
    from core.models.database import db
    from core.models.campaign_presentation import CampaignPresentation

    presentation = CampaignPresentation.query.get(presentation_id)
    if not presentation:
        logger.error(f"CampaignPresentation {presentation_id} not found")
        return

    try:
        # Commit the status change right away so it is visible while the
        # (potentially slow) parse is still running.
        presentation.status = 'parsing'
        db.session.commit()

        pdf_path = presentation.pdf_path
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"PDF not found at {pdf_path}")

        # Page images are stored in a 'pages' folder next to the PDF
        campaign_dir = os.path.dirname(pdf_path)
        pages_dir = os.path.join(campaign_dir, 'pages')
        os.makedirs(pages_dir, exist_ok=True)

        logger.info(f"Parsing campaign PDF: {pdf_path}")
        result = _parse_pdf(pdf_path, pages_dir)

        # Update record with parsed content
        presentation.parsed_content = result['extracted_text']
        presentation.page_images_dir = pages_dir
        presentation.parsed_at = datetime.utcnow()  # naive UTC timestamp
        presentation.status = 'ready'
        presentation.error_message = None  # clear any error from an earlier attempt
        db.session.commit()

        logger.info(
            f"Campaign PDF parsed successfully: {presentation.campaign_id} "
            f"({len(result['extracted_text'])} chars, "
            f"{result['image_count']} page images)"
        )
    except Exception as e:
        logger.error(f"Failed to parse campaign PDF: {e}", exc_info=True)
        try:
            # If the failure came from a flush/commit, the session refuses
            # further work until it is rolled back. Rolling back first also
            # drops any half-applied 'ready' fields so only the error state
            # is persisted.
            db.session.rollback()
            presentation.status = 'error'
            presentation.error_message = str(e)
            db.session.commit()
        except Exception:
            # Recording the failure is best-effort: log it and leave the
            # session clean rather than raising out of the error handler.
            logger.error(
                f"Could not record error status for CampaignPresentation "
                f"{presentation_id}",
                exc_info=True,
            )
            db.session.rollback()
def _parse_pdf(pdf_path: str, pages_dir: str) -> dict:
    """
    Extract the text and the page images of a PDF with LlamaParse.

    The text is joined into one string and also written to
    ``extracted_text.txt`` inside ``pages_dir``. Page images are downloaded
    into ``pages_dir`` and re-saved as ``page_NNN.jpg`` files. Image
    extraction is best-effort: if it fails, a warning is logged and the
    text result is still returned.

    Args:
        pdf_path: Path to the PDF file
        pages_dir: Directory to store extracted page images

    Returns:
        dict with 'extracted_text', 'image_count'

    Raises:
        RuntimeError: if the text parser returns no documents
    """
    # Heavy dependencies are imported lazily, only when a parse actually runs
    import nest_asyncio
    nest_asyncio.apply()
    from llama_parse import LlamaParse
    from PIL import Image

    # --- Step 1: plain text ---
    text_parser = LlamaParse(
        result_type="text",
        add_page_breaks=False,
        parsing_instruction=(
            "Extract all text from the PDF. "
            "Include all typography specifications, font names, sizes, positioning rules, "
            "copy text, and any guidelines or instructions."
        ),
        premium_mode=False,
    )
    docs = text_parser.load_data(pdf_path)
    if not docs:
        raise RuntimeError("No text found or PDF is empty")

    extracted_text = "\n".join([doc.text for doc in docs])

    # Keep a copy of the text on disk next to the page images
    with open(os.path.join(pages_dir, "extracted_text.txt"), "w", encoding="utf-8") as out:
        out.write(extracted_text)

    # --- Step 2: page images (multimodal) ---
    saved_images = 0
    try:
        image_parser = LlamaParse(
            result_type="markdown",
            add_page_breaks=False,
            parsing_instruction=(
                "Generate page images of the PDF document, "
                "including all visual layouts, mockups, and graphic examples."
            ),
            use_vendor_multimodal_model=True,
            vendor_multimodal_model_name="openai-gpt4o",
            premium_mode=False,
        )
        json_result = image_parser.get_json_result(pdf_path)
        downloaded = image_parser.get_images(json_result, download_path=pages_dir)

        # Re-save every downloaded image under a sequential page_NNN.jpg name
        for page_no, info in enumerate(downloaded, start=1):
            source = info["path"]
            target = os.path.join(pages_dir, f"page_{page_no:03d}.jpg")
            try:
                if not os.path.exists(source):
                    continue
                # Opening and converting also checks that the file is a real image
                with Image.open(source) as picture:
                    rgb = picture.convert('RGB')
                    rgb.save(target, 'JPEG', quality=90)
                # Drop the download unless it already is the target file
                if os.path.abspath(source) != os.path.abspath(target):
                    os.remove(source)
                saved_images += 1
            except Exception as exc:
                logger.warning(f"Failed to process page image {page_no}: {exc}")
    except Exception as exc:
        logger.warning(f"Multimodal parsing failed (text still available): {exc}")

    return {
        'extracted_text': extracted_text,
        'image_count': saved_images,
    }
def get_page_image_paths(page_images_dir: str) -> list:
    """
    List the page image files stored in a campaign's pages directory.

    Only entries whose name starts with ``page_`` and ends (case-insensitively)
    with .jpg, .jpeg or .png are returned; other files in the directory are
    ignored.

    Args:
        page_images_dir: Path to the pages directory

    Returns:
        Image file paths ordered by file name, or an empty list when the
        directory is unset or does not exist
    """
    if not (page_images_dir and os.path.isdir(page_images_dir)):
        return []

    image_suffixes = ('.jpg', '.jpeg', '.png')
    entries = os.listdir(page_images_dir)
    entries.sort()
    return [
        os.path.join(page_images_dir, entry)
        for entry in entries
        if entry.startswith('page_') and entry.lower().endswith(image_suffixes)
    ]