hm_ai_qc_report_tool/modules/campaigns/pricing_parser.py
nickviljoen 9c33858726 Add campaign presentation management and global pricing reference
Introduces a new Campaigns module for uploading campaign presentation PDFs
that QC checks reference to validate assets against campaign-specific
guidelines (typography, layout, copy, pricing format). Also adds a global
pricing reference system that maps country codes to currency symbols and
formats for deterministic price/currency validation.

- New CampaignPresentation model + campaigns blueprint with CRUD routes
- PDF parsing via LlamaParse (text + multimodal page images)
- Global pricing PDF parsed into structured JSON lookup
- Campaign context injected into both image and video QC executors
- Quality checks enhanced with campaign guidelines in LLM prompts
- Price/currency check uses global pricing lookup (saves an LLM call)
- Campaign dropdown added to HM QC and Video QC configure pages

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-26 16:12:22 +02:00

171 lines
5.5 KiB
Python

"""
Global Pricing Reference Parser.
Parses the global pricing PDF into a structured JSON lookup
mapping country/language codes to currency information.
"""
import os
import json
import logging
logger = logging.getLogger(__name__)
def _build_pricing_prompt(extracted_text: str) -> str:
    """Return the LLM prompt asking for the pricing text as a JSON lookup table."""
    # NOTE: the literal below is sent to the model verbatim, so its lines are
    # intentionally not indented along with the surrounding code.
    return f"""Parse this pricing reference document and extract a structured lookup table
mapping each country/region language code to its currency information.
The document contains countries, country codes, and pricing information.
IMPORTANT:
- Use language-country code format (e.g., "en-GB", "de-DE", "es-ES", "el-GR", "tr-TR")
- Also include the 2-letter country code as an alternate key (e.g., "GB", "DE", "ES")
- "position" should be "before" if symbol comes before the price (e.g., $29.99)
or "after" if symbol comes after the price (e.g., 29,99 EUR)
- "decimal_separator" should be "." or ","
- "thousands_separator" should be "," or "." or " " (space)
Return ONLY valid JSON (no markdown fences, no explanation) in this exact format:
{{
"en-GB": {{
"country": "United Kingdom",
"currency_code": "GBP",
"symbol": "\\u00a3",
"position": "before",
"decimal_separator": ".",
"thousands_separator": ",",
"format_example": "\\u00a329.99"
}},
"de-DE": {{
"country": "Germany",
"currency_code": "EUR",
"symbol": "\\u20ac",
"position": "after",
"decimal_separator": ",",
"thousands_separator": ".",
"format_example": "29,99 \\u20ac"
}}
}}
Include ALL countries/regions mentioned in the document.
DOCUMENT TEXT:
---
{extracted_text}
---"""


def _strip_markdown_fences(response_text: str) -> str:
    """Return *response_text* without a surrounding Markdown code fence, if any."""
    text = response_text.strip()
    if text.startswith('```'):
        # Drop the whole opening fence line; it may carry a language tag
        # such as ```json.
        text = text.split('\n', 1)[1] if '\n' in text else text[3:]
    if text.endswith('```'):
        text = text[:-3]
    return text.strip()


def parse_global_pricing_pdf(pdf_path: str, json_output_path: str) -> dict:
    """
    Parse a global pricing reference PDF into structured JSON.

    Extracts text from the PDF with LlamaParse, asks an LLM (gpt-4o) to
    convert it into a lookup table mapping country/language codes to their
    currency symbol, position, and format, then writes that table to
    ``json_output_path`` as UTF-8 JSON.

    Args:
        pdf_path: Path to the global pricing PDF
        json_output_path: Path to write the structured JSON output. Its
            parent directory is created if missing; a bare file name
            (no directory component) is written to the current directory.

    Returns:
        dict mapping language codes to currency info

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing file.
        RuntimeError: If no text could be extracted from the PDF, or the
            LLM response is not valid JSON or is not a JSON object.

    Errors raised by the PDF parser, the LLM client, or the final file write
    are not caught here and propagate to the caller.
    """
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"Pricing PDF not found at {pdf_path}")

    logger.info(f"Parsing global pricing PDF: {pdf_path}")

    # Lazy imports — only needed at parse time
    import nest_asyncio
    nest_asyncio.apply()
    from llama_parse import LlamaParse
    from core.services.llm_config import LLMConfig

    # 1) Extract text from PDF
    parser = LlamaParse(
        result_type="text",
        add_page_breaks=False,
        parsing_instruction=(
            "Extract all text from this pricing reference document. "
            "Pay special attention to country names, country codes, "
            "language codes, currency symbols, currency codes, and price formats."
        ),
        premium_mode=False,
    )
    documents = parser.load_data(pdf_path)
    if not documents:
        raise RuntimeError("No text extracted from pricing PDF")

    extracted_text = "\n".join(doc.text for doc in documents)
    if not extracted_text.strip():
        # Documents came back but carry no content: fail before spending an
        # LLM call on an empty prompt.
        raise RuntimeError("No text extracted from pricing PDF")
    logger.info(f"Extracted {len(extracted_text)} chars from pricing PDF")

    # 2) Use LLM to structure the data
    prompt = _build_pricing_prompt(extracted_text)

    # Use OpenAI client directly for text-only call (no image needed)
    client = LLMConfig.get_client('openai', 'gpt-4o')
    api_response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}],
        max_tokens=8192
    )
    response_text = api_response.choices[0].message.content or ''

    # Log usage (best effort: a logging failure must not abort the parse)
    try:
        from core.models.usage_log import UsageLog
        UsageLog.log_call(
            provider='openai',
            model='gpt-4o',
            tokens=getattr(api_response.usage, 'total_tokens', None) if api_response.usage else None,
            module='campaigns',
            check_name='pricing_parser',
            success=True
        )
    except Exception as log_err:
        logger.warning(f"Failed to log usage: {log_err}")

    # 3) Parse response as JSON
    try:
        pricing_data = json.loads(_strip_markdown_fences(response_text))
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse LLM response as JSON: {e}")
        logger.debug(f"Response was: {response_text[:500]}")
        raise RuntimeError(f"LLM response was not valid JSON: {e}") from e

    if not isinstance(pricing_data, dict):
        # Valid JSON of the wrong shape (e.g. a list) would otherwise be
        # cached and handed to callers that expect a code -> info mapping.
        raise RuntimeError(
            f"LLM response was not a JSON object (got {type(pricing_data).__name__})"
        )

    # 4) Save to disk
    output_dir = os.path.dirname(json_output_path)
    if output_dir:
        # dirname() is '' for a bare file name, and os.makedirs('') raises.
        os.makedirs(output_dir, exist_ok=True)
    with open(json_output_path, 'w', encoding='utf-8') as f:
        json.dump(pricing_data, f, indent=2, ensure_ascii=False)

    logger.info(f"Global pricing reference saved: {len(pricing_data)} entries -> {json_output_path}")
    return pricing_data
def load_global_pricing(json_path: str = None) -> dict:
    """
    Load the cached global pricing reference from disk.

    This function is best-effort: a missing, unreadable, wrongly encoded,
    malformed, or wrongly shaped cache file yields an empty dict instead of
    an exception.

    Args:
        json_path: Path to the JSON file. If None, the relative path
            'storage/reference/global_pricing.json' is used.

    Returns:
        dict mapping language codes to currency info, or empty dict if not available
    """
    if json_path is None:
        json_path = 'storage/reference/global_pricing.json'

    if not os.path.exists(json_path):
        return {}

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            pricing_data = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError and the
        # UnicodeDecodeError raised for a file that is not valid UTF-8.
        logger.error(f"Failed to load global pricing: {e}")
        return {}

    if not isinstance(pricing_data, dict):
        # Valid JSON of the wrong shape (e.g. a list) is treated like a
        # corrupt cache so callers can rely on getting a mapping.
        logger.error(
            f"Failed to load global pricing: expected a JSON object, got {type(pricing_data).__name__}"
        )
        return {}

    return pricing_data