Introduces a new Campaigns module for uploading campaign presentation PDFs that QC checks reference to validate assets against campaign-specific guidelines (typography, layout, copy, pricing format). Also adds a global pricing reference system that maps country codes to currency symbols and formats for deterministic price/currency validation. - New CampaignPresentation model + campaigns blueprint with CRUD routes - PDF parsing via LlamaParse (text + multimodal page images) - Global pricing PDF parsed into structured JSON lookup - Campaign context injected into both image and video QC executors - Quality checks enhanced with campaign guidelines in LLM prompts - Price/currency check uses global pricing lookup (saves an LLM call) - Campaign dropdown added to HM QC and Video QC configure pages Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
171 lines
5.5 KiB
Python
171 lines
5.5 KiB
Python
"""
|
|
Global Pricing Reference Parser.
|
|
|
|
Parses the global pricing PDF into a structured JSON lookup
|
|
mapping country/language codes to currency information.
|
|
"""
|
|
import os
|
|
import json
|
|
import logging
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _build_pricing_prompt(extracted_text: str) -> str:
    """Return the LLM prompt asking for the pricing lookup table as JSON.

    The literal braces of the JSON example are doubled because this is an
    f-string; only ``extracted_text`` is interpolated.
    """
    # NOTE(review): leading whitespace inside this prompt could not be
    # recovered from the reviewed source view; the text is reproduced as seen.
    return f"""Parse this pricing reference document and extract a structured lookup table
mapping each country/region language code to its currency information.

The document contains countries, country codes, and pricing information.

IMPORTANT:
- Use language-country code format (e.g., "en-GB", "de-DE", "es-ES", "el-GR", "tr-TR")
- Also include the 2-letter country code as an alternate key (e.g., "GB", "DE", "ES")
- "position" should be "before" if symbol comes before the price (e.g., $29.99)
or "after" if symbol comes after the price (e.g., 29,99 EUR)
- "decimal_separator" should be "." or ","
- "thousands_separator" should be "," or "." or " " (space)

Return ONLY valid JSON (no markdown fences, no explanation) in this exact format:
{{
"en-GB": {{
"country": "United Kingdom",
"currency_code": "GBP",
"symbol": "\\u00a3",
"position": "before",
"decimal_separator": ".",
"thousands_separator": ",",
"format_example": "\\u00a329.99"
}},
"de-DE": {{
"country": "Germany",
"currency_code": "EUR",
"symbol": "\\u20ac",
"position": "after",
"decimal_separator": ",",
"thousands_separator": ".",
"format_example": "29,99 \\u20ac"
}}
}}

Include ALL countries/regions mentioned in the document.

DOCUMENT TEXT:
---
{extracted_text}
---"""


def _strip_markdown_fences(response_text: str) -> str:
    """Return ``response_text`` without a surrounding Markdown code fence."""
    text = response_text.strip()
    if text.startswith('```'):
        # Drop the whole opening fence line (it may carry a language tag such
        # as "```json"); with no newline at all, drop just the backticks.
        _, newline, remainder = text.partition('\n')
        text = remainder if newline else text[3:]
    if text.endswith('```'):
        text = text[:-3]
    return text.strip()


def parse_global_pricing_pdf(pdf_path: str, json_output_path: str) -> dict:
    """
    Parse a global pricing reference PDF into structured JSON.

    Extracts text from the PDF with LlamaParse, asks an LLM (gpt-4o) to turn
    it into a lookup table mapping country/language codes to their currency
    symbol, position, and format, then writes that table to disk as JSON.

    Args:
        pdf_path: Path to the global pricing PDF
        json_output_path: Path to write the structured JSON output

    Returns:
        dict mapping language codes to currency info

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing file.
        RuntimeError: If no text could be extracted from the PDF, or the LLM
            response is not valid JSON or is not a JSON object.
    """
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"Pricing PDF not found at {pdf_path}")

    logger.info(f"Parsing global pricing PDF: {pdf_path}")

    # Lazy imports — only needed at parse time
    import nest_asyncio
    # NOTE(review): presumably needed so LlamaParse's blocking load_data()
    # can run when an event loop is already active — confirm.
    nest_asyncio.apply()
    from llama_parse import LlamaParse
    from core.services.llm_config import LLMConfig

    # 1) Extract text from PDF
    parser = LlamaParse(
        result_type="text",
        add_page_breaks=False,
        parsing_instruction=(
            "Extract all text from this pricing reference document. "
            "Pay special attention to country names, country codes, "
            "language codes, currency symbols, currency codes, and price formats."
        ),
        premium_mode=False,
    )

    documents = parser.load_data(pdf_path)
    if not documents:
        raise RuntimeError("No text extracted from pricing PDF")

    extracted_text = "\n".join(doc.text for doc in documents)
    if not extracted_text.strip():
        # Documents came back but carry no text: do not spend an LLM call on
        # an empty document and risk persisting an invented pricing table.
        raise RuntimeError("No text extracted from pricing PDF")
    logger.info(f"Extracted {len(extracted_text)} chars from pricing PDF")

    # 2) Use LLM to structure the data
    prompt = _build_pricing_prompt(extracted_text)

    # Use OpenAI client directly for text-only call (no image needed)
    client = LLMConfig.get_client('openai', 'gpt-4o')
    api_response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}],
        max_tokens=8192
    )
    response_text = api_response.choices[0].message.content or ''

    # Log usage. Deliberately best-effort: a failure to record usage must not
    # abort the parse, so any error here is only reported as a warning.
    try:
        from core.models.usage_log import UsageLog
        UsageLog.log_call(
            provider='openai',
            model='gpt-4o',
            tokens=getattr(api_response.usage, 'total_tokens', None) if api_response.usage else None,
            module='campaigns',
            check_name='pricing_parser',
            success=True
        )
    except Exception as log_err:
        logger.warning(f"Failed to log usage: {log_err}")

    # 3) Parse response as JSON (the model may wrap it in markdown fences
    # despite the prompt asking it not to)
    try:
        pricing_data = json.loads(_strip_markdown_fences(response_text))
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse LLM response as JSON: {e}")
        logger.debug(f"Response was: {response_text[:500]}")
        raise RuntimeError(f"LLM response was not valid JSON: {e}") from e

    if not isinstance(pricing_data, dict):
        # Valid JSON that is not an object (e.g. a list) cannot serve as a
        # code -> currency lookup; refuse to persist it.
        raise RuntimeError(
            f"LLM response was not a JSON object: got {type(pricing_data).__name__}"
        )

    # 4) Save to disk
    output_dir = os.path.dirname(json_output_path)
    if output_dir:
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        os.makedirs(output_dir, exist_ok=True)
    with open(json_output_path, 'w', encoding='utf-8') as f:
        json.dump(pricing_data, f, indent=2, ensure_ascii=False)

    logger.info(f"Global pricing reference saved: {len(pricing_data)} entries -> {json_output_path}")
    return pricing_data
|
|
|
|
|
|
def load_global_pricing(json_path: str = None) -> dict:
    """
    Load the cached global pricing reference from disk.

    Never raises for a missing, unreadable, or malformed cache file: every
    such case yields an empty dict so callers can treat the pricing reference
    as simply unavailable.

    Args:
        json_path: Path to the JSON file. If None, falls back to the relative
            path 'storage/reference/global_pricing.json'.

    Returns:
        dict mapping language codes to currency info, or empty dict if not available
    """
    if json_path is None:
        json_path = 'storage/reference/global_pricing.json'

    # A cache that was never generated is an expected state, not an error,
    # so it is reported silently.
    if not os.path.exists(json_path):
        return {}

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            pricing_data = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError (malformed JSON) and
        # UnicodeDecodeError (file is not valid UTF-8).
        logger.error(f"Failed to load global pricing: {e}")
        return {}

    if not isinstance(pricing_data, dict):
        # Valid JSON that is not an object cannot be used as a lookup table.
        logger.error(
            f"Failed to load global pricing: expected a JSON object, "
            f"got {type(pricing_data).__name__}"
        )
        return {}

    return pricing_data
|