# hm_ai_qc_report_tool/modules/campaigns/pricing_parser.py

"""
Global Pricing Reference Parser.

Parses the global pricing PDF into a structured JSON lookup
mapping country/language codes to currency information.
"""
import os
import json
import logging

logger = logging.getLogger(__name__)


def _strip_markdown_fences(text: str) -> str:
    """Return *text* with a surrounding Markdown code fence removed, if present."""
    stripped = text.strip()
    if stripped.startswith('```'):
        # Drop the whole opening fence line (it may carry a language tag
        # such as ```json); with no newline, only the three backticks go.
        stripped = stripped.split('\n', 1)[1] if '\n' in stripped else stripped[3:]
    if stripped.endswith('```'):
        stripped = stripped[:-3]
    return stripped.strip()


def parse_global_pricing_pdf(pdf_path: str, json_output_path: str) -> dict:
    """
    Parse a global pricing reference PDF into structured JSON.

    Extracts text from the PDF with LlamaParse, then asks an LLM (gpt-4o via
    the OpenAI client obtained from LLMConfig) to convert it into a structured
    lookup table mapping country/language codes to their currency symbol,
    position, and format. The result is written to ``json_output_path`` and
    returned. The LLM call is recorded through UsageLog on a best-effort
    basis; a failure to record usage is logged and otherwise ignored.

    Args:
        pdf_path: Path to the global pricing PDF
        json_output_path: Path to write the structured JSON output. Parent
            directories are created if missing; a bare file name (no
            directory component) is written relative to the current
            working directory.

    Returns:
        dict mapping language codes to currency info

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing file.
        RuntimeError: If no text could be extracted from the PDF, or the LLM
            response is not valid JSON, or it is valid JSON but not an object.
    """
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"Pricing PDF not found at {pdf_path}")

    logger.info(f"Parsing global pricing PDF: {pdf_path}")

    # Lazy imports — only needed at parse time
    import nest_asyncio
    nest_asyncio.apply()
    from llama_parse import LlamaParse
    from core.services.llm_config import LLMConfig

    # 1) Extract text from PDF
    parser = LlamaParse(
        result_type="text",
        add_page_breaks=False,
        parsing_instruction=(
            "Extract all text from this pricing reference document. "
            "Pay special attention to country names, country codes, "
            "language codes, currency symbols, currency codes, and price formats."
        ),
        premium_mode=False,
    )

    documents = parser.load_data(pdf_path)
    if not documents:
        raise RuntimeError("No text extracted from pricing PDF")

    extracted_text = "\n".join(doc.text for doc in documents)
    logger.info(f"Extracted {len(extracted_text)} chars from pricing PDF")

    # 2) Use LLM to structure the data
    prompt = f"""Parse this pricing reference document and extract a structured lookup table
mapping each country/region language code to its currency information.

The document contains countries, country codes, and pricing information.

IMPORTANT:
- Use language-country code format (e.g., "en-GB", "de-DE", "es-ES", "el-GR", "tr-TR")
- Also include the 2-letter country code as an alternate key (e.g., "GB", "DE", "ES")
- "position" should be "before" if symbol comes before the price (e.g., $29.99)
  or "after" if symbol comes after the price (e.g., 29,99 EUR)
- "decimal_separator" should be "." or ","
- "thousands_separator" should be "," or "." or " " (space)

Return ONLY valid JSON (no markdown fences, no explanation) in this exact format:
{{
    "en-GB": {{
        "country": "United Kingdom",
        "currency_code": "GBP",
        "symbol": "\\u00a3",
        "position": "before",
        "decimal_separator": ".",
        "thousands_separator": ",",
        "format_example": "\\u00a329.99"
    }},
    "de-DE": {{
        "country": "Germany",
        "currency_code": "EUR",
        "symbol": "\\u20ac",
        "position": "after",
        "decimal_separator": ",",
        "thousands_separator": ".",
        "format_example": "29,99 \\u20ac"
    }}
}}

Include ALL countries/regions mentioned in the document.

DOCUMENT TEXT:
---
{extracted_text}
---"""

    # Use OpenAI client directly for text-only call (no image needed)
    client = LLMConfig.get_client('openai', 'gpt-4o')
    api_response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}],
        max_tokens=8192
    )
    response_text = api_response.choices[0].message.content or ''

    # Log usage. Deliberately best-effort: a bookkeeping failure must not
    # discard an LLM response that has already been paid for.
    try:
        from core.models.usage_log import UsageLog
        UsageLog.log_call(
            provider='openai',
            model='gpt-4o',
            tokens=getattr(api_response.usage, 'total_tokens', None) if api_response.usage else None,
            module='campaigns',
            check_name='pricing_parser',
            success=True
        )
    except Exception as log_err:
        logger.warning(f"Failed to log usage: {log_err}")

    # 3) Parse response as JSON (the model may wrap it in markdown fences
    #    even though the prompt asks it not to)
    try:
        pricing_data = json.loads(_strip_markdown_fences(response_text))
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse LLM response as JSON: {e}")
        logger.debug(f"Response was: {response_text[:500]}")
        # Chain the cause so the decoder's traceback is preserved.
        raise RuntimeError(f"LLM response was not valid JSON: {e}") from e

    # The declared contract is a dict keyed by language/country code; do not
    # persist a list or scalar that downstream lookups would choke on.
    if not isinstance(pricing_data, dict):
        logger.debug(f"Response was: {response_text[:500]}")
        raise RuntimeError(
            f"LLM response was valid JSON but not an object: got {type(pricing_data).__name__}"
        )

    # 4) Save to disk
    # os.path.dirname() returns '' for a bare file name, and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when there is one.
    output_dir = os.path.dirname(json_output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(json_output_path, 'w', encoding='utf-8') as f:
        json.dump(pricing_data, f, indent=2, ensure_ascii=False)

    logger.info(f"Global pricing reference saved: {len(pricing_data)} entries -> {json_output_path}")
    return pricing_data


def load_global_pricing(json_path: str = None) -> dict:
    """
    Load the cached global pricing reference from disk.

    This loader never raises for a missing, unreadable, mis-encoded,
    malformed, or wrongly-shaped cache file; every such case yields an
    empty dict (and, except for a missing file, an error log entry).

    Args:
        json_path: Path to the JSON file. If None, falls back to the
            relative path 'storage/reference/global_pricing.json'.

    Returns:
        dict mapping language codes to currency info, or empty dict if not available
    """
    if json_path is None:
        json_path = 'storage/reference/global_pricing.json'

    # A missing cache is an expected state (nothing parsed yet), so it is
    # not logged as an error.
    if not os.path.exists(json_path):
        return {}

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            pricing_data = json.load(f)
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError (malformed JSON) and
        # UnicodeDecodeError (file is not valid UTF-8).
        logger.error(f"Failed to load global pricing: {e}")
        return {}

    # Callers index the result by language/country code; anything other
    # than a JSON object is treated as an unusable cache.
    if not isinstance(pricing_data, dict):
        logger.error(
            f"Failed to load global pricing: expected a JSON object, got {type(pricing_data).__name__}"
        )
        return {}

    return pricing_data