"""Heuristic page classifier for Boots Production Packs.

Tags each page of a multi-page production pack with a `page_type` so the
dispatcher can:

  • run the same QC checks on every page (the user wants cover-page logo /
    typography / brand-name compliance to be QC'd, not skipped)
  • flag non-artwork pages as exempt from the strict-grade override (a cover
    page with no offer roundel shouldn't tank the overall grade just because
    `boots_offer_mechanics` finds nothing to evaluate)

Categories observed across the 10 sample packs in
`/Users/nickviljoen/Desktop/AI_QC_Bitbucket/boots/PPacks to Test/`:

  cover     — title page: "Production Pack / P7B IMODIUM / 21.11.25"
  checklist — asset suitability tick-list ("Asset suitable", "Fonts present", ...)
  palette   — creative-guidance colour spec page (CMYK / RGB / Hex blocks)
  notes     — copywriter / yellow-notes commentary page
  artwork   — actual ad layout (default — anything that isn't one of the above)

Only `artwork` counts towards strict-grade Pass/Fail. Everything else is
informational.

The classifier never raises — unknown pages default to `artwork` because
false positives there are recoverable (an extra check on a cover page just
produces an N/A score) whereas missing artwork is not.
"""

from __future__ import annotations

import re
from typing import Dict, List

# Every label `classify_page` can return; 'artwork' is also the fallback.
PAGE_TYPES = ('cover', 'checklist', 'palette', 'notes', 'artwork')

# ── Regex patterns derived from observed text on the 10 sample packs ──────────
# These are intentionally conservative — they only fire when the page is
# unambiguously one of the admin types. Anything fuzzy falls through to artwork.
# Cover-page markers: all three must be present on a sparse page (see step 5
# of `classify_page`).
_RE_PRODUCTION_PACK_TITLE = re.compile(r'\bProduction Pack\b', re.IGNORECASE)
_RE_VERSION_LINE = re.compile(
    r'^\s*Version\s+\d+\s*$',
    re.IGNORECASE | re.MULTILINE,
)
_RE_JOB_NUMBER = re.compile(r'\bJob number:\s*\d', re.IGNORECASE)

# Asset checklist markers — the suitability tick-list.
# Matched as case-sensitive substrings.
_CHECKLIST_TOKENS = (
    'Asset suitable',
    'Assets suitable',
    'Fonts present',
    'Resolution fine',
    'Print ready asset',
    'Asset low res',
    'Supplied visual',
    'More info required',
    'Content complete',
    'Colours resolved',
    'IS PACK A',  # appears on No7 Bundle Wobbler P9 cover/checklist combo
)

# Creative guidance / colour palette markers (case-sensitive substrings).
_PALETTE_TOKENS = ('CMYK', 'RGB', 'Hexadecimal')
_RE_HEX_COLOUR = re.compile(r'#[0-9a-fA-F]{6}\b')

# Copywriter / yellow-notes markers
_RE_YELLOW_NOTES = re.compile(r'\bYellow Notes\b', re.IGNORECASE)
_RE_CLIENT_QUERIES = re.compile(r'\bCLIENT QUERIES\b', re.IGNORECASE)

# Strong artwork signals — text that only appears on real ad layouts.
# Used to override notes/checklist classification when an artwork page
# happens to include a Yellow Notes footer or a supplied-assets sidebar.
_RE_PRICE = re.compile(r'£\s*\d')
# NOTE: `BUY\s+\d+` (not `\d`) — with a single `\d` the trailing `\b` rejects
# multi-digit quantities such as "Buy 12", because there is no word boundary
# between two digits.
_RE_OFFER_MECHANIC = re.compile(
    r'\b(?:3\s*FOR\s*2|BOGOF|2\s*FOR\s*£|BUY\s+\d+|FREE|GIFT|MULTIBUY)\b',
    re.IGNORECASE,
)
_RE_OFFER_DATES = re.compile(r'\bOffer valid (?:from|on)\b', re.IGNORECASE)
_RE_CLICK_COLLECT = re.compile(r'\bClick\s*&\s*Collect\b', re.IGNORECASE)
_RE_GSL_BARCODE = re.compile(r'\bGSL\b\s*\d', re.IGNORECASE)

# Every pattern that, on its own, marks a page as real artwork.
_ARTWORK_SIGNAL_PATTERNS = (
    _RE_PRICE,
    _RE_OFFER_MECHANIC,
    _RE_OFFER_DATES,
    _RE_CLICK_COLLECT,
    _RE_GSL_BARCODE,
)


def _has_artwork_signals(text: str) -> bool:
    """True if the page contains at least one strong artwork-only signal."""
    return any(rx.search(text) for rx in _ARTWORK_SIGNAL_PATTERNS)


def _page_text(page) -> str:
    """Return the stripped `raw_text` of *page*, or '' when it is unusable.

    Tolerates a missing/None page, an object without `.get`, and a
    `raw_text` value that is absent, None or not a `str`, so that the
    classifier can fall back to its default instead of raising.
    """
    getter = getattr(page, 'get', None)
    if not callable(getter):
        return ''
    raw = getter('raw_text')
    return raw.strip() if isinstance(raw, str) else ''


def classify_page(page: Dict) -> str:
    """Classify a single ingested page dict.

    Reads the page's `raw_text` and returns one of 'cover', 'checklist',
    'palette', 'notes' or 'artwork'.

    Decision order:
      1. Strong palette match (multi-token + hex colours) → palette
      2. Strong checklist match (≥3 suitability tokens)   → checklist
      3. Pages that pass _has_artwork_signals()           → artwork
         (catches Maybelline p5 that has Yellow Notes footer + T&Cs)
      4. Yellow Notes / Client Queries with no artwork signals → notes
      5. Sparse Production Pack title block               → cover
         (covers brief / context too)
      6. Fallthrough                                      → artwork

    Strong palette/checklist precede artwork signals because some palette
    and checklist pages render thumbnail previews of the artwork (e.g.
    Nicorette's "£3 off offer ellipse" sample on the palette page) that
    otherwise look like artwork.

    Defaults to 'artwork' on any uncertainty — false positives there are
    cheap (an N/A score on a non-applicable check) whereas missing artwork
    means a real compliance issue would slip through. The same default is
    returned, rather than an exception raised, when `page` is None / not a
    mapping or its `raw_text` is missing, empty or not a string.
    """
    text = _page_text(page)
    if not text:
        return 'artwork'

    # ── 1. Palette ────────────────────────────────────────────────────────
    palette_tokens_found = sum(1 for tok in _PALETTE_TOKENS if tok in text)
    hex_count = len(_RE_HEX_COLOUR.findall(text))
    if palette_tokens_found >= 3 and hex_count >= 2:
        return 'palette'
    # Looser palette match for short palette pages with only one swatch
    if (
        palette_tokens_found >= 2
        and hex_count >= 1
        and 'COLOUR PALETTE' in text.upper()
    ):
        return 'palette'

    # ── 2. Checklist ──────────────────────────────────────────────────────
    checklist_tokens_found = sum(1 for tok in _CHECKLIST_TOKENS if tok in text)
    if checklist_tokens_found >= 3:
        return 'checklist'

    # ── 3. Artwork signals (T&Cs, offer mechanics, prices) ─────────────────
    if _has_artwork_signals(text):
        return 'artwork'

    # ── 4. Notes / client queries ─────────────────────────────────────────
    if _RE_YELLOW_NOTES.search(text) or _RE_CLIENT_QUERIES.search(text):
        return 'notes'

    # ── 5. Cover / brief / context page ───────────────────────────────────
    # Only non-blank lines count towards the "sparse page" limit.
    line_count = sum(1 for ln in text.splitlines() if ln.strip())
    if (
        line_count <= 15
        and _RE_PRODUCTION_PACK_TITLE.search(text)
        and _RE_VERSION_LINE.search(text)
        and _RE_JOB_NUMBER.search(text)
    ):
        return 'cover'

    # ── 6. Fallthrough ────────────────────────────────────────────────────
    return 'artwork'


def classify_pages(pages: List[Dict]) -> List[str]:
    """Classify every page in an ingest_pdf result. Returns parallel list.

    A None / empty `pages` yields an empty list instead of raising.
    """
    return [classify_page(p) for p in (pages or [])]


def is_artwork(page_type: str) -> bool:
    """Convenience predicate for the strict-grade override."""
    return page_type == 'artwork'


# Human-readable labels for report rendering.
PAGE_TYPE_LABELS = {
    'cover': 'Cover',
    'checklist': 'Asset Checklist',
    'palette': 'Creative Guidance / Colour Palette',
    'notes': 'Yellow Notes',
    'artwork': 'Artwork',
}