ai_qc/backend/localization_processor.py
nickviljoen d80a9fc9cf Add localization matrix support via reference asset upload
- New localization_processor.py: parses Excel localization matrices with
  MESSAGE A/B sections, extracting expected headline, dates, logo, legal
  per country
- Excel files uploaded as reference assets are auto-detected and parsed
  as localization matrices if they contain MESSAGE A/B structure
- During analysis, cross-references media plan creative_name (Message A/B)
  and country with parsed matrix to inject expected copy into QC prompts
- LLM checks can now verify asset text matches the correct message version
  and market localization

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-31 17:54:24 +02:00

316 lines
10 KiB
Python

"""
Localization Matrix Processor
Parses Excel localization matrices (e.g., Amazon Prime Day DOOH/Print)
into structured JSON for cross-referencing with media plan data during QC analysis.
The matrix contains expected copy (headline, dates, logo, legal) per message type
(Message A, Message B) and country.
"""
import os
import re
from datetime import datetime
from typing import Dict, Optional
try:
import openpyxl
except ImportError:
openpyxl = None
def parse_localization_matrix(excel_path: str) -> Optional[Dict]:
    """
    Parse a localization matrix Excel file into structured JSON.

    Expected structure (checked on every worksheet):
    - Row with "MESSAGE A" in column A starts the Message A section
    - Row with "MESSAGE B" in column A starts the Message B section
    - Country codes are read from columns B onwards of the first message
      header row (B=first country, C=second, etc.)
    - Content rows: Tagline (2/3/4/5 lines), Dates, Logo, CTA, Legal Disclaimer

    When the same message name is found on several sheets, the sections are
    merged per country and per field (a later sheet only overrides the
    individual fields it actually provides).

    Args:
        excel_path: Path to the Excel workbook.

    Returns:
        Dict with keys 'messages', 'countries', 'source_file', 'parsed_at'
        and 'sheet_names', or None if openpyxl is missing, the file cannot be
        opened, or no message section was found.
    """
    if openpyxl is None:
        print("openpyxl not installed, cannot parse localization matrix")
        return None

    try:
        wb = openpyxl.load_workbook(excel_path, data_only=True)
    except Exception as e:
        # Best-effort boundary: any unreadable upload is simply "not a matrix".
        print(f"Failed to open localization matrix: {e}")
        return None

    result = {
        'messages': {},
        'countries': [],
        'source_file': os.path.basename(excel_path),
        'parsed_at': datetime.now().isoformat(),
    }

    # Try each sheet
    for sheet_name in wb.sheetnames:
        parsed = _parse_sheet(wb[sheet_name])
        if not parsed or not parsed.get('messages'):
            continue

        # Merge per message -> country -> field. A plain dict.update() here
        # would replace a whole message section and drop countries that were
        # parsed from an earlier sheet.
        for msg_name, msg_data in parsed['messages'].items():
            merged_msg = result['messages'].setdefault(msg_name, {})
            for country, fields in msg_data.items():
                merged_msg.setdefault(country, {}).update(fields)

        # Merge countries, keeping first-seen order and no duplicates
        for c in parsed.get('countries', []):
            if c not in result['countries']:
                result['countries'].append(c)

    if not result['messages']:
        print(f"No message sections found in {excel_path} — not a localization matrix")
        return None

    result['sheet_names'] = wb.sheetnames
    return result
def _parse_sheet(ws) -> Optional[Dict]:
    """Parse a single worksheet for message sections.

    Column A is scanned for cells containing "MESSAGE A" / "MESSAGE B"; each
    hit starts a section that runs until the next hit or the end of the sheet.
    Country codes are taken from columns B onwards of the first message
    header row and are reused for every section on the sheet.

    Args:
        ws: Worksheet object exposing ``iter_rows`` and ``max_row``.

    Returns:
        ``{'messages': {...}, 'countries': [...]}`` or None when the sheet has
        no message header or no country codes in the header row.
    """
    # Read all rows into memory (cells, not bare values)
    all_rows = list(ws.iter_rows(min_row=1, max_row=ws.max_row, values_only=False))

    # Find message sections by scanning column A
    message_starts = []
    for idx, row in enumerate(all_rows):
        a_val = str(row[0].value or '').strip().upper()
        if 'MESSAGE A' in a_val:
            message_starts.append((idx, 'Message A'))
        elif 'MESSAGE B' in a_val:
            message_starts.append((idx, 'Message B'))
    if not message_starts:
        return None

    # Countries are in columns B onwards in the same row as the first message
    # header. _parse_message_section maps list position i to worksheet column
    # i + 1, so empty header cells are kept as None placeholders; dropping
    # them would shift every later country onto the wrong column.
    first_msg_row = all_rows[message_starts[0][0]]
    column_countries = []
    for cell in first_msg_row[1:]:  # Skip column A
        val = str(cell.value or '').strip()
        column_countries.append(val if val and val != 'None' else None)
    while column_countries and column_countries[-1] is None:
        column_countries.pop()  # trailing blanks carry no information

    countries = [c for c in column_countries if c is not None]
    if not countries:
        return None

    # Parse each message section
    messages = {}
    for i, (start_idx, msg_name) in enumerate(message_starts):
        # Section ends at the next message header, or at the end of the sheet
        if i + 1 < len(message_starts):
            end_idx = message_starts[i + 1][0]
        else:
            end_idx = len(all_rows)
        msg_data = _parse_message_section(all_rows, start_idx, end_idx, column_countries)
        if msg_data:
            # Discard anything collected under a blank-column placeholder
            msg_data.pop(None, None)
        if msg_data:
            messages[msg_name] = msg_data

    return {'messages': messages, 'countries': countries}
def _parse_message_section(all_rows, start_idx, end_idx, countries) -> Dict:
    """Parse a single message section (Message A or Message B).

    Args:
        all_rows: Sequence of worksheet rows, each a sequence of cells with a
            ``.value`` attribute.
        start_idx: Index of the section's header row (the header itself is
            not parsed as content).
        end_idx: Index one past the last row of the section.
        countries: Country codes where position i corresponds to worksheet
            column i + 1 (column A holds the row label). Falsy entries are
            treated as "no country in this column" and skipped.

    Returns:
        ``{country: {field_name: text}}`` containing only non-empty cells of
        rows whose label is recognised by ``_classify_row_label``.
    """
    # Cells holding only a dash mean "not applicable for this market"
    empty_markers = ('-', '\u2013', '\u2014')
    msg_data = {}

    # Slicing clamps at the end of the sheet, so no explicit bounds check
    for row in all_rows[start_idx + 1:end_idx]:
        if not row:
            continue
        # Normalise non-breaking spaces in the label the same way as in the
        # values, otherwise e.g. "2\xa0lines" is not recognised downstream.
        label = str(row[0].value or '').replace('\xa0', ' ').strip().lower()
        if not label:
            continue

        # Determine the field type from the label
        field_name = _classify_row_label(label)
        if not field_name:
            continue

        # Extract values per country; zip stops at the shorter of the two,
        # which covers rows that have fewer cells than there are countries.
        for country, cell in zip(countries, row[1:]):
            if not country or cell.value is None:
                continue
            text = str(cell.value).replace('\xa0', ' ').strip()
            if text and text not in empty_markers:
                msg_data.setdefault(country, {})[field_name] = text

    return msg_data
def _classify_row_label(label: str) -> Optional[str]:
"""Classify a row label into a standardized field name."""
label_lower = label.lower()
if 'tagline' in label_lower or 'headline' in label_lower:
if '2 line' in label_lower:
return 'tagline_2_lines'
elif '3 line' in label_lower:
return 'tagline_3_lines'
elif '4 line' in label_lower:
return 'tagline_4_lines'
elif '5 line' in label_lower:
return 'tagline_5_lines'
else:
return 'tagline'
elif 'date' in label_lower:
return 'dates'
elif 'logo' in label_lower:
return 'logo'
elif 'cta' in label_lower:
return 'cta'
elif 'legal' in label_lower or 'disclaimer' in label_lower:
return 'legal_disclaimer'
elif 'urgency' in label_lower:
return 'urgency_messaging'
elif 'event' in label_lower:
return None # Skip event name row
else:
return None
def build_localization_context(parsed_matrix: Dict, message_type: str, country: str) -> str:
    """
    Build a context string for QC prompt injection from the localization matrix.

    Args:
        parsed_matrix: Parsed localization matrix data (from parse_localization_matrix)
        message_type: "Message A" or "Message B" (from media plan creative_name).
            Matching ignores case, treats runs of whitespace/underscores/hyphens
            as one space and reads "MSG" as "MESSAGE"; a matrix key contained in
            the value (e.g. "Live1_Message A") also matches.
        country: Country code e.g. "IE", "DE", "IT" (from media plan),
            matched case-insensitively.

    Returns:
        Context string to inject into QC prompts, or empty string if no match.
    """
    if not parsed_matrix or not message_type or not country:
        return ""

    def _norm(text) -> str:
        # Canonical form used only for comparing message names
        collapsed = re.sub(r'[\s_\-]+', ' ', str(text).strip().upper())
        return re.sub(r'\bMSG\b', 'MESSAGE', collapsed)

    wanted_msg = _norm(message_type)
    wanted_country = str(country).strip().upper()
    # A whitespace-only value normalises to "", which is a substring of every
    # key and would otherwise silently select the first message.
    if not wanted_msg or not wanted_country:
        return ""

    messages = parsed_matrix.get('messages') or {}
    normalized_keys = [(key, _norm(key)) for key in messages]

    # Exact (normalized) match first ...
    msg_key = next((key for key, norm in normalized_keys if norm == wanted_msg), None)
    # ... then partial match: "Message A" in "Live1_Message A" etc.
    if msg_key is None:
        msg_key = next(
            (key for key, norm in normalized_keys
             if norm and (norm in wanted_msg or wanted_msg in norm)),
            None,
        )
    if msg_key is None:
        return ""

    # Find country data (case-insensitive)
    country_data = None
    for c_key, c_data in messages[msg_key].items():
        if str(c_key).strip().upper() == wanted_country:
            country_data = c_data
            break
    if not country_data:
        return ""

    # Build the context string
    lines = [
        "\n=== LOCALIZATION MATRIX — EXPECTED COPY ===",
        f"Message Type: {msg_key}",
        f"Country/Market: {country}",
        "",
        "The following is the EXPECTED copy for this asset based on the localization matrix.",
        "Verify the asset contains the correct text matching one of these variants:",
        "",
    ]

    # Add tagline variants
    tagline_fields = [
        ('tagline_2_lines', 'Expected Headline (2-line layout)'),
        ('tagline_3_lines', 'Expected Headline (3-line layout)'),
        ('tagline_4_lines', 'Expected Headline (4-line layout)'),
        ('tagline_5_lines', 'Expected Headline (5-line layout)'),
        ('tagline', 'Expected Headline'),
    ]
    has_tagline = False
    for field, label in tagline_fields:
        val = country_data.get(field)
        if val:
            # Show line breaks clearly (Excel cells may carry \r\n or \r)
            display_val = (
                str(val).replace('\r\n', '\n').replace('\r', '\n').replace('\n', ' / ')
            )
            lines.append(f"{label}: {display_val}")
            has_tagline = True
    if has_tagline:
        lines.append("")

    # Remaining single-value fields, in the order they appear in the prompt
    other_fields = [
        ('dates', 'Expected Dates'),
        ('logo', 'Expected Logo'),
        ('cta', 'Expected CTA'),
        ('legal_disclaimer', 'Expected Legal Disclaimer'),
        ('urgency_messaging', 'Urgency Messaging (alternative to dates)'),
    ]
    for field, label in other_fields:
        val = country_data.get(field)
        if val:
            lines.append(f"{label}: {val}")

    lines.append("")
    lines.append("IMPORTANT: Cross-check the text visible in the asset against the expected copy above.")
    lines.append("The headline should match one of the tagline variants for this market and message type.")
    lines.append("Flag any text that doesn't match — wrong language, wrong message version, missing elements, or incorrect copy.")
    lines.append("=== END LOCALIZATION MATRIX ===\n")
    return "\n".join(lines)
def detect_message_type(creative_name: str) -> Optional[str]:
    """
    Detect if a creative_name from the media plan maps to a message type.

    The name matches when it contains "MESSAGE" followed by whitespace,
    underscores or hyphens and a single letter A, B or C that is not the
    start of a longer word (so "Message About ..." is not Message A).
    Matching is case-insensitive.

    Args:
        creative_name: Creative name from the media plan; may be empty/None.

    Returns:
        "Message A", "Message B" or "Message C", or None if no match. When
        several message markers occur, the alphabetically first letter wins.
    """
    if not creative_name:
        return None

    name_upper = str(creative_name).strip().upper()
    # The negative lookahead rejects letters only, so suffixes like
    # "Message A_v2" or "Message A1" are still recognised.
    letters = re.findall(r'MESSAGE[\s_\-]+([A-C])(?![A-Z])', name_upper)
    if not letters:
        return None
    return f'Message {min(letters)}'