""" Localization Matrix Processor Parses Excel localization matrices (e.g., Amazon Prime Day DOOH/Print) into structured JSON for cross-referencing with media plan data during QC analysis. The matrix contains expected copy (headline, dates, logo, legal) per message type (Message A, Message B) and country. """ import os import re from datetime import datetime from typing import Dict, Optional try: import openpyxl except ImportError: openpyxl = None def parse_localization_matrix(excel_path: str) -> Optional[Dict]: """ Parse a localization matrix Excel file into structured JSON. Expected structure: - Row with "MESSAGE A" in column A starts Message A section - Row with "MESSAGE B" in column A starts Message B section - Country codes in row below the message header (B=first country, C=second, etc.) - Content rows: Tagline (2/3/4/5 lines), Dates, Logo, CTA, Legal Disclaimer Returns structured dict or None if the file doesn't look like a localization matrix. """ if openpyxl is None: print("openpyxl not installed, cannot parse localization matrix") return None try: wb = openpyxl.load_workbook(excel_path, data_only=True) except Exception as e: print(f"Failed to open localization matrix: {e}") return None result = { 'messages': {}, 'countries': [], 'source_file': os.path.basename(excel_path), 'parsed_at': datetime.now().isoformat(), } # Try each sheet for sheet_name in wb.sheetnames: ws = wb[sheet_name] parsed = _parse_sheet(ws) if parsed and parsed.get('messages'): result['messages'].update(parsed['messages']) # Merge countries for c in parsed.get('countries', []): if c not in result['countries']: result['countries'].append(c) if not result['messages']: print(f"No message sections found in {excel_path} — not a localization matrix") return None result['sheet_names'] = wb.sheetnames return result def _parse_sheet(ws) -> Optional[Dict]: """Parse a single worksheet for message sections.""" messages = {} countries = [] # Read all rows into memory all_rows = [] for row in ws.iter_rows(min_row=1, max_row=ws.max_row, values_only=False): all_rows.append(row) # Find message sections by scanning column A message_starts = [] for idx, row in enumerate(all_rows): a_val = str(row[0].value or '').strip().upper() if 'MESSAGE A' in a_val: message_starts.append((idx, 'Message A')) elif 'MESSAGE B' in a_val: message_starts.append((idx, 'Message B')) if not message_starts: return None # Extract country codes from message header rows # Countries are in columns B onwards in the same row as the message header first_msg_row = all_rows[message_starts[0][0]] for cell in first_msg_row[1:]: # Skip column A val = str(cell.value or '').strip() if val and val not in ('', 'None'): countries.append(val) if not countries: return None # Parse each message section for i, (start_idx, msg_name) in enumerate(message_starts): # Determine end of this section (next message start or end of sheet) if i + 1 < len(message_starts): end_idx = message_starts[i + 1][0] else: end_idx = len(all_rows) msg_data = _parse_message_section(all_rows, start_idx, end_idx, countries) if msg_data: messages[msg_name] = msg_data return {'messages': messages, 'countries': countries} def _parse_message_section(all_rows, start_idx, end_idx, countries) -> Dict: """Parse a single message section (Message A or Message B).""" msg_data = {} last_field_type = None # Track what the previous row was for idx in range(start_idx + 1, end_idx): if idx >= len(all_rows): break row = all_rows[idx] label = str(row[0].value or '').strip() label_lower = label.lower() if not label: # Handle rows with empty column A — these may be unlabeled dates rows # that follow tagline rows. Check if prior row was a tagline and this row has data. if last_field_type and last_field_type.startswith('tagline'): # Check if this row has values in the country columns has_values = any( row[col_idx + 1].value is not None for col_idx in range(min(len(countries), len(row) - 1)) ) if has_values: field_name = 'dates' else: continue else: continue else: # Skip sub-section headers (e.g. "Print Day 4") — stop parsing here # to avoid overwriting main message data with sub-section data if 'print day' in label_lower or ('print' in label_lower and 'day' in label_lower): break # Determine the field type from the label field_name = _classify_row_label(label_lower) if not field_name: continue last_field_type = field_name # Extract values per country for col_idx, country in enumerate(countries): cell_idx = col_idx + 1 # +1 because column A is the label if cell_idx < len(row): val = row[cell_idx].value if val is not None: val = str(val).strip() # Clean up non-breaking spaces val = val.replace('\xa0', ' ').strip() if val and val != '-': if country not in msg_data: msg_data[country] = {} msg_data[country][field_name] = val return msg_data def _classify_row_label(label: str) -> Optional[str]: """Classify a row label into a standardized field name.""" label_lower = label.lower() if 'tagline' in label_lower or 'headline' in label_lower: if '2 line' in label_lower: return 'tagline_2_lines' elif '3 line' in label_lower: return 'tagline_3_lines' elif '4 line' in label_lower: return 'tagline_4_lines' elif '5 line' in label_lower: return 'tagline_5_lines' else: return 'tagline' # Check urgency BEFORE dates — "urgency messaging (to replace dates)" contains "date" elif 'urgency' in label_lower: return 'urgency_messaging' elif label_lower.startswith('date') or label_lower == 'dates': return 'dates' elif 'logo' in label_lower: return 'logo' elif 'cta' in label_lower: return 'cta' elif 'legal' in label_lower or 'disclaimer' in label_lower: return 'legal_disclaimer' # Skip section headers like "Print Day 4 (...)" elif 'print day' in label_lower or 'print only' in label_lower: return None elif 'event' in label_lower: return None # Skip event name row else: return None def build_localization_context(parsed_matrix: Dict, message_type: str, country: str) -> str: """ Build a context string for QC prompt injection from the localization matrix. Args: parsed_matrix: Parsed localization matrix data (from parse_localization_matrix) message_type: "Message A" or "Message B" (from media plan creative_name) country: Country code e.g. "IE", "DE", "IT" (from media plan) Returns: Context string to inject into QC prompts, or empty string if no match. """ if not parsed_matrix or not message_type or not country: return "" messages = parsed_matrix.get('messages', {}) # Normalize message type — handle variations like "message a", "Message A", "MSG A" msg_key = None msg_type_upper = message_type.strip().upper() for key in messages: if key.upper() == msg_type_upper: msg_key = key break # Also try partial match: "Message A" in "Live1_Message A" etc. if not msg_key: for key in messages: if key.upper() in msg_type_upper or msg_type_upper in key.upper(): msg_key = key break if not msg_key: return "" msg_data = messages[msg_key] # Find country data — try exact match, then case-insensitive country_data = None country_upper = country.strip().upper() for c_key, c_data in msg_data.items(): if c_key.upper() == country_upper: country_data = c_data break if not country_data: return "" # Build the context string lines = [ "\n=== LOCALIZATION MATRIX — EXPECTED COPY ===", f"Message Type: {msg_key}", f"Country/Market: {country}", "", "The following is the EXPECTED copy for this asset based on the localization matrix.", "Verify the asset contains the correct text matching one of these variants:", "", ] # Add tagline variants tagline_fields = [ ('tagline_2_lines', 'Expected Headline (2-line layout)'), ('tagline_3_lines', 'Expected Headline (3-line layout)'), ('tagline_4_lines', 'Expected Headline (4-line layout)'), ('tagline_5_lines', 'Expected Headline (5-line layout)'), ('tagline', 'Expected Headline'), ] has_tagline = False for field, label in tagline_fields: val = country_data.get(field) if val: # Show line breaks clearly display_val = val.replace('\n', ' / ') lines.append(f"{label}: {display_val}") has_tagline = True if has_tagline: lines.append("") # Add dates dates_val = country_data.get('dates') if dates_val: lines.append(f"Expected Dates: {dates_val}") # Add logo logo_val = country_data.get('logo') if logo_val: lines.append(f"Expected Logo: {logo_val}") # Add CTA cta_val = country_data.get('cta') if cta_val: lines.append(f"Expected CTA: {cta_val}") # Add legal legal_val = country_data.get('legal_disclaimer') if legal_val: lines.append(f"Expected Legal Disclaimer: {legal_val}") # Add urgency messaging if present urgency_val = country_data.get('urgency_messaging') if urgency_val: lines.append(f"Urgency Messaging (alternative to dates): {urgency_val}") lines.append("") lines.append("IMPORTANT: Cross-check the text visible in the asset against the expected copy above.") lines.append("The headline should match one of the tagline variants for this market and message type.") lines.append("Flag any text that doesn't match — wrong language, wrong message version, missing elements, or incorrect copy.") lines.append("=== END LOCALIZATION MATRIX ===\n") return "\n".join(lines) def detect_message_type(creative_name: str) -> Optional[str]: """ Detect if a creative_name from the media plan maps to a message type. Returns "Message A", "Message B", etc., or None if no match. """ if not creative_name: return None name_upper = creative_name.strip().upper() if 'MESSAGE A' in name_upper: return 'Message A' elif 'MESSAGE B' in name_upper: return 'Message B' elif 'MESSAGE C' in name_upper: return 'Message C' else: return None