psycopg2 performs %-substitution when params are passed to execute(), so 'M%' in the LIKE clause was being interpreted as a positional placeholder, raising IndexError when there's only one real %s (LIMIT). Escape as 'M%%' so it's preserved as a literal percent. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
141 lines
4.9 KiB
Python
141 lines
4.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Diagnostic: Inspect what metadata B1 global masters actually carry in
|
|
master_assets.full_metadata, so we can tell why the CX backfill found 0.
|
|
|
|
Two checks:
|
|
1. Top-level keys of full_metadata (does the structure even contain
|
|
metadata.metadata_element_list?).
|
|
2. Across a larger sample, count occurrences of any element_id that
|
|
looks CX/score/quality-related (case-insensitive) — surfaces the
|
|
actual element IDs used by client B1 masters, in case they differ
|
|
from the A1 IDs the extractor expects.
|
|
|
|
Read-only. Safe to run any time.
|
|
|
|
Usage:
|
|
python scripts/diagnose_b1_master_metadata.py
|
|
python scripts/diagnose_b1_master_metadata.py --sample 200
|
|
"""
|
|
|
|
import sys
|
|
import os
|
|
import json
|
|
import argparse
|
|
import logging
|
|
from collections import Counter
|
|
|
|
# Put the parent of this script's directory at the front of the import path.
# NOTE(review): presumably this is the repository root, so that the `shared`
# package imported below resolves when the script is run directly — confirm.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
|
|
from shared.config_loader import load_config
|
|
from shared.database import Database
|
|
|
|
# Root logging setup: INFO level, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Named logger used for every line this diagnostic prints.
logger = logging.getLogger('B1MetaDiag')

# Lowercase substrings; an element ID whose lowercased form contains any of
# these is reported as CX/score/quality-flavored.
CX_HINTS = (
    'creativex',
    'cx',
    'score',
    'quality',
)
|
|
|
|
|
|
def _as_metadata_dict(raw):
    """Return a ``full_metadata`` column value as a dict.

    Dicts are returned unchanged. JSON text (``str``/``bytes``/``bytearray``)
    is decoded. Anything else -- ``None``, text that is not valid JSON, or
    JSON whose top level is not an object -- yields ``{}`` so callers can
    treat the row as having empty metadata instead of crashing.
    """
    if isinstance(raw, dict):
        return raw
    if isinstance(raw, (str, bytes, bytearray)):
        try:
            decoded = json.loads(raw)
        except ValueError:
            # JSONDecodeError and UnicodeDecodeError both derive from ValueError.
            return {}
        return decoded if isinstance(decoded, dict) else {}
    return {}


def _element_list(full):
    """Return ``full['metadata']['metadata_element_list']`` if it is a list, else ``[]``."""
    meta = full.get('metadata')
    if not isinstance(meta, dict):
        return []
    elements = meta.get('metadata_element_list')
    return elements if isinstance(elements, list) else []


def _fetch_b1_masters(db, sample):
    """Fetch up to ``sample`` of the most recently created active global masters.

    Rows are ``(tracking_id, original_filename, full_metadata)`` for
    ``master_assets`` records whose tracking_id starts with ``M`` and whose
    ``local_campaign_id`` is NULL. The connection is always handed back to
    ``db``, and the cursor is closed only if it was actually opened.
    """
    conn = db.get_connection()
    cursor = None
    try:
        cursor = conn.cursor()
        # 'M%%' is a literal 'M%': because parameters are passed to execute(),
        # a bare '%' in the query text would be read as a placeholder.
        cursor.execute("""
            SELECT tracking_id, original_filename, full_metadata
            FROM master_assets
            WHERE tracking_id LIKE 'M%%'
              AND local_campaign_id IS NULL
              AND status = 'active'
            ORDER BY created_at DESC
            LIMIT %s
        """, (sample,))
        return cursor.fetchall()
    finally:
        # Guard against conn.cursor() itself failing, which would otherwise
        # turn into a NameError here and hide the real exception.
        if cursor is not None:
            cursor.close()
        db.put_connection(conn)


def _summarize_top_level(blobs):
    """Count top-level keys across ``blobs``.

    Returns ``(top_key_counter, has_meta_list, empty_full_meta)`` where
    ``top_key_counter`` maps each top-level key to the number of blobs
    containing it, ``has_meta_list`` is the number of blobs carrying a
    ``metadata.metadata_element_list`` list, and ``empty_full_meta`` is the
    number of empty blobs.
    """
    top_key_counter = Counter()
    has_meta_list = 0
    empty_full_meta = 0
    for full in blobs:
        if not full:
            empty_full_meta += 1
            continue
        # list(full) yields the keys; passing the dict itself to update()
        # would add its *values* as counts.
        top_key_counter.update(list(full))
        meta = full.get('metadata')
        if isinstance(meta, dict) and isinstance(meta.get('metadata_element_list'), list):
            has_meta_list += 1
    return top_key_counter, has_meta_list, empty_full_meta


def _count_element_ids(blobs):
    """Tally element IDs found in each blob's metadata element list.

    Returns ``(id_counter, cx_id_counter, rows_with_cx_hint)``: occurrences
    of every non-empty element ID, occurrences of IDs whose lowercased form
    contains any ``CX_HINTS`` substring, and the number of blobs that had at
    least one such ID. Elements that are not dicts, or have no ``id``, are
    skipped; non-string IDs are converted with ``str()``.
    """
    id_counter = Counter()
    cx_id_counter = Counter()
    rows_with_cx_hint = 0
    for full in blobs:
        row_had_hint = False
        for element in _element_list(full):
            if not isinstance(element, dict):
                continue
            raw_id = element.get('id')
            if raw_id is None:
                continue
            eid = str(raw_id).strip()
            if not eid:
                continue
            id_counter[eid] += 1
            lowered = eid.lower()
            if any(hint in lowered for hint in CX_HINTS):
                cx_id_counter[eid] += 1
                row_had_hint = True
        if row_had_hint:
            rows_with_cx_hint += 1
    return id_counter, cx_id_counter, rows_with_cx_hint


def main():
    """Sample B1 global masters and log what their ``full_metadata`` contains.

    Command-line options:
        --sample N     number of rows to scan (default 100, must be >= 0)
        --show-full N  number of metadata blobs to dump in full (default 2)

    Logs, in order: the top-level keys present in the sampled blobs, the
    element IDs that look CX/score/quality-related, and the first few blobs
    verbatim (each truncated to 4096 characters). Issues a single SELECT and
    performs no writes.

    Exits with status 1 if the database connection test fails, and with an
    argparse usage error if ``--sample`` is negative.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--sample', type=int, default=100,
                        help='How many B1 masters to scan for element-ID counts (default 100)')
    parser.add_argument('--show-full', type=int, default=2,
                        help='How many sample full_metadata blobs to dump in full (default 2)')
    args = parser.parse_args()
    if args.sample < 0:
        # A negative LIMIT would only fail later, inside the database.
        parser.error('--sample must be >= 0')

    config = load_config('config/config.yaml')
    db = Database(config)
    if not db.test_connection():
        logger.error('Database connection test failed')
        sys.exit(1)

    # All rows are materialized by the fetch, so the database can be closed
    # immediately afterwards -- and is closed even if the query fails.
    try:
        rows = _fetch_b1_masters(db, args.sample)
    finally:
        db.close()

    # Normalize every blob once so all three sections see the same dicts.
    blobs = [_as_metadata_dict(row[2]) for row in rows]
    logger.info('Sampled %d B1 global masters', len(rows))

    # 1. Top-level structure check
    top_key_counter, has_meta_list, empty_full_meta = _summarize_top_level(blobs)
    logger.info('=' * 60)
    logger.info('Top-level keys present in full_metadata (count of rows containing the key):')
    for key, count in top_key_counter.most_common():
        logger.info(' %-30s %d', key, count)
    logger.info('Rows with empty full_metadata: %d', empty_full_meta)
    logger.info('Rows with metadata.metadata_element_list: %d', has_meta_list)
    logger.info('=' * 60)

    # 2. Hunt for CX-flavored element IDs
    id_counter, cx_id_counter, rows_with_cx_hint = _count_element_ids(blobs)
    logger.info('Distinct element_ids seen across sample: %d', len(id_counter))
    logger.info('Rows containing at least one CX-flavored element_id: %d / %d',
                rows_with_cx_hint, len(rows))
    logger.info('-' * 60)
    if cx_id_counter:
        logger.info('CX/score/quality-flavored element_ids found:')
        for eid, count in cx_id_counter.most_common():
            logger.info(' %-50s %d', eid, count)
    else:
        logger.info('NO CX/score/quality-flavored element_ids found in the sample.')
        logger.info('Likely: client B1 masters were uploaded before CX scoring ran on them.')
    logger.info('=' * 60)

    # 3. Dump first few full blobs verbatim for manual inspection
    if args.show_full > 0:
        shown = list(zip(rows, blobs))[:args.show_full]
        # Report how many blobs are actually dumped, which may be fewer than requested.
        logger.info('First %d full_metadata blobs (truncated to 4KB each):', len(shown))
        for row, full in shown:
            blob = json.dumps(full, indent=2, default=str)
            if len(blob) > 4096:
                blob = blob[:4096] + '\n... [truncated]'
            logger.info('--- %s (%s) ---\n%s', row[0], row[1], blob)
|
|
|
|
|
|
# Script entry point: run the diagnostic only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
|