Fix header detection: distinguish header text from data values

Problem: Header detection picked data rows (with Yes/No/numbers) as headers
because they had more filled cells than the actual header row (which had
merged cells with gaps). Result: data values became column labels, deep
extraction failed.

Fix:
- Header values must be text-like (not numbers, Yes/No, 0/1, ü, x, -)
- Only consecutive header rows count - stop scanning at first data row
- Multi-row headers combined (row 1 + row 2 both contribute)
- Tested against Wella Job Routes 2: correctly identifies row 2 as header
  with "Buckets | Categories | Top 10 deliverables | Tier A | Tier B | Tier C"

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
DJP 2026-04-12 16:15:00 -04:00
parent d85ef96a06
commit a79529470e

View file

@ -164,30 +164,49 @@ def _extract_excel_text(content: bytes) -> str:
parts.append(f"\n=== Sheet: {sheet_name} ({ws.max_row} rows × {ws.max_column} cols) ===")
# Detect header row (row with most non-empty cells in first 5 rows)
header_row_idx = 1
max_cells = 0
for r in range(1, min(6, ws.max_row + 1)):
count = sum(1 for c in range(1, ws.max_column + 1) if ws.cell(row=r, column=c).value is not None)
if count > max_cells:
max_cells = count
header_row_idx = r
# Read header names
# Build combined headers from first few rows (many spreadsheets have multi-row headers)
# Strategy: scan rows 1-5, combine all text-like values as column labels
# A "header" cell contains text (not just numbers, Yes/No, 0/1)
headers = {}
for c in range(1, ws.max_column + 1):
val = ws.cell(row=header_row_idx, column=c).value
if val:
headers[c] = str(val).strip()[:50]
data_start_row = 1
def _is_header_value(val):
if val is None:
return False
s = str(val).strip()
if not s:
return False
# Pure numbers, Yes/No, single chars like ü are likely data, not headers
if s.replace('.', '').replace(',', '').isdigit():
return False
if s.lower() in ('yes', 'no', 'true', 'false', 'x', 'ü', "ü'", '0', '1', '-'):
return False
return True
found_data = False
for r in range(1, min(6, ws.max_row + 1)):
row_vals = [(c, ws.cell(row=r, column=c).value) for c in range(1, ws.max_column + 1)]
header_cells = sum(1 for _, v in row_vals if _is_header_value(v))
data_cells = sum(1 for _, v in row_vals if v is not None and not _is_header_value(v))
# If this row has more header-like text than data values, it's a header row
# But stop looking once we've hit a data row
if not found_data and header_cells >= 2 and header_cells >= data_cells:
for c, v in row_vals:
if _is_header_value(v) and c not in headers:
headers[c] = str(v).strip()[:50]
data_start_row = r + 1
elif data_cells > 0:
found_data = True
if headers:
parts.append(f"[Headers from row {header_row_idx}]: {' | '.join(headers.values())}")
parts.append(f"[Column headers]: {' | '.join(headers.values())}")
# Track last non-empty value per column for merged cell carry-forward
last_vals = {}
# Extract data rows with labelled fields
for row_idx in range(header_row_idx + 1, min(ws.max_row + 1, header_row_idx + 200)):
for row_idx in range(data_start_row, min(ws.max_row + 1, data_start_row + 200)):
fields = []
has_data = False
for c in range(1, ws.max_column + 1):