Fix header detection: distinguish header text from data values
Problem: Header detection picked data rows (with Yes/No/numbers) as headers because they had more filled cells than the actual header row (which had merged cells with gaps). Result: data values became column labels, and deep extraction failed.

Fix:
- Header values must be text-like (not numbers, Yes/No, 0/1, ü, x, -)
- Only consecutive header rows count: scanning stops at the first data row
- Multi-row headers are combined (row 1 and row 2 both contribute)
- Tested against Wella Job Routes 2: correctly identifies row 2 as the header with "Buckets | Categories | Top 10 deliverables | Tier A | Tier B | Tier C"

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d85ef96a06
commit
a79529470e
1 changed file with 35 additions and 16 deletions
|
|
@@ -164,30 +164,49 @@ def _extract_excel_text(content: bytes) -> str:
|
|||
|
||||
parts.append(f"\n=== Sheet: {sheet_name} ({ws.max_row} rows × {ws.max_column} cols) ===")
|
||||
|
||||
# Detect header row (row with most non-empty cells in first 5 rows)
|
||||
header_row_idx = 1
|
||||
max_cells = 0
|
||||
for r in range(1, min(6, ws.max_row + 1)):
|
||||
count = sum(1 for c in range(1, ws.max_column + 1) if ws.cell(row=r, column=c).value is not None)
|
||||
if count > max_cells:
|
||||
max_cells = count
|
||||
header_row_idx = r
|
||||
|
||||
# Read header names
|
||||
# Build combined headers from first few rows (many spreadsheets have multi-row headers)
|
||||
# Strategy: scan rows 1-5, combine all text-like values as column labels
|
||||
# A "header" cell contains text (not just numbers, Yes/No, 0/1)
|
||||
headers = {}
|
||||
for c in range(1, ws.max_column + 1):
|
||||
val = ws.cell(row=header_row_idx, column=c).value
|
||||
if val:
|
||||
headers[c] = str(val).strip()[:50]
|
||||
data_start_row = 1
|
||||
|
||||
def _is_header_value(val):
|
||||
if val is None:
|
||||
return False
|
||||
s = str(val).strip()
|
||||
if not s:
|
||||
return False
|
||||
# Pure numbers, Yes/No, single chars like ü are likely data, not headers
|
||||
if s.replace('.', '').replace(',', '').isdigit():
|
||||
return False
|
||||
if s.lower() in ('yes', 'no', 'true', 'false', 'x', 'ü', "ü'", '0', '1', '-'):
|
||||
return False
|
||||
return True
|
||||
|
||||
found_data = False
|
||||
for r in range(1, min(6, ws.max_row + 1)):
|
||||
row_vals = [(c, ws.cell(row=r, column=c).value) for c in range(1, ws.max_column + 1)]
|
||||
header_cells = sum(1 for _, v in row_vals if _is_header_value(v))
|
||||
data_cells = sum(1 for _, v in row_vals if v is not None and not _is_header_value(v))
|
||||
|
||||
# If this row has more header-like text than data values, it's a header row
|
||||
# But stop looking once we've hit a data row
|
||||
if not found_data and header_cells >= 2 and header_cells >= data_cells:
|
||||
for c, v in row_vals:
|
||||
if _is_header_value(v) and c not in headers:
|
||||
headers[c] = str(v).strip()[:50]
|
||||
data_start_row = r + 1
|
||||
elif data_cells > 0:
|
||||
found_data = True
|
||||
|
||||
if headers:
|
||||
parts.append(f"[Headers from row {header_row_idx}]: {' | '.join(headers.values())}")
|
||||
parts.append(f"[Column headers]: {' | '.join(headers.values())}")
|
||||
|
||||
# Track last non-empty value per column for merged cell carry-forward
|
||||
last_vals = {}
|
||||
|
||||
# Extract data rows with labelled fields
|
||||
for row_idx in range(header_row_idx + 1, min(ws.max_row + 1, header_row_idx + 200)):
|
||||
for row_idx in range(data_start_row, min(ws.max_row + 1, data_start_row + 200)):
|
||||
fields = []
|
||||
has_data = False
|
||||
for c in range(1, ws.max_column + 1):
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue