From a79529470e2ffa42cd87851f8d28ac06c3d2c3c1 Mon Sep 17 00:00:00 2001 From: DJP Date: Sun, 12 Apr 2026 16:15:00 -0400 Subject: [PATCH] Fix header detection: distinguish header text from data values MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem: Header detection picked data rows (with Yes/No/numbers) as headers because they had more filled cells than the actual header row (which had merged cells with gaps). Result: data values became column labels, deep extraction failed. Fix: - Header values must be text-like (not numbers, Yes/No, 0/1, ü, x, -) - Only consecutive header rows count - stop scanning at first data row - Multi-row headers combined (row 1 + row 2 both contribute) - Tested against Wella Job Routes 2: correctly identifies row 2 as header with "Buckets | Categories | Top 10 deliverables | Tier A | Tier B | Tier C" Co-Authored-By: Claude Opus 4.6 (1M context) --- backend/app/services/doc_parser.py | 51 ++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 16 deletions(-) diff --git a/backend/app/services/doc_parser.py b/backend/app/services/doc_parser.py index 503bb9e..0c280d6 100644 --- a/backend/app/services/doc_parser.py +++ b/backend/app/services/doc_parser.py @@ -164,30 +164,49 @@ def _extract_excel_text(content: bytes) -> str: parts.append(f"\n=== Sheet: {sheet_name} ({ws.max_row} rows × {ws.max_column} cols) ===") - # Detect header row (row with most non-empty cells in first 5 rows) - header_row_idx = 1 - max_cells = 0 - for r in range(1, min(6, ws.max_row + 1)): - count = sum(1 for c in range(1, ws.max_column + 1) if ws.cell(row=r, column=c).value is not None) - if count > max_cells: - max_cells = count - header_row_idx = r - - # Read header names + # Build combined headers from first few rows (many spreadsheets have multi-row headers) + # Strategy: scan rows 1-5, combine all text-like values as column labels + # A "header" cell contains text (not just numbers, Yes/No, 0/1) headers = {} - for c in range(1, ws.max_column + 1): - val = ws.cell(row=header_row_idx, column=c).value - if val: - headers[c] = str(val).strip()[:50] + data_start_row = 1 + + def _is_header_value(val): + if val is None: + return False + s = str(val).strip() + if not s: + return False + # Pure numbers, Yes/No, single chars like ü are likely data, not headers + if s.replace('.', '').replace(',', '').isdigit(): + return False + if s.lower() in ('yes', 'no', 'true', 'false', 'x', 'ü', "ü'", '0', '1', '-'): + return False + return True + + found_data = False + for r in range(1, min(6, ws.max_row + 1)): + row_vals = [(c, ws.cell(row=r, column=c).value) for c in range(1, ws.max_column + 1)] + header_cells = sum(1 for _, v in row_vals if _is_header_value(v)) + data_cells = sum(1 for _, v in row_vals if v is not None and not _is_header_value(v)) + + # If this row has more header-like text than data values, it's a header row + # But stop looking once we've hit a data row + if not found_data and header_cells >= 2 and header_cells >= data_cells: + for c, v in row_vals: + if _is_header_value(v) and c not in headers: + headers[c] = str(v).strip()[:50] + data_start_row = r + 1 + elif data_cells > 0: + found_data = True if headers: - parts.append(f"[Headers from row {header_row_idx}]: {' | '.join(headers.values())}") + parts.append(f"[Column headers]: {' | '.join(headers.values())}") # Track last non-empty value per column for merged cell carry-forward last_vals = {} # Extract data rows with labelled fields - for row_idx in range(header_row_idx + 1, min(ws.max_row + 1, header_row_idx + 200)): + for row_idx in range(data_start_row, min(ws.max_row + 1, data_start_row + 200)): fields = [] has_data = False for c in range(1, ws.max_column + 1):