Fix OCR element grouping: use line-level instead of block-level detection

Tesseract was grouping date + logo text into one block (e.g. "8-11 luglio amazon prime day"), inflating the date char_height and causing false typography failures. Words are now grouped by (block_num, line_num) so that each text line becomes a separate element, enabling correct identification of the date, logo, and legal text as distinct elements.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-02 13:26:11 +02:00 · 2026-04-02 13:26:11 +02:00 · 1f6782b1ef
commit 1f6782b1ef
parent 20ed52d2a2
1 changed files with 8 additions and 6 deletions
--- a/backend/ocr_measurement.py
+++ b/backend/ocr_measurement.py
@@ -96,7 +96,8 @@ def _tesseract_detect(image_path, img_width, img_height):
        # Get word-level bounding boxes from Tesseract
        data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)

-        # Group words into blocks by block_num
+        # Group words by block + line for finer element separation
+        # (Tesseract often groups date + logo into one block)
        block_groups = {}
        for i in range(len(data['text'])):
            text = data['text'][i].strip()
@@ -104,9 +105,10 @@ def _tesseract_detect(image_path, img_width, img_height):
            if not text or conf < 30:  # Skip low confidence detections
                continue

-            block_num = data['block_num'][i]
-            if block_num not in block_groups:
-                block_groups[block_num] = {
+            # Use block_num + line_num as key for finer grouping
+            group_key = (data['block_num'][i], data['line_num'][i])
+            if group_key not in block_groups:
+                block_groups[group_key] = {
                    'words': [],
                    'lefts': [],
                    'tops': [],
@@ -115,7 +117,7 @@ def _tesseract_detect(image_path, img_width, img_height):
                    'word_heights': [],
                }

-            bg = block_groups[block_num]
+            bg = block_groups[group_key]
            bg['words'].append(text)
            left = data['left'][i]
            top = data['top'][i]
@@ -128,7 +130,7 @@ def _tesseract_detect(image_path, img_width, img_height):
            bg['word_heights'].append(h)

        blocks = []
-        for block_num, bg in block_groups.items():
+        for group_key, bg in block_groups.items():
            if not bg['words']:
                continue