Fix OCR element grouping: use line-level instead of block-level detection

Tesseract was grouping the date and logo text into a single block (e.g.
"8-11 luglio amazon prime day"), which inflated the date's char_height and
caused false typography failures. Words are now grouped by
(block_num, line_num), so each text line becomes a separate element and the
date, logo, and legal text can be identified as distinct elements.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
nickviljoen 2026-04-02 13:26:11 +02:00
parent 20ed52d2a2
commit 1f6782b1ef

View file

@@ -96,7 +96,8 @@ def _tesseract_detect(image_path, img_width, img_height):
# Get word-level bounding boxes from Tesseract
data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
# Group words into blocks by block_num
# Group words by block + line for finer element separation
# (Tesseract often groups date + logo into one block)
block_groups = {}
for i in range(len(data['text'])):
text = data['text'][i].strip()
@@ -104,9 +105,10 @@ def _tesseract_detect(image_path, img_width, img_height):
if not text or conf < 30: # Skip low confidence detections
continue
block_num = data['block_num'][i]
if block_num not in block_groups:
block_groups[block_num] = {
# Use block_num + line_num as key for finer grouping
group_key = (data['block_num'][i], data['line_num'][i])
if group_key not in block_groups:
block_groups[group_key] = {
'words': [],
'lefts': [],
'tops': [],
@@ -115,7 +117,7 @@ def _tesseract_detect(image_path, img_width, img_height):
'word_heights': [],
}
bg = block_groups[block_num]
bg = block_groups[group_key]
bg['words'].append(text)
left = data['left'][i]
top = data['top'][i]
@@ -128,7 +130,7 @@ def _tesseract_detect(image_path, img_width, img_height):
bg['word_heights'].append(h)
blocks = []
for block_num, bg in block_groups.items():
for group_key, bg in block_groups.items():
if not bg['words']:
continue