Fix OCR element grouping: use line-level instead of block-level detection
Tesseract was grouping the date and logo text into a single block (e.g. "8-11 luglio amazon prime day"), which inflated the date's char_height and caused false typography failures. Words are now grouped by (block_num, line_num), so each text line becomes a separate element and the date, logo, and legal text can be identified as distinct elements.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
20ed52d2a2
commit
1f6782b1ef
1 changed file with 8 additions and 6 deletions
|
|
@@ -96,7 +96,8 @@ def _tesseract_detect(image_path, img_width, img_height):
|
|||
# Get word-level bounding boxes from Tesseract
|
||||
data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
|
||||
|
||||
# Group words into blocks by block_num
|
||||
# Group words by block + line for finer element separation
|
||||
# (Tesseract often groups date + logo into one block)
|
||||
block_groups = {}
|
||||
for i in range(len(data['text'])):
|
||||
text = data['text'][i].strip()
|
||||
|
|
@@ -104,9 +105,10 @@ def _tesseract_detect(image_path, img_width, img_height):
|
|||
if not text or conf < 30: # Skip low confidence detections
|
||||
continue
|
||||
|
||||
block_num = data['block_num'][i]
|
||||
if block_num not in block_groups:
|
||||
block_groups[block_num] = {
|
||||
# Use block_num + line_num as key for finer grouping
|
||||
group_key = (data['block_num'][i], data['line_num'][i])
|
||||
if group_key not in block_groups:
|
||||
block_groups[group_key] = {
|
||||
'words': [],
|
||||
'lefts': [],
|
||||
'tops': [],
|
||||
|
|
@@ -115,7 +117,7 @@ def _tesseract_detect(image_path, img_width, img_height):
|
|||
'word_heights': [],
|
||||
}
|
||||
|
||||
bg = block_groups[block_num]
|
||||
bg = block_groups[group_key]
|
||||
bg['words'].append(text)
|
||||
left = data['left'][i]
|
||||
top = data['top'][i]
|
||||
|
|
@@ -128,7 +130,7 @@ def _tesseract_detect(image_path, img_width, img_height):
|
|||
bg['word_heights'].append(h)
|
||||
|
||||
blocks = []
|
||||
for block_num, bg in block_groups.items():
|
||||
for group_key, bg in block_groups.items():
|
||||
if not bg['words']:
|
||||
continue
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue