diff --git a/backend/ocr_measurement.py b/backend/ocr_measurement.py index 6fddf45..3782815 100644 --- a/backend/ocr_measurement.py +++ b/backend/ocr_measurement.py @@ -96,7 +96,8 @@ def _tesseract_detect(image_path, img_width, img_height): # Get word-level bounding boxes from Tesseract data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT) - # Group words into blocks by block_num + # Group words by block + line for finer element separation + # (Tesseract often groups date + logo into one block) block_groups = {} for i in range(len(data['text'])): text = data['text'][i].strip() @@ -104,9 +105,10 @@ def _tesseract_detect(image_path, img_width, img_height): if not text or conf < 30: # Skip low confidence detections continue - block_num = data['block_num'][i] - if block_num not in block_groups: - block_groups[block_num] = { + # Use block_num + line_num as key for finer grouping + group_key = (data['block_num'][i], data['line_num'][i]) + if group_key not in block_groups: + block_groups[group_key] = { 'words': [], 'lefts': [], 'tops': [], @@ -115,7 +117,7 @@ def _tesseract_detect(image_path, img_width, img_height): 'word_heights': [], } - bg = block_groups[block_num] + bg = block_groups[group_key] bg['words'].append(text) left = data['left'][i] top = data['top'][i] @@ -128,7 +130,7 @@ def _tesseract_detect(image_path, img_width, img_height): bg['word_heights'].append(h) blocks = [] - for block_num, bg in block_groups.items(): + for group_key, bg in block_groups.items(): if not bg['words']: continue