ppt-tool/backend/services/layout_analysis_service.py
Vadym Samoilenko e8295d6e71 Phase 4: Fix critical bugs, improve document parsing, add vision OCR
- Fix SSE stream 500: use async_session_maker inside StreamingResponse generator
  (Depends session closes when endpoint returns, before streaming starts)
- Fix template application: store template_name in prepare endpoint so worker
  uses the selected custom template instead of defaulting to "general"
- Fix OverlayLoader: replace loading.gif with HamsterLoader component
- Fix parse_mode default: change from "slides" to "layouts" to avoid 70+ layouts
- Update Gemini Flash model to gemini-3.1-flash-image-preview
- Improve DOCX parsing: python-docx for structured table extraction, OCR enabled
- Add vision-based image text extraction via Gemini for uploaded images
- Add LayoutParser integration for slide layout structure analysis
- Add Phase 4 MVP features: transfer ownership, URL input, follow-up questions,
  attachment-to-slide mapping, content router

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-27 14:07:00 +00:00

142 lines
4.4 KiB
Python

"""Layout analysis service using LayoutParser for slide structure detection.
Analyzes slide screenshots to detect regions (text, image, table, title)
and provides structural metadata for LLM-based code generation.
"""
import os
from typing import List, Optional
# LayoutParser is optional — graceful fallback if not installed.
# ``lp`` is always bound (to None when the import fails) so that the name
# exists regardless; callers should still test _LAYOUTPARSER_AVAILABLE first.
try:
    import layoutparser as lp
except ImportError:
    lp = None
    _LAYOUTPARSER_AVAILABLE = False
else:
    _LAYOUTPARSER_AVAILABLE = True
class DetectedRegion:
    """A detected region on a slide.

    Stores a region label, the bounding-box coordinates ``x1, y1, x2, y2``
    and a detection score. ``__slots__`` keeps instances small, so no other
    attributes can be attached.
    """

    __slots__ = ("type", "x1", "y1", "x2", "y2", "score")

    def __init__(self, type: str, x1: float, y1: float, x2: float, y2: float, score: float = 1.0):
        # ``type`` shadows the builtin, but it is part of the public
        # constructor signature and attribute name, so it is kept as-is.
        self.type = type
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        self.score = score

    def __repr__(self) -> str:
        """Return a constructor-like representation for debugging."""
        return (
            f"{self.__class__.__name__}(type={self.type!r}, "
            f"x1={self.x1!r}, y1={self.y1!r}, x2={self.x2!r}, y2={self.y2!r}, "
            f"score={self.score!r})"
        )

    def to_dict(self) -> dict:
        """Return the region as a plain dict.

        Coordinates are rounded to whole numbers and the score to three
        decimals. Values are coerced through ``float`` first so the result
        contains only built-in Python numbers, whatever numeric type the
        region was constructed with.
        """
        return {
            "type": self.type,
            "x1": round(float(self.x1)),
            "y1": round(float(self.y1)),
            "x2": round(float(self.x2)),
            "y2": round(float(self.y2)),
            "score": round(float(self.score), 3),
        }
# Detector shared by all analyze_slide_layout() calls; built on first use so
# the model is not reconstructed for every slide.
_layout_model = None


def _get_layout_model():
    """Return the shared PubLayNet detector, constructing it on first use."""
    global _layout_model
    if _layout_model is None:
        # Use PubLayNet model — detects: Text, Title, List, Table, Figure
        _layout_model = lp.Detectron2LayoutModel(
            config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
        )
    return _layout_model


def analyze_slide_layout(image_path: str) -> List["DetectedRegion"]:
    """Analyze a slide screenshot and return detected layout regions.

    Uses LayoutParser with a PubLayNet model if available.

    Args:
        image_path: Path of the screenshot file to analyze.

    Returns:
        One ``DetectedRegion`` per detected block. An empty list is returned
        when LayoutParser is not installed, the path does not exist, the
        image cannot be read, or detection fails for any reason (the failure
        is printed, not raised).
    """
    if not _LAYOUTPARSER_AVAILABLE:
        return []
    if not os.path.exists(image_path):
        return []
    try:
        # Imported lazily so a missing OpenCV install is handled by the
        # best-effort ``except`` below instead of breaking module import.
        import cv2

        image = cv2.imread(image_path)
        if image is None:
            return []
        # NOTE(review): cv2.imread yields BGR channel order; confirm this is
        # the order the detector expects before changing it.
        layout = _get_layout_model().detect(image)
        return [
            DetectedRegion(
                type=block.type,
                x1=block.block.x_1,
                y1=block.block.y_1,
                x2=block.block.x_2,
                y2=block.block.y_2,
                score=block.score,
            )
            for block in layout
        ]
    except Exception as e:
        # Deliberate best-effort: layout analysis is optional context, so any
        # failure degrades to "no regions" rather than propagating.
        print(f"[LayoutAnalysis] Detection failed: {e}")
        return []
def regions_to_description(regions: List["DetectedRegion"], image_width: int = 960, image_height: int = 540) -> str:
    """Convert detected regions to a text description for LLM context.

    Coordinates are normalized to whole percentages of the given image size,
    so the description is resolution-independent. Regions are listed from
    top to bottom, then left to right (sorted by ``y1``, then ``x1``).

    Args:
        regions: Regions to describe; each must expose ``type``, ``x1``,
            ``y1``, ``x2``, ``y2`` and ``score``.
        image_width: Width used to normalize horizontal values.
        image_height: Height used to normalize vertical values.

    Returns:
        A header line followed by one ``- ...`` line per region, joined with
        newlines, or ``""`` when ``regions`` is empty.

    Raises:
        ValueError: If ``regions`` is non-empty and either dimension is not
            positive (percentages would be undefined or meaningless).
    """
    if not regions:
        return ""
    if image_width <= 0 or image_height <= 0:
        raise ValueError(
            f"image_width and image_height must be positive, got {image_width}x{image_height}"
        )

    lines = ["Detected layout regions (coordinates as % of slide dimensions):"]
    for region in sorted(regions, key=lambda reg: (reg.y1, reg.x1)):
        x_pct = round(region.x1 / image_width * 100)
        y_pct = round(region.y1 / image_height * 100)
        w_pct = round((region.x2 - region.x1) / image_width * 100)
        h_pct = round((region.y2 - region.y1) / image_height * 100)
        lines.append(
            f"- {region.type}: position ({x_pct}%, {y_pct}%), size ({w_pct}% x {h_pct}%), confidence: {region.score:.0%}"
        )
    return "\n".join(lines)
def classify_layout_from_regions(regions: List["DetectedRegion"]) -> Optional[str]:
    """Classify slide layout type based on detected regions.

    Only the ``type`` of each region is used. Rules are evaluated in order
    and the first match wins:

    1. Title with no Text, List, Figure or Table -> ``"title_slide"``
    2. Title and Figure without Text             -> ``"picture"``
    3. Any Table                                 -> ``"table"``
    4. Two or more Text, or Text plus List       -> ``"two_column"``
    5. Title with Text or List                   -> ``"content"``
    6. Figure with Text or Title                 -> ``"picture_with_caption"``
    7. None of the five known types present      -> ``"blank"``
    8. Anything else                             -> ``"content"``

    Args:
        regions: Detected regions; each must expose a ``type`` attribute.

    Returns:
        A layout type string, or None when ``regions`` is empty.
    """
    if not regions:
        return None

    labels = [region.type for region in regions]
    present = set(labels)

    has_title = "Title" in present
    has_text = "Text" in present
    has_figure = "Figure" in present
    has_table = "Table" in present
    has_list = "List" in present
    text_count = labels.count("Text")

    # Classification heuristics
    # A bullet list counts as body content: a Title + List slide must fall
    # through to the "content" rule instead of being a title slide.
    if has_title and not (has_text or has_list or has_figure or has_table):
        return "title_slide"
    if has_title and has_figure and not has_text:
        return "picture"
    if has_table:
        return "table"
    if text_count >= 2 or (has_text and has_list):
        return "two_column"
    if has_title and (has_text or has_list):
        return "content"
    if has_figure and (has_text or has_title):
        return "picture_with_caption"
    if not (has_title or has_text or has_figure or has_table or has_list):
        # Only unrecognized region types were detected.
        return "blank"
    return "content"