- Fix SSE stream 500: use async_session_maker inside StreamingResponse generator (Depends session closes when endpoint returns, before streaming starts) - Fix template application: store template_name in prepare endpoint so worker uses the selected custom template instead of defaulting to "general" - Fix OverlayLoader: replace loading.gif with HamsterLoader component - Fix parse_mode default: change from "slides" to "layouts" to avoid 70+ layouts - Update Gemini Flash model to gemini-3.1-flash-image-preview - Improve DOCX parsing: python-docx for structured table extraction, OCR enabled - Add vision-based image text extraction via Gemini for uploaded images - Add LayoutParser integration for slide layout structure analysis - Add Phase 4 MVP features: transfer ownership, URL input, follow-up questions, attachment-to-slide mapping, content router Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
142 lines
4.4 KiB
Python
142 lines
4.4 KiB
Python
"""Layout analysis service using LayoutParser for slide structure detection.
|
|
|
|
Analyzes slide screenshots to detect regions (text, image, table, title)
|
|
and provides structural metadata for LLM-based code generation.
|
|
"""
|
|
import os
|
|
from typing import List, Optional
|
|
|
|
# LayoutParser is optional — graceful fallback if not installed
|
|
_LAYOUTPARSER_AVAILABLE = False
|
|
try:
|
|
import layoutparser as lp
|
|
_LAYOUTPARSER_AVAILABLE = True
|
|
except ImportError:
|
|
pass
|
|
|
|
|
|
class DetectedRegion:
    """A detected region on a slide.

    Holds the region's label, its bounding box corners and the detector's
    confidence. ``__slots__`` is used, so instances carry no per-instance
    ``__dict__`` and accept no attributes beyond the six listed.
    """

    # type   -> region label (e.g. "Text", "Title", "List", "Table", "Figure")
    # x1, y1 -> first bounding-box corner (presumably top-left, in image pixels)
    # x2, y2 -> opposite bounding-box corner
    # score  -> detection confidence
    __slots__ = ("type", "x1", "y1", "x2", "y2", "score")

    # NOTE: the ``type`` parameter shadows the builtin inside ``__init__`` only;
    # it is kept because callers pass it by keyword.
    def __init__(self, type: str, x1: float, y1: float, x2: float, y2: float, score: float = 1.0):
        self.type = type
        self.x1 = x1
        self.y1 = y1
        self.x2 = x2
        self.y2 = y2
        self.score = score

    def __repr__(self) -> str:
        """Return a constructor-like representation for logs and debugging."""
        fields = ", ".join(f"{name}={getattr(self, name)!r}" for name in self.__slots__)
        return f"{self.__class__.__name__}({fields})"

    def to_dict(self) -> dict:
        """Return a plain-dict view of the region.

        Coordinates are rounded to the nearest integer and the score to three
        decimal places.
        """
        return {
            "type": self.type,
            "x1": round(self.x1),
            "y1": round(self.y1),
            "x2": round(self.x2),
            "y2": round(self.y2),
            "score": round(self.score, 3),
        }
|
|
|
|
|
|
# Shared detector instance, built lazily by _get_layout_model(). Building the
# model is loop-invariant work, so it is done once and reused across calls.
_LAYOUT_MODEL = None


def _get_layout_model():
    """Return the shared PubLayNet detector, constructing it on first use."""
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        # Use PubLayNet model — detects: Text, Title, List, Table, Figure
        _LAYOUT_MODEL = lp.Detectron2LayoutModel(
            config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
            # Drop detections scoring below 0.5.
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
        )
    return _LAYOUT_MODEL


def analyze_slide_layout(image_path: str) -> List[DetectedRegion]:
    """Analyze a slide screenshot and return detected layout regions.

    Uses LayoutParser with a PubLayNet model if available. The model is
    created on the first successful call and reused afterwards.

    Args:
        image_path: Path to the slide screenshot on disk.

    Returns:
        One ``DetectedRegion`` per detected block. An empty list is returned
        when LayoutParser is not installed, the path does not exist, the image
        cannot be read, or detection raises for any reason (best-effort: the
        error is printed, never propagated).
    """
    if not _LAYOUTPARSER_AVAILABLE:
        return []

    if not os.path.exists(image_path):
        return []

    try:
        # Imported lazily so a missing OpenCV degrades to the empty-list
        # fallback below instead of breaking module import.
        import cv2

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            return []

        # NOTE(review): cv2 loads images in BGR order; LayoutParser's tutorial
        # converts to RGB before detect(). Confirm which order the model expects.
        layout = _get_layout_model().detect(image)

        return [
            DetectedRegion(
                type=block.type,
                x1=block.block.x_1,
                y1=block.block.y_1,
                x2=block.block.x_2,
                y2=block.block.y_2,
                score=block.score,
            )
            for block in layout
        ]

    except Exception as e:
        # Deliberate best-effort: layout hints are optional, so any failure
        # (including model construction) falls back to "no regions".
        print(f"[LayoutAnalysis] Detection failed: {e}")
        return []
|
|
|
|
|
|
def regions_to_description(regions: List[DetectedRegion], image_width: int = 960, image_height: int = 540) -> str:
    """Render detected regions as a bullet list suitable for LLM context.

    Regions are listed ordered by ``y1`` and then ``x1``. Each position and
    size is expressed as a whole-number percentage of ``image_width`` /
    ``image_height`` so the description does not depend on resolution.

    Returns an empty string when ``regions`` is empty.
    """
    if not regions:
        return ""

    def _percent(length, total):
        # Share of the given slide dimension, rounded to a whole percent.
        return round(length / total * 100)

    def _reading_order(region):
        return (region.y1, region.x1)

    def _describe(region):
        left = _percent(region.x1, image_width)
        top = _percent(region.y1, image_height)
        width = _percent(region.x2 - region.x1, image_width)
        height = _percent(region.y2 - region.y1, image_height)
        return f"- {region.type}: position ({left}%, {top}%), size ({width}% x {height}%), confidence: {region.score:.0%}"

    description = ["Detected layout regions (coordinates as % of slide dimensions):"]
    description.extend(_describe(region) for region in sorted(regions, key=_reading_order))
    return "\n".join(description)
|
|
|
|
|
|
def classify_layout_from_regions(regions: List[DetectedRegion]) -> Optional[str]:
    """Classify slide layout type based on detected regions.

    Only the ``type`` of each region is inspected. Rules are evaluated in
    order and the first match wins:

    1. Title only (no Text, List, Figure or Table)  -> ``"title_slide"``
    2. Title and Figure, no Text                    -> ``"picture"``
    3. Any Table                                    -> ``"table"``
    4. Two or more Text blocks, or Text with a List -> ``"two_column"``
    5. Title with Text or a List                    -> ``"content"``
    6. Figure with Text or a Title                  -> ``"picture_with_caption"``
    7. None of the five known types present         -> ``"blank"``
    8. Anything else                                -> ``"content"``

    Returns:
        A layout type string, or ``None`` when ``regions`` is empty.
    """
    if not regions:
        return None

    type_counts = {}
    for r in regions:
        type_counts[r.type] = type_counts.get(r.type, 0) + 1

    # A key exists in type_counts only once at least one region has that type.
    has_title = "Title" in type_counts
    has_text = "Text" in type_counts
    has_figure = "Figure" in type_counts
    has_table = "Table" in type_counts
    has_list = "List" in type_counts
    text_count = type_counts.get("Text", 0)

    # Classification heuristics
    # A title slide must not carry body content. A List counts as body content
    # too: without the has_list check a Title + List slide was reported as
    # "title_slide" and the List arm of the "content" rule was unreachable.
    if has_title and not (has_text or has_list or has_figure or has_table):
        return "title_slide"
    if has_title and has_figure and not has_text:
        return "picture"
    if has_table:
        return "table"
    if text_count >= 2 or (has_text and has_list):
        return "two_column"
    if has_title and (has_text or has_list):
        return "content"
    if has_figure and (has_text or has_title):
        return "picture_with_caption"
    if not (has_title or has_text or has_figure or has_table or has_list):
        # Regions were detected, but none carries a recognised label.
        return "blank"

    return "content"
|