"""Layout analysis service using LayoutParser for slide structure detection. Analyzes slide screenshots to detect regions (text, image, table, title) and provides structural metadata for LLM-based code generation. """ import os from typing import List, Optional # LayoutParser is optional — graceful fallback if not installed _LAYOUTPARSER_AVAILABLE = False try: import layoutparser as lp _LAYOUTPARSER_AVAILABLE = True except ImportError: pass class DetectedRegion: """A detected region on a slide.""" __slots__ = ("type", "x1", "y1", "x2", "y2", "score") def __init__(self, type: str, x1: float, y1: float, x2: float, y2: float, score: float = 1.0): self.type = type self.x1 = x1 self.y1 = y1 self.x2 = x2 self.y2 = y2 self.score = score def to_dict(self) -> dict: return { "type": self.type, "x1": round(self.x1), "y1": round(self.y1), "x2": round(self.x2), "y2": round(self.y2), "score": round(self.score, 3), } def analyze_slide_layout(image_path: str) -> List[DetectedRegion]: """Analyze a slide screenshot and return detected layout regions. Uses LayoutParser with a PubLayNet model if available. Falls back to empty list if LayoutParser is not installed. """ if not _LAYOUTPARSER_AVAILABLE: return [] if not os.path.exists(image_path): return [] try: import cv2 image = cv2.imread(image_path) if image is None: return [] # Use PubLayNet model — detects: Text, Title, List, Table, Figure model = lp.Detectron2LayoutModel( config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config", label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}, extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5], ) layout = model.detect(image) regions = [] for block in layout: regions.append(DetectedRegion( type=block.type, x1=block.block.x_1, y1=block.block.y_1, x2=block.block.x_2, y2=block.block.y_2, score=block.score, )) return regions except Exception as e: print(f"[LayoutAnalysis] Detection failed: {e}") return [] def regions_to_description(regions: List[DetectedRegion], image_width: int = 960, image_height: int = 540) -> str: """Convert detected regions to a text description for LLM context. Normalizes coordinates to percentages for resolution-independent descriptions. """ if not regions: return "" lines = ["Detected layout regions (coordinates as % of slide dimensions):"] for r in sorted(regions, key=lambda r: (r.y1, r.x1)): x_pct = round(r.x1 / image_width * 100) y_pct = round(r.y1 / image_height * 100) w_pct = round((r.x2 - r.x1) / image_width * 100) h_pct = round((r.y2 - r.y1) / image_height * 100) lines.append( f"- {r.type}: position ({x_pct}%, {y_pct}%), size ({w_pct}% x {h_pct}%), confidence: {r.score:.0%}" ) return "\n".join(lines) def classify_layout_from_regions(regions: List[DetectedRegion]) -> Optional[str]: """Classify slide layout type based on detected regions. Returns a layout type string or None if classification is uncertain. """ if not regions: return None type_counts = {} for r in regions: type_counts[r.type] = type_counts.get(r.type, 0) + 1 has_title = type_counts.get("Title", 0) > 0 has_text = type_counts.get("Text", 0) > 0 has_figure = type_counts.get("Figure", 0) > 0 has_table = type_counts.get("Table", 0) > 0 has_list = type_counts.get("List", 0) > 0 text_count = type_counts.get("Text", 0) # Classification heuristics if has_title and not has_text and not has_figure and not has_table: return "title_slide" if has_title and has_figure and not has_text: return "picture" if has_table: return "table" if text_count >= 2 or (has_text and has_list): return "two_column" if has_title and (has_text or has_list): return "content" if has_figure and (has_text or has_title): return "picture_with_caption" if not any([has_title, has_text, has_figure, has_table, has_list]): return "blank" return "content"