# ppt-tool/backend/services/layout_analysis_service.py

"""Layout analysis service using LayoutParser for slide structure detection.

Analyzes slide screenshots to detect regions (text, image, table, title)
and provides structural metadata for LLM-based code generation.
"""
import os
from typing import List, Optional

# LayoutParser is optional — graceful fallback if not installed.
# `lp` is only bound when the import succeeds, so code that uses `lp` must
# check _LAYOUTPARSER_AVAILABLE first (analyze_slide_layout does this before
# touching the model).
_LAYOUTPARSER_AVAILABLE = False
try:
    import layoutparser as lp
    _LAYOUTPARSER_AVAILABLE = True
except ImportError:
    # Leave the flag False; callers fall back to "no regions detected".
    pass


class DetectedRegion:
    """One labelled bounding box found on a slide image.

    Stores the region label (``type``), the box corners ``(x1, y1)`` and
    ``(x2, y2)``, and a detection ``score`` that defaults to 1.0 when the
    caller does not supply one.
    """
    __slots__ = ("type", "x1", "y1", "x2", "y2", "score")

    def __init__(self, type: str, x1: float, y1: float, x2: float, y2: float, score: float = 1.0):
        self.type, self.score = type, score
        self.x1, self.y1 = x1, y1
        self.x2, self.y2 = x2, y2

    def to_dict(self) -> dict:
        """Return a plain-dict view of the region.

        Coordinates are rounded to whole numbers and the score to three
        decimal places. Keys appear in the order
        ``type, x1, y1, x2, y2, score``.
        """
        payload = {"type": self.type}
        for corner in ("x1", "y1", "x2", "y2"):
            payload[corner] = round(getattr(self, corner))
        payload["score"] = round(self.score, 3)
        return payload


# Lazily-built detector shared by every analyze_slide_layout() call, so the
# model is constructed once per process instead of once per slide.
_LAYOUT_MODEL = None


def _get_layout_model():
    """Return the shared PubLayNet detector, constructing it on first use.

    Construction errors propagate to the caller and leave the cache empty,
    so a later call will try again.
    """
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        # Use PubLayNet model — detects: Text, Title, List, Table, Figure
        _LAYOUT_MODEL = lp.Detectron2LayoutModel(
            config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
        )
    return _LAYOUT_MODEL


def analyze_slide_layout(image_path: str) -> List[DetectedRegion]:
    """Analyze a slide screenshot and return detected layout regions.

    Uses LayoutParser with a PubLayNet model if available. The model is
    created on the first successful call and reused afterwards.

    Args:
        image_path: Filesystem path of the slide screenshot.

    Returns:
        One DetectedRegion per detected block. An empty list is returned
        when LayoutParser is not installed, the path does not exist, the
        image cannot be read, or detection raises (the error is printed).
    """
    if not _LAYOUTPARSER_AVAILABLE:
        return []

    if not os.path.exists(image_path):
        return []

    try:
        # Imported here so a missing OpenCV install is handled by the
        # except clause below instead of breaking module import.
        import cv2

        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable/unsupported file with None.
            return []

        layout = _get_layout_model().detect(image)

        return [
            DetectedRegion(
                type=block.type,
                x1=block.block.x_1,
                y1=block.block.y_1,
                x2=block.block.x_2,
                y2=block.block.y_2,
                score=block.score,
            )
            for block in layout
        ]

    except Exception as e:
        # Layout analysis is best-effort: report the failure and let the
        # caller continue without region metadata.
        print(f"[LayoutAnalysis] Detection failed: {e}")
        return []


def regions_to_description(regions: List[DetectedRegion], image_width: int = 960, image_height: int = 540) -> str:
    """Render detected regions as a bullet list for use as LLM context.

    Each region's position and size are expressed as whole-number
    percentages of ``image_width`` / ``image_height`` so the text does not
    depend on the screenshot resolution. Regions are listed top-to-bottom,
    then left-to-right. An empty ``regions`` yields an empty string.
    """
    if not regions:
        return ""

    header = "Detected layout regions (coordinates as % of slide dimensions):"

    def percent_of(length, extent):
        # Share of the slide dimension, rounded to a whole percent.
        return round(length / extent * 100)

    def describe(region):
        left = percent_of(region.x1, image_width)
        top = percent_of(region.y1, image_height)
        width = percent_of(region.x2 - region.x1, image_width)
        height = percent_of(region.y2 - region.y1, image_height)
        return (
            f"- {region.type}: position ({left}%, {top}%), "
            f"size ({width}% x {height}%), confidence: {region.score:.0%}"
        )

    reading_order = sorted(regions, key=lambda region: (region.y1, region.x1))
    return "\n".join([header, *map(describe, reading_order)])


def classify_layout_from_regions(regions: List[DetectedRegion]) -> Optional[str]:
    """Classify slide layout type based on detected regions.

    Only the region types present (and how many "Text" regions there are)
    are considered; positions and scores are not used. Rules are evaluated
    in order and the first match wins.

    Args:
        regions: Detected regions; each must expose a ``type`` attribute.

    Returns:
        ``None`` when ``regions`` is empty. Otherwise one of
        ``"title_slide"``, ``"picture"``, ``"table"``, ``"two_column"``,
        ``"content"``, ``"picture_with_caption"`` or ``"blank"``; slides
        matching no specific rule fall back to ``"content"``.
    """
    if not regions:
        return None

    type_counts = {}
    for r in regions:
        type_counts[r.type] = type_counts.get(r.type, 0) + 1

    text_count = type_counts.get("Text", 0)
    has_title = "Title" in type_counts
    has_text = text_count > 0
    has_figure = "Figure" in type_counts
    has_table = "Table" in type_counts
    has_list = "List" in type_counts

    # Classification heuristics
    # A title slide has nothing but titles. Lists must be excluded here,
    # otherwise a Title + List slide never reaches the "content" rule below.
    if has_title and not (has_text or has_figure or has_table or has_list):
        return "title_slide"
    if has_title and has_figure and not has_text:
        return "picture"
    if has_table:
        return "table"
    if text_count >= 2 or (has_text and has_list):
        return "two_column"
    if has_title and (has_text or has_list):
        return "content"
    if has_figure and (has_text or has_title):
        return "picture_with_caption"
    # Regions were detected, but none carries a label these rules know about.
    if not any([has_title, has_text, has_figure, has_table, has_list]):
        return "blank"

    return "content"