ppt-tool/backend/services/slide_mapping_engine.py
Vadym Samoilenko a2bd4cfefa Phase 3: Content Pipeline — file parsing, content intelligence, slide mapping, native charts
- Step 10: Extended file upload for Excel/CSV/images/URLs (openpyxl, trafilatura)
- Step 11: Content intelligence service with rule-based + LLM classification
- Step 12: Slide mapping engine mapping content blocks to master deck layouts
- Step 13: Chart data extractor, native PPTX chart service (bar/line/pie/gantt/waterfall), ChartDataEditor skeleton

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-26 15:54:04 +00:00

305 lines
12 KiB
Python

"""Slide Mapping Engine: map classified content blocks to master deck layouts."""
import logging
from typing import Dict, List, Optional

from pydantic import BaseModel

from models.content_models import ClassifiedContent, ContentBlock, ContentBlockType
from models.llm_message import LLMSystemMessage, LLMUserMessage
from services.llm_client import LLMClient
from utils.llm_provider import get_model
class SlideMapping(BaseModel):
    """Plan entry for a single slide: the chosen layout plus the content placed on it."""

    # Positions of the content blocks that go on this slide.
    content_block_indices: List[int]

    # Identifier and display name of the layout selected for the slide.
    layout_id: str
    layout_name: str

    # Kind of slide, e.g. "title_slide", "section_header", or a content block type value.
    slide_type: str

    # Short text describing what the slide will show.
    content_summary: str

    # Defaults to an empty list when not supplied.
    attachment_ids: List[str] = []
# Preferred layout types for each content block type, most preferred first.
# The strings are compared against MasterDeckModel.layouts[].layout_type.
_BLOCK_TO_LAYOUT_TYPE: Dict[ContentBlockType, List[str]] = {
    ContentBlockType.metric: ["metrics", "kpi", "data", "chart", "content"],
    ContentBlockType.quote: ["quote", "testimonial", "content"],
    ContentBlockType.table: ["table", "chart", "data", "content"],
    ContentBlockType.timeline: ["timeline", "process", "content"],
    ContentBlockType.comparison: ["comparison", "two_column", "content"],
    ContentBlockType.list_items: ["content", "bullet", "list"],
    ContentBlockType.narrative: ["content", "text", "description"],
    ContentBlockType.image_reference: ["picture", "image", "content"],
    ContentBlockType.call_to_action: ["content", "title_slide"],
}
class SlideMappingEngine:
    """Rule-based planner that assigns classified content blocks to deck layouts.

    ``map`` is the entry point; the remaining methods are private helpers.
    """

    async def map(
        self,
        classified_content: ClassifiedContent,
        layouts: List[dict],
        n_slides: int,
        instructions: Optional[str] = None,
    ) -> List[SlideMapping]:
        """Map classified content blocks to master deck layouts.

        The plan always opens with a title slide. When there are more than
        five blocks, an agenda slide follows. Content blocks then fill the
        remaining slots (blocks are merged when there are more blocks than
        slots), and placeholder slides pad the plan up to ``n_slides``.

        Args:
            classified_content: Output from ContentIntelligenceService.classify()
            layouts: MasterDeckModel.layouts list — each dict has layout_name,
                layout_type, index, etc.
            n_slides: Target number of slides
            instructions: Optional user instructions; when given, an LLM pass
                may reassign layouts.

        Returns:
            Ordered list of SlideMapping. The title slide is always kept, so
            the list has one entry even when ``n_slides`` is below 1.
        """
        if not layouts:
            return self._fallback_mapping(classified_content, n_slides)

        # Build layout index by type for fast lookup
        layout_by_type: Dict[str, List[dict]] = {}
        for layout in layouts:
            lt = (layout.get("layout_type") or "custom").lower()
            layout_by_type.setdefault(lt, []).append(layout)

        blocks = classified_content.blocks

        # 1. Always start with a title slide
        mappings: List[SlideMapping] = []
        title_layout = self._find_layout(layout_by_type, ["title_slide", "title"], layouts)
        mappings.append(
            SlideMapping(
                content_block_indices=[],
                layout_id=str(title_layout.get("index", 0)),
                layout_name=title_layout.get("layout_name", "Title"),
                slide_type="title_slide",
                content_summary=classified_content.title or "Presentation Title",
            )
        )

        # 2. If many blocks, add agenda/section header
        if len(blocks) > 5:
            section_layout = self._find_layout(
                layout_by_type, ["section_header", "section", "content"], layouts
            )
            # De-duplicate while keeping first-seen order; a set would make the
            # agenda order (and which six sections survive) vary between runs.
            sections = list(
                dict.fromkeys(b.source_section for b in blocks if b.source_section)
            )
            mappings.append(
                SlideMapping(
                    content_block_indices=[],
                    layout_id=str(section_layout.get("index", 0)),
                    layout_name=section_layout.get("layout_name", "Agenda"),
                    slide_type="section_header",
                    content_summary="Agenda: " + ", ".join(sections[:6]),
                )
            )

        # 3. Map each content block to a layout
        remaining_slots = n_slides - len(mappings)
        mappings.extend(
            self._assign_blocks_to_slides(blocks, layout_by_type, layouts, remaining_slots)
        )

        # 4. If we have more slides than content, add transitional slides
        missing = n_slides - len(mappings)
        if missing > 0:
            # The layout choice does not change between iterations; resolve it once.
            filler_layout = self._find_layout(layout_by_type, ["content", "blank"], layouts)
            mappings.extend(
                SlideMapping(
                    content_block_indices=[],
                    layout_id=str(filler_layout.get("index", 0)),
                    layout_name=filler_layout.get("layout_name", "Content"),
                    slide_type="content",
                    content_summary="Additional content",
                )
                for _ in range(missing)
            )

        # 5. Trim if over target. Step 3 never exceeds the remaining slots, so
        # this only triggers when n_slides is smaller than the title/agenda
        # prefix; the title slide is always kept.
        if len(mappings) > n_slides:
            mappings = mappings[: max(n_slides, 1)]

        # 6. Optional LLM refinement for ambiguous mappings
        if instructions:
            mappings = await self._llm_refine(
                mappings, classified_content, layouts, instructions, n_slides
            )
        return mappings

    def _find_layout(
        self,
        layout_by_type: Dict[str, List[dict]],
        preferred_types: List[str],
        all_layouts: List[dict],
    ) -> dict:
        """Find best matching layout by type preference, fallback to first layout.

        Args:
            layout_by_type: Layouts grouped by lower-cased layout type.
            preferred_types: Layout types to try, most preferred first.
            all_layouts: Every available layout, used when no type matches.

        Returns:
            The first layout of the first preferred type that has any layouts,
            else the first entry of ``all_layouts``, else a built-in default dict.
        """
        for lt in preferred_types:
            candidates = layout_by_type.get(lt)
            if candidates:
                return candidates[0]
        if all_layouts:
            return all_layouts[0]
        return {"index": 0, "layout_name": "Default", "layout_type": "content"}

    def _slide_for_block(
        self,
        idx: int,
        block: ContentBlock,
        layout_by_type: Dict[str, List[dict]],
        all_layouts: List[dict],
    ) -> SlideMapping:
        """Build the slide that carries a single content block."""
        preferred = _BLOCK_TO_LAYOUT_TYPE.get(block.type, ["content"])
        layout = self._find_layout(layout_by_type, preferred, all_layouts)
        return SlideMapping(
            content_block_indices=[idx],
            layout_id=str(layout.get("index", 0)),
            layout_name=layout.get("layout_name", "Content"),
            slide_type=block.type.value,
            content_summary=block.raw_text[:120],
        )

    def _assign_blocks_to_slides(
        self,
        blocks: List[ContentBlock],
        layout_by_type: Dict[str, List[dict]],
        all_layouts: List[dict],
        max_slides: int,
    ) -> List[SlideMapping]:
        """Assign content blocks to slides, respecting max_slides constraint.

        When the blocks fit, each gets its own slide in original order.
        Otherwise the ``max_slides`` highest-priority blocks get a slide each
        and the remaining blocks are spread round-robin over those slides.

        Args:
            blocks: Content blocks to place.
            layout_by_type: Layouts grouped by lower-cased layout type.
            all_layouts: Every available layout (fallback pool).
            max_slides: Maximum number of slides to produce.

        Returns:
            At most ``max_slides`` mappings ordered by the smallest block index
            each one carries; empty when ``max_slides`` is not positive.
        """
        if max_slides <= 0:
            return []

        indexed = list(enumerate(blocks))
        if len(indexed) <= max_slides:
            # One block per slide
            primary, overflow = indexed, []
        else:
            # More blocks than slides — rank by priority (stable, descending);
            # the top max_slides blocks each anchor a slide.
            ranked = sorted(indexed, key=lambda pair: -pair[1].priority)
            primary, overflow = ranked[:max_slides], ranked[max_slides:]

        mappings = [
            self._slide_for_block(idx, block, layout_by_type, all_layouts)
            for idx, block in primary
        ]

        # Distribute overflow blocks round-robin across the anchored slides
        # (slides are still in priority order at this point).
        for i, (idx, _block) in enumerate(overflow):
            mappings[i % len(mappings)].content_block_indices.append(idx)

        # Re-sort mappings by original block order; every mapping built here
        # carries at least one block index.
        mappings.sort(key=lambda m: min(m.content_block_indices))
        return mappings

    async def _llm_refine(
        self,
        mappings: List[SlideMapping],
        content: ClassifiedContent,
        layouts: List[dict],
        instructions: str,
        n_slides: int,
    ) -> List[SlideMapping]:
        """Use LLM to refine layout assignments based on user instructions.

        Best-effort: if the LLM call or its response handling fails, the
        failure is logged and the mappings applied so far are returned.

        Args:
            mappings: Current slide plan; entries are updated in place.
            content: Classified content (not used by the current prompt).
            layouts: Available layouts offered to the LLM.
            instructions: User instructions forwarded to the LLM.
            n_slides: Target slide count (not used by the current prompt).

        Returns:
            The same ``mappings`` list, with any accepted layout changes applied.
        """
        client = LLMClient()
        model = get_model()

        # Key layouts by the identifier that is shown to the LLM (the layout's
        # own "index" field, falling back to list position when absent), so the
        # value it returns is resolved against the same identifier instead of
        # being treated as a position in ``layouts``.
        layout_by_id: Dict[str, dict] = {}
        for position, layout in enumerate(layouts):
            declared = layout.get("index")
            key = str(position if declared is None else declared)
            layout_by_id.setdefault(key, layout)

        layout_info = "\n".join(
            f"- Index {key}: {layout.get('layout_name')} (type: {layout.get('layout_type')})"
            for key, layout in layout_by_id.items()
        )
        # Slides are listed with the zero-based number that ``slide_index``
        # is later applied with, to avoid an off-by-one in the LLM's answer.
        current_mapping = "\n".join(
            f"Slide {i}: [{m.slide_type}] {m.content_summary[:80]} → layout '{m.layout_name}'"
            for i, m in enumerate(mappings)
        )
        messages = [
            LLMSystemMessage(
                content="You refine slide-to-layout mappings for presentations. "
                "Given the current mapping and user instructions, suggest layout changes. "
                "Return a JSON with 'changes' array of {slide_index: int, new_layout_index: int} objects. "
                "slide_index is the zero-based slide number shown in the current mapping; "
                "new_layout_index is the Index value of a layout from the available layouts. "
                "Only include slides that need changing. Return empty array if no changes needed."
            ),
            LLMUserMessage(
                content=f"Available layouts:\n{layout_info}\n\n"
                f"Current mapping:\n{current_mapping}\n\n"
                f"User instructions: {instructions}"
            ),
        ]
        schema = {
            "type": "object",
            "properties": {
                "changes": {
                    "type": "array",
                    "items": {
                        "type": "object",
                        "properties": {
                            "slide_index": {"type": "integer"},
                            "new_layout_index": {"type": "integer"},
                        },
                        "required": ["slide_index", "new_layout_index"],
                    },
                }
            },
            "required": ["changes"],
        }
        try:
            result = await client.generate_structured(
                model=model, messages=messages, response_format=schema
            )
            for change in result.get("changes", []):
                si = change.get("slide_index", -1)
                layout_key = str(change.get("new_layout_index"))
                new_layout = layout_by_id.get(layout_key)
                # Ignore changes that point at an unknown slide or layout.
                if new_layout is None or not isinstance(si, int):
                    continue
                if not 0 <= si < len(mappings):
                    continue
                mappings[si].layout_id = layout_key
                mappings[si].layout_name = new_layout.get("layout_name", "Content")
        except Exception:
            # Keep original mapping on LLM failure, but leave a trace of it.
            logging.getLogger(__name__).warning(
                "LLM layout refinement failed; keeping current mapping",
                exc_info=True,
            )
        return mappings

    def _fallback_mapping(
        self, content: ClassifiedContent, n_slides: int
    ) -> List[SlideMapping]:
        """Fallback when no master deck layouts are available.

        Args:
            content: Classified content supplying the title and blocks.
            n_slides: Target number of slides.

        Returns:
            A title slide followed by one slide per block, for at most
            ``n_slides - 1`` blocks; all slides use layout id "0".
        """
        mappings = [
            SlideMapping(
                content_block_indices=[],
                layout_id="0",
                layout_name="Title",
                slide_type="title_slide",
                content_summary=content.title or "Presentation",
            )
        ]
        # Clamp at zero: a negative slice stop would select almost every block
        # when n_slides < 1 instead of none.
        content_slots = max(n_slides - 1, 0)
        mappings.extend(
            SlideMapping(
                content_block_indices=[i],
                layout_id="0",
                layout_name="Content",
                slide_type=block.type.value,
                content_summary=block.raw_text[:120],
            )
            for i, block in enumerate(content.blocks[:content_slots])
        )
        return mappings