Phase 1 (Foundation): - Project restructure (presenton-main → backend/ + frontend/) - Database schema (8 new models, Alembic config, seed script) - Auth (Azure AD SSO + dev bypass, JWT sessions, AuthMiddleware) - RBAC (access_service, rbac_middleware, admin routers) - Audit logging (fire-and-forget, AuditMiddleware, admin router) - i18n (react-i18next with 5 namespace files) Phase 2 (Admin Panel & Client Management): - Admin panel shell (sidebar layout, role guard, 12 pages) - Redux admin slice with 18 async thunks - User management (role changes, deactivation) - Client management (CRUD, brand config, team management) - Brand config editor (colors, fonts, logos, voice rules) - Master deck upload & parser (PPTX → HTML → React pipeline) - Audit log viewer with filters and CSV/JSON export Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
317 lines
11 KiB
Python
317 lines
11 KiB
Python
"""Master deck parser: extract layouts from a PPTX master/template file.
|
|
|
|
Pipeline per layout:
|
|
1. Unzip PPTX → extract slide layout XMLs and slide master XML
|
|
2. Convert to PDF via LibreOffice → screenshots per layout
|
|
3. Extract theme colors, fonts from OXML
|
|
4. For each slide layout: generate HTML → generate React code via LLM
|
|
5. Auto-classify layout type via LLM
|
|
6. Persist results to MasterDeckModel
|
|
"""
|
|
import asyncio
|
|
import base64
|
|
import os
|
|
import shutil
|
|
import tempfile
|
|
import traceback
|
|
import uuid
|
|
import zipfile
|
|
import xml.etree.ElementTree as ET
|
|
from typing import List, Optional
|
|
|
|
from services.database import async_session_maker
|
|
|
|
# Reuse existing extraction utilities
|
|
from api.v1.ppt.endpoints.pptx_slides import (
|
|
_extract_slide_xmls,
|
|
_convert_pptx_to_pdf,
|
|
extract_fonts_from_oxml,
|
|
normalize_font_family_name,
|
|
)
|
|
from api.v1.ppt.endpoints.slide_to_html import (
|
|
generate_html_from_slide,
|
|
generate_react_component_from_html,
|
|
)
|
|
from services.documents_loader import DocumentsLoader
|
|
|
|
# OXML namespaces: prefix → namespace URI map handed to ElementTree
# find() calls so lookups can be written with the short "a:" / "p:" prefixes.
NS = {
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "p": "http://schemas.openxmlformats.org/presentationml/2006/main",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
}
|
|
|
|
# Substring → layout type table consulted by _guess_layout_type().
# NOTE: insertion order is significant. The lookup returns the first hint
# found inside the lower-cased layout name, so a name such as
# "Title and Content" matches "title" before it ever reaches "content".
LAYOUT_TYPE_HINTS = {
    "title": "title_slide",
    "section": "section_header",
    "two content": "two_column",
    "comparison": "comparison",
    "content": "content",
    "blank": "blank",
    "picture": "picture",
    "caption": "caption",
}
|
|
|
|
|
|
def _extract_slide_layout_xmls(pptx_path: str, temp_dir: str) -> List[dict]:
    """Extract slide layout XMLs from ppt/slideLayouts/ and return metadata.

    The PPTX archive is unzipped into ``<temp_dir>/pptx_extract`` unless that
    directory already exists, in which case the existing extraction is reused.

    Args:
        pptx_path: Path to the PPTX file.
        temp_dir: Scratch directory that holds the extracted archive.

    Returns:
        One dict per ``*.xml`` file in ``ppt/slideLayouts``, ordered by the
        number embedded in the filename. Each dict has the keys ``filename``,
        ``layout_name`` and ``xml_content``. ``layout_name`` is the ``name``
        attribute of the layout's ``p:cSld`` element when present and
        non-empty, otherwise the filename without its ``.xml`` suffix.
        An empty list is returned when the archive has no layouts directory.
    """
    extract_dir = os.path.join(temp_dir, "pptx_extract")
    if not os.path.exists(extract_dir):
        with zipfile.ZipFile(pptx_path, "r") as zf:
            zf.extractall(extract_dir)

    layouts_dir = os.path.join(extract_dir, "ppt", "slideLayouts")
    if not os.path.isdir(layouts_dir):
        return []

    def _embedded_number(filename: str) -> int:
        # "slideLayout10.xml" -> 10, so layout 10 sorts after layout 2.
        digits = "".join(ch for ch in filename if ch.isdigit())
        return int(digits) if digits else 0

    layout_files = sorted(
        (f for f in os.listdir(layouts_dir) if f.endswith(".xml")),
        key=_embedded_number,
    )

    layouts = []
    for lf in layout_files:
        path = os.path.join(layouts_dir, lf)
        with open(path, "r", encoding="utf-8") as f:
            xml_content = f.read()

        # Fall back to the filename stem; strip only the trailing suffix.
        layout_name = lf[: -len(".xml")]

        # A layout that is not well-formed XML keeps the fallback name.
        # Only parse errors are tolerated so genuine bugs are not hidden.
        try:
            root = ET.fromstring(xml_content)
        except ET.ParseError:
            root = None

        if root is not None:
            c_sld = root.find("p:cSld", NS)
            if c_sld is not None and c_sld.get("name"):
                layout_name = c_sld.get("name")

        layouts.append(
            {
                "filename": lf,
                "layout_name": layout_name,
                "xml_content": xml_content,
            }
        )

    return layouts
|
|
|
|
|
|
def _extract_theme_info(pptx_path: str, temp_dir: str) -> dict:
    """Extract theme colors and font scheme from the PPTX theme XML.

    The PPTX archive is unzipped into ``<temp_dir>/pptx_extract`` unless that
    directory already exists. When ``ppt/theme`` holds several theme files,
    the one with the lowest embedded number (e.g. ``theme1.xml``) is used so
    the result does not depend on directory listing order.

    Args:
        pptx_path: Path to the PPTX file.
        temp_dir: Scratch directory that holds the extracted archive.

    Returns:
        ``{"colors": [...], "fonts": {...}}``. ``colors`` is a list of
        ``{"name": <scheme slot>, "hex": "#RRGGBB"}`` entries taken from the
        ``a:clrScheme`` children (``srgbClr`` value, or the ``lastClr`` of a
        ``sysClr``). ``fonts`` may contain ``heading`` and ``body`` typefaces
        from the major/minor latin fonts. Both are empty when no theme file
        exists or the theme XML cannot be parsed.
    """
    extract_dir = os.path.join(temp_dir, "pptx_extract")
    if not os.path.exists(extract_dir):
        with zipfile.ZipFile(pptx_path, "r") as zf:
            zf.extractall(extract_dir)

    theme_dir = os.path.join(extract_dir, "ppt", "theme")
    if not os.path.isdir(theme_dir):
        return {"colors": [], "fonts": {}}

    # os.listdir() order is arbitrary; sort numerically so theme1.xml wins.
    theme_files = sorted(
        (f for f in os.listdir(theme_dir) if f.endswith(".xml")),
        key=lambda name: (int("".join(ch for ch in name if ch.isdigit()) or "0"), name),
    )
    if not theme_files:
        return {"colors": [], "fonts": {}}

    theme_path = os.path.join(theme_dir, theme_files[0])
    with open(theme_path, "r", encoding="utf-8") as f:
        theme_xml = f.read()

    colors = []
    fonts_info = {}

    # Theme extraction is best-effort: an unparsable theme yields empty
    # results. Only parse errors are tolerated so real bugs still surface.
    try:
        root = ET.fromstring(theme_xml)
    except ET.ParseError:
        return {"colors": colors, "fonts": fonts_info}

    # Extract color scheme
    clr_scheme = root.find(".//a:clrScheme", NS)
    if clr_scheme is not None:
        for child in clr_scheme:
            # Drop the "{namespace}" prefix to get the slot name (dk1, accent1, ...).
            tag = child.tag.split("}")[-1]

            srgb = child.find("a:srgbClr", NS)
            if srgb is not None:
                value = srgb.get("val", "")
            else:
                sys_clr = child.find("a:sysClr", NS)
                value = sys_clr.get("lastClr", "") if sys_clr is not None else ""

            # Skip slots without a concrete value instead of recording "#".
            if value:
                colors.append({"name": tag, "hex": f"#{value}"})

    # Extract font scheme
    major_font = root.find(".//a:majorFont/a:latin", NS)
    minor_font = root.find(".//a:minorFont/a:latin", NS)
    if major_font is not None:
        fonts_info["heading"] = major_font.get("typeface", "")
    if minor_font is not None:
        fonts_info["body"] = minor_font.get("typeface", "")

    return {"colors": colors, "fonts": fonts_info}
|
|
|
|
|
|
def _guess_layout_type(layout_name: str) -> str:
    """Guess a layout type from the layout's display name.

    The name is lower-cased and checked against the substrings in
    ``LAYOUT_TYPE_HINTS`` in the table's own order; the type of the first
    substring found is returned. Names matching no hint yield ``"custom"``.
    """
    lowered = layout_name.lower()
    candidates = (
        kind for needle, kind in LAYOUT_TYPE_HINTS.items() if needle in lowered
    )
    return next(candidates, "custom")
|
|
|
|
|
|
async def parse_master_deck(deck_id: uuid.UUID) -> None:
    """Parse the master deck identified by ``deck_id`` and record the outcome.

    The deck row is first marked ``"processing"``. ``_do_parse`` then runs;
    on success its ``parsed_config``, ``layouts`` and ``thumbnail_path`` are
    stored and the status becomes ``"completed"``. Any ``Exception`` raised
    while parsing or storing the result is printed via ``traceback`` and the
    row is marked ``"failed"`` with ``parsed_config`` set to
    ``{"error": <message>}``. Nothing happens if the deck row cannot be found.
    Each step uses its own short-lived session.
    """
    async with async_session_maker() as db:
        from models.sql.master_deck import MasterDeckModel

        record = await db.get(MasterDeckModel, deck_id)
        if not record:
            return

        record.parse_status = "processing"
        await db.commit()

    try:
        outcome = await _do_parse(deck_id)
        async with async_session_maker() as db:
            record = await db.get(MasterDeckModel, deck_id)
            if record:
                record.parsed_config = outcome["parsed_config"]
                record.layouts = outcome["layouts"]
                record.thumbnail_path = outcome.get("thumbnail_path")
                record.parse_status = "completed"
                await db.commit()

    except Exception as exc:
        traceback.print_exc()
        async with async_session_maker() as db:
            record = await db.get(MasterDeckModel, deck_id)
            if record:
                record.parse_status = "failed"
                record.parsed_config = {"error": str(exc)}
                await db.commit()
|
|
|
|
|
|
def _normalized_fonts(xml_content) -> set:
    """Return the set of normalized font family names found in one OXML document."""
    return {
        normalize_font_family_name(font)
        for font in extract_fonts_from_oxml(xml_content)
        if font
    }


async def _do_parse(deck_id: uuid.UUID) -> dict:
    """Core parsing logic. Returns dict with parsed_config, layouts, thumbnail_path.

    Steps: load the deck row, extract slide XMLs, layout XMLs and theme info,
    render the deck to PDF and copy the page images to the deck's permanent
    ``screenshots`` directory, collect fonts, then (when ``OPENAI_API_KEY`` is
    set) generate HTML and React code for every layout that has a screenshot.

    Screenshots are rendered from the deck's slide pages; layout ``idx`` is
    paired with page ``idx`` as its preview.

    Args:
        deck_id: Primary key of the ``MasterDeckModel`` row to parse.

    Returns:
        A dict with ``parsed_config`` (theme, slide/layout counts, sorted
        fonts), ``layouts`` (one entry per slide layout) and
        ``thumbnail_path`` (first page screenshot, or ``None``).

    Raises:
        ValueError: The deck row does not exist.
        FileNotFoundError: The deck has no stored file path or the file is
            missing on disk.
    """
    async with async_session_maker() as session:
        from models.sql.master_deck import MasterDeckModel

        deck = await session.get(MasterDeckModel, deck_id)
        if not deck:
            raise ValueError("Deck not found")
        pptx_path = deck.original_file_path
        client_id = deck.client_id

    # An unset path would make os.path.exists() raise TypeError; report it
    # as a missing file like any other unavailable upload.
    if not pptx_path or not os.path.exists(pptx_path):
        raise FileNotFoundError(f"PPTX file not found: {pptx_path}")

    with tempfile.TemporaryDirectory() as temp_dir:
        # 1. Extract slide XMLs (actual slides, not layouts) for screenshots
        slide_xmls = _extract_slide_xmls(pptx_path, temp_dir)

        # 2. Extract slide layout XMLs from ppt/slideLayouts/
        layout_metas = _extract_slide_layout_xmls(pptx_path, temp_dir)

        # 3. Extract theme info
        theme_info = _extract_theme_info(pptx_path, temp_dir)

        # 4. Convert to PDF → screenshots (for slides, used as layout previews)
        # screenshots[i] is the stored image for page i, or None when that
        # page produced no usable image, so indices always match page numbers.
        screenshots: List[Optional[str]] = []
        thumbnail_path = None
        try:
            pdf_path = await _convert_pptx_to_pdf(pptx_path, temp_dir)
            screenshot_paths = await DocumentsLoader.get_page_images_from_pdf_async(
                pdf_path, temp_dir
            )
            # Copy screenshots to permanent location
            deck_dir = os.path.join(
                os.path.dirname(__file__), "..", "data", "clients",
                str(client_id), "master_decks", str(deck_id), "screenshots"
            )
            os.makedirs(deck_dir, exist_ok=True)

            for i, sp in enumerate(screenshot_paths):
                dest = None
                if os.path.exists(sp) and os.path.getsize(sp) > 0:
                    dest = os.path.join(deck_dir, f"slide_{i + 1}.png")
                    shutil.copy2(sp, dest)
                    if i == 0:
                        thumbnail_path = dest
                screenshots.append(dest)

        except Exception as e:
            # Previews are optional; parsing continues without them.
            print(f"Screenshot generation failed (non-fatal): {e}")

        # 5. Collect all fonts used (per-layout sets are reused in step 6)
        layout_fonts = [_normalized_fonts(lm["xml_content"]) for lm in layout_metas]
        all_fonts = set()
        for fonts in layout_fonts:
            all_fonts.update(fonts)
        for sx in slide_xmls:
            all_fonts.update(_normalized_fonts(sx))

        # 6. Process each slide layout through LLM pipeline
        api_key = os.getenv("OPENAI_API_KEY")
        layouts_result = []

        for idx, lm in enumerate(layout_metas):
            screenshot = screenshots[idx] if idx < len(screenshots) else None
            layout_entry = {
                "index": idx,
                "layout_name": lm["layout_name"],
                "layout_type": _guess_layout_type(lm["layout_name"]),
                "xml_snippet": lm["xml_content"][:2000],  # Store truncated XML for reference
                "fonts": sorted(layout_fonts[idx]),
                "html": None,
                "react_code": None,
                "screenshot_path": screenshot,
            }

            # Run LLM pipeline if API key available and we have a screenshot
            if api_key and screenshot and os.path.exists(screenshot):
                try:
                    with open(screenshot, "rb") as img_f:
                        img_b64 = base64.b64encode(img_f.read()).decode("utf-8")

                    # Step A: Generate HTML from slide screenshot + layout OXML
                    html = await generate_html_from_slide(
                        base64_image=img_b64,
                        media_type="image/png",
                        xml_content=lm["xml_content"],
                        api_key=api_key,
                        fonts=layout_entry["fonts"] or None,
                    )
                    # Strip Markdown code fences the model may wrap around output.
                    html = html.replace("```html", "").replace("```", "")
                    layout_entry["html"] = html

                    # Step B: Generate React component from HTML
                    react_code = await generate_react_component_from_html(
                        html_content=html,
                        api_key=api_key,
                        image_base64=img_b64,
                        media_type="image/png",
                    )
                    react_code = react_code.replace("```tsx", "").replace("```", "")
                    layout_entry["react_code"] = react_code

                except Exception as e:
                    # A failed layout is stored without generated code rather
                    # than aborting the whole deck; both outputs are cleared.
                    print(f"LLM pipeline failed for layout {idx} ({lm['layout_name']}): {e}")
                    layout_entry["html"] = None
                    layout_entry["react_code"] = None

            layouts_result.append(layout_entry)

    parsed_config = {
        "theme": theme_info,
        "total_slides": len(slide_xmls),
        "total_layouts": len(layout_metas),
        "fonts": sorted(all_fonts),
    }

    return {
        "parsed_config": parsed_config,
        "layouts": layouts_result,
        "thumbnail_path": thumbnail_path,
    }
|