Phase 4: Fix critical bugs, improve document parsing, add vision OCR

- Fix SSE stream 500: use async_session_maker inside StreamingResponse generator
  (Depends session closes when endpoint returns, before streaming starts)
- Fix template application: store template_name in prepare endpoint so worker
  uses the selected custom template instead of defaulting to "general"
- Fix OverlayLoader: replace loading.gif with HamsterLoader component
- Fix parse_mode default: change from "slides" to "layouts" to avoid 70+ layouts
- Update Gemini Flash model to gemini-3.1-flash-image-preview
- Improve DOCX parsing: python-docx for structured table extraction, OCR enabled
- Add vision-based image text extraction via Gemini for uploaded images
- Add LayoutParser integration for slide layout structure analysis
- Add Phase 4 MVP features: transfer ownership, URL input, follow-up questions,
  attachment-to-slide mapping, content router

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-02-27 14:07:00 +00:00
parent 69a8829750
commit e8295d6e71
21 changed files with 859 additions and 62 deletions

View file

@ -16,6 +16,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
chromium \
fontconfig \
curl \
libgl1 \
libglib2.0-0 \
&& rm -rf /var/lib/apt/lists/*
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium

View file

@ -64,7 +64,7 @@ async def _list_decks(client_id: uuid.UUID, include_inactive: bool, session: Asy
"name": d.name,
"description": d.description,
"thumbnail_path": d.thumbnail_path,
"parse_mode": getattr(d, "parse_mode", None) or "slides",
"parse_mode": getattr(d, "parse_mode", None) or "layouts",
"parse_status": d.parse_status,
"is_active": d.is_active,
"layouts": d.layouts,
@ -104,7 +104,7 @@ async def list_master_decks(
async def upload_master_deck(
client_id: uuid.UUID,
file: UploadFile = File(...),
parse_mode: str = Query("slides", description="Parse mode: 'slides' (default) or 'layouts'"),
parse_mode: str = Query("layouts", description="Parse mode: 'layouts' (default, unique slideLayouts) or 'slides' (one layout per slide)"),
admin: UserModel = Depends(require_client_admin),
session: AsyncSession = Depends(get_async_session),
):

View file

@ -3,9 +3,12 @@ from typing import List, Optional
import uuid
from fastapi import APIRouter, Depends, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import func
from sqlalchemy.ext.asyncio import AsyncSession
from sqlmodel import select
from sqlmodel import select, update
from models.sql.presentation import PresentationModel
from models.sql.user import UserModel
from services.database import get_async_session
from utils.auth_dependencies import require_super_admin
@ -15,6 +18,11 @@ USERS_ROUTER = APIRouter(prefix="/users", tags=["Admin - Users"])
VALID_ROLES = {"super_admin", "client_admin", "user"}
class TransferOwnershipRequest(BaseModel):
    """Request body for the transfer-ownership admin endpoint."""

    # ID of the user who becomes the owner of the transferred presentations.
    new_owner_id: uuid.UUID
    # Optional filter: when set, only presentations whose client_id equals this
    # value are transferred; when omitted, no client filter is applied.
    client_id: Optional[uuid.UUID] = None
@USERS_ROUTER.get("", response_model=List[dict])
async def list_users(
_: UserModel = Depends(require_super_admin),
@ -93,6 +101,55 @@ async def update_user_role(
return {"message": "Role updated", "user_id": str(user.id), "role": role}
@USERS_ROUTER.post("/{user_id}/transfer-ownership")
async def transfer_ownership(
    user_id: uuid.UUID,
    body: TransferOwnershipRequest,
    _: UserModel = Depends(require_super_admin),
    session: AsyncSession = Depends(get_async_session),
):
    """Transfer all non-deleted presentations from one user to another.

    Used for GDPR compliance before deactivating a user.

    Args:
        user_id: Current owner whose presentations are handed over.
        body: Target owner and optional ``client_id`` filter.

    Returns:
        A dict with a human-readable ``message``, the machine-readable
        ``transferred_count``, and both user ids as strings.

    Raises:
        HTTPException: 400 when source and target are the same user or the
            target user is deactivated; 404 when either user does not exist.
    """
    if user_id == body.new_owner_id:
        raise HTTPException(
            status_code=400, detail="Cannot transfer ownership to the same user"
        )

    # Validate source user exists
    source_user = await session.get(UserModel, user_id)
    if not source_user:
        raise HTTPException(status_code=404, detail="Source user not found")

    # Validate target user exists
    target_user = await session.get(UserModel, body.new_owner_id)
    if not target_user:
        raise HTTPException(status_code=404, detail="Target user not found")

    # Handing presentations to a deactivated account would leave them just as
    # orphaned as before the transfer, so refuse it explicitly.
    if getattr(target_user, "is_active", True) is False:
        raise HTTPException(
            status_code=400,
            detail="Cannot transfer ownership to a deactivated user",
        )

    # Build the update statement for non-deleted presentations owned by the source user
    stmt = (
        update(PresentationModel)
        .where(PresentationModel.owner_id == user_id)
        .where(PresentationModel.deleted_at.is_(None))
    )
    if body.client_id is not None:
        stmt = stmt.where(PresentationModel.client_id == body.client_id)
    stmt = stmt.values(owner_id=body.new_owner_id)

    result = await session.execute(stmt)
    # Read the row count before committing; some drivers report -1 (or None)
    # when the count is unknown, which must not leak into the response.
    transferred_count = max(result.rowcount or 0, 0)
    await session.commit()

    return {
        "message": f"Transferred {transferred_count} presentations",
        "transferred_count": transferred_count,
        "from_user_id": str(user_id),
        "to_user_id": str(body.new_owner_id),
    }
@USERS_ROUTER.delete("/{user_id}")
async def deactivate_user(
user_id: uuid.UUID,
@ -106,7 +163,26 @@ async def deactivate_user(
if user.id == admin.id:
raise HTTPException(status_code=400, detail="Cannot deactivate yourself")
# Check how many active presentations this user still owns
count_query = (
select(func.count())
.select_from(PresentationModel)
.where(PresentationModel.owner_id == user_id)
.where(PresentationModel.deleted_at.is_(None))
)
count_result = await session.execute(count_query)
presentation_count = count_result.scalar_one()
user.is_active = False
session.add(user)
await session.commit()
return {"message": "User deactivated", "user_id": str(user.id)}
response = {"message": "User deactivated", "user_id": str(user.id)}
if presentation_count > 0:
response["warning"] = (
f"User still has {presentation_count} active presentations. "
"Consider transferring ownership first."
)
return response

View file

@ -0,0 +1,25 @@
from typing import List
from fastapi import APIRouter, Body, Depends, HTTPException
from models.sql.user import UserModel
from services.content_intelligence_service import ContentIntelligenceService
from utils.auth_dependencies import get_current_user
CONTENT_ROUTER = APIRouter(prefix="/content", tags=["Content"])
@CONTENT_ROUTER.post("/follow-up-questions")
async def follow_up_questions(
    content: str = Body(..., embed=True),
    _current_user: UserModel = Depends(get_current_user),
):
    """Return follow-up questions for the submitted brief.

    The content is classified by ``ContentIntelligenceService`` and the
    classification is passed to ``ask_followup_questions``; a falsy result is
    normalised to an empty list.

    Raises:
        HTTPException: 400 when ``content`` is empty or whitespace-only.
    """
    has_text = bool(content) and bool(content.strip())
    if not has_text:
        raise HTTPException(status_code=400, detail="Content is required")

    service = ContentIntelligenceService()
    classification = await service.classify(content)
    suggested = await service.ask_followup_questions(classification)

    questions: List[str] = suggested if suggested else []
    return {"questions": questions}

View file

@ -4,9 +4,12 @@ import os
import uuid
from typing import Annotated, List, Optional
from fastapi import APIRouter, Body, File, HTTPException, UploadFile
from fastapi import APIRouter, Body, Depends, File, HTTPException, UploadFile
from pydantic import BaseModel
from models.sql.user import UserModel
from utils.auth_dependencies import get_current_user
from constants.documents import (
EXCEL_TYPES,
IMAGE_UPLOAD_TYPES,
@ -143,17 +146,46 @@ async def decompose_files(file_paths: Annotated[List[str], Body(embed=True)]):
)
)
# --- Image files ---
# --- Image files (with vision-based text extraction) ---
for img_path in image_files:
info = extract_images_metadata(img_path)
response.append(
DecomposedFileInfo(
name=info.filename,
file_path=img_path,
file_type="image",
image_info=info.model_dump(),
# Try to extract text from image via Gemini vision
extracted_text = None
try:
from services.docling_service import extract_text_from_image_via_vision
mime_type, _ = mimetypes.guess_type(img_path)
with open(img_path, "rb") as f:
image_bytes = f.read()
extracted_text = await extract_text_from_image_via_vision(
image_bytes, mime_type or "image/png"
)
except Exception as e:
print(f"[decompose] Vision text extraction failed for {img_path}: {e}")
if extracted_text:
# Save extracted text as a text file alongside the image
text_path = TEMP_FILE_SERVICE.create_temp_file_path(
f"{uuid.uuid4()}.txt", temp_dir
)
with open(text_path, "w") as tf:
tf.write(extracted_text)
response.append(
DecomposedFileInfo(
name=os.path.basename(img_path),
file_path=text_path,
file_type="text",
)
)
else:
response.append(
DecomposedFileInfo(
name=info.filename,
file_path=img_path,
file_type="image",
image_info=info.model_dump(),
)
)
)
return response
@ -182,6 +214,24 @@ async def parse_url_endpoint(body: UrlParseRequest):
return UrlParseResponse(content=content, url=body.url)
@FILES_ROUTER.post("/fetch-url")
async def fetch_url_content(
    url: str = Body(..., embed=True),
    _current_user: UserModel = Depends(get_current_user),
):
    """Fetch a URL and extract its text content.

    Args:
        url: Absolute http(s) URL supplied by the client.

    Returns:
        ``{"text": <extracted text>, "url": <normalised url>}``.

    Raises:
        HTTPException: 400 when the URL is not a well-formed http(s) URL with
            a host; 422 when no content could be extracted.
    """
    from urllib.parse import urlparse

    url = url.strip()
    try:
        parsed = urlparse(url)
        hostname = parsed.hostname
    except ValueError:
        # urlparse / .hostname raise on malformed input such as a broken IPv6 literal.
        parsed, hostname = None, None

    # A prefix check alone accepts "http://" (no host) and rejects "HTTP://...";
    # validate the parsed components instead.
    if parsed is None or parsed.scheme.lower() not in ("http", "https") or not hostname:
        raise HTTPException(status_code=400, detail="Invalid URL")

    # NOTE(review): the server fetches a user-supplied URL here, so internal
    # addresses (loopback, link-local metadata endpoints, private ranges) are
    # reachable unless parse_url blocks them — confirm SSRF protection there.
    text = await parse_url(url)
    if not text:
        raise HTTPException(
            status_code=422, detail="Could not extract content from URL"
        )
    return {"text": text, "url": url}
@FILES_ROUTER.post("/update")
async def update_files(
file_path: Annotated[str, Body()],

View file

@ -43,7 +43,7 @@ from utils.llm_calls.generate_presentation_outlines import generate_ppt_outline
from models.sql.slide import SlideModel
from models.sse_response import SSECompleteResponse, SSEErrorResponse, SSEResponse
from services.database import get_async_session
from services.database import get_async_session, async_session_maker
from services.temp_file_service import TEMP_FILE_SERVICE
from services.concurrent_service import CONCURRENT_SERVICE
from models.sql.presentation import PresentationModel
@ -290,6 +290,7 @@ async def prepare_presentation(
sql_session.add(presentation)
presentation.outlines = presentation_outline_model.model_dump(mode="json")
presentation.title = title or presentation.title
presentation.template_name = layout.name
presentation.set_layout(layout)
presentation.set_structure(presentation_structure)
await sql_session.commit()
@ -319,11 +320,18 @@ async def stream_presentation(
image_generation_service = ImageGenerationService(get_images_directory())
async def inner():
structure = presentation.get_structure()
layout = presentation.get_layout()
outline = presentation.get_presentation_outline()
# Capture data before returning StreamingResponse, because the Depends
# session is closed once this function returns.
pres_id = id
structure = presentation.get_structure()
layout = presentation.get_layout()
outline = presentation.get_presentation_outline()
pres_language = presentation.language
pres_tone = presentation.tone
pres_verbosity = presentation.verbosity
pres_instructions = presentation.instructions
async def inner():
# These tasks will be gathered and awaited after all slides are generated
async_assets_generation_tasks = []
@ -339,17 +347,17 @@ async def stream_presentation(
slide_content = await get_slide_content_from_type_and_outline(
slide_layout,
outline.slides[i],
presentation.language,
presentation.tone,
presentation.verbosity,
presentation.instructions,
pres_language,
pres_tone,
pres_verbosity,
pres_instructions,
)
except HTTPException as e:
yield SSEErrorResponse(detail=e.detail).to_string()
return
slide = SlideModel(
presentation=id,
presentation=pres_id,
layout_group=layout.name,
layout=slide_layout.id,
index=i,
@ -381,21 +389,24 @@ async def stream_presentation(
for assets_list in generated_assets_lists:
generated_assets.extend(assets_list)
# Moved this here to make sure new slides are generated before deleting the old ones
await sql_session.execute(
delete(SlideModel).where(SlideModel.presentation == id)
)
await sql_session.commit()
# Use a new session for DB writes — the Depends session is already
# closed by the time the streaming generator executes.
async with async_session_maker() as session:
await session.execute(
delete(SlideModel).where(SlideModel.presentation == pres_id)
)
await session.commit()
sql_session.add(presentation)
sql_session.add_all(slides)
sql_session.add_all(generated_assets)
await sql_session.commit()
pres = await session.get(PresentationModel, pres_id)
session.add(pres)
session.add_all(slides)
session.add_all(generated_assets)
await session.commit()
response = PresentationWithSlides(
**presentation.model_dump(),
slides=slides,
)
response = PresentationWithSlides(
**pres.model_dump(),
slides=slides,
)
yield SSECompleteResponse(
key="presentation",

View file

@ -15,6 +15,7 @@ from api.v1.ppt.endpoints.ollama import OLLAMA_ROUTER
from api.v1.ppt.endpoints.outlines import OUTLINES_ROUTER
from api.v1.ppt.endpoints.slide import SLIDE_ROUTER
from api.v1.ppt.endpoints.pptx_slides import PPTX_FONTS_ROUTER
from api.v1.ppt.endpoints.content import CONTENT_ROUTER
API_V1_PPT_ROUTER = APIRouter(prefix="/api/v1/ppt")
@ -37,3 +38,4 @@ API_V1_PPT_ROUTER.include_router(OPENAI_ROUTER)
API_V1_PPT_ROUTER.include_router(ANTHROPIC_ROUTER)
API_V1_PPT_ROUTER.include_router(GOOGLE_ROUTER)
API_V1_PPT_ROUTER.include_router(PPTX_FONTS_ROUTER)
API_V1_PPT_ROUTER.include_router(CONTENT_ROUTER)

View file

@ -20,7 +20,10 @@ dependencies = [
"pathvalidate>=3.3.1",
"pdfplumber>=0.11.7",
"pytest>=8.4.1",
"python-docx>=1.1",
"python-pptx>=1.0.2",
"layoutparser>=0.3",
"opencv-python-headless>=4.8",
"redis>=5.0,<6",
"sqlmodel>=0.0.24",
"alembic>=1.15",

View file

@ -1,3 +1,13 @@
"""Document parsing service.
Uses Docling for PDF/PPTX and python-docx for DOCX (better table handling).
Optionally extracts text from embedded images via Gemini vision.
"""
import asyncio
import base64
import os
from typing import List, Optional
from docling.document_converter import (
DocumentConverter,
PdfFormatOption,
@ -11,7 +21,7 @@ from docling.datamodel.base_models import InputFormat
class DoclingService:
def __init__(self):
self.pipeline_options = PdfPipelineOptions()
self.pipeline_options.do_ocr = False
self.pipeline_options.do_ocr = True
self.converter = DocumentConverter(
allowed_formats=[InputFormat.PPTX, InputFormat.PDF, InputFormat.DOCX],
@ -29,5 +39,163 @@ class DoclingService:
)
def parse_to_markdown(self, file_path: str) -> str:
"""Parse any supported document to markdown via Docling."""
result = self.converter.convert(file_path)
return result.document.export_to_markdown()
def parse_docx_structured(self, file_path: str) -> str:
    """Parse a DOCX file, preferring the python-docx based extractor.

    Any exception raised by the python-docx path (including the library being
    unavailable) is logged and the generic Docling markdown conversion is
    used instead.
    """
    try:
        markdown = self._parse_docx_with_python_docx(file_path)
    except Exception as exc:
        print(f"[DoclingService] python-docx parsing failed ({exc}), falling back to Docling")
        markdown = self.parse_to_markdown(file_path)
    return markdown
def _parse_docx_with_python_docx(self, file_path: str) -> str:
    """Extract text from DOCX using python-docx with proper table handling.

    Walks the document body in order, emitting paragraphs as plain text,
    heading-styled paragraphs as markdown headings, and tables as markdown
    tables. A trailing "Embedded Images" section lists one placeholder per
    embedded image.

    Args:
        file_path: Path to the .docx file.

    Returns:
        The extracted content, blocks separated by blank lines.
    """
    from docx import Document
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    doc = Document(file_path)
    parts: List[str] = []

    for element in doc.element.body:
        # Comments / processing instructions have a non-string tag; skip them.
        if not isinstance(element.tag, str):
            continue
        tag = element.tag.rsplit("}", 1)[-1]

        if tag == "p":
            # Wrap the XML element directly instead of scanning doc.paragraphs
            # for it, which was a linear search per element (quadratic overall).
            para = Paragraph(element, doc)
            text = para.text.strip()
            if not text:
                continue
            style_name = (para.style.name or "").lower() if para.style else ""
            if "heading" in style_name:
                digit = next((ch for ch in style_name if ch.isdigit()), None)
                # Markdown supports heading levels 1-6 only.
                level = min(max(int(digit), 1), 6) if digit else 1
                parts.append(f"{'#' * level} {text}")
            else:
                parts.append(text)

        elif tag == "tbl":
            # Table — extract as markdown table
            md_table = _table_to_markdown(Table(element, doc))
            if md_table:
                parts.append(md_table)

    # Also list embedded images, if any
    embedded_images = self._extract_docx_images(doc)
    if embedded_images:
        parts.append("\n## Embedded Images\n")
        for desc in embedded_images:
            parts.append(f"- {desc}")

    return "\n\n".join(parts)
def _extract_docx_images(self, doc) -> List[str]:
"""Extract image descriptions from DOCX.
Returns alt text for images, or placeholder if no alt text.
"""
descriptions = []
try:
for rel in doc.part.rels.values():
if "image" in rel.reltype:
descriptions.append("[Embedded image]")
except Exception:
pass
return descriptions
def _find_paragraph_by_element(doc, element):
"""Find a Paragraph object matching the given XML element."""
for para in doc.paragraphs:
if para._element is element:
return para
return None
def _find_table_by_element(doc, element):
"""Find a Table object matching the given XML element."""
for table in doc.tables:
if table._element is element:
return table
return None
def _table_to_markdown(table) -> str:
"""Convert a python-docx Table to a markdown table string."""
rows = []
for row in table.rows:
cells = [cell.text.strip().replace("|", "\\|") for cell in row.cells]
rows.append(cells)
if not rows:
return ""
# Deduplicate merged cells (python-docx repeats merged cell text)
clean_rows = []
for row_cells in rows:
clean = []
for i, cell_text in enumerate(row_cells):
if i > 0 and cell_text == row_cells[i - 1]:
clean.append("") # merged cell
else:
clean.append(cell_text)
clean_rows.append(clean)
# Build markdown table
lines = []
if clean_rows:
header = clean_rows[0]
lines.append("| " + " | ".join(header) + " |")
lines.append("| " + " | ".join(["---"] * len(header)) + " |")
for row in clean_rows[1:]:
# Pad row to match header length
padded = row + [""] * (len(header) - len(row))
lines.append("| " + " | ".join(padded[:len(header)]) + " |")
return "\n".join(lines)
async def extract_text_from_image_via_vision(image_bytes: bytes, mime_type: str = "image/png") -> Optional[str]:
    """Use Gemini vision to extract text from an image.

    Args:
        image_bytes: Raw (not base64-encoded) image content.
        mime_type: MIME type of the image, e.g. ``"image/png"``.

    Returns:
        The extracted text, or None when ``GOOGLE_API_KEY`` is not set, the
        image is empty, the model reports no text, or the request fails.
    """
    # Cheap preconditions first: no key or no data means no request at all.
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key or not image_bytes:
        return None

    try:
        import google.genai as genai
        from google.genai import types

        client = genai.Client()

        prompt = (
            "Extract all text from this image. Return only the extracted text, "
            "nothing else. If no text is found, return 'No text found'."
        )
        # Hand the raw bytes to the SDK helper: the SDK base64-encodes inline
        # data itself, so passing an already-encoded string risks sending a
        # double-encoded (unreadable) image.
        image_part = types.Part.from_bytes(data=image_bytes, mime_type=mime_type)

        # The sync client call is blocking; run it off the event loop.
        response = await asyncio.to_thread(
            client.models.generate_content,
            model="gemini-2.5-flash",
            contents=[prompt, image_part],
        )

        text = response.text.strip() if response.text else None
        # Tolerate trailing punctuation on the sentinel ("No text found.").
        if text and text.lower().rstrip(".!") != "no text found":
            return text
        return None
    except Exception as e:
        print(f"[DoclingService] Vision text extraction failed: {e}")
        return None

View file

@ -92,7 +92,8 @@ class DocumentsLoader:
return await asyncio.to_thread(file.read)
def load_msword(self, file_path: str) -> str:
return self.docling_service.parse_to_markdown(file_path)
"""Parse DOCX with python-docx for better table/structure handling."""
return self.docling_service.parse_docx_structured(file_path)
def load_powerpoint(self, file_path: str) -> str:
return self.docling_service.parse_to_markdown(file_path)

View file

@ -193,9 +193,9 @@ class ImageGenerationService:
async def generate_image_gemini_flash(
self, prompt: str, output_directory: str
) -> str:
"""Generate image using Gemini Flash (gemini-2.5-flash-image-preview)."""
"""Generate image using Gemini Flash (gemini-3.1-flash-image-preview)."""
return await self._generate_image_google(
prompt, output_directory, "gemini-2.5-flash-image-preview"
prompt, output_directory, "gemini-3.1-flash-image-preview"
)
async def generate_image_nanobanana_pro(

View file

@ -0,0 +1,142 @@
"""Layout analysis service using LayoutParser for slide structure detection.
Analyzes slide screenshots to detect regions (text, image, table, title)
and provides structural metadata for LLM-based code generation.
"""
import os
from typing import List, Optional
# LayoutParser is optional — graceful fallback if not installed
_LAYOUTPARSER_AVAILABLE = False
try:
import layoutparser as lp
_LAYOUTPARSER_AVAILABLE = True
except ImportError:
pass
class DetectedRegion:
    """Bounding box of one detected slide region, with its label and confidence score."""

    __slots__ = ("type", "x1", "y1", "x2", "y2", "score")

    def __init__(self, type: str, x1: float, y1: float, x2: float, y2: float, score: float = 1.0):
        # Field order mirrors __slots__, so the two sequences can be zipped.
        for slot, value in zip(self.__slots__, (type, x1, y1, x2, y2, score)):
            setattr(self, slot, value)

    def to_dict(self) -> dict:
        """Return a plain dict with coordinates rounded to integers and score to 3 decimals."""
        payload = {"type": self.type}
        for corner in ("x1", "y1", "x2", "y2"):
            payload[corner] = round(getattr(self, corner))
        payload["score"] = round(self.score, 3)
        return payload
# Detection model shared by every analyze_slide_layout() call. Building it is
# expensive, so it is created lazily once and then reused.
_LAYOUT_MODEL = None


def _get_layout_model():
    """Return the cached PubLayNet detection model, building it on first use."""
    global _LAYOUT_MODEL
    if _LAYOUT_MODEL is None:
        # Use PubLayNet model — detects: Text, Title, List, Table, Figure
        _LAYOUT_MODEL = lp.Detectron2LayoutModel(
            config_path="lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
            label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
        )
    return _LAYOUT_MODEL


def analyze_slide_layout(image_path: str) -> List[DetectedRegion]:
    """Analyze a slide screenshot and return detected layout regions.

    Uses LayoutParser with a PubLayNet model if available.

    Args:
        image_path: Path to the slide screenshot.

    Returns:
        One DetectedRegion per detected block. An empty list when
        LayoutParser is not installed, the file is missing or unreadable, or
        model construction / detection raises.
    """
    if not _LAYOUTPARSER_AVAILABLE:
        return []

    if not os.path.exists(image_path):
        return []

    try:
        import cv2

        image = cv2.imread(image_path)
        if image is None:
            return []

        # Reuse one model across slides instead of rebuilding it per call.
        # NOTE(review): Detectron2LayoutModel presumably needs detectron2,
        # which is not among the visible dependencies — confirm it is installed.
        model = _get_layout_model()
        layout = model.detect(image)

        return [
            DetectedRegion(
                type=block.type,
                x1=block.block.x_1,
                y1=block.block.y_1,
                x2=block.block.x_2,
                y2=block.block.y_2,
                score=block.score,
            )
            for block in layout
        ]
    except Exception as e:
        print(f"[LayoutAnalysis] Detection failed: {e}")
        return []
def regions_to_description(regions: List[DetectedRegion], image_width: int = 960, image_height: int = 540) -> str:
    """Render detected regions as a bullet list for LLM context.

    Regions are listed top-to-bottom, then left-to-right. Positions and sizes
    are expressed as whole percentages of ``image_width`` / ``image_height``.
    Returns an empty string when there are no regions.
    """
    if not regions:
        return ""

    def _pct(value: float, extent: int) -> int:
        return round(value / extent * 100)

    ordered = sorted(regions, key=lambda region: (region.y1, region.x1))
    bullets = [
        f"- {region.type}: position ({_pct(region.x1, image_width)}%, {_pct(region.y1, image_height)}%), "
        f"size ({_pct(region.x2 - region.x1, image_width)}% x {_pct(region.y2 - region.y1, image_height)}%), "
        f"confidence: {region.score:.0%}"
        for region in ordered
    ]
    return "\n".join(
        ["Detected layout regions (coordinates as % of slide dimensions):", *bullets]
    )
def classify_layout_from_regions(regions: List[DetectedRegion]) -> Optional[str]:
    """Classify slide layout type based on detected regions.

    Rules are evaluated in order; the first match wins.

    Args:
        regions: Detected regions whose ``type`` is one of
            "Title", "Text", "List", "Table", "Figure" (other values are ignored).

    Returns:
        One of "title_slide", "picture", "table", "two_column", "content",
        "picture_with_caption", "blank"; or None when ``regions`` is empty.
    """
    if not regions:
        return None

    type_counts: dict = {}
    for r in regions:
        type_counts[r.type] = type_counts.get(r.type, 0) + 1

    text_count = type_counts.get("Text", 0)
    has_title = type_counts.get("Title", 0) > 0
    has_text = text_count > 0
    has_figure = type_counts.get("Figure", 0) > 0
    has_table = type_counts.get("Table", 0) > 0
    has_list = type_counts.get("List", 0) > 0

    # Classification heuristics
    # A title slide has a title and nothing else. A bullet list must also rule
    # it out, otherwise "title + list" never reaches the "content" rule below.
    if has_title and not (has_text or has_figure or has_table or has_list):
        return "title_slide"
    if has_title and has_figure and not has_text:
        return "picture"
    if has_table:
        return "table"
    if text_count >= 2 or (has_text and has_list):
        return "two_column"
    if has_title and (has_text or has_list):
        return "content"
    if has_figure and (has_text or has_title):
        return "picture_with_caption"
    # Only reachable when every region carries an unrecognised type label.
    if not any([has_title, has_text, has_figure, has_table, has_list]):
        return "blank"
    return "content"

View file

@ -461,7 +461,7 @@ async def _do_parse(deck_id: uuid.UUID) -> dict:
raise ValueError("Deck not found")
pptx_path = deck.original_file_path
client_id = deck.client_id
parse_mode = getattr(deck, "parse_mode", None) or "slides"
parse_mode = getattr(deck, "parse_mode", None) or "layouts"
if not os.path.exists(pptx_path):
raise FileNotFoundError(f"PPTX file not found: {pptx_path}")
@ -529,29 +529,55 @@ async def _do_parse(deck_id: uuid.UUID) -> dict:
print(f"[MasterDeckParser] LLM provider: {llm_provider['provider'] if llm_provider else 'NONE'}")
print(f"[MasterDeckParser] Processing {len(primary_metas)} items, {llm_count} with screenshots for LLM")
# Optional: LayoutParser region detection for better classification
from services.layout_analysis_service import (
analyze_slide_layout,
classify_layout_from_regions,
regions_to_description,
)
for idx, lm in enumerate(primary_metas):
screenshot_path = screenshots[idx] if idx < len(screenshots) else None
# Try LayoutParser classification if a screenshot is available
lp_layout_type = None
lp_region_desc = ""
if screenshot_path and os.path.exists(screenshot_path):
try:
regions = await asyncio.to_thread(analyze_slide_layout, screenshot_path)
if regions:
lp_layout_type = classify_layout_from_regions(regions)
lp_region_desc = regions_to_description(regions)
except Exception as lp_err:
print(f"[MasterDeckParser] LayoutParser skipped for {idx}: {lp_err}")
layout_entry = {
"index": idx,
"layout_name": lm["layout_name"],
"layout_type": _guess_layout_type(lm["layout_name"]),
"layout_type": lp_layout_type or _guess_layout_type(lm["layout_name"]),
"xml_snippet": lm["xml_content"][:2000],
"fonts": list(
{normalize_font_family_name(f) for f in extract_fonts_from_oxml(lm["xml_content"]) if f}
),
"html": None,
"react_code": None,
"screenshot_path": screenshots[idx] if idx < len(screenshots) else None,
"screenshot_path": screenshot_path,
}
# Run LLM pipeline if provider available and we have a screenshot
if llm_provider and idx < len(screenshots) and os.path.exists(screenshots[idx]):
if llm_provider and screenshot_path and os.path.exists(screenshot_path):
try:
print(f"[MasterDeckParser] Layout {idx + 1}/{llm_count}: {lm['layout_name']} — generating HTML...")
with open(screenshots[idx], "rb") as img_f:
with open(screenshot_path, "rb") as img_f:
img_b64 = base64.b64encode(img_f.read()).decode("utf-8")
# Include LayoutParser region info in LLM context
xml_context = lm["xml_content"]
if lp_region_desc:
xml_context = f"{lp_region_desc}\n\n---\n\n{xml_context}"
html = await _llm_generate_html(
llm_provider, img_b64, lm["xml_content"],
llm_provider, img_b64, xml_context,
layout_entry["fonts"] or None,
)
html = html.replace("```html", "").replace("```", "")

View file

@ -21,6 +21,7 @@ import {
} from "@/store/slices/presentationGeneration";
import { Button } from "@/components/ui/button";
import { Textarea } from "@/components/ui/textarea";
import { Input } from "@/components/ui/input";
import { Label } from "@/components/ui/label";
import { Slider } from "@/components/ui/slider";
import {
@ -30,7 +31,7 @@ import {
SelectTrigger,
SelectValue,
} from "@/components/ui/select";
import { ChevronLeft, ChevronRight, Layers } from "lucide-react";
import { ChevronLeft, ChevronRight, Layers, MessageCircleQuestion } from "lucide-react";
import { toast } from "sonner";
import { cn } from "@/lib/utils";
import { OverlayLoader } from "@/components/ui/overlay-loader";
@ -70,6 +71,9 @@ export default function WizardConfigurePage() {
const [loadingClients, setLoadingClients] = useState(true);
const [loadingDecks, setLoadingDecks] = useState(false);
const [isGenerating, setIsGenerating] = useState(false);
const [followUpQuestions, setFollowUpQuestions] = useState<string[]>([]);
const [followUpAnswers, setFollowUpAnswers] = useState<Record<string, string>>({});
const [loadingFollowUp, setLoadingFollowUp] = useState(false);
// Fetch clients on mount
useEffect(() => {
@ -90,6 +94,23 @@ export default function WizardConfigurePage() {
.finally(() => setLoadingDecks(false));
}, [wizard.selectedClientId]);
// Fetch follow-up questions if brief is short
useEffect(() => {
const briefContent = wizard.briefText;
if (!briefContent || briefContent.trim().length < 10) {
setFollowUpQuestions([]);
return;
}
setLoadingFollowUp(true);
WizardApi.checkFollowUpQuestions(briefContent)
.then((questions) => {
setFollowUpQuestions(questions);
setFollowUpAnswers({});
})
.finally(() => setLoadingFollowUp(false));
// eslint-disable-next-line react-hooks/exhaustive-deps
}, []); // Run once on page load
const handleBack = () => {
dispatch(setWizardStep(1));
router.push("/generate/upload");
@ -109,6 +130,16 @@ export default function WizardConfigurePage() {
.map((f) => f.serverPath)
.filter(Boolean) as string[];
// Append follow-up Q&A to instructions if any answers are provided
let finalInstructions = wizard.instructions;
const answeredPairs = followUpQuestions
.filter((q) => followUpAnswers[q]?.trim())
.map((q) => `Q: ${q}\nA: ${followUpAnswers[q].trim()}`);
if (answeredPairs.length > 0) {
const qaSuffix = "\n\n--- Follow-up Context ---\n" + answeredPairs.join("\n\n");
finalInstructions = (finalInstructions || "") + qaSuffix;
}
// Create presentation (outline mode)
const result = await WizardApi.createPresentation({
content: wizard.briefText,
@ -116,7 +147,7 @@ export default function WizardConfigurePage() {
file_paths: filePaths,
language: wizard.language,
tone: wizard.tone,
instructions: wizard.instructions,
instructions: finalInstructions,
client_id: wizard.selectedClientId ?? undefined,
master_deck_id: wizard.selectedDeckId ?? undefined,
});
@ -266,6 +297,39 @@ export default function WizardConfigurePage() {
</Select>
</div>
{/* Follow-Up Questions */}
{loadingFollowUp && (
<div className="rounded-xl border border-amber-200 bg-amber-50 p-4">
<p className="text-sm text-amber-700">Checking if we need more context...</p>
</div>
)}
{followUpQuestions.length > 0 && !loadingFollowUp && (
<div className="rounded-xl border border-amber-200 bg-amber-50 p-4 space-y-4">
<div className="flex items-center gap-2 text-amber-800">
<MessageCircleQuestion className="w-5 h-5 flex-shrink-0" />
<p className="text-sm font-medium">
A few quick questions to improve your presentation
</p>
</div>
{followUpQuestions.map((question, idx) => (
<div key={idx} className="space-y-1.5">
<Label className="text-sm text-amber-900">{question}</Label>
<Input
placeholder="Your answer (optional)"
value={followUpAnswers[question] ?? ""}
onChange={(e) =>
setFollowUpAnswers((prev) => ({
...prev,
[question]: e.target.value,
}))
}
className="bg-white border-amber-200 focus:border-amber-400"
/>
</div>
))}
</div>
)}
{/* Instructions */}
<div>
<Label className="mb-2 block">Additional Instructions</Label>

View file

@ -9,6 +9,7 @@ import {
setJobId,
setPresentationId as setWizardPresentationId,
WizardOutlineItem,
toggleSlideAttachment,
} from "@/store/slices/wizardSlice";
import { clearPresentationData } from "@/store/slices/presentationGeneration";
import { useOutlineStreaming } from "../../outline/hooks/useOutlineStreaming";
@ -22,8 +23,15 @@ import {
FileText,
Layers,
Loader2,
Paperclip,
} from "lucide-react";
import { toast } from "sonner";
import {
Popover,
PopoverContent,
PopoverTrigger,
} from "@/components/ui/popover";
import { Checkbox } from "@/components/ui/checkbox";
import { OverlayLoader } from "@/components/ui/overlay-loader";
import Wrapper from "@/components/Wrapper";
import { PresentationGenerationApi } from "../../services/api/presentation-generation";
@ -183,15 +191,88 @@ export default function WizardOutlinePage() {
Uploaded Files
</h4>
<div className="space-y-2">
{wizard.uploadedFiles.map((f, i) => (
<div
key={i}
className="flex items-center gap-2 p-2 rounded-lg bg-gray-50 text-xs"
>
<FileText className="w-3.5 h-3.5 text-[#5146E5]" />
<span className="truncate flex-1">{f.name}</span>
</div>
))}
{wizard.uploadedFiles.map((f, i) => {
// Count how many slides this file is linked to.
// Files are matched by name, so uploads that share a name are counted together.
const linkedCount = Object.values(
wizard.slideAttachments
).filter((names) => names.includes(f.name)).length;
return (
<div
key={i}
className="flex items-center gap-2 p-2 rounded-lg bg-gray-50 text-xs"
>
<FileText className="w-3.5 h-3.5 text-[#5146E5] flex-shrink-0" />
<span className="truncate flex-1">{f.name}</span>
{/* Link-to-slides popover; only rendered once at least one outline exists */}
{outlines && outlines.length > 0 && (
<Popover>
<PopoverTrigger asChild>
<button
className="inline-flex items-center gap-1 px-1.5 py-0.5 rounded text-[10px] font-medium text-gray-500 hover:text-[#5146E5] hover:bg-[#5146E5]/5 transition-colors flex-shrink-0"
title="Link to slides"
>
<Paperclip className="w-3 h-3" />
{linkedCount > 0 && (
<span className="text-[#5146E5]">
{linkedCount}
</span>
)}
</button>
</PopoverTrigger>
<PopoverContent
side="right"
align="start"
className="w-64 p-3"
>
<p className="text-xs font-semibold text-gray-700 mb-2">
Link to slides
</p>
<div className="space-y-1.5 max-h-48 overflow-y-auto">
{outlines.map((outline, slideIdx) => {
// Label each slide with the first line of its outline content, minus any
// leading "#" heading markers; fall back to "Slide N" when that is empty.
const title =
(outline.content || "")
.split("\n")[0]
?.replace(/^#+\s*/, "")
.trim() || `Slide ${slideIdx + 1}`;
// Attachments are keyed by the slide's index in the outline list.
const isLinked = (
wizard.slideAttachments[slideIdx] || []
).includes(f.name);
return (
<label
key={slideIdx}
className="flex items-center gap-2 p-1.5 rounded hover:bg-gray-50 cursor-pointer text-xs"
>
<Checkbox
checked={isLinked}
onCheckedChange={() =>
dispatch(
toggleSlideAttachment({
slideIndex: slideIdx,
fileName: f.name,
})
)
}
className="h-3.5 w-3.5"
/>
<span className="text-gray-600 font-medium w-5 flex-shrink-0">
{slideIdx + 1}.
</span>
<span className="truncate text-gray-700">
{title}
</span>
</label>
);
})}
</div>
</PopoverContent>
</Popover>
)}
</div>
);
})}
</div>
</div>
)}
@ -248,6 +329,8 @@ export default function WizardOutlinePage() {
highestActiveIndex={streamState.highestActiveIndex}
onDragEnd={handleDragEnd}
onAddSlide={handleAddSlide}
slideAttachments={wizard.slideAttachments}
uploadedFiles={wizard.uploadedFiles}
/>
</TabsContent>

View file

@ -11,8 +11,9 @@ import {
setWizardStep,
WizardUploadedFile,
} from "@/store/slices/wizardSlice";
import { Upload, X, FileText, ChevronRight, Plus } from "lucide-react";
import { Upload, X, FileText, ChevronRight, Plus, Link } from "lucide-react";
import { Button } from "@/components/ui/button";
import { Input } from "@/components/ui/input";
import { Textarea } from "@/components/ui/textarea";
import { toast } from "sonner";
import { cn } from "@/lib/utils";
@ -60,6 +61,8 @@ export default function WizardUploadPage() {
const [localFiles, setLocalFiles] = useState<File[]>([]);
const [isDragging, setIsDragging] = useState(false);
const [isProcessing, setIsProcessing] = useState(false);
const [referenceUrl, setReferenceUrl] = useState("");
const [isFetchingUrl, setIsFetchingUrl] = useState(false);
const fileInputRef = useRef<HTMLInputElement>(null);
const handleDragOver = (e: React.DragEvent) => {
@ -116,6 +119,27 @@ export default function WizardUploadPage() {
const allFiles = uploadedFiles; // display list from Redux
/**
 * Fetches the text content of the reference URL and appends it to the brief.
 *
 * Shows an error toast when the input is blank or the request fails. On
 * success the fetched text is appended to the current brief (after a
 * horizontal-rule separator when the brief is non-empty) and the URL input
 * is cleared.
 */
const handleFetchUrl = async () => {
  // The input's Enter-key handler calls this directly and is not covered by
  // the Fetch button's `disabled` state, so ignore calls while a request is
  // already in flight. Overlapping requests would each append to the same
  // `briefText` captured by this render, and the later one would win.
  if (isFetchingUrl) {
    return;
  }
  const url = referenceUrl.trim();
  if (!url) {
    toast.error("Please enter a URL");
    return;
  }
  try {
    setIsFetchingUrl(true);
    const text = await WizardApi.fetchUrl(url);
    const separator = briefText.trim() ? "\n\n---\n\n" : "";
    dispatch(setBriefText(briefText + separator + text));
    toast.success("URL content fetched and appended to brief");
    setReferenceUrl("");
  } catch (error: unknown) {
    // Thrown values are not guaranteed to be Error instances, so read
    // `message` defensively rather than typing the catch variable as `any`.
    const message =
      typeof error === "object" && error !== null && "message" in error
        ? (error as { message?: unknown }).message
        : undefined;
    toast.error("Failed to fetch URL", {
      description:
        (typeof message === "string" && message) ||
        "Please check the URL and try again.",
    });
  } finally {
    setIsFetchingUrl(false);
  }
};
const handleNext = async () => {
if (!briefText.trim() && allFiles.length === 0) {
toast.error("Please enter a brief or upload documents");
@ -244,6 +268,39 @@ export default function WizardUploadPage() {
</div>
)}
{/* Reference URL */}
<div className="mt-6">
<label className="block text-sm font-medium text-gray-700 mb-2">
<Link className="w-4 h-4 inline mr-1.5 -mt-0.5" />
Or add a reference URL
</label>
<div className="flex gap-2">
<Input
type="text"
placeholder="https://example.com/article"
value={referenceUrl}
onChange={(e) => setReferenceUrl(e.target.value)}
onKeyDown={(e) => {
if (e.key === "Enter") {
e.preventDefault();
handleFetchUrl();
}
}}
className="flex-1"
/>
<Button
type="button"
variant="outline"
size="sm"
onClick={handleFetchUrl}
disabled={isFetchingUrl || !referenceUrl.trim()}
className="px-4 whitespace-nowrap"
>
{isFetchingUrl ? "Fetching..." : "Fetch"}
</Button>
</div>
</div>
{/* Brief Text */}
<div className="mt-6">
<label className="block text-sm font-medium text-gray-700 mb-2">

View file

@ -16,6 +16,7 @@ import {
import { OutlineItem } from "./OutlineItem";
import { Button } from "@/components/ui/button";
import { FileText, Loader2 } from "lucide-react";
import type { WizardUploadedFile } from "@/store/slices/wizardSlice";
interface OutlineContentProps {
outlines: { content: string }[] | null;
@ -25,6 +26,10 @@ interface OutlineContentProps {
highestActiveIndex: number;
onDragEnd: (event: any) => void;
onAddSlide: () => void;
/** Map of slide index -> attached file names */
slideAttachments?: Record<number, string[]>;
/** All uploaded files (for reference) */
uploadedFiles?: WizardUploadedFile[];
}
const OutlineContent: React.FC<OutlineContentProps> = ({
@ -34,7 +39,9 @@ const OutlineContent: React.FC<OutlineContentProps> = ({
activeSlideIndex,
highestActiveIndex,
onDragEnd,
onAddSlide
onAddSlide,
slideAttachments,
uploadedFiles,
}) => {
const sensors = useSensors(
useSensor(PointerSensor),
@ -104,6 +111,7 @@ const OutlineContent: React.FC<OutlineContentProps> = ({
isStreaming={isStreaming}
isActiveStreaming={activeSlideIndex === index}
isStableStreaming={highestActiveIndex >= 0 && index < highestActiveIndex}
attachedFiles={slideAttachments?.[index]}
/>
))
) :
@ -119,6 +127,7 @@ const OutlineContent: React.FC<OutlineContentProps> = ({
isStreaming={isStreaming}
isActiveStreaming={false}
isStableStreaming={false}
attachedFiles={slideAttachments?.[index]}
/>
))}
</SortableContext>}

View file

@ -1,6 +1,6 @@
import { useSortable } from "@dnd-kit/sortable"
import { CSS } from "@dnd-kit/utilities"
import { Trash2 } from "lucide-react"
import { Paperclip, Trash2 } from "lucide-react"
import { RootState } from "@/store/store"
import { useDispatch, useSelector } from "react-redux"
import { deleteSlideOutline, setOutlines } from "@/store/slices/presentationGeneration"
@ -18,6 +18,8 @@ interface OutlineItemProps {
isStreaming: boolean
isActiveStreaming?: boolean
isStableStreaming?: boolean
/** File names attached to this slide */
attachedFiles?: string[]
}
export function OutlineItem({
@ -26,6 +28,7 @@ export function OutlineItem({
isStreaming,
isActiveStreaming = false,
isStableStreaming = false,
attachedFiles,
}: OutlineItemProps) {
const {
outlines,
@ -164,6 +167,21 @@ export function OutlineItem({
/>
)}
{/* Attached file badges */}
{attachedFiles && attachedFiles.length > 0 && (
<div className="flex flex-wrap gap-1.5 mt-1.5">
{attachedFiles.map((fileName) => (
<span
key={fileName}
className="inline-flex items-center gap-1 rounded-full bg-[#5146E5]/5 border border-[#5146E5]/15 px-2 py-0.5 text-[10px] text-[#5146E5]/80"
>
<Paperclip className="w-2.5 h-2.5" />
<span className="truncate max-w-[120px]">{fileName}</span>
</span>
))}
</div>
)}
</div>
{/* Action Buttons */}

View file

@ -121,6 +121,34 @@ export class WizardApi {
await ApiResponseHandler.handleResponse(response, "Failed to cancel job");
}
/**
 * POSTs the given URL to the fetch-url endpoint and resolves with whatever
 * `ApiResponseHandler.handleResponse` produces for that response.
 *
 * NOTE(review): the declared `string` result assumes the handler yields the
 * extracted text directly — confirm against the endpoint's payload shape.
 */
static async fetchUrl(url: string): Promise<string> {
  const requestBody = JSON.stringify({ url });
  const response = await fetch("/api/v1/ppt/files/fetch-url", {
    method: "POST",
    headers: getHeader(),
    body: requestBody,
    cache: "no-cache",
  });
  const extracted = await ApiResponseHandler.handleResponse(response, "Failed to fetch URL");
  return extracted;
}
/**
 * Asks the backend whether the brief needs follow-up questions.
 *
 * Best-effort by design: resolves to an empty list when the trimmed content
 * is shorter than 10 characters (no request is made), when the request or
 * response handling throws, or when the response carries no `questions`
 * array. Never rejects.
 *
 * @param content Brief text to analyse.
 * @returns The follow-up questions, or `[]` when none are available.
 */
static async checkFollowUpQuestions(content: string): Promise<string[]> {
  if (!content || content.trim().length < 10) return [];
  try {
    const response = await fetch("/api/v1/ppt/content/follow-up-questions", {
      method: "POST",
      headers: getHeader(),
      body: JSON.stringify({ content }),
      cache: "no-cache",
    });
    const data: unknown = await ApiResponseHandler.handleResponse(response, "");
    const questions = (data as { questions?: unknown } | null | undefined)?.questions;
    // The payload is untyped at this boundary: only hand back real strings so
    // callers can rely on the declared `string[]` (e.g. when mapping over it).
    return Array.isArray(questions)
      ? questions.filter((q): q is string => typeof q === "string")
      : [];
  } catch {
    // Follow-up questions are optional; a failure must not block the wizard.
    return [];
  }
}
/** Create presentation (outline-only, like existing flow) */
static async createPresentation(params: {
content: string;

View file

@ -1,5 +1,5 @@
import { cn } from "@/lib/utils"
import { Loader } from "./loader"
import { HamsterLoader } from "./hamster-loader"
import { ProgressBar } from "./progress-bar"
import { useEffect, useState } from "react"
@ -53,7 +53,9 @@ export const OverlayLoader = ({
)}
>
<img loading="eager" src={'/loading.gif'} alt="loading" width={250} height={250} />
<div className="py-8">
<HamsterLoader size="lg" />
</div>
{showProgress ? (
<div className="w-full space-y-6 pt-4">
<ProgressBar

View file

@ -39,6 +39,8 @@ interface WizardState {
presentationId: string | null;
/** Decomposed document data from server */
decomposedFiles: any[];
/** Map of slide index -> attached file names */
slideAttachments: Record<number, string[]>;
}
const STORAGE_KEY = "deckforge_wizard";
@ -77,6 +79,7 @@ const defaultState: WizardState = {
jobId: null,
presentationId: null,
decomposedFiles: [],
slideAttachments: {},
};
const persisted = loadFromStorage();
@ -142,6 +145,31 @@ const wizardSlice = createSlice({
state.decomposedFiles = action.payload;
saveToStorage(state);
},
// Replaces the whole slide-index -> file-names map, then persists the wizard state.
setSlideAttachments: (
  state,
  { payload }: PayloadAction<Record<number, string[]>>
) => {
  state.slideAttachments = payload;
  saveToStorage(state);
},
// Links `fileName` to the slide at `slideIndex` when it is not linked yet,
// otherwise unlinks it. A slide whose last file is removed loses its entry
// in the map altogether. The wizard state is persisted afterwards.
toggleSlideAttachment: (
  state,
  { payload }: PayloadAction<{ slideIndex: number; fileName: string }>
) => {
  const { slideIndex, fileName } = payload;
  const existing = state.slideAttachments[slideIndex] || [];
  if (!existing.includes(fileName)) {
    state.slideAttachments[slideIndex] = [...existing, fileName];
  } else {
    const remaining = existing.filter((name) => name !== fileName);
    if (remaining.length === 0) {
      delete state.slideAttachments[slideIndex];
    } else {
      state.slideAttachments[slideIndex] = remaining;
    }
  }
  saveToStorage(state);
},
resetWizard: (state) => {
Object.assign(state, defaultState);
if (typeof window !== "undefined") {
@ -165,6 +193,8 @@ export const {
setJobId,
setPresentationId,
setDecomposedFiles,
setSlideAttachments,
toggleSlideAttachment,
resetWizard,
} = wizardSlice.actions;