diff --git a/backend/app/api/matching.py b/backend/app/api/matching.py index d6915c1..a4b3a12 100644 --- a/backend/app/api/matching.py +++ b/backend/app/api/matching.py @@ -10,7 +10,7 @@ from app.database import get_db from app.models.gmal import GmalAsset from app.models.project import Project, ClientAsset, Match, ProjectStatus, MatchConfidence from app.schemas.project import ClientAssetOut, ClientAssetUpdate, MatchOut, MatchSelectRequest, ManualMatchRequest -from app.services.doc_parser import parse_uploaded_file +from app.services.doc_parser import extract_text_from_file, parse_text_with_ai from app.services.ai_matching import match_client_assets router = APIRouter() @@ -26,25 +26,52 @@ async def upload_client_document( """Upload a client document and extract assets using AI.""" project = await _get_project(project_id, db) - # Read file + # Stage 1: Uploading content = await file.read() project.source_filename = file.filename - project.status = ProjectStatus.PARSING + project.status = ProjectStatus.UPLOADING + project.parse_stage = f"Uploading {file.filename}..." + await db.commit() + + # Stage 2: Extracting text + project.status = ProjectStatus.EXTRACTING + project.parse_stage = "Extracting text from document..." + await db.commit() - # Parse document to extract assets try: - extracted, usage_info = parse_uploaded_file(content, file.filename) + text, metadata = extract_text_from_file(content, file.filename) except Exception as e: - logger.error(f"Document parsing failed: {e}") + project.status = ProjectStatus.DRAFT + project.parse_stage = None + await db.commit() + raise HTTPException(status_code=400, detail=f"Failed to extract text: {str(e)}") + + sheets_info = f" ({metadata['sheet_count']} sheets)" if metadata['sheet_count'] else "" + project.parse_stage = f"Extracted {metadata['char_count']:,} characters{sheets_info}. Sending to AI..." + project.status = ProjectStatus.PARSING + await db.commit() + + # Stage 3: AI parsing + try: + extracted, usage_info = parse_text_with_ai(text) + except Exception as e: + logger.error(f"AI parsing failed: {e}") + project.status = ProjectStatus.DRAFT + project.parse_stage = None + await db.commit() raise HTTPException(status_code=400, detail=f"Failed to parse document: {str(e)}") - # Save AI costs to project + # Save AI costs project.ai_input_tokens = (project.ai_input_tokens or 0) + usage_info.get("input_tokens", 0) project.ai_output_tokens = (project.ai_output_tokens or 0) + usage_info.get("output_tokens", 0) project.ai_cost_usd = float(project.ai_cost_usd or 0) + usage_info.get("cost_usd", 0) project.ai_call_count = (project.ai_call_count or 0) + 1 - # Clear existing client assets for this project + # Stage 4: Saving results + project.parse_stage = f"AI found {len(extracted)} assets. Saving..." + await db.commit() + + # Clear existing client assets existing = await db.execute( select(ClientAsset).where(ClientAsset.project_id == project_id) ) @@ -65,6 +92,7 @@ async def upload_client_document( assets.append(ca) project.status = ProjectStatus.REVIEW + project.parse_stage = f"Done! {len(assets)} assets extracted." await db.commit() return { diff --git a/backend/app/api/projects.py b/backend/app/api/projects.py index d0493cd..237ea12 100644 --- a/backend/app/api/projects.py +++ b/backend/app/api/projects.py @@ -97,6 +97,7 @@ def _project_out(project: Project, asset_count: int) -> ProjectOut: model_type=project.model_type.value, status=project.status.value, source_filename=project.source_filename, + parse_stage=project.parse_stage, ai_input_tokens=project.ai_input_tokens or 0, ai_output_tokens=project.ai_output_tokens or 0, ai_cost_usd=float(project.ai_cost_usd or 0), diff --git a/backend/app/models/project.py b/backend/app/models/project.py index 446da58..18a7ebb 100644 --- a/backend/app/models/project.py +++ b/backend/app/models/project.py @@ -10,6 +10,8 @@ from app.models.gmal import ModelType class ProjectStatus(str, enum.Enum): DRAFT = "draft" + UPLOADING = "uploading" + EXTRACTING = "extracting" PARSING = "parsing" MATCHING = "matching" REVIEW = "review" @@ -34,6 +36,7 @@ class Project(Base): model_type: Mapped[ModelType] = mapped_column(Enum(ModelType), default=ModelType.CURRENT_OPLUS) status: Mapped[ProjectStatus] = mapped_column(Enum(ProjectStatus), default=ProjectStatus.DRAFT) source_filename: Mapped[str | None] = mapped_column(String(255)) + parse_stage: Mapped[str | None] = mapped_column(String(255)) ai_input_tokens: Mapped[int] = mapped_column(Integer, default=0) ai_output_tokens: Mapped[int] = mapped_column(Integer, default=0) ai_cost_usd: Mapped[float] = mapped_column(Numeric(10, 6), default=0) diff --git a/backend/app/schemas/project.py b/backend/app/schemas/project.py index 33e622e..e42ca05 100644 --- a/backend/app/schemas/project.py +++ b/backend/app/schemas/project.py @@ -24,6 +24,7 @@ class ProjectOut(BaseModel): model_type: str status: str source_filename: str | None + parse_stage: str | None = None ai_input_tokens: int = 0 ai_output_tokens: int = 0 ai_cost_usd: float = 0 diff --git a/backend/app/services/ai_matching.py b/backend/app/services/ai_matching.py index 488a25b..535e8b1 100644 --- a/backend/app/services/ai_matching.py +++ b/backend/app/services/ai_matching.py @@ -6,7 +6,7 @@ import threading from collections import defaultdict from concurrent.futures import ThreadPoolExecutor -from sqlalchemy import select +from sqlalchemy import select, text, func from sqlalchemy.ext.asyncio import AsyncSession from app.models.gmal import GmalAsset @@ -86,24 +86,28 @@ SYSTEM_PROMPT = """You are a GMAL asset matching specialist for a creative produ Your job is to match client-described assets/deliverables to the closest equivalent(s) in the GMAL catalog. -The GMAL catalog is a standardized list of creative production assets, each with: -- A unique GMAL ID (e.g., GMAL101) -- Asset name and description -- Complexity level (Simple=1, Medium=2, Complex=3) -- Detailed complexity description +You are given the FULL GMAL catalog. Each entry has: GMAL ID | Asset Name | Complexity | Category. Guidelines: - Match based on the TYPE of deliverable first, then complexity level. -- Consider that clients may use different terminology (e.g., "banner" vs "web banner", "copywriting" vs "editorial"). +- Clients use different terminology than GMAL. Use your understanding of creative production to bridge the gap: + - "Key Visual" / "KV" = Photography/Key Visual GMALs + - "PDP copy" / "product listing" = Copywriting/eCommerce GMALs + - "Launch video" / "hero video" = Campaign Video/TVC GMALs + - "Presentation deck" / "toolbox" = Presentation GMALs + - "Display banner" / "digital ad" = Standard Banner/Display GMALs + - "Social post" / "social content" = Social Content/Social Video GMALs + - "BTS" / "behind the scenes" = Behind The Scenes GMALs - If the client asset maps clearly to one GMAL, set confidence="exact" with score 0.9-1.0. - If similar but with notable differences, set confidence="close" with score 0.6-0.89. - If multiple GMALs could match, return up to 3 ranked options with confidence="multiple". - If nothing matches well, return the closest option with confidence="none" and score below 0.3. - Always explain caveats: what the GMAL includes/excludes vs what the client described. -- Pay attention to complexity: a "simple banner" should match a Simple complexity GMAL, not Complex.""" +- Pay attention to complexity: a "simple banner" should match a Simple complexity GMAL, not Complex. +- Be generous with scoring when the match is semantically correct even if the naming differs.""" -def _match_single_asset(client_asset_name, client_asset_desc, volume, candidates_text, num_candidates): +def _match_single_asset(client_asset_name, client_asset_desc, volume, catalog_text, num_assets): """Run a single match call to Claude (synchronous, for use in thread pool).""" user_msg = f"""Match this client asset to the best GMAL equivalent(s): @@ -112,8 +116,8 @@ Name: {client_asset_name} Description: {client_asset_desc or 'No description provided'} Volume: {volume} -GMAL CATALOG CANDIDATES ({num_candidates} assets): -{candidates_text}""" +FULL GMAL CATALOG ({num_assets} assets): +{catalog_text}""" response = call_claude( system=SYSTEM_PROMPT, @@ -138,13 +142,17 @@ async def match_client_assets( """ _clear_cancel(project_id) - # Load all GMAL assets for candidate selection + # Load all GMAL assets - send full compact catalog to Claude (only ~3k tokens) result = await db.execute( - select(GmalAsset).where(GmalAsset.has_hour_routes == True) + select(GmalAsset).where(GmalAsset.has_hour_routes == True).order_by(GmalAsset.gmal_id) ) all_gmals = result.scalars().all() gmal_by_id = {g.gmal_id: g for g in all_gmals} + # Build compact catalog once - reused for every match call + catalog_text = _format_compact_catalog(all_gmals) + logger.info(f"Full GMAL catalog: {len(all_gmals)} assets, ~{len(catalog_text)} chars") + all_matches = [] total = len(client_assets) @@ -158,18 +166,11 @@ async def match_client_assets( batch_num = batch_start // BATCH_SIZE + 1 logger.info(f"Matching batch {batch_num} ({batch_start+1}-{min(batch_start+BATCH_SIZE, total)} of {total})") - # Prepare all calls for this batch - call_args = [] - for ca in batch: - candidates = _prefilter_candidates(ca, all_gmals) - candidates_text = _format_candidates(candidates) - call_args.append((ca, candidates, candidates_text)) - # Run batch in parallel using thread pool loop = asyncio.get_event_loop() with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor: futures = [] - for ca, candidates, candidates_text in call_args: + for ca in batch: if _is_cancelled(project_id): break future = loop.run_in_executor( @@ -178,8 +179,8 @@ async def match_client_assets( ca.raw_name, ca.raw_description, ca.volume, - candidates_text, - len(candidates), + catalog_text, + len(all_gmals), ) futures.append((ca, future)) @@ -241,60 +242,18 @@ async def match_client_assets( return all_matches -def _prefilter_candidates(client_asset: ClientAsset, all_gmals: list[GmalAsset], max_candidates: int = 25) -> list[GmalAsset]: - """Pre-filter GMAL candidates using keyword overlap to reduce token usage.""" - name = (client_asset.raw_name or "").lower() - desc = (client_asset.raw_description or "").lower() - search_text = f"{name} {desc}" +def _format_compact_catalog(all_gmals: list[GmalAsset]) -> str: + """Format the full GMAL catalog as a compact list for Claude. - stop_words = {"the", "a", "an", "and", "or", "for", "to", "in", "of", "with", "is", "on", "at", "by"} - keywords = set(search_text.split()) - stop_words + ~3k tokens for 243 assets. Much cheaper than pre-filtering and missing the right match. + """ + lines = [] + current_cat = None + for g in sorted(all_gmals, key=lambda x: (x.sub_category or '', x.gmal_id)): + if g.sub_category != current_cat: + current_cat = g.sub_category + lines.append(f"\n[{current_cat}]") + complexity = g.complexity_name or f"L{g.complexity_level}" + lines.append(f" {g.gmal_id}: {g.unique_name or g.asset_name} ({complexity})") - scored = [] - for gmal in all_gmals: - gmal_text = " ".join(filter(None, [ - gmal.asset_name, - gmal.sub_category, - gmal.unique_name, - gmal.complexity_description, - gmal.ai_enhanced_description, - ])).lower() - - score = sum(1 for kw in keywords if kw in gmal_text) - - if gmal.asset_name and any(word in gmal.asset_name.lower() for word in name.split() if len(word) > 3): - score += 5 - - scored.append((score, gmal)) - - scored.sort(key=lambda x: x[0], reverse=True) - candidates = [g for _, g in scored[:max_candidates]] - - if len([s for s, _ in scored[:max_candidates] if s > 0]) < 5: - seen_cats = set() - for _, g in scored: - if g.sub_category not in seen_cats and g not in candidates: - candidates.append(g) - seen_cats.add(g.sub_category) - if len(candidates) >= max_candidates: - break - - return candidates - - -def _format_candidates(candidates: list[GmalAsset]) -> str: - """Format GMAL candidates as text for the Claude prompt.""" - parts = [] - for g in candidates: - desc = g.ai_enhanced_description or g.complexity_description or g.asset_description or "" - if len(desc) > 300: - desc = desc[:300] + "..." - - parts.append( - f"- {g.gmal_id}: {g.unique_name or g.asset_name} " - f"(Complexity: {g.complexity_name or g.complexity_level}, " - f"Category: {g.sub_category})\n" - f" Description: {desc}" - ) - - return "\n\n".join(parts) + return "\n".join(lines) diff --git a/backend/app/services/doc_parser.py b/backend/app/services/doc_parser.py index 336866a..34061c8 100644 --- a/backend/app/services/doc_parser.py +++ b/backend/app/services/doc_parser.py @@ -63,29 +63,41 @@ Be thorough - extract every distinct asset type mentioned. If the same asset app Do NOT combine different asset types into one entry.""" -def parse_uploaded_file(file_content: bytes, filename: str) -> list[dict]: - """Parse a client document and extract assets using Claude. - - Returns a list of dicts: [{name, description, complexity_hint, volume}, ...] - """ +def extract_text_from_file(file_content: bytes, filename: str) -> tuple[str, dict]: + """Extract text from a file. Returns (text, metadata).""" ext = Path(filename).suffix.lower() if ext == ".docx": text = _extract_docx_text(file_content) + sheet_count = 0 elif ext in (".xlsx", ".xls"): text = _extract_excel_text(file_content) + wb = openpyxl.load_workbook(io.BytesIO(file_content), data_only=True) + sheet_count = len(wb.sheetnames) elif ext == ".txt": text = file_content.decode("utf-8", errors="replace") + sheet_count = 0 else: raise ValueError(f"Unsupported file type: {ext}. Use .docx, .xlsx, or .txt") if not text or len(text.strip()) < 20: raise ValueError("Document appears to be empty or too short to extract assets from.") + metadata = { + "char_count": len(text), + "sheet_count": sheet_count, + "file_type": ext, + } + # Truncate very long documents to manage token usage if len(text) > 50000: text = text[:50000] + "\n\n[Document truncated...]" + return text, metadata + + +def parse_text_with_ai(text: str) -> tuple[list[dict], dict]: + """Send extracted text to Claude to identify assets. Returns (assets, usage_info).""" response = call_claude( system=SYSTEM_PROMPT, user_message=f"Extract all deliverable assets from this client document:\n\n{text}", @@ -94,7 +106,6 @@ def parse_uploaded_file(file_content: bytes, filename: str) -> list[dict]: max_tokens=16000, ) - # Extract usage info for per-project tracking usage_info = getattr(response, '_usage_info', {"input_tokens": 0, "output_tokens": 0, "cost_usd": 0}) result = extract_tool_result(response) diff --git a/backend/app/services/excel_parser.py b/backend/app/services/excel_parser.py index 991ffbb..8fcc004 100644 --- a/backend/app/services/excel_parser.py +++ b/backend/app/services/excel_parser.py @@ -61,6 +61,16 @@ def parse_gmal_workbook(filepath: str, db: Session) -> dict: # Step 4: Load role-level mappings result["role_mappings_loaded"] = _load_role_mappings(wb, db) + # Populate full-text search vectors + db.execute(text(""" + UPDATE gmal_assets SET search_vector = + setweight(to_tsvector('english', coalesce(asset_name, '')), 'A') || + setweight(to_tsvector('english', coalesce(unique_name, '')), 'A') || + setweight(to_tsvector('english', coalesce(sub_category, '')), 'B') || + setweight(to_tsvector('english', coalesce(asset_description, '')), 'C') || + setweight(to_tsvector('english', coalesce(complexity_description, '')), 'C') + """)) + db.commit() logger.info(f"Ingestion complete: {result}") return result diff --git a/backend/app/utils/claude_client.py b/backend/app/utils/claude_client.py index cd5db38..5a1e923 100644 --- a/backend/app/utils/claude_client.py +++ b/backend/app/utils/claude_client.py @@ -129,10 +129,10 @@ def call_claude( # Attach usage to response for callers to save per-project response._usage_info = {"input_tokens": inp, "output_tokens": out, "cost_usd": cost} - logger.info( - f"Claude API call: {inp} in / {out} out tokens, " - f"${cost:.4f} this call, ${_usage['total_cost_usd']:.4f} total" - ) + logger.info( + f"Claude API call: {inp} in / {out} out tokens, " + f"${cost:.4f} this call, ${_usage['total_cost_usd']:.4f} total" + ) return response diff --git a/frontend/src/pages/ProjectView.css b/frontend/src/pages/ProjectView.css index b874617..b7055c2 100644 --- a/frontend/src/pages/ProjectView.css +++ b/frontend/src/pages/ProjectView.css @@ -92,6 +92,37 @@ border-color: var(--color-text-muted); } +.upload-active { + border-color: var(--color-primary); + background: rgba(255, 196, 7, 0.03); +} + +.upload-spinner { + width: 36px; + height: 36px; + border: 3px solid var(--color-border); + border-top-color: var(--color-primary); + border-radius: 50%; + animation: spin 0.8s linear infinite; + margin: 0 auto 16px; +} + +@keyframes spin { + to { transform: rotate(360deg); } +} + +.upload-stage { + color: var(--color-primary); + font-size: 14px; + font-weight: 600; + animation: pulse 1.5s ease-in-out infinite; +} + +@keyframes pulse { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.6; } +} + .upload-icon { color: var(--color-text-muted); margin-bottom: 16px; @@ -235,14 +266,6 @@ span.conf-badge-sm.conf-none { background: var(--color-danger); } font-style: italic; } -.match-group-collapsed-body { - display: none; -} - -.match-group-expanded .match-group-collapsed-body { - display: block; -} - .match-group-collapsed .match-group-header { border-bottom: none; } diff --git a/frontend/src/pages/ProjectView.tsx b/frontend/src/pages/ProjectView.tsx index 9f58de8..3eaf6f4 100644 --- a/frontend/src/pages/ProjectView.tsx +++ b/frontend/src/pages/ProjectView.tsx @@ -23,8 +23,10 @@ export default function ProjectView() { const [ratecard, setRatecard] = useState(null); const [loading, setLoading] = useState(true); const [uploading, setUploading] = useState(false); + const [uploadStage, setUploadStage] = useState(''); const [matching, setMatching] = useState(false); const [building, setBuilding] = useState(false); + const [expandedGroups, setExpandedGroups] = useState>(new Set()); const loadProject = useCallback(async () => { try { @@ -62,6 +64,18 @@ export default function ProjectView() { const file = e.target.files?.[0]; if (!file) return; setUploading(true); + setUploadStage(`Uploading ${file.name}...`); + + // Poll project status for stage updates + const pollInterval = setInterval(async () => { + try { + const res = await api.get(`/projects/${id}`); + if (res.data.parse_stage) { + setUploadStage(res.data.parse_stage); + } + } catch {} + }, 1500); + try { const form = new FormData(); form.append('file', file); @@ -71,7 +85,9 @@ export default function ProjectView() { } catch (err: any) { alert(`Upload failed: ${err.response?.data?.detail || err.message}`); } finally { + clearInterval(pollInterval); setUploading(false); + setUploadStage(''); } } @@ -133,9 +149,15 @@ export default function ProjectView() { } } - async function handleSelectMatch(matchId: number) { + async function handleSelectMatch(matchId: number, clientAssetId: number) { try { await api.put(`/projects/${id}/matches/${matchId}/select`, { is_selected: true }); + // Collapse the group after selection + setExpandedGroups(prev => { + const next = new Set(prev); + next.delete(clientAssetId); + return next; + }); await loadProject(); } catch (err: any) { alert(`Failed: ${err.response?.data?.detail || err.message}`); @@ -206,20 +228,29 @@ export default function ProjectView() { {tab === 'upload' && (
-
-
- - - -
-

Upload Client Document

-

Word (.docx) or Excel (.xlsx) file with the client's asset brief

- - {project.source_filename && ( -

Current: {project.source_filename}

+
+ {uploading ? ( + <> +
+

{uploadStage}

+ + ) : ( + <> +
+ + + +
+

Upload Client Document

+

Word (.docx) or Excel (.xlsx) file with the client's asset brief

+ + {project.source_filename && ( +

Current: {project.source_filename}

+ )} + )}
@@ -266,22 +297,31 @@ export default function ProjectView() { {assets.map(a => { const assetMatches = matchesByAsset[a.id] || []; const selectedMatch = assetMatches.find(m => m.is_selected); - const isCollapsed = !!selectedMatch && (selectedMatch.confidence_score ?? 0) >= 0.8; + const hasSelected = !!selectedMatch; + const isExpanded = expandedGroups.has(a.id); + const showBody = !hasSelected || isExpanded; + + function toggleGroup() { + if (!hasSelected) return; + setExpandedGroups(prev => { + const next = new Set(prev); + if (next.has(a.id)) next.delete(a.id); + else next.add(a.id); + return next; + }); + } + return ( -
+
{ - // Toggle expand/collapse by toggling a CSS class - const el = document.getElementById(`match-group-${a.id}`); - el?.classList.toggle('match-group-expanded'); - }} - style={{ cursor: isCollapsed ? 'pointer' : 'default' }} + onClick={toggleGroup} + style={{ cursor: hasSelected ? 'pointer' : 'default' }} >
{a.raw_name} - {selectedMatch && ( + {hasSelected && !isExpanded && ( {Math.round((selectedMatch.confidence_score || 0) * 100)}% @@ -289,7 +329,7 @@ export default function ProjectView() { {selectedMatch.gmal_id} - {selectedMatch.gmal_unique_name || selectedMatch.gmal_name} )} - {isCollapsed && click to expand} + {hasSelected && {isExpanded ? 'click to collapse' : 'click to expand'}}
{a.raw_description && ( {a.raw_description} @@ -298,7 +338,8 @@ export default function ProjectView() { Vol: {a.volume}
-
+ {showBody && ( +
{assetMatches.length === 0 ? (
No matches yet. Click "Run AI Matching" to find GMAL equivalents. @@ -318,7 +359,7 @@ export default function ProjectView() { {m.confidence} {m.confidence_score ? `${Math.round(m.confidence_score * 100)}%` : ''} {!m.is_selected ? ( - ) : ( @@ -341,6 +382,7 @@ export default function ProjectView() { )) )}
+ )}
); })}