diff --git a/backend/app/api/v1/routes_admin_production.py b/backend/app/api/v1/routes_admin_production.py
new file mode 100644
index 0000000..a6ffd48
--- /dev/null
+++ b/backend/app/api/v1/routes_admin_production.py
@@ -0,0 +1,217 @@
+"""Admin production endpoints: failure dashboard and bulk retry."""
+from datetime import datetime
+
+from fastapi import APIRouter, Depends, HTTPException, Query, status
+from motor.motor_asyncio import AsyncIOMotorDatabase
+from pydantic import BaseModel
+
+from ...core.database import get_database
+from ...core.dependencies import require_roles
+from ...core.logging import get_logger
+from ...models.audit_log import AuditAction
+from ...models.job import JobStatus, RequestedOutputs
+from ...models.user import User, UserRole
+from ...schemas.job import JobResponse
+from ...services.audit_logger import audit_logger
+from ...tasks.ingest_and_ai import ingest_and_ai_task
+from ...tasks.translate_and_synthesize import translate_and_synthesize_task
+
+logger = get_logger(__name__)
+router = APIRouter(prefix="/admin/production", tags=["admin-production"])
+
+# Job statuses listed by the failure dashboard and accepted by bulk retry.
+_FAILURE_STATUSES = [
+    JobStatus.PROCESSING_FAILED.value,
+    JobStatus.TTS_FAILED.value,
+    JobStatus.RENDER_FAILED.value,
+]
+
+# Step to resume from when a failed job has no recorded ``failure.step``.
+_FALLBACK_STEP_BY_STATUS = {
+    JobStatus.PROCESSING_FAILED.value: "ingestion",
+    JobStatus.TTS_FAILED.value: "tts",
+    JobStatus.RENDER_FAILED.value: "render",
+}
+
+# Maximum number of job ids accepted by one bulk-retry request.
+_RETRY_CAP = 50
+
+
+class BulkRetryRequest(BaseModel):
+    """Request body for the bulk-retry endpoint."""
+
+    job_ids: list[str]
+    strategy: str = "auto"  # "auto" | "from_scratch"
+
+
+class BulkRetryResponse(BaseModel):
+    """Per-job outcome of a bulk retry, reported as lists of job ids."""
+
+    retried: list[str]
+    skipped: list[str]
+    errors: list[dict]
+
+
+def _resolve_retry_step(job_doc: dict, strategy: str) -> str | None:
+    """Return the pipeline step a failed job should be restarted from.
+
+    ``from_scratch`` always restarts at ingestion. Any other strategy resumes
+    at the recorded ``failure.step`` or, when none is recorded, at the step
+    implied by the job's failure status.
+    """
+    if strategy == "from_scratch":
+        return "ingestion"
+    failure = job_doc.get("failure") or {}
+    return failure.get("step") or _FALLBACK_STEP_BY_STATUS.get(job_doc["status"])
+
+
+def _reset_status_for_step(step: str | None, job_doc: dict) -> str | None:
+    """Return the status to reset the job to for ``step`` (None if unknown)."""
+    if step in ("ingestion", "ai_processing"):
+        return JobStatus.CREATED.value
+    if step == "translation":
+        return JobStatus.AI_PROCESSING.value
+    if step == "tts":
+        # English and non-English sources resume from different statuses.
+        source_language = job_doc["source"].get("language", "en")
+        if source_language == "en":
+            return JobStatus.APPROVED_ENGLISH.value
+        return JobStatus.APPROVED_SOURCE.value
+    if step == "render":
+        return JobStatus.PENDING_QC.value
+    return None
+
+
+def _enqueue_retry_task(step: str, job_id: str) -> None:
+    """Queue the background task that re-runs ``step`` for ``job_id``."""
+    if step in ("ingestion", "ai_processing"):
+        ingest_and_ai_task.delay(job_id)
+    elif step in ("translation", "tts"):
+        translate_and_synthesize_task.delay(job_id)
+    elif step == "render":
+        from ...tasks.rerender_accessible_video import rerender_accessible_video_task
+        rerender_accessible_video_task.delay(job_id)
+
+
+@router.get("/failures", response_model=list[JobResponse])
+async def list_failures(
+    step: str | None = Query(None, description="Filter by failure.step"),
+    org_id: str | None = Query(None, description="Filter by organization_id"),
+    limit: int = Query(50, ge=1, le=200),
+    skip: int = Query(0, ge=0),
+    current_user: User = Depends(require_roles(UserRole.PRODUCTION, UserRole.ADMIN)),
+    db: AsyncIOMotorDatabase = Depends(get_database),
+):
+    """List all jobs in a failed status, optionally filtered by step and org.
+
+    Results are sorted by ``updated_at`` descending and paged with ``skip``
+    and ``limit``.
+    """
+    query: dict = {"status": {"$in": _FAILURE_STATUSES}}
+    if step:
+        query["failure.step"] = step
+    if org_id:
+        query["organization_id"] = org_id
+
+    cursor = db.jobs.find(query).sort("updated_at", -1).skip(skip).limit(limit)
+    jobs = await cursor.to_list(length=limit)
+
+    # NOTE(review): the dashboard UI reads ``job.failure`` (type, step, message,
+    # retry_count) but no failure data is passed to JobResponse here -- confirm
+    # whether the schema should carry it.
+    return [
+        JobResponse(
+            id=str(j["_id"]),
+            title=j["title"],
+            status=j["status"],
+            source=j["source"],
+            requested_outputs=RequestedOutputs(**j["requested_outputs"]),
+            review=j.get("review", {"notes": "", "history": []}),
+            outputs=j.get("outputs"),
+            created_at=j["created_at"].isoformat(),
+            updated_at=j["updated_at"].isoformat(),
+        )
+        for j in jobs
+    ]
+
+
+@router.post("/bulk-retry", response_model=BulkRetryResponse)
+async def bulk_retry(
+    payload: BulkRetryRequest,
+    current_user: User = Depends(require_roles(UserRole.PRODUCTION, UserRole.ADMIN)),
+    db: AsyncIOMotorDatabase = Depends(get_database),
+):
+    """Retry up to 50 failed jobs in one call.
+
+    Each job is handled independently: it is reset to the status matching the
+    step it restarts from, its ``retry_count`` is incremented, and the task
+    for that step is queued. Jobs that are missing, not in a failed status, or
+    failed at an unrecognised step are reported as skipped; an exception
+    while handling one job is recorded in ``errors`` without aborting the rest.
+
+    Raises:
+        HTTPException: 400 when more than ``_RETRY_CAP`` job ids are supplied.
+    """
+    if len(payload.job_ids) > _RETRY_CAP:
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail=f"Cannot retry more than {_RETRY_CAP} jobs at once",
+        )
+
+    retried: list[str] = []
+    skipped: list[str] = []
+    errors: list[dict] = []
+    now = datetime.utcnow()
+
+    for job_id in payload.job_ids:
+        try:
+            # NOTE(review): ``_id`` is matched against the raw string id; if jobs
+            # are stored with ObjectId keys this never matches -- confirm.
+            job_doc = await db.jobs.find_one({"_id": job_id})
+            if not job_doc or job_doc["status"] not in _FAILURE_STATUSES:
+                skipped.append(job_id)
+                continue
+
+            step = _resolve_retry_step(job_doc, payload.strategy)
+            reset_status = _reset_status_for_step(step, job_doc)
+            if reset_status is None:
+                skipped.append(job_id)
+                continue
+
+            await db.jobs.update_one(
+                {"_id": job_id},
+                {
+                    "$set": {"status": reset_status, "error": None, "updated_at": now},
+                    "$inc": {"retry_count": 1},
+                    "$push": {
+                        "review.history": {
+                            "at": now,
+                            "status": f"bulk_retry_{step}",
+                            "by": str(current_user.id),
+                        }
+                    },
+                },
+            )
+            _enqueue_retry_task(step, job_id)
+
+            retried.append(job_id)
+        except Exception as e:
+            # Best-effort per job: record the failure and keep processing.
+            logger.error(f"bulk-retry failed for job {job_id}: {e}")
+            errors.append({"job_id": job_id, "error": str(e)})
+
+    try:
+        await audit_logger.log(
+            action=AuditAction.JOB_BULK_RETRY,
+            user_id=str(current_user.id),
+            user_email=current_user.email,
+            user_role=current_user.role.value if current_user.role else None,
+            resource_type="job",
+            description=f"Bulk retry {len(retried)} jobs (strategy={payload.strategy})",
+            details={"retried": retried, "skipped": skipped, "error_count": len(errors)},
+        )
+    except Exception as e:
+        # Audit logging must not fail the request after jobs were re-queued.
+        logger.warning(f"Failed to write bulk-retry audit log: {e}")
+
+    return BulkRetryResponse(retried=retried, skipped=skipped, errors=errors)
diff --git a/backend/app/main.py b/backend/app/main.py index b3009ba..b51a744 100644 --- a/backend/app/main.py +++ b/backend/app/main.py @@ -11,6 +11,7 @@ from sentry_sdk.integrations.pymongo import PyMongoIntegration from sentry_sdk.integrations.redis import RedisIntegration from .api.v1.routes_admin import router as admin_router +from .api.v1.routes_admin_production import router as admin_production_router from .api.v1.routes_auth import router as auth_router from .api.v1.routes_clients import router as clients_router 
from .api.v1.routes_files import router as files_router @@ -266,6 +267,7 @@ app.include_router(language_qc_router, prefix="/api/v1") app.include_router(glossaries_router, prefix="/api/v1") app.include_router(tts_router, prefix="/api/v1") app.include_router(admin_router, prefix="/api/v1") +app.include_router(admin_production_router, prefix="/api/v1") app.include_router(share_router, prefix="/api/v1") app.include_router(websockets_router, prefix="/api/v1") diff --git a/frontend/src/App.tsx b/frontend/src/App.tsx index b6586f0..0765b4e 100644 --- a/frontend/src/App.tsx +++ b/frontend/src/App.tsx @@ -22,6 +22,7 @@ import { GlossaryList } from './routes/admin/glossaries/GlossaryList'; import { GlossaryUpload } from './routes/admin/glossaries/GlossaryUpload'; import { GlossaryDetail } from './routes/admin/glossaries/GlossaryDetail'; import { AuditLog } from './routes/admin/AuditLog'; +import { FailuresList } from './routes/admin/FailuresList'; import { LinguistQueue } from './routes/jobs/LinguistQueue'; import { Downloads } from './routes/Downloads'; import { ShareView } from './routes/ShareView'; @@ -182,6 +183,13 @@ function AppContent() { } /> + + + + + + } /> diff --git a/frontend/src/components/Layout/Sidebar.tsx b/frontend/src/components/Layout/Sidebar.tsx index 879c3c9..f303728 100644 --- a/frontend/src/components/Layout/Sidebar.tsx +++ b/frontend/src/components/Layout/Sidebar.tsx @@ -23,6 +23,7 @@ export function Sidebar({ onMobileClose }: SidebarProps) { const isQCRole = ['linguist', 'reviewer', 'production', 'admin'].includes(user?.role || ''); const isPMOrAdmin = ['project_manager', 'admin'].includes(user?.role || ''); + const isAdminOrProduction = ['production', 'admin'].includes(user?.role || ''); const { data: qcData } = useJobs( { status: 'pending_qc', size: 1 }, @@ -32,9 +33,14 @@ export function Sidebar({ onMobileClose }: SidebarProps) { { status: 'pending_final_review', size: 1 }, { enabled: isPMOrAdmin } ); + const { data: failuresData } = useJobs( + { 
status: 'processing_failed,tts_failed,render_failed', size: 1 }, + { enabled: isAdminOrProduction } + ); const qcBadge = isQCRole ? (qcData?.total || 0) : 0; const finalBadge = isPMOrAdmin ? (finalData?.total || 0) : 0; + const failuresBadge = isAdminOrProduction ? (failuresData?.total || 0) : 0; // Determine current org from route params or first membership const currentOrgSlug = @@ -90,6 +96,13 @@ export function Sidebar({ onMobileClose }: SidebarProps) { icon: '🏢', roles: ['admin', 'project_manager'], }, + { + label: 'Failures', + href: '/admin/failures', + icon: '🔥', + roles: ['production', 'admin'], + badge: failuresBadge || undefined, + }, { label: 'Audit Log', href: '/admin/audit-log', diff --git a/frontend/src/hooks/useJob.ts b/frontend/src/hooks/useJob.ts index bf1479c..5157837 100644 --- a/frontend/src/hooks/useJob.ts +++ b/frontend/src/hooks/useJob.ts @@ -333,4 +333,25 @@ export function useBulkReturnToQC() { queryClient.invalidateQueries({ queryKey: ['jobs'] }); }, }); +} + +export function useFailures(filters?: { step?: string; org_id?: string; limit?: number; skip?: number }) { + return useQuery({ + queryKey: ['failures', filters], + queryFn: () => apiClient.listFailures(filters), + refetchInterval: 30_000, + }); +} + +export function useBulkRetry() { + const queryClient = useQueryClient(); + + return useMutation({ + mutationFn: ({ job_ids, strategy }: { job_ids: string[]; strategy?: 'auto' | 'from_scratch' }) => + apiClient.bulkRetry(job_ids, strategy), + onSuccess: () => { + queryClient.invalidateQueries({ queryKey: ['failures'] }); + queryClient.invalidateQueries({ queryKey: ['jobs'] }); + }, + }); } \ No newline at end of file diff --git a/frontend/src/lib/api.ts b/frontend/src/lib/api.ts index 7b93caa..64b5358 100644 --- a/frontend/src/lib/api.ts +++ b/frontend/src/lib/api.ts @@ -362,6 +362,21 @@ class ApiClient { return response.data; } + async listFailures(filters?: { step?: string; org_id?: string; limit?: number; skip?: number }): Promise { + 
const params: Record = {}; + if (filters?.step) params.step = filters.step; + if (filters?.org_id) params.org_id = filters.org_id; + if (filters?.limit != null) params.limit = String(filters.limit); + if (filters?.skip != null) params.skip = String(filters.skip); + const response = await this.client.get('/admin/production/failures', { params }); + return response.data; + } + + async bulkRetry(job_ids: string[], strategy: 'auto' | 'from_scratch' = 'auto') { + const response = await this.client.post('/admin/production/bulk-retry', { job_ids, strategy }); + return response.data as { retried: string[]; skipped: string[]; errors: Array<{ job_id: string; error: string }> }; + } + // User Management endpoints async listUsers(filters?: { page?: number; diff --git a/frontend/src/routes/admin/FailuresList.tsx b/frontend/src/routes/admin/FailuresList.tsx new file mode 100644 index 0000000..630d559 --- /dev/null +++ b/frontend/src/routes/admin/FailuresList.tsx @@ -0,0 +1,204 @@ +import { useState } from 'react'; +import { Link } from 'react-router-dom'; +import { useFailures, useBulkRetry } from '../../hooks/useJob'; +import { StatusBadge } from '../../components/StatusBadge'; +import { useToastContext } from '../../contexts/ToastContext'; + +const STEP_LABELS: Record = { + ingestion: 'Ingestion', + ai_processing: 'AI Processing', + translation: 'Translation', + tts: 'TTS', + render: 'Render', +}; + +export function FailuresList() { + const [stepFilter, setStepFilter] = useState(''); + const [selected, setSelected] = useState>(new Set()); + const [strategy, setStrategy] = useState<'auto' | 'from_scratch'>('auto'); + const toast = useToastContext(); + + const { data: jobs = [], isLoading, error, refetch } = useFailures( + stepFilter ? { step: stepFilter } : undefined + ); + const bulkRetryMutation = useBulkRetry(); + + const toggle = (id: string) => { + const next = new Set(selected); + next.has(id) ? 
next.delete(id) : next.add(id); + setSelected(next); + }; + const selectAll = () => setSelected(new Set(jobs.map(j => j.id))); + const clearSelection = () => setSelected(new Set()); + + const handleBulkRetry = async () => { + if (selected.size === 0) return; + const ids = [...selected]; + try { + const result = await bulkRetryMutation.mutateAsync({ job_ids: ids, strategy }); + toast.toastOnly.success( + `Retried ${result.retried.length} job(s). Skipped: ${result.skipped.length}. Errors: ${result.errors.length}.` + ); + clearSelection(); + refetch(); + } catch { + toast.toastOnly.error('Bulk retry failed'); + } + }; + + // Group by failure type for accordion + const byType = jobs.reduce>((acc, job) => { + const key = job.failure?.type || job.status; + acc[key] = acc[key] ?? []; + acc[key].push(job); + return acc; + }, {}); + + if (isLoading) { + return ( +
+
+
+ {[...Array(5)].map((_, i) =>
)} +
+
+ ); + } + + if (error) { + return ( +
+
+

Failed to load failures dashboard.

+
+
+ ); + } + + return ( +
+
+

Failures Dashboard

+
+ +
+
+ + {jobs.length === 0 ? ( +
+

No failures — all clear!

+
+ ) : ( + <> + {/* Bulk actions bar */} + {selected.size > 0 && ( +
+ {selected.size} selected + + + +
+ )} + +
+ + | + {jobs.length} failure{jobs.length !== 1 ? 's' : ''} total +
+ + {/* Grouped by error type */} +
+ {Object.entries(byType).map(([type, group]) => ( +
+ + {type} + {group.length} + +
+ + + + + + + + + + + + + + + {group.map(job => ( + + + + + + + + + + + ))} + +
TitleStatusStepMessageRetriesUpdated
+ toggle(job.id)} + className="rounded border-gray-300" + /> + + {job.title} + + + + {job.failure?.step ? STEP_LABELS[job.failure.step] ?? job.failure.step : '—'} + + {job.failure?.message || '—'} + 3 ? 'text-red-600' : 'text-gray-600'}`}> + {job.failure?.retry_count ?? 0} + + {new Date(job.updated_at).toLocaleDateString()} + + + View + +
+
+
+ ))} +
+ + )} +
+ ); +}