feat(w-14): bulk failures dashboard + sidebar badge

- GET /admin/production/failures: list failed jobs filtered by step/org
- POST /admin/production/bulk-retry: dispatch retry for up to 50 jobs
  with "auto" (from failure.step) or "from_scratch" strategies
- FailuresList.tsx: accordion-grouped by error type, multi-select,
  bulk retry action, step label, retry count (red >3), updated date
- Sidebar: "Failures" item with live badge for production/admin roles
  (polls useJobs with processing_failed,tts_failed,render_failed)
- New useFailures / useBulkRetry hooks

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-04-29 20:36:30 +01:00
parent 264561895e
commit a945653e73
7 changed files with 432 additions and 0 deletions

View file

@ -0,0 +1,169 @@
"""Admin production endpoints: failure dashboard and bulk retry."""
from datetime import datetime
from typing import Literal

from fastapi import APIRouter, Depends, HTTPException, Query, status
from motor.motor_asyncio import AsyncIOMotorDatabase
from pydantic import BaseModel

from ...core.database import get_database
from ...core.dependencies import require_roles
from ...core.logging import get_logger
from ...models.audit_log import AuditAction
from ...models.job import JobStatus, RequestedOutputs
from ...models.user import User, UserRole
from ...schemas.job import JobResponse
from ...services.audit_logger import audit_logger
from ...tasks.ingest_and_ai import ingest_and_ai_task
from ...tasks.translate_and_synthesize import translate_and_synthesize_task
# Module logger and the router exposing the /admin/production endpoints.
logger = get_logger(__name__)
router = APIRouter(prefix="/admin/production", tags=["admin-production"])

# Raw status strings that mark a job as failed. Kept as a list because it is
# used both in a Mongo ``$in`` filter and for membership checks.
_FAILURE_STATUSES = [
    failed_status.value
    for failed_status in (
        JobStatus.PROCESSING_FAILED,
        JobStatus.TTS_FAILED,
        JobStatus.RENDER_FAILED,
    )
]

# Maximum number of job ids accepted by one bulk-retry request.
_RETRY_CAP = 50
class BulkRetryRequest(BaseModel):
    """Request body for the bulk-retry endpoint."""

    # Ids of the jobs to retry; the endpoint caps the list at ``_RETRY_CAP``.
    job_ids: list[str]
    # "auto" restarts each job from its recorded failure step, "from_scratch"
    # restarts from ingestion. Constrained to the two known values so that a
    # typo is rejected at validation time instead of silently acting as "auto".
    strategy: Literal["auto", "from_scratch"] = "auto"
class BulkRetryResponse(BaseModel):
    """Per-job outcome of a bulk-retry call."""

    # Ids whose status was reset and whose retry task was dispatched.
    retried: list[str]

    # Ids that were not retried: job not found, not in a failed status, or
    # failed at a step the endpoint does not know how to restart.
    skipped: list[str]

    # One ``{"job_id": ..., "error": ...}`` entry per job that raised.
    errors: list[dict]
@router.get("/failures", response_model=list[JobResponse])
async def list_failures(
    step: str | None = Query(None, description="Filter by failure.step"),
    org_id: str | None = Query(None, description="Filter by organization_id"),
    limit: int = Query(50, ge=1, le=200),
    skip: int = Query(0, ge=0),
    current_user: User = Depends(require_roles(UserRole.PRODUCTION, UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """List jobs currently in a failed status, newest update first.

    Args:
        step: When given, only jobs whose ``failure.step`` equals this value.
        org_id: When given, only jobs whose ``organization_id`` equals this value.
        limit: Page size (1-200).
        skip: Number of matching jobs to skip before the page starts.
        current_user: Caller; must hold the production or admin role.
        db: Database handle.

    Returns:
        One ``JobResponse`` per matching job document.
    """
    query: dict = {"status": {"$in": _FAILURE_STATUSES}}
    if step:
        query["failure.step"] = step
    if org_id:
        query["organization_id"] = org_id

    # ``_id`` is a tiebreaker: sorting on ``updated_at`` alone leaves the order
    # of equal timestamps undefined, so skip/limit pages could repeat or drop
    # jobs that were updated in the same instant (e.g. by a bulk retry).
    cursor = (
        db.jobs.find(query)
        .sort([("updated_at", -1), ("_id", -1)])
        .skip(skip)
        .limit(limit)
    )
    jobs = await cursor.to_list(length=limit)

    # NOTE(review): no ``failure`` data is passed to JobResponse here, while the
    # failures dashboard reads ``job.failure.type/step/message/retry_count``.
    # Confirm whether JobResponse exposes those fields some other way.
    return [
        JobResponse(
            id=str(j["_id"]),
            title=j["title"],
            status=j["status"],
            source=j["source"],
            requested_outputs=RequestedOutputs(**j["requested_outputs"]),
            review=j.get("review", {"notes": "", "history": []}),
            outputs=j.get("outputs"),
            created_at=j["created_at"].isoformat(),
            updated_at=j["updated_at"].isoformat(),
        )
        for j in jobs
    ]
def _resolve_retry_step(job_doc: dict, strategy: str) -> str:
    """Return the pipeline step a retry of ``job_doc`` should restart from."""
    if strategy == "from_scratch":
        return "ingestion"

    recorded_step = (job_doc.get("failure") or {}).get("step")
    if recorded_step:
        return recorded_step

    # No recorded failure step: fall back to the stage implied by the failed
    # status. A processing failure must restart processing, not be re-rendered.
    failed_status = job_doc["status"]
    if failed_status == JobStatus.PROCESSING_FAILED.value:
        return "ingestion"
    if failed_status == JobStatus.TTS_FAILED.value:
        return "tts"
    return "render"


def _reset_status_for_step(step: str, job_doc: dict) -> str | None:
    """Return the status a job is reset to before retrying ``step``, or None if the step is unknown."""
    if step in ("ingestion", "ai_processing"):
        return JobStatus.CREATED.value
    if step == "translation":
        return JobStatus.AI_PROCESSING.value
    if step == "tts":
        source_language = job_doc["source"].get("language", "en")
        if source_language == "en":
            return JobStatus.APPROVED_ENGLISH.value
        return JobStatus.APPROVED_SOURCE.value
    if step == "render":
        return JobStatus.PENDING_QC.value
    return None


def _dispatch_retry_task(step: str, job_id: str) -> None:
    """Queue the background task that re-runs ``step`` for ``job_id``."""
    if step in ("ingestion", "ai_processing"):
        ingest_and_ai_task.delay(job_id)
    elif step in ("translation", "tts"):
        translate_and_synthesize_task.delay(job_id)
    elif step == "render":
        # Kept as a function-scope import, as in the original code
        # (presumably to avoid an import cycle - confirm before hoisting).
        from ...tasks.rerender_accessible_video import rerender_accessible_video_task

        rerender_accessible_video_task.delay(job_id)


@router.post("/bulk-retry", response_model=BulkRetryResponse)
async def bulk_retry(
    payload: BulkRetryRequest,
    current_user: User = Depends(require_roles(UserRole.PRODUCTION, UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Retry up to 50 failed jobs in one call.

    For each requested job this resolves the step to restart from, resets the
    job status, records the retry in ``review.history`` and dispatches the
    matching background task. Failures are isolated per job.

    Args:
        payload: Job ids and retry strategy ("auto" or "from_scratch").
        current_user: Caller; must hold the production or admin role.
        db: Database handle.

    Returns:
        BulkRetryResponse listing retried ids, skipped ids and per-job errors.

    Raises:
        HTTPException: 400 when more than ``_RETRY_CAP`` ids are submitted.
    """
    if len(payload.job_ids) > _RETRY_CAP:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Cannot retry more than {_RETRY_CAP} jobs at once",
        )

    retried: list[str] = []
    skipped: list[str] = []
    errors: list[dict] = []
    now = datetime.utcnow()

    # De-duplicate while keeping request order, so a repeated id is handled
    # once instead of being reported as both retried and skipped.
    for job_id in dict.fromkeys(payload.job_ids):
        try:
            # NOTE(review): the lookup uses the raw string id; this assumes job
            # ``_id`` values are stored as strings - confirm against the schema.
            job_doc = await db.jobs.find_one({"_id": job_id})
            if not job_doc or job_doc["status"] not in _FAILURE_STATUSES:
                skipped.append(job_id)
                continue

            step = _resolve_retry_step(job_doc, payload.strategy)
            reset_status = _reset_status_for_step(step, job_doc)
            if reset_status is None:
                skipped.append(job_id)
                continue

            # Filtering on the status we just observed makes the reset an
            # atomic claim: if another request retried the job in between, the
            # update matches nothing and no second task is dispatched.
            # NOTE(review): this increments top-level ``retry_count`` while the
            # dashboard displays ``failure.retry_count`` - confirm which is intended.
            result = await db.jobs.update_one(
                {"_id": job_id, "status": job_doc["status"]},
                {
                    "$set": {"status": reset_status, "error": None, "updated_at": now},
                    "$inc": {"retry_count": 1},
                    "$push": {
                        "review.history": {
                            "at": now,
                            "status": f"bulk_retry_{step}",
                            "by": str(current_user.id),
                        }
                    },
                },
            )
            if result.matched_count == 0:
                skipped.append(job_id)
                continue

            # If dispatch raises, the job has already been reset; the failure
            # is reported in ``errors`` for this job id.
            _dispatch_retry_task(step, job_id)
            retried.append(job_id)
        except Exception as e:
            logger.error(f"bulk-retry failed for job {job_id}: {e}")
            errors.append({"job_id": job_id, "error": str(e)})

    # Auditing is best-effort: a logging failure must not fail the request.
    try:
        await audit_logger.log(
            action=AuditAction.JOB_BULK_RETRY,
            user_id=str(current_user.id),
            user_email=current_user.email,
            user_role=current_user.role.value if current_user.role else None,
            resource_type="job",
            description=f"Bulk retry {len(retried)} jobs (strategy={payload.strategy})",
            details={"retried": retried, "skipped": skipped, "error_count": len(errors)},
        )
    except Exception as e:
        logger.warning(f"Failed to write bulk-retry audit log: {e}")

    return BulkRetryResponse(retried=retried, skipped=skipped, errors=errors)

View file

@ -11,6 +11,7 @@ from sentry_sdk.integrations.pymongo import PyMongoIntegration
from sentry_sdk.integrations.redis import RedisIntegration
from .api.v1.routes_admin import router as admin_router
from .api.v1.routes_admin_production import router as admin_production_router
from .api.v1.routes_auth import router as auth_router
from .api.v1.routes_clients import router as clients_router
from .api.v1.routes_files import router as files_router
@ -266,6 +267,7 @@ app.include_router(language_qc_router, prefix="/api/v1")
app.include_router(glossaries_router, prefix="/api/v1")
app.include_router(tts_router, prefix="/api/v1")
app.include_router(admin_router, prefix="/api/v1")
app.include_router(admin_production_router, prefix="/api/v1")
app.include_router(share_router, prefix="/api/v1")
app.include_router(websockets_router, prefix="/api/v1")

View file

@ -22,6 +22,7 @@ import { GlossaryList } from './routes/admin/glossaries/GlossaryList';
import { GlossaryUpload } from './routes/admin/glossaries/GlossaryUpload';
import { GlossaryDetail } from './routes/admin/glossaries/GlossaryDetail';
import { AuditLog } from './routes/admin/AuditLog';
import { FailuresList } from './routes/admin/FailuresList';
import { LinguistQueue } from './routes/jobs/LinguistQueue';
import { Downloads } from './routes/Downloads';
import { ShareView } from './routes/ShareView';
@ -182,6 +183,13 @@ function AppContent() {
</RoleGate>
</AuthenticatedRoute>
} />
<Route path="/admin/failures" element={
<AuthenticatedRoute>
<RoleGate allowedRoles={['production', 'admin']}>
<FailuresList />
</RoleGate>
</AuthenticatedRoute>
} />
<Route path="/qc/queue" element={
<AuthenticatedRoute>
<RoleGate allowedRoles={['linguist', 'reviewer', 'production', 'admin']}>

View file

@ -23,6 +23,7 @@ export function Sidebar({ onMobileClose }: SidebarProps) {
const isQCRole = ['linguist', 'reviewer', 'production', 'admin'].includes(user?.role || '');
const isPMOrAdmin = ['project_manager', 'admin'].includes(user?.role || '');
const isAdminOrProduction = ['production', 'admin'].includes(user?.role || '');
const { data: qcData } = useJobs(
{ status: 'pending_qc', size: 1 },
@ -32,9 +33,14 @@ export function Sidebar({ onMobileClose }: SidebarProps) {
{ status: 'pending_final_review', size: 1 },
{ enabled: isPMOrAdmin }
);
const { data: failuresData } = useJobs(
{ status: 'processing_failed,tts_failed,render_failed', size: 1 },
{ enabled: isAdminOrProduction }
);
const qcBadge = isQCRole ? (qcData?.total || 0) : 0;
const finalBadge = isPMOrAdmin ? (finalData?.total || 0) : 0;
const failuresBadge = isAdminOrProduction ? (failuresData?.total || 0) : 0;
// Determine current org from route params or first membership
const currentOrgSlug =
@ -90,6 +96,13 @@ export function Sidebar({ onMobileClose }: SidebarProps) {
icon: '🏢',
roles: ['admin', 'project_manager'],
},
{
label: 'Failures',
href: '/admin/failures',
icon: '🔥',
roles: ['production', 'admin'],
badge: failuresBadge || undefined,
},
{
label: 'Audit Log',
href: '/admin/audit-log',

View file

@ -333,4 +333,25 @@ export function useBulkReturnToQC() {
queryClient.invalidateQueries({ queryKey: ['jobs'] });
},
});
}
/**
 * Query hook for the failures dashboard.
 *
 * Fetches failed jobs through `apiClient.listFailures`, caches them under the
 * `['failures', filters]` key and re-polls every 30 seconds.
 */
export function useFailures(filters?: { step?: string; org_id?: string; limit?: number; skip?: number }) {
  const fetchFailures = () => apiClient.listFailures(filters);

  return useQuery({
    queryKey: ['failures', filters],
    queryFn: fetchFailures,
    refetchInterval: 30_000,
  });
}
/**
 * Mutation hook that submits a bulk retry and, on success, invalidates the
 * `['failures']` and `['jobs']` queries so dependent views reload.
 */
export function useBulkRetry() {
  const queryClient = useQueryClient();

  const invalidateJobViews = () => {
    queryClient.invalidateQueries({ queryKey: ['failures'] });
    queryClient.invalidateQueries({ queryKey: ['jobs'] });
  };

  return useMutation({
    mutationFn: (request: { job_ids: string[]; strategy?: 'auto' | 'from_scratch' }) =>
      apiClient.bulkRetry(request.job_ids, request.strategy),
    onSuccess: () => {
      invalidateJobViews();
    },
  });
}

View file

@ -362,6 +362,21 @@ class ApiClient {
return response.data;
}
/**
 * Fetch failed jobs from GET /admin/production/failures.
 *
 * `step` and `org_id` are sent only when non-empty; `limit` and `skip` are
 * sent whenever they are not null/undefined (so 0 is forwarded).
 */
async listFailures(filters?: { step?: string; org_id?: string; limit?: number; skip?: number }): Promise<Job[]> {
  const { step, org_id, limit, skip } = filters ?? {};
  const params: Record<string, string> = {};

  if (step) {
    params.step = step;
  }
  if (org_id) {
    params.org_id = org_id;
  }
  if (limit != null) {
    params.limit = String(limit);
  }
  if (skip != null) {
    params.skip = String(skip);
  }

  const { data } = await this.client.get('/admin/production/failures', { params });
  return data;
}
/**
 * POST the given job ids and strategy to /admin/production/bulk-retry and
 * return the per-job outcome lists from the response body.
 */
async bulkRetry(job_ids: string[], strategy: 'auto' | 'from_scratch' = 'auto') {
  const body = { job_ids, strategy };
  const { data } = await this.client.post('/admin/production/bulk-retry', body);
  return data as { retried: string[]; skipped: string[]; errors: Array<{ job_id: string; error: string }> };
}
// User Management endpoints
async listUsers(filters?: {
page?: number;

View file

@ -0,0 +1,204 @@
import { useState } from 'react';
import { Link } from 'react-router-dom';
import { useFailures, useBulkRetry } from '../../hooks/useJob';
import { StatusBadge } from '../../components/StatusBadge';
import { useToastContext } from '../../contexts/ToastContext';
// Display labels for pipeline failure steps. Key order matters: the step
// filter renders its options by iterating this object's entries.
const STEP_LABELS: Record<string, string> = {
  ingestion: 'Ingestion',
  ai_processing: 'AI Processing',
  translation: 'Translation',
  tts: 'TTS',
  render: 'Render',
};
/**
 * Failures dashboard: lists failed jobs grouped by failure type, lets the
 * user filter by step, multi-select rows and bulk-retry the selection.
 */
export function FailuresList() {
  const [stepFilter, setStepFilter] = useState('');
  const [selected, setSelected] = useState<Set<string>>(new Set());
  const [strategy, setStrategy] = useState<'auto' | 'from_scratch'>('auto');
  const toast = useToastContext();

  const { data: jobs = [], isLoading, error, refetch } = useFailures(
    stepFilter ? { step: stepFilter } : undefined
  );
  const bulkRetryMutation = useBulkRetry();

  // Only ids that are still in the visible list count as selected. Without
  // this, rows hidden by a filter change or dropped by a refetch would stay
  // in the selection and be retried without the user seeing them.
  const selectedIds = jobs.filter(j => selected.has(j.id)).map(j => j.id);

  const toggle = (id: string) => {
    // Functional update so rapid toggles never start from a stale set.
    setSelected(prev => {
      const next = new Set(prev);
      if (next.has(id)) {
        next.delete(id);
      } else {
        next.add(id);
      }
      return next;
    });
  };

  const selectAll = () => setSelected(new Set(jobs.map(j => j.id)));
  const clearSelection = () => setSelected(new Set());

  const handleBulkRetry = async () => {
    if (selectedIds.length === 0) return;
    try {
      const result = await bulkRetryMutation.mutateAsync({ job_ids: selectedIds, strategy });
      toast.toastOnly.success(
        `Retried ${result.retried.length} job(s). Skipped: ${result.skipped.length}. Errors: ${result.errors.length}.`
      );
      clearSelection();
      refetch();
    } catch {
      toast.toastOnly.error('Bulk retry failed');
    }
  };

  // Group by failure type for accordion; jobs without a failure type are
  // grouped under their status instead.
  const byType = jobs.reduce<Record<string, typeof jobs>>((acc, job) => {
    const key = job.failure?.type || job.status;
    acc[key] = acc[key] ?? [];
    acc[key].push(job);
    return acc;
  }, {});

  if (isLoading) {
    return (
      <div className="container mx-auto px-4 py-8">
        <div className="animate-pulse space-y-4">
          <div className="h-8 bg-gray-200 rounded w-1/4"></div>
          {[...Array(5)].map((_, i) => <div key={i} className="h-14 bg-gray-200 rounded"></div>)}
        </div>
      </div>
    );
  }

  if (error) {
    return (
      <div className="container mx-auto px-4 py-8">
        <div className="bg-red-50 border border-red-200 rounded-md p-4">
          <p className="text-red-600">Failed to load failures dashboard.</p>
        </div>
      </div>
    );
  }

  return (
    <div className="container mx-auto px-4 py-8">
      <div className="flex items-center justify-between mb-6">
        <h1 className="text-2xl font-bold text-gray-900">Failures Dashboard</h1>
        <div className="flex items-center gap-3">
          <select
            value={stepFilter}
            onChange={e => setStepFilter(e.target.value)}
            className="border border-gray-300 rounded px-3 py-1.5 text-sm"
          >
            <option value="">All steps</option>
            {Object.entries(STEP_LABELS).map(([k, v]) => (
              <option key={k} value={k}>{v}</option>
            ))}
          </select>
        </div>
      </div>
      {jobs.length === 0 ? (
        <div className="bg-green-50 border border-green-200 rounded-md p-6 text-center">
          <p className="text-green-700 font-medium">No failures all clear!</p>
        </div>
      ) : (
        <>
          {/* Bulk actions bar */}
          {selectedIds.length > 0 && (
            <div className="flex items-center gap-3 bg-orange-50 border border-orange-200 rounded-lg px-4 py-3 mb-4">
              <span className="text-sm font-medium text-orange-800">{selectedIds.length} selected</span>
              <select
                value={strategy}
                onChange={e => setStrategy(e.target.value as 'auto' | 'from_scratch')}
                className="border border-orange-300 rounded px-2 py-1 text-sm"
              >
                <option value="auto">Auto (from failure step)</option>
                <option value="from_scratch">From scratch (re-ingest)</option>
              </select>
              <button
                onClick={handleBulkRetry}
                disabled={bulkRetryMutation.isPending}
                className="px-4 py-1.5 bg-orange-600 text-white text-sm rounded hover:bg-orange-700 disabled:opacity-50"
              >
                {bulkRetryMutation.isPending ? 'Retrying...' : 'Retry selected'}
              </button>
              <button onClick={clearSelection} className="text-sm text-gray-500 hover:text-gray-700">
                Clear
              </button>
            </div>
          )}
          <div className="mb-3 flex gap-3 text-sm text-gray-500">
            <button onClick={selectAll} className="hover:text-gray-800">Select all</button>
            <span>|</span>
            <span>{jobs.length} failure{jobs.length !== 1 ? 's' : ''} total</span>
          </div>
          {/* Grouped by error type */}
          <div className="space-y-4">
            {Object.entries(byType).map(([type, group]) => (
              <details key={type} className="bg-white border border-gray-200 rounded-lg" open>
                <summary className="flex items-center justify-between px-4 py-3 cursor-pointer select-none">
                  <span className="font-medium text-gray-800 text-sm">{type}</span>
                  <span className="text-xs text-gray-500 bg-gray-100 rounded-full px-2 py-0.5">{group.length}</span>
                </summary>
                <div className="border-t border-gray-100">
                  <table className="w-full text-sm">
                    <thead className="bg-gray-50 text-xs text-gray-500 uppercase">
                      <tr>
                        <th className="w-8 px-3 py-2"></th>
                        <th className="px-3 py-2 text-left">Title</th>
                        <th className="px-3 py-2 text-left">Status</th>
                        <th className="px-3 py-2 text-left">Step</th>
                        <th className="px-3 py-2 text-left max-w-xs">Message</th>
                        <th className="px-3 py-2 text-left">Retries</th>
                        <th className="px-3 py-2 text-left">Updated</th>
                        <th className="px-3 py-2"></th>
                      </tr>
                    </thead>
                    <tbody className="divide-y divide-gray-100">
                      {group.map(job => (
                        <tr key={job.id} className={`hover:bg-gray-50 ${selected.has(job.id) ? 'bg-orange-50' : ''}`}>
                          <td className="px-3 py-2">
                            <input
                              type="checkbox"
                              checked={selected.has(job.id)}
                              onChange={() => toggle(job.id)}
                              aria-label={`Select ${job.title}`}
                              className="rounded border-gray-300"
                            />
                          </td>
                          <td className="px-3 py-2 font-medium text-gray-900 max-w-xs truncate">
                            {job.title}
                          </td>
                          <td className="px-3 py-2">
                            <StatusBadge status={job.status} />
                          </td>
                          <td className="px-3 py-2 text-gray-600">
                            {job.failure?.step ? STEP_LABELS[job.failure.step] ?? job.failure.step : '—'}
                          </td>
                          <td className="px-3 py-2 text-gray-500 max-w-xs">
                            <span className="line-clamp-2 text-xs">{job.failure?.message || '—'}</span>
                          </td>
                          {/* NOTE(review): reads failure.retry_count; confirm the API populates it on this path. */}
                          <td className={`px-3 py-2 font-medium ${(job.failure?.retry_count ?? 0) > 3 ? 'text-red-600' : 'text-gray-600'}`}>
                            {job.failure?.retry_count ?? 0}
                          </td>
                          <td className="px-3 py-2 text-gray-400 whitespace-nowrap text-xs">
                            {new Date(job.updated_at).toLocaleDateString()}
                          </td>
                          <td className="px-3 py-2">
                            <Link
                              to={`/jobs/${job.id}`}
                              className="text-indigo-600 hover:text-indigo-800 text-xs"
                            >
                              View
                            </Link>
                          </td>
                        </tr>
                      ))}
                    </tbody>
                  </table>
                </div>
              </details>
            ))}
          </div>
        </>
      )}
    </div>
  );
}