feat(w-14): bulk failures dashboard + sidebar badge
- GET /admin/production/failures: list failed jobs filtered by step/org
- POST /admin/production/bulk-retry: dispatch retry for up to 50 jobs with "auto" (from failure.step) or "from_scratch" strategies
- FailuresList.tsx: accordion-grouped by error type, multi-select, bulk retry action, step label, retry count (red >3), updated date
- Sidebar: "Failures" item with live badge for production/admin roles (polls useJobs with processing_failed,tts_failed,render_failed)
- New useFailures / useBulkRetry hooks

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
parent
264561895e
commit
a945653e73
7 changed files with 432 additions and 0 deletions
169
backend/app/api/v1/routes_admin_production.py
Normal file
169
backend/app/api/v1/routes_admin_production.py
Normal file
|
|
@ -0,0 +1,169 @@
|
|||
"""Admin production endpoints: failure dashboard and bulk retry."""
|
||||
from datetime import datetime
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, status
|
||||
from motor.motor_asyncio import AsyncIOMotorDatabase
|
||||
from pydantic import BaseModel
|
||||
|
||||
from ...core.database import get_database
|
||||
from ...core.dependencies import require_roles
|
||||
from ...core.logging import get_logger
|
||||
from ...models.audit_log import AuditAction
|
||||
from ...models.job import JobStatus, RequestedOutputs
|
||||
from ...models.user import User, UserRole
|
||||
from ...schemas.job import JobResponse
|
||||
from ...services.audit_logger import audit_logger
|
||||
from ...tasks.ingest_and_ai import ingest_and_ai_task
|
||||
from ...tasks.translate_and_synthesize import translate_and_synthesize_task
|
||||
|
||||
logger = get_logger(__name__)
|
||||
router = APIRouter(prefix="/admin/production", tags=["admin-production"])
|
||||
|
||||
# Status values that mark a job as failed. Both endpoints in this module use
# this list: the dashboard to select jobs, bulk retry to decide eligibility.
_FAILURE_STATUSES = [
    failed_status.value
    for failed_status in (
        JobStatus.PROCESSING_FAILED,
        JobStatus.TTS_FAILED,
        JobStatus.RENDER_FAILED,
    )
]

# Upper bound on the number of job ids accepted by one bulk-retry request.
_RETRY_CAP = 50
|
||||
|
||||
|
||||
class BulkRetryRequest(BaseModel):
    """Request body for POST /admin/production/bulk-retry."""

    # Ids of the jobs to retry; the endpoint rejects more than _RETRY_CAP ids.
    job_ids: list[str]

    # Retry strategy: "auto" restarts from the step recorded in the job's
    # failure data, "from_scratch" restarts at ingestion.
    strategy: str = "auto"
|
||||
|
||||
|
||||
class BulkRetryResponse(BaseModel):
    """Per-job outcome of a bulk retry call."""

    # Ids whose status was reset and whose pipeline task was dispatched.
    retried: list[str]

    # Ids that were not retried: not found, not in a failed status, or with a
    # failure step the endpoint does not know how to restart.
    skipped: list[str]

    # One {"job_id": ..., "error": ...} entry per id that raised while retrying.
    errors: list[dict]
|
||||
|
||||
|
||||
@router.get("/failures", response_model=list[JobResponse])
async def list_failures(
    step: str | None = Query(None, description="Filter by failure.step"),
    org_id: str | None = Query(None, description="Filter by organization_id"),
    limit: int = Query(50, ge=1, le=200),
    skip: int = Query(0, ge=0),
    current_user: User = Depends(require_roles(UserRole.PRODUCTION, UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """List jobs in a failed status, most recently updated first.

    Args:
        step: When given, only jobs whose ``failure.step`` equals this value.
        org_id: When given, only jobs whose ``organization_id`` equals this value.
        limit: Page size (1-200, default 50).
        skip: Number of matching jobs to skip before the page starts.
        current_user: Resolved by ``require_roles``; restricts the endpoint to
            the production and admin roles.
        db: Database handle used to query the ``jobs`` collection.

    Returns:
        One ``JobResponse`` per matching job.
    """
    query: dict = {"status": {"$in": _FAILURE_STATUSES}}
    if step:
        query["failure.step"] = step
    if org_id:
        query["organization_id"] = org_id

    # updated_at alone is not a unique sort key (bulk retry stamps every job in
    # a request with the same timestamp), so skip/limit paging could repeat or
    # drop rows between pages. _id is added as a tie-breaker to make the order
    # total and the pages stable.
    cursor = (
        db.jobs.find(query)
        .sort([("updated_at", -1), ("_id", -1)])
        .skip(skip)
        .limit(limit)
    )
    jobs = await cursor.to_list(length=limit)

    # NOTE(review): the job's `failure` sub-document and `retry_count` are not
    # forwarded here, while the dashboard reads `job.failure`. Confirm whether
    # JobResponse exposes those fields.
    return [
        JobResponse(
            id=str(j["_id"]),
            title=j["title"],
            status=j["status"],
            source=j["source"],
            requested_outputs=RequestedOutputs(**j["requested_outputs"]),
            review=j.get("review", {"notes": "", "history": []}),
            outputs=j.get("outputs"),
            created_at=j["created_at"].isoformat(),
            updated_at=j["updated_at"].isoformat(),
        )
        for j in jobs
    ]
|
||||
|
||||
|
||||
def _resolve_retry_step(job_doc: dict, strategy: str) -> str | None:
    """Pick the pipeline step a retry of ``job_doc`` should restart from."""
    if strategy == "from_scratch":
        return "ingestion"

    recorded_step = (job_doc.get("failure") or {}).get("step")
    if recorded_step:
        return recorded_step

    # No step was recorded, so infer one from the failed status. A job that
    # failed during processing restarts from ingestion; it must not fall
    # through to "render", which would push an unprocessed job to QC.
    # NOTE(review): ingestion is the conservative choice for PROCESSING_FAILED;
    # confirm it is the desired restart point.
    status_to_step = {
        JobStatus.PROCESSING_FAILED.value: "ingestion",
        JobStatus.TTS_FAILED.value: "tts",
        JobStatus.RENDER_FAILED.value: "render",
    }
    return status_to_step.get(job_doc["status"])


def _reset_status_for_step(step: str, job_doc: dict) -> str | None:
    """Return the status a job is put back into before re-running ``step``.

    Returns None when ``step`` is not one this endpoint knows how to restart.
    """
    if step in ("ingestion", "ai_processing"):
        return JobStatus.CREATED.value
    if step == "translation":
        return JobStatus.AI_PROCESSING.value
    if step == "tts":
        # The approved status depends on whether the source is English.
        src = job_doc["source"].get("language", "en")
        if src == "en":
            return JobStatus.APPROVED_ENGLISH.value
        return JobStatus.APPROVED_SOURCE.value
    if step == "render":
        return JobStatus.PENDING_QC.value
    return None


@router.post("/bulk-retry", response_model=BulkRetryResponse)
async def bulk_retry(
    payload: BulkRetryRequest,
    current_user: User = Depends(require_roles(UserRole.PRODUCTION, UserRole.ADMIN)),
    db: AsyncIOMotorDatabase = Depends(get_database),
):
    """Retry up to 50 failed jobs in one call.

    For each job id the job is looked up, its restart step is resolved, its
    status is reset, a ``review.history`` entry is appended, ``retry_count`` is
    incremented and the matching pipeline task is dispatched. Failures on one
    job are collected and do not stop the remaining jobs.

    Args:
        payload: Job ids plus the retry strategy ("auto" or "from_scratch").
        current_user: Resolved by ``require_roles``; restricts the endpoint to
            the production and admin roles.
        db: Database handle used to read and update the ``jobs`` collection.

    Returns:
        ``BulkRetryResponse`` with the retried ids, skipped ids and per-job errors.

    Raises:
        HTTPException: 400 when more than ``_RETRY_CAP`` ids are sent or the
            strategy is not recognised.
    """
    if len(payload.job_ids) > _RETRY_CAP:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Cannot retry more than {_RETRY_CAP} jobs at once",
        )
    # Reject unknown strategies instead of silently treating them as "auto";
    # a typo such as "from-scratch" would otherwise run the wrong retry.
    if payload.strategy not in ("auto", "from_scratch"):
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Unknown retry strategy: {payload.strategy!r}",
        )

    retried: list[str] = []
    skipped: list[str] = []
    errors: list[dict] = []
    now = datetime.utcnow()

    for job_id in payload.job_ids:
        try:
            # NOTE(review): the id is matched as a plain string, while
            # list_failures returns str(_id). If _id is stored as an ObjectId
            # this lookup never matches; confirm the id type.
            job_doc = await db.jobs.find_one({"_id": job_id})
            if not job_doc or job_doc["status"] not in _FAILURE_STATUSES:
                skipped.append(job_id)
                continue

            step = _resolve_retry_step(job_doc, payload.strategy)
            reset_status = _reset_status_for_step(step, job_doc) if step else None
            if reset_status is None:
                skipped.append(job_id)
                continue

            # The status guard makes the reset conditional: if a concurrent
            # request already retried this job, nothing matches and the task
            # is not dispatched a second time.
            result = await db.jobs.update_one(
                {"_id": job_id, "status": {"$in": _FAILURE_STATUSES}},
                {
                    "$set": {"status": reset_status, "error": None, "updated_at": now},
                    "$inc": {"retry_count": 1},
                    "$push": {
                        "review.history": {
                            "at": now,
                            "status": f"bulk_retry_{step}",
                            "by": str(current_user.id),
                        }
                    },
                },
            )
            if result.matched_count == 0:
                skipped.append(job_id)
                continue

            if step in ("ingestion", "ai_processing"):
                ingest_and_ai_task.delay(job_id)
            elif step in ("translation", "tts"):
                translate_and_synthesize_task.delay(job_id)
            elif step == "render":
                from ...tasks.rerender_accessible_video import rerender_accessible_video_task

                rerender_accessible_video_task.delay(job_id)

            retried.append(job_id)
        except Exception as e:
            # Best effort per job: record the failure and keep going.
            logger.error(f"bulk-retry failed for job {job_id}: {e}")
            errors.append({"job_id": job_id, "error": str(e)})

    # An audit-log failure must not fail a retry that already happened.
    try:
        await audit_logger.log(
            action=AuditAction.JOB_BULK_RETRY,
            user_id=str(current_user.id),
            user_email=current_user.email,
            user_role=current_user.role.value if current_user.role else None,
            resource_type="job",
            description=f"Bulk retry {len(retried)} jobs (strategy={payload.strategy})",
            details={"retried": retried, "skipped": skipped, "error_count": len(errors)},
        )
    except Exception as e:
        logger.warning(f"Failed to write bulk-retry audit log: {e}")

    return BulkRetryResponse(retried=retried, skipped=skipped, errors=errors)
|
||||
|
|
@ -11,6 +11,7 @@ from sentry_sdk.integrations.pymongo import PyMongoIntegration
|
|||
from sentry_sdk.integrations.redis import RedisIntegration
|
||||
|
||||
from .api.v1.routes_admin import router as admin_router
|
||||
from .api.v1.routes_admin_production import router as admin_production_router
|
||||
from .api.v1.routes_auth import router as auth_router
|
||||
from .api.v1.routes_clients import router as clients_router
|
||||
from .api.v1.routes_files import router as files_router
|
||||
|
|
@ -266,6 +267,7 @@ app.include_router(language_qc_router, prefix="/api/v1")
|
|||
app.include_router(glossaries_router, prefix="/api/v1")
|
||||
app.include_router(tts_router, prefix="/api/v1")
|
||||
app.include_router(admin_router, prefix="/api/v1")
|
||||
app.include_router(admin_production_router, prefix="/api/v1")
|
||||
app.include_router(share_router, prefix="/api/v1")
|
||||
app.include_router(websockets_router, prefix="/api/v1")
|
||||
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ import { GlossaryList } from './routes/admin/glossaries/GlossaryList';
|
|||
import { GlossaryUpload } from './routes/admin/glossaries/GlossaryUpload';
|
||||
import { GlossaryDetail } from './routes/admin/glossaries/GlossaryDetail';
|
||||
import { AuditLog } from './routes/admin/AuditLog';
|
||||
import { FailuresList } from './routes/admin/FailuresList';
|
||||
import { LinguistQueue } from './routes/jobs/LinguistQueue';
|
||||
import { Downloads } from './routes/Downloads';
|
||||
import { ShareView } from './routes/ShareView';
|
||||
|
|
@ -182,6 +183,13 @@ function AppContent() {
|
|||
</RoleGate>
|
||||
</AuthenticatedRoute>
|
||||
} />
|
||||
<Route path="/admin/failures" element={
|
||||
<AuthenticatedRoute>
|
||||
<RoleGate allowedRoles={['production', 'admin']}>
|
||||
<FailuresList />
|
||||
</RoleGate>
|
||||
</AuthenticatedRoute>
|
||||
} />
|
||||
<Route path="/qc/queue" element={
|
||||
<AuthenticatedRoute>
|
||||
<RoleGate allowedRoles={['linguist', 'reviewer', 'production', 'admin']}>
|
||||
|
|
|
|||
|
|
@ -23,6 +23,7 @@ export function Sidebar({ onMobileClose }: SidebarProps) {
|
|||
|
||||
const isQCRole = ['linguist', 'reviewer', 'production', 'admin'].includes(user?.role || '');
|
||||
const isPMOrAdmin = ['project_manager', 'admin'].includes(user?.role || '');
|
||||
const isAdminOrProduction = ['production', 'admin'].includes(user?.role || '');
|
||||
|
||||
const { data: qcData } = useJobs(
|
||||
{ status: 'pending_qc', size: 1 },
|
||||
|
|
@ -32,9 +33,14 @@ export function Sidebar({ onMobileClose }: SidebarProps) {
|
|||
{ status: 'pending_final_review', size: 1 },
|
||||
{ enabled: isPMOrAdmin }
|
||||
);
|
||||
const { data: failuresData } = useJobs(
|
||||
{ status: 'processing_failed,tts_failed,render_failed', size: 1 },
|
||||
{ enabled: isAdminOrProduction }
|
||||
);
|
||||
|
||||
const qcBadge = isQCRole ? (qcData?.total || 0) : 0;
|
||||
const finalBadge = isPMOrAdmin ? (finalData?.total || 0) : 0;
|
||||
const failuresBadge = isAdminOrProduction ? (failuresData?.total || 0) : 0;
|
||||
|
||||
// Determine current org from route params or first membership
|
||||
const currentOrgSlug =
|
||||
|
|
@ -90,6 +96,13 @@ export function Sidebar({ onMobileClose }: SidebarProps) {
|
|||
icon: '🏢',
|
||||
roles: ['admin', 'project_manager'],
|
||||
},
|
||||
{
|
||||
label: 'Failures',
|
||||
href: '/admin/failures',
|
||||
icon: '🔥',
|
||||
roles: ['production', 'admin'],
|
||||
badge: failuresBadge || undefined,
|
||||
},
|
||||
{
|
||||
label: 'Audit Log',
|
||||
href: '/admin/audit-log',
|
||||
|
|
|
|||
|
|
@ -333,4 +333,25 @@ export function useBulkReturnToQC() {
|
|||
queryClient.invalidateQueries({ queryKey: ['jobs'] });
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Query hook for the failures dashboard.
 *
 * Fetches failed jobs through `apiClient.listFailures`, caches them under the
 * `['failures', filters]` key and refetches every 30 seconds.
 */
export function useFailures(filters?: { step?: string; org_id?: string; limit?: number; skip?: number }) {
  const fetchFailures = () => apiClient.listFailures(filters);

  return useQuery({
    queryKey: ['failures', filters],
    queryFn: fetchFailures,
    refetchInterval: 30_000,
  });
}
|
||||
|
||||
/**
 * Mutation hook that sends a bulk retry request.
 *
 * On success both the failures list and the general jobs queries are
 * invalidated so they refetch.
 */
export function useBulkRetry() {
  const queryClient = useQueryClient();

  const sendBulkRetry = (args: { job_ids: string[]; strategy?: 'auto' | 'from_scratch' }) =>
    apiClient.bulkRetry(args.job_ids, args.strategy);

  return useMutation({
    mutationFn: sendBulkRetry,
    onSuccess: () => {
      for (const queryKey of [['failures'], ['jobs']]) {
        queryClient.invalidateQueries({ queryKey });
      }
    },
  });
}
|
||||
|
|
@ -362,6 +362,21 @@ class ApiClient {
|
|||
return response.data;
|
||||
}
|
||||
|
||||
/**
 * Fetch failed jobs from GET /admin/production/failures.
 *
 * Only filters that are set are sent as query parameters; numeric values are
 * serialised as strings.
 */
async listFailures(filters?: { step?: string; org_id?: string; limit?: number; skip?: number }): Promise<Job[]> {
  const { step, org_id, limit, skip } = filters ?? {};

  const params: Record<string, string> = {};
  if (step) {
    params.step = step;
  }
  if (org_id) {
    params.org_id = org_id;
  }
  if (limit != null) {
    params.limit = String(limit);
  }
  if (skip != null) {
    params.skip = String(skip);
  }

  const { data } = await this.client.get('/admin/production/failures', { params });
  return data;
}
|
||||
|
||||
/**
 * POST the given job ids and strategy to /admin/production/bulk-retry and
 * return the per-job outcome lists from the response body.
 */
async bulkRetry(job_ids: string[], strategy: 'auto' | 'from_scratch' = 'auto') {
  const body = { job_ids, strategy };
  const { data } = await this.client.post('/admin/production/bulk-retry', body);
  return data as { retried: string[]; skipped: string[]; errors: Array<{ job_id: string; error: string }> };
}
|
||||
|
||||
// User Management endpoints
|
||||
async listUsers(filters?: {
|
||||
page?: number;
|
||||
|
|
|
|||
204
frontend/src/routes/admin/FailuresList.tsx
Normal file
204
frontend/src/routes/admin/FailuresList.tsx
Normal file
|
|
@ -0,0 +1,204 @@
|
|||
import { useState } from 'react';
|
||||
import { Link } from 'react-router-dom';
|
||||
import { useFailures, useBulkRetry } from '../../hooks/useJob';
|
||||
import { StatusBadge } from '../../components/StatusBadge';
|
||||
import { useToastContext } from '../../contexts/ToastContext';
|
||||
|
||||
// Display labels for the pipeline step stored in `job.failure.step`; also the
// option list of the step filter.
const STEP_LABELS: Record<string, string> = {
  ingestion: 'Ingestion',
  ai_processing: 'AI Processing',
  translation: 'Translation',
  tts: 'TTS',
  render: 'Render',
};

/**
 * Failures dashboard: lists failed jobs grouped by failure type, lets the user
 * filter by step, multi-select jobs and retry the selection in bulk with the
 * chosen strategy.
 */
export function FailuresList() {
  const [stepFilter, setStepFilter] = useState('');
  const [selected, setSelected] = useState<Set<string>>(new Set());
  const [strategy, setStrategy] = useState<'auto' | 'from_scratch'>('auto');
  const toast = useToastContext();

  const { data: jobs = [], isLoading, error, refetch } = useFailures(
    stepFilter ? { step: stepFilter } : undefined
  );
  const bulkRetryMutation = useBulkRetry();

  // Only ids that are still in the current list count as selected. The list
  // changes on filter changes and on background refetches, and jobs that are
  // no longer shown must not be retried.
  const selectedIds = jobs.filter(j => selected.has(j.id)).map(j => j.id);

  const toggle = (id: string) => {
    // Functional update so rapid toggles never work from a stale set.
    setSelected(prev => {
      const next = new Set(prev);
      if (next.has(id)) {
        next.delete(id);
      } else {
        next.add(id);
      }
      return next;
    });
  };
  const selectAll = () => setSelected(new Set(jobs.map(j => j.id)));
  const clearSelection = () => setSelected(new Set());

  const handleStepFilterChange = (value: string) => {
    setStepFilter(value);
    // A selection made under the previous filter would otherwise stay active
    // for rows that are no longer visible.
    clearSelection();
  };

  const handleBulkRetry = async () => {
    if (selectedIds.length === 0) return;
    try {
      const result = await bulkRetryMutation.mutateAsync({ job_ids: selectedIds, strategy });
      const summary = `Retried ${result.retried.length} job(s). Skipped: ${result.skipped.length}. Errors: ${result.errors.length}.`;
      // Do not report success when nothing was retried and jobs errored.
      if (result.retried.length === 0 && result.errors.length > 0) {
        toast.toastOnly.error(summary);
      } else {
        toast.toastOnly.success(summary);
      }
      clearSelection();
      refetch();
    } catch {
      toast.toastOnly.error('Bulk retry failed');
    }
  };

  // Group by failure type for accordion; fall back to the job status when no
  // failure type is present.
  const byType = jobs.reduce<Record<string, typeof jobs>>((acc, job) => {
    const key = job.failure?.type || job.status;
    acc[key] = acc[key] ?? [];
    acc[key].push(job);
    return acc;
  }, {});

  if (isLoading) {
    return (
      <div className="container mx-auto px-4 py-8">
        <div className="animate-pulse space-y-4">
          <div className="h-8 bg-gray-200 rounded w-1/4"></div>
          {[...Array(5)].map((_, i) => <div key={i} className="h-14 bg-gray-200 rounded"></div>)}
        </div>
      </div>
    );
  }

  if (error) {
    return (
      <div className="container mx-auto px-4 py-8">
        <div className="bg-red-50 border border-red-200 rounded-md p-4">
          <p className="text-red-600">Failed to load failures dashboard.</p>
        </div>
      </div>
    );
  }

  return (
    <div className="container mx-auto px-4 py-8">
      <div className="flex items-center justify-between mb-6">
        <h1 className="text-2xl font-bold text-gray-900">Failures Dashboard</h1>
        <div className="flex items-center gap-3">
          <select
            value={stepFilter}
            onChange={e => handleStepFilterChange(e.target.value)}
            className="border border-gray-300 rounded px-3 py-1.5 text-sm"
          >
            <option value="">All steps</option>
            {Object.entries(STEP_LABELS).map(([k, v]) => (
              <option key={k} value={k}>{v}</option>
            ))}
          </select>
        </div>
      </div>

      {jobs.length === 0 ? (
        <div className="bg-green-50 border border-green-200 rounded-md p-6 text-center">
          <p className="text-green-700 font-medium">No failures — all clear!</p>
        </div>
      ) : (
        <>
          {/* Bulk actions bar */}
          {selectedIds.length > 0 && (
            <div className="flex items-center gap-3 bg-orange-50 border border-orange-200 rounded-lg px-4 py-3 mb-4">
              <span className="text-sm font-medium text-orange-800">{selectedIds.length} selected</span>
              <select
                value={strategy}
                onChange={e => setStrategy(e.target.value as 'auto' | 'from_scratch')}
                className="border border-orange-300 rounded px-2 py-1 text-sm"
              >
                <option value="auto">Auto (from failure step)</option>
                <option value="from_scratch">From scratch (re-ingest)</option>
              </select>
              <button
                onClick={handleBulkRetry}
                disabled={bulkRetryMutation.isPending}
                className="px-4 py-1.5 bg-orange-600 text-white text-sm rounded hover:bg-orange-700 disabled:opacity-50"
              >
                {bulkRetryMutation.isPending ? 'Retrying...' : 'Retry selected'}
              </button>
              <button onClick={clearSelection} className="text-sm text-gray-500 hover:text-gray-700">
                Clear
              </button>
            </div>
          )}

          <div className="mb-3 flex gap-3 text-sm text-gray-500">
            <button onClick={selectAll} className="hover:text-gray-800">Select all</button>
            <span>|</span>
            <span>{jobs.length} failure{jobs.length !== 1 ? 's' : ''} total</span>
          </div>

          {/* Grouped by error type */}
          <div className="space-y-4">
            {Object.entries(byType).map(([type, group]) => (
              <details key={type} className="bg-white border border-gray-200 rounded-lg" open>
                <summary className="flex items-center justify-between px-4 py-3 cursor-pointer select-none">
                  <span className="font-medium text-gray-800 text-sm">{type}</span>
                  <span className="text-xs text-gray-500 bg-gray-100 rounded-full px-2 py-0.5">{group.length}</span>
                </summary>
                <div className="border-t border-gray-100">
                  <table className="w-full text-sm">
                    <thead className="bg-gray-50 text-xs text-gray-500 uppercase">
                      <tr>
                        <th className="w-8 px-3 py-2"></th>
                        <th className="px-3 py-2 text-left">Title</th>
                        <th className="px-3 py-2 text-left">Status</th>
                        <th className="px-3 py-2 text-left">Step</th>
                        <th className="px-3 py-2 text-left max-w-xs">Message</th>
                        <th className="px-3 py-2 text-left">Retries</th>
                        <th className="px-3 py-2 text-left">Updated</th>
                        <th className="px-3 py-2"></th>
                      </tr>
                    </thead>
                    <tbody className="divide-y divide-gray-100">
                      {group.map(job => (
                        <tr key={job.id} className={`hover:bg-gray-50 ${selected.has(job.id) ? 'bg-orange-50' : ''}`}>
                          <td className="px-3 py-2">
                            <input
                              type="checkbox"
                              checked={selected.has(job.id)}
                              onChange={() => toggle(job.id)}
                              className="rounded border-gray-300"
                            />
                          </td>
                          <td className="px-3 py-2 font-medium text-gray-900 max-w-xs truncate">
                            {job.title}
                          </td>
                          <td className="px-3 py-2">
                            <StatusBadge status={job.status} />
                          </td>
                          <td className="px-3 py-2 text-gray-600">
                            {job.failure?.step ? STEP_LABELS[job.failure.step] ?? job.failure.step : '—'}
                          </td>
                          <td className="px-3 py-2 text-gray-500 max-w-xs">
                            <span className="line-clamp-2 text-xs">{job.failure?.message || '—'}</span>
                          </td>
                          {/* Retry counts above 3 are highlighted in red. */}
                          <td className={`px-3 py-2 font-medium ${(job.failure?.retry_count ?? 0) > 3 ? 'text-red-600' : 'text-gray-600'}`}>
                            {job.failure?.retry_count ?? 0}
                          </td>
                          <td className="px-3 py-2 text-gray-400 whitespace-nowrap text-xs">
                            {new Date(job.updated_at).toLocaleDateString()}
                          </td>
                          <td className="px-3 py-2">
                            <Link
                              to={`/jobs/${job.id}`}
                              className="text-indigo-600 hover:text-indigo-800 text-xs"
                            >
                              View
                            </Link>
                          </td>
                        </tr>
                      ))}
                    </tbody>
                  </table>
                </div>
              </details>
            ))}
          </div>
        </>
      )}
    </div>
  );
}
|
||||
Loading…
Add table
Reference in a new issue