On server restart, stale active jobs are automatically resumed rather than failed. Docs already parsed in a prior run are skipped (resumed from cache); docs stuck at 'parsing' are reset to 'pending' and re-parsed.

- Repository: add get_all_stale_active_jobs() and reset_stuck_parsing_docs()
- Service: skip already-parsed docs in _parse_doc(), reset stuck docs on start
- Main: recover stale jobs via asyncio.create_task() in lifespan startup

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
354 lines
21 KiB
Python
354 lines
21 KiB
Python
import asyncio
|
|
import logging
|
|
import uuid
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
from app.models.database import async_session_factory
|
|
from app.repositories.knowledge_base_repository import KnowledgeBaseRepository
|
|
from app.services.storage_service import storage_service
|
|
|
|
logger = logging.getLogger(__name__)

# Upper bound on the number of source documents parsed concurrently
# (used as the semaphore size in KnowledgeBaseService.process_documents).
MAX_CONCURRENT_PARSES = 10

# Distillation prompt templates per agent type.
#
# Each template is rendered with ``str.format(combined_markdown=...)``, so the
# only replacement field it may contain is ``{combined_markdown}``; any other
# literal brace added to a prompt must be doubled (``{{`` / ``}}``).
DISTILLATION_PROMPTS = {
    "legal": """You are a compliance documentation specialist. Below is raw reference material about legal compliance, advertising standards, financial promotions, and disclaimers relevant to Barclays marketing materials.

Your task is to restructure and organise ALL of this content into a clear, well-structured specification document that an AI compliance agent can use to review marketing proofs. You are imposing structure on the content — NOT reducing it.

CRITICAL — ZERO TOLERANCE FOR DETAIL LOSS:
Every specific rule, instruction, condition, threshold, measurement, example, exception, and edge case from the source material MUST appear in the output. If in doubt about whether something is relevant, INCLUDE IT. Source documents may contain unconventional rules or conditional instructions that seem tangential (e.g. "disclaimers must use a specific font size only when appearing below the fold on mobile" or "this rule does not apply to materials distributed in Scotland"). These MUST be preserved verbatim — do not silently drop them.

What TO do:
- Merge exact duplicates (identical content repeated across sources)
- Impose logical structure with clear headings: required disclaimers, prohibited content, compliance thresholds, financial promotion rules, advertising standards, conditional rules, exceptions
- Use clear formatting (headings, bullet points, tables where helpful)
- Preserve the exact wording of specific rules, legal requirements, and thresholds

What NOT to do:
- Do NOT omit, summarise away, or paraphrase specific rules, values, or conditions
- Do NOT drop content because it seems minor, unusual, or edge-case
- Do NOT filter out content you consider "not relevant" — the downstream agent needs ALL of it

RAW REFERENCE MATERIAL:
{combined_markdown}

OUTPUT: A comprehensive, well-structured markdown specification document containing ALL content from the source material.""",

    "brand_barclays": """You are a brand guidelines specialist. Below is raw reference material about Barclays brand guidelines including logo usage, colour palettes, typography, design principles, and visual identity standards.

Your task is to restructure and organise ALL of this content into a clear, well-structured specification document that an AI brand compliance agent can use to review marketing proofs. You are imposing structure on the content — NOT reducing it.

CRITICAL — ZERO TOLERANCE FOR DETAIL LOSS:
Every specific rule, instruction, condition, threshold, measurement, example, exception, and edge case from the source material MUST appear in the output. If in doubt about whether something is relevant, INCLUDE IT. Source documents may contain unconventional rules or conditional instructions that seem tangential (e.g. "the eagle icon must be rotated 5° clockwise when used on dark backgrounds below 200x200px" or "gradient usage is prohibited except in Q4 seasonal campaigns"). These MUST be preserved verbatim — do not silently drop them.

What TO do:
- Merge exact duplicates (identical content repeated across sources)
- Impose logical structure with clear headings: logo rules, colour specifications (with exact hex/RGB values), typography rules, spacing/layout requirements, imagery guidelines, do's and don'ts, conditional usage rules
- Use clear formatting (headings, bullet points, tables where helpful)
- Preserve all exact measurements, colour values, ratios, and conditional rules

What NOT to do:
- Do NOT omit, summarise away, or paraphrase specific rules, values, or conditions
- Do NOT drop content because it seems minor, unusual, or edge-case
- Do NOT filter out content you consider "not relevant" — the downstream agent needs ALL of it

RAW REFERENCE MATERIAL:
{combined_markdown}

OUTPUT: A comprehensive, well-structured markdown specification document containing ALL content from the source material.""",

    "brand_barclaycard": """You are a brand guidelines specialist. Below is raw reference material about Barclaycard brand guidelines including logo usage, colour palettes, typography, design principles, and visual identity standards.

Your task is to restructure and organise ALL of this content into a clear, well-structured specification document that an AI brand compliance agent can use to review marketing proofs. You are imposing structure on the content — NOT reducing it.

CRITICAL — ZERO TOLERANCE FOR DETAIL LOSS:
Every specific rule, instruction, condition, threshold, measurement, example, exception, and edge case from the source material MUST appear in the output. If in doubt about whether something is relevant, INCLUDE IT. Source documents may contain unconventional rules or conditional instructions that seem tangential (e.g. "the Barclaycard wordmark must use the condensed variant when co-branded with partner logos" or "minimum clear space increases to 2x on print materials below A5 size"). These MUST be preserved verbatim — do not silently drop them.

What TO do:
- Merge exact duplicates (identical content repeated across sources)
- Impose logical structure with clear headings: logo rules, colour specifications (with exact hex/RGB values), typography rules, spacing/layout requirements, imagery guidelines, do's and don'ts, conditional usage rules
- Use clear formatting (headings, bullet points, tables where helpful)
- Preserve all exact measurements, colour values, ratios, and conditional rules

What NOT to do:
- Do NOT omit, summarise away, or paraphrase specific rules, values, or conditions
- Do NOT drop content because it seems minor, unusual, or edge-case
- Do NOT filter out content you consider "not relevant" — the downstream agent needs ALL of it

RAW REFERENCE MATERIAL:
{combined_markdown}

OUTPUT: A comprehensive, well-structured markdown specification document containing ALL content from the source material.""",

    "channel_best_practices": """You are a marketing channel specialist. Below is raw reference material about best practices for various marketing channels (social media, display, email, print, OOH) relevant to Barclays marketing.

Your task is to restructure and organise ALL of this content into a clear, well-structured specification document that an AI channel compliance agent can use to review marketing proofs. You are imposing structure on the content — NOT reducing it.

CRITICAL — ZERO TOLERANCE FOR DETAIL LOSS:
Every specific rule, instruction, condition, threshold, measurement, example, exception, and edge case from the source material MUST appear in the output. If in doubt about whether something is relevant, INCLUDE IT. Source documents may contain unconventional rules or conditional instructions that seem tangential (e.g. "carousel posts must not exceed 5 cards when promoting credit products" or "reply-to-comment tone shifts to informal only on Instagram and TikTok"). These MUST be preserved verbatim — do not silently drop them.

What TO do:
- Merge exact duplicates (identical content repeated across sources)
- Impose logical structure with clear headings organised by channel type, then platform/format: content guidelines, accessibility requirements, engagement rules, tone/voice guidance, conditional instructions
- Use clear formatting (headings, bullet points, tables where helpful)
- Preserve all specific recommendations, requirements, and platform-specific rules

What NOT to do:
- Do NOT omit, summarise away, or paraphrase specific rules, values, or conditions
- Do NOT drop content because it seems minor, unusual, or edge-case
- Do NOT filter out content you consider "not relevant" — the downstream agent needs ALL of it

RAW REFERENCE MATERIAL:
{combined_markdown}

OUTPUT: A comprehensive, well-structured markdown specification document containing ALL content from the source material.""",

    "channel_tech_specs": """You are a marketing production specialist. Below is raw reference material about technical specifications for various marketing channels (dimensions, file formats, file sizes, resolution requirements, platform constraints).

Your task is to restructure and organise ALL of this content into a clear, well-structured specification document that an AI technical compliance agent can use to review marketing proofs. You are imposing structure on the content — NOT reducing it.

CRITICAL — ZERO TOLERANCE FOR DETAIL LOSS:
Every specific rule, instruction, condition, threshold, measurement, example, exception, and edge case from the source material MUST appear in the output. If in doubt about whether something is relevant, INCLUDE IT. Source documents may contain unconventional rules or conditional instructions that seem tangential (e.g. "safe zone insets increase to 20px for YouTube bumper ads on connected TV" or "max file size is 100KB for email hero images but 200KB when animated"). These MUST be preserved verbatim — do not silently drop them.

What TO do:
- Merge exact duplicates (identical content repeated across sources)
- Impose logical structure with clear headings organised by channel → platform → format. Use tables where appropriate for dimensions, file sizes, and format specs
- Use clear formatting (headings, bullet points, tables where helpful)
- Preserve all specific dimensions, file size limits, format requirements, resolution values, and platform constraints

What NOT to do:
- Do NOT omit, summarise away, or paraphrase specific rules, values, or conditions
- Do NOT drop content because it seems minor, unusual, or edge-case
- Do NOT filter out content you consider "not relevant" — the downstream agent needs ALL of it

RAW REFERENCE MATERIAL:
{combined_markdown}

OUTPUT: A comprehensive, well-structured markdown specification document containing ALL content from the source material.""",
}
|
|
|
|
class KnowledgeBaseService:
    """Orchestrates the document processing pipeline.

    The pipeline parses a knowledge base's source documents with LlamaParse,
    distils the combined markdown into a spec with Gemini, stores the result
    as a new spec version, and refreshes the reference-docs cache.
    """

    def __init__(self, llamaparse_service, gemini_service, reference_docs_service):
        """Store the collaborating services used by the pipeline.

        Args:
            llamaparse_service: Object exposing
                ``parse_document(file_data, filename)``.
            gemini_service: Object exposing ``primary_client`` / ``model`` and
                ``fallback_client`` / ``fallback_model``.
            reference_docs_service: Object exposing
                ``invalidate_cache(agent_key, new_spec_content=...)``.
        """
        self.llamaparse = llamaparse_service
        self.gemini = gemini_service
        self.reference_docs = reference_docs_service

    @staticmethod
    def _classify_parse_result(result):
        """Map a parse result to a ``(status, error_message)`` pair.

        Args:
            result: Parse result exposing ``markdown`` (str),
                ``failed_pages`` (sequence of dicts with ``page`` and
                ``error`` keys) and ``total_pages``.

        Returns:
            ``("parsed", None)`` when no page failed, ``("error", msg)`` when
            pages failed and no markdown was produced, ``("partial", msg)``
            when pages failed but some markdown was produced.
        """
        failed_pages = result.failed_pages
        if not failed_pages:
            return "parsed", None

        failed_desc = "; ".join(
            f"Page {fp['page']}: {fp['error']}" for fp in failed_pages
        )
        if not result.markdown.strip():
            # Every page failed: nothing usable came back.
            return (
                "error",
                f"All {result.total_pages} pages failed to parse. {failed_desc}",
            )
        return (
            "partial",
            f"{len(failed_pages)} of {result.total_pages} pages "
            f"failed to parse. {failed_desc}",
        )

    async def _generate_spec(self, prompt: str) -> str:
        """Send ``prompt`` to Gemini and return the stripped response text.

        The primary client/model is tried first; on any exception the request
        is retried once with the fallback client/model.

        Raises:
            RuntimeError: If the model response carries no text. Without this
                check a ``None`` text would surface as an opaque
                ``AttributeError`` and an empty one would be saved as a spec.
            Exception: Whatever the fallback client raises if it also fails.
        """
        try:
            response = await self.gemini.primary_client.aio.models.generate_content(
                model=self.gemini.model,
                contents=prompt,
            )
        except Exception as e:
            logger.warning(f"[KB_SERVICE] Primary model failed: {e}. Retrying with fallback.")
            response = await self.gemini.fallback_client.aio.models.generate_content(
                model=self.gemini.fallback_model,
                contents=prompt,
            )

        text = response.text
        if not text or not text.strip():
            raise RuntimeError("Gemini returned an empty response; no spec was generated.")
        return text.strip()

    async def process_documents(
        self,
        kb_id: uuid.UUID,
        job_id: uuid.UUID,
        agent_key: str,
        user_id: Optional[uuid.UUID] = None,
        user_name: Optional[str] = None,
    ) -> None:
        """
        Run the full processing pipeline as a background task.

        1. Parse each source document with LlamaParse
        2. Combine parsed markdown
        3. Distil with Gemini into a spec
        4. Save as new SpecVersion
        5. Invalidate ReferenceDocsService cache

        Documents whose ``parse_status`` is already "parsed" or "partial" are
        not re-parsed; their stored markdown is reused. Errors raised inside
        the pipeline are logged and recorded on the processing job (status
        "failed") instead of being propagated.

        Args:
            kb_id: Knowledge base whose source documents are processed.
            job_id: Processing job that receives status/progress updates.
            agent_key: Key into ``DISTILLATION_PROMPTS``; unknown keys fall
                back to the "legal" prompt.
            user_id: Recorded as ``generated_by_id`` on the spec version.
            user_name: Recorded as ``generated_by_name`` on the spec version.
        """
        async with async_session_factory() as session:
            try:
                repo = KnowledgeBaseRepository(session)

                # Update job status to parsing
                await repo.update_processing_job(job_id, status="parsing_documents")
                # Reset docs stuck at "parsing" from a previous interrupted run
                await repo.reset_stuck_parsing_docs(kb_id)
                await session.commit()

                # Get all source documents for this KB
                docs = await repo.get_source_documents(kb_id)
                if not docs:
                    await repo.update_processing_job(
                        job_id,
                        status="failed",
                        error_message="No source documents found.",
                        completed_at=datetime.now(timezone.utc),
                    )
                    await session.commit()
                    return

                # Parse documents in parallel (up to MAX_CONCURRENT_PARSES at a time)
                semaphore = asyncio.Semaphore(MAX_CONCURRENT_PARSES)
                progress_lock = asyncio.Lock()
                parsed_count = 0

                async def _bump_progress():
                    """Count one more finished document and persist the total.

                    The lock serialises both the counter and the use of the
                    shared job ``session`` across concurrent parse tasks.
                    """
                    nonlocal parsed_count
                    async with progress_lock:
                        parsed_count += 1
                        await repo.update_processing_job(job_id, parsed_documents=parsed_count)
                        await session.commit()

                async def _parse_doc(doc):
                    """Parse one document.

                    Returns ``(filename, doc_id, markdown)`` when usable
                    markdown is available, otherwise ``None``.
                    """
                    # Resume: skip docs already parsed in a previous interrupted run
                    if doc.parse_status in ("parsed", "partial"):
                        await _bump_progress()
                        if doc.parsed_markdown and doc.parsed_markdown.strip():
                            return (doc.filename, str(doc.id), doc.parsed_markdown)
                        return None

                    async with semaphore:
                        try:
                            async with async_session_factory() as doc_session:
                                doc_repo = KnowledgeBaseRepository(doc_session)

                                # Load file data from storage
                                file_data = await storage_service.get_file(doc.file_storage_key)
                                if not file_data:
                                    logger.error(f"[KB_SERVICE] File not found: {doc.file_storage_key}")
                                    await doc_repo.update_source_document_parse_status(
                                        doc.id, "error", parse_error="File not found in storage."
                                    )
                                    await doc_session.commit()
                                    await _bump_progress()
                                    return None

                                # Mark the document as in progress before parsing
                                await doc_repo.update_source_document_parse_status(doc.id, "parsing")
                                await doc_session.commit()

                            # Parse with LlamaParse. The per-document session above is
                            # already closed here; a fresh one is opened for the result.
                            result = await self.llamaparse.parse_document(file_data, doc.filename)

                            # Determine status based on failed pages
                            status, error_msg = self._classify_parse_result(result)

                            async with async_session_factory() as doc_session:
                                doc_repo = KnowledgeBaseRepository(doc_session)
                                await doc_repo.update_source_document_parse_status(
                                    doc.id,
                                    status,
                                    parsed_markdown=result.markdown if result.markdown.strip() else None,
                                    parse_error=error_msg,
                                )
                                await doc_session.commit()

                            await _bump_progress()

                            if result.markdown.strip():
                                return (doc.filename, str(doc.id), result.markdown)
                            return None

                        except Exception as e:
                            # One bad document must not abort the whole batch:
                            # record the error on the document and carry on.
                            logger.exception(f"[KB_SERVICE] Error parsing {doc.filename}: {e}")
                            try:
                                async with async_session_factory() as doc_session:
                                    doc_repo = KnowledgeBaseRepository(doc_session)
                                    await doc_repo.update_source_document_parse_status(
                                        doc.id, "error", parse_error=str(e)
                                    )
                                    await doc_session.commit()
                            except Exception:
                                logger.exception(
                                    f"[KB_SERVICE] Failed to update error status for {doc.filename}"
                                )
                            await _bump_progress()
                            return None

                results = await asyncio.gather(*(_parse_doc(doc) for doc in docs))
                usable = [r for r in results if r is not None]
                combined_parts = [
                    f"# {filename}\n\n{markdown}" for filename, _, markdown in usable
                ]
                source_doc_ids = [doc_id for _, doc_id, _ in usable]

                if not combined_parts:
                    await repo.update_processing_job(
                        job_id,
                        status="failed",
                        error_message="No documents were successfully parsed.",
                        completed_at=datetime.now(timezone.utc),
                    )
                    await session.commit()
                    return

                # Distil with Gemini
                await repo.update_processing_job(job_id, status="distilling")
                await session.commit()

                combined_markdown = "\n\n---\n\n".join(combined_parts)
                prompt_template = DISTILLATION_PROMPTS.get(agent_key, DISTILLATION_PROMPTS["legal"])
                prompt = prompt_template.format(combined_markdown=combined_markdown)

                logger.info(f"[KB_SERVICE] Sending {len(combined_markdown)} chars to Gemini for distillation")
                spec_content = await self._generate_spec(prompt)
                logger.info(f"[KB_SERVICE] Distillation complete: {len(spec_content)} chars")

                # Save as new spec version
                spec = await repo.create_spec_version(
                    knowledge_base_id=kb_id,
                    content=spec_content,
                    source_document_ids=source_doc_ids,
                    generated_by_id=user_id,
                    generated_by_name=user_name,
                    processing_job_id=job_id,
                )
                await session.commit()

                # Update job as completed
                await repo.update_processing_job(
                    job_id,
                    status="completed",
                    spec_version_id=spec.id,
                    completed_at=datetime.now(timezone.utc),
                )
                await session.commit()

                # Update reference docs cache with new spec content
                self.reference_docs.invalidate_cache(agent_key, new_spec_content=spec_content)
                logger.info(f"[KB_SERVICE] Pipeline complete for {agent_key}, spec version {spec.version_number}")

            except Exception as e:
                logger.exception(f"[KB_SERVICE] Pipeline failed for job {job_id}: {e}")
                try:
                    # Discard any half-applied work first so the failure can be
                    # written on a clean transaction.
                    # NOTE(review): assumes a SQLAlchemy-style AsyncSession
                    # (which exposes rollback()) — confirm against
                    # app.models.database.
                    await session.rollback()
                    await repo.update_processing_job(
                        job_id,
                        status="failed",
                        error_message=str(e),
                        completed_at=datetime.now(timezone.utc),
                    )
                    await session.commit()
                except Exception:
                    logger.exception("[KB_SERVICE] Failed to update job status after error")