video-accessibility/backend/app/services/vtt_versioning.py
Vadym Samoilenko 31199f8705 chore: push all session changes — backend hardening, tests, apache config, deploy scripts
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-30 15:52:14 +01:00

216 lines
6.5 KiB
Python

"""VTT version control service — DB-backed snapshots with GCS storage."""
import difflib
from datetime import datetime
from motor.motor_asyncio import AsyncIOMotorDatabase
from ..core.config import settings
from ..core.logging import get_logger
from ..models.user import User
from ..models.vtt_version import (
DiffLine,
VttDiffResponse,
VttKind,
VttVersion,
VttVersionActor,
VttVersionListResponse,
VttVersionSummary,
)
from ..services.gcs import gcs_service
# Logger obtained for this module's name.
logger = get_logger(__name__)
# Collection of counter documents (one per job/lang/kind) used to allocate version numbers.
_COUNTER_COLLECTION = "vtt_version_counters"
# Collection holding one document per stored VTT version snapshot.
_VERSION_COLLECTION = "vtt_versions"
def _snapshot_path(job_id: str, lang: str, kind: VttKind, version: int) -> str:
return f"{job_id}/{lang}/versions/{kind}/v{version}.vtt"
def _count_cues(content: str) -> int:
"""Count WebVTT cue blocks (lines containing ' --> ')."""
return sum(1 for line in content.splitlines() if " --> " in line)
async def _next_version(db: AsyncIOMotorDatabase, job_id: str, lang: str, kind: VttKind) -> int:
    """Allocate the next version number for a job/lang/kind triple.

    A single ``find_one_and_update`` with ``$inc`` and ``upsert=True`` bumps the
    counter document keyed ``"<job_id>:<lang>:<kind>"``, creating it on first
    use, and the updated ``count`` is returned.
    """
    counters = db[_COUNTER_COLLECTION]
    selector = {"_id": f"{job_id}:{lang}:{kind}"}
    bump = {"$inc": {"count": 1}}
    # return_document=True requests the post-update document, so "count" is the new value.
    updated = await counters.find_one_and_update(
        selector,
        bump,
        upsert=True,
        return_document=True,
    )
    return updated["count"]
async def create_version(
    db: AsyncIOMotorDatabase,
    job_id: str,
    lang: str,
    kind: VttKind,
    content: str,
    user: User,
    note: str | None = None,
    parent_version: int | None = None,
) -> VttVersionSummary:
    """Snapshot VTT content as a new immutable version.

    Allocates the next version number, uploads the content to GCS on a
    best-effort basis, and inserts a version document (content included) into
    the versions collection.

    Args:
        db: Database holding the counter and version collections.
        job_id: Job the VTT belongs to.
        lang: Language of the VTT track.
        kind: Kind of VTT track being versioned.
        content: Full VTT text to snapshot.
        user: Author of the version; ``id`` and ``email`` are recorded.
        note: Optional free-text note stored with the version.
        parent_version: Optional version number this one derives from.

    Returns:
        Summary of the new version; its ``content`` is passed as ``None``.
    """
    # Local import keeps this block self-contained; only this function needs it.
    from datetime import timezone

    version_num = await _next_version(db, job_id, lang, kind)
    blob_path = _snapshot_path(job_id, lang, kind, version_num)

    # Best-effort upload: the live file is the source of truth, so a failure
    # here is logged and the version is still recorded in the database.
    try:
        await gcs_service.upload_text_to_gcs(content, blob_path, "text/vtt")
    except Exception as exc:
        logger.warning(f"VTT version GCS upload failed (non-fatal): {exc}")

    # NOTE(review): the URI is recorded even when the upload above failed, so
    # it may reference a blob that does not exist.
    gcs_uri = f"gs://{settings.gcs_bucket}/{blob_path}"
    actor = VttVersionActor(user_id=str(user.id), user_email=user.email)

    # datetime.utcnow() is deprecated since Python 3.12; this yields the same
    # naive-UTC timestamp, so stored and returned values keep their shape.
    created_at = datetime.now(timezone.utc).replace(tzinfo=None)

    doc = {
        "job_id": job_id,
        "lang": lang,
        "kind": kind,
        "version": version_num,
        "content": content,
        "gcs_uri": gcs_uri,
        "created_at": created_at,
        "created_by": actor.dict(),
        "note": note,
        "parent_version": parent_version,
        "cue_count": _count_cues(content),
        "byte_size": len(content.encode()),  # size in UTF-8 bytes
    }
    result = await db[_VERSION_COLLECTION].insert_one(doc)
    doc["_id"] = str(result.inserted_id)
    # The summary deliberately carries no content, even though the stored document does.
    return VttVersionSummary(**{**doc, "content": None})
async def list_versions(
    db: AsyncIOMotorDatabase,
    job_id: str,
    lang: str,
    kind: VttKind,
    skip: int = 0,
    limit: int = 50,
) -> VttVersionListResponse:
    """Return one page of version summaries, highest version first.

    ``total`` in the response is the count of all versions matching the
    job/lang/kind, independent of ``skip`` and ``limit``.
    """
    collection = db[_VERSION_COLLECTION]
    criteria = {"job_id": job_id, "lang": lang, "kind": kind}
    total = await collection.count_documents(criteria)

    # The projection drops the large "content" field from listed documents.
    page = collection.find(criteria, {"content": 0}).sort("version", -1).skip(skip).limit(limit)
    rows = await page.to_list(length=limit)

    # "_id" is converted to a string before building each summary.
    summaries = [VttVersionSummary(**{**row, "_id": str(row["_id"])}) for row in rows]
    return VttVersionListResponse(versions=summaries, total=total)
async def get_version(
    db: AsyncIOMotorDatabase,
    job_id: str,
    lang: str,
    kind: VttKind,
    version: int,
) -> VttVersion | None:
    """Fetch one full version document (content included), or ``None`` if absent."""
    criteria = {"job_id": job_id, "lang": lang, "kind": kind, "version": version}
    record = await db[_VERSION_COLLECTION].find_one(criteria)
    if not record:
        return None
    # "_id" is converted to a string before building the model.
    record["_id"] = str(record["_id"])
    return VttVersion(**record)
async def restore_version(
    db: AsyncIOMotorDatabase,
    job_id: str,
    lang: str,
    kind: VttKind,
    version: int,
    user: User,
) -> VttVersionSummary | None:
    """Re-publish an older version's content as a brand-new version.

    Nothing is overwritten: the source version is read and its content is
    passed to ``create_version`` with ``parent_version`` set to the source
    number. Returns ``None`` when the source version does not exist.
    """
    source = await get_version(db, job_id, lang, kind, version)
    if source:
        return await create_version(
            db,
            job_id,
            lang,
            kind,
            source.content,
            user,
            note=f"Restored from v{version}",
            parent_version=version,
        )
    return None
def diff_versions(
    job_id: str,
    lang: str,
    kind: VttKind,
    from_version: VttVersion,
    to_version: VttVersion,
) -> VttDiffResponse:
    """Produce a line-by-line diff of two versions via ``difflib.SequenceMatcher``.

    Every line of both versions appears in the result as "unchanged",
    "removed" or "added", carrying its 1-based line number in the old and/or
    new text. For a replaced span, removed lines are listed before added ones.
    """
    before = from_version.content.splitlines()
    after = to_version.content.splitlines()
    entries: list[DiffLine] = []
    added = 0
    removed = 0

    opcodes = difflib.SequenceMatcher(None, before, after, autojunk=False).get_opcodes()
    # Opcodes cover both sequences contiguously from index 0, so i1/j1 are the
    # number of old/new lines already consumed and give the line numbers directly.
    for tag, i1, i2, j1, j2 in opcodes:
        if tag == "equal":
            for offset, text in enumerate(before[i1:i2]):
                entries.append(DiffLine(
                    type="unchanged",
                    content=text,
                    line_no_old=i1 + offset + 1,
                    line_no_new=j1 + offset + 1,
                ))
            continue

        # "delete" has only old lines, "insert" only new lines, "replace" both;
        # an empty range makes the corresponding loop a no-op.
        for old_idx in range(i1, i2):
            entries.append(DiffLine(
                type="removed",
                content=before[old_idx],
                line_no_old=old_idx + 1,
            ))
        removed += i2 - i1

        for new_idx in range(j1, j2):
            entries.append(DiffLine(
                type="added",
                content=after[new_idx],
                line_no_new=new_idx + 1,
            ))
        added += j2 - j1

    return VttDiffResponse(
        job_id=job_id,
        lang=lang,
        kind=kind,
        from_version=from_version.version,
        to_version=to_version.version,
        lines=entries,
        added_count=added,
        removed_count=removed,
    )