fix(embed): respect Gemini 429 retryDelay and reduce concurrency

- Parse retryDelay from 429 error body and sleep for server_delay+1s
  instead of our own 2s/4s backoff (which was shorter than API requires)
- Reduce embed concurrency 5→2 to halve burst when multiple glossary
  versions embed simultaneously
- Increase max_retries 3→5 and initial backoff 2s→8s for headroom

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-05-13 18:07:22 +01:00
parent ca312d48fa
commit 7a7b6c1c12
2 changed files with 19 additions and 5 deletions

View file

@ -8,6 +8,7 @@ for memory and retry ergonomics with large glossaries).
from __future__ import annotations
import asyncio
import re
from collections.abc import Sequence
from google import genai
@ -20,8 +21,17 @@ logger = get_logger(__name__)
_EMBED_MODEL = "gemini-embedding-001"
_BATCH_SIZE = 100
_MAX_RETRIES = 3
_INITIAL_BACKOFF = 2.0
_MAX_RETRIES = 5
_INITIAL_BACKOFF = 8.0
# Matches the 'retryDelay': '7s' field in Gemini 429 error bodies
_RETRY_DELAY_RE = re.compile(r"'retryDelay':\s*'(\d+)s'")
def _parse_retry_delay(exc: Exception) -> float | None:
"""Extract the server-suggested retry delay from a Gemini 429 error."""
m = _RETRY_DELAY_RE.search(str(exc))
return float(m.group(1)) if m else None
class EmbeddingService:
@ -62,8 +72,12 @@ class EmbeddingService:
if attempt == _MAX_RETRIES:
logger.error(f"Embedding batch failed after {_MAX_RETRIES} attempts: {exc}")
raise
logger.warning(f"Embedding attempt {attempt} failed, retrying in {backoff}s: {exc}")
await asyncio.sleep(backoff)
# Honour the server-suggested retryDelay when present (e.g. 429 RESOURCE_EXHAUSTED).
# Fall back to our own exponential backoff otherwise.
server_delay = _parse_retry_delay(exc)
delay = max(server_delay + 1.0, backoff) if server_delay else backoff
logger.warning(f"Embedding attempt {attempt} failed, retrying in {delay}s: {exc}")
await asyncio.sleep(delay)
backoff *= 2
raise RuntimeError("unreachable") # makes type-checker happy

View file

@ -20,7 +20,7 @@ from . import celery_app
logger = get_logger(__name__)
_BATCH_SIZE = 250
_CONCURRENCY = 5
_CONCURRENCY = 2
@celery_app.task(name="embed_glossary_version", bind=True, max_retries=3)