From 7a7b6c1c121ceffb2fb7fa76fcdc94ab77937e6b Mon Sep 17 00:00:00 2001 From: Vadym Samoilenko Date: Wed, 13 May 2026 18:07:22 +0100 Subject: [PATCH] fix(embed): respect Gemini 429 retryDelay and reduce concurrency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Parse retryDelay from 429 error body and sleep for server_delay+1s instead of our own 2s/4s backoff (which was shorter than API requires) - Reduce embed concurrency 5→2 to halve burst when multiple glossary versions embed simultaneously - Increase max_retries 3→5 and initial backoff 2s→8s for headroom Co-Authored-By: Claude Sonnet 4.6 --- backend/app/services/embedding_service.py | 22 ++++++++++++++++++---- backend/app/tasks/embed_glossary.py | 2 +- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/backend/app/services/embedding_service.py b/backend/app/services/embedding_service.py index 8556945..ef2b1db 100644 --- a/backend/app/services/embedding_service.py +++ b/backend/app/services/embedding_service.py @@ -8,6 +8,7 @@ for memory and retry ergonomics with large glossaries). from __future__ import annotations import asyncio +import re from collections.abc import Sequence from google import genai @@ -20,8 +21,17 @@ logger = get_logger(__name__) _EMBED_MODEL = "gemini-embedding-001" _BATCH_SIZE = 100 -_MAX_RETRIES = 3 -_INITIAL_BACKOFF = 2.0 +_MAX_RETRIES = 5 +_INITIAL_BACKOFF = 8.0 + +# Matches the 'retryDelay': '7s' field in Gemini 429 error bodies +_RETRY_DELAY_RE = re.compile(r"'retryDelay':\s*'(\d+)s'") + + +def _parse_retry_delay(exc: Exception) -> float | None: + """Extract the server-suggested retry delay from a Gemini 429 error.""" + m = _RETRY_DELAY_RE.search(str(exc)) + return float(m.group(1)) if m else None class EmbeddingService: @@ -62,8 +72,12 @@ class EmbeddingService: if attempt == _MAX_RETRIES: logger.error(f"Embedding batch failed after {_MAX_RETRIES} attempts: {exc}") raise - logger.warning(f"Embedding attempt {attempt} failed, retrying in {backoff}s: {exc}") - await asyncio.sleep(backoff) + # Honour the server-suggested retryDelay when present (e.g. 429 RESOURCE_EXHAUSTED). + # Fall back to our own exponential backoff otherwise. + server_delay = _parse_retry_delay(exc) + delay = max(server_delay + 1.0, backoff) if server_delay else backoff + logger.warning(f"Embedding attempt {attempt} failed, retrying in {delay}s: {exc}") + await asyncio.sleep(delay) backoff *= 2 raise RuntimeError("unreachable") # makes type-checker happy diff --git a/backend/app/tasks/embed_glossary.py b/backend/app/tasks/embed_glossary.py index ba9941f..a2e49d0 100644 --- a/backend/app/tasks/embed_glossary.py +++ b/backend/app/tasks/embed_glossary.py @@ -20,7 +20,7 @@ from . import celery_app logger = get_logger(__name__) _BATCH_SIZE = 250 -_CONCURRENCY = 5 +_CONCURRENCY = 2 @celery_app.task(name="embed_glossary_version", bind=True, max_retries=3)