From f76d17314a733240173791abdffd3741429a3ed8 Mon Sep 17 00:00:00 2001
From: sauravniraula <developmentsaurav@gmail.com>
Date: Thu, 23 Apr 2026 11:42:15 +0545
Subject: [PATCH 1/3] feat: integrates llmai instead of using old llm client
 and tool call handlers

---
 README.md                                     |    1 -
 docker-compose.yml                            |    4 -
 .../fastapi/api/v1/ppt/endpoints/outlines.py  |    5 +-
 .../api/v1/ppt/endpoints/presentation.py      |    5 +-
 servers/fastapi/models/llm_message.py         |   59 -
 servers/fastapi/models/llm_tool_call.py       |   30 -
 servers/fastapi/models/llm_tools.py           |   29 -
 servers/fastapi/models/user_config.py         |    1 -
 .../presenton_backend.egg-info/PKG-INFO       |    6 +-
 .../presenton_backend.egg-info/SOURCES.txt    |   13 +-
 .../presenton_backend.egg-info/requires.txt   |    6 +-
 servers/fastapi/pyproject.toml                |    5 +-
 servers/fastapi/services/codex_llm.py         |  431 ---
 servers/fastapi/services/llm_client.py        | 2366 -----------------
 .../services/llm_tool_calls_handler.py        |  211 --
 servers/fastapi/templates/providers.py        |   60 +-
 servers/fastapi/utils/available_models.py     |   19 +-
 servers/fastapi/utils/get_env.py              |    4 -
 servers/fastapi/utils/llm_calls/edit_slide.py |   59 +-
 .../utils/llm_calls/edit_slide_html.py        |   34 +-
 .../generate_presentation_outlines.py         |   91 +-
 .../generate_presentation_structure.py        |   95 +-
 .../utils/llm_calls/generate_slide_content.py |   57 +-
 .../llm_calls/select_slide_type_on_edit.py    |   58 +-
 .../fastapi/utils/llm_client_error_handler.py |   11 +-
 servers/fastapi/utils/llm_config.py           |  146 +
 servers/fastapi/utils/llm_utils.py            |  134 +
 servers/fastapi/utils/set_env.py              |    4 -
 servers/fastapi/utils/user_config.py          |    9 -
 servers/fastapi/uv.lock                       |   79 +-
 servers/nextjs/components/CustomConfig.tsx    |   26 +-
 servers/nextjs/components/LLMSelection.tsx    |    1 -
 servers/nextjs/types/llm_config.ts            |    1 -
 servers/nextjs/utils/providerUtils.ts         |    3 +-
 start.js                                      |    1 -
 35 files changed, 695 insertions(+), 3369 deletions(-)
 delete mode 100644 servers/fastapi/models/llm_message.py
 delete mode 100644 servers/fastapi/models/llm_tool_call.py
 delete mode 100644 servers/fastapi/models/llm_tools.py
 delete mode 100644 servers/fastapi/services/codex_llm.py
 delete mode 100644 servers/fastapi/services/llm_client.py
 delete mode 100644 servers/fastapi/services/llm_tool_calls_handler.py
 create mode 100644 servers/fastapi/utils/llm_config.py
 create mode 100644 servers/fastapi/utils/llm_utils.py

diff --git a/README.md b/README.md
index 849708ba..3aa749d5 100644
--- a/README.md
+++ b/README.md
@@ -214,7 +214,6 @@ Other optional variables exist in code (for example advanced Mem0 paths, LitePar
 - **CUSTOM_LLM_URL**: OpenAI-compatible base URL if **LLM** is **custom**.
 - **CUSTOM_LLM_API_KEY**: API key if **LLM** is **custom**.
 - **CUSTOM_MODEL**: Model id if **LLM** is **custom**.
-- **TOOL_CALLS**=[true/false]: If **true**, the custom LLM uses tool calls instead of JSON schema for structured output.
 - **DISABLE_THINKING**=[true/false]: If **true**, disables “thinking” on the custom LLM.
 - **WEB_GROUNDING**=[true/false]: If **true**, enables web search for OpenAI, Google, and Anthropic models.
 - **EXTENDED_REASONING**=[true/false]: Enables extended reasoning where supported by the configured stack.
diff --git a/docker-compose.yml b/docker-compose.yml
index 9e883cfc..386cdde7 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -35,7 +35,6 @@ services:
       - DALL_E_3_QUALITY=${DALL_E_3_QUALITY}
       - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY}
       - EXTENDED_REASONING=${EXTENDED_REASONING}
-      - TOOL_CALLS=${TOOL_CALLS}
       - DISABLE_THINKING=${DISABLE_THINKING}
       - WEB_GROUNDING=${WEB_GROUNDING}
       - DATABASE_URL=${DATABASE_URL}
@@ -99,7 +98,6 @@ services:
       - DALL_E_3_QUALITY=${DALL_E_3_QUALITY}
       - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY}
       - EXTENDED_REASONING=${EXTENDED_REASONING}
-      - TOOL_CALLS=${TOOL_CALLS}
       - DISABLE_THINKING=${DISABLE_THINKING}
       - WEB_GROUNDING=${WEB_GROUNDING}
       - DATABASE_URL=${DATABASE_URL}
@@ -158,7 +156,6 @@ services:
       - DALL_E_3_QUALITY=${DALL_E_3_QUALITY}
       - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY}
       - EXTENDED_REASONING=${EXTENDED_REASONING}
-      - TOOL_CALLS=${TOOL_CALLS}
       - DISABLE_THINKING=${DISABLE_THINKING}
       - WEB_GROUNDING=${WEB_GROUNDING}
       - DATABASE_URL=${DATABASE_URL}
@@ -223,7 +220,6 @@ services:
       - DALL_E_3_QUALITY=${DALL_E_3_QUALITY}
       - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY}
       - EXTENDED_REASONING=${EXTENDED_REASONING}
-      - TOOL_CALLS=${TOOL_CALLS}
       - DISABLE_THINKING=${DISABLE_THINKING}
       - WEB_GROUNDING=${WEB_GROUNDING}
       - DATABASE_URL=${DATABASE_URL}
diff --git a/servers/fastapi/api/v1/ppt/endpoints/outlines.py b/servers/fastapi/api/v1/ppt/endpoints/outlines.py
index 9accde4d..7fc31468 100644
--- a/servers/fastapi/api/v1/ppt/endpoints/outlines.py
+++ b/servers/fastapi/api/v1/ppt/endpoints/outlines.py
@@ -21,6 +21,7 @@ from services.documents_loader import DocumentsLoader
 from services.mem0_presentation_memory_service import (
     MEM0_PRESENTATION_MEMORY_SERVICE,
 )
+from utils.llm_utils import message_content_to_text
 from utils.outline_utils import (
     get_no_of_outlines_to_generate_for_n_slides,
     get_presentation_title_from_presentation_outline,
@@ -85,12 +86,12 @@ async def stream_outlines(
         await MEM0_PRESENTATION_MEMORY_SERVICE.store_generation_context(
             presentation_id=presentation.id,
             system_prompt=(
-                outline_messages[0].content
+                message_content_to_text(outline_messages[0].content)
                 if len(outline_messages) > 0
                 else None
             ),
             user_prompt=(
-                outline_messages[1].content
+                message_content_to_text(outline_messages[1].content)
                 if len(outline_messages) > 1
                 else None
             ),
diff --git a/servers/fastapi/api/v1/ppt/endpoints/presentation.py b/servers/fastapi/api/v1/ppt/endpoints/presentation.py
index 90933cbc..24c63fc9 100644
--- a/servers/fastapi/api/v1/ppt/endpoints/presentation.py
+++ b/servers/fastapi/api/v1/ppt/endpoints/presentation.py
@@ -75,6 +75,7 @@ from utils.process_slides import (
     process_slide_and_fetch_assets,
 )
 from utils.get_layout_by_name import get_layout_by_name
+from utils.llm_utils import message_content_to_text
 from models.presentation_layout import PresentationLayoutModel
 import uuid
 
@@ -666,12 +667,12 @@ async def generate_presentation_handler(
             await MEM0_PRESENTATION_MEMORY_SERVICE.store_generation_context(
                 presentation_id=presentation_id,
                 system_prompt=(
-                    outline_messages[0].content
+                    message_content_to_text(outline_messages[0].content)
                     if len(outline_messages) > 0
                     else None
                 ),
                 user_prompt=(
-                    outline_messages[1].content
+                    message_content_to_text(outline_messages[1].content)
                     if len(outline_messages) > 1
                     else None
                 ),
diff --git a/servers/fastapi/models/llm_message.py b/servers/fastapi/models/llm_message.py
deleted file mode 100644
index ba1be4cf..00000000
--- a/servers/fastapi/models/llm_message.py
+++ /dev/null
@@ -1,59 +0,0 @@
-from typing import Any, List, Literal, Optional
-from pydantic import BaseModel
-from google.genai.types import Content as GoogleContent
-
-from models.llm_tool_call import AnthropicToolCall
-
-
-class LLMMessage(BaseModel):
-    pass
-
-
-class LLMUserMessage(LLMMessage):
-    role: Literal["user"] = "user"
-    content: str
-
-
-class LLMSystemMessage(LLMMessage):
-    role: Literal["system"] = "system"
-    content: str
-
-
-class OpenAIAssistantMessage(LLMMessage):
-    role: Literal["assistant"] = "assistant"
-    content: str | None = None
-    tool_calls: Optional[List[dict]] = None
-
-
-class GoogleAssistantMessage(LLMMessage):
-    role: Literal["assistant"] = "assistant"
-    content: GoogleContent
-
-
-class AnthropicAssistantMessage(LLMMessage):
-    role: Literal["assistant"] = "assistant"
-    content: List[AnthropicToolCall]
-
-
-class AnthropicToolCallMessage(LLMMessage):
-    type: Literal["tool_result"] = "tool_result"
-    tool_use_id: str
-    content: str
-
-
-class AnthropicUserMessage(LLMMessage):
-    role: Literal["user"] = "user"
-    content: List[AnthropicToolCallMessage]
-
-
-class OpenAIToolCallMessage(LLMMessage):
-    role: Literal["tool"] = "tool"
-    content: str
-    tool_call_id: str
-
-
-class GoogleToolCallMessage(LLMMessage):
-    role: Literal["tool"] = "tool"
-    id: Optional[str] = None
-    name: str
-    response: dict
diff --git a/servers/fastapi/models/llm_tool_call.py b/servers/fastapi/models/llm_tool_call.py
deleted file mode 100644
index d0fe7c0e..00000000
--- a/servers/fastapi/models/llm_tool_call.py
+++ /dev/null
@@ -1,30 +0,0 @@
-from typing import Literal, Optional
-from pydantic import BaseModel
-
-
-class LLMToolCall(BaseModel):
-    pass
-
-
-class OpenAIToolCallFunction(BaseModel):
-    name: str
-    arguments: str
-
-
-class OpenAIToolCall(LLMToolCall):
-    id: str
-    type: Literal["function"] = "function"
-    function: OpenAIToolCallFunction
-
-
-class GoogleToolCall(LLMToolCall):
-    id: Optional[str] = None
-    name: str
-    arguments: Optional[dict] = None
-
-
-class AnthropicToolCall(LLMToolCall):
-    type: Literal["tool_use"] = "tool_use"
-    id: str
-    name: str
-    input: object
diff --git a/servers/fastapi/models/llm_tools.py b/servers/fastapi/models/llm_tools.py
deleted file mode 100644
index ccf64e67..00000000
--- a/servers/fastapi/models/llm_tools.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from typing import Any, Callable, Coroutine, Optional
-from pydantic import BaseModel, Field
-
-
-class LLMTool(BaseModel):
-    pass
-
-
-class LLMDynamicTool(LLMTool):
-    name: str
-    description: str
-    parameters: dict = {}
-    handler: Callable[..., Coroutine[Any, Any, str]]
-
-
-class SearchWebTool(LLMTool):
-    """
-    Search the web for information.
-    """
-
-    query: str = Field(description="The query to search the web for")
-
-
-class GetCurrentDatetimeTool(LLMTool):
-    """
-    Get the current datetime.
-    """
-
-    pass
diff --git a/servers/fastapi/models/user_config.py b/servers/fastapi/models/user_config.py
index 111c585d..db41401b 100644
--- a/servers/fastapi/models/user_config.py
+++ b/servers/fastapi/models/user_config.py
@@ -46,7 +46,6 @@ class UserConfig(BaseModel):
     GPT_IMAGE_1_5_QUALITY: Optional[str] = None
 
     # Reasoning
-    TOOL_CALLS: Optional[bool] = None
     DISABLE_THINKING: Optional[bool] = None
     EXTENDED_REASONING: Optional[bool] = None
 
diff --git a/servers/fastapi/presenton_backend.egg-info/PKG-INFO b/servers/fastapi/presenton_backend.egg-info/PKG-INFO
index 434857bf..c24c29ca 100644
--- a/servers/fastapi/presenton_backend.egg-info/PKG-INFO
+++ b/servers/fastapi/presenton_backend.egg-info/PKG-INFO
@@ -7,19 +7,17 @@ Requires-Dist: alembic>=1.14.0
 Requires-Dist: aiohttp>=3.12.15
 Requires-Dist: aiomysql>=0.2.0
 Requires-Dist: aiosqlite>=0.21.0
-Requires-Dist: anthropic>=0.60.0
 Requires-Dist: asyncpg>=0.30.0
-Requires-Dist: chromadb>=1.0.15
 Requires-Dist: dirtyjson>=1.0.8
 Requires-Dist: fastapi[standard]>=0.116.1
 Requires-Dist: fastembed-vectorstore>=0.5.2
 Requires-Dist: fastmcp>=2.11.0
 Requires-Dist: google-genai>=1.28.0
+Requires-Dist: mem0ai[nlp]>=0.1.115
 Requires-Dist: nltk>=3.9.1
 Requires-Dist: openai>=1.98.0
 Requires-Dist: pathvalidate>=3.3.1
 Requires-Dist: pdfplumber>=0.11.7
-Requires-Dist: pytest>=8.4.1
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: redis>=6.2.0
 Requires-Dist: sqlmodel>=0.0.24
+Requires-Dist: llmai==0.1.8
diff --git a/servers/fastapi/presenton_backend.egg-info/SOURCES.txt b/servers/fastapi/presenton_backend.egg-info/SOURCES.txt
index fba03e0b..376ca3dd 100644
--- a/servers/fastapi/presenton_backend.egg-info/SOURCES.txt
+++ b/servers/fastapi/presenton_backend.egg-info/SOURCES.txt
@@ -3,6 +3,7 @@ api/__init__.py
 api/lifespan.py
 api/main.py
 api/middlewares.py
+api/v1/auth/router.py
 api/v1/mock/router.py
 api/v1/ppt/background_tasks.py
 api/v1/ppt/router.py
@@ -46,9 +47,6 @@ models/document_chunk.py
 models/generate_presentation_request.py
 models/image_prompt.py
 models/json_path_guide.py
-models/llm_message.py
-models/llm_tool_call.py
-models/llm_tools.py
 models/ollama_model_metadata.py
 models/ollama_model_status.py
 models/pptx_models.py
@@ -78,7 +76,6 @@ presenton_backend.egg-info/dependency_links.txt
 presenton_backend.egg-info/requires.txt
 presenton_backend.egg-info/top_level.txt
 services/__init__.py
-services/codex_llm.py
 services/concurrent_service.py
 services/database.py
 services/document_conversion_service.py
@@ -88,8 +85,7 @@ services/html_to_text_runs_service.py
 services/icon_finder_service.py
 services/image_generation_service.py
 services/liteparse_service.py
-services/llm_client.py
-services/llm_tool_calls_handler.py
+services/mem0_presentation_memory_service.py
 services/pptx_presentation_creator.py
 services/score_based_chunker.py
 services/temp_file_service.py
@@ -106,7 +102,9 @@ templates/providers.py
 templates/router.py
 tests/test_gemini_schema_support.py
 tests/test_image_generation.py
+tests/test_liteparse_service.py
 tests/test_mcp_server.py
+tests/test_mem0_presentation_memory_service.py
 tests/test_openai_schema_support.py
 tests/test_pptx_creator.py
 tests/test_pptx_slides_processing.py
@@ -130,7 +128,9 @@ utils/get_layout_by_name.py
 utils/image_provider.py
 utils/image_utils.py
 utils/llm_client_error_handler.py
+utils/llm_config.py
 utils/llm_provider.py
+utils/llm_utils.py
 utils/model_availability.py
 utils/ocr_language.py
 utils/ollama.py
@@ -141,6 +141,7 @@ utils/ppt_utils.py
 utils/process_slides.py
 utils/schema_utils.py
 utils/set_env.py
+utils/simple_auth.py
 utils/theme_utils.py
 utils/user_config.py
 utils/validators.py
diff --git a/servers/fastapi/presenton_backend.egg-info/requires.txt b/servers/fastapi/presenton_backend.egg-info/requires.txt
index e7bfb20e..87b670ce 100644
--- a/servers/fastapi/presenton_backend.egg-info/requires.txt
+++ b/servers/fastapi/presenton_backend.egg-info/requires.txt
@@ -2,19 +2,17 @@ alembic>=1.14.0
 aiohttp>=3.12.15
 aiomysql>=0.2.0
 aiosqlite>=0.21.0
-anthropic>=0.60.0
 asyncpg>=0.30.0
-chromadb>=1.0.15
 dirtyjson>=1.0.8
 fastapi[standard]>=0.116.1
 fastembed-vectorstore>=0.5.2
 fastmcp>=2.11.0
 google-genai>=1.28.0
+mem0ai[nlp]>=0.1.115
 nltk>=3.9.1
 openai>=1.98.0
 pathvalidate>=3.3.1
 pdfplumber>=0.11.7
-pytest>=8.4.1
 python-pptx>=1.0.2
-redis>=6.2.0
 sqlmodel>=0.0.24
+llmai==0.1.8
diff --git a/servers/fastapi/pyproject.toml b/servers/fastapi/pyproject.toml
index b82a1539..8fa45385 100644
--- a/servers/fastapi/pyproject.toml
+++ b/servers/fastapi/pyproject.toml
@@ -12,7 +12,6 @@ dependencies = [
     "aiohttp>=3.12.15",
     "aiomysql>=0.2.0",
     "aiosqlite>=0.21.0",
-    "anthropic>=0.60.0",
     "asyncpg>=0.30.0",
     "dirtyjson>=1.0.8",
     "fastapi[standard]>=0.116.1",
@@ -26,11 +25,15 @@ dependencies = [
     "pdfplumber>=0.11.7",
     "python-pptx>=1.0.2",
     "sqlmodel>=0.0.24",
+    "llmai==0.1.8",
 ]
 
 [tool.uv]
 index-strategy = "unsafe-best-match"
 
+[tool.uv.sources]
+llmai = { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" }
+
 [tool.setuptools.packages.find]
 where = ["."]
 include = ["api*", "enums*", "models*", "services*", "constants*", "utils*", "templates*"]
diff --git a/servers/fastapi/services/codex_llm.py b/servers/fastapi/services/codex_llm.py
deleted file mode 100644
index a94313f9..00000000
--- a/servers/fastapi/services/codex_llm.py
+++ /dev/null
@@ -1,431 +0,0 @@
-"""Codex (Responses API) adapter for structured and unstructured LLM calls.
-
-Stateless adapter: receives AsyncOpenAI client and tool_calls_handler at call time.
-Auth and client creation stay in LLMClient. Structure matches other providers:
-generate = call API, collect content + tool_calls, recurse on tool_calls; stream = same but yield deltas.
-
-Uses LLMToolCallsHandler directly: tools are parsed via parse_tools() in llm_client (handler supports
-Codex and returns OpenAI-style dicts); this module flattens them for the Responses API. Tool execution
-uses tool_calls_handler.handle_tool_calls_openai().
-"""
-
-import dirtyjson
-from typing import Any, AsyncGenerator, List, Optional, Union
-
-from fastapi import HTTPException
-from openai import APIStatusError, AsyncOpenAI, OpenAIError
-
-from models.llm_message import (
-    LLMMessage,
-    OpenAIAssistantMessage,
-    LLMSystemMessage,
-    LLMUserMessage,
-)
-from models.llm_tool_call import OpenAIToolCall, OpenAIToolCallFunction
-from utils.schema_utils import ensure_strict_json_schema
-
-# Responses API requires flat tool format: {"type":"function","name":...,"description":...,"parameters":...}
-RESPONSE_SCHEMA_NAME = "ResponseSchema"
-# Required tool choice for structured: force ResponseSchema (no plain-text fallback).
-STRUCTURED_TOOL_CHOICE = {"type": "function", "name": RESPONSE_SCHEMA_NAME}
-MAX_RECURSION_DEPTH = 5
-
-
-def _to_responses_tools(chat_tools: List[dict]) -> List[dict]:
-    """Convert Chat Completions tool format to flat Responses API format."""
-    result = []
-    for tool in chat_tools:
-        if tool.get("type") != "function":
-            result.append(tool)
-            continue
-        fn = tool.get("function") or tool
-        result.append({
-            "type": "function",
-            "name": fn.get("name", ""),
-            "description": fn.get("description", ""),
-            "parameters": fn.get("parameters", {}),
-        })
-    return result
-
-
-def _items_to_openai_calls(items_by_id: dict[str, dict]) -> List[OpenAIToolCall]:
-    """Build OpenAIToolCall list from Responses API output_item map."""
-    return [
-        OpenAIToolCall(
-            id=item.get("call_id", item.get("id", "")),
-            type="function",
-            function=OpenAIToolCallFunction(
-                name=item.get("name", ""),
-                arguments=item.get("arguments", "{}"),
-            ),
-        )
-        for item in items_by_id.values()
-    ]
-
-
-async def _messages_after_tool_turn(
-    messages: List[LLMMessage],
-    items_by_id: dict[str, dict],
-    tool_calls_handler: Any,
-) -> List[LLMMessage]:
-    """Handle tool calls and return messages extended with assistant turn + tool results."""
-    openai_calls = _items_to_openai_calls(items_by_id)
-    tool_call_messages = await tool_calls_handler.handle_tool_calls_openai(openai_calls)
-    return [
-        *messages,
-        OpenAIAssistantMessage(
-            role="assistant",
-            content=None,
-            tool_calls=[tc.model_dump() for tc in openai_calls],
-        ),
-        *tool_call_messages,
-    ]
-
-
-def _build_body(
-    model: str,
-    messages: List[LLMMessage],
-    tools: Optional[List[dict]] = None,
-    tool_choice: Optional[Union[str, dict]] = None,
-) -> dict:
-    """Build Responses API request body."""
-    instructions = None
-    input_messages = []
-
-    for msg in messages:
-        if isinstance(msg, LLMSystemMessage):
-            instructions = msg.content
-        elif isinstance(msg, LLMUserMessage):
-            input_messages.append({
-                "role": "user",
-                "content": [{"type": "input_text", "text": msg.content}],
-            })
-        elif isinstance(msg, OpenAIAssistantMessage):
-            text = msg.content or ""
-            if text:
-                input_messages.append({
-                    "role": "assistant",
-                    "content": [{"type": "output_text", "text": text}],
-                })
-        else:
-            text = getattr(msg, "content", "") or ""
-            if text:
-                input_messages.append({
-                    "role": "user",
-                    "content": [{"type": "input_text", "text": text}],
-                })
-
-    body: dict = {
-        "model": model,
-        "store": False,
-        "stream": True,
-        "text": {"verbosity": "medium"},
-        "include": ["reasoning.encrypted_content"],
-        "tool_choice": tool_choice if tool_choice is not None else "auto",
-        "parallel_tool_calls": True,
-    }
-    if instructions:
-        body["instructions"] = instructions
-    if input_messages:
-        body["input"] = input_messages
-    if tools:
-        body["tools"] = tools
-
-    return body
-
-
-def _event_to_dict(event: Any) -> dict:
-    """Convert SDK event to dict."""
-    if hasattr(event, "model_dump"):
-        return event.model_dump()
-    return {
-        "type": getattr(event, "type", None),
-        "delta": getattr(event, "delta", None),
-        "item": getattr(event, "item", None),
-        "message": getattr(event, "message", None),
-        "arguments": getattr(event, "arguments", None),
-        "name": getattr(event, "name", None),
-    }
-
-
-async def _stream_raw(
-    client: AsyncOpenAI,
-    model: str,
-    messages: List[LLMMessage],
-    tools: Optional[List[dict]] = None,
-    tool_choice: Optional[Union[str, dict]] = None,
-) -> AsyncGenerator[dict, None]:
-    """Yield raw SSE event dicts from Codex Responses API."""
-    body = _build_body(model, messages, tools, tool_choice=tool_choice)
-    create_kwargs = {k: v for k, v in body.items() if k != "stream"}
-
-    try:
-        stream = await client.responses.create(stream=True, **create_kwargs)
-    except (APIStatusError, OpenAIError) as e:
-        status = getattr(e, "status_code", 502)
-        detail = getattr(e, "message", str(e)) or str(e)
-        raise HTTPException(
-            status_code=status,
-            detail=f"Codex API error: {detail}"[:400],
-        ) from e
-
-    async for event in stream:
-        yield _event_to_dict(event)
-
-
-class CodexLLMAdapter:
-    """Stateless adapter for Codex Responses API. Matches other providers: generate/stream + tool recursion."""
-
-    @staticmethod
-    async def generate_codex(
-        client: AsyncOpenAI,
-        model: str,
-        messages: List[LLMMessage],
-        tool_calls_handler: Any,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> Optional[str]:
-        """Generate text; on tool_calls handle and recurse (like _generate_openai / _generate_anthropic)."""
-        print(
-            f"Codex generate: model={model} depth={depth} tools_count={len(tools) if tools else 0}"
-        )
-        responses_tools = _to_responses_tools(tools) if tools else None
-        text_parts: List[str] = []
-        tool_calls_by_id: dict[str, dict] = {}
-
-        async for event in _stream_raw(client, model, messages, responses_tools, tool_choice=None):
-            event_type = event.get("type", "")
-
-            if event_type == "response.output_text.delta":
-                delta = event.get("delta", "")
-                if delta:
-                    text_parts.append(delta)
-            elif event_type == "response.output_item.done":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call":
-                    tool_calls_by_id[item.get("call_id", item.get("id", ""))] = item
-            elif event_type in ("response.failed", "error"):
-                msg_text = event.get("message") or str(event)
-                raise HTTPException(status_code=502, detail=f"Codex error: {msg_text}")
-
-        if tool_calls_by_id and tools and depth < MAX_RECURSION_DEPTH:
-            print(
-                f"Codex generate: tool calls detected depth={depth} count={len(tool_calls_by_id)}"
-            )
-            new_messages = await _messages_after_tool_turn(
-                messages, tool_calls_by_id, tool_calls_handler
-            )
-            return await CodexLLMAdapter.generate_codex(
-                client, model, new_messages, tool_calls_handler,
-                max_tokens=max_tokens, tools=tools, depth=depth + 1,
-            )
-
-        return "".join(text_parts) or None
-
-    @staticmethod
-    async def stream_codex(
-        client: AsyncOpenAI,
-        model: str,
-        messages: List[LLMMessage],
-        tool_calls_handler: Any,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> AsyncGenerator[str, None]:
-        """Stream text deltas; on tool_calls handle and recurse (like _stream_openai)."""
-        print(
-            f"Codex stream: model={model} depth={depth} tools_count={len(tools) if tools else 0}"
-        )
-        responses_tools = _to_responses_tools(tools) if tools else None
-        tool_calls_by_id: dict[str, dict] = {}
-
-        async for event in _stream_raw(client, model, messages, responses_tools, tool_choice=None):
-            event_type = event.get("type", "")
-
-            if event_type == "response.output_text.delta":
-                delta = event.get("delta", "")
-                if delta:
-                    yield delta
-            elif event_type == "response.output_item.done":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call":
-                    tool_calls_by_id[item.get("call_id", item.get("id", ""))] = item
-            elif event_type in ("response.failed", "error"):
-                msg_text = event.get("message") or str(event)
-                raise HTTPException(status_code=502, detail=f"Codex stream error: {msg_text}")
-
-        if tool_calls_by_id and tools and depth < MAX_RECURSION_DEPTH:
-            print(
-                f"Codex stream: tool calls detected depth={depth} count={len(tool_calls_by_id)}"
-            )
-            new_messages = await _messages_after_tool_turn(
-                messages, tool_calls_by_id, tool_calls_handler
-            )
-            async for chunk in CodexLLMAdapter.stream_codex(
-                client, model, new_messages, tool_calls_handler,
-                max_tokens=max_tokens, tools=tools, depth=depth + 1,
-            ):
-                yield chunk
-
-    @staticmethod
-    async def stream_codex_structured(
-        client: AsyncOpenAI,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        tool_calls_handler: Any,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> AsyncGenerator[str, None]:
-        """Stream JSON chunks from ResponseSchema tool; recurse for other tool_calls.
-
-        Structured output is achieved by always adding an internal ResponseSchema "tool"
-        (with response_format as its parameters) and tool_choice=ResponseSchema. So
-        user_tools=0 only means no extra tools like web search; we still use the
-        ResponseSchema tool to receive the model's JSON.
-        """
-        user_tools_count = len(tools) if tools else 0
-        print(
-            f"Codex stream_structured: model={model} depth={depth} strict={strict} "
-            f"user_tools={user_tools_count} (always adding ResponseSchema tool for structured JSON)"
-        )
-        schema = ensure_strict_json_schema(response_format, path=(), root=response_format) if strict and depth == 0 else response_format
-        response_schema_tool = {
-            "type": "function",
-            "name": RESPONSE_SCHEMA_NAME,
-            "description": "Provide response to the user",
-            "parameters": schema,
-        }
-        all_tools: List[dict] = [response_schema_tool]
-        if tools:
-            all_tools.extend(_to_responses_tools(tools))
-
-        tool_calls_by_id: dict[str, dict] = {}
-        current_call_id: Optional[str] = None
-
-        async for event in _stream_raw(
-            client, model, messages, all_tools, tool_choice=STRUCTURED_TOOL_CHOICE
-        ):
-            event_type = event.get("type", "")
-
-            if event_type == "response.output_item.added":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call" and item.get("name") == RESPONSE_SCHEMA_NAME:
-                    current_call_id = item.get("call_id", item.get("id"))
-                    print(
-                        f"Codex stream_structured: ResponseSchema call started call_id={current_call_id}"
-                    )
-
-            elif event_type == "response.function_call_arguments.delta":
-                if current_call_id is not None:
-                    delta = event.get("delta", "")
-                    if delta:
-                        # Log only first few chunks to avoid log spam
-                        print(
-                            f"Codex stream_structured: ResponseSchema delta chunk len={len(delta)}"
-                        )
-                        yield delta
-
-            elif event_type == "response.function_call_arguments.done":
-                if event.get("name") == RESPONSE_SCHEMA_NAME:
-                    arguments = event.get("arguments", "")
-                    if arguments:
-                        print(
-                            f"Codex stream_structured: ResponseSchema arguments.done len={len(arguments)}"
-                        )
-                        yield arguments
-
-            elif event_type == "response.output_item.done":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call":
-                    tool_calls_by_id[item.get("call_id", item.get("id", ""))] = item
-                    if item.get("name") == RESPONSE_SCHEMA_NAME:
-                        arguments = item.get("arguments", "")
-                        if arguments:
-                            print(
-                                f"Codex stream_structured: ResponseSchema output_item.done len={len(arguments)}"
-                            )
-                            yield arguments
-
-            elif event_type in ("response.failed", "error"):
-                msg_text = event.get("message") or str(event)
-                raise HTTPException(status_code=502, detail=f"Codex structured error: {msg_text}")
-
-        other_tool_calls = {
-            k: v for k, v in tool_calls_by_id.items()
-            if v.get("name") != RESPONSE_SCHEMA_NAME
-        }
-        if other_tool_calls and tools and depth < MAX_RECURSION_DEPTH:
-            print(
-                f"Codex stream_structured: recursing for non-ResponseSchema tool calls "
-                f"depth={depth} count={len(other_tool_calls)}"
-            )
-            new_messages = await _messages_after_tool_turn(
-                messages, other_tool_calls, tool_calls_handler
-            )
-            async for chunk in CodexLLMAdapter.stream_codex_structured(
-                client, model, new_messages, response_format, tool_calls_handler,
-                strict=strict, max_tokens=max_tokens, tools=tools, depth=depth + 1,
-            ):
-                yield chunk
-
-    @staticmethod
-    async def generate_codex_structured(
-        client: AsyncOpenAI,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        tool_calls_handler: Any,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> Optional[dict]:
-        """Collect stream and parse JSON (like _generate_openai_structured)."""
-        user_tools_count = len(tools) if tools else 0
-        print(
-            f"Codex generate_structured: model={model} depth={depth} strict={strict} "
-            f"user_tools={user_tools_count} (using ResponseSchema tool for structured JSON)"
-        )
-        accumulated: List[str] = []
-        async for chunk in CodexLLMAdapter.stream_codex_structured(
-            client, model, messages, response_format, tool_calls_handler,
-            strict=strict, max_tokens=max_tokens, tools=tools, depth=depth,
-        ):
-            accumulated.append(chunk)
-
-        raw = "".join(accumulated)
-        if not raw:
-            return None
-
-        if depth == 0:
-            try:
-                parsed = dict(dirtyjson.loads(raw))
-                print(
-                    f"Codex generate_structured: parsed JSON keys={list(parsed.keys())[:8]}"
-                )
-                return parsed
-            except Exception:
-                start = raw.find("{")
-                if start >= 0:
-                    try:
-                        parsed = dict(dirtyjson.loads(raw[start:]))
-                        print(
-                            "Codex generate_structured: parsed JSON from offset "
-                            f"{start} keys={list(parsed.keys())[:8]}"
-                        )
-                        return parsed
-                    except Exception:
-                        pass
-                raise HTTPException(
-                    status_code=502,
-                    detail=(
-                        "Model did not return valid structured output (expected JSON from ResponseSchema). "
-                        "Please retry."
-                    ),
-                )
-
-        return None
diff --git a/servers/fastapi/services/llm_client.py b/servers/fastapi/services/llm_client.py
deleted file mode 100644
index a62e8098..00000000
--- a/servers/fastapi/services/llm_client.py
+++ /dev/null
@@ -1,2366 +0,0 @@
-import asyncio
-import dirtyjson
-import json
-from typing import AsyncGenerator, List, Optional, Dict, Any
-from fastapi import HTTPException
-from openai import APIStatusError, AsyncOpenAI, OpenAIError
-from openai.types.chat.chat_completion_chunk import (
-    ChatCompletionChunk as OpenAIChatCompletionChunk,
-)
-from google import genai
-from google.genai.types import Content as GoogleContent, Part as GoogleContentPart
-from google.genai.types import (
-    GenerateContentConfig,
-    GoogleSearch,
-    ToolConfig as GoogleToolConfig,
-    FunctionCallingConfig as GoogleFunctionCallingConfig,
-    FunctionCallingConfigMode as GoogleFunctionCallingConfigMode,
-)
-from google.genai.types import Tool as GoogleTool
-from anthropic import AsyncAnthropic
-from anthropic.types import Message as AnthropicMessage
-from anthropic import MessageStreamEvent as AnthropicMessageStreamEvent
-from enums.llm_provider import LLMProvider
-from models.llm_message import (
-    AnthropicAssistantMessage,
-    AnthropicUserMessage,
-    GoogleAssistantMessage,
-    GoogleToolCallMessage,
-    OpenAIAssistantMessage,
-    LLMMessage,
-    LLMSystemMessage,
-    LLMUserMessage,
-)
-from models.llm_tool_call import (
-    AnthropicToolCall,
-    GoogleToolCall,
-    LLMToolCall,
-    OpenAIToolCall,
-    OpenAIToolCallFunction,
-)
-from models.llm_tools import LLMDynamicTool, LLMTool
-from services.llm_tool_calls_handler import LLMToolCallsHandler
-from utils.async_iterator import iterator_to_async
-from utils.dummy_functions import do_nothing_async
-from utils.get_env import (
-    get_anthropic_api_key_env,
-    get_codex_access_token_env,
-    get_codex_account_id_env,
-    get_codex_refresh_token_env,
-    get_codex_token_expires_env,
-    get_custom_llm_api_key_env,
-    get_custom_llm_url_env,
-    get_disable_thinking_env,
-    get_google_api_key_env,
-    get_ollama_url_env,
-    get_openai_api_key_env,
-    get_tool_calls_env,
-    get_web_grounding_env,
-)
-from utils.set_env import (
-    set_codex_access_token_env,
-    set_codex_account_id_env,
-    set_codex_refresh_token_env,
-    set_codex_token_expires_env,
-)
-from utils.llm_provider import get_llm_provider, get_model
-from utils.parsers import parse_bool_or_none
-from utils.schema_utils import (
-    ensure_array_schemas_have_items,
-    ensure_strict_json_schema,
-    flatten_json_schema,
-    remove_titles_from_schema,
-)
-
-
-
-class LLMClient:
-    def __init__(self):
-        self.llm_provider = get_llm_provider()
-        self._client = self._get_client()
-        self.tool_calls_handler = LLMToolCallsHandler(self)
-
-    # ? Use tool calls
-    def use_tool_calls_for_structured_output(self) -> bool:
-        if self.llm_provider != LLMProvider.CUSTOM:
-            return False
-        return parse_bool_or_none(get_tool_calls_env()) or False
-
-    # ? Web Grounding
-    def enable_web_grounding(self) -> bool:
-        if (
-            self.llm_provider == LLMProvider.OLLAMA
-            or self.llm_provider == LLMProvider.CUSTOM
-            or self.llm_provider == LLMProvider.CODEX
-        ):
-            return False
-        return parse_bool_or_none(get_web_grounding_env()) or False
-
-    # ? Disable thinking
-    def disable_thinking(self) -> bool:
-        return parse_bool_or_none(get_disable_thinking_env()) or False
-
-    # ? Clients
-    def _get_client(self):
-        match self.llm_provider:
-            case LLMProvider.OPENAI:
-                return self._get_openai_client()
-            case LLMProvider.GOOGLE:
-                return self._get_google_client()
-            case LLMProvider.ANTHROPIC:
-                return self._get_anthropic_client()
-            case LLMProvider.OLLAMA:
-                return self._get_ollama_client()
-            case LLMProvider.CUSTOM:
-                return self._get_custom_client()
-            case LLMProvider.CODEX:
-                return self._get_codex_client()
-            case _:
-                raise HTTPException(
-                    status_code=400,
-                    detail="LLM Provider must be either openai, google, anthropic, ollama, custom, or codex",
-                )
-
-    def _get_openai_client(self):
-        if not get_openai_api_key_env():
-            raise HTTPException(
-                status_code=400,
-                detail="OpenAI API Key is not set",
-            )
-        return AsyncOpenAI()
-
-    def _get_google_client(self):
-        if not get_google_api_key_env():
-            raise HTTPException(
-                status_code=400,
-                detail="Google API Key is not set",
-            )
-        return genai.Client()
-
-    def _get_anthropic_client(self):
-        if not get_anthropic_api_key_env():
-            raise HTTPException(
-                status_code=400,
-                detail="Anthropic API Key is not set",
-            )
-        return AsyncAnthropic()
-
-    def _get_ollama_client(self):
-        return AsyncOpenAI(
-            base_url=(get_ollama_url_env() or "http://localhost:11434") + "/v1",
-            api_key="ollama",
-        )
-
-    def _get_custom_client(self):
-        if not get_custom_llm_url_env():
-            raise HTTPException(
-                status_code=400,
-                detail="Custom LLM URL is not set",
-            )
-        return AsyncOpenAI(
-            base_url=get_custom_llm_url_env(),
-            api_key=get_custom_llm_api_key_env() or "null",
-        )
-
-    def _get_codex_headers(self) -> dict:
-        """Return the HTTP headers required for Codex Responses API requests.
-
-        Handles token auto-refresh if the stored token is expired or within
-        60 s of expiry before building the header dict.
-        """
-        access_token = get_codex_access_token_env()
-        if not access_token:
-            raise HTTPException(
-                status_code=400,
-                detail="Codex OAuth access token is not set. Please authenticate via /api/v1/ppt/codex/auth/initiate",
-            )
-
-        # Auto-refresh if the token is expired or about to expire (within 60 s)
-        expires_str = get_codex_token_expires_env()
-        if expires_str:
-            try:
-                expires_ms = int(expires_str)
-                now_ms = int(__import__("time").time() * 1000)
-                if now_ms >= expires_ms - 60_000:
-                    refresh_token = get_codex_refresh_token_env()
-                    if refresh_token:
-                        from utils.oauth.openai_codex import (
-                            get_account_id,
-                            refresh_access_token,
-                            TokenSuccess,
-                        )
-                        result = refresh_access_token(refresh_token)
-                        if isinstance(result, TokenSuccess):
-                            set_codex_access_token_env(result.access)
-                            set_codex_refresh_token_env(result.refresh)
-                            set_codex_token_expires_env(str(result.expires))
-                            account_id = get_account_id(result.access)
-                            if account_id:
-                                set_codex_account_id_env(account_id)
-                            access_token = result.access
-            except (ValueError, TypeError):
-                pass
-
-        account_id = get_codex_account_id_env() or ""
-        return {
-            "Authorization": f"Bearer {access_token}",
-            "chatgpt-account-id": account_id,
-            "OpenAI-Beta": "responses=experimental",
-            "originator": "pi",
-            "content-type": "application/json",
-            "accept": "text/event-stream",
-        }
-
-    def _get_codex_client(self) -> AsyncOpenAI:
-        """Return an AsyncOpenAI client configured for the Codex Responses API.
-        Client is built per call so headers/token are fresh after refresh.
-        Only Codex-specific headers are passed; content-type and accept are left
-        to the SDK so the server does not reject the request.
-        """
-        headers = self._get_codex_headers()
-        access_token = (headers.get("Authorization") or "").replace("Bearer ", "").strip()
-        skip = {"authorization", "content-type", "accept"}
-        default_headers = {
-            k: v for k, v in headers.items() if k.lower() not in skip
-        }
-        return AsyncOpenAI(
-            base_url="https://chatgpt.com/backend-api/codex",
-            api_key=access_token or "codex",
-            default_headers=default_headers,
-            timeout=120.0,
-        )
-
-    # ? Prompts
-    def _get_system_prompt(self, messages: List[LLMMessage]) -> str:
-        for message in messages:
-            if isinstance(message, LLMSystemMessage):
-                return message.content
-        return ""
-
-    def _get_google_messages(self, messages: List[LLMMessage]) -> List[GoogleContent]:
-        contents = []
-        for message in messages:
-            if isinstance(message, LLMUserMessage):
-                contents.append(
-                    GoogleContent(
-                        role=message.role,
-                        parts=[GoogleContentPart(text=message.content)],
-                    )
-                )
-            elif isinstance(message, GoogleAssistantMessage):
-                contents.append(message.content)
-            elif isinstance(message, GoogleToolCallMessage):
-                contents.append(
-                    GoogleContent(
-                        role="user",
-                        parts=[
-                            GoogleContentPart.from_function_response(
-                                name=message.name,
-                                response=message.response,
-                            )
-                        ],
-                    )
-                )
-
-        return contents
-
-    def _get_anthropic_messages(self, messages: List[LLMMessage]) -> List[LLMMessage]:
-        return [
-            message for message in messages if not isinstance(message, LLMSystemMessage)
-        ]
-
-    # ? Generate Unstructured Content
-    async def _generate_openai(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        extra_body: Optional[dict] = None,
-        depth: int = 0,
-    ) -> str | None:
-        client: AsyncOpenAI = self._client
-        response = await client.chat.completions.create(
-            model=model,
-            messages=[message.model_dump() for message in messages],
-            max_completion_tokens=max_tokens,
-            tools=tools,
-            extra_body=extra_body,
-        )
-
-        if len(response.choices) == 0:
-            return None
-
-        tool_calls = response.choices[0].message.tool_calls
-        if tool_calls:
-            parsed_tool_calls = [
-                OpenAIToolCall(
-                    id=tool_call.id,
-                    type=tool_call.type,
-                    function=OpenAIToolCallFunction(
-                        name=tool_call.function.name,
-                        arguments=tool_call.function.arguments,
-                    ),
-                )
-                for tool_call in tool_calls
-            ]
-            tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai(
-                parsed_tool_calls
-            )
-            assistant_message = OpenAIAssistantMessage(
-                role="assistant",
-                content=response.choices[0].message.content,
-                tool_calls=[tool_call.model_dump() for tool_call in parsed_tool_calls],
-            )
-            new_messages = [
-                *messages,
-                assistant_message,
-                *tool_call_messages,
-            ]
-            return await self._generate_openai(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                tools=tools,
-                extra_body=extra_body,
-                depth=depth + 1,
-            )
-
-        return response.choices[0].message.content
-
-    async def _generate_google(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        tools: Optional[List[dict]] = None,
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ) -> str | None:
-        client: genai.Client = self._client
-
-        google_tools = None
-        if tools:
-            google_tools = [GoogleTool(function_declarations=[tool]) for tool in tools]
-
-        response = await asyncio.to_thread(
-            client.models.generate_content,
-            model=model,
-            contents=self._get_google_messages(messages),
-            config=GenerateContentConfig(
-                tools=google_tools,
-                system_instruction=self._get_system_prompt(messages),
-                response_mime_type="text/plain",
-                max_output_tokens=max_tokens,
-            ),
-        )
-
-        content = response.candidates[0].content
-        response_parts = content.parts
-
-        if not response_parts:
-            return None
-
-        text_content = None
-        tool_calls = []
-        for each_part in response_parts:
-            if each_part.function_call:
-                tool_calls.append(
-                    GoogleToolCall(
-                        id=each_part.function_call.id,
-                        name=each_part.function_call.name,
-                        arguments=each_part.function_call.args,
-                    )
-                )
-            if each_part.text:
-                text_content = each_part.text
-
-        if tool_calls:
-            tool_call_messages = await self.tool_calls_handler.handle_tool_calls_google(
-                tool_calls
-            )
-            new_messages = [
-                *messages,
-                GoogleAssistantMessage(
-                    role="assistant",
-                    content=content,
-                ),
-                *tool_call_messages,
-            ]
-            return await self._generate_google(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                tools=tools,
-                depth=depth + 1,
-            )
-
-        return text_content
-
-    async def _generate_anthropic(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> str | None:
-        client: AsyncAnthropic = self._client
-
-        response: AnthropicMessage = await client.messages.create(
-            model=model,
-            system=self._get_system_prompt(messages),
-            messages=[
-                message.model_dump()
-                for message in self._get_anthropic_messages(messages)
-            ],
-            tools=tools,
-            max_tokens=max_tokens or 4000,
-        )
-        text_content = None
-        tool_calls: List[AnthropicToolCall] = []
-        for content in response.content:
-            if content.type == "text" and isinstance(content.text, str):
-                text_content = content.text
-
-            if content.type == "tool_use":
-                tool_calls.append(
-                    AnthropicToolCall(
-                        id=content.id,
-                        type=content.type,
-                        name=content.name,
-                        input=content.input,
-                    )
-                )
-
-        if tool_calls:
-            tool_call_messages = (
-                await self.tool_calls_handler.handle_tool_calls_anthropic(tool_calls)
-            )
-            new_messages = [
-                *messages,
-                AnthropicAssistantMessage(
-                    role="assistant",
-                    content=[each.model_dump() for each in tool_calls],
-                ),
-                AnthropicUserMessage(
-                    role="user",
-                    content=[each.model_dump() for each in tool_call_messages],
-                ),
-            ]
-            return await self._generate_anthropic(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                tools=tools,
-                depth=depth + 1,
-            )
-
-        return text_content
-
-    async def _generate_ollama(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ):
-        return await self._generate_openai(
-            model=model, messages=messages, max_tokens=max_tokens, depth=depth
-        )
-
-    async def _generate_custom(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ):
-        extra_body = {"enable_thinking": False} if self.disable_thinking() else None
-        return await self._generate_openai(
-            model=model,
-            messages=messages,
-            max_tokens=max_tokens,
-            extra_body=extra_body,
-            depth=depth,
-        )
-
-    async def _generate_codex(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> Optional[str]:
-        """
-        Generate plain text using the Codex Responses API. On tool calls, run
-        handlers and recurse (same pattern as _generate_openai).
-        """
-        _MAX_RECURSION_DEPTH = 5
-        client: AsyncOpenAI = self._client
-
-        # Flatten tools to Responses API format
-        responses_tools: Optional[List[dict]] = None
-        if tools:
-            responses_tools = []
-            for tool in tools:
-                fn = (tool.get("function") or tool) if isinstance(tool, dict) else {}
-                if isinstance(fn, dict):
-                    responses_tools.append({
-                        "type": "function",
-                        "name": fn.get("name", ""),
-                        "description": fn.get("description", ""),
-                        "parameters": fn.get("parameters", {}),
-                    })
-                else:
-                    responses_tools.append(tool)
-
-        # Build instructions + input (same shape as _stream_codex_structured)
-        instructions = self._get_system_prompt(messages) or None
-        input_payload: List[Dict[str, Any]] = []
-        for m in messages:
-            if isinstance(m, LLMSystemMessage):
-                continue
-            if isinstance(m, LLMUserMessage):
-                input_payload.append({
-                    "role": "user",
-                    "content": [{"type": "input_text", "text": m.content}],
-                })
-            elif isinstance(m, OpenAIAssistantMessage):
-                text = m.content or ""
-                if text:
-                    input_payload.append({
-                        "role": "assistant",
-                        "content": [{"type": "output_text", "text": text}],
-                    })
-            else:
-                text = getattr(m, "content", "") or ""
-                if text:
-                    input_payload.append({
-                        "role": "user",
-                        "content": [{"type": "input_text", "text": text}],
-                    })
-
-        create_kwargs: Dict[str, Any] = {
-            "model": model,
-            "store": False,
-            "stream": True,
-            "text": {"verbosity": "medium"},
-            "include": ["reasoning.encrypted_content"],
-            "tool_choice": "auto",
-            "parallel_tool_calls": True,
-        }
-        if instructions:
-            create_kwargs["instructions"] = instructions
-        if input_payload:
-            create_kwargs["input"] = input_payload
-        if responses_tools:
-            create_kwargs["tools"] = responses_tools
-        if max_tokens is not None:
-            create_kwargs["max_output_tokens"] = max_tokens
-
-        stream = await client.responses.create(**create_kwargs)
-
-        def _event_dict(ev: Any) -> dict:
-            if hasattr(ev, "model_dump"):
-                return ev.model_dump()
-            return {
-                "type": getattr(ev, "type", None),
-                "delta": getattr(ev, "delta", None),
-                "item": getattr(ev, "item", None),
-                "message": getattr(ev, "message", None),
-            }
-
-        text_parts: List[str] = []
-        tool_calls_by_id: Dict[str, Dict[str, Any]] = {}
-
-        async for ev in stream:
-            event = _event_dict(ev) if not isinstance(ev, dict) else ev
-            event_type = event.get("type") or ""
-
-            if event_type == "response.output_text.delta":
-                delta = event.get("delta") or ""
-                if delta:
-                    text_parts.append(delta)
-            elif event_type == "response.output_item.done":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call":
-                    cid = item.get("call_id") or item.get("id", "")
-                    tool_calls_by_id[cid] = item
-            elif event_type in ("response.error", "response.failed", "error"):
-                err = event.get("message") or event.get("error") or str(event)
-                raise HTTPException(status_code=502, detail=f"Codex error: {err}"[:400])
-
-        if tool_calls_by_id and responses_tools and depth < _MAX_RECURSION_DEPTH:
-            parsed_tool_calls = [
-                OpenAIToolCall(
-                    id=cid,
-                    type="function",
-                    function=OpenAIToolCallFunction(
-                        name=data.get("name", ""),
-                        arguments=data.get("arguments", ""),
-                    ),
-                )
-                for cid, data in tool_calls_by_id.items()
-            ]
-            tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai(
-                parsed_tool_calls
-            )
-            new_messages = [
-                *messages,
-                OpenAIAssistantMessage(
-                    role="assistant",
-                    content=None,
-                    tool_calls=[tc.model_dump() for tc in parsed_tool_calls],
-                ),
-                *tool_call_messages,
-            ]
-            return await self._generate_codex(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                tools=tools,
-                depth=depth + 1,
-            )
-
-        return "".join(text_parts) or None
-
-    async def generate(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[type[LLMTool] | LLMDynamicTool]] = None,
-    ):
-        parsed_tools = self.tool_calls_handler.parse_tools(tools)
-
-        content = None
-        match self.llm_provider:
-            case LLMProvider.OPENAI:
-                content = await self._generate_openai(
-                    model=model,
-                    messages=messages,
-                    max_tokens=max_tokens,
-                    tools=parsed_tools,
-                )
-            case LLMProvider.CODEX:
-                content = await self._generate_codex(
-                    model=model,
-                    messages=messages,
-                    max_tokens=max_tokens,
-                    tools=parsed_tools,
-                )
-            case LLMProvider.GOOGLE:
-                content = await self._generate_google(
-                    model=model,
-                    messages=messages,
-                    max_tokens=max_tokens,
-                    tools=parsed_tools,
-                )
-            case LLMProvider.ANTHROPIC:
-                content = await self._generate_anthropic(
-                    model=model,
-                    messages=messages,
-                    max_tokens=max_tokens,
-                    tools=parsed_tools,
-                )
-            case LLMProvider.OLLAMA:
-                content = await self._generate_ollama(
-                    model=model, messages=messages, max_tokens=max_tokens
-                )
-            case LLMProvider.CUSTOM:
-                content = await self._generate_custom(
-                    model=model, messages=messages, max_tokens=max_tokens
-                )
-        if content is None:
-            raise HTTPException(
-                status_code=400,
-                detail="LLM did not return any content",
-            )
-        return content
-
-    # ? Generate Structured Content
-    async def _generate_openai_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        extra_body: Optional[dict] = None,
-        depth: int = 0,
-    ) -> dict | None:
-        client: AsyncOpenAI = self._client
-        response_schema = response_format
-        all_tools = [*tools] if tools else None
-
-        use_tool_calls_for_structured_output = (
-            self.use_tool_calls_for_structured_output()
-        )
-        if strict and depth == 0:
-            response_schema = ensure_strict_json_schema(
-                response_schema,
-                path=(),
-                root=response_schema,
-            )
-        response_schema = ensure_array_schemas_have_items(response_schema)
-        if use_tool_calls_for_structured_output and depth == 0:
-            if all_tools is None:
-                all_tools = []
-            all_tools.append(
-                self.tool_calls_handler.parse_tool(
-                    LLMDynamicTool(
-                        name="ResponseSchema",
-                        description="Provide response to the user",
-                        parameters=response_schema,
-                        handler=do_nothing_async,
-                    ),
-                    strict=strict,
-                )
-            )
-
-        response = await client.chat.completions.create(
-            model=model,
-            messages=[message.model_dump() for message in messages],
-            response_format=(
-                {
-                    "type": "json_schema",
-                    "json_schema": (
-                        {
-                            "name": "ResponseSchema",
-                            "strict": strict,
-                            "schema": response_schema,
-                        }
-                    ),
-                }
-                if not use_tool_calls_for_structured_output
-                else None
-            ),
-            max_completion_tokens=max_tokens,
-            tools=all_tools,
-            extra_body=extra_body,
-        )
-
-        if len(response.choices) == 0:
-            return None
-
-        content = response.choices[0].message.content
-
-        tool_calls = response.choices[0].message.tool_calls
-        has_response_schema = False
-
-        if tool_calls:
-            for tool_call in tool_calls:
-                if tool_call.function.name == "ResponseSchema":
-                    content = tool_call.function.arguments
-                    has_response_schema = True
-
-            if not has_response_schema:
-                parsed_tool_calls = [
-                    OpenAIToolCall(
-                        id=tool_call.id,
-                        type=tool_call.type,
-                        function=OpenAIToolCallFunction(
-                            name=tool_call.function.name,
-                            arguments=tool_call.function.arguments,
-                        ),
-                    )
-                    for tool_call in tool_calls
-                ]
-                tool_call_messages = (
-                    await self.tool_calls_handler.handle_tool_calls_openai(
-                        parsed_tool_calls
-                    )
-                )
-                new_messages = [
-                    *messages,
-                    OpenAIAssistantMessage(
-                        role="assistant",
-                        content=response.choices[0].message.content,
-                        tool_calls=[each.model_dump() for each in parsed_tool_calls],
-                    ),
-                    *tool_call_messages,
-                ]
-                content = await self._generate_openai_structured(
-                    model=model,
-                    messages=new_messages,
-                    response_format=response_schema,
-                    strict=strict,
-                    max_tokens=max_tokens,
-                    tools=all_tools,
-                    extra_body=extra_body,
-                    depth=depth + 1,
-                )
-        if content:
-            if depth == 0:
-                return dict(dirtyjson.loads(content))
-            return content
-        return None
-
-    async def _generate_codex_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        extra_body: Optional[dict] = None,
-        depth: int = 0,
-    ) -> dict | None:
-        """
-        Generate structured Codex output using the Responses API.
-
-        This reuses the streaming Codex structured implementation and simply
-        accumulates the streamed JSON chunks into a single string, then parses
-        it at the root call.
-        """
-        # Reuse the Responses API streaming implementation for Codex.
-        accumulated: List[str] = []
-        async for chunk in self._stream_codex_structured(
-            model=model,
-            messages=messages,
-            response_format=response_format,
-            strict=strict,
-            max_tokens=max_tokens,
-            tools=tools,
-            extra_body=extra_body,
-            depth=depth,
-        ):
-            accumulated.append(chunk)
-
-        raw = "".join(accumulated)
-        if not raw:
-            return None
-
-        # At the root level we parse into a dict; recursive calls just
-        # propagate the raw JSON/text, mirroring other providers.
-        if depth == 0:
-            return dict(dirtyjson.loads(raw))
-        return {"raw": raw}
-
-    async def _generate_google_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> dict | None:
-        client: genai.Client = self._client
-
-        google_tools = None
-        if tools:
-            google_tools = [GoogleTool(function_declarations=[tool]) for tool in tools]
-            google_tools.append(
-                GoogleTool(
-                    function_declarations=[
-                        {
-                            "name": "ResponseSchema",
-                            "description": "Provide response to the user",
-                            "parameters": remove_titles_from_schema(
-                                flatten_json_schema(response_format)
-                            ),
-                        }
-                    ]
-                )
-            )
-
-        response = await asyncio.to_thread(
-            client.models.generate_content,
-            model=model,
-            contents=self._get_google_messages(messages),
-            config=GenerateContentConfig(
-                tools=google_tools,
-                tool_config=(
-                    GoogleToolConfig(
-                        function_calling_config=GoogleFunctionCallingConfig(
-                            mode=GoogleFunctionCallingConfigMode.ANY,
-                        ),
-                    )
-                    if tools
-                    else None
-                ),
-                system_instruction=self._get_system_prompt(messages),
-                response_mime_type="application/json" if not tools else None,
-                response_json_schema=response_format if not tools else None,
-                max_output_tokens=max_tokens,
-            ),
-        )
-
-        content = response.candidates[0].content
-        response_parts = content.parts
-        text_content = None
-
-        if not response_parts:
-            return None
-
-        tool_calls: List[GoogleToolCall] = []
-        for each_part in response_parts:
-            if each_part.function_call:
-                tool_calls.append(
-                    GoogleToolCall(
-                        id=each_part.function_call.id,
-                        name=each_part.function_call.name,
-                        arguments=each_part.function_call.args,
-                    )
-                )
-
-            if each_part.text:
-                text_content = each_part.text
-
-        for each in tool_calls:
-            if each.name == "ResponseSchema":
-                return each.arguments
-
-        if tool_calls:
-            tool_call_messages = await self.tool_calls_handler.handle_tool_calls_google(
-                tool_calls
-            )
-            new_messages = [
-                *messages,
-                GoogleAssistantMessage(
-                    role="assistant",
-                    content=content,
-                ),
-                *tool_call_messages,
-            ]
-            return await self._generate_google_structured(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                response_format=response_format,
-                tools=tools,
-                depth=depth + 1,
-            )
-
-        if text_content:
-            return dict(dirtyjson.loads(text_content))
-        return None
-
-    async def _generate_anthropic_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        tools: Optional[List[dict]] = None,
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ):
-        client: AsyncAnthropic = self._client
-        response: AnthropicMessage = await client.messages.create(
-            model=model,
-            system=self._get_system_prompt(messages),
-            messages=[
-                message.model_dump()
-                for message in self._get_anthropic_messages(messages)
-            ],
-            max_tokens=max_tokens or 4000,
-            tools=[
-                {
-                    "name": "ResponseSchema",
-                    "description": "A response to the user's message",
-                    "input_schema": response_format,
-                },
-                *(tools or []),
-            ],
-        )
-        tool_calls: List[AnthropicToolCall] = []
-        text_parts: List[str] = []
-        for content in response.content:
-            if content.type == "text" and isinstance(content.text, str):
-                text_parts.append(content.text)
-            if content.type == "tool_use":
-                tool_calls.append(
-                    AnthropicToolCall(
-                        id=content.id,
-                        type=content.type,
-                        name=content.name,
-                        input=content.input,
-                    )
-                )
-
-        for each in tool_calls:
-            if each.name == "ResponseSchema":
-                return each.input
-
-        if tool_calls:
-            tool_call_messages = (
-                await self.tool_calls_handler.handle_tool_calls_anthropic(tool_calls)
-            )
-            new_messages = [
-                *messages,
-                AnthropicAssistantMessage(
-                    role="assistant",
-                    content=[each.model_dump() for each in tool_calls],
-                ),
-                AnthropicUserMessage(
-                    role="user",
-                    content=[each.model_dump() for each in tool_call_messages],
-                ),
-            ]
-            return await self._generate_anthropic_structured(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                response_format=response_format,
-                tools=tools,
-                depth=depth + 1,
-            )
-
-        text_content = "".join(text_parts).strip()
-        if text_content:
-            try:
-                return dict(dirtyjson.loads(text_content))
-            except Exception:
-                pass
-
-        if depth < 2:
-            await asyncio.sleep(0.4 * (depth + 1))
-            return await self._generate_anthropic_structured(
-                model=model,
-                messages=messages,
-                max_tokens=max_tokens,
-                response_format=response_format,
-                tools=tools,
-                depth=depth + 1,
-            )
-
-        return None
-
-    async def _generate_ollama_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ):
-        return await self._generate_openai_structured(
-            model=model,
-            messages=messages,
-            response_format=response_format,
-            strict=strict,
-            max_tokens=max_tokens,
-            depth=depth,
-        )
-
-    async def _generate_custom_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ):
-        extra_body = {"enable_thinking": False} if self.disable_thinking() else None
-        return await self._generate_openai_structured(
-            model=model,
-            messages=messages,
-            response_format=response_format,
-            strict=strict,
-            max_tokens=max_tokens,
-            extra_body=extra_body,
-            depth=depth,
-        )
-
-    async def generate_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        strict: bool = False,
-        tools: Optional[List[type[LLMTool] | LLMDynamicTool]] = None,
-        max_tokens: Optional[int] = None,
-    ) -> dict:
-        parsed_tools = self.tool_calls_handler.parse_tools(tools)
-
-        for attempt in range(3):
-            content = None
-            match self.llm_provider:
-                case LLMProvider.OPENAI:
-                    content = await self._generate_openai_structured(
-                        model=model,
-                        messages=messages,
-                        response_format=response_format,
-                        strict=strict,
-                        tools=parsed_tools,
-                        max_tokens=max_tokens,
-                    )
-                case LLMProvider.CODEX:
-                    content = await self._generate_codex_structured(
-                        model=model,
-                        messages=messages,
-                        response_format=response_format,
-                        strict=strict,
-                        tools=parsed_tools,
-                        max_tokens=max_tokens,
-                    )
-                case LLMProvider.GOOGLE:
-                    content = await self._generate_google_structured(
-                        model=model,
-                        messages=messages,
-                        response_format=response_format,
-                        tools=parsed_tools,
-                        max_tokens=max_tokens,
-                    )
-                case LLMProvider.ANTHROPIC:
-                    content = await self._generate_anthropic_structured(
-                        model=model,
-                        messages=messages,
-                        response_format=response_format,
-                        tools=parsed_tools,
-                        max_tokens=max_tokens,
-                    )
-                case LLMProvider.OLLAMA:
-                    content = await self._generate_ollama_structured(
-                        model=model,
-                        messages=messages,
-                        response_format=response_format,
-                        strict=strict,
-                        max_tokens=max_tokens,
-                    )
-                case LLMProvider.CUSTOM:
-                    content = await self._generate_custom_structured(
-                        model=model,
-                        messages=messages,
-                        response_format=response_format,
-                        strict=strict,
-                        max_tokens=max_tokens,
-                    )
-
-            if content is not None:
-                return content
-
-            if attempt < 2:
-                await asyncio.sleep(0.5 * (attempt + 1))
-
-        raise HTTPException(
-            status_code=400,
-            detail="LLM did not return any content",
-        )
-
-    # ? Stream Unstructured Content
-    async def _stream_openai(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        extra_body: Optional[dict] = None,
-        depth: int = 0,
-    ) -> AsyncGenerator[str, None]:
-        client: AsyncOpenAI = self._client
-
-        tool_calls: List[LLMToolCall] = []
-        current_index = 0
-        current_id = None
-        current_name = None
-        current_arguments = None
-        async for event in await client.chat.completions.create(
-            model=model,
-            messages=[message.model_dump() for message in messages],
-            max_completion_tokens=max_tokens,
-            tools=tools,
-            extra_body=extra_body,
-            stream=True,
-        ):
-            event: OpenAIChatCompletionChunk = event
-            if not event.choices:
-                continue
-
-            content_chunk = event.choices[0].delta.content
-            if content_chunk:
-                yield content_chunk
-
-            tool_call_chunk = event.choices[0].delta.tool_calls
-            if tool_call_chunk:
-                tool_index = tool_call_chunk[0].index
-                tool_id = tool_call_chunk[0].id
-                tool_name = tool_call_chunk[0].function.name
-                tool_arguments = tool_call_chunk[0].function.arguments
-
-                if current_index != tool_index:
-                    tool_calls.append(
-                        OpenAIToolCall(
-                            id=current_id,
-                            type="function",
-                            function=OpenAIToolCallFunction(
-                                name=current_name,
-                                arguments=current_arguments,
-                            ),
-                        )
-                    )
-                    current_index = tool_index
-                    current_id = tool_id
-                    current_name = tool_name
-                    current_arguments = tool_arguments
-                else:
-                    current_name = tool_name or current_name
-                    current_id = tool_id or current_id
-                    if current_arguments is None:
-                        current_arguments = tool_arguments
-                    elif tool_arguments:
-                        current_arguments += tool_arguments
-
-        if current_id is not None:
-            tool_calls.append(
-                OpenAIToolCall(
-                    id=current_id,
-                    type="function",
-                    function=OpenAIToolCallFunction(
-                        name=current_name,
-                        arguments=current_arguments,
-                    ),
-                )
-            )
-
-        if tool_calls:
-            tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai(
-                tool_calls
-            )
-            new_messages = [
-                *messages,
-                OpenAIAssistantMessage(
-                    role="assistant",
-                    content=None,
-                    tool_calls=[each.model_dump() for each in tool_calls],
-                ),
-                *tool_call_messages,
-            ]
-            async for event in self._stream_openai(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                tools=tools,
-                extra_body=extra_body,
-                depth=depth + 1,
-            ):
-                yield event
-
-    async def _stream_google(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        tools: Optional[List[dict]] = None,
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ) -> AsyncGenerator[str, None]:
-        client: genai.Client = self._client
-
-        google_tools = None
-        if tools:
-            google_tools = [GoogleTool(function_declarations=[tool]) for tool in tools]
-
-        generated_contents = []
-        tool_calls: List[GoogleToolCall] = []
-        async for event in iterator_to_async(client.models.generate_content_stream)(
-            model=model,
-            contents=self._get_google_messages(messages),
-            config=GenerateContentConfig(
-                system_instruction=self._get_system_prompt(messages),
-                response_mime_type="text/plain",
-                tools=google_tools,
-                max_output_tokens=max_tokens,
-            ),
-        ):
-            if not (
-                event.candidates
-                and event.candidates[0].content
-                and event.candidates[0].content.parts
-            ):
-                continue
-
-            generated_contents.append(event.candidates[0].content)
-
-            for each_part in event.candidates[0].content.parts:
-                if each_part.text:
-                    yield each_part.text
-
-                if each_part.function_call:
-                    tool_calls.append(
-                        GoogleToolCall(
-                            id=each_part.function_call.id,
-                            name=each_part.function_call.name,
-                            arguments=each_part.function_call.args,
-                        )
-                    )
-
-        if tool_calls:
-            tool_call_messages = await self.tool_calls_handler.handle_tool_calls_google(
-                tool_calls
-            )
-            new_messages = [
-                *messages,
-                *[
-                    GoogleAssistantMessage(
-                        role="assistant",
-                        content=each,
-                    )
-                    for each in generated_contents
-                ],
-                *tool_call_messages,
-            ]
-            async for event in self._stream_google(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                tools=tools,
-                depth=depth + 1,
-            ):
-                yield event
-
-    async def _stream_anthropic(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ):
-        client: AsyncAnthropic = self._client
-
-        tool_calls: List[AnthropicToolCall] = []
-        async with client.messages.stream(
-            model=model,
-            system=self._get_system_prompt(messages),
-            messages=[
-                message.model_dump()
-                for message in self._get_anthropic_messages(messages)
-            ],
-            max_tokens=max_tokens or 4000,
-            tools=tools,
-        ) as stream:
-            async for event in stream:
-                event: AnthropicMessageStreamEvent = event
-
-                if event.type == "text":
-                    yield event.text
-
-                if (
-                    event.type == "content_block_stop"
-                    and event.content_block.type == "tool_use"
-                ):
-                    tool_calls.append(
-                        AnthropicToolCall(
-                            id=event.content_block.id,
-                            type=event.content_block.type,
-                            name=event.content_block.name,
-                            input=event.content_block.input,
-                        )
-                    )
-
-        if tool_calls:
-            tool_call_messages = (
-                await self.tool_calls_handler.handle_tool_calls_anthropic(tool_calls)
-            )
-            new_messages = [
-                *messages,
-                AnthropicAssistantMessage(
-                    role="assistant",
-                    content=[each.model_dump() for each in tool_calls],
-                ),
-                AnthropicUserMessage(
-                    role="user",
-                    content=[each.model_dump() for each in tool_call_messages],
-                ),
-            ]
-            async for event in self._stream_anthropic(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                tools=tools,
-                depth=depth + 1,
-            ):
-                yield event
-
-    async def _stream_codex(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> AsyncGenerator[str, None]:
-        """
-        Stream plain text from Codex (Responses API). On tool calls, execute tools
-        and recurse, mirroring _stream_openai but using Responses events.
-        """
-        _MAX_RECURSION_DEPTH = 5
-        client: AsyncOpenAI = (
-            self._get_codex_client()
-            if self.llm_provider == LLMProvider.CODEX
-            else self._client
-        )
-
-        # Flatten tools to Responses API format
-        responses_tools: Optional[List[dict]] = None
-        if tools:
-            responses_tools = []
-            for tool in tools:
-                fn = (tool.get("function") or tool) if isinstance(tool, dict) else {}
-                if isinstance(fn, dict):
-                    responses_tools.append(
-                        {
-                            "type": "function",
-                            "name": fn.get("name", ""),
-                            "description": fn.get("description", ""),
-                            "parameters": fn.get("parameters", {}),
-                        }
-                    )
-                else:
-                    responses_tools.append(tool)
-
-        # Build instructions + input (same shape as _generate_codex/_stream_codex_structured)
-        instructions = self._get_system_prompt(messages) or None
-        input_payload: List[Dict[str, Any]] = []
-        for m in messages:
-            if isinstance(m, LLMSystemMessage):
-                continue
-            if isinstance(m, LLMUserMessage):
-                input_payload.append(
-                    {
-                        "role": "user",
-                        "content": [{"type": "input_text", "text": m.content}],
-                    }
-                )
-            elif isinstance(m, OpenAIAssistantMessage):
-                text = m.content or ""
-                if text:
-                    input_payload.append(
-                        {
-                            "role": "assistant",
-                            "content": [{"type": "output_text", "text": text}],
-                        }
-                    )
-            else:
-                text = getattr(m, "content", "") or ""
-                if text:
-                    input_payload.append(
-                        {
-                            "role": "user",
-                            "content": [{"type": "input_text", "text": text}],
-                        }
-                    )
-
-        create_kwargs: Dict[str, Any] = {
-            "model": model,
-            "store": False,
-            "stream": True,
-            "text": {"verbosity": "medium"},
-            "include": ["reasoning.encrypted_content"],
-            "tool_choice": "auto",
-            "parallel_tool_calls": True,
-        }
-        if instructions:
-            create_kwargs["instructions"] = instructions
-        if input_payload:
-            create_kwargs["input"] = input_payload
-        if responses_tools:
-            create_kwargs["tools"] = responses_tools
-        if max_tokens is not None:
-            create_kwargs["max_output_tokens"] = max_tokens
-
-        stream = await client.responses.create(**create_kwargs)
-
-        def _event_dict(ev: Any) -> dict:
-            if hasattr(ev, "model_dump"):
-                return ev.model_dump()
-            return {
-                "type": getattr(ev, "type", None),
-                "delta": getattr(ev, "delta", None),
-                "item": getattr(ev, "item", None),
-                "message": getattr(ev, "message", None),
-            }
-
-        tool_calls_by_id: Dict[str, Dict[str, Any]] = {}
-
-        async for ev in stream:
-            event = _event_dict(ev) if not isinstance(ev, dict) else ev
-            event_type = event.get("type") or ""
-
-            if event_type == "response.output_text.delta":
-                delta = event.get("delta") or ""
-                if delta:
-                    yield delta
-            elif event_type == "response.output_item.done":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call":
-                    cid = item.get("call_id") or item.get("id", "")
-                    tool_calls_by_id[cid] = item
-            elif event_type in ("response.error", "response.failed", "error"):
-                err = event.get("message") or event.get("error") or str(event)
-                raise HTTPException(status_code=502, detail=f"Codex stream error: {err}"[:400])
-
-        if tool_calls_by_id and responses_tools and depth < _MAX_RECURSION_DEPTH:
-            parsed_tool_calls = [
-                OpenAIToolCall(
-                    id=cid,
-                    type="function",
-                    function=OpenAIToolCallFunction(
-                        name=data.get("name", ""),
-                        arguments=data.get("arguments", ""),
-                    ),
-                )
-                for cid, data in tool_calls_by_id.items()
-            ]
-            tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai(
-                parsed_tool_calls
-            )
-            new_messages = [
-                *messages,
-                OpenAIAssistantMessage(
-                    role="assistant",
-                    content=None,
-                    tool_calls=[tc.model_dump() for tc in parsed_tool_calls],
-                ),
-                *tool_call_messages,
-            ]
-            async for chunk in self._stream_codex(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                tools=tools,
-                depth=depth + 1,
-            ):
-                yield chunk
-
-    def _stream_ollama(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ):
-        return self._stream_openai(
-            model=model, messages=messages, max_tokens=max_tokens, depth=depth
-        )
-
-    def _stream_custom(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ):
-        extra_body = {"enable_thinking": False} if self.disable_thinking() else None
-        return self._stream_openai(
-            model=model,
-            messages=messages,
-            max_tokens=max_tokens,
-            extra_body=extra_body,
-            depth=depth,
-        )
-
-    def stream(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[type[LLMTool] | LLMDynamicTool]] = None,
-    ):
-        parsed_tools = self.tool_calls_handler.parse_tools(tools)
-
-        match self.llm_provider:
-            case LLMProvider.OPENAI:
-                return self._stream_openai(
-                    model=model,
-                    messages=messages,
-                    max_tokens=max_tokens,
-                    tools=parsed_tools,
-                )
-            case LLMProvider.CODEX:
-                return self._stream_codex(
-                    model=model,
-                    messages=messages,
-                    max_tokens=max_tokens,
-                    tools=parsed_tools,
-                )
-            case LLMProvider.GOOGLE:
-                return self._stream_google(
-                    model=model,
-                    messages=messages,
-                    max_tokens=max_tokens,
-                    tools=parsed_tools,
-                )
-            case LLMProvider.ANTHROPIC:
-                return self._stream_anthropic(
-                    model=model,
-                    messages=messages,
-                    max_tokens=max_tokens,
-                    tools=parsed_tools,
-                )
-            case LLMProvider.OLLAMA:
-                return self._stream_ollama(
-                    model=model, messages=messages, max_tokens=max_tokens
-                )
-            case LLMProvider.CUSTOM:
-                return self._stream_custom(
-                    model=model, messages=messages, max_tokens=max_tokens
-                )
-
-    # ? Stream Structured Content
-    async def _stream_openai_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        extra_body: Optional[dict] = None,
-        depth: int = 0,
-    ) -> AsyncGenerator[str, None]:
-        client: AsyncOpenAI = self._client
-
-        response_schema = response_format
-        all_tools = [*tools] if tools else None
-
-        use_tool_calls_for_structured_output = (
-            self.use_tool_calls_for_structured_output()
-        )
-        if strict and depth == 0:
-            response_schema = ensure_strict_json_schema(
-                response_schema,
-                path=(),
-                root=response_schema,
-            )
-        response_schema = ensure_array_schemas_have_items(response_schema)
-
-        if use_tool_calls_for_structured_output and depth == 0:
-            if all_tools is None:
-                all_tools = []
-            all_tools.append(
-                self.tool_calls_handler.parse_tool(
-                    LLMDynamicTool(
-                        name="ResponseSchema",
-                        description="Provide response to the user",
-                        parameters=response_schema,
-                        handler=do_nothing_async,
-                    ),
-                    strict=strict,
-                )
-            )
-
-        tool_calls: List[LLMToolCall] = []
-        current_index = 0
-        current_id = None
-        current_name = None
-        current_arguments = None
-
-        has_response_schema_tool_call = False
-        async for event in await client.chat.completions.create(
-            model=model,
-            messages=[message.model_dump() for message in messages],
-            max_completion_tokens=max_tokens,
-            tools=all_tools,
-            response_format=(
-                {
-                    "type": "json_schema",
-                    "json_schema": (
-                        {
-                            "name": "ResponseSchema",
-                            "strict": strict,
-                            "schema": response_schema,
-                        }
-                    ),
-                }
-                if not use_tool_calls_for_structured_output
-                else None
-            ),
-            extra_body=extra_body,
-            stream=True,
-        ):
-            event: OpenAIChatCompletionChunk = event
-            if not event.choices:
-                continue
-
-            content_chunk = event.choices[0].delta.content
-            if content_chunk and not use_tool_calls_for_structured_output:
-                yield content_chunk
-
-            tool_call_chunk = event.choices[0].delta.tool_calls
-            if tool_call_chunk:
-                tool_index = tool_call_chunk[0].index
-                tool_id = tool_call_chunk[0].id
-                tool_name = tool_call_chunk[0].function.name
-                tool_arguments = tool_call_chunk[0].function.arguments
-
-                if current_index != tool_index:
-                    tool_calls.append(
-                        OpenAIToolCall(
-                            id=current_id,
-                            type="function",
-                            function=OpenAIToolCallFunction(
-                                name=current_name,
-                                arguments=current_arguments,
-                            ),
-                        )
-                    )
-                    current_index = tool_index
-                    current_id = tool_id
-                    current_name = tool_name
-                    current_arguments = tool_arguments
-                else:
-                    current_name = tool_name or current_name
-                    current_id = tool_id or current_id
-                    if current_arguments is None:
-                        current_arguments = tool_arguments
-                    elif tool_arguments:
-                        current_arguments += tool_arguments
-
-                if current_name == "ResponseSchema":
-                    if tool_arguments:
-                        yield tool_arguments
-                    has_response_schema_tool_call = True
-
-        if current_id is not None:
-            tool_calls.append(
-                OpenAIToolCall(
-                    id=current_id,
-                    type="function",
-                    function=OpenAIToolCallFunction(
-                        name=current_name,
-                        arguments=current_arguments,
-                    ),
-                )
-            )
-
-        if tool_calls and not has_response_schema_tool_call:
-            tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai(
-                tool_calls
-            )
-            new_messages = [
-                *messages,
-                OpenAIAssistantMessage(
-                    role="assistant",
-                    content=None,
-                    tool_calls=[each.model_dump() for each in tool_calls],
-                ),
-                *tool_call_messages,
-            ]
-            async for event in self._stream_openai_structured(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                strict=strict,
-                tools=all_tools,
-                response_format=response_schema,
-                extra_body=extra_body,
-                depth=depth + 1,
-            ):
-                yield event
-
-
-
-    async def _stream_codex_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-        extra_body: Optional[dict] = None,
-    ) -> AsyncGenerator[str, None]:
-        """
-        Stream structured responses using OpenAI's Responses API (Codex-style models).
-
-        This implementation is intentionally separate from ChatCompletion-based streaming
-        because the Responses API uses a fundamentally different event model.
-
-        Why this function exists:
-
-        1. The Responses API does NOT return `choices[].delta` like ChatCompletions.
-        Instead, it streams typed events such as:
-            - response.output_text.delta
-            - response.output_tool_call.delta
-            - response.completed
-            - response.error
-
-        2. Structured output can be achieved in two ways:
-        a) Native JSON schema enforcement via `response_format`
-        b) Tool-call-based structured output using a synthetic `ResponseSchema` tool
-
-        This function supports both approaches. When tool-call structured mode is enabled,
-        a dynamic `ResponseSchema` tool is injected so the model returns structured data
-        as tool call arguments.
-
-        3. Tool calls must be accumulated incrementally.
-        The Responses API streams tool call arguments in chunks (`arguments_delta`),
-        so we reconstruct the full argument payload before executing the tool.
-
-        4. Recursive tool execution is supported.
-        If the model calls external tools (e.g., web search), we:
-            - Execute the tools asynchronously
-            - Append tool results as new messages
-            - Reinvoke the model recursively
-        This enables multi-step reasoning and grounding workflows.
-
-        5. Provider abstraction is preserved.
-        The Responses API event format is converted into our internal tool-call model
-        before being passed to the tool handler layer. This prevents SDK-specific
-        structures from leaking into business logic.
-
-        6. Strict schema enforcement (optional).
-        When `strict=True`, the provided JSON schema is hardened before being sent
-        to the model to reduce malformed outputs.
-
-        Important architectural note:
-        This function MUST NOT assume ChatCompletion-style streaming fields like
-        `choices`, `delta.content`, or `delta.tool_calls`. It strictly follows the
-        Responses API event model.
-
-        This separation ensures:
-            - Future compatibility with GPT-5 / Codex models
-            - Clean provider abstraction
-            - Streaming-safe structured JSON assembly
-            - Robust multi-tool recursive execution
-        """
-        client: AsyncOpenAI = self._client
-        response_schema = response_format
-        # Apply strict schema once at root (includes array "items" fix at lines 135–155).
-        if strict and depth == 0:
-            response_schema = ensure_strict_json_schema(
-                response_schema,
-                path=(),
-                root=response_schema,
-            )
-        # When we didn't run ensure_strict_json_schema, fix arrays for Codex API (strict=False or depth > 0).
-        else:
-            response_schema = ensure_array_schemas_have_items(response_schema)
-
-        # Responses API tool format: flat {type, name, description, parameters}
-        response_schema_tool = {
-            "type": "function",
-            "name": "ResponseSchema",
-            "description": "Provide structured response",
-            "parameters": response_schema,
-        }
-        all_tools: List[dict] = [response_schema_tool]
-        if tools:
-            for tool in tools:
-                fn = (tool.get("function") or tool) if isinstance(tool, dict) else {}
-                if isinstance(fn, dict):
-                    all_tools.append({
-                        "type": "function",
-                        "name": fn.get("name", ""),
-                        "description": fn.get("description", ""),
-                        "parameters": fn.get("parameters", {}),
-                    })
-                else:
-                    all_tools.append(tool)
-
-        # Build instructions + input like Codex adapter (instructions from system; input_text/output_text)
-        instructions = self._get_system_prompt(messages) or None
-        input_payload: List[Dict[str, Any]] = []
-        for m in messages:
-            if isinstance(m, LLMSystemMessage):
-                continue
-            if isinstance(m, LLMUserMessage):
-                input_payload.append({
-                    "role": "user",
-                    "content": [{"type": "input_text", "text": m.content}],
-                })
-            elif isinstance(m, OpenAIAssistantMessage):
-                text = m.content or ""
-                if text:
-                    input_payload.append({
-                        "role": "assistant",
-                        "content": [{"type": "output_text", "text": text}],
-                    })
-            else:
-                text = getattr(m, "content", "") or ""
-                if text:
-                    input_payload.append({
-                        "role": "user",
-                        "content": [{"type": "input_text", "text": text}],
-                    })
-
-        # Force model to use ResponseSchema for structured output
-        tool_choice = {"type": "function", "name": "ResponseSchema"}
-        create_kwargs: Dict[str, Any] = {
-            "model": model,
-            "store": False,
-            "stream": True,
-            "text": {"verbosity": "medium"},
-            "include": ["reasoning.encrypted_content"],
-            "tool_choice": tool_choice,
-            "parallel_tool_calls": True,
-            "tools": all_tools,
-        }
-        if instructions:
-            create_kwargs["instructions"] = instructions
-        if input_payload:
-            create_kwargs["input"] = input_payload
-        if max_tokens is not None:
-            create_kwargs["max_output_tokens"] = max_tokens
-        if extra_body:
-            create_kwargs.update(extra_body)
-
-        stream = await client.responses.create(**create_kwargs)
-
-
-        def _event_dict(ev: Any) -> dict:
-            if hasattr(ev, "model_dump"):
-                return ev.model_dump()
-            return {
-                "type": getattr(ev, "type", None),
-                "delta": getattr(ev, "delta", None),
-                "arguments": getattr(ev, "arguments", None),
-                "arguments_delta": getattr(ev, "arguments_delta", None),
-                "item": getattr(ev, "item", None),
-                "id": getattr(ev, "id", None),
-                "name": getattr(ev, "name", None),
-                "error": getattr(ev, "error", None),
-                "message": getattr(ev, "message", None),
-            }
-
-        tool_calls_by_id: Dict[str, Dict[str, Any]] = {}
-        current_call_id: Optional[str] = None
-        has_response_schema_tool_call = False
-
-        async for ev in stream:
-            event = _event_dict(ev) if not isinstance(ev, dict) else ev
-            event_type = event.get("type") or ""
-
-            if event_type == "response.output_item.added":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call" and item.get("name") == "ResponseSchema":
-                    current_call_id = item.get("call_id") or item.get("id")
-
-            elif event_type == "response.function_call_arguments.delta":
-                if current_call_id:
-                    delta = event.get("delta") or ""
-                    if delta:
-                        has_response_schema_tool_call = True
-                        yield delta
-
-            elif event_type == "response.function_call_arguments.done":
-                if event.get("name") == "ResponseSchema":
-                    args = event.get("arguments") or ""
-                    if args:
-                        has_response_schema_tool_call = True
-                        yield args
-
-            elif event_type == "response.output_item.done":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call":
-                    cid = item.get("call_id") or item.get("id", "")
-                    tool_calls_by_id[cid] = item
-                    if item.get("name") == "ResponseSchema":
-                        args = item.get("arguments") or ""
-                        if args:
-                            has_response_schema_tool_call = True
-                            yield args
-
-            elif event_type == "response.output_tool_call.delta":
-                call_id = event.get("id")
-                name = event.get("name")
-                arguments_delta = event.get("arguments_delta") or ""
-                if call_id and name:
-                    if call_id not in tool_calls_by_id:
-                        tool_calls_by_id[call_id] = {"name": name, "arguments": ""}
-                    tool_calls_by_id[call_id]["arguments"] += arguments_delta
-                    if name == "ResponseSchema" and arguments_delta:
-                        has_response_schema_tool_call = True
-                        yield arguments_delta
-
-            elif event_type == "response.completed":
-                break
-
-            elif event_type in ("response.error", "response.failed", "error"):
-                err = event.get("error") or event.get("message") or str(event)
-                raise RuntimeError(err)
-
-        # ============================================
-        # EXECUTE NON-STRUCTURED TOOL CALLS (RECURSIVE)
-        # ============================================
-
-        other_tool_calls = {
-            cid: data
-            for cid, data in tool_calls_by_id.items()
-            if data.get("name") != "ResponseSchema"
-        }
-        if other_tool_calls and not has_response_schema_tool_call:
-            parsed_tool_calls = []
-            for call_id, data in other_tool_calls.items():
-                args = data.get("arguments", "") if isinstance(data, dict) else ""
-                parsed_tool_calls.append(
-                    OpenAIToolCall(
-                        id=call_id,
-                        type="function",
-                        function=OpenAIToolCallFunction(
-                            name=data.get("name", ""),
-                            arguments=args,
-                        ),
-                    )
-                )
-
-            tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai(
-                parsed_tool_calls
-            )
-
-            new_messages = [
-                *messages,
-                OpenAIAssistantMessage(
-                    role="assistant",
-                    content=None,
-                    tool_calls=[tc.model_dump() for tc in parsed_tool_calls],
-                ),
-                *tool_call_messages,
-            ]
-
-            async for chunk in self._stream_codex_structured(
-                model=model,
-                messages=new_messages,
-                response_format=response_schema,
-                strict=strict,
-                max_tokens=max_tokens,
-                tools=tools,
-                extra_body=extra_body,
-                depth=depth + 1,
-            ):
-                yield chunk
-
-    async def _stream_google_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> AsyncGenerator[str, None]:
-
-        client: genai.Client = self._client
-
-        google_tools = None
-        if tools:
-            google_tools = [GoogleTool(function_declarations=[tool]) for tool in tools]
-            google_tools.append(
-                GoogleTool(
-                    function_declarations=[
-                        {
-                            "name": "ResponseSchema",
-                            "description": "Provide response to the user",
-                            "parameters": remove_titles_from_schema(
-                                flatten_json_schema(response_format)
-                            ),
-                        }
-                    ]
-                )
-            )
-
-        parsed_messages = self._get_google_messages(messages)
-
-        generated_contents = []
-        tool_calls: List[GoogleToolCall] = []
-        has_response_schema_tool_call = False
-        async for event in iterator_to_async(client.models.generate_content_stream)(
-            model=model,
-            contents=parsed_messages,
-            config=GenerateContentConfig(
-                tools=google_tools,
-                tool_config=(
-                    GoogleToolConfig(
-                        function_calling_config=GoogleFunctionCallingConfig(
-                            mode=GoogleFunctionCallingConfigMode.ANY,
-                        ),
-                    )
-                    if tools
-                    else None
-                ),
-                system_instruction=self._get_system_prompt(messages),
-                response_mime_type="application/json" if not tools else None,
-                response_json_schema=response_format if not tools else None,
-                max_output_tokens=max_tokens,
-            ),
-        ):
-            if not (
-                event.candidates
-                and event.candidates[0].content
-                and event.candidates[0].content.parts
-            ):
-                continue
-
-            generated_contents.append(event.candidates[0].content)
-
-            for each_part in event.candidates[0].content.parts:
-                if each_part.text and not google_tools:
-                    yield each_part.text
-
-                if each_part.function_call:
-                    if each_part.function_call.name == "ResponseSchema":
-                        has_response_schema_tool_call = True
-                        if each_part.function_call.args:
-                            yield json.dumps(each_part.function_call.args)
-
-                    tool_calls.append(
-                        GoogleToolCall(
-                            id=each_part.function_call.id,
-                            name=each_part.function_call.name,
-                            arguments=each_part.function_call.args,
-                        )
-                    )
-
-        if tool_calls and not has_response_schema_tool_call:
-            tool_call_messages = await self.tool_calls_handler.handle_tool_calls_google(
-                tool_calls
-            )
-            new_messages = [
-                *messages,
-                *[
-                    GoogleAssistantMessage(
-                        role="assistant",
-                        content=each,
-                    )
-                    for each in generated_contents
-                ],
-                *tool_call_messages,
-            ]
-            async for event in self._stream_google_structured(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                response_format=response_format,
-                tools=tools,
-                depth=depth + 1,
-            ):
-                yield event
-
-    async def _stream_anthropic_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        tools: Optional[List[dict]] = None,
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ) -> AsyncGenerator[str, None]:
-        client: AsyncAnthropic = self._client
-
-        tool_calls: List[AnthropicToolCall] = []
-        has_response_schema_tool_call = False
-        async with client.messages.stream(
-            model=model,
-            system=self._get_system_prompt(messages),
-            messages=[
-                message.model_dump()
-                for message in self._get_anthropic_messages(messages)
-            ],
-            max_tokens=max_tokens or 4000,
-            tools=[
-                {
-                    "name": "ResponseSchema",
-                    "description": "A response to the user's message",
-                    "input_schema": response_format,
-                },
-                *(tools or []),
-            ],
-        ) as stream:
-            is_response_schema_tool_call_started = False
-            async for event in stream:
-                event: AnthropicMessageStreamEvent = event
-
-                if (
-                    event.type == "content_block_start"
-                    and event.content_block.type == "tool_use"
-                ):
-                    if event.content_block.name == "ResponseSchema":
-                        has_response_schema_tool_call = True
-                        is_response_schema_tool_call_started = True
-
-                if (
-                    event.type == "content_block_delta"
-                    and event.delta.type == "input_json_delta"
-                    and is_response_schema_tool_call_started
-                ):
-                    yield event.delta.partial_json
-
-                if (
-                    event.type == "content_block_stop"
-                    and event.content_block.type == "tool_use"
-                ):
-                    tool_calls.append(
-                        AnthropicToolCall(
-                            id=event.content_block.id,
-                            type=event.content_block.type,
-                            name=event.content_block.name,
-                            input=event.content_block.input,
-                        )
-                    )
-
-        if tool_calls and not has_response_schema_tool_call:
-            tool_call_messages = (
-                await self.tool_calls_handler.handle_tool_calls_anthropic(tool_calls)
-            )
-            new_messages = [
-                *messages,
-                AnthropicAssistantMessage(
-                    role="assistant",
-                    content=[each.model_dump() for each in tool_calls],
-                ),
-                AnthropicUserMessage(
-                    role="user",
-                    content=[each.model_dump() for each in tool_call_messages],
-                ),
-            ]
-            async for event in self._stream_anthropic_structured(
-                model=model,
-                messages=new_messages,
-                max_tokens=max_tokens,
-                response_format=response_format,
-                tools=tools,
-                depth=depth + 1,
-            ):
-                yield event
-
-    def _stream_ollama_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ):
-        return self._stream_openai_structured(
-            model=model,
-            messages=messages,
-            response_format=response_format,
-            strict=strict,
-            max_tokens=max_tokens,
-            depth=depth,
-        )
-
-    def _stream_custom_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        depth: int = 0,
-    ):
-        extra_body = {"enable_thinking": False} if self.disable_thinking() else None
-        return self._stream_openai_structured(
-            model=model,
-            messages=messages,
-            response_format=response_format,
-            strict=strict,
-            max_tokens=max_tokens,
-            extra_body=extra_body,
-            depth=depth,
-        )
-
-    def stream_structured(
-        self,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        strict: bool = False,
-        tools: Optional[List[type[LLMTool] | LLMDynamicTool]] = None,
-        max_tokens: Optional[int] = None,
-    ):
-        parsed_tools = self.tool_calls_handler.parse_tools(tools)
-
-        match self.llm_provider:
-            case LLMProvider.OPENAI:
-                return self._stream_openai_structured(
-                    model=model,
-                    messages=messages,
-                    response_format=response_format,
-                    strict=strict,
-                    tools=parsed_tools,
-                    max_tokens=max_tokens,
-                )
-            case LLMProvider.CODEX:
-                return self._stream_codex_structured(
-                    model=model,
-                    messages=messages,
-                    response_format=response_format,
-                    strict=strict,
-                    tools=parsed_tools,
-                    max_tokens=max_tokens,
-                )
-            case LLMProvider.GOOGLE:
-                return self._stream_google_structured(
-                    model=model,
-                    messages=messages,
-                    response_format=response_format,
-                    tools=parsed_tools,
-                    max_tokens=max_tokens,
-                )
-            case LLMProvider.ANTHROPIC:
-                return self._stream_anthropic_structured(
-                    model=model,
-                    messages=messages,
-                    response_format=response_format,
-                    tools=parsed_tools,
-                    max_tokens=max_tokens,
-                )
-            case LLMProvider.OLLAMA:
-                return self._stream_ollama_structured(
-                    model=model,
-                    messages=messages,
-                    response_format=response_format,
-                    strict=strict,
-                    max_tokens=max_tokens,
-                )
-            case LLMProvider.CUSTOM:
-                return self._stream_custom_structured(
-                    model=model,
-                    messages=messages,
-                    response_format=response_format,
-                    strict=strict,
-                    max_tokens=max_tokens,
-                )
-
-    # ? Web search
-    async def _search_openai(self, query: str) -> str:
-        client: AsyncOpenAI = self._client
-        response = await client.responses.create(
-            model=get_model(),
-            tools=[
-                {
-                    "type": "web_search_preview",
-                }
-            ],
-            input=query,
-        )
-        return response.output_text
-
-    async def _search_google(self, query: str) -> str:
-        client: genai.Client = self._client
-        grounding_tool = GoogleTool(google_search=GoogleSearch())
-        config = GenerateContentConfig(tools=[grounding_tool])
-
-        response = await asyncio.to_thread(
-            client.models.generate_content,
-            model=get_model(),
-            contents=query,
-            config=config,
-        )
-        return response.text
-
-    async def _search_anthropic(self, query: str) -> str:
-        client: AsyncAnthropic = self._client
-
-        response = await client.messages.create(
-            model=get_model(),
-            max_tokens=4000,
-            messages=[{"role": "user", "content": query}],
-            tools=[
-                {"type": "web_search_20250305", "name": "web_search", "max_uses": 1}
-            ],
-        )
-        result = "\n".join(
-            [each.text for each in response.content if each.type == "text"]
-        )
-        return result
diff --git a/servers/fastapi/services/llm_tool_calls_handler.py b/servers/fastapi/services/llm_tool_calls_handler.py
deleted file mode 100644
index 63476028..00000000
--- a/servers/fastapi/services/llm_tool_calls_handler.py
+++ /dev/null
@@ -1,211 +0,0 @@
-import asyncio
-from datetime import datetime
-import json
-from typing import Any, Callable, Coroutine, List, Optional
-from fastapi import HTTPException
-from enums.llm_provider import LLMProvider
-from models.llm_message import (
-    AnthropicToolCallMessage,
-    GoogleToolCallMessage,
-    OpenAIToolCallMessage,
-)
-from models.llm_tool_call import AnthropicToolCall, GoogleToolCall, OpenAIToolCall
-from models.llm_tools import LLMDynamicTool, LLMTool, SearchWebTool
-from utils.schema_utils import (
-    ensure_strict_json_schema,
-    flatten_json_schema,
-    remove_titles_from_schema,
-)
-
-
-class LLMToolCallsHandler:
-    def __init__(self, client):
-        from services.llm_client import LLMClient
-
-        self.client: LLMClient = client
-
-        self.tools_map: dict[str, Callable[..., Coroutine[Any, Any, str]]] = {
-            "SearchWebTool": self.search_web_tool_call_handler,
-            "GetCurrentDatetimeTool": self.get_current_datetime_tool_call_handler,
-        }
-        self.dynamic_tools: List[LLMDynamicTool] = []
-
-    def get_tool_handler(
-        self, tool_name: str
-    ) -> Callable[..., Coroutine[Any, Any, str]]:
-        handler = self.tools_map.get(tool_name)
-        if handler:
-            return handler
-        else:
-            dynamic_tools = list(
-                filter(lambda tool: tool.name == tool_name, self.dynamic_tools)
-            )
-            if dynamic_tools:
-                return dynamic_tools[0].handler
-        raise HTTPException(status_code=500, detail=f"Tool {tool_name} not found")
-
-    def parse_tools(self, tools: Optional[List[type[LLMTool] | LLMDynamicTool]] = None):
-        if tools is None:
-            return None
-        parsed_tools = map(self.parse_tool, tools)
-        return list(parsed_tools)
-
-    def parse_tool(self, tool: type[LLMTool] | LLMDynamicTool, strict: bool = False):
-        if isinstance(tool, LLMDynamicTool):
-            self.dynamic_tools.append(tool)
-
-        match self.client.llm_provider:
-            case LLMProvider.OPENAI | LLMProvider.OLLAMA | LLMProvider.CUSTOM:
-                return self.parse_tool_openai(tool, strict)
-            case LLMProvider.ANTHROPIC:
-                return self.parse_tool_anthropic(tool)
-            case LLMProvider.GOOGLE:
-                return self.parse_tool_google(tool)
-            case _:
-                raise ValueError(
-                    f"LLM provider must be either openai, anthropic, or google"
-                )
-
-    def parse_tool_openai(
-        self, tool: type[LLMTool] | LLMDynamicTool, strict: bool = False
-    ):
-        if isinstance(tool, LLMDynamicTool):
-            name = tool.name
-            description = tool.description
-            parameters = tool.parameters
-        else:
-            name = tool.__name__
-            description = tool.__doc__ or ""
-            parameters = tool.model_json_schema()
-
-        if strict:
-            parameters = ensure_strict_json_schema(parameters, path=(), root=parameters)
-
-        return {
-            "type": "function",
-            "function": {
-                "name": name,
-                "description": description,
-                "strict": strict,
-                "parameters": parameters,
-            },
-        }
-
-    def parse_tool_google(self, tool: type[LLMTool] | LLMDynamicTool):
-        parsed = self.parse_tool_openai(tool)
-        parsed["function"]["parameters"] = (
-            remove_titles_from_schema(
-                flatten_json_schema(parsed["function"]["parameters"])
-            )
-            if parsed["function"]["parameters"]
-            else {}
-        )
-        return {
-            "name": parsed["function"]["name"],
-            "description": parsed["function"]["description"],
-            "parameters": parsed["function"]["parameters"],
-        }
-
-    def parse_tool_anthropic(self, tool: type[LLMTool] | LLMDynamicTool):
-        parsed = self.parse_tool_openai(tool)
-        input_schema = parsed["function"]["parameters"]
-        return {
-            "name": parsed["function"]["name"],
-            "description": parsed["function"]["description"],
-            "input_schema": {"type": "object"} if input_schema == {} else input_schema,
-        }
-
-    async def handle_tool_calls_openai(
-        self,
-        tool_calls: List[OpenAIToolCall],
-    ) -> List[OpenAIToolCallMessage]:
-        async_tool_calls_tasks = []
-        for tool_call in tool_calls:
-            tool_name = tool_call.function.name
-            tool_handler = self.get_tool_handler(tool_name)
-            async_tool_calls_tasks.append(tool_handler(tool_call.function.arguments))
-
-        tool_call_results: List[str] = await asyncio.gather(*async_tool_calls_tasks)
-        tool_call_messages = [
-            OpenAIToolCallMessage(
-                content=result,
-                tool_call_id=tool_call.id,
-            )
-            for tool_call, result in zip(tool_calls, tool_call_results)
-        ]
-        return tool_call_messages
-
-    async def handle_tool_calls_google(
-        self,
-        tool_calls: List[GoogleToolCall],
-    ) -> List[GoogleToolCallMessage]:
-        async_tool_calls_tasks = []
-        for tool_call in tool_calls:
-            tool_name = tool_call.name
-            tool_handler = self.get_tool_handler(tool_name)
-            async_tool_calls_tasks.append(tool_handler(json.dumps(tool_call.arguments)))
-
-        tool_call_results: List[str] = await asyncio.gather(*async_tool_calls_tasks)
-
-        tool_call_messages = [
-            GoogleToolCallMessage(
-                id=tool_call.id,
-                name=tool_call.name,
-                response={"result": result},
-            )
-            for tool_call, result in zip(tool_calls, tool_call_results)
-        ]
-        return tool_call_messages
-
-    async def handle_tool_calls_anthropic(
-        self,
-        tool_calls: List[AnthropicToolCall],
-    ) -> List[AnthropicToolCallMessage]:
-        async_tool_calls_tasks = []
-        for tool_call in tool_calls:
-            tool_name = tool_call.name
-            tool_handler = self.get_tool_handler(tool_name)
-            async_tool_calls_tasks.append(tool_handler(json.dumps(tool_call.input)))
-
-        tool_call_results: List[str] = await asyncio.gather(*async_tool_calls_tasks)
-        tool_call_messages = [
-            AnthropicToolCallMessage(
-                content=result,
-                tool_use_id=tool_call.id,
-            )
-            for tool_call, result in zip(tool_calls, tool_call_results)
-        ]
-        return tool_call_messages
-
-    # ? Tool call handlers
-    # Search web tool call handler
-    async def search_web_tool_call_handler(self, arguments: str) -> str:
-        match self.client.llm_provider:
-            case LLMProvider.OPENAI:
-                return await self.search_web_tool_call_handler_openai(arguments)
-            case LLMProvider.ANTHROPIC:
-                return await self.search_web_tool_call_handler_anthropic(arguments)
-            case LLMProvider.GOOGLE:
-                return await self.search_web_tool_call_handler_google(arguments)
-            case _:
-                return (
-                    "Web search tool call handler not implemented for this LLM provider: "
-                    + self.client.llm_provider.value
-                )
-
-    async def search_web_tool_call_handler_openai(self, arguments: str) -> str:
-        args = SearchWebTool.model_validate_json(arguments)
-        return await self.client._search_openai(args.query)
-
-    async def search_web_tool_call_handler_google(self, arguments: str) -> str:
-        args = SearchWebTool.model_validate_json(arguments)
-        return await self.client._search_google(args.query)
-
-    async def search_web_tool_call_handler_anthropic(self, arguments: str) -> str:
-        args = SearchWebTool.model_validate_json(arguments)
-        return await self.client._search_anthropic(args.query)
-
-    # Get current datetime tool call handler
-    async def get_current_datetime_tool_call_handler(self, _) -> str:
-        current_time = datetime.now()
-        return f"{current_time.strftime('%A, %B %d, %Y')} at {current_time.strftime('%I:%M:%S %p')}"
diff --git a/servers/fastapi/templates/providers.py b/servers/fastapi/templates/providers.py
index 9e3a0ba8..1c7f4734 100644
--- a/servers/fastapi/templates/providers.py
+++ b/servers/fastapi/templates/providers.py
@@ -4,10 +4,17 @@ from dataclasses import dataclass
 import time
 from typing import Any, Awaitable, Callable, Optional
 
-from anthropic import AsyncAnthropic
 from fastapi import HTTPException
 from google import genai
 from google.genai import types as google_types
+from llmai import AnthropicClient
+from llmai.shared import (
+    AnthropicClientConfig,
+    ImageContentPart,
+    SystemMessage,
+    TextResponse,
+    UserMessage,
+)
 from openai import AsyncOpenAI
 
 from enums.llm_provider import LLMProvider
@@ -160,11 +167,28 @@ def _get_google_client() -> genai.Client:
     return genai.Client(api_key=api_key)
 
 
-def _get_anthropic_client() -> AsyncAnthropic:
+def _get_anthropic_client() -> AnthropicClient:
     api_key = get_anthropic_api_key_env()
     if not api_key:
         raise HTTPException(status_code=400, detail="ANTHROPIC_API_KEY is not set")
-    return AsyncAnthropic(api_key=api_key)
+    return AnthropicClient(config=AnthropicClientConfig(api_key=api_key))
+
+
+def _read_llmai_response_text(response: Any) -> str:
+    """Return the text carried by ``response.content`` ("" if there is none)."""
+    payload = getattr(response, "content", None)
+    if isinstance(payload, str):
+        return payload
+    if not isinstance(payload, list):
+        # Neither str nor list: fall back to a ``.text`` attribute, if any.
+        return getattr(payload, "text", None) or ""
+    fragments: list[str] = []
+    for item in payload:
+        # Items are either raw strings or objects carrying a ``.text``.
+        piece = item if isinstance(item, str) else getattr(item, "text", None)
+        if isinstance(piece, str):
+            fragments.append(piece)
+    return "".join(fragments)
 
 
 async def _call_openai_like(
@@ -308,28 +332,24 @@ async def _call_anthropic(
     media_type: str = "image/png",
 ) -> str:
     client = _get_anthropic_client()
-    content = [{"type": "text", "text": user_text}]
+    content: str | list[object] = user_text
     if image_bytes:
-        content.append(
-            {
-                "type": "image",
-                "source": {
-                    "type": "base64",
-                    "media_type": media_type,
-                    "data": base64.b64encode(image_bytes).decode("utf-8"),
-                },
-            }
-        )
+        content = [
+            user_text,
+            ImageContentPart(data=image_bytes, mime_type=media_type),
+        ]
 
-    response = await client.messages.create(
+    response = await asyncio.to_thread(
+        client.generate,
         model=model,
+        messages=[
+            SystemMessage(content=system_prompt),
+            UserMessage(content=content),
+        ],
+        response_format=TextResponse(),
         max_tokens=8192,
-        system=system_prompt,
-        messages=[{"role": "user", "content": content}],
-    )
-    output_text = "".join(
-        block.text for block in response.content if getattr(block, "type", None) == "text"
     )
+    output_text = _read_llmai_response_text(response)
     if not output_text:
         raise HTTPException(status_code=500, detail="No output from template provider")
     return output_text
diff --git a/servers/fastapi/utils/available_models.py b/servers/fastapi/utils/available_models.py
index 539533ad..ff4ae3a7 100644
--- a/servers/fastapi/utils/available_models.py
+++ b/servers/fastapi/utils/available_models.py
@@ -1,4 +1,4 @@
-from anthropic import AsyncAnthropic
+import aiohttp
 from openai import AsyncOpenAI
 from google import genai
 
@@ -12,8 +12,21 @@ async def list_available_openai_compatible_models(url: str, api_key: str) -> lis
 
 
 async def list_available_anthropic_models(api_key: str) -> list[str]:
-    client = AsyncAnthropic(api_key=api_key)
-    return list(map(lambda x: x.id, (await client.models.list(limit=50)).data))
+    async with aiohttp.ClientSession(
+        headers={
+            "x-api-key": api_key,
+            "anthropic-version": "2023-06-01",
+        }
+    ) as session:
+        async with session.get(
+            "https://api.anthropic.com/v1/models",
+            params={"limit": 50},
+        ) as response:
+            response.raise_for_status()
+            data = await response.json()
+
+    models = data.get("data", [])
+    return [model.get("id") for model in models if model.get("id")]
 
 
 async def list_available_google_models(api_key: str) -> list[str]:
diff --git a/servers/fastapi/utils/get_env.py b/servers/fastapi/utils/get_env.py
index 5a940f78..ea111630 100644
--- a/servers/fastapi/utils/get_env.py
+++ b/servers/fastapi/utils/get_env.py
@@ -85,10 +85,6 @@ def get_pixabay_api_key_env():
     return os.getenv("PIXABAY_API_KEY")
 
 
-def get_tool_calls_env():
-    return os.getenv("TOOL_CALLS")
-
-
 def get_disable_thinking_env():
     return os.getenv("DISABLE_THINKING")
 
diff --git a/servers/fastapi/utils/llm_calls/edit_slide.py b/servers/fastapi/utils/llm_calls/edit_slide.py
index 99d46b6d..40f692e5 100644
--- a/servers/fastapi/utils/llm_calls/edit_slide.py
+++ b/servers/fastapi/utils/llm_calls/edit_slide.py
@@ -1,10 +1,14 @@
+import asyncio
 from datetime import datetime
 from typing import Optional
-from models.llm_message import LLMSystemMessage, LLMUserMessage
+from fastapi import HTTPException
+from llmai import get_client
+from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage
 from models.presentation_layout import SlideLayoutModel
 from models.sql.slide import SlideModel
-from services.llm_client import LLMClient
+from utils.llm_config import get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
+from utils.llm_utils import extract_structured_content, get_generate_kwargs
 from utils.llm_provider import get_model
 from utils.schema_utils import add_field_in_schema, remove_fields_from_schema
 
@@ -89,12 +93,12 @@ def get_messages(
     verbosity: Optional[str] = None,
     instructions: Optional[str] = None,
     memory_context: Optional[str] = None,
-):
+) -> list[Message]:
     return [
-        LLMSystemMessage(
+        SystemMessage(
             content=get_system_prompt(tone, verbosity, instructions, memory_context),
         ),
-        LLMUserMessage(
+        UserMessage(
             content=get_user_prompt(prompt, slide_data, language),
         ),
     ]
@@ -128,23 +132,40 @@ async def get_edited_slide_content(
         True,
     )
 
-    client = LLMClient()
+    client = get_client(config=get_llm_config())
     try:
-        response = await client.generate_structured(
-            model=model,
-            messages=get_messages(
-                prompt,
-                slide.content,
-                language,
-                tone,
-                verbosity,
-                instructions,
-                memory_context,
-            ),
-            response_format=response_schema,
+        response_format = JSONSchemaResponse(
+            name="response",
+            json_schema=response_schema,
             strict=False,
         )
-        return response
+        messages = get_messages(
+            prompt,
+            slide.content,
+            language,
+            tone,
+            verbosity,
+            instructions,
+            memory_context,
+        )
+
+        for attempt in range(3):
+            response = await asyncio.to_thread(
+                client.generate,
+                **get_generate_kwargs(
+                    model=model,
+                    messages=messages,
+                    response_format=response_format,
+                ),
+            )
+            content = extract_structured_content(response.content)
+            if content is not None:
+                return content
+
+            if attempt < 2:
+                await asyncio.sleep(0.5 * (attempt + 1))
+
+        raise HTTPException(status_code=400, detail="LLM did not return any content")
 
     except Exception as e:
         raise handle_llm_client_exceptions(e)
diff --git a/servers/fastapi/utils/llm_calls/edit_slide_html.py b/servers/fastapi/utils/llm_calls/edit_slide_html.py
index e74a01de..c0915386 100644
--- a/servers/fastapi/utils/llm_calls/edit_slide_html.py
+++ b/servers/fastapi/utils/llm_calls/edit_slide_html.py
@@ -1,7 +1,11 @@
+import asyncio
 from typing import Optional
-from models.llm_message import LLMSystemMessage, LLMUserMessage
-from services.llm_client import LLMClient
+from fastapi import HTTPException
+from llmai import get_client
+from llmai.shared import SystemMessage, UserMessage
+from utils.llm_config import get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
+from utils.llm_utils import extract_text, get_generate_kwargs
 from utils.llm_provider import get_model
 
 system_prompt = """
@@ -59,18 +63,24 @@ async def get_edited_slide_html(
 ):
     model = get_model()
 
-    client = LLMClient()
+    client = get_client(config=get_llm_config())
     try:
-        response = await client.generate(
-            model=model,
-            messages=[
-                LLMSystemMessage(content=system_prompt),
-                LLMUserMessage(
-                    content=get_user_prompt(prompt, html, memory_context)
-                ),
-            ],
+        response = await asyncio.to_thread(
+            client.generate,
+            **get_generate_kwargs(
+                model=model,
+                messages=[
+                    SystemMessage(content=system_prompt),
+                    UserMessage(
+                        content=get_user_prompt(prompt, html, memory_context)
+                    ),
+                ],
+            ),
         )
-        return extract_html_from_response(response) or html
+        response_text = extract_text(response.content)
+        if response_text is None:
+            raise HTTPException(status_code=400, detail="LLM did not return any content")
+        return extract_html_from_response(response_text) or html
     except Exception as e:
         raise handle_llm_client_exceptions(e)
 
diff --git a/servers/fastapi/utils/llm_calls/generate_presentation_outlines.py b/servers/fastapi/utils/llm_calls/generate_presentation_outlines.py
index 8ae47ae7..6f2f28ef 100644
--- a/servers/fastapi/utils/llm_calls/generate_presentation_outlines.py
+++ b/servers/fastapi/utils/llm_calls/generate_presentation_outlines.py
@@ -1,14 +1,26 @@
 from datetime import datetime
 from typing import Optional
 
-from enums.llm_provider import LLMProvider
-from models.llm_message import LLMSystemMessage, LLMUserMessage
+from llmai import get_client
+from llmai.shared import (
+    JSONSchemaResponse,
+    Message,
+    ResponseStreamCompletionChunk,
+    SystemMessage,
+    UserMessage,
+    WebSearchTool,
+)
+
 from models.presentation_outline_model import PresentationOutlineModel
-from models.llm_tools import SearchWebTool
-from services.llm_client import LLMClient
 from utils.get_dynamic_models import get_presentation_outline_model_with_n_slides
+from utils.llm_config import enable_web_grounding, get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
 from utils.llm_provider import get_model
+from utils.llm_utils import (
+    get_generate_kwargs,
+    serialize_structured_content,
+    stream_generate_events,
+)
 
 
 def get_system_prompt(
@@ -125,9 +137,9 @@ def get_messages(
     instructions: Optional[str] = None,
     include_title_slide: bool = True,
     include_table_of_contents: bool = False,
-):
+) -> list[Message]:
     return [
-        LLMSystemMessage(
+        SystemMessage(
             content=get_system_prompt(
                 tone,
                 verbosity,
@@ -136,7 +148,7 @@ def get_messages(
                 include_table_of_contents,
             ),
         ),
-        LLMUserMessage(
+        UserMessage(
             content=get_user_prompt(
                 content,
                 n_slides,
@@ -170,36 +182,47 @@ async def generate_ppt_outline(
         else PresentationOutlineModel
     )
 
-    client = LLMClient()
-    providers_with_search_tool = {
-        LLMProvider.OPENAI,
-        LLMProvider.ANTHROPIC,
-        LLMProvider.GOOGLE,
-    }
-    use_search_tool = (
-        web_search
-        and client.enable_web_grounding()
-        and client.llm_provider in providers_with_search_tool
-    )
+    client = get_client(config=get_llm_config())
+    use_search_tool = web_search and enable_web_grounding()
 
     try:
-        async for chunk in client.stream_structured(
-            model,
-            get_messages(
-                content,
-                n_slides,
-                language,
-                additional_context,
-                tone,
-                verbosity,
-                instructions,
-                include_title_slide,
-                include_table_of_contents,
-            ),
-            response_model.model_json_schema(),
+        response_format = JSONSchemaResponse(
+            name="response",
+            json_schema=response_model.model_json_schema(),
             strict=True,
-            tools=([SearchWebTool] if use_search_tool else None),
+        )
+        emitted_content = False
+        async for event in stream_generate_events(
+            client,
+            **get_generate_kwargs(
+                model=model,
+                messages=get_messages(
+                    content,
+                    n_slides,
+                    language,
+                    additional_context,
+                    tone,
+                    verbosity,
+                    instructions,
+                    include_title_slide,
+                    include_table_of_contents,
+                ),
+                response_format=response_format,
+                tools=([WebSearchTool()] if use_search_tool else None),
+                stream=True,
+            ),
         ):
-            yield chunk
+            if getattr(event, "type", None) == "content":
+                chunk = getattr(event, "chunk", None)
+                if chunk:
+                    emitted_content = True
+                    yield chunk
+            elif (
+                isinstance(event, ResponseStreamCompletionChunk)
+                and not emitted_content
+            ):
+                final_content = serialize_structured_content(event.content)
+                if final_content:
+                    yield final_content
     except Exception as e:
         yield handle_llm_client_exceptions(e)
diff --git a/servers/fastapi/utils/llm_calls/generate_presentation_structure.py b/servers/fastapi/utils/llm_calls/generate_presentation_structure.py
index bbe26172..df890164 100644
--- a/servers/fastapi/utils/llm_calls/generate_presentation_structure.py
+++ b/servers/fastapi/utils/llm_calls/generate_presentation_structure.py
@@ -1,10 +1,14 @@
-from typing import Optional, Dict
+import asyncio
+from typing import Optional
 
-from models.llm_message import LLMSystemMessage, LLMUserMessage
+from fastapi import HTTPException
+from llmai import get_client
+from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage
 from models.presentation_layout import PresentationLayoutModel
 from models.presentation_outline_model import PresentationOutlineModel
-from services.llm_client import LLMClient
+from utils.llm_config import get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
+from utils.llm_utils import extract_structured_content, get_generate_kwargs
 from utils.llm_provider import get_model
 from utils.get_dynamic_models import get_presentation_structure_model_with_n_slides
 from models.presentation_structure_model import PresentationStructureModel
@@ -97,19 +101,21 @@ def get_messages(
     n_slides: int,
     data: str,
     instructions: Optional[str] = None,
-):
+) -> list[Message]:
     system_prompt = GET_MESSAGES_SYSTEM_PROMPT.format(
         user_instruction_header="# User Instruction:" if instructions else "",
         n_slides=n_slides,
     )
 
     return [
-        LLMSystemMessage(content=system_prompt),
-        LLMUserMessage(content=(
-            f"{presentation_layout.to_string()}\n\n"
-            "--------------------------------------\n\n"
-            f"{data}"
-        )),
+        SystemMessage(content=system_prompt),
+        UserMessage(
+            content=(
+                f"{presentation_layout.to_string()}\n\n"
+                "--------------------------------------\n\n"
+                f"{data}"
+            )
+        ),
     ]
 
 
@@ -118,20 +124,13 @@ def get_messages_for_slides_markdown(
     n_slides: int,
     data: str,
     instructions: Optional[str] = None,
-):
+) -> list[Message]:
     system_prompt = STRUCTURE_FROM_SLIDES_MARKDOWN_SYSTEM_PROMPT.format(
         user_instructions=instructions or "",
         presentation_layout=presentation_layout.to_string(with_schema=True),
     )
 
-    return [
-        LLMSystemMessage(
-            content=system_prompt
-        ),
-        LLMUserMessage(
-            content=data
-        )
-    ]
+    return [SystemMessage(content=system_prompt), UserMessage(content=data)]
 
 
 async def generate_presentation_structure(
@@ -140,34 +139,50 @@ async def generate_presentation_structure(
     instructions: Optional[str] = None,
     using_slides_markdown: bool = False,
 ) -> PresentationStructureModel:
-
-    client = LLMClient()
+    client = get_client(config=get_llm_config())
     model = get_model()
     response_model = get_presentation_structure_model_with_n_slides(
         len(presentation_outline.slides)
     )
 
     try:
-        response = await client.generate_structured(
-            model=model,
-            messages=(
-                get_messages_for_slides_markdown(
-                    presentation_layout,
-                    len(presentation_outline.slides),
-                    presentation_outline.to_string(),
-                    instructions,
-                )
-                if using_slides_markdown
-                else get_messages(
-                    presentation_layout,
-                    len(presentation_outline.slides),
-                    presentation_outline.to_string(),
-                    instructions,
-                )
-            ),
-            response_format=response_model.model_json_schema(),
+        messages = (
+            get_messages_for_slides_markdown(
+                presentation_layout,
+                len(presentation_outline.slides),
+                presentation_outline.to_string(),
+                instructions,
+            )
+            if using_slides_markdown
+            else get_messages(
+                presentation_layout,
+                len(presentation_outline.slides),
+                presentation_outline.to_string(),
+                instructions,
+            )
+        )
+        response_format = JSONSchemaResponse(
+            name="response",
+            json_schema=response_model.model_json_schema(),
             strict=True,
         )
-        return PresentationStructureModel(**response)
+
+        for attempt in range(3):
+            response = await asyncio.to_thread(
+                client.generate,
+                **get_generate_kwargs(
+                    model=model,
+                    messages=messages,
+                    response_format=response_format,
+                ),
+            )
+            content = extract_structured_content(response.content)
+            if content is not None:
+                return PresentationStructureModel(**content)
+
+            if attempt < 2:
+                await asyncio.sleep(0.5 * (attempt + 1))
+
+        raise HTTPException(status_code=400, detail="LLM did not return any content")
     except Exception as e:
         raise handle_llm_client_exceptions(e)
diff --git a/servers/fastapi/utils/llm_calls/generate_slide_content.py b/servers/fastapi/utils/llm_calls/generate_slide_content.py
index a5010cf2..532fd52d 100644
--- a/servers/fastapi/utils/llm_calls/generate_slide_content.py
+++ b/servers/fastapi/utils/llm_calls/generate_slide_content.py
@@ -1,11 +1,15 @@
+import asyncio
 from datetime import datetime
 import json
 from typing import Optional
-from models.llm_message import LLMSystemMessage, LLMUserMessage
+from fastapi import HTTPException
+from llmai import get_client
+from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage
 from models.presentation_layout import SlideLayoutModel
 from models.presentation_outline_model import SlideOutlineModel
-from services.llm_client import LLMClient
+from utils.llm_config import get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
+from utils.llm_utils import extract_structured_content, get_generate_kwargs
 from utils.llm_provider import get_model
 from utils.schema_utils import add_field_in_schema, remove_fields_from_schema
 
@@ -130,10 +134,10 @@ def get_messages(
     verbosity: Optional[str] = None,
     instructions: Optional[str] = None,
     response_schema: Optional[dict] = None,
-):
+) -> list[Message]:
 
     return [
-        LLMSystemMessage(
+        SystemMessage(
             content=get_system_prompt(
                 tone,
                 verbosity,
@@ -141,7 +145,7 @@ def get_messages(
                 response_schema,
             ),
         ),
-        LLMUserMessage(
+        UserMessage(
             content=get_user_prompt(outline, language),
         ),
     ]
@@ -155,7 +159,7 @@ async def get_slide_content_from_type_and_outline(
     verbosity: Optional[str] = None,
     instructions: Optional[str] = None,
 ):
-    client = LLMClient()
+    client = get_client(config=get_llm_config())
     model = get_model()
 
     response_schema = remove_fields_from_schema(
@@ -175,20 +179,37 @@ async def get_slide_content_from_type_and_outline(
     )
 
     try:
-        response = await client.generate_structured(
-            model=model,
-            messages=get_messages(
-                outline.content,
-                language,
-                tone,
-                verbosity,
-                instructions,
-                response_schema,
-            ),
-            response_format=response_schema,
+        response_format = JSONSchemaResponse(
+            name="response",
+            json_schema=response_schema,
             strict=False,
         )
-        return response
+        messages = get_messages(
+            outline.content,
+            language,
+            tone,
+            verbosity,
+            instructions,
+            response_schema,
+        )
+
+        for attempt in range(3):
+            response = await asyncio.to_thread(
+                client.generate,
+                **get_generate_kwargs(
+                    model=model,
+                    messages=messages,
+                    response_format=response_format,
+                ),
+            )
+            content = extract_structured_content(response.content)
+            if content is not None:
+                return content
+
+            if attempt < 2:
+                await asyncio.sleep(0.5 * (attempt + 1))
+
+        raise HTTPException(status_code=400, detail="LLM did not return any content")
 
     except Exception as e:
         raise handle_llm_client_exceptions(e)
diff --git a/servers/fastapi/utils/llm_calls/select_slide_type_on_edit.py b/servers/fastapi/utils/llm_calls/select_slide_type_on_edit.py
index f12e7d07..8719b561 100644
--- a/servers/fastapi/utils/llm_calls/select_slide_type_on_edit.py
+++ b/servers/fastapi/utils/llm_calls/select_slide_type_on_edit.py
@@ -1,9 +1,13 @@
-from models.llm_message import LLMSystemMessage, LLMUserMessage
+import asyncio
+from fastapi import HTTPException
+from llmai import get_client
+from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage
 from models.presentation_layout import PresentationLayoutModel, SlideLayoutModel
 from models.slide_layout_index import SlideLayoutIndex
 from models.sql.slide import SlideModel
-from services.llm_client import LLMClient
+from utils.llm_config import get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
+from utils.llm_utils import extract_structured_content, get_generate_kwargs
 from utils.llm_provider import get_model
 
 
@@ -13,7 +17,7 @@ def get_messages(
     layout: PresentationLayoutModel,
     current_slide_layout: int,
     memory_context: str = "",
-):
+) -> list[Message]:
     memory_block = (
         f"\n                # Retrieved Presentation Memory Context\n                {memory_context}\n"
         if memory_context
@@ -21,7 +25,7 @@ def get_messages(
     )
 
     return [
-        LLMSystemMessage(
+        SystemMessage(
             content=f"""
                 Select a Slide Layout index based on provided user prompt and current slide data.
                 {layout.to_string()}
@@ -34,7 +38,7 @@ def get_messages(
                 **Go through all notes and steps and make sure they are followed, including mentioned constraints**
             """,
         ),
-        LLMUserMessage(
+        UserMessage(
             content=f"""
                 - User Prompt: {prompt}
                 - Current Slide Data: {slide_data}
@@ -50,27 +54,43 @@ async def get_slide_layout_from_prompt(
     slide: SlideModel,
     memory_context: str = "",
 ) -> SlideLayoutModel:
-
-    client = LLMClient()
+    client = get_client(config=get_llm_config())
     model = get_model()
 
     slide_layout_index = layout.get_slide_layout_index(slide.layout)
 
     try:
-        response = await client.generate_structured(
-            model=model,
-            messages=get_messages(
-                prompt,
-                slide.content,
-                layout,
-                slide_layout_index,
-                memory_context,
-            ),
-            response_format=SlideLayoutIndex.model_json_schema(),
+        response_format = JSONSchemaResponse(
+            name="response",
+            json_schema=SlideLayoutIndex.model_json_schema(),
             strict=True,
         )
-        index = SlideLayoutIndex(**response).index
-        return layout.slides[index]
+        messages = get_messages(
+            prompt,
+            slide.content,
+            layout,
+            slide_layout_index,
+            memory_context,
+        )
+
+        for attempt in range(3):
+            response = await asyncio.to_thread(
+                client.generate,
+                **get_generate_kwargs(
+                    model=model,
+                    messages=messages,
+                    response_format=response_format,
+                ),
+            )
+            content = extract_structured_content(response.content)
+            if content is not None:
+                index = SlideLayoutIndex(**content).index
+                return layout.slides[index]
+
+            if attempt < 2:
+                await asyncio.sleep(0.5 * (attempt + 1))
+
+        raise HTTPException(status_code=400, detail="LLM did not return any content")
 
     except Exception as e:
         raise handle_llm_client_exceptions(e)
diff --git a/servers/fastapi/utils/llm_client_error_handler.py b/servers/fastapi/utils/llm_client_error_handler.py
index 7e4c915b..ab27c300 100644
--- a/servers/fastapi/utils/llm_client_error_handler.py
+++ b/servers/fastapi/utils/llm_client_error_handler.py
@@ -1,18 +1,19 @@
 from fastapi import HTTPException
-from anthropic import APIError as AnthropicAPIError
 from openai import APIError as OpenAIAPIError
 from google.genai.errors import APIError as GoogleAPIError
 import traceback
 
+from llmai.shared.errors import BaseError as LLMAIBaseError
+
 
 def handle_llm_client_exceptions(e: Exception) -> HTTPException:
     traceback.print_exc()
+    if isinstance(e, HTTPException):
+        return e
+    if isinstance(e, LLMAIBaseError):
+        return HTTPException(status_code=e.status_code, detail=e.message)
     if isinstance(e, OpenAIAPIError):
         return HTTPException(status_code=500, detail=f"OpenAI API error: {e.message}")
     if isinstance(e, GoogleAPIError):
         return HTTPException(status_code=500, detail=f"Google API error: {e.message}")
-    if isinstance(e, AnthropicAPIError):
-        return HTTPException(
-            status_code=500, detail=f"Anthropic API error: {e.message}"
-        )
     return HTTPException(status_code=500, detail=f"LLM API error: {e}")
diff --git a/servers/fastapi/utils/llm_config.py b/servers/fastapi/utils/llm_config.py
new file mode 100644
index 00000000..bef4dcda
--- /dev/null
+++ b/servers/fastapi/utils/llm_config.py
@@ -0,0 +1,146 @@
+import time
+from typing import Optional
+
+from fastapi import HTTPException
+from llmai.shared import (
+    AnthropicClientConfig,
+    ChatGPTClientConfig,
+    ClientConfig,
+    GoogleClientConfig,
+    OpenAIApiType,
+    OpenAIClientConfig,
+)
+
+from enums.llm_provider import LLMProvider
+from utils.get_env import (
+    get_anthropic_api_key_env,
+    get_codex_access_token_env,
+    get_codex_account_id_env,
+    get_codex_refresh_token_env,
+    get_codex_token_expires_env,
+    get_custom_llm_api_key_env,
+    get_custom_llm_url_env,
+    get_disable_thinking_env,
+    get_google_api_key_env,
+    get_ollama_url_env,
+    get_openai_api_key_env,
+    get_web_grounding_env,
+)
+from utils.llm_provider import get_llm_provider
+from utils.parsers import parse_bool_or_none
+from utils.set_env import (
+    set_codex_access_token_env,
+    set_codex_account_id_env,
+    set_codex_refresh_token_env,
+    set_codex_token_expires_env,
+)
+
+
+def enable_web_grounding() -> bool:  # reads the web-grounding env setting
+    return parse_bool_or_none(get_web_grounding_env()) or False  # falsy -> False
+
+
+def disable_thinking() -> bool:  # reads the disable-thinking env setting
+    return parse_bool_or_none(get_disable_thinking_env()) or False  # falsy -> False
+
+
+def _get_codex_access_token() -> str:
+    """Return the Codex OAuth access token, refreshing it when near expiry."""
+    token = get_codex_access_token_env()
+    if not token:
+        raise HTTPException(
+            status_code=400,
+            detail=(
+                "Codex OAuth access token is not set. Please authenticate via "
+                "/api/v1/ppt/codex/auth/initiate"
+            ),
+        )
+
+    expiry_raw = get_codex_token_expires_env()
+    if not expiry_raw:
+        return token
+    try:
+        # Refresh once "now" is within 60 s of the stored expiry (epoch ms).
+        refresh_after_ms = int(expiry_raw) - 60_000
+        if int(time.time() * 1000) < refresh_after_ms:
+            return token
+        stored_refresh = get_codex_refresh_token_env()
+        if not stored_refresh:
+            return token
+        import utils.oauth.openai_codex as codex_oauth
+
+        outcome = codex_oauth.refresh_access_token(stored_refresh)
+        if isinstance(outcome, codex_oauth.TokenSuccess):
+            set_codex_access_token_env(outcome.access)
+            set_codex_refresh_token_env(outcome.refresh)
+            set_codex_token_expires_env(str(outcome.expires))
+            new_account_id = codex_oauth.get_account_id(outcome.access)
+            if new_account_id:
+                set_codex_account_id_env(new_account_id)
+            token = outcome.access
+    except (TypeError, ValueError):
+        pass
+
+    return token
+
+
+def get_llm_config() -> ClientConfig:
+    """Return the llmai config for the active provider; HTTP 400 if misconfigured."""
+    llm_provider = get_llm_provider()
+
+    match llm_provider:
+        case LLMProvider.OPENAI:
+            api_key = get_openai_api_key_env()
+            if not api_key:
+                raise HTTPException(status_code=400, detail="OpenAI API Key is not set")
+            return OpenAIClientConfig(
+                api_key=api_key,
+                api_type=OpenAIApiType.RESPONSES,
+            )
+        case LLMProvider.GOOGLE:
+            api_key = get_google_api_key_env()
+            if not api_key:
+                raise HTTPException(status_code=400, detail="Google API Key is not set")
+            return GoogleClientConfig(api_key=api_key)
+        case LLMProvider.ANTHROPIC:
+            api_key = get_anthropic_api_key_env()
+            if not api_key:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Anthropic API Key is not set",
+                )
+            return AnthropicClientConfig(api_key=api_key)
+        case LLMProvider.OLLAMA:
+            # Drop trailing slashes so a configured ".../" does not become "//v1".
+            host = (get_ollama_url_env() or "http://localhost:11434").rstrip("/")
+            return OpenAIClientConfig(base_url=f"{host}/v1", api_key="ollama")
+        case LLMProvider.CUSTOM:
+            base_url = get_custom_llm_url_env()
+            if not base_url:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Custom LLM URL is not set",
+                )
+            return OpenAIClientConfig(
+                base_url=base_url,
+                api_key=get_custom_llm_api_key_env() or "null",
+            )
+        case LLMProvider.CODEX:
+            return ChatGPTClientConfig(
+                access_token=_get_codex_access_token(),
+                account_id=get_codex_account_id_env() or None,
+            )
+        case _:
+            raise HTTPException(
+                status_code=400,
+                detail=(
+                    "LLM Provider must be either openai, google, anthropic, "
+                    "ollama, custom, or codex"
+                ),
+            )
+
+
+def get_extra_body() -> Optional[dict]:
+    """Return extra request-body fields for the provider, or None."""
+    thinking_off = get_llm_provider() == LLMProvider.CUSTOM and disable_thinking()
+    return {"enable_thinking": False} if thinking_off else None
diff --git a/servers/fastapi/utils/llm_utils.py b/servers/fastapi/utils/llm_utils.py
new file mode 100644
index 00000000..c10a7341
--- /dev/null
+++ b/servers/fastapi/utils/llm_utils.py
@@ -0,0 +1,183 @@
+import asyncio
+import json
+import threading
+from collections.abc import AsyncGenerator, Sequence
+from typing import Any, Optional
+
+import dirtyjson
+from llmai.shared import (
+    LLMTool,
+    Message,
+    ResponseFormat,
+    normalize_content_parts,
+)
+
+from utils.llm_config import get_extra_body
+
+
+def get_generate_kwargs(
+    model: str,
+    messages: Sequence[Message],
+    max_tokens: Optional[int] = None,
+    tools: Optional[list[LLMTool]] = None,
+    response_format: Optional[ResponseFormat] = None,
+    stream: bool = False,
+) -> dict[str, Any]:
+    """Build the keyword arguments for one generation request.
+
+    ``model``, ``messages`` (copied into a new list) and ``stream`` are always
+    present. ``max_tokens`` and ``response_format`` are added only when not
+    ``None``, ``tools`` only when non-empty, and ``extra_body`` only when
+    ``get_extra_body()`` returns a non-empty value.
+    """
+    kwargs: dict[str, Any] = {
+        "model": model,
+        "messages": list(messages),
+        "stream": stream,
+    }
+    if max_tokens is not None:
+        kwargs["max_tokens"] = max_tokens
+    if tools:
+        kwargs["tools"] = tools
+    if response_format is not None:
+        kwargs["response_format"] = response_format
+
+    extra_body = get_extra_body()
+    if extra_body:
+        kwargs["extra_body"] = extra_body
+
+    return kwargs
+
+
+def extract_text(content: Any) -> Optional[str]:
+    """Return the text carried by ``content``, or ``None`` when there is none.
+
+    A string is returned unchanged. For a non-bytes sequence, string items and
+    the string ``text`` attribute of other items are concatenated; an empty
+    result becomes ``None``. Any other object contributes its string ``text``
+    attribute, if it has one.
+    """
+    if content is None:
+        return None
+    if isinstance(content, str):
+        return content
+    if isinstance(content, Sequence) and not isinstance(content, (bytes, bytearray)):
+        parts: list[str] = []
+        for part in content:
+            if isinstance(part, str):
+                parts.append(part)
+                continue
+            text = getattr(part, "text", None)
+            if isinstance(text, str):
+                parts.append(text)
+        joined = "".join(parts)
+        return joined or None
+    text = getattr(content, "text", None)
+    if isinstance(text, str):
+        return text
+    return None
+
+
+def extract_structured_content(content: Any) -> Optional[dict]:
+    """Return ``content`` as a dict, or ``None`` if no dict can be obtained.
+
+    Tried in order: ``content`` is already a dict; its ``model_dump(mode="json")``
+    result is a dict; the text from ``extract_text`` parses (via ``dirtyjson``)
+    to a dict. Parse errors are swallowed and reported as ``None``.
+    """
+    if content is None:
+        return None
+    if isinstance(content, dict):
+        return content
+    if hasattr(content, "model_dump"):
+        dumped = content.model_dump(mode="json")
+        if isinstance(dumped, dict):
+            return dumped
+
+    raw_text = extract_text(content)
+    if not raw_text:
+        return None
+
+    try:
+        parsed = dirtyjson.loads(raw_text)
+    except Exception:
+        return None
+
+    if isinstance(parsed, dict):
+        return dict(parsed)
+    return None
+
+
+def serialize_structured_content(content: Any) -> Optional[str]:
+    """Serialize ``content`` to a string, preferring JSON.
+
+    Returns ``json.dumps`` of the dict from ``extract_structured_content``
+    (non-ASCII kept as-is); otherwise the raw text from ``extract_text``;
+    ``None`` when neither is available.
+    """
+    parsed = extract_structured_content(content)
+    if parsed is not None:
+        return json.dumps(parsed, ensure_ascii=False)
+
+    raw_text = extract_text(content)
+    if raw_text:
+        return raw_text
+    return None
+
+
+def message_content_to_text(content: Sequence[Any] | str | None) -> Optional[str]:
+    """Join the string ``text`` of every part from ``normalize_content_parts(content)``.
+
+    Parts without a string ``text`` attribute are skipped; returns ``None``
+    when the joined text is empty.
+    """
+    joined = "".join(
+        part.text
+        for part in normalize_content_parts(content)
+        if isinstance(getattr(part, "text", None), str)
+    )
+    return joined or None
+
+
+async def stream_generate_events(client: Any, **kwargs) -> AsyncGenerator[Any, None]:
+    """Expose the blocking ``client.generate(**kwargs)`` iterator as an async generator.
+
+    The iterator runs in a worker thread (``asyncio.to_thread``); each event is
+    handed to the event loop through an ``asyncio.Queue`` and yielded in order.
+    An ``Exception`` raised while iterating is re-raised to the consumer.
+
+    When the consumer stops early (``break``/``aclose()``, cancellation, or an
+    error raised mid-stream) the worker is told to stop: it quits as soon as
+    its next event arrives instead of draining the whole response.
+    """
+    loop = asyncio.get_running_loop()
+    queue: asyncio.Queue[Any] = asyncio.Queue()
+    sentinel = object()  # end-of-stream marker; cannot collide with an event
+    stop_requested = threading.Event()
+
+    def worker():
+        try:
+            for event in client.generate(**kwargs):
+                if stop_requested.is_set():
+                    # Nobody is reading any more: stop pulling events.
+                    break
+                loop.call_soon_threadsafe(queue.put_nowait, event)
+        except Exception as exc:
+            loop.call_soon_threadsafe(queue.put_nowait, exc)
+        finally:
+            loop.call_soon_threadsafe(queue.put_nowait, sentinel)
+
+    worker_task = asyncio.create_task(asyncio.to_thread(worker))
+    try:
+        while True:
+            item = await queue.get()
+            if item is sentinel:
+                break
+            if isinstance(item, Exception):
+                raise item
+            yield item
+    finally:
+        # Signal before waiting so the thread exits after its current read
+        # rather than after the whole stream has been consumed.
+        stop_requested.set()
+        await worker_task
diff --git a/servers/fastapi/utils/set_env.py b/servers/fastapi/utils/set_env.py
index 1a367735..18456d8e 100644
--- a/servers/fastapi/utils/set_env.py
+++ b/servers/fastapi/utils/set_env.py
@@ -73,10 +73,6 @@ def set_disable_image_generation_env(value):
     os.environ["DISABLE_IMAGE_GENERATION"] = value
 
 
-def set_tool_calls_env(value):
-    os.environ["TOOL_CALLS"] = value
-
-
 def set_disable_thinking_env(value):
     os.environ["DISABLE_THINKING"] = value
 
diff --git a/servers/fastapi/utils/user_config.py b/servers/fastapi/utils/user_config.py
index b7bfaab1..bc499075 100644
--- a/servers/fastapi/utils/user_config.py
+++ b/servers/fastapi/utils/user_config.py
@@ -22,7 +22,6 @@ from utils.get_env import (
     get_openai_api_key_env,
     get_openai_model_env,
     get_pexels_api_key_env,
-    get_tool_calls_env,
     get_user_config_path_env,
     get_image_provider_env,
     get_pixabay_api_key_env,
@@ -63,7 +62,6 @@ from utils.set_env import (
     set_pexels_api_key_env,
     set_image_provider_env,
     set_pixabay_api_key_env,
-    set_tool_calls_env,
     set_web_grounding_env,
     set_codex_access_token_env,
     set_codex_refresh_token_env,
@@ -118,11 +116,6 @@ def get_user_config():
         DALL_E_3_QUALITY=existing_config.DALL_E_3_QUALITY or get_dall_e_3_quality_env(),
         GPT_IMAGE_1_5_QUALITY=existing_config.GPT_IMAGE_1_5_QUALITY
         or get_gpt_image_1_5_quality_env(),
-        TOOL_CALLS=(
-            existing_config.TOOL_CALLS
-            if existing_config.TOOL_CALLS is not None
-            else (parse_bool_or_none(get_tool_calls_env()) or False)
-        ),
         DISABLE_THINKING=(
             existing_config.DISABLE_THINKING
             if existing_config.DISABLE_THINKING is not None
@@ -197,8 +190,6 @@ def update_env_with_user_config():
         set_dall_e_3_quality_env(user_config.DALL_E_3_QUALITY)
     if user_config.GPT_IMAGE_1_5_QUALITY:
         set_gpt_image_1_5_quality_env(user_config.GPT_IMAGE_1_5_QUALITY)
-    if user_config.TOOL_CALLS is not None:
-        set_tool_calls_env(str(user_config.TOOL_CALLS))
     if user_config.DISABLE_THINKING is not None:
         set_disable_thinking_env(str(user_config.DISABLE_THINKING))
     if user_config.EXTENDED_REASONING is not None:
diff --git a/servers/fastapi/uv.lock b/servers/fastapi/uv.lock
index 2e7c3f0e..e0ded891 100644
--- a/servers/fastapi/uv.lock
+++ b/servers/fastapi/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 3
+revision = 2
 requires-python = "==3.11.*"
 
 [[package]]
@@ -238,6 +238,34 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ea/44/b749f8777b020b420bceaaf60f66432fc30cc904ca5b69640ec9cbef11ed/blis-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:27f82b8633030f8d095d2b412dffa7eb6dbc8ee43813139909a20012e54422ea", size = 6171233, upload-time = "2025-11-17T12:27:41.921Z" },
 ]
 
+[[package]]
+name = "boto3"
+version = "1.42.94"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "botocore" },
+    { name = "jmespath" },
+    { name = "s3transfer" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6a/6a/95302333208830de932ad1d0b69599ee13e936349a44981fb72632507861/boto3-1.42.94.tar.gz", hash = "sha256:5b6056a661c19e974aaea3cb97690ddbe30d10c31e4f887df3bff06574f34510", size = 113211, upload-time = "2026-04-22T20:36:19.167Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c4/6f/4e175604f3168befcb413c95bf45eada67d12042f92f76a9305d6a817ea9/boto3-1.42.94-py3-none-any.whl", hash = "sha256:56d53bce75629cc7c78a32da8b62de74cee3e2a3d54a2b60ba1a65f9f1b129da", size = 140555, upload-time = "2026-04-22T20:36:16.182Z" },
+]
+
+[[package]]
+name = "botocore"
+version = "1.42.94"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "jmespath" },
+    { name = "python-dateutil" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b7/90/1a4d0e81b325d38e37f81d907ceacac3b8f509ad38b495bb95086ecb609d/botocore-1.42.94.tar.gz", hash = "sha256:41c6b3b11b073221a41f52b222ba387be34459fb77cdc506e8b74cdaf24bdcce", size = 15260901, upload-time = "2026-04-22T20:36:00.853Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/61/73/313af9ee02ac0155247bcf3f04fcf54fcae2e33250bb437528c18aeefd81/botocore-1.42.94-py3-none-any.whl", hash = "sha256:a2143742132ed0f6cdb90204d667b89d0301068b1045e8bc099efa267bf1b348", size = 14942938, upload-time = "2026-04-22T20:35:55.663Z" },
+]
+
 [[package]]
 name = "cachetools"
 version = "7.0.6"
@@ -783,7 +811,9 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fb/c6/dba32cab7e3a625b011aa5647486e2d28423a48845a2998c126dd69c85e1/greenlet-3.4.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:805bebb4945094acbab757d34d6e1098be6de8966009ab9ca54f06ff492def58", size = 285504, upload-time = "2026-04-08T15:52:14.071Z" },
     { url = "https://files.pythonhosted.org/packages/54/f4/7cb5c2b1feb9a1f50e038be79980dfa969aa91979e5e3a18fdbcfad2c517/greenlet-3.4.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:439fc2f12b9b512d9dfa681c5afe5f6b3232c708d13e6f02c845e0d9f4c2d8c6", size = 605476, upload-time = "2026-04-08T16:24:37.064Z" },
     { url = "https://files.pythonhosted.org/packages/d6/af/b66ab0b2f9a4c5a867c136bf66d9599f34f21a1bcca26a2884a29c450bd9/greenlet-3.4.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a70ed1cb0295bee1df57b63bf7f46b4e56a5c93709eea769c1fec1bb23a95875", size = 618336, upload-time = "2026-04-08T16:30:56.59Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/31/56c43d2b5de476f77d36ceeec436328533bff960a4cba9a07616e93063ab/greenlet-3.4.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8c5696c42e6bb5cfb7c6ff4453789081c66b9b91f061e5e9367fa15792644e76", size = 625045, upload-time = "2026-04-08T16:40:37.111Z" },
     { url = "https://files.pythonhosted.org/packages/e5/5c/8c5633ece6ba611d64bf2770219a98dd439921d6424e4e8cf16b0ac74ea5/greenlet-3.4.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c660bce1940a1acae5f51f0a064f1bc785d07ea16efcb4bc708090afc4d69e83", size = 613515, upload-time = "2026-04-08T15:56:32.478Z" },
+    { url = "https://files.pythonhosted.org/packages/80/ca/704d4e2c90acb8bdf7ae593f5cbc95f58e82de95cc540fb75631c1054533/greenlet-3.4.0-cp311-cp311-manylinux_2_39_riscv64.whl", hash = "sha256:89995ce5ddcd2896d89615116dd39b9703bfa0c07b583b85b89bf1b5d6eddf81", size = 419745, upload-time = "2026-04-08T16:43:04.022Z" },
     { url = "https://files.pythonhosted.org/packages/a9/df/950d15bca0d90a0e7395eb777903060504cdb509b7b705631e8fb69ff415/greenlet-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ee407d4d1ca9dc632265aee1c8732c4a2d60adff848057cdebfe5fe94eb2c8a2", size = 1574623, upload-time = "2026-04-08T16:26:18.596Z" },
     { url = "https://files.pythonhosted.org/packages/1a/e7/0839afab829fcb7333c9ff6d80c040949510055d2d4d63251f0d1c7c804e/greenlet-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:956215d5e355fffa7c021d168728321fd4d31fd730ac609b1653b450f6a4bc71", size = 1639579, upload-time = "2026-04-08T15:57:29.231Z" },
     { url = "https://files.pythonhosted.org/packages/d9/2b/b4482401e9bcaf9f5c97f67ead38db89c19520ff6d0d6699979c6efcc200/greenlet-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:5cb614ace7c27571270354e9c9f696554d073f8aa9319079dcba466bbdead711", size = 238233, upload-time = "2026-04-08T17:02:54.286Z" },
@@ -1057,6 +1087,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/99/8f/15e7741ff19e9bcd4d753f7ff22f988fd54592f134ca13701c13ea8c20e0/jiter-0.14.0-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e52c076f187405fc21523c746c04399c9af8ece566077ed147b2126f2bcba577", size = 351445, upload-time = "2026-04-10T14:28:33.093Z" },
 ]
 
+[[package]]
+name = "jmespath"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" },
+]
+
 [[package]]
 name = "joblib"
 version = "1.5.3"
@@ -1146,6 +1185,28 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/81/db/e655086b7f3a705df045bf0933bdd9c2f79bb3c97bfef1384598bb79a217/keyring-25.7.0-py3-none-any.whl", hash = "sha256:be4a0b195f149690c166e850609a477c532ddbfbaed96a404d4e43f8d5e2689f", size = 39160, upload-time = "2025-11-16T16:26:08.402Z" },
 ]
 
+[[package]]
+name = "llmai"
+version = "0.1.8"
+source = { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" }
+dependencies = [
+    { name = "anthropic" },
+    { name = "boto3" },
+    { name = "google-genai" },
+    { name = "openai" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl", hash = "sha256:c4bae504dae928e88e8437bd3e2e5eb573f459d6df9ed8fc182671ee99b3cf1b" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "anthropic", specifier = ">=0.79.0" },
+    { name = "boto3", specifier = ">=1.42.89" },
+    { name = "google-genai", specifier = ">=1.62.0" },
+    { name = "openai", specifier = ">=2.18.0" },
+]
+
 [[package]]
 name = "loguru"
 version = "0.7.3"
@@ -1604,13 +1665,13 @@ dependencies = [
     { name = "aiomysql" },
     { name = "aiosqlite" },
     { name = "alembic" },
-    { name = "anthropic" },
     { name = "asyncpg" },
     { name = "dirtyjson" },
     { name = "fastapi", extra = ["standard"] },
     { name = "fastembed-vectorstore" },
     { name = "fastmcp" },
     { name = "google-genai" },
+    { name = "llmai" },
     { name = "mem0ai", extra = ["nlp"] },
     { name = "nltk" },
     { name = "openai" },
@@ -1626,13 +1687,13 @@ requires-dist = [
     { name = "aiomysql", specifier = ">=0.2.0" },
     { name = "aiosqlite", specifier = ">=0.21.0" },
     { name = "alembic", specifier = ">=1.14.0" },
-    { name = "anthropic", specifier = ">=0.60.0" },
     { name = "asyncpg", specifier = ">=0.30.0" },
     { name = "dirtyjson", specifier = ">=1.0.8" },
     { name = "fastapi", extras = ["standard"], specifier = ">=0.116.1" },
     { name = "fastembed-vectorstore", specifier = ">=0.5.2" },
     { name = "fastmcp", specifier = ">=2.11.0" },
     { name = "google-genai", specifier = ">=1.28.0" },
+    { name = "llmai", url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" },
     { name = "mem0ai", extras = ["nlp"], specifier = ">=0.1.115" },
     { name = "nltk", specifier = ">=3.9.1" },
     { name = "openai", specifier = ">=1.98.0" },
@@ -2200,6 +2261,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" },
 ]
 
+[[package]]
+name = "s3transfer"
+version = "0.16.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "botocore" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/46/29/af14f4ef3c11a50435308660e2cc68761c9a7742475e0585cd4396b91777/s3transfer-0.16.1.tar.gz", hash = "sha256:8e424355754b9ccb32467bdc568edf55be82692ef2002d934b1311dbb3b9e524", size = 154801, upload-time = "2026-04-22T20:36:06.475Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/03/19/90d7d4ed51932c022d53f1d02d564b62d10e272692a1f9b76425c1ad2a02/s3transfer-0.16.1-py3-none-any.whl", hash = "sha256:61bcd00ccb83b21a0fe7e91a553fff9729d46c83b4e0106e7c314a733891f7c2", size = 86825, upload-time = "2026-04-22T20:36:04.992Z" },
+]
+
 [[package]]
 name = "secretstorage"
 version = "3.5.0"
diff --git a/servers/nextjs/components/CustomConfig.tsx b/servers/nextjs/components/CustomConfig.tsx
index f79557fa..9ac1cebc 100644
--- a/servers/nextjs/components/CustomConfig.tsx
+++ b/servers/nextjs/components/CustomConfig.tsx
@@ -20,7 +20,6 @@ interface CustomConfigProps {
   customLlmUrl: string;
   customLlmApiKey: string;
   customModel: string;
-  toolCalls: boolean;
   disableThinking: boolean;
   onInputChange: (value: string | boolean, field: string) => void;
 }
@@ -29,7 +28,6 @@ export default function CustomConfig({
   customLlmUrl,
   customLlmApiKey,
   customModel,
-  toolCalls,
   disableThinking,
   onInputChange,
 }: CustomConfigProps) {
@@ -165,9 +163,8 @@ export default function CustomConfig({
         <div className="mb-4">
           <div className="mb-3 p-3 bg-amber-50 border border-amber-200 rounded-lg">
             <p className="text-sm text-amber-800">
-              <strong>Important:</strong> Only models with function
-              calling capabilities (tool calls) or JSON schema support
-              will work.
+              <strong>Important:</strong> Only models with structured
+              JSON schema output support will work reliably.
             </p>
           </div>
           <label className="block text-sm font-medium text-gray-700 mb-2">
@@ -231,23 +228,6 @@ export default function CustomConfig({
           </div>
         </div>
       )}
-
-      {/* Tool Calls Toggle */}
-      <div>
-        <div className="flex items-center justify-between mb-4 bg-green-50 p-2 rounded-sm">
-          <label className="text-sm font-medium text-gray-700">
-            Use Tool Calls
-          </label>
-          <Switch
-            checked={toolCalls}
-            onCheckedChange={(checked) => onInputChange(checked, "tool_calls")}
-          />
-        </div>
-        <p className="mt-2 text-sm text-gray-500 flex items-center gap-2">
-          <span className="block w-1 h-1 rounded-full bg-gray-400"></span>
-          If enabled, Tool Calls will be used instead of JSON Schema for Structured Output.
-        </p>
-      </div>
       {/* Disable Thinking Toggle */}
       <div>
         <div className="flex items-center justify-between mb-4 bg-green-50 p-2 rounded-sm">
@@ -266,4 +246,4 @@ export default function CustomConfig({
       </div>
     </div >
   );
-} 
\ No newline at end of file
+} 
diff --git a/servers/nextjs/components/LLMSelection.tsx b/servers/nextjs/components/LLMSelection.tsx
index 32ba272f..86682ea3 100644
--- a/servers/nextjs/components/LLMSelection.tsx
+++ b/servers/nextjs/components/LLMSelection.tsx
@@ -292,7 +292,6 @@ export default function LLMProviderSelection({
               customLlmUrl={llmConfig.CUSTOM_LLM_URL || ""}
               customLlmApiKey={llmConfig.CUSTOM_LLM_API_KEY || ""}
               customModel={llmConfig.CUSTOM_MODEL || ""}
-              toolCalls={llmConfig.TOOL_CALLS || false}
               disableThinking={llmConfig.DISABLE_THINKING || false}
               onInputChange={input_field_changed}
             />
diff --git a/servers/nextjs/types/llm_config.ts b/servers/nextjs/types/llm_config.ts
index 3559b065..976b77b1 100644
--- a/servers/nextjs/types/llm_config.ts
+++ b/servers/nextjs/types/llm_config.ts
@@ -42,7 +42,6 @@ export interface LLMConfig {
   GPT_IMAGE_1_5_QUALITY?: string;
 
   // Other Configs
-  TOOL_CALLS?: boolean;
   DISABLE_THINKING?: boolean;
   EXTENDED_REASONING?: boolean;
   WEB_GROUNDING?: boolean;
diff --git a/servers/nextjs/utils/providerUtils.ts b/servers/nextjs/utils/providerUtils.ts
index da23f138..92ec57ee 100644
--- a/servers/nextjs/utils/providerUtils.ts
+++ b/servers/nextjs/utils/providerUtils.ts
@@ -46,7 +46,6 @@ export const updateLLMConfig = (
     image_provider: "IMAGE_PROVIDER",
     disable_image_generation: "DISABLE_IMAGE_GENERATION",
     use_custom_url: "USE_CUSTOM_URL",
-    tool_calls: "TOOL_CALLS",
     disable_thinking: "DISABLE_THINKING",
     extended_reasoning: "EXTENDED_REASONING",
     web_grounding: "WEB_GROUNDING",
@@ -244,4 +243,4 @@ export const pullOllamaModel = async (
       void pollOnce();
     }, 1000);
   });
-};
\ No newline at end of file
+};
diff --git a/start.js b/start.js
index 4eddda4e..6e2425dc 100644
--- a/start.js
+++ b/start.js
@@ -175,7 +175,6 @@ const setupUserConfigFromEnv = () => {
     PIXABAY_API_KEY:
       process.env.PIXABAY_API_KEY || existingConfig.PIXABAY_API_KEY,
     IMAGE_PROVIDER: process.env.IMAGE_PROVIDER || existingConfig.IMAGE_PROVIDER,
-    TOOL_CALLS: process.env.TOOL_CALLS || existingConfig.TOOL_CALLS,
     DISABLE_THINKING:
       process.env.DISABLE_THINKING || existingConfig.DISABLE_THINKING,
     EXTENDED_REASONING:

From 98d74057770834b4b7f146db6932e38cf0e471b2 Mon Sep 17 00:00:00 2001
From: sauravniraula <developmentsaurav@gmail.com>
Date: Thu, 23 Apr 2026 13:20:18 +0545
Subject: [PATCH 2/3] chore: version bump of llmai to 0.1.9

---
 servers/fastapi/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/servers/fastapi/pyproject.toml b/servers/fastapi/pyproject.toml
index 8fa45385..5eb3e431 100644
--- a/servers/fastapi/pyproject.toml
+++ b/servers/fastapi/pyproject.toml
@@ -25,7 +25,7 @@ dependencies = [
     "pdfplumber>=0.11.7",
     "python-pptx>=1.0.2",
     "sqlmodel>=0.0.24",
-    "llmai==0.1.8",
+    "llmai==0.1.9",
 ]
 
 [tool.uv]

From 1030f07ec7aaefeb10aecba1e165fc66592720f6 Mon Sep 17 00:00:00 2001
From: sauravniraula <developmentsaurav@gmail.com>
Date: Thu, 23 Apr 2026 13:33:35 +0545
Subject: [PATCH 3/3] chore: fixes chatgpt none content issue

---
 servers/fastapi/presenton_backend.egg-info/PKG-INFO |  2 +-
 .../fastapi/presenton_backend.egg-info/requires.txt |  2 +-
 servers/fastapi/pyproject.toml                      | 13 +++++++++----
 servers/fastapi/uv.lock                             |  8 ++++----
 4 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/servers/fastapi/presenton_backend.egg-info/PKG-INFO b/servers/fastapi/presenton_backend.egg-info/PKG-INFO
index c24c29ca..a2ae42d4 100644
--- a/servers/fastapi/presenton_backend.egg-info/PKG-INFO
+++ b/servers/fastapi/presenton_backend.egg-info/PKG-INFO
@@ -20,4 +20,4 @@ Requires-Dist: pathvalidate>=3.3.1
 Requires-Dist: pdfplumber>=0.11.7
 Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: sqlmodel>=0.0.24
-Requires-Dist: llmai==0.1.8
+Requires-Dist: llmai==0.1.9
diff --git a/servers/fastapi/presenton_backend.egg-info/requires.txt b/servers/fastapi/presenton_backend.egg-info/requires.txt
index 87b670ce..b7f83600 100644
--- a/servers/fastapi/presenton_backend.egg-info/requires.txt
+++ b/servers/fastapi/presenton_backend.egg-info/requires.txt
@@ -15,4 +15,4 @@ pathvalidate>=3.3.1
 pdfplumber>=0.11.7
 python-pptx>=1.0.2
 sqlmodel>=0.0.24
-llmai==0.1.8
+llmai==0.1.9
diff --git a/servers/fastapi/pyproject.toml b/servers/fastapi/pyproject.toml
index 5eb3e431..b36d123b 100644
--- a/servers/fastapi/pyproject.toml
+++ b/servers/fastapi/pyproject.toml
@@ -31,9 +31,14 @@ dependencies = [
 [tool.uv]
 index-strategy = "unsafe-best-match"
 
-[tool.uv.sources]
-llmai = { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" }
-
 [tool.setuptools.packages.find]
 where = ["."]
-include = ["api*", "enums*", "models*", "services*", "constants*", "utils*", "templates*"]
+include = [
+    "api*",
+    "enums*",
+    "models*",
+    "services*",
+    "constants*",
+    "utils*",
+    "templates*",
+]
diff --git a/servers/fastapi/uv.lock b/servers/fastapi/uv.lock
index e0ded891..0a12daf7 100644
--- a/servers/fastapi/uv.lock
+++ b/servers/fastapi/uv.lock
@@ -1187,8 +1187,8 @@ wheels = [
 
 [[package]]
 name = "llmai"
-version = "0.1.8"
-source = { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" }
+version = "0.1.9"
+source = { url = "https://files.pythonhosted.org/packages/c6/86/5dcfd77b634947cd570680b13217b40bc72cd7d9e7f04cc1a52ff5f549a0/llmai-0.1.9-py3-none-any.whl" }
 dependencies = [
     { name = "anthropic" },
     { name = "boto3" },
@@ -1196,7 +1196,7 @@ dependencies = [
     { name = "openai" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl", hash = "sha256:c4bae504dae928e88e8437bd3e2e5eb573f459d6df9ed8fc182671ee99b3cf1b" },
+    { url = "https://files.pythonhosted.org/packages/c6/86/5dcfd77b634947cd570680b13217b40bc72cd7d9e7f04cc1a52ff5f549a0/llmai-0.1.9-py3-none-any.whl", hash = "sha256:dcd94502516586bbd6394fe2c9c610941ff4c19eae0f1316825435f35134cfb4" },
 ]
 
 [package.metadata]
@@ -1693,7 +1693,7 @@ requires-dist = [
     { name = "fastembed-vectorstore", specifier = ">=0.5.2" },
     { name = "fastmcp", specifier = ">=2.11.0" },
     { name = "google-genai", specifier = ">=1.28.0" },
-    { name = "llmai", url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" },
+    { name = "llmai", url = "https://files.pythonhosted.org/packages/c6/86/5dcfd77b634947cd570680b13217b40bc72cd7d9e7f04cc1a52ff5f549a0/llmai-0.1.9-py3-none-any.whl" },
     { name = "mem0ai", extras = ["nlp"], specifier = ">=0.1.115" },
     { name = "nltk", specifier = ">=3.9.1" },
     { name = "openai", specifier = ">=1.98.0" },