Merge pull request #532 from presenton/refactor/use-llmai

refactor: use llmai as the LLM client; remove the old LLM client and tool call handler
2026-04-23 13:46:11 +05:45 · 2026-04-23 13:46:11 +05:45 · 6765897913
commit 6765897913
parent ad3b31a359 1030f07ec7
35 changed files with 701 additions and 3370 deletions
--- a/README.md
+++ b/README.md
@ -214,7 +214,6 @@ Other optional variables exist in code (for example advanced Mem0 paths, LitePar
 - **CUSTOM_LLM_URL**: OpenAI-compatible base URL if **LLM** is **custom**.
 - **CUSTOM_LLM_API_KEY**: API key if **LLM** is **custom**.
 - **CUSTOM_MODEL**: Model id if **LLM** is **custom**.
- **TOOL_CALLS**=[true/false]: If **true**, the custom LLM uses tool calls instead of JSON schema for structured output.
 - **DISABLE_THINKING**=[true/false]: If **true**, disables “thinking” on the custom LLM.
 - **WEB_GROUNDING**=[true/false]: If **true**, enables web search for OpenAI, Google, and Anthropic models.
 - **EXTENDED_REASONING**=[true/false]: Enables extended reasoning where supported by the configured stack.
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -35,7 +35,6 @@ services:
      - DALL_E_3_QUALITY=${DALL_E_3_QUALITY}
      - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY}
      - EXTENDED_REASONING=${EXTENDED_REASONING}
-      - TOOL_CALLS=${TOOL_CALLS}
      - DISABLE_THINKING=${DISABLE_THINKING}
      - WEB_GROUNDING=${WEB_GROUNDING}
      - DATABASE_URL=${DATABASE_URL}
@ -99,7 +98,6 @@ services:
      - DALL_E_3_QUALITY=${DALL_E_3_QUALITY}
      - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY}
      - EXTENDED_REASONING=${EXTENDED_REASONING}
-      - TOOL_CALLS=${TOOL_CALLS}
      - DISABLE_THINKING=${DISABLE_THINKING}
      - WEB_GROUNDING=${WEB_GROUNDING}
      - DATABASE_URL=${DATABASE_URL}
@ -158,7 +156,6 @@ services:
      - DALL_E_3_QUALITY=${DALL_E_3_QUALITY}
      - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY}
      - EXTENDED_REASONING=${EXTENDED_REASONING}
-      - TOOL_CALLS=${TOOL_CALLS}
      - DISABLE_THINKING=${DISABLE_THINKING}
      - WEB_GROUNDING=${WEB_GROUNDING}
      - DATABASE_URL=${DATABASE_URL}
@ -223,7 +220,6 @@ services:
      - DALL_E_3_QUALITY=${DALL_E_3_QUALITY}
      - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY}
      - EXTENDED_REASONING=${EXTENDED_REASONING}
-      - TOOL_CALLS=${TOOL_CALLS}
      - DISABLE_THINKING=${DISABLE_THINKING}
      - WEB_GROUNDING=${WEB_GROUNDING}
      - DATABASE_URL=${DATABASE_URL}
--- a/servers/fastapi/api/v1/ppt/endpoints/outlines.py
+++ b/servers/fastapi/api/v1/ppt/endpoints/outlines.py
@ -21,6 +21,7 @@ from services.documents_loader import DocumentsLoader
 from services.mem0_presentation_memory_service import (
    MEM0_PRESENTATION_MEMORY_SERVICE,
 )
+from utils.llm_utils import message_content_to_text
 from utils.outline_utils import (
    get_no_of_outlines_to_generate_for_n_slides,
    get_presentation_title_from_presentation_outline,
@ -85,12 +86,12 @@ async def stream_outlines(
        await MEM0_PRESENTATION_MEMORY_SERVICE.store_generation_context(
            presentation_id=presentation.id,
            system_prompt=(
-                outline_messages[0].content
+                message_content_to_text(outline_messages[0].content)
                if len(outline_messages) > 0
                else None
            ),
            user_prompt=(
-                outline_messages[1].content
+                message_content_to_text(outline_messages[1].content)
                if len(outline_messages) > 1
                else None
            ),
--- a/servers/fastapi/api/v1/ppt/endpoints/presentation.py
+++ b/servers/fastapi/api/v1/ppt/endpoints/presentation.py
@ -75,6 +75,7 @@ from utils.process_slides import (
    process_slide_and_fetch_assets,
 )
 from utils.get_layout_by_name import get_layout_by_name
+from utils.llm_utils import message_content_to_text
 from models.presentation_layout import PresentationLayoutModel
 import uuid

@ -666,12 +667,12 @@ async def generate_presentation_handler(
            await MEM0_PRESENTATION_MEMORY_SERVICE.store_generation_context(
                presentation_id=presentation_id,
                system_prompt=(
-                    outline_messages[0].content
+                    message_content_to_text(outline_messages[0].content)
                    if len(outline_messages) > 0
                    else None
                ),
                user_prompt=(
-                    outline_messages[1].content
+                    message_content_to_text(outline_messages[1].content)
                    if len(outline_messages) > 1
                    else None
                ),
--- a/servers/fastapi/models/llm_message.py
+++ b/servers/fastapi/models/llm_message.py
@ -1,59 +0,0 @@
-from typing import Any, List, Literal, Optional
-from pydantic import BaseModel
-from google.genai.types import Content as GoogleContent
-
-from models.llm_tool_call import AnthropicToolCall
-
-
-class LLMMessage(BaseModel):
-    pass
-
-
-class LLMUserMessage(LLMMessage):
-    role: Literal["user"] = "user"
-    content: str
-
-
-class LLMSystemMessage(LLMMessage):
-    role: Literal["system"] = "system"
-    content: str
-
-
-class OpenAIAssistantMessage(LLMMessage):
-    role: Literal["assistant"] = "assistant"
-    content: str | None = None
-    tool_calls: Optional[List[dict]] = None
-
-
-class GoogleAssistantMessage(LLMMessage):
-    role: Literal["assistant"] = "assistant"
-    content: GoogleContent
-
-
-class AnthropicAssistantMessage(LLMMessage):
-    role: Literal["assistant"] = "assistant"
-    content: List[AnthropicToolCall]
-
-
-class AnthropicToolCallMessage(LLMMessage):
-    type: Literal["tool_result"] = "tool_result"
-    tool_use_id: str
-    content: str
-
-
-class AnthropicUserMessage(LLMMessage):
-    role: Literal["user"] = "user"
-    content: List[AnthropicToolCallMessage]
-
-
-class OpenAIToolCallMessage(LLMMessage):
-    role: Literal["tool"] = "tool"
-    content: str
-    tool_call_id: str
-
-
-class GoogleToolCallMessage(LLMMessage):
-    role: Literal["tool"] = "tool"
-    id: Optional[str] = None
-    name: str
-    response: dict
--- a/servers/fastapi/models/llm_tool_call.py
+++ b/servers/fastapi/models/llm_tool_call.py
@ -1,30 +0,0 @@
-from typing import Literal, Optional
-from pydantic import BaseModel
-
-
-class LLMToolCall(BaseModel):
-    pass
-
-
-class OpenAIToolCallFunction(BaseModel):
-    name: str
-    arguments: str
-
-
-class OpenAIToolCall(LLMToolCall):
-    id: str
-    type: Literal["function"] = "function"
-    function: OpenAIToolCallFunction
-
-
-class GoogleToolCall(LLMToolCall):
-    id: Optional[str] = None
-    name: str
-    arguments: Optional[dict] = None
-
-
-class AnthropicToolCall(LLMToolCall):
-    type: Literal["tool_use"] = "tool_use"
-    id: str
-    name: str
-    input: object
--- a/servers/fastapi/models/llm_tools.py
+++ b/servers/fastapi/models/llm_tools.py
@ -1,29 +0,0 @@
-from typing import Any, Callable, Coroutine, Optional
-from pydantic import BaseModel, Field
-
-
-class LLMTool(BaseModel):
-    pass
-
-
-class LLMDynamicTool(LLMTool):
-    name: str
-    description: str
-    parameters: dict = {}
-    handler: Callable[..., Coroutine[Any, Any, str]]
-
-
-class SearchWebTool(LLMTool):
-    """
-    Search the web for information.
-    """
-
-    query: str = Field(description="The query to search the web for")
-
-
-class GetCurrentDatetimeTool(LLMTool):
-    """
-    Get the current datetime.
-    """
-
-    pass
--- a/servers/fastapi/models/user_config.py
+++ b/servers/fastapi/models/user_config.py
@ -46,7 +46,6 @@ class UserConfig(BaseModel):
    GPT_IMAGE_1_5_QUALITY: Optional[str] = None

    # Reasoning
-    TOOL_CALLS: Optional[bool] = None
    DISABLE_THINKING: Optional[bool] = None
    EXTENDED_REASONING: Optional[bool] = None

--- a/servers/fastapi/presenton_backend.egg-info/PKG-INFO
+++ b/servers/fastapi/presenton_backend.egg-info/PKG-INFO
@ -7,19 +7,17 @@ Requires-Dist: alembic>=1.14.0
 Requires-Dist: aiohttp>=3.12.15
 Requires-Dist: aiomysql>=0.2.0
 Requires-Dist: aiosqlite>=0.21.0
-Requires-Dist: anthropic>=0.60.0
 Requires-Dist: asyncpg>=0.30.0
-Requires-Dist: chromadb>=1.0.15
 Requires-Dist: dirtyjson>=1.0.8
 Requires-Dist: fastapi[standard]>=0.116.1
 Requires-Dist: fastembed-vectorstore>=0.5.2
 Requires-Dist: fastmcp>=2.11.0
 Requires-Dist: google-genai>=1.28.0
+Requires-Dist: mem0ai[nlp]>=0.1.115
 Requires-Dist: nltk>=3.9.1
 Requires-Dist: openai>=1.98.0
 Requires-Dist: pathvalidate>=3.3.1
 Requires-Dist: pdfplumber>=0.11.7
-Requires-Dist: pytest>=8.4.1
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: redis>=6.2.0
 Requires-Dist: sqlmodel>=0.0.24
+Requires-Dist: llmai==0.1.9
--- a/servers/fastapi/presenton_backend.egg-info/SOURCES.txt
+++ b/servers/fastapi/presenton_backend.egg-info/SOURCES.txt
@ -3,6 +3,7 @@ api/__init__.py
 api/lifespan.py
 api/main.py
 api/middlewares.py
+api/v1/auth/router.py
 api/v1/mock/router.py
 api/v1/ppt/background_tasks.py
 api/v1/ppt/router.py
@ -46,9 +47,6 @@ models/document_chunk.py
 models/generate_presentation_request.py
 models/image_prompt.py
 models/json_path_guide.py
-models/llm_message.py
-models/llm_tool_call.py
-models/llm_tools.py
 models/ollama_model_metadata.py
 models/ollama_model_status.py
 models/pptx_models.py
@ -78,7 +76,6 @@ presenton_backend.egg-info/dependency_links.txt
 presenton_backend.egg-info/requires.txt
 presenton_backend.egg-info/top_level.txt
 services/__init__.py
-services/codex_llm.py
 services/concurrent_service.py
 services/database.py
 services/document_conversion_service.py
@ -88,8 +85,7 @@ services/html_to_text_runs_service.py
 services/icon_finder_service.py
 services/image_generation_service.py
 services/liteparse_service.py
-services/llm_client.py
-services/llm_tool_calls_handler.py
+services/mem0_presentation_memory_service.py
 services/pptx_presentation_creator.py
 services/score_based_chunker.py
 services/temp_file_service.py
@ -106,7 +102,9 @@ templates/providers.py
 templates/router.py
 tests/test_gemini_schema_support.py
 tests/test_image_generation.py
+tests/test_liteparse_service.py
 tests/test_mcp_server.py
+tests/test_mem0_presentation_memory_service.py
 tests/test_openai_schema_support.py
 tests/test_pptx_creator.py
 tests/test_pptx_slides_processing.py
@ -130,7 +128,9 @@ utils/get_layout_by_name.py
 utils/image_provider.py
 utils/image_utils.py
 utils/llm_client_error_handler.py
+utils/llm_config.py
 utils/llm_provider.py
+utils/llm_utils.py
 utils/model_availability.py
 utils/ocr_language.py
 utils/ollama.py
@ -141,6 +141,7 @@ utils/ppt_utils.py
 utils/process_slides.py
 utils/schema_utils.py
 utils/set_env.py
+utils/simple_auth.py
 utils/theme_utils.py
 utils/user_config.py
 utils/validators.py
--- a/servers/fastapi/presenton_backend.egg-info/requires.txt
+++ b/servers/fastapi/presenton_backend.egg-info/requires.txt
@ -2,19 +2,17 @@ alembic>=1.14.0
 aiohttp>=3.12.15
 aiomysql>=0.2.0
 aiosqlite>=0.21.0
-anthropic>=0.60.0
 asyncpg>=0.30.0
-chromadb>=1.0.15
 dirtyjson>=1.0.8
 fastapi[standard]>=0.116.1
 fastembed-vectorstore>=0.5.2
 fastmcp>=2.11.0
 google-genai>=1.28.0
+mem0ai[nlp]>=0.1.115
 nltk>=3.9.1
 openai>=1.98.0
 pathvalidate>=3.3.1
 pdfplumber>=0.11.7
-pytest>=8.4.1
 python-pptx>=1.0.2
-redis>=6.2.0
 sqlmodel>=0.0.24
+llmai==0.1.9
--- a/servers/fastapi/pyproject.toml
+++ b/servers/fastapi/pyproject.toml
@ -12,7 +12,6 @@ dependencies = [
    "aiohttp>=3.12.15",
    "aiomysql>=0.2.0",
    "aiosqlite>=0.21.0",
-    "anthropic>=0.60.0",
    "asyncpg>=0.30.0",
    "dirtyjson>=1.0.8",
    "fastapi[standard]>=0.116.1",
@ -26,6 +25,7 @@ dependencies = [
    "pdfplumber>=0.11.7",
    "python-pptx>=1.0.2",
    "sqlmodel>=0.0.24",
+    "llmai==0.1.9",
 ]

 [tool.uv]
@ -33,4 +33,12 @@ index-strategy = "unsafe-best-match"

 [tool.setuptools.packages.find]
 where = ["."]
-include = ["api*", "enums*", "models*", "services*", "constants*", "utils*", "templates*"]
+include = [
+    "api*",
+    "enums*",
+    "models*",
+    "services*",
+    "constants*",
+    "utils*",
+    "templates*",
+]
--- a/servers/fastapi/services/codex_llm.py
+++ b/servers/fastapi/services/codex_llm.py
@ -1,431 +0,0 @@
-"""Codex (Responses API) adapter for structured and unstructured LLM calls.
-
-Stateless adapter: receives AsyncOpenAI client and tool_calls_handler at call time.
-Auth and client creation stay in LLMClient. Structure matches other providers:
-generate = call API, collect content + tool_calls, recurse on tool_calls; stream = same but yield deltas.
-
-Uses LLMToolCallsHandler directly: tools are parsed via parse_tools() in llm_client (handler supports
-Codex and returns OpenAI-style dicts); this module flattens them for the Responses API. Tool execution
-uses tool_calls_handler.handle_tool_calls_openai().
-"""
-
-import dirtyjson
-from typing import Any, AsyncGenerator, List, Optional, Union
-
-from fastapi import HTTPException
-from openai import APIStatusError, AsyncOpenAI, OpenAIError
-
-from models.llm_message import (
-    LLMMessage,
-    OpenAIAssistantMessage,
-    LLMSystemMessage,
-    LLMUserMessage,
-)
-from models.llm_tool_call import OpenAIToolCall, OpenAIToolCallFunction
-from utils.schema_utils import ensure_strict_json_schema
-
-# Responses API requires flat tool format: {"type":"function","name":...,"description":...,"parameters":...}
-RESPONSE_SCHEMA_NAME = "ResponseSchema"
-# Required tool choice for structured: force ResponseSchema (no plain-text fallback).
-STRUCTURED_TOOL_CHOICE = {"type": "function", "name": RESPONSE_SCHEMA_NAME}
-MAX_RECURSION_DEPTH = 5
-
-
-def _to_responses_tools(chat_tools: List[dict]) -> List[dict]:
-    """Convert Chat Completions tool format to flat Responses API format."""
-    result = []
-    for tool in chat_tools:
-        if tool.get("type") != "function":
-            result.append(tool)
-            continue
-        fn = tool.get("function") or tool
-        result.append({
-            "type": "function",
-            "name": fn.get("name", ""),
-            "description": fn.get("description", ""),
-            "parameters": fn.get("parameters", {}),
-        })
-    return result
-
-
-def _items_to_openai_calls(items_by_id: dict[str, dict]) -> List[OpenAIToolCall]:
-    """Build OpenAIToolCall list from Responses API output_item map."""
-    return [
-        OpenAIToolCall(
-            id=item.get("call_id", item.get("id", "")),
-            type="function",
-            function=OpenAIToolCallFunction(
-                name=item.get("name", ""),
-                arguments=item.get("arguments", "{}"),
-            ),
-        )
-        for item in items_by_id.values()
-    ]
-
-
-async def _messages_after_tool_turn(
-    messages: List[LLMMessage],
-    items_by_id: dict[str, dict],
-    tool_calls_handler: Any,
-) -> List[LLMMessage]:
-    """Handle tool calls and return messages extended with assistant turn + tool results."""
-    openai_calls = _items_to_openai_calls(items_by_id)
-    tool_call_messages = await tool_calls_handler.handle_tool_calls_openai(openai_calls)
-    return [
-        *messages,
-        OpenAIAssistantMessage(
-            role="assistant",
-            content=None,
-            tool_calls=[tc.model_dump() for tc in openai_calls],
-        ),
-        *tool_call_messages,
-    ]
-
-
-def _build_body(
-    model: str,
-    messages: List[LLMMessage],
-    tools: Optional[List[dict]] = None,
-    tool_choice: Optional[Union[str, dict]] = None,
-) -> dict:
-    """Build Responses API request body."""
-    instructions = None
-    input_messages = []
-
-    for msg in messages:
-        if isinstance(msg, LLMSystemMessage):
-            instructions = msg.content
-        elif isinstance(msg, LLMUserMessage):
-            input_messages.append({
-                "role": "user",
-                "content": [{"type": "input_text", "text": msg.content}],
-            })
-        elif isinstance(msg, OpenAIAssistantMessage):
-            text = msg.content or ""
-            if text:
-                input_messages.append({
-                    "role": "assistant",
-                    "content": [{"type": "output_text", "text": text}],
-                })
-        else:
-            text = getattr(msg, "content", "") or ""
-            if text:
-                input_messages.append({
-                    "role": "user",
-                    "content": [{"type": "input_text", "text": text}],
-                })
-
-    body: dict = {
-        "model": model,
-        "store": False,
-        "stream": True,
-        "text": {"verbosity": "medium"},
-        "include": ["reasoning.encrypted_content"],
-        "tool_choice": tool_choice if tool_choice is not None else "auto",
-        "parallel_tool_calls": True,
-    }
-    if instructions:
-        body["instructions"] = instructions
-    if input_messages:
-        body["input"] = input_messages
-    if tools:
-        body["tools"] = tools
-
-    return body
-
-
-def _event_to_dict(event: Any) -> dict:
-    """Convert SDK event to dict."""
-    if hasattr(event, "model_dump"):
-        return event.model_dump()
-    return {
-        "type": getattr(event, "type", None),
-        "delta": getattr(event, "delta", None),
-        "item": getattr(event, "item", None),
-        "message": getattr(event, "message", None),
-        "arguments": getattr(event, "arguments", None),
-        "name": getattr(event, "name", None),
-    }
-
-
-async def _stream_raw(
-    client: AsyncOpenAI,
-    model: str,
-    messages: List[LLMMessage],
-    tools: Optional[List[dict]] = None,
-    tool_choice: Optional[Union[str, dict]] = None,
-) -> AsyncGenerator[dict, None]:
-    """Yield raw SSE event dicts from Codex Responses API."""
-    body = _build_body(model, messages, tools, tool_choice=tool_choice)
-    create_kwargs = {k: v for k, v in body.items() if k != "stream"}
-
-    try:
-        stream = await client.responses.create(stream=True, **create_kwargs)
-    except (APIStatusError, OpenAIError) as e:
-        status = getattr(e, "status_code", 502)
-        detail = getattr(e, "message", str(e)) or str(e)
-        raise HTTPException(
-            status_code=status,
-            detail=f"Codex API error: {detail}"[:400],
-        ) from e
-
-    async for event in stream:
-        yield _event_to_dict(event)
-
-
-class CodexLLMAdapter:
-    """Stateless adapter for Codex Responses API. Matches other providers: generate/stream + tool recursion."""
-
-    @staticmethod
-    async def generate_codex(
-        client: AsyncOpenAI,
-        model: str,
-        messages: List[LLMMessage],
-        tool_calls_handler: Any,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> Optional[str]:
-        """Generate text; on tool_calls handle and recurse (like _generate_openai / _generate_anthropic)."""
-        print(
-            f"Codex generate: model={model} depth={depth} tools_count={len(tools) if tools else 0}"
-        )
-        responses_tools = _to_responses_tools(tools) if tools else None
-        text_parts: List[str] = []
-        tool_calls_by_id: dict[str, dict] = {}
-
-        async for event in _stream_raw(client, model, messages, responses_tools, tool_choice=None):
-            event_type = event.get("type", "")
-
-            if event_type == "response.output_text.delta":
-                delta = event.get("delta", "")
-                if delta:
-                    text_parts.append(delta)
-            elif event_type == "response.output_item.done":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call":
-                    tool_calls_by_id[item.get("call_id", item.get("id", ""))] = item
-            elif event_type in ("response.failed", "error"):
-                msg_text = event.get("message") or str(event)
-                raise HTTPException(status_code=502, detail=f"Codex error: {msg_text}")
-
-        if tool_calls_by_id and tools and depth < MAX_RECURSION_DEPTH:
-            print(
-                f"Codex generate: tool calls detected depth={depth} count={len(tool_calls_by_id)}"
-            )
-            new_messages = await _messages_after_tool_turn(
-                messages, tool_calls_by_id, tool_calls_handler
-            )
-            return await CodexLLMAdapter.generate_codex(
-                client, model, new_messages, tool_calls_handler,
-                max_tokens=max_tokens, tools=tools, depth=depth + 1,
-            )
-
-        return "".join(text_parts) or None
-
-    @staticmethod
-    async def stream_codex(
-        client: AsyncOpenAI,
-        model: str,
-        messages: List[LLMMessage],
-        tool_calls_handler: Any,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> AsyncGenerator[str, None]:
-        """Stream text deltas; on tool_calls handle and recurse (like _stream_openai)."""
-        print(
-            f"Codex stream: model={model} depth={depth} tools_count={len(tools) if tools else 0}"
-        )
-        responses_tools = _to_responses_tools(tools) if tools else None
-        tool_calls_by_id: dict[str, dict] = {}
-
-        async for event in _stream_raw(client, model, messages, responses_tools, tool_choice=None):
-            event_type = event.get("type", "")
-
-            if event_type == "response.output_text.delta":
-                delta = event.get("delta", "")
-                if delta:
-                    yield delta
-            elif event_type == "response.output_item.done":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call":
-                    tool_calls_by_id[item.get("call_id", item.get("id", ""))] = item
-            elif event_type in ("response.failed", "error"):
-                msg_text = event.get("message") or str(event)
-                raise HTTPException(status_code=502, detail=f"Codex stream error: {msg_text}")
-
-        if tool_calls_by_id and tools and depth < MAX_RECURSION_DEPTH:
-            print(
-                f"Codex stream: tool calls detected depth={depth} count={len(tool_calls_by_id)}"
-            )
-            new_messages = await _messages_after_tool_turn(
-                messages, tool_calls_by_id, tool_calls_handler
-            )
-            async for chunk in CodexLLMAdapter.stream_codex(
-                client, model, new_messages, tool_calls_handler,
-                max_tokens=max_tokens, tools=tools, depth=depth + 1,
-            ):
-                yield chunk
-
-    @staticmethod
-    async def stream_codex_structured(
-        client: AsyncOpenAI,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        tool_calls_handler: Any,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> AsyncGenerator[str, None]:
-        """Stream JSON chunks from ResponseSchema tool; recurse for other tool_calls.
-
-        Structured output is achieved by always adding an internal ResponseSchema "tool"
-        (with response_format as its parameters) and tool_choice=ResponseSchema. So
-        user_tools=0 only means no extra tools like web search; we still use the
-        ResponseSchema tool to receive the model's JSON.
-        """
-        user_tools_count = len(tools) if tools else 0
-        print(
-            f"Codex stream_structured: model={model} depth={depth} strict={strict} "
-            f"user_tools={user_tools_count} (always adding ResponseSchema tool for structured JSON)"
-        )
-        schema = ensure_strict_json_schema(response_format, path=(), root=response_format) if strict and depth == 0 else response_format
-        response_schema_tool = {
-            "type": "function",
-            "name": RESPONSE_SCHEMA_NAME,
-            "description": "Provide response to the user",
-            "parameters": schema,
-        }
-        all_tools: List[dict] = [response_schema_tool]
-        if tools:
-            all_tools.extend(_to_responses_tools(tools))
-
-        tool_calls_by_id: dict[str, dict] = {}
-        current_call_id: Optional[str] = None
-
-        async for event in _stream_raw(
-            client, model, messages, all_tools, tool_choice=STRUCTURED_TOOL_CHOICE
-        ):
-            event_type = event.get("type", "")
-
-            if event_type == "response.output_item.added":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call" and item.get("name") == RESPONSE_SCHEMA_NAME:
-                    current_call_id = item.get("call_id", item.get("id"))
-                    print(
-                        f"Codex stream_structured: ResponseSchema call started call_id={current_call_id}"
-                    )
-
-            elif event_type == "response.function_call_arguments.delta":
-                if current_call_id is not None:
-                    delta = event.get("delta", "")
-                    if delta:
-                        # Log only first few chunks to avoid log spam
-                        print(
-                            f"Codex stream_structured: ResponseSchema delta chunk len={len(delta)}"
-                        )
-                        yield delta
-
-            elif event_type == "response.function_call_arguments.done":
-                if event.get("name") == RESPONSE_SCHEMA_NAME:
-                    arguments = event.get("arguments", "")
-                    if arguments:
-                        print(
-                            f"Codex stream_structured: ResponseSchema arguments.done len={len(arguments)}"
-                        )
-                        yield arguments
-
-            elif event_type == "response.output_item.done":
-                item = event.get("item") or {}
-                if item.get("type") == "function_call":
-                    tool_calls_by_id[item.get("call_id", item.get("id", ""))] = item
-                    if item.get("name") == RESPONSE_SCHEMA_NAME:
-                        arguments = item.get("arguments", "")
-                        if arguments:
-                            print(
-                                f"Codex stream_structured: ResponseSchema output_item.done len={len(arguments)}"
-                            )
-                            yield arguments
-
-            elif event_type in ("response.failed", "error"):
-                msg_text = event.get("message") or str(event)
-                raise HTTPException(status_code=502, detail=f"Codex structured error: {msg_text}")
-
-        other_tool_calls = {
-            k: v for k, v in tool_calls_by_id.items()
-            if v.get("name") != RESPONSE_SCHEMA_NAME
-        }
-        if other_tool_calls and tools and depth < MAX_RECURSION_DEPTH:
-            print(
-                f"Codex stream_structured: recursing for non-ResponseSchema tool calls "
-                f"depth={depth} count={len(other_tool_calls)}"
-            )
-            new_messages = await _messages_after_tool_turn(
-                messages, other_tool_calls, tool_calls_handler
-            )
-            async for chunk in CodexLLMAdapter.stream_codex_structured(
-                client, model, new_messages, response_format, tool_calls_handler,
-                strict=strict, max_tokens=max_tokens, tools=tools, depth=depth + 1,
-            ):
-                yield chunk
-
-    @staticmethod
-    async def generate_codex_structured(
-        client: AsyncOpenAI,
-        model: str,
-        messages: List[LLMMessage],
-        response_format: dict,
-        tool_calls_handler: Any,
-        strict: bool = False,
-        max_tokens: Optional[int] = None,
-        tools: Optional[List[dict]] = None,
-        depth: int = 0,
-    ) -> Optional[dict]:
-        """Collect stream and parse JSON (like _generate_openai_structured)."""
-        user_tools_count = len(tools) if tools else 0
-        print(
-            f"Codex generate_structured: model={model} depth={depth} strict={strict} "
-            f"user_tools={user_tools_count} (using ResponseSchema tool for structured JSON)"
-        )
-        accumulated: List[str] = []
-        async for chunk in CodexLLMAdapter.stream_codex_structured(
-            client, model, messages, response_format, tool_calls_handler,
-            strict=strict, max_tokens=max_tokens, tools=tools, depth=depth,
-        ):
-            accumulated.append(chunk)
-
-        raw = "".join(accumulated)
-        if not raw:
-            return None
-
-        if depth == 0:
-            try:
-                parsed = dict(dirtyjson.loads(raw))
-                print(
-                    f"Codex generate_structured: parsed JSON keys={list(parsed.keys())[:8]}"
-                )
-                return parsed
-            except Exception:
-                start = raw.find("{")
-                if start >= 0:
-                    try:
-                        parsed = dict(dirtyjson.loads(raw[start:]))
-                        print(
-                            "Codex generate_structured: parsed JSON from offset "
-                            f"{start} keys={list(parsed.keys())[:8]}"
-                        )
-                        return parsed
-                    except Exception:
-                        pass
-                raise HTTPException(
-                    status_code=502,
-                    detail=(
-                        "Model did not return valid structured output (expected JSON from ResponseSchema). "
-                        "Please retry."
-                    ),
-                )
-
-        return None
--- a/servers/fastapi/services/llm_client.py
+++ b/servers/fastapi/services/llm_client.py
--- a/servers/fastapi/services/llm_tool_calls_handler.py
+++ b/servers/fastapi/services/llm_tool_calls_handler.py
@ -1,211 +0,0 @@
-import asyncio
-from datetime import datetime
-import json
-from typing import Any, Callable, Coroutine, List, Optional
-from fastapi import HTTPException
-from enums.llm_provider import LLMProvider
-from models.llm_message import (
-    AnthropicToolCallMessage,
-    GoogleToolCallMessage,
-    OpenAIToolCallMessage,
-)
-from models.llm_tool_call import AnthropicToolCall, GoogleToolCall, OpenAIToolCall
-from models.llm_tools import LLMDynamicTool, LLMTool, SearchWebTool
-from utils.schema_utils import (
-    ensure_strict_json_schema,
-    flatten_json_schema,
-    remove_titles_from_schema,
-)
-
-
-class LLMToolCallsHandler:
-    def __init__(self, client):
-        from services.llm_client import LLMClient
-
-        self.client: LLMClient = client
-
-        self.tools_map: dict[str, Callable[..., Coroutine[Any, Any, str]]] = {
-            "SearchWebTool": self.search_web_tool_call_handler,
-            "GetCurrentDatetimeTool": self.get_current_datetime_tool_call_handler,
-        }
-        self.dynamic_tools: List[LLMDynamicTool] = []
-
-    def get_tool_handler(
-        self, tool_name: str
-    ) -> Callable[..., Coroutine[Any, Any, str]]:
-        handler = self.tools_map.get(tool_name)
-        if handler:
-            return handler
-        else:
-            dynamic_tools = list(
-                filter(lambda tool: tool.name == tool_name, self.dynamic_tools)
-            )
-            if dynamic_tools:
-                return dynamic_tools[0].handler
-        raise HTTPException(status_code=500, detail=f"Tool {tool_name} not found")
-
-    def parse_tools(self, tools: Optional[List[type[LLMTool] | LLMDynamicTool]] = None):
-        if tools is None:
-            return None
-        parsed_tools = map(self.parse_tool, tools)
-        return list(parsed_tools)
-
-    def parse_tool(self, tool: type[LLMTool] | LLMDynamicTool, strict: bool = False):
-        if isinstance(tool, LLMDynamicTool):
-            self.dynamic_tools.append(tool)
-
-        match self.client.llm_provider:
-            case LLMProvider.OPENAI | LLMProvider.OLLAMA | LLMProvider.CUSTOM:
-                return self.parse_tool_openai(tool, strict)
-            case LLMProvider.ANTHROPIC:
-                return self.parse_tool_anthropic(tool)
-            case LLMProvider.GOOGLE:
-                return self.parse_tool_google(tool)
-            case _:
-                raise ValueError(
-                    f"LLM provider must be either openai, anthropic, or google"
-                )
-
-    def parse_tool_openai(
-        self, tool: type[LLMTool] | LLMDynamicTool, strict: bool = False
-    ):
-        if isinstance(tool, LLMDynamicTool):
-            name = tool.name
-            description = tool.description
-            parameters = tool.parameters
-        else:
-            name = tool.__name__
-            description = tool.__doc__ or ""
-            parameters = tool.model_json_schema()
-
-        if strict:
-            parameters = ensure_strict_json_schema(parameters, path=(), root=parameters)
-
-        return {
-            "type": "function",
-            "function": {
-                "name": name,
-                "description": description,
-                "strict": strict,
-                "parameters": parameters,
-            },
-        }
-
-    def parse_tool_google(self, tool: type[LLMTool] | LLMDynamicTool):
-        parsed = self.parse_tool_openai(tool)
-        parsed["function"]["parameters"] = (
-            remove_titles_from_schema(
-                flatten_json_schema(parsed["function"]["parameters"])
-            )
-            if parsed["function"]["parameters"]
-            else {}
-        )
-        return {
-            "name": parsed["function"]["name"],
-            "description": parsed["function"]["description"],
-            "parameters": parsed["function"]["parameters"],
-        }
-
-    def parse_tool_anthropic(self, tool: type[LLMTool] | LLMDynamicTool):
-        parsed = self.parse_tool_openai(tool)
-        input_schema = parsed["function"]["parameters"]
-        return {
-            "name": parsed["function"]["name"],
-            "description": parsed["function"]["description"],
-            "input_schema": {"type": "object"} if input_schema == {} else input_schema,
-        }
-
-    async def handle_tool_calls_openai(
-        self,
-        tool_calls: List[OpenAIToolCall],
-    ) -> List[OpenAIToolCallMessage]:
-        async_tool_calls_tasks = []
-        for tool_call in tool_calls:
-            tool_name = tool_call.function.name
-            tool_handler = self.get_tool_handler(tool_name)
-            async_tool_calls_tasks.append(tool_handler(tool_call.function.arguments))
-
-        tool_call_results: List[str] = await asyncio.gather(*async_tool_calls_tasks)
-        tool_call_messages = [
-            OpenAIToolCallMessage(
-                content=result,
-                tool_call_id=tool_call.id,
-            )
-            for tool_call, result in zip(tool_calls, tool_call_results)
-        ]
-        return tool_call_messages
-
-    async def handle_tool_calls_google(
-        self,
-        tool_calls: List[GoogleToolCall],
-    ) -> List[GoogleToolCallMessage]:
-        async_tool_calls_tasks = []
-        for tool_call in tool_calls:
-            tool_name = tool_call.name
-            tool_handler = self.get_tool_handler(tool_name)
-            async_tool_calls_tasks.append(tool_handler(json.dumps(tool_call.arguments)))
-
-        tool_call_results: List[str] = await asyncio.gather(*async_tool_calls_tasks)
-
-        tool_call_messages = [
-            GoogleToolCallMessage(
-                id=tool_call.id,
-                name=tool_call.name,
-                response={"result": result},
-            )
-            for tool_call, result in zip(tool_calls, tool_call_results)
-        ]
-        return tool_call_messages
-
-    async def handle_tool_calls_anthropic(
-        self,
-        tool_calls: List[AnthropicToolCall],
-    ) -> List[AnthropicToolCallMessage]:
-        async_tool_calls_tasks = []
-        for tool_call in tool_calls:
-            tool_name = tool_call.name
-            tool_handler = self.get_tool_handler(tool_name)
-            async_tool_calls_tasks.append(tool_handler(json.dumps(tool_call.input)))
-
-        tool_call_results: List[str] = await asyncio.gather(*async_tool_calls_tasks)
-        tool_call_messages = [
-            AnthropicToolCallMessage(
-                content=result,
-                tool_use_id=tool_call.id,
-            )
-            for tool_call, result in zip(tool_calls, tool_call_results)
-        ]
-        return tool_call_messages
-
-    # ? Tool call handlers
-    # Search web tool call handler
-    async def search_web_tool_call_handler(self, arguments: str) -> str:
-        match self.client.llm_provider:
-            case LLMProvider.OPENAI:
-                return await self.search_web_tool_call_handler_openai(arguments)
-            case LLMProvider.ANTHROPIC:
-                return await self.search_web_tool_call_handler_anthropic(arguments)
-            case LLMProvider.GOOGLE:
-                return await self.search_web_tool_call_handler_google(arguments)
-            case _:
-                return (
-                    "Web search tool call handler not implemented for this LLM provider: "
-                    + self.client.llm_provider.value
-                )
-
-    async def search_web_tool_call_handler_openai(self, arguments: str) -> str:
-        args = SearchWebTool.model_validate_json(arguments)
-        return await self.client._search_openai(args.query)
-
-    async def search_web_tool_call_handler_google(self, arguments: str) -> str:
-        args = SearchWebTool.model_validate_json(arguments)
-        return await self.client._search_google(args.query)
-
-    async def search_web_tool_call_handler_anthropic(self, arguments: str) -> str:
-        args = SearchWebTool.model_validate_json(arguments)
-        return await self.client._search_anthropic(args.query)
-
-    # Get current datetime tool call handler
-    async def get_current_datetime_tool_call_handler(self, _) -> str:
-        current_time = datetime.now()
-        return f"{current_time.strftime('%A, %B %d, %Y')} at {current_time.strftime('%I:%M:%S %p')}"
--- a/servers/fastapi/templates/providers.py
+++ b/servers/fastapi/templates/providers.py
@ -4,10 +4,17 @@ from dataclasses import dataclass
 import time
 from typing import Any, Awaitable, Callable, Optional

-from anthropic import AsyncAnthropic
 from fastapi import HTTPException
 from google import genai
 from google.genai import types as google_types
+from llmai import AnthropicClient
+from llmai.shared import (
+    AnthropicClientConfig,
+    ImageContentPart,
+    SystemMessage,
+    TextResponse,
+    UserMessage,
+)
 from openai import AsyncOpenAI

 from enums.llm_provider import LLMProvider
@ -160,11 +167,28 @@ def _get_google_client() -> genai.Client:
    return genai.Client(api_key=api_key)


-def _get_anthropic_client() -> AsyncAnthropic:
+def _get_anthropic_client() -> AnthropicClient:
    api_key = get_anthropic_api_key_env()
    if not api_key:
        raise HTTPException(status_code=400, detail="ANTHROPIC_API_KEY is not set")
-    return AsyncAnthropic(api_key=api_key)
+    return AnthropicClient(config=AnthropicClientConfig(api_key=api_key))
+
+
+def _read_llmai_response_text(response: Any) -> str:
+    content = getattr(response, "content", None)
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        parts: list[str] = []
+        for part in content:
+            if isinstance(part, str):
+                parts.append(part)
+                continue
+            text = getattr(part, "text", None)
+            if isinstance(text, str):
+                parts.append(text)
+        return "".join(parts)
+    return getattr(content, "text", None) or ""


 async def _call_openai_like(
@ -308,28 +332,24 @@ async def _call_anthropic(
    media_type: str = "image/png",
 ) -> str:
    client = _get_anthropic_client()
-    content = [{"type": "text", "text": user_text}]
+    content: str | list[object] = user_text
    if image_bytes:
-        content.append(
-            {
-                "type": "image",
-                "source": {
-                    "type": "base64",
-                    "media_type": media_type,
-                    "data": base64.b64encode(image_bytes).decode("utf-8"),
-                },
-            }
-        )
+        content = [
+            user_text,
+            ImageContentPart(data=image_bytes, mime_type=media_type),
+        ]

-    response = await client.messages.create(
+    response = await asyncio.to_thread(
+        client.generate,
        model=model,
+        messages=[
+            SystemMessage(content=system_prompt),
+            UserMessage(content=content),
+        ],
+        response_format=TextResponse(),
        max_tokens=8192,
-        system=system_prompt,
-        messages=[{"role": "user", "content": content}],
-    )
-    output_text = "".join(
-        block.text for block in response.content if getattr(block, "type", None) == "text"
    )
+    output_text = _read_llmai_response_text(response)
    if not output_text:
        raise HTTPException(status_code=500, detail="No output from template provider")
    return output_text
--- a/servers/fastapi/utils/available_models.py
+++ b/servers/fastapi/utils/available_models.py
@ -1,4 +1,4 @@
-from anthropic import AsyncAnthropic
+import aiohttp
 from openai import AsyncOpenAI
 from google import genai

@ -12,8 +12,21 @@ async def list_available_openai_compatible_models(url: str, api_key: str) -> lis


 async def list_available_anthropic_models(api_key: str) -> list[str]:
-    client = AsyncAnthropic(api_key=api_key)
-    return list(map(lambda x: x.id, (await client.models.list(limit=50)).data))
+    async with aiohttp.ClientSession(
+        headers={
+            "x-api-key": api_key,
+            "anthropic-version": "2023-06-01",
+        }
+    ) as session:
+        async with session.get(
+            "https://api.anthropic.com/v1/models",
+            params={"limit": 50},
+        ) as response:
+            response.raise_for_status()
+            data = await response.json()
+
+    models = data.get("data", [])
+    return [model.get("id") for model in models if model.get("id")]


 async def list_available_google_models(api_key: str) -> list[str]:
--- a/servers/fastapi/utils/get_env.py
+++ b/servers/fastapi/utils/get_env.py
@ -85,10 +85,6 @@ def get_pixabay_api_key_env():
    return os.getenv("PIXABAY_API_KEY")


-def get_tool_calls_env():
-    return os.getenv("TOOL_CALLS")
-
-
 def get_disable_thinking_env():
    return os.getenv("DISABLE_THINKING")

--- a/servers/fastapi/utils/llm_calls/edit_slide.py
+++ b/servers/fastapi/utils/llm_calls/edit_slide.py
@ -1,10 +1,14 @@
+import asyncio
 from datetime import datetime
 from typing import Optional
-from models.llm_message import LLMSystemMessage, LLMUserMessage
+from fastapi import HTTPException
+from llmai import get_client
+from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage
 from models.presentation_layout import SlideLayoutModel
 from models.sql.slide import SlideModel
-from services.llm_client import LLMClient
+from utils.llm_config import get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
+from utils.llm_utils import extract_structured_content, get_generate_kwargs
 from utils.llm_provider import get_model
 from utils.schema_utils import add_field_in_schema, remove_fields_from_schema

@ -89,12 +93,12 @@ def get_messages(
    verbosity: Optional[str] = None,
    instructions: Optional[str] = None,
    memory_context: Optional[str] = None,
-):
+) -> list[Message]:
    return [
-        LLMSystemMessage(
+        SystemMessage(
            content=get_system_prompt(tone, verbosity, instructions, memory_context),
        ),
-        LLMUserMessage(
+        UserMessage(
            content=get_user_prompt(prompt, slide_data, language),
        ),
    ]
@ -128,23 +132,40 @@ async def get_edited_slide_content(
        True,
    )

-    client = LLMClient()
+    client = get_client(config=get_llm_config())
    try:
-        response = await client.generate_structured(
-            model=model,
-            messages=get_messages(
-                prompt,
-                slide.content,
-                language,
-                tone,
-                verbosity,
-                instructions,
-                memory_context,
-            ),
-            response_format=response_schema,
+        response_format = JSONSchemaResponse(
+            name="response",
+            json_schema=response_schema,
            strict=False,
        )
-        return response
+        messages = get_messages(
+            prompt,
+            slide.content,
+            language,
+            tone,
+            verbosity,
+            instructions,
+            memory_context,
+        )
+
+        for attempt in range(3):
+            response = await asyncio.to_thread(
+                client.generate,
+                **get_generate_kwargs(
+                    model=model,
+                    messages=messages,
+                    response_format=response_format,
+                ),
+            )
+            content = extract_structured_content(response.content)
+            if content is not None:
+                return content
+
+            if attempt < 2:
+                await asyncio.sleep(0.5 * (attempt + 1))
+
+        raise HTTPException(status_code=400, detail="LLM did not return any content")

    except Exception as e:
        raise handle_llm_client_exceptions(e)
--- a/servers/fastapi/utils/llm_calls/edit_slide_html.py
+++ b/servers/fastapi/utils/llm_calls/edit_slide_html.py
@ -1,7 +1,11 @@
+import asyncio
 from typing import Optional
-from models.llm_message import LLMSystemMessage, LLMUserMessage
-from services.llm_client import LLMClient
+from fastapi import HTTPException
+from llmai import get_client
+from llmai.shared import SystemMessage, UserMessage
+from utils.llm_config import get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
+from utils.llm_utils import extract_text, get_generate_kwargs
 from utils.llm_provider import get_model

 system_prompt = """
@ -59,18 +63,24 @@ async def get_edited_slide_html(
 ):
    model = get_model()

-    client = LLMClient()
+    client = get_client(config=get_llm_config())
    try:
-        response = await client.generate(
-            model=model,
-            messages=[
-                LLMSystemMessage(content=system_prompt),
-                LLMUserMessage(
-                    content=get_user_prompt(prompt, html, memory_context)
-                ),
-            ],
+        response = await asyncio.to_thread(
+            client.generate,
+            **get_generate_kwargs(
+                model=model,
+                messages=[
+                    SystemMessage(content=system_prompt),
+                    UserMessage(
+                        content=get_user_prompt(prompt, html, memory_context)
+                    ),
+                ],
+            ),
        )
-        return extract_html_from_response(response) or html
+        response_text = extract_text(response.content)
+        if response_text is None:
+            raise HTTPException(status_code=400, detail="LLM did not return any content")
+        return extract_html_from_response(response_text) or html
    except Exception as e:
        raise handle_llm_client_exceptions(e)

--- a/servers/fastapi/utils/llm_calls/generate_presentation_outlines.py
+++ b/servers/fastapi/utils/llm_calls/generate_presentation_outlines.py
@ -1,14 +1,26 @@
 from datetime import datetime
 from typing import Optional

-from enums.llm_provider import LLMProvider
-from models.llm_message import LLMSystemMessage, LLMUserMessage
+from llmai import get_client
+from llmai.shared import (
+    JSONSchemaResponse,
+    Message,
+    ResponseStreamCompletionChunk,
+    SystemMessage,
+    UserMessage,
+    WebSearchTool,
+)
+
 from models.presentation_outline_model import PresentationOutlineModel
-from models.llm_tools import SearchWebTool
-from services.llm_client import LLMClient
 from utils.get_dynamic_models import get_presentation_outline_model_with_n_slides
+from utils.llm_config import enable_web_grounding, get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
 from utils.llm_provider import get_model
+from utils.llm_utils import (
+    get_generate_kwargs,
+    serialize_structured_content,
+    stream_generate_events,
+)


 def get_system_prompt(
@ -125,9 +137,9 @@ def get_messages(
    instructions: Optional[str] = None,
    include_title_slide: bool = True,
    include_table_of_contents: bool = False,
-):
+) -> list[Message]:
    return [
-        LLMSystemMessage(
+        SystemMessage(
            content=get_system_prompt(
                tone,
                verbosity,
@ -136,7 +148,7 @@ def get_messages(
                include_table_of_contents,
            ),
        ),
-        LLMUserMessage(
+        UserMessage(
            content=get_user_prompt(
                content,
                n_slides,
@ -170,36 +182,47 @@ async def generate_ppt_outline(
        else PresentationOutlineModel
    )

-    client = LLMClient()
-    providers_with_search_tool = {
-        LLMProvider.OPENAI,
-        LLMProvider.ANTHROPIC,
-        LLMProvider.GOOGLE,
-    }
-    use_search_tool = (
-        web_search
-        and client.enable_web_grounding()
-        and client.llm_provider in providers_with_search_tool
-    )
+    client = get_client(config=get_llm_config())
+    use_search_tool = web_search and enable_web_grounding()

    try:
-        async for chunk in client.stream_structured(
-            model,
-            get_messages(
-                content,
-                n_slides,
-                language,
-                additional_context,
-                tone,
-                verbosity,
-                instructions,
-                include_title_slide,
-                include_table_of_contents,
-            ),
-            response_model.model_json_schema(),
+        response_format = JSONSchemaResponse(
+            name="response",
+            json_schema=response_model.model_json_schema(),
            strict=True,
-            tools=([SearchWebTool] if use_search_tool else None),
+        )
+        emitted_content = False
+        async for event in stream_generate_events(
+            client,
+            **get_generate_kwargs(
+                model=model,
+                messages=get_messages(
+                    content,
+                    n_slides,
+                    language,
+                    additional_context,
+                    tone,
+                    verbosity,
+                    instructions,
+                    include_title_slide,
+                    include_table_of_contents,
+                ),
+                response_format=response_format,
+                tools=([WebSearchTool()] if use_search_tool else None),
+                stream=True,
+            ),
        ):
-            yield chunk
+            if getattr(event, "type", None) == "content":
+                chunk = getattr(event, "chunk", None)
+                if chunk:
+                    emitted_content = True
+                    yield chunk
+            elif (
+                isinstance(event, ResponseStreamCompletionChunk)
+                and not emitted_content
+            ):
+                final_content = serialize_structured_content(event.content)
+                if final_content:
+                    yield final_content
    except Exception as e:
        yield handle_llm_client_exceptions(e)
--- a/servers/fastapi/utils/llm_calls/generate_presentation_structure.py
+++ b/servers/fastapi/utils/llm_calls/generate_presentation_structure.py
@ -1,10 +1,14 @@
-from typing import Optional, Dict
+import asyncio
+from typing import Optional

-from models.llm_message import LLMSystemMessage, LLMUserMessage
+from fastapi import HTTPException
+from llmai import get_client
+from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage
 from models.presentation_layout import PresentationLayoutModel
 from models.presentation_outline_model import PresentationOutlineModel
-from services.llm_client import LLMClient
+from utils.llm_config import get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
+from utils.llm_utils import extract_structured_content, get_generate_kwargs
 from utils.llm_provider import get_model
 from utils.get_dynamic_models import get_presentation_structure_model_with_n_slides
 from models.presentation_structure_model import PresentationStructureModel
@ -97,19 +101,21 @@ def get_messages(
    n_slides: int,
    data: str,
    instructions: Optional[str] = None,
-):
+) -> list[Message]:
    system_prompt = GET_MESSAGES_SYSTEM_PROMPT.format(
        user_instruction_header="# User Instruction:" if instructions else "",
        n_slides=n_slides,
    )

    return [
-        LLMSystemMessage(content=system_prompt),
-        LLMUserMessage(content=(
-            f"{presentation_layout.to_string()}\n\n"
-            "--------------------------------------\n\n"
-            f"{data}"
-        )),
+        SystemMessage(content=system_prompt),
+        UserMessage(
+            content=(
+                f"{presentation_layout.to_string()}\n\n"
+                "--------------------------------------\n\n"
+                f"{data}"
+            )
+        ),
    ]


@ -118,20 +124,13 @@ def get_messages_for_slides_markdown(
    n_slides: int,
    data: str,
    instructions: Optional[str] = None,
-):
+) -> list[Message]:
    system_prompt = STRUCTURE_FROM_SLIDES_MARKDOWN_SYSTEM_PROMPT.format(
        user_instructions=instructions or "",
        presentation_layout=presentation_layout.to_string(with_schema=True),
    )

-    return [
-        LLMSystemMessage(
-            content=system_prompt
-        ),
-        LLMUserMessage(
-            content=data
-        )
-    ]
+    return [SystemMessage(content=system_prompt), UserMessage(content=data)]


 async def generate_presentation_structure(
@ -140,34 +139,50 @@ async def generate_presentation_structure(
    instructions: Optional[str] = None,
    using_slides_markdown: bool = False,
 ) -> PresentationStructureModel:
-
-    client = LLMClient()
+    client = get_client(config=get_llm_config())
    model = get_model()
    response_model = get_presentation_structure_model_with_n_slides(
        len(presentation_outline.slides)
    )

    try:
-        response = await client.generate_structured(
-            model=model,
-            messages=(
-                get_messages_for_slides_markdown(
-                    presentation_layout,
-                    len(presentation_outline.slides),
-                    presentation_outline.to_string(),
-                    instructions,
-                )
-                if using_slides_markdown
-                else get_messages(
-                    presentation_layout,
-                    len(presentation_outline.slides),
-                    presentation_outline.to_string(),
-                    instructions,
-                )
-            ),
-            response_format=response_model.model_json_schema(),
+        messages = (
+            get_messages_for_slides_markdown(
+                presentation_layout,
+                len(presentation_outline.slides),
+                presentation_outline.to_string(),
+                instructions,
+            )
+            if using_slides_markdown
+            else get_messages(
+                presentation_layout,
+                len(presentation_outline.slides),
+                presentation_outline.to_string(),
+                instructions,
+            )
+        )
+        response_format = JSONSchemaResponse(
+            name="response",
+            json_schema=response_model.model_json_schema(),
            strict=True,
        )
-        return PresentationStructureModel(**response)
+
+        for attempt in range(3):
+            response = await asyncio.to_thread(
+                client.generate,
+                **get_generate_kwargs(
+                    model=model,
+                    messages=messages,
+                    response_format=response_format,
+                ),
+            )
+            content = extract_structured_content(response.content)
+            if content is not None:
+                return PresentationStructureModel(**content)
+
+            if attempt < 2:
+                await asyncio.sleep(0.5 * (attempt + 1))
+
+        raise HTTPException(status_code=400, detail="LLM did not return any content")
    except Exception as e:
        raise handle_llm_client_exceptions(e)
--- a/servers/fastapi/utils/llm_calls/generate_slide_content.py
+++ b/servers/fastapi/utils/llm_calls/generate_slide_content.py
@ -1,11 +1,15 @@
+import asyncio
 from datetime import datetime
 import json
 from typing import Optional
-from models.llm_message import LLMSystemMessage, LLMUserMessage
+from fastapi import HTTPException
+from llmai import get_client
+from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage
 from models.presentation_layout import SlideLayoutModel
 from models.presentation_outline_model import SlideOutlineModel
-from services.llm_client import LLMClient
+from utils.llm_config import get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
+from utils.llm_utils import extract_structured_content, get_generate_kwargs
 from utils.llm_provider import get_model
 from utils.schema_utils import add_field_in_schema, remove_fields_from_schema

@ -130,10 +134,10 @@ def get_messages(
    verbosity: Optional[str] = None,
    instructions: Optional[str] = None,
    response_schema: Optional[dict] = None,
-):
+) -> list[Message]:

    return [
-        LLMSystemMessage(
+        SystemMessage(
            content=get_system_prompt(
                tone,
                verbosity,
@ -141,7 +145,7 @@ def get_messages(
                response_schema,
            ),
        ),
-        LLMUserMessage(
+        UserMessage(
            content=get_user_prompt(outline, language),
        ),
    ]
@ -155,7 +159,7 @@ async def get_slide_content_from_type_and_outline(
    verbosity: Optional[str] = None,
    instructions: Optional[str] = None,
 ):
-    client = LLMClient()
+    client = get_client(config=get_llm_config())
    model = get_model()

    response_schema = remove_fields_from_schema(
@ -175,20 +179,37 @@ async def get_slide_content_from_type_and_outline(
    )

    try:
-        response = await client.generate_structured(
-            model=model,
-            messages=get_messages(
-                outline.content,
-                language,
-                tone,
-                verbosity,
-                instructions,
-                response_schema,
-            ),
-            response_format=response_schema,
+        response_format = JSONSchemaResponse(
+            name="response",
+            json_schema=response_schema,
            strict=False,
        )
-        return response
+        messages = get_messages(
+            outline.content,
+            language,
+            tone,
+            verbosity,
+            instructions,
+            response_schema,
+        )
+
+        for attempt in range(3):
+            response = await asyncio.to_thread(
+                client.generate,
+                **get_generate_kwargs(
+                    model=model,
+                    messages=messages,
+                    response_format=response_format,
+                ),
+            )
+            content = extract_structured_content(response.content)
+            if content is not None:
+                return content
+
+            if attempt < 2:
+                await asyncio.sleep(0.5 * (attempt + 1))
+
+        raise HTTPException(status_code=400, detail="LLM did not return any content")

    except Exception as e:
        raise handle_llm_client_exceptions(e)
--- a/servers/fastapi/utils/llm_calls/select_slide_type_on_edit.py
+++ b/servers/fastapi/utils/llm_calls/select_slide_type_on_edit.py
@ -1,9 +1,13 @@
-from models.llm_message import LLMSystemMessage, LLMUserMessage
+import asyncio
+from fastapi import HTTPException
+from llmai import get_client
+from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage
 from models.presentation_layout import PresentationLayoutModel, SlideLayoutModel
 from models.slide_layout_index import SlideLayoutIndex
 from models.sql.slide import SlideModel
-from services.llm_client import LLMClient
+from utils.llm_config import get_llm_config
 from utils.llm_client_error_handler import handle_llm_client_exceptions
+from utils.llm_utils import extract_structured_content, get_generate_kwargs
 from utils.llm_provider import get_model


@ -13,7 +17,7 @@ def get_messages(
    layout: PresentationLayoutModel,
    current_slide_layout: int,
    memory_context: str = "",
-):
+) -> list[Message]:
    memory_block = (
        f"\n                # Retrieved Presentation Memory Context\n                {memory_context}\n"
        if memory_context
@ -21,7 +25,7 @@ def get_messages(
    )

    return [
-        LLMSystemMessage(
+        SystemMessage(
            content=f"""
                Select a Slide Layout index based on provided user prompt and current slide data.
                {layout.to_string()}
@ -34,7 +38,7 @@ def get_messages(
                **Go through all notes and steps and make sure they are followed, including mentioned constraints**
            """,
        ),
-        LLMUserMessage(
+        UserMessage(
            content=f"""
                - User Prompt: {prompt}
                - Current Slide Data: {slide_data}
@ -50,27 +54,43 @@ async def get_slide_layout_from_prompt(
    slide: SlideModel,
    memory_context: str = "",
 ) -> SlideLayoutModel:
-
-    client = LLMClient()
+    client = get_client(config=get_llm_config())
    model = get_model()

    slide_layout_index = layout.get_slide_layout_index(slide.layout)

    try:
-        response = await client.generate_structured(
-            model=model,
-            messages=get_messages(
-                prompt,
-                slide.content,
-                layout,
-                slide_layout_index,
-                memory_context,
-            ),
-            response_format=SlideLayoutIndex.model_json_schema(),
+        response_format = JSONSchemaResponse(
+            name="response",
+            json_schema=SlideLayoutIndex.model_json_schema(),
            strict=True,
        )
-        index = SlideLayoutIndex(**response).index
-        return layout.slides[index]
+        messages = get_messages(
+            prompt,
+            slide.content,
+            layout,
+            slide_layout_index,
+            memory_context,
+        )
+
+        for attempt in range(3):
+            response = await asyncio.to_thread(
+                client.generate,
+                **get_generate_kwargs(
+                    model=model,
+                    messages=messages,
+                    response_format=response_format,
+                ),
+            )
+            content = extract_structured_content(response.content)
+            if content is not None:
+                index = SlideLayoutIndex(**content).index
+                return layout.slides[index]
+
+            if attempt < 2:
+                await asyncio.sleep(0.5 * (attempt + 1))
+
+        raise HTTPException(status_code=400, detail="LLM did not return any content")

    except Exception as e:
        raise handle_llm_client_exceptions(e)
--- a/servers/fastapi/utils/llm_client_error_handler.py
+++ b/servers/fastapi/utils/llm_client_error_handler.py
@ -1,18 +1,19 @@
 from fastapi import HTTPException
-from anthropic import APIError as AnthropicAPIError
 from openai import APIError as OpenAIAPIError
 from google.genai.errors import APIError as GoogleAPIError
 import traceback

+from llmai.shared.errors import BaseError as LLMAIBaseError
+

 def handle_llm_client_exceptions(e: Exception) -> HTTPException:
+    """Convert an exception from an LLM call into an HTTPException.
+
+    The traceback is printed first. An HTTPException is handed back as-is,
+    an llmai ``BaseError`` keeps its own status code and message, and any
+    other exception becomes a 500 with an OpenAI-, Google- or generic
+    "LLM API error" detail. The exception is returned, not raised.
+    """
    traceback.print_exc()
+    if isinstance(e, HTTPException):
+        return e
+    if isinstance(e, LLMAIBaseError):
+        return HTTPException(status_code=e.status_code, detail=e.message)
    if isinstance(e, OpenAIAPIError):
        return HTTPException(status_code=500, detail=f"OpenAI API error: {e.message}")
    if isinstance(e, GoogleAPIError):
        return HTTPException(status_code=500, detail=f"Google API error: {e.message}")
-    if isinstance(e, AnthropicAPIError):
-        return HTTPException(
-            status_code=500, detail=f"Anthropic API error: {e.message}"
-        )
    return HTTPException(status_code=500, detail=f"LLM API error: {e}")
--- a/servers/fastapi/utils/llm_config.py
+++ b/servers/fastapi/utils/llm_config.py
@ -0,0 +1,146 @@
+import time
+from typing import Optional
+
+from fastapi import HTTPException
+from llmai.shared import (
+    AnthropicClientConfig,
+    ChatGPTClientConfig,
+    ClientConfig,
+    GoogleClientConfig,
+    OpenAIApiType,
+    OpenAIClientConfig,
+)
+
+from enums.llm_provider import LLMProvider
+from utils.get_env import (
+    get_anthropic_api_key_env,
+    get_codex_access_token_env,
+    get_codex_account_id_env,
+    get_codex_refresh_token_env,
+    get_codex_token_expires_env,
+    get_custom_llm_api_key_env,
+    get_custom_llm_url_env,
+    get_disable_thinking_env,
+    get_google_api_key_env,
+    get_ollama_url_env,
+    get_openai_api_key_env,
+    get_web_grounding_env,
+)
+from utils.llm_provider import get_llm_provider
+from utils.parsers import parse_bool_or_none
+from utils.set_env import (
+    set_codex_access_token_env,
+    set_codex_account_id_env,
+    set_codex_refresh_token_env,
+    set_codex_token_expires_env,
+)
+
+
+def enable_web_grounding() -> bool:
+    """Report whether the web-grounding setting parses as truthy.
+
+    The raw value comes from ``get_web_grounding_env()``; an unparsable,
+    missing or false value is reported as ``False``.
+    """
+    parsed_flag = parse_bool_or_none(get_web_grounding_env())
+    return parsed_flag if parsed_flag else False
+
+
+def disable_thinking() -> bool:
+    """Report whether the disable-thinking setting parses as truthy.
+
+    The raw value comes from ``get_disable_thinking_env()``; an unparsable,
+    missing or false value is reported as ``False``.
+    """
+    parsed_flag = parse_bool_or_none(get_disable_thinking_env())
+    return parsed_flag if parsed_flag else False
+
+
+def _get_codex_access_token() -> str:
+    """Return the stored Codex OAuth access token, refreshing it when it is about to expire.
+
+    A refresh is attempted only when an expiry value is stored, the current
+    time (in milliseconds) is within 60 seconds of that expiry or past it, and
+    a refresh token is available. After a successful refresh the new access
+    token, refresh token, expiry and (when one can be resolved) account id are
+    written back through the ``set_codex_*_env`` helpers. In every other case
+    the stored token is returned unchanged.
+
+    Raises:
+        HTTPException: 400 when no access token is stored.
+    """
+    stored_token = get_codex_access_token_env()
+    if not stored_token:
+        raise HTTPException(
+            status_code=400,
+            detail=(
+                "Codex OAuth access token is not set. Please authenticate via "
+                "/api/v1/ppt/codex/auth/initiate"
+            ),
+        )
+
+    raw_expiry = get_codex_token_expires_env()
+    if not raw_expiry:
+        return stored_token
+
+    try:
+        expiry_ms = int(raw_expiry)
+        current_ms = int(time.time() * 1000)
+        # Still more than a minute of validity left: nothing to do.
+        if current_ms < expiry_ms - 60_000:
+            return stored_token
+
+        refresh_token = get_codex_refresh_token_env()
+        if not refresh_token:
+            return stored_token
+
+        from utils.oauth.openai_codex import (
+            TokenSuccess,
+            get_account_id,
+            refresh_access_token,
+        )
+
+        outcome = refresh_access_token(refresh_token)
+        if not isinstance(outcome, TokenSuccess):
+            return stored_token
+
+        set_codex_access_token_env(outcome.access)
+        set_codex_refresh_token_env(outcome.refresh)
+        set_codex_token_expires_env(str(outcome.expires))
+        resolved_account = get_account_id(outcome.access)
+        if resolved_account:
+            set_codex_account_id_env(resolved_account)
+        return outcome.access
+    except (TypeError, ValueError):
+        # Best effort: on these errors fall back to the token read at the start.
+        return stored_token
+
+
+def get_llm_config() -> ClientConfig:
+    """Build the llmai client configuration for the currently selected LLM provider.
+
+    Returns:
+        The provider-specific ``ClientConfig``. Ollama and custom endpoints are
+        both described with ``OpenAIClientConfig`` plus a ``base_url``; Ollama
+        falls back to ``http://localhost:11434`` when no URL is configured.
+
+    Raises:
+        HTTPException: 400 when the API key or URL required by the provider is
+            missing, when no Codex access token is stored, or when the provider
+            is not one of the supported values.
+    """
+    llm_provider = get_llm_provider()
+
+    match llm_provider:
+        case LLMProvider.OPENAI:
+            api_key = get_openai_api_key_env()
+            if not api_key:
+                raise HTTPException(status_code=400, detail="OpenAI API Key is not set")
+            return OpenAIClientConfig(
+                api_key=api_key,
+                api_type=OpenAIApiType.RESPONSES,
+            )
+        case LLMProvider.GOOGLE:
+            api_key = get_google_api_key_env()
+            if not api_key:
+                raise HTTPException(status_code=400, detail="Google API Key is not set")
+            return GoogleClientConfig(api_key=api_key)
+        case LLMProvider.ANTHROPIC:
+            api_key = get_anthropic_api_key_env()
+            if not api_key:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Anthropic API Key is not set",
+                )
+            return AnthropicClientConfig(api_key=api_key)
+        case LLMProvider.OLLAMA:
+            # Strip a trailing slash so a configured "http://host:11434/" does
+            # not turn into "http://host:11434//v1".
+            ollama_url = (get_ollama_url_env() or "http://localhost:11434").rstrip("/")
+            return OpenAIClientConfig(
+                base_url=f"{ollama_url}/v1",
+                api_key="ollama",
+            )
+        case LLMProvider.CUSTOM:
+            base_url = get_custom_llm_url_env()
+            if not base_url:
+                raise HTTPException(
+                    status_code=400,
+                    detail="Custom LLM URL is not set",
+                )
+            return OpenAIClientConfig(
+                base_url=base_url,
+                # A literal "null" key is sent when none is configured.
+                api_key=get_custom_llm_api_key_env() or "null",
+            )
+        case LLMProvider.CODEX:
+            return ChatGPTClientConfig(
+                access_token=_get_codex_access_token(),
+                account_id=get_codex_account_id_env() or None,
+            )
+        case _:
+            raise HTTPException(
+                status_code=400,
+                detail=(
+                    "LLM Provider must be either openai, google, anthropic, "
+                    "ollama, custom, or codex"
+                ),
+            )
+
+
+def get_extra_body() -> Optional[dict]:
+    """Return the extra request body for the current provider, if any.
+
+    Only the custom provider with thinking disabled gets a payload
+    (``{"enable_thinking": False}``); every other combination yields ``None``.
+    """
+    if get_llm_provider() != LLMProvider.CUSTOM:
+        return None
+    if not disable_thinking():
+        return None
+    return {"enable_thinking": False}
--- a/servers/fastapi/utils/llm_utils.py
+++ b/servers/fastapi/utils/llm_utils.py
@ -0,0 +1,134 @@
+import asyncio
+import json
+from collections.abc import AsyncGenerator, Sequence
+from typing import Any, Optional
+
+import dirtyjson
+from llmai.shared import (
+    LLMTool,
+    Message,
+    ResponseFormat,
+    normalize_content_parts,
+)
+
+from utils.llm_config import get_extra_body
+
+
+def get_generate_kwargs(
+    model: str,
+    messages: Sequence[Message],
+    max_tokens: Optional[int] = None,
+    tools: Optional[list[LLMTool]] = None,
+    response_format: Optional[ResponseFormat] = None,
+    stream: bool = False,
+) -> dict[str, Any]:
+    """Assemble the keyword arguments for a ``client.generate`` call.
+
+    ``model``, ``messages`` (copied into a list) and ``stream`` are always
+    present. ``max_tokens`` and ``response_format`` are added when they are not
+    ``None``, ``tools`` when it is non-empty, and ``extra_body`` when
+    ``get_extra_body()`` returns a non-empty value.
+    """
+    call_kwargs: dict[str, Any] = dict(
+        model=model,
+        messages=list(messages),
+        stream=stream,
+    )
+
+    # (key, value, include?) in the order the keys must be inserted.
+    optional_entries = (
+        ("max_tokens", max_tokens, max_tokens is not None),
+        ("tools", tools, bool(tools)),
+        ("response_format", response_format, response_format is not None),
+    )
+    for key, value, include in optional_entries:
+        if include:
+            call_kwargs[key] = value
+
+    provider_extras = get_extra_body()
+    if provider_extras:
+        call_kwargs["extra_body"] = provider_extras
+
+    return call_kwargs
+
+
+def extract_text(content: Any) -> Optional[str]:
+    """Pull plain text out of a response content value.
+
+    * ``None`` yields ``None`` and a string is returned unchanged.
+    * A sequence (other than bytes/bytearray) is flattened: string items and
+      the string ``text`` attribute of other items are concatenated; an empty
+      result yields ``None``.
+    * Any other object yields its ``text`` attribute when that is a string,
+      otherwise ``None``.
+    """
+    if content is None:
+        return None
+    if isinstance(content, str):
+        return content
+
+    is_part_sequence = isinstance(content, Sequence) and not isinstance(
+        content, (bytes, bytearray)
+    )
+    if not is_part_sequence:
+        candidate = getattr(content, "text", None)
+        return candidate if isinstance(candidate, str) else None
+
+    fragments = (
+        part if isinstance(part, str) else getattr(part, "text", None)
+        for part in content
+    )
+    combined = "".join(piece for piece in fragments if isinstance(piece, str))
+    return combined or None
+
+
+def extract_structured_content(content: Any) -> Optional[dict]:
+    """Coerce a response content value into a dict, or return ``None``.
+
+    A dict is returned as-is. An object exposing ``model_dump`` is dumped in
+    JSON mode and used when the dump is a dict. Otherwise the text extracted
+    by ``extract_text`` is parsed with ``dirtyjson``; only a dict result is
+    accepted, and any parse failure yields ``None``.
+    """
+    if content is None or isinstance(content, dict):
+        return content
+
+    if hasattr(content, "model_dump"):
+        as_json = content.model_dump(mode="json")
+        if isinstance(as_json, dict):
+            return as_json
+
+    text = extract_text(content)
+    if not text:
+        return None
+
+    try:
+        decoded = dirtyjson.loads(text)
+    except Exception:
+        # Unparsable model output is treated the same as no structured content.
+        return None
+
+    return dict(decoded) if isinstance(decoded, dict) else None
+
+
+def serialize_structured_content(content: Any) -> Optional[str]:
+    """Render a response content value as a string.
+
+    Structured content is serialized as JSON with non-ASCII characters kept
+    as-is. When nothing structured can be extracted, the plain text is
+    returned instead, or ``None`` when there is no text either.
+    """
+    structured = extract_structured_content(content)
+    if structured is not None:
+        return json.dumps(structured, ensure_ascii=False)
+
+    return extract_text(content) or None
+
+
+def message_content_to_text(content: Sequence[Any] | str | None) -> Optional[str]:
+    """Concatenate the string ``text`` of every normalized content part.
+
+    Parts come from ``normalize_content_parts(content)``; parts without a
+    string ``text`` attribute are ignored. Returns ``None`` when the result
+    is empty.
+    """
+    pieces: list[str] = []
+    for part in normalize_content_parts(content):
+        if isinstance(getattr(part, "text", None), str):
+            pieces.append(part.text)
+    return "".join(pieces) or None
+
+
+async def stream_generate_events(client: Any, **kwargs) -> AsyncGenerator[Any, None]:
+    """Expose the blocking ``client.generate(**kwargs)`` iterator as an async generator.
+
+    A worker thread iterates ``client.generate(**kwargs)`` and hands every
+    event to the event loop through an ``asyncio.Queue``. An exception raised
+    while iterating is re-raised in the consumer. An event that is itself an
+    ``Exception`` instance is raised rather than yielded.
+
+    When the consumer stops early (break, ``aclose()`` or cancellation) the
+    worker is told to stop and quits at the next event instead of draining the
+    rest of the stream, so closing does not wait for the whole stream.
+
+    Args:
+        client: Object whose ``generate(**kwargs)`` returns an iterable of events.
+        **kwargs: Forwarded unchanged to ``client.generate``.
+
+    Yields:
+        The events produced by ``client.generate`` in order.
+    """
+    import threading  # local import: only this helper needs it
+
+    loop = asyncio.get_running_loop()
+    queue: asyncio.Queue[Any] = asyncio.Queue()
+    sentinel = object()
+    stop_requested = threading.Event()
+
+    def worker():
+        try:
+            for event in client.generate(**kwargs):
+                # The consumer is gone: stop pulling from the stream.
+                if stop_requested.is_set():
+                    break
+                loop.call_soon_threadsafe(queue.put_nowait, event)
+        except Exception as exc:
+            loop.call_soon_threadsafe(queue.put_nowait, exc)
+        finally:
+            # Always signal completion so the consumer loop can terminate.
+            loop.call_soon_threadsafe(queue.put_nowait, sentinel)
+
+    worker_task = asyncio.create_task(asyncio.to_thread(worker))
+    try:
+        while True:
+            item = await queue.get()
+            if item is sentinel:
+                break
+            if isinstance(item, Exception):
+                raise item
+            yield item
+    finally:
+        # Ask the worker to stop before waiting for it.
+        stop_requested.set()
+        await worker_task
--- a/servers/fastapi/utils/set_env.py
+++ b/servers/fastapi/utils/set_env.py
@ -73,10 +73,6 @@ def set_disable_image_generation_env(value):
    os.environ["DISABLE_IMAGE_GENERATION"] = value


-def set_tool_calls_env(value):
-    os.environ["TOOL_CALLS"] = value
-
-
 def set_disable_thinking_env(value):
    os.environ["DISABLE_THINKING"] = value

--- a/servers/fastapi/utils/user_config.py
+++ b/servers/fastapi/utils/user_config.py
@ -22,7 +22,6 @@ from utils.get_env import (
    get_openai_api_key_env,
    get_openai_model_env,
    get_pexels_api_key_env,
-    get_tool_calls_env,
    get_user_config_path_env,
    get_image_provider_env,
    get_pixabay_api_key_env,
@ -63,7 +62,6 @@ from utils.set_env import (
    set_pexels_api_key_env,
    set_image_provider_env,
    set_pixabay_api_key_env,
-    set_tool_calls_env,
    set_web_grounding_env,
    set_codex_access_token_env,
    set_codex_refresh_token_env,
@ -118,11 +116,6 @@ def get_user_config():
        DALL_E_3_QUALITY=existing_config.DALL_E_3_QUALITY or get_dall_e_3_quality_env(),
        GPT_IMAGE_1_5_QUALITY=existing_config.GPT_IMAGE_1_5_QUALITY
        or get_gpt_image_1_5_quality_env(),
-        TOOL_CALLS=(
-            existing_config.TOOL_CALLS
-            if existing_config.TOOL_CALLS is not None
-            else (parse_bool_or_none(get_tool_calls_env()) or False)
-        ),
        DISABLE_THINKING=(
            existing_config.DISABLE_THINKING
            if existing_config.DISABLE_THINKING is not None
@ -197,8 +190,6 @@ def update_env_with_user_config():
        set_dall_e_3_quality_env(user_config.DALL_E_3_QUALITY)
    if user_config.GPT_IMAGE_1_5_QUALITY:
        set_gpt_image_1_5_quality_env(user_config.GPT_IMAGE_1_5_QUALITY)
-    if user_config.TOOL_CALLS is not None:
-        set_tool_calls_env(str(user_config.TOOL_CALLS))
    if user_config.DISABLE_THINKING is not None:
        set_disable_thinking_env(str(user_config.DISABLE_THINKING))
    if user_config.EXTENDED_REASONING is not None:
--- a/servers/fastapi/uv.lock
+++ b/servers/fastapi/uv.lock
@ -1,5 +1,5 @@
 version = 1
-revision = 3
+revision = 2
 requires-python = "==3.11.*"

 [[package]]
@ -238,6 +238,34 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/ea/44/b749f8777b020b420bceaaf60f66432fc30cc904ca5b69640ec9cbef11ed/blis-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:27f82b8633030f8d095d2b412dffa7eb6dbc8ee43813139909a20012e54422ea", size = 6171233, upload-time = "2025-11-17T12:27:41.921Z" },
 ]

+[[package]]
+name = "boto3"
+version = "1.42.94"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "botocore" },
+    { name = "jmespath" },
+    { name = "s3transfer" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6a/6a/95302333208830de932ad1d0b69599ee13e936349a44981fb72632507861/boto3-1.42.94.tar.gz", hash = "sha256:5b6056a661c19e974aaea3cb97690ddbe30d10c31e4f887df3bff06574f34510", size = 113211, upload-time = "2026-04-22T20:36:19.167Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c4/6f/4e175604f3168befcb413c95bf45eada67d12042f92f76a9305d6a817ea9/boto3-1.42.94-py3-none-any.whl", hash = "sha256:56d53bce75629cc7c78a32da8b62de74cee3e2a3d54a2b60ba1a65f9f1b129da", size = 140555, upload-time = "2026-04-22T20:36:16.182Z" },
+]
+
+[[package]]
+name = "botocore"
+version = "1.42.94"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "jmespath" },
+    { name = "python-dateutil" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b7/90/1a4d0e81b325d38e37f81d907ceacac3b8f509ad38b495bb95086ecb609d/botocore-1.42.94.tar.gz", hash = "sha256:41c6b3b11b073221a41f52b222ba387be34459fb77cdc506e8b74cdaf24bdcce", size = 15260901, upload-time = "2026-04-22T20:36:00.853Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/61/73/313af9ee02ac0155247bcf3f04fcf54fcae2e33250bb437528c18aeefd81/botocore-1.42.94-py3-none-any.whl", hash = "sha256:a2143742132ed0f6cdb90204d667b89d0301068b1045e8bc099efa267bf1b348", size = 14942938, upload-time = "2026-04-22T20:35:55.663Z" },
+]
+
 [[package]]
 name = "cachetools"
 version = "7.0.6"
@ -783,7 +811,9 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/fb/c6/dba32cab7e3a625b011aa5647486e2d28423a48845a2998c126dd69c85e1/greenlet-3.4.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:805bebb4945094acbab757d34d6e1098be6de8966009ab9ca54f06ff492def58", size = 285504, upload-time = "2026-04-08T15:52:14.071Z" },
    { url = "https://files.pythonhosted.org/packages/54/f4/7cb5c2b1feb9a1f50e038be79980dfa969aa91979e5e3a18fdbcfad2c517/greenlet-3.4.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:439fc2f12b9b512d9dfa681c5afe5f6b3232c708d13e6f02c845e0d9f4c2d8c6", size = 605476, upload-time = "2026-04-08T16:24:37.064Z" },
    { url = "https://files.pythonhosted.org/packages/d6/af/b66ab0b2f9a4c5a867c136bf66d9599f34f21a1bcca26a2884a29c450bd9/greenlet-3.4.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a70ed1cb0295bee1df57b63bf7f46b4e56a5c93709eea769c1fec1bb23a95875", size = 618336, upload-time = "2026-04-08T16:30:56.59Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/31/56c43d2b5de476f77d36ceeec436328533bff960a4cba9a07616e93063ab/greenlet-3.4.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8c5696c42e6bb5cfb7c6ff4453789081c66b9b91f061e5e9367fa15792644e76", size = 625045, upload-time = "2026-04-08T16:40:37.111Z" },
    { url = "https://files.pythonhosted.org/packages/e5/5c/8c5633ece6ba611d64bf2770219a98dd439921d6424e4e8cf16b0ac74ea5/greenlet-3.4.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c660bce1940a1acae5f51f0a064f1bc785d07ea16efcb4bc708090afc4d69e83", size = 613515, upload-time = "2026-04-08T15:56:32.478Z" },
+    { url = "https://files.pythonhosted.org/packages/80/ca/704d4e2c90acb8bdf7ae593f5cbc95f58e82de95cc540fb75631c1054533/greenlet-3.4.0-cp311-cp311-manylinux_2_39_riscv64.whl", hash = "sha256:89995ce5ddcd2896d89615116dd39b9703bfa0c07b583b85b89bf1b5d6eddf81", size = 419745, upload-time = "2026-04-08T16:43:04.022Z" },
    { url = "https://files.pythonhosted.org/packages/a9/df/950d15bca0d90a0e7395eb777903060504cdb509b7b705631e8fb69ff415/greenlet-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ee407d4d1ca9dc632265aee1c8732c4a2d60adff848057cdebfe5fe94eb2c8a2", size = 1574623, upload-time = "2026-04-08T16:26:18.596Z" },
    { url = "https://files.pythonhosted.org/packages/1a/e7/0839afab829fcb7333c9ff6d80c040949510055d2d4d63251f0d1c7c804e/greenlet-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:956215d5e355fffa7c021d168728321fd4d31fd730ac609b1653b450f6a4bc71", size = 1639579, upload-time = "2026-04-08T15:57:29.231Z" },
    { url = "https://files.pythonhosted.org/packages/d9/2b/b4482401e9bcaf9f5c97f67ead38db89c19520ff6d0d6699979c6efcc200/greenlet-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:5cb614ace7c27571270354e9c9f696554d073f8aa9319079dcba466bbdead711", size = 238233, upload-time = "2026-04-08T17:02:54.286Z" },
@ -1057,6 +1087,15 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/99/8f/15e7741ff19e9bcd4d753f7ff22f988fd54592f134ca13701c13ea8c20e0/jiter-0.14.0-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e52c076f187405fc21523c746c04399c9af8ece566077ed147b2126f2bcba577", size = 351445, upload-time = "2026-04-10T14:28:33.093Z" },
 ]

+[[package]]
+name = "jmespath"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" },
+]
+
 [[package]]
 name = "joblib"
 version = "1.5.3"
@ -1146,6 +1185,28 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/81/db/e655086b7f3a705df045bf0933bdd9c2f79bb3c97bfef1384598bb79a217/keyring-25.7.0-py3-none-any.whl", hash = "sha256:be4a0b195f149690c166e850609a477c532ddbfbaed96a404d4e43f8d5e2689f", size = 39160, upload-time = "2025-11-16T16:26:08.402Z" },
 ]

+[[package]]
+name = "llmai"
+version = "0.1.9"
+source = { url = "https://files.pythonhosted.org/packages/c6/86/5dcfd77b634947cd570680b13217b40bc72cd7d9e7f04cc1a52ff5f549a0/llmai-0.1.9-py3-none-any.whl" }
+dependencies = [
+    { name = "anthropic" },
+    { name = "boto3" },
+    { name = "google-genai" },
+    { name = "openai" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c6/86/5dcfd77b634947cd570680b13217b40bc72cd7d9e7f04cc1a52ff5f549a0/llmai-0.1.9-py3-none-any.whl", hash = "sha256:dcd94502516586bbd6394fe2c9c610941ff4c19eae0f1316825435f35134cfb4" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "anthropic", specifier = ">=0.79.0" },
+    { name = "boto3", specifier = ">=1.42.89" },
+    { name = "google-genai", specifier = ">=1.62.0" },
+    { name = "openai", specifier = ">=2.18.0" },
+]
+
 [[package]]
 name = "loguru"
 version = "0.7.3"
@ -1604,13 +1665,13 @@ dependencies = [
    { name = "aiomysql" },
    { name = "aiosqlite" },
    { name = "alembic" },
-    { name = "anthropic" },
    { name = "asyncpg" },
    { name = "dirtyjson" },
    { name = "fastapi", extra = ["standard"] },
    { name = "fastembed-vectorstore" },
    { name = "fastmcp" },
    { name = "google-genai" },
+    { name = "llmai" },
    { name = "mem0ai", extra = ["nlp"] },
    { name = "nltk" },
    { name = "openai" },
@ -1626,13 +1687,13 @@ requires-dist = [
    { name = "aiomysql", specifier = ">=0.2.0" },
    { name = "aiosqlite", specifier = ">=0.21.0" },
    { name = "alembic", specifier = ">=1.14.0" },
-    { name = "anthropic", specifier = ">=0.60.0" },
    { name = "asyncpg", specifier = ">=0.30.0" },
    { name = "dirtyjson", specifier = ">=1.0.8" },
    { name = "fastapi", extras = ["standard"], specifier = ">=0.116.1" },
    { name = "fastembed-vectorstore", specifier = ">=0.5.2" },
    { name = "fastmcp", specifier = ">=2.11.0" },
    { name = "google-genai", specifier = ">=1.28.0" },
+    { name = "llmai", url = "https://files.pythonhosted.org/packages/c6/86/5dcfd77b634947cd570680b13217b40bc72cd7d9e7f04cc1a52ff5f549a0/llmai-0.1.9-py3-none-any.whl" },
    { name = "mem0ai", extras = ["nlp"], specifier = ">=0.1.115" },
    { name = "nltk", specifier = ">=3.9.1" },
    { name = "openai", specifier = ">=1.98.0" },
@ -2200,6 +2261,18 @@ wheels = [
    { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" },
 ]

+[[package]]
+name = "s3transfer"
+version = "0.16.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "botocore" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/46/29/af14f4ef3c11a50435308660e2cc68761c9a7742475e0585cd4396b91777/s3transfer-0.16.1.tar.gz", hash = "sha256:8e424355754b9ccb32467bdc568edf55be82692ef2002d934b1311dbb3b9e524", size = 154801, upload-time = "2026-04-22T20:36:06.475Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/03/19/90d7d4ed51932c022d53f1d02d564b62d10e272692a1f9b76425c1ad2a02/s3transfer-0.16.1-py3-none-any.whl", hash = "sha256:61bcd00ccb83b21a0fe7e91a553fff9729d46c83b4e0106e7c314a733891f7c2", size = 86825, upload-time = "2026-04-22T20:36:04.992Z" },
+]
+
 [[package]]
 name = "secretstorage"
 version = "3.5.0"
--- a/servers/nextjs/components/CustomConfig.tsx
+++ b/servers/nextjs/components/CustomConfig.tsx
@ -20,7 +20,6 @@ interface CustomConfigProps {
  customLlmUrl: string;
  customLlmApiKey: string;
  customModel: string;
-  toolCalls: boolean;
  disableThinking: boolean;
  onInputChange: (value: string | boolean, field: string) => void;
 }
@ -29,7 +28,6 @@ export default function CustomConfig({
  customLlmUrl,
  customLlmApiKey,
  customModel,
-  toolCalls,
  disableThinking,
  onInputChange,
 }: CustomConfigProps) {
@ -165,9 +163,8 @@ export default function CustomConfig({
        <div className="mb-4">
          <div className="mb-3 p-3 bg-amber-50 border border-amber-200 rounded-lg">
            <p className="text-sm text-amber-800">
-              <strong>Important:</strong> Only models with function
-              calling capabilities (tool calls) or JSON schema support
-              will work.
+              <strong>Important:</strong> Only models with structured
+              JSON schema output support will work reliably.
            </p>
          </div>
          <label className="block text-sm font-medium text-gray-700 mb-2">
@ -231,23 +228,6 @@ export default function CustomConfig({
          </div>
        </div>
      )}
-
-      {/* Tool Calls Toggle */}
-      <div>
-        <div className="flex items-center justify-between mb-4 bg-green-50 p-2 rounded-sm">
-          <label className="text-sm font-medium text-gray-700">
-            Use Tool Calls
-          </label>
-          <Switch
-            checked={toolCalls}
-            onCheckedChange={(checked) => onInputChange(checked, "tool_calls")}
-          />
-        </div>
-        <p className="mt-2 text-sm text-gray-500 flex items-center gap-2">
-          <span className="block w-1 h-1 rounded-full bg-gray-400"></span>
-          If enabled, Tool Calls will be used instead of JSON Schema for Structured Output.
-        </p>
-      </div>
      {/* Disable Thinking Toggle */}
      <div>
        <div className="flex items-center justify-between mb-4 bg-green-50 p-2 rounded-sm">
@ -266,4 +246,4 @@ export default function CustomConfig({
      </div>
    </div >
  );
-} 
+} 
--- a/servers/nextjs/components/LLMSelection.tsx
+++ b/servers/nextjs/components/LLMSelection.tsx
@ -292,7 +292,6 @@ export default function LLMProviderSelection({
              customLlmUrl={llmConfig.CUSTOM_LLM_URL || ""}
              customLlmApiKey={llmConfig.CUSTOM_LLM_API_KEY || ""}
              customModel={llmConfig.CUSTOM_MODEL || ""}
-              toolCalls={llmConfig.TOOL_CALLS || false}
              disableThinking={llmConfig.DISABLE_THINKING || false}
              onInputChange={input_field_changed}
            />
--- a/servers/nextjs/types/llm_config.ts
+++ b/servers/nextjs/types/llm_config.ts
@ -42,7 +42,6 @@ export interface LLMConfig {
  GPT_IMAGE_1_5_QUALITY?: string;

  // Other Configs
-  TOOL_CALLS?: boolean;
  DISABLE_THINKING?: boolean;
  EXTENDED_REASONING?: boolean;
  WEB_GROUNDING?: boolean;
--- a/servers/nextjs/utils/providerUtils.ts
+++ b/servers/nextjs/utils/providerUtils.ts
@ -46,7 +46,6 @@ export const updateLLMConfig = (
    image_provider: "IMAGE_PROVIDER",
    disable_image_generation: "DISABLE_IMAGE_GENERATION",
    use_custom_url: "USE_CUSTOM_URL",
-    tool_calls: "TOOL_CALLS",
    disable_thinking: "DISABLE_THINKING",
    extended_reasoning: "EXTENDED_REASONING",
    web_grounding: "WEB_GROUNDING",
@ -244,4 +243,4 @@ export const pullOllamaModel = async (
      void pollOnce();
    }, 1000);
  });
-};
+};
--- a/start.js
+++ b/start.js
@ -175,7 +175,6 @@ const setupUserConfigFromEnv = () => {
    PIXABAY_API_KEY:
      process.env.PIXABAY_API_KEY || existingConfig.PIXABAY_API_KEY,
    IMAGE_PROVIDER: process.env.IMAGE_PROVIDER || existingConfig.IMAGE_PROVIDER,
-    TOOL_CALLS: process.env.TOOL_CALLS || existingConfig.TOOL_CALLS,
    DISABLE_THINKING:
      process.env.DISABLE_THINKING || existingConfig.DISABLE_THINKING,
    EXTENDED_REASONING: