From f76d17314a733240173791abdffd3741429a3ed8 Mon Sep 17 00:00:00 2001 From: sauravniraula Date: Thu, 23 Apr 2026 11:42:15 +0545 Subject: [PATCH 1/3] feat: integrates llmai instead of using old llm client and tool call handlers --- README.md | 1 - docker-compose.yml | 4 - .../fastapi/api/v1/ppt/endpoints/outlines.py | 5 +- .../api/v1/ppt/endpoints/presentation.py | 5 +- servers/fastapi/models/llm_message.py | 59 - servers/fastapi/models/llm_tool_call.py | 30 - servers/fastapi/models/llm_tools.py | 29 - servers/fastapi/models/user_config.py | 1 - .../presenton_backend.egg-info/PKG-INFO | 6 +- .../presenton_backend.egg-info/SOURCES.txt | 13 +- .../presenton_backend.egg-info/requires.txt | 6 +- servers/fastapi/pyproject.toml | 5 +- servers/fastapi/services/codex_llm.py | 431 --- servers/fastapi/services/llm_client.py | 2366 ----------------- .../services/llm_tool_calls_handler.py | 211 -- servers/fastapi/templates/providers.py | 60 +- servers/fastapi/utils/available_models.py | 19 +- servers/fastapi/utils/get_env.py | 4 - servers/fastapi/utils/llm_calls/edit_slide.py | 59 +- .../utils/llm_calls/edit_slide_html.py | 34 +- .../generate_presentation_outlines.py | 91 +- .../generate_presentation_structure.py | 95 +- .../utils/llm_calls/generate_slide_content.py | 57 +- .../llm_calls/select_slide_type_on_edit.py | 58 +- .../fastapi/utils/llm_client_error_handler.py | 11 +- servers/fastapi/utils/llm_config.py | 146 + servers/fastapi/utils/llm_utils.py | 134 + servers/fastapi/utils/set_env.py | 4 - servers/fastapi/utils/user_config.py | 9 - servers/fastapi/uv.lock | 79 +- servers/nextjs/components/CustomConfig.tsx | 26 +- servers/nextjs/components/LLMSelection.tsx | 1 - servers/nextjs/types/llm_config.ts | 1 - servers/nextjs/utils/providerUtils.ts | 3 +- start.js | 1 - 35 files changed, 695 insertions(+), 3369 deletions(-) delete mode 100644 servers/fastapi/models/llm_message.py delete mode 100644 servers/fastapi/models/llm_tool_call.py delete mode 100644 servers/fastapi/models/llm_tools.py delete mode 100644 servers/fastapi/services/codex_llm.py delete mode 100644 servers/fastapi/services/llm_client.py delete mode 100644 servers/fastapi/services/llm_tool_calls_handler.py create mode 100644 servers/fastapi/utils/llm_config.py create mode 100644 servers/fastapi/utils/llm_utils.py diff --git a/README.md b/README.md index 849708ba..3aa749d5 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,6 @@ Other optional variables exist in code (for example advanced Mem0 paths, LitePar - **CUSTOM_LLM_URL**: OpenAI-compatible base URL if **LLM** is **custom**. - **CUSTOM_LLM_API_KEY**: API key if **LLM** is **custom**. - **CUSTOM_MODEL**: Model id if **LLM** is **custom**. -- **TOOL_CALLS**=[true/false]: If **true**, the custom LLM uses tool calls instead of JSON schema for structured output. - **DISABLE_THINKING**=[true/false]: If **true**, disables “thinking” on the custom LLM. - **WEB_GROUNDING**=[true/false]: If **true**, enables web search for OpenAI, Google, and Anthropic models. - **EXTENDED_REASONING**=[true/false]: Enables extended reasoning where supported by the configured stack. diff --git a/docker-compose.yml b/docker-compose.yml index 9e883cfc..386cdde7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -35,7 +35,6 @@ services: - DALL_E_3_QUALITY=${DALL_E_3_QUALITY} - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY} - EXTENDED_REASONING=${EXTENDED_REASONING} - - TOOL_CALLS=${TOOL_CALLS} - DISABLE_THINKING=${DISABLE_THINKING} - WEB_GROUNDING=${WEB_GROUNDING} - DATABASE_URL=${DATABASE_URL} @@ -99,7 +98,6 @@ services: - DALL_E_3_QUALITY=${DALL_E_3_QUALITY} - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY} - EXTENDED_REASONING=${EXTENDED_REASONING} - - TOOL_CALLS=${TOOL_CALLS} - DISABLE_THINKING=${DISABLE_THINKING} - WEB_GROUNDING=${WEB_GROUNDING} - DATABASE_URL=${DATABASE_URL} @@ -158,7 +156,6 @@ services: - DALL_E_3_QUALITY=${DALL_E_3_QUALITY} - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY} - EXTENDED_REASONING=${EXTENDED_REASONING} - - TOOL_CALLS=${TOOL_CALLS} - DISABLE_THINKING=${DISABLE_THINKING} - WEB_GROUNDING=${WEB_GROUNDING} - DATABASE_URL=${DATABASE_URL} @@ -223,7 +220,6 @@ services: - DALL_E_3_QUALITY=${DALL_E_3_QUALITY} - GPT_IMAGE_1_5_QUALITY=${GPT_IMAGE_1_5_QUALITY} - EXTENDED_REASONING=${EXTENDED_REASONING} - - TOOL_CALLS=${TOOL_CALLS} - DISABLE_THINKING=${DISABLE_THINKING} - WEB_GROUNDING=${WEB_GROUNDING} - DATABASE_URL=${DATABASE_URL} diff --git a/servers/fastapi/api/v1/ppt/endpoints/outlines.py b/servers/fastapi/api/v1/ppt/endpoints/outlines.py index 9accde4d..7fc31468 100644 --- a/servers/fastapi/api/v1/ppt/endpoints/outlines.py +++ b/servers/fastapi/api/v1/ppt/endpoints/outlines.py @@ -21,6 +21,7 @@ from services.documents_loader import DocumentsLoader from services.mem0_presentation_memory_service import ( MEM0_PRESENTATION_MEMORY_SERVICE, ) +from utils.llm_utils import message_content_to_text from utils.outline_utils import ( get_no_of_outlines_to_generate_for_n_slides, get_presentation_title_from_presentation_outline, @@ -85,12 +86,12 @@ async def stream_outlines( await MEM0_PRESENTATION_MEMORY_SERVICE.store_generation_context( presentation_id=presentation.id, system_prompt=( - outline_messages[0].content + message_content_to_text(outline_messages[0].content) if len(outline_messages) > 0 else None ), user_prompt=( - outline_messages[1].content + message_content_to_text(outline_messages[1].content) if len(outline_messages) > 1 else None ), diff --git a/servers/fastapi/api/v1/ppt/endpoints/presentation.py b/servers/fastapi/api/v1/ppt/endpoints/presentation.py index 90933cbc..24c63fc9 100644 --- a/servers/fastapi/api/v1/ppt/endpoints/presentation.py +++ b/servers/fastapi/api/v1/ppt/endpoints/presentation.py @@ -75,6 +75,7 @@ from utils.process_slides import ( process_slide_and_fetch_assets, ) from utils.get_layout_by_name import get_layout_by_name +from utils.llm_utils import message_content_to_text from models.presentation_layout import PresentationLayoutModel import uuid @@ -666,12 +667,12 @@ async def generate_presentation_handler( await MEM0_PRESENTATION_MEMORY_SERVICE.store_generation_context( presentation_id=presentation_id, system_prompt=( - outline_messages[0].content + message_content_to_text(outline_messages[0].content) if len(outline_messages) > 0 else None ), user_prompt=( - outline_messages[1].content + message_content_to_text(outline_messages[1].content) if len(outline_messages) > 1 else None ), diff --git a/servers/fastapi/models/llm_message.py b/servers/fastapi/models/llm_message.py deleted file mode 100644 index ba1be4cf..00000000 --- a/servers/fastapi/models/llm_message.py +++ /dev/null @@ -1,59 +0,0 @@ -from typing import Any, List, Literal, Optional -from pydantic import BaseModel -from google.genai.types import Content as GoogleContent - -from models.llm_tool_call import AnthropicToolCall - - -class LLMMessage(BaseModel): - pass - - -class LLMUserMessage(LLMMessage): - role: Literal["user"] = "user" - content: str - - -class LLMSystemMessage(LLMMessage): - role: Literal["system"] = "system" - content: str - - -class OpenAIAssistantMessage(LLMMessage): - role: Literal["assistant"] = "assistant" - content: str | None = None - tool_calls: Optional[List[dict]] = None - - -class GoogleAssistantMessage(LLMMessage): - role: Literal["assistant"] = "assistant" - content: GoogleContent - - -class AnthropicAssistantMessage(LLMMessage): - role: Literal["assistant"] = "assistant" - content: List[AnthropicToolCall] - - -class AnthropicToolCallMessage(LLMMessage): - type: Literal["tool_result"] = "tool_result" - tool_use_id: str - content: str - - -class AnthropicUserMessage(LLMMessage): - role: Literal["user"] = "user" - content: List[AnthropicToolCallMessage] - - -class OpenAIToolCallMessage(LLMMessage): - role: Literal["tool"] = "tool" - content: str - tool_call_id: str - - -class GoogleToolCallMessage(LLMMessage): - role: Literal["tool"] = "tool" - id: Optional[str] = None - name: str - response: dict diff --git a/servers/fastapi/models/llm_tool_call.py b/servers/fastapi/models/llm_tool_call.py deleted file mode 100644 index d0fe7c0e..00000000 --- a/servers/fastapi/models/llm_tool_call.py +++ /dev/null @@ -1,30 +0,0 @@ -from typing import Literal, Optional -from pydantic import BaseModel - - -class LLMToolCall(BaseModel): - pass - - -class OpenAIToolCallFunction(BaseModel): - name: str - arguments: str - - -class OpenAIToolCall(LLMToolCall): - id: str - type: Literal["function"] = "function" - function: OpenAIToolCallFunction - - -class GoogleToolCall(LLMToolCall): - id: Optional[str] = None - name: str - arguments: Optional[dict] = None - - -class AnthropicToolCall(LLMToolCall): - type: Literal["tool_use"] = "tool_use" - id: str - name: str - input: object diff --git a/servers/fastapi/models/llm_tools.py b/servers/fastapi/models/llm_tools.py deleted file mode 100644 index ccf64e67..00000000 --- a/servers/fastapi/models/llm_tools.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import Any, Callable, Coroutine, Optional -from pydantic import BaseModel, Field - - -class LLMTool(BaseModel): - pass - - -class LLMDynamicTool(LLMTool): - name: str - description: str - parameters: dict = {} - handler: Callable[..., Coroutine[Any, Any, str]] - - -class SearchWebTool(LLMTool): - """ - Search the web for information. - """ - - query: str = Field(description="The query to search the web for") - - -class GetCurrentDatetimeTool(LLMTool): - """ - Get the current datetime. - """ - - pass diff --git a/servers/fastapi/models/user_config.py b/servers/fastapi/models/user_config.py index 111c585d..db41401b 100644 --- a/servers/fastapi/models/user_config.py +++ b/servers/fastapi/models/user_config.py @@ -46,7 +46,6 @@ class UserConfig(BaseModel): GPT_IMAGE_1_5_QUALITY: Optional[str] = None # Reasoning - TOOL_CALLS: Optional[bool] = None DISABLE_THINKING: Optional[bool] = None EXTENDED_REASONING: Optional[bool] = None diff --git a/servers/fastapi/presenton_backend.egg-info/PKG-INFO b/servers/fastapi/presenton_backend.egg-info/PKG-INFO index 434857bf..c24c29ca 100644 --- a/servers/fastapi/presenton_backend.egg-info/PKG-INFO +++ b/servers/fastapi/presenton_backend.egg-info/PKG-INFO @@ -7,19 +7,17 @@ Requires-Dist: alembic>=1.14.0 Requires-Dist: aiohttp>=3.12.15 Requires-Dist: aiomysql>=0.2.0 Requires-Dist: aiosqlite>=0.21.0 -Requires-Dist: anthropic>=0.60.0 Requires-Dist: asyncpg>=0.30.0 -Requires-Dist: chromadb>=1.0.15 Requires-Dist: dirtyjson>=1.0.8 Requires-Dist: fastapi[standard]>=0.116.1 Requires-Dist: fastembed-vectorstore>=0.5.2 Requires-Dist: fastmcp>=2.11.0 Requires-Dist: google-genai>=1.28.0 +Requires-Dist: mem0ai[nlp]>=0.1.115 Requires-Dist: nltk>=3.9.1 Requires-Dist: openai>=1.98.0 Requires-Dist: pathvalidate>=3.3.1 Requires-Dist: pdfplumber>=0.11.7 -Requires-Dist: pytest>=8.4.1 Requires-Dist: python-pptx>=1.0.2 -Requires-Dist: redis>=6.2.0 Requires-Dist: sqlmodel>=0.0.24 +Requires-Dist: llmai==0.1.8 diff --git a/servers/fastapi/presenton_backend.egg-info/SOURCES.txt b/servers/fastapi/presenton_backend.egg-info/SOURCES.txt index fba03e0b..376ca3dd 100644 --- a/servers/fastapi/presenton_backend.egg-info/SOURCES.txt +++ b/servers/fastapi/presenton_backend.egg-info/SOURCES.txt @@ -3,6 +3,7 @@ api/__init__.py api/lifespan.py api/main.py api/middlewares.py +api/v1/auth/router.py api/v1/mock/router.py api/v1/ppt/background_tasks.py api/v1/ppt/router.py @@ -46,9 +47,6 @@ models/document_chunk.py models/generate_presentation_request.py models/image_prompt.py models/json_path_guide.py -models/llm_message.py -models/llm_tool_call.py -models/llm_tools.py models/ollama_model_metadata.py models/ollama_model_status.py models/pptx_models.py @@ -78,7 +76,6 @@ presenton_backend.egg-info/dependency_links.txt presenton_backend.egg-info/requires.txt presenton_backend.egg-info/top_level.txt services/__init__.py -services/codex_llm.py services/concurrent_service.py services/database.py services/document_conversion_service.py @@ -88,8 +85,7 @@ services/html_to_text_runs_service.py services/icon_finder_service.py services/image_generation_service.py services/liteparse_service.py -services/llm_client.py -services/llm_tool_calls_handler.py +services/mem0_presentation_memory_service.py services/pptx_presentation_creator.py services/score_based_chunker.py services/temp_file_service.py @@ -106,7 +102,9 @@ templates/providers.py templates/router.py tests/test_gemini_schema_support.py tests/test_image_generation.py +tests/test_liteparse_service.py tests/test_mcp_server.py +tests/test_mem0_presentation_memory_service.py tests/test_openai_schema_support.py tests/test_pptx_creator.py tests/test_pptx_slides_processing.py @@ -130,7 +128,9 @@ utils/get_layout_by_name.py utils/image_provider.py utils/image_utils.py utils/llm_client_error_handler.py +utils/llm_config.py utils/llm_provider.py +utils/llm_utils.py utils/model_availability.py utils/ocr_language.py utils/ollama.py @@ -141,6 +141,7 @@ utils/ppt_utils.py utils/process_slides.py utils/schema_utils.py utils/set_env.py +utils/simple_auth.py utils/theme_utils.py utils/user_config.py utils/validators.py diff --git a/servers/fastapi/presenton_backend.egg-info/requires.txt b/servers/fastapi/presenton_backend.egg-info/requires.txt index e7bfb20e..87b670ce 100644 --- a/servers/fastapi/presenton_backend.egg-info/requires.txt +++ b/servers/fastapi/presenton_backend.egg-info/requires.txt @@ -2,19 +2,17 @@ alembic>=1.14.0 aiohttp>=3.12.15 aiomysql>=0.2.0 aiosqlite>=0.21.0 -anthropic>=0.60.0 asyncpg>=0.30.0 -chromadb>=1.0.15 dirtyjson>=1.0.8 fastapi[standard]>=0.116.1 fastembed-vectorstore>=0.5.2 fastmcp>=2.11.0 google-genai>=1.28.0 +mem0ai[nlp]>=0.1.115 nltk>=3.9.1 openai>=1.98.0 pathvalidate>=3.3.1 pdfplumber>=0.11.7 -pytest>=8.4.1 python-pptx>=1.0.2 -redis>=6.2.0 sqlmodel>=0.0.24 +llmai==0.1.8 diff --git a/servers/fastapi/pyproject.toml b/servers/fastapi/pyproject.toml index b82a1539..8fa45385 100644 --- a/servers/fastapi/pyproject.toml +++ b/servers/fastapi/pyproject.toml @@ -12,7 +12,6 @@ dependencies = [ "aiohttp>=3.12.15", "aiomysql>=0.2.0", "aiosqlite>=0.21.0", - "anthropic>=0.60.0", "asyncpg>=0.30.0", "dirtyjson>=1.0.8", "fastapi[standard]>=0.116.1", @@ -26,11 +25,15 @@ dependencies = [ "pdfplumber>=0.11.7", "python-pptx>=1.0.2", "sqlmodel>=0.0.24", + "llmai==0.1.8", ] [tool.uv] index-strategy = "unsafe-best-match" +[tool.uv.sources] +llmai = { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" } + [tool.setuptools.packages.find] where = ["."] include = ["api*", "enums*", "models*", "services*", "constants*", "utils*", "templates*"] diff --git a/servers/fastapi/services/codex_llm.py b/servers/fastapi/services/codex_llm.py deleted file mode 100644 index a94313f9..00000000 --- a/servers/fastapi/services/codex_llm.py +++ /dev/null @@ -1,431 +0,0 @@ -"""Codex (Responses API) adapter for structured and unstructured LLM calls. - -Stateless adapter: receives AsyncOpenAI client and tool_calls_handler at call time. -Auth and client creation stay in LLMClient. Structure matches other providers: -generate = call API, collect content + tool_calls, recurse on tool_calls; stream = same but yield deltas. - -Uses LLMToolCallsHandler directly: tools are parsed via parse_tools() in llm_client (handler supports -Codex and returns OpenAI-style dicts); this module flattens them for the Responses API. Tool execution -uses tool_calls_handler.handle_tool_calls_openai(). -""" - -import dirtyjson -from typing import Any, AsyncGenerator, List, Optional, Union - -from fastapi import HTTPException -from openai import APIStatusError, AsyncOpenAI, OpenAIError - -from models.llm_message import ( - LLMMessage, - OpenAIAssistantMessage, - LLMSystemMessage, - LLMUserMessage, -) -from models.llm_tool_call import OpenAIToolCall, OpenAIToolCallFunction -from utils.schema_utils import ensure_strict_json_schema - -# Responses API requires flat tool format: {"type":"function","name":...,"description":...,"parameters":...} -RESPONSE_SCHEMA_NAME = "ResponseSchema" -# Required tool choice for structured: force ResponseSchema (no plain-text fallback). -STRUCTURED_TOOL_CHOICE = {"type": "function", "name": RESPONSE_SCHEMA_NAME} -MAX_RECURSION_DEPTH = 5 - - -def _to_responses_tools(chat_tools: List[dict]) -> List[dict]: - """Convert Chat Completions tool format to flat Responses API format.""" - result = [] - for tool in chat_tools: - if tool.get("type") != "function": - result.append(tool) - continue - fn = tool.get("function") or tool - result.append({ - "type": "function", - "name": fn.get("name", ""), - "description": fn.get("description", ""), - "parameters": fn.get("parameters", {}), - }) - return result - - -def _items_to_openai_calls(items_by_id: dict[str, dict]) -> List[OpenAIToolCall]: - """Build OpenAIToolCall list from Responses API output_item map.""" - return [ - OpenAIToolCall( - id=item.get("call_id", item.get("id", "")), - type="function", - function=OpenAIToolCallFunction( - name=item.get("name", ""), - arguments=item.get("arguments", "{}"), - ), - ) - for item in items_by_id.values() - ] - - -async def _messages_after_tool_turn( - messages: List[LLMMessage], - items_by_id: dict[str, dict], - tool_calls_handler: Any, -) -> List[LLMMessage]: - """Handle tool calls and return messages extended with assistant turn + tool results.""" - openai_calls = _items_to_openai_calls(items_by_id) - tool_call_messages = await tool_calls_handler.handle_tool_calls_openai(openai_calls) - return [ - *messages, - OpenAIAssistantMessage( - role="assistant", - content=None, - tool_calls=[tc.model_dump() for tc in openai_calls], - ), - *tool_call_messages, - ] - - -def _build_body( - model: str, - messages: List[LLMMessage], - tools: Optional[List[dict]] = None, - tool_choice: Optional[Union[str, dict]] = None, -) -> dict: - """Build Responses API request body.""" - instructions = None - input_messages = [] - - for msg in messages: - if isinstance(msg, LLMSystemMessage): - instructions = msg.content - elif isinstance(msg, LLMUserMessage): - input_messages.append({ - "role": "user", - "content": [{"type": "input_text", "text": msg.content}], - }) - elif isinstance(msg, OpenAIAssistantMessage): - text = msg.content or "" - if text: - input_messages.append({ - "role": "assistant", - "content": [{"type": "output_text", "text": text}], - }) - else: - text = getattr(msg, "content", "") or "" - if text: - input_messages.append({ - "role": "user", - "content": [{"type": "input_text", "text": text}], - }) - - body: dict = { - "model": model, - "store": False, - "stream": True, - "text": {"verbosity": "medium"}, - "include": ["reasoning.encrypted_content"], - "tool_choice": tool_choice if tool_choice is not None else "auto", - "parallel_tool_calls": True, - } - if instructions: - body["instructions"] = instructions - if input_messages: - body["input"] = input_messages - if tools: - body["tools"] = tools - - return body - - -def _event_to_dict(event: Any) -> dict: - """Convert SDK event to dict.""" - if hasattr(event, "model_dump"): - return event.model_dump() - return { - "type": getattr(event, "type", None), - "delta": getattr(event, "delta", None), - "item": getattr(event, "item", None), - "message": getattr(event, "message", None), - "arguments": getattr(event, "arguments", None), - "name": getattr(event, "name", None), - } - - -async def _stream_raw( - client: AsyncOpenAI, - model: str, - messages: List[LLMMessage], - tools: Optional[List[dict]] = None, - tool_choice: Optional[Union[str, dict]] = None, -) -> AsyncGenerator[dict, None]: - """Yield raw SSE event dicts from Codex Responses API.""" - body = _build_body(model, messages, tools, tool_choice=tool_choice) - create_kwargs = {k: v for k, v in body.items() if k != "stream"} - - try: - stream = await client.responses.create(stream=True, **create_kwargs) - except (APIStatusError, OpenAIError) as e: - status = getattr(e, "status_code", 502) - detail = getattr(e, "message", str(e)) or str(e) - raise HTTPException( - status_code=status, - detail=f"Codex API error: {detail}"[:400], - ) from e - - async for event in stream: - yield _event_to_dict(event) - - -class CodexLLMAdapter: - """Stateless adapter for Codex Responses API. Matches other providers: generate/stream + tool recursion.""" - - @staticmethod - async def generate_codex( - client: AsyncOpenAI, - model: str, - messages: List[LLMMessage], - tool_calls_handler: Any, - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - depth: int = 0, - ) -> Optional[str]: - """Generate text; on tool_calls handle and recurse (like _generate_openai / _generate_anthropic).""" - print( - f"Codex generate: model={model} depth={depth} tools_count={len(tools) if tools else 0}" - ) - responses_tools = _to_responses_tools(tools) if tools else None - text_parts: List[str] = [] - tool_calls_by_id: dict[str, dict] = {} - - async for event in _stream_raw(client, model, messages, responses_tools, tool_choice=None): - event_type = event.get("type", "") - - if event_type == "response.output_text.delta": - delta = event.get("delta", "") - if delta: - text_parts.append(delta) - elif event_type == "response.output_item.done": - item = event.get("item") or {} - if item.get("type") == "function_call": - tool_calls_by_id[item.get("call_id", item.get("id", ""))] = item - elif event_type in ("response.failed", "error"): - msg_text = event.get("message") or str(event) - raise HTTPException(status_code=502, detail=f"Codex error: {msg_text}") - - if tool_calls_by_id and tools and depth < MAX_RECURSION_DEPTH: - print( - f"Codex generate: tool calls detected depth={depth} count={len(tool_calls_by_id)}" - ) - new_messages = await _messages_after_tool_turn( - messages, tool_calls_by_id, tool_calls_handler - ) - return await CodexLLMAdapter.generate_codex( - client, model, new_messages, tool_calls_handler, - max_tokens=max_tokens, tools=tools, depth=depth + 1, - ) - - return "".join(text_parts) or None - - @staticmethod - async def stream_codex( - client: AsyncOpenAI, - model: str, - messages: List[LLMMessage], - tool_calls_handler: Any, - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - depth: int = 0, - ) -> AsyncGenerator[str, None]: - """Stream text deltas; on tool_calls handle and recurse (like _stream_openai).""" - print( - f"Codex stream: model={model} depth={depth} tools_count={len(tools) if tools else 0}" - ) - responses_tools = _to_responses_tools(tools) if tools else None - tool_calls_by_id: dict[str, dict] = {} - - async for event in _stream_raw(client, model, messages, responses_tools, tool_choice=None): - event_type = event.get("type", "") - - if event_type == "response.output_text.delta": - delta = event.get("delta", "") - if delta: - yield delta - elif event_type == "response.output_item.done": - item = event.get("item") or {} - if item.get("type") == "function_call": - tool_calls_by_id[item.get("call_id", item.get("id", ""))] = item - elif event_type in ("response.failed", "error"): - msg_text = event.get("message") or str(event) - raise HTTPException(status_code=502, detail=f"Codex stream error: {msg_text}") - - if tool_calls_by_id and tools and depth < MAX_RECURSION_DEPTH: - print( - f"Codex stream: tool calls detected depth={depth} count={len(tool_calls_by_id)}" - ) - new_messages = await _messages_after_tool_turn( - messages, tool_calls_by_id, tool_calls_handler - ) - async for chunk in CodexLLMAdapter.stream_codex( - client, model, new_messages, tool_calls_handler, - max_tokens=max_tokens, tools=tools, depth=depth + 1, - ): - yield chunk - - @staticmethod - async def stream_codex_structured( - client: AsyncOpenAI, - model: str, - messages: List[LLMMessage], - response_format: dict, - tool_calls_handler: Any, - strict: bool = False, - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - depth: int = 0, - ) -> AsyncGenerator[str, None]: - """Stream JSON chunks from ResponseSchema tool; recurse for other tool_calls. - - Structured output is achieved by always adding an internal ResponseSchema "tool" - (with response_format as its parameters) and tool_choice=ResponseSchema. So - user_tools=0 only means no extra tools like web search; we still use the - ResponseSchema tool to receive the model's JSON. - """ - user_tools_count = len(tools) if tools else 0 - print( - f"Codex stream_structured: model={model} depth={depth} strict={strict} " - f"user_tools={user_tools_count} (always adding ResponseSchema tool for structured JSON)" - ) - schema = ensure_strict_json_schema(response_format, path=(), root=response_format) if strict and depth == 0 else response_format - response_schema_tool = { - "type": "function", - "name": RESPONSE_SCHEMA_NAME, - "description": "Provide response to the user", - "parameters": schema, - } - all_tools: List[dict] = [response_schema_tool] - if tools: - all_tools.extend(_to_responses_tools(tools)) - - tool_calls_by_id: dict[str, dict] = {} - current_call_id: Optional[str] = None - - async for event in _stream_raw( - client, model, messages, all_tools, tool_choice=STRUCTURED_TOOL_CHOICE - ): - event_type = event.get("type", "") - - if event_type == "response.output_item.added": - item = event.get("item") or {} - if item.get("type") == "function_call" and item.get("name") == RESPONSE_SCHEMA_NAME: - current_call_id = item.get("call_id", item.get("id")) - print( - f"Codex stream_structured: ResponseSchema call started call_id={current_call_id}" - ) - - elif event_type == "response.function_call_arguments.delta": - if current_call_id is not None: - delta = event.get("delta", "") - if delta: - # Log only first few chunks to avoid log spam - print( - f"Codex stream_structured: ResponseSchema delta chunk len={len(delta)}" - ) - yield delta - - elif event_type == "response.function_call_arguments.done": - if event.get("name") == RESPONSE_SCHEMA_NAME: - arguments = event.get("arguments", "") - if arguments: - print( - f"Codex stream_structured: ResponseSchema arguments.done len={len(arguments)}" - ) - yield arguments - - elif event_type == "response.output_item.done": - item = event.get("item") or {} - if item.get("type") == "function_call": - tool_calls_by_id[item.get("call_id", item.get("id", ""))] = item - if item.get("name") == RESPONSE_SCHEMA_NAME: - arguments = item.get("arguments", "") - if arguments: - print( - f"Codex stream_structured: ResponseSchema output_item.done len={len(arguments)}" - ) - yield arguments - - elif event_type in ("response.failed", "error"): - msg_text = event.get("message") or str(event) - raise HTTPException(status_code=502, detail=f"Codex structured error: {msg_text}") - - other_tool_calls = { - k: v for k, v in tool_calls_by_id.items() - if v.get("name") != RESPONSE_SCHEMA_NAME - } - if other_tool_calls and tools and depth < MAX_RECURSION_DEPTH: - print( - f"Codex stream_structured: recursing for non-ResponseSchema tool calls " - f"depth={depth} count={len(other_tool_calls)}" - ) - new_messages = await _messages_after_tool_turn( - messages, other_tool_calls, tool_calls_handler - ) - async for chunk in CodexLLMAdapter.stream_codex_structured( - client, model, new_messages, response_format, tool_calls_handler, - strict=strict, max_tokens=max_tokens, tools=tools, depth=depth + 1, - ): - yield chunk - - @staticmethod - async def generate_codex_structured( - client: AsyncOpenAI, - model: str, - messages: List[LLMMessage], - response_format: dict, - tool_calls_handler: Any, - strict: bool = False, - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - depth: int = 0, - ) -> Optional[dict]: - """Collect stream and parse JSON (like _generate_openai_structured).""" - user_tools_count = len(tools) if tools else 0 - print( - f"Codex generate_structured: model={model} depth={depth} strict={strict} " - f"user_tools={user_tools_count} (using ResponseSchema tool for structured JSON)" - ) - accumulated: List[str] = [] - async for chunk in CodexLLMAdapter.stream_codex_structured( - client, model, messages, response_format, tool_calls_handler, - strict=strict, max_tokens=max_tokens, tools=tools, depth=depth, - ): - accumulated.append(chunk) - - raw = "".join(accumulated) - if not raw: - return None - - if depth == 0: - try: - parsed = dict(dirtyjson.loads(raw)) - print( - f"Codex generate_structured: parsed JSON keys={list(parsed.keys())[:8]}" - ) - return parsed - except Exception: - start = raw.find("{") - if start >= 0: - try: - parsed = dict(dirtyjson.loads(raw[start:])) - print( - "Codex generate_structured: parsed JSON from offset " - f"{start} keys={list(parsed.keys())[:8]}" - ) - return parsed - except Exception: - pass - raise HTTPException( - status_code=502, - detail=( - "Model did not return valid structured output (expected JSON from ResponseSchema). " - "Please retry." - ), - ) - - return None diff --git a/servers/fastapi/services/llm_client.py b/servers/fastapi/services/llm_client.py deleted file mode 100644 index a62e8098..00000000 --- a/servers/fastapi/services/llm_client.py +++ /dev/null @@ -1,2366 +0,0 @@ -import asyncio -import dirtyjson -import json -from typing import AsyncGenerator, List, Optional, Dict, Any -from fastapi import HTTPException -from openai import APIStatusError, AsyncOpenAI, OpenAIError -from openai.types.chat.chat_completion_chunk import ( - ChatCompletionChunk as OpenAIChatCompletionChunk, -) -from google import genai -from google.genai.types import Content as GoogleContent, Part as GoogleContentPart -from google.genai.types import ( - GenerateContentConfig, - GoogleSearch, - ToolConfig as GoogleToolConfig, - FunctionCallingConfig as GoogleFunctionCallingConfig, - FunctionCallingConfigMode as GoogleFunctionCallingConfigMode, -) -from google.genai.types import Tool as GoogleTool -from anthropic import AsyncAnthropic -from anthropic.types import Message as AnthropicMessage -from anthropic import MessageStreamEvent as AnthropicMessageStreamEvent -from enums.llm_provider import LLMProvider -from models.llm_message import ( - AnthropicAssistantMessage, - AnthropicUserMessage, - GoogleAssistantMessage, - GoogleToolCallMessage, - OpenAIAssistantMessage, - LLMMessage, - LLMSystemMessage, - LLMUserMessage, -) -from models.llm_tool_call import ( - AnthropicToolCall, - GoogleToolCall, - LLMToolCall, - OpenAIToolCall, - OpenAIToolCallFunction, -) -from models.llm_tools import LLMDynamicTool, LLMTool -from services.llm_tool_calls_handler import LLMToolCallsHandler -from utils.async_iterator import iterator_to_async -from utils.dummy_functions import do_nothing_async -from utils.get_env import ( - get_anthropic_api_key_env, - get_codex_access_token_env, - get_codex_account_id_env, - get_codex_refresh_token_env, - get_codex_token_expires_env, - get_custom_llm_api_key_env, - get_custom_llm_url_env, - get_disable_thinking_env, - get_google_api_key_env, - get_ollama_url_env, - get_openai_api_key_env, - get_tool_calls_env, - get_web_grounding_env, -) -from utils.set_env import ( - set_codex_access_token_env, - set_codex_account_id_env, - set_codex_refresh_token_env, - set_codex_token_expires_env, -) -from utils.llm_provider import get_llm_provider, get_model -from utils.parsers import parse_bool_or_none -from utils.schema_utils import ( - ensure_array_schemas_have_items, - ensure_strict_json_schema, - flatten_json_schema, - remove_titles_from_schema, -) - - - -class LLMClient: - def __init__(self): - self.llm_provider = get_llm_provider() - self._client = self._get_client() - self.tool_calls_handler = LLMToolCallsHandler(self) - - # ? Use tool calls - def use_tool_calls_for_structured_output(self) -> bool: - if self.llm_provider != LLMProvider.CUSTOM: - return False - return parse_bool_or_none(get_tool_calls_env()) or False - - # ? Web Grounding - def enable_web_grounding(self) -> bool: - if ( - self.llm_provider == LLMProvider.OLLAMA - or self.llm_provider == LLMProvider.CUSTOM - or self.llm_provider == LLMProvider.CODEX - ): - return False - return parse_bool_or_none(get_web_grounding_env()) or False - - # ? Disable thinking - def disable_thinking(self) -> bool: - return parse_bool_or_none(get_disable_thinking_env()) or False - - # ? Clients - def _get_client(self): - match self.llm_provider: - case LLMProvider.OPENAI: - return self._get_openai_client() - case LLMProvider.GOOGLE: - return self._get_google_client() - case LLMProvider.ANTHROPIC: - return self._get_anthropic_client() - case LLMProvider.OLLAMA: - return self._get_ollama_client() - case LLMProvider.CUSTOM: - return self._get_custom_client() - case LLMProvider.CODEX: - return self._get_codex_client() - case _: - raise HTTPException( - status_code=400, - detail="LLM Provider must be either openai, google, anthropic, ollama, custom, or codex", - ) - - def _get_openai_client(self): - if not get_openai_api_key_env(): - raise HTTPException( - status_code=400, - detail="OpenAI API Key is not set", - ) - return AsyncOpenAI() - - def _get_google_client(self): - if not get_google_api_key_env(): - raise HTTPException( - status_code=400, - detail="Google API Key is not set", - ) - return genai.Client() - - def _get_anthropic_client(self): - if not get_anthropic_api_key_env(): - raise HTTPException( - status_code=400, - detail="Anthropic API Key is not set", - ) - return AsyncAnthropic() - - def _get_ollama_client(self): - return AsyncOpenAI( - base_url=(get_ollama_url_env() or "http://localhost:11434") + "/v1", - api_key="ollama", - ) - - def _get_custom_client(self): - if not get_custom_llm_url_env(): - raise HTTPException( - status_code=400, - detail="Custom LLM URL is not set", - ) - return AsyncOpenAI( - base_url=get_custom_llm_url_env(), - api_key=get_custom_llm_api_key_env() or "null", - ) - - def _get_codex_headers(self) -> dict: - """Return the HTTP headers required for Codex Responses API requests. - - Handles token auto-refresh if the stored token is expired or within - 60 s of expiry before building the header dict. - """ - access_token = get_codex_access_token_env() - if not access_token: - raise HTTPException( - status_code=400, - detail="Codex OAuth access token is not set. Please authenticate via /api/v1/ppt/codex/auth/initiate", - ) - - # Auto-refresh if the token is expired or about to expire (within 60 s) - expires_str = get_codex_token_expires_env() - if expires_str: - try: - expires_ms = int(expires_str) - now_ms = int(__import__("time").time() * 1000) - if now_ms >= expires_ms - 60_000: - refresh_token = get_codex_refresh_token_env() - if refresh_token: - from utils.oauth.openai_codex import ( - get_account_id, - refresh_access_token, - TokenSuccess, - ) - result = refresh_access_token(refresh_token) - if isinstance(result, TokenSuccess): - set_codex_access_token_env(result.access) - set_codex_refresh_token_env(result.refresh) - set_codex_token_expires_env(str(result.expires)) - account_id = get_account_id(result.access) - if account_id: - set_codex_account_id_env(account_id) - access_token = result.access - except (ValueError, TypeError): - pass - - account_id = get_codex_account_id_env() or "" - return { - "Authorization": f"Bearer {access_token}", - "chatgpt-account-id": account_id, - "OpenAI-Beta": "responses=experimental", - "originator": "pi", - "content-type": "application/json", - "accept": "text/event-stream", - } - - def _get_codex_client(self) -> AsyncOpenAI: - """Return an AsyncOpenAI client configured for the Codex Responses API. - Client is built per call so headers/token are fresh after refresh. - Only Codex-specific headers are passed; content-type and accept are left - to the SDK so the server does not reject the request. - """ - headers = self._get_codex_headers() - access_token = (headers.get("Authorization") or "").replace("Bearer ", "").strip() - skip = {"authorization", "content-type", "accept"} - default_headers = { - k: v for k, v in headers.items() if k.lower() not in skip - } - return AsyncOpenAI( - base_url="https://chatgpt.com/backend-api/codex", - api_key=access_token or "codex", - default_headers=default_headers, - timeout=120.0, - ) - - # ? Prompts - def _get_system_prompt(self, messages: List[LLMMessage]) -> str: - for message in messages: - if isinstance(message, LLMSystemMessage): - return message.content - return "" - - def _get_google_messages(self, messages: List[LLMMessage]) -> List[GoogleContent]: - contents = [] - for message in messages: - if isinstance(message, LLMUserMessage): - contents.append( - GoogleContent( - role=message.role, - parts=[GoogleContentPart(text=message.content)], - ) - ) - elif isinstance(message, GoogleAssistantMessage): - contents.append(message.content) - elif isinstance(message, GoogleToolCallMessage): - contents.append( - GoogleContent( - role="user", - parts=[ - GoogleContentPart.from_function_response( - name=message.name, - response=message.response, - ) - ], - ) - ) - - return contents - - def _get_anthropic_messages(self, messages: List[LLMMessage]) -> List[LLMMessage]: - return [ - message for message in messages if not isinstance(message, LLMSystemMessage) - ] - - # ? Generate Unstructured Content - async def _generate_openai( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - extra_body: Optional[dict] = None, - depth: int = 0, - ) -> str | None: - client: AsyncOpenAI = self._client - response = await client.chat.completions.create( - model=model, - messages=[message.model_dump() for message in messages], - max_completion_tokens=max_tokens, - tools=tools, - extra_body=extra_body, - ) - - if len(response.choices) == 0: - return None - - tool_calls = response.choices[0].message.tool_calls - if tool_calls: - parsed_tool_calls = [ - OpenAIToolCall( - id=tool_call.id, - type=tool_call.type, - function=OpenAIToolCallFunction( - name=tool_call.function.name, - arguments=tool_call.function.arguments, - ), - ) - for tool_call in tool_calls - ] - tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai( - parsed_tool_calls - ) - assistant_message = OpenAIAssistantMessage( - role="assistant", - content=response.choices[0].message.content, - tool_calls=[tool_call.model_dump() for tool_call in parsed_tool_calls], - ) - new_messages = [ - *messages, - assistant_message, - *tool_call_messages, - ] - return await self._generate_openai( - model=model, - messages=new_messages, - max_tokens=max_tokens, - tools=tools, - extra_body=extra_body, - depth=depth + 1, - ) - - return response.choices[0].message.content - - async def _generate_google( - self, - model: str, - messages: List[LLMMessage], - tools: Optional[List[dict]] = None, - max_tokens: Optional[int] = None, - depth: int = 0, - ) -> str | None: - client: genai.Client = self._client - - google_tools = None - if tools: - google_tools = [GoogleTool(function_declarations=[tool]) for tool in tools] - - response = await asyncio.to_thread( - client.models.generate_content, - model=model, - contents=self._get_google_messages(messages), - config=GenerateContentConfig( - tools=google_tools, - system_instruction=self._get_system_prompt(messages), - response_mime_type="text/plain", - max_output_tokens=max_tokens, - ), - ) - - content = response.candidates[0].content - response_parts = content.parts - - if not response_parts: - return None - - text_content = None - tool_calls = [] - for each_part in response_parts: - if each_part.function_call: - tool_calls.append( - GoogleToolCall( - id=each_part.function_call.id, - name=each_part.function_call.name, - arguments=each_part.function_call.args, - ) - ) - if each_part.text: - text_content = each_part.text - - if tool_calls: - tool_call_messages = await self.tool_calls_handler.handle_tool_calls_google( - tool_calls - ) - new_messages = [ - *messages, - GoogleAssistantMessage( - role="assistant", - content=content, - ), - *tool_call_messages, - ] - return await self._generate_google( - model=model, - messages=new_messages, - max_tokens=max_tokens, - tools=tools, - depth=depth + 1, - ) - - return text_content - - async def _generate_anthropic( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - depth: int = 0, - ) -> str | None: - client: AsyncAnthropic = self._client - - response: AnthropicMessage = await client.messages.create( - model=model, - system=self._get_system_prompt(messages), - messages=[ - message.model_dump() - for message in self._get_anthropic_messages(messages) - ], - tools=tools, - max_tokens=max_tokens or 4000, - ) - text_content = None - tool_calls: List[AnthropicToolCall] = [] - for content in response.content: - if content.type == "text" and isinstance(content.text, str): - text_content = content.text - - if content.type == "tool_use": - tool_calls.append( - AnthropicToolCall( - id=content.id, - type=content.type, - name=content.name, - input=content.input, - ) - ) - - if tool_calls: - tool_call_messages = ( - await self.tool_calls_handler.handle_tool_calls_anthropic(tool_calls) - ) - new_messages = [ - *messages, - AnthropicAssistantMessage( - role="assistant", - content=[each.model_dump() for each in tool_calls], - ), - AnthropicUserMessage( - role="user", - content=[each.model_dump() for each in tool_call_messages], - ), - ] - return await self._generate_anthropic( - model=model, - messages=new_messages, - max_tokens=max_tokens, - tools=tools, - depth=depth + 1, - ) - - return text_content - - async def _generate_ollama( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - depth: int = 0, - ): - return await self._generate_openai( - model=model, messages=messages, max_tokens=max_tokens, depth=depth - ) - - async def _generate_custom( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - depth: int = 0, - ): - extra_body = {"enable_thinking": False} if self.disable_thinking() else None - return await self._generate_openai( - model=model, - messages=messages, - max_tokens=max_tokens, - extra_body=extra_body, - depth=depth, - ) - - async def _generate_codex( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - depth: int = 0, - ) -> Optional[str]: - """ - Generate plain text using the Codex Responses API. On tool calls, run - handlers and recurse (same pattern as _generate_openai). - """ - _MAX_RECURSION_DEPTH = 5 - client: AsyncOpenAI = self._client - - # Flatten tools to Responses API format - responses_tools: Optional[List[dict]] = None - if tools: - responses_tools = [] - for tool in tools: - fn = (tool.get("function") or tool) if isinstance(tool, dict) else {} - if isinstance(fn, dict): - responses_tools.append({ - "type": "function", - "name": fn.get("name", ""), - "description": fn.get("description", ""), - "parameters": fn.get("parameters", {}), - }) - else: - responses_tools.append(tool) - - # Build instructions + input (same shape as _stream_codex_structured) - instructions = self._get_system_prompt(messages) or None - input_payload: List[Dict[str, Any]] = [] - for m in messages: - if isinstance(m, LLMSystemMessage): - continue - if isinstance(m, LLMUserMessage): - input_payload.append({ - "role": "user", - "content": [{"type": "input_text", "text": m.content}], - }) - elif isinstance(m, OpenAIAssistantMessage): - text = m.content or "" - if text: - input_payload.append({ - "role": "assistant", - "content": [{"type": "output_text", "text": text}], - }) - else: - text = getattr(m, "content", "") or "" - if text: - input_payload.append({ - "role": "user", - "content": [{"type": "input_text", "text": text}], - }) - - create_kwargs: Dict[str, Any] = { - "model": model, - "store": False, - "stream": True, - "text": {"verbosity": "medium"}, - "include": ["reasoning.encrypted_content"], - "tool_choice": "auto", - "parallel_tool_calls": True, - } - if instructions: - create_kwargs["instructions"] = instructions - if input_payload: - create_kwargs["input"] = input_payload - if responses_tools: - create_kwargs["tools"] = responses_tools - if max_tokens is not None: - create_kwargs["max_output_tokens"] = max_tokens - - stream = await client.responses.create(**create_kwargs) - - def _event_dict(ev: Any) -> dict: - if hasattr(ev, "model_dump"): - return ev.model_dump() - return { - "type": getattr(ev, "type", None), - "delta": getattr(ev, "delta", None), - "item": getattr(ev, "item", None), - "message": getattr(ev, "message", None), - } - - text_parts: List[str] = [] - tool_calls_by_id: Dict[str, Dict[str, Any]] = {} - - async for ev in stream: - event = _event_dict(ev) if not isinstance(ev, dict) else ev - event_type = event.get("type") or "" - - if event_type == "response.output_text.delta": - delta = event.get("delta") or "" - if delta: - text_parts.append(delta) - elif event_type == "response.output_item.done": - item = event.get("item") or {} - if item.get("type") == "function_call": - cid = item.get("call_id") or item.get("id", "") - tool_calls_by_id[cid] = item - elif event_type in ("response.error", "response.failed", "error"): - err = event.get("message") or event.get("error") or str(event) - raise HTTPException(status_code=502, detail=f"Codex error: {err}"[:400]) - - if tool_calls_by_id and responses_tools and depth < _MAX_RECURSION_DEPTH: - parsed_tool_calls = [ - OpenAIToolCall( - id=cid, - type="function", - function=OpenAIToolCallFunction( - name=data.get("name", ""), - arguments=data.get("arguments", ""), - ), - ) - for cid, data in tool_calls_by_id.items() - ] - tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai( - parsed_tool_calls - ) - new_messages = [ - *messages, - OpenAIAssistantMessage( - role="assistant", - content=None, - tool_calls=[tc.model_dump() for tc in parsed_tool_calls], - ), - *tool_call_messages, - ] - return await self._generate_codex( - model=model, - messages=new_messages, - max_tokens=max_tokens, - tools=tools, - depth=depth + 1, - ) - - return "".join(text_parts) or None - - async def generate( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - tools: Optional[List[type[LLMTool] | LLMDynamicTool]] = None, - ): - parsed_tools = self.tool_calls_handler.parse_tools(tools) - - content = None - match self.llm_provider: - case LLMProvider.OPENAI: - content = await self._generate_openai( - model=model, - messages=messages, - max_tokens=max_tokens, - tools=parsed_tools, - ) - case LLMProvider.CODEX: - content = await self._generate_codex( - model=model, - messages=messages, - max_tokens=max_tokens, - tools=parsed_tools, - ) - case LLMProvider.GOOGLE: - content = await self._generate_google( - model=model, - messages=messages, - max_tokens=max_tokens, - tools=parsed_tools, - ) - case LLMProvider.ANTHROPIC: - content = await self._generate_anthropic( - model=model, - messages=messages, - max_tokens=max_tokens, - tools=parsed_tools, - ) - case LLMProvider.OLLAMA: - content = await self._generate_ollama( - model=model, messages=messages, max_tokens=max_tokens - ) - case LLMProvider.CUSTOM: - content = await self._generate_custom( - model=model, messages=messages, max_tokens=max_tokens - ) - if content is None: - raise HTTPException( - status_code=400, - detail="LLM did not return any content", - ) - return content - - # ? Generate Structured Content - async def _generate_openai_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - strict: bool = False, - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - extra_body: Optional[dict] = None, - depth: int = 0, - ) -> dict | None: - client: AsyncOpenAI = self._client - response_schema = response_format - all_tools = [*tools] if tools else None - - use_tool_calls_for_structured_output = ( - self.use_tool_calls_for_structured_output() - ) - if strict and depth == 0: - response_schema = ensure_strict_json_schema( - response_schema, - path=(), - root=response_schema, - ) - response_schema = ensure_array_schemas_have_items(response_schema) - if use_tool_calls_for_structured_output and depth == 0: - if all_tools is None: - all_tools = [] - all_tools.append( - self.tool_calls_handler.parse_tool( - LLMDynamicTool( - name="ResponseSchema", - description="Provide response to the user", - parameters=response_schema, - handler=do_nothing_async, - ), - strict=strict, - ) - ) - - response = await client.chat.completions.create( - model=model, - messages=[message.model_dump() for message in messages], - response_format=( - { - "type": "json_schema", - "json_schema": ( - { - "name": "ResponseSchema", - "strict": strict, - "schema": response_schema, - } - ), - } - if not use_tool_calls_for_structured_output - else None - ), - max_completion_tokens=max_tokens, - tools=all_tools, - extra_body=extra_body, - ) - - if len(response.choices) == 0: - return None - - content = response.choices[0].message.content - - tool_calls = response.choices[0].message.tool_calls - has_response_schema = False - - if tool_calls: - for tool_call in tool_calls: - if tool_call.function.name == "ResponseSchema": - content = tool_call.function.arguments - has_response_schema = True - - if not has_response_schema: - parsed_tool_calls = [ - OpenAIToolCall( - id=tool_call.id, - type=tool_call.type, - function=OpenAIToolCallFunction( - name=tool_call.function.name, - arguments=tool_call.function.arguments, - ), - ) - for tool_call in tool_calls - ] - tool_call_messages = ( - await self.tool_calls_handler.handle_tool_calls_openai( - parsed_tool_calls - ) - ) - new_messages = [ - *messages, - OpenAIAssistantMessage( - role="assistant", - content=response.choices[0].message.content, - tool_calls=[each.model_dump() for each in parsed_tool_calls], - ), - *tool_call_messages, - ] - content = await self._generate_openai_structured( - model=model, - messages=new_messages, - response_format=response_schema, - strict=strict, - max_tokens=max_tokens, - tools=all_tools, - extra_body=extra_body, - depth=depth + 1, - ) - if content: - if depth == 0: - return dict(dirtyjson.loads(content)) - return content - return None - - async def _generate_codex_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - strict: bool = False, - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - extra_body: Optional[dict] = None, - depth: int = 0, - ) -> dict | None: - """ - Generate structured Codex output using the Responses API. - - This reuses the streaming Codex structured implementation and simply - accumulates the streamed JSON chunks into a single string, then parses - it at the root call. - """ - # Reuse the Responses API streaming implementation for Codex. - accumulated: List[str] = [] - async for chunk in self._stream_codex_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - max_tokens=max_tokens, - tools=tools, - extra_body=extra_body, - depth=depth, - ): - accumulated.append(chunk) - - raw = "".join(accumulated) - if not raw: - return None - - # At the root level we parse into a dict; recursive calls just - # propagate the raw JSON/text, mirroring other providers. - if depth == 0: - return dict(dirtyjson.loads(raw)) - return {"raw": raw} - - async def _generate_google_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - depth: int = 0, - ) -> dict | None: - client: genai.Client = self._client - - google_tools = None - if tools: - google_tools = [GoogleTool(function_declarations=[tool]) for tool in tools] - google_tools.append( - GoogleTool( - function_declarations=[ - { - "name": "ResponseSchema", - "description": "Provide response to the user", - "parameters": remove_titles_from_schema( - flatten_json_schema(response_format) - ), - } - ] - ) - ) - - response = await asyncio.to_thread( - client.models.generate_content, - model=model, - contents=self._get_google_messages(messages), - config=GenerateContentConfig( - tools=google_tools, - tool_config=( - GoogleToolConfig( - function_calling_config=GoogleFunctionCallingConfig( - mode=GoogleFunctionCallingConfigMode.ANY, - ), - ) - if tools - else None - ), - system_instruction=self._get_system_prompt(messages), - response_mime_type="application/json" if not tools else None, - response_json_schema=response_format if not tools else None, - max_output_tokens=max_tokens, - ), - ) - - content = response.candidates[0].content - response_parts = content.parts - text_content = None - - if not response_parts: - return None - - tool_calls: List[GoogleToolCall] = [] - for each_part in response_parts: - if each_part.function_call: - tool_calls.append( - GoogleToolCall( - id=each_part.function_call.id, - name=each_part.function_call.name, - arguments=each_part.function_call.args, - ) - ) - - if each_part.text: - text_content = each_part.text - - for each in tool_calls: - if each.name == "ResponseSchema": - return each.arguments - - if tool_calls: - tool_call_messages = await self.tool_calls_handler.handle_tool_calls_google( - tool_calls - ) - new_messages = [ - *messages, - GoogleAssistantMessage( - role="assistant", - content=content, - ), - *tool_call_messages, - ] - return await self._generate_google_structured( - model=model, - messages=new_messages, - max_tokens=max_tokens, - response_format=response_format, - tools=tools, - depth=depth + 1, - ) - - if text_content: - return dict(dirtyjson.loads(text_content)) - return None - - async def _generate_anthropic_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - tools: Optional[List[dict]] = None, - max_tokens: Optional[int] = None, - depth: int = 0, - ): - client: AsyncAnthropic = self._client - response: AnthropicMessage = await client.messages.create( - model=model, - system=self._get_system_prompt(messages), - messages=[ - message.model_dump() - for message in self._get_anthropic_messages(messages) - ], - max_tokens=max_tokens or 4000, - tools=[ - { - "name": "ResponseSchema", - "description": "A response to the user's message", - "input_schema": response_format, - }, - *(tools or []), - ], - ) - tool_calls: List[AnthropicToolCall] = [] - text_parts: List[str] = [] - for content in response.content: - if content.type == "text" and isinstance(content.text, str): - text_parts.append(content.text) - if content.type == "tool_use": - tool_calls.append( - AnthropicToolCall( - id=content.id, - type=content.type, - name=content.name, - input=content.input, - ) - ) - - for each in tool_calls: - if each.name == "ResponseSchema": - return each.input - - if tool_calls: - tool_call_messages = ( - await self.tool_calls_handler.handle_tool_calls_anthropic(tool_calls) - ) - new_messages = [ - *messages, - AnthropicAssistantMessage( - role="assistant", - content=[each.model_dump() for each in tool_calls], - ), - AnthropicUserMessage( - role="user", - content=[each.model_dump() for each in tool_call_messages], - ), - ] - return await self._generate_anthropic_structured( - model=model, - messages=new_messages, - max_tokens=max_tokens, - response_format=response_format, - tools=tools, - depth=depth + 1, - ) - - text_content = "".join(text_parts).strip() - if text_content: - try: - return dict(dirtyjson.loads(text_content)) - except Exception: - pass - - if depth < 2: - await asyncio.sleep(0.4 * (depth + 1)) - return await self._generate_anthropic_structured( - model=model, - messages=messages, - max_tokens=max_tokens, - response_format=response_format, - tools=tools, - depth=depth + 1, - ) - - return None - - async def _generate_ollama_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - strict: bool = False, - max_tokens: Optional[int] = None, - depth: int = 0, - ): - return await self._generate_openai_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - max_tokens=max_tokens, - depth=depth, - ) - - async def _generate_custom_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - strict: bool = False, - max_tokens: Optional[int] = None, - depth: int = 0, - ): - extra_body = {"enable_thinking": False} if self.disable_thinking() else None - return await self._generate_openai_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - max_tokens=max_tokens, - extra_body=extra_body, - depth=depth, - ) - - async def generate_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - strict: bool = False, - tools: Optional[List[type[LLMTool] | LLMDynamicTool]] = None, - max_tokens: Optional[int] = None, - ) -> dict: - parsed_tools = self.tool_calls_handler.parse_tools(tools) - - for attempt in range(3): - content = None - match self.llm_provider: - case LLMProvider.OPENAI: - content = await self._generate_openai_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - tools=parsed_tools, - max_tokens=max_tokens, - ) - case LLMProvider.CODEX: - content = await self._generate_codex_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - tools=parsed_tools, - max_tokens=max_tokens, - ) - case LLMProvider.GOOGLE: - content = await self._generate_google_structured( - model=model, - messages=messages, - response_format=response_format, - tools=parsed_tools, - max_tokens=max_tokens, - ) - case LLMProvider.ANTHROPIC: - content = await self._generate_anthropic_structured( - model=model, - messages=messages, - response_format=response_format, - tools=parsed_tools, - max_tokens=max_tokens, - ) - case LLMProvider.OLLAMA: - content = await self._generate_ollama_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - max_tokens=max_tokens, - ) - case LLMProvider.CUSTOM: - content = await self._generate_custom_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - max_tokens=max_tokens, - ) - - if content is not None: - return content - - if attempt < 2: - await asyncio.sleep(0.5 * (attempt + 1)) - - raise HTTPException( - status_code=400, - detail="LLM did not return any content", - ) - - # ? Stream Unstructured Content - async def _stream_openai( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - extra_body: Optional[dict] = None, - depth: int = 0, - ) -> AsyncGenerator[str, None]: - client: AsyncOpenAI = self._client - - tool_calls: List[LLMToolCall] = [] - current_index = 0 - current_id = None - current_name = None - current_arguments = None - async for event in await client.chat.completions.create( - model=model, - messages=[message.model_dump() for message in messages], - max_completion_tokens=max_tokens, - tools=tools, - extra_body=extra_body, - stream=True, - ): - event: OpenAIChatCompletionChunk = event - if not event.choices: - continue - - content_chunk = event.choices[0].delta.content - if content_chunk: - yield content_chunk - - tool_call_chunk = event.choices[0].delta.tool_calls - if tool_call_chunk: - tool_index = tool_call_chunk[0].index - tool_id = tool_call_chunk[0].id - tool_name = tool_call_chunk[0].function.name - tool_arguments = tool_call_chunk[0].function.arguments - - if current_index != tool_index: - tool_calls.append( - OpenAIToolCall( - id=current_id, - type="function", - function=OpenAIToolCallFunction( - name=current_name, - arguments=current_arguments, - ), - ) - ) - current_index = tool_index - current_id = tool_id - current_name = tool_name - current_arguments = tool_arguments - else: - current_name = tool_name or current_name - current_id = tool_id or current_id - if current_arguments is None: - current_arguments = tool_arguments - elif tool_arguments: - current_arguments += tool_arguments - - if current_id is not None: - tool_calls.append( - OpenAIToolCall( - id=current_id, - type="function", - function=OpenAIToolCallFunction( - name=current_name, - arguments=current_arguments, - ), - ) - ) - - if tool_calls: - tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai( - tool_calls - ) - new_messages = [ - *messages, - OpenAIAssistantMessage( - role="assistant", - content=None, - tool_calls=[each.model_dump() for each in tool_calls], - ), - *tool_call_messages, - ] - async for event in self._stream_openai( - model=model, - messages=new_messages, - max_tokens=max_tokens, - tools=tools, - extra_body=extra_body, - depth=depth + 1, - ): - yield event - - async def _stream_google( - self, - model: str, - messages: List[LLMMessage], - tools: Optional[List[dict]] = None, - max_tokens: Optional[int] = None, - depth: int = 0, - ) -> AsyncGenerator[str, None]: - client: genai.Client = self._client - - google_tools = None - if tools: - google_tools = [GoogleTool(function_declarations=[tool]) for tool in tools] - - generated_contents = [] - tool_calls: List[GoogleToolCall] = [] - async for event in iterator_to_async(client.models.generate_content_stream)( - model=model, - contents=self._get_google_messages(messages), - config=GenerateContentConfig( - system_instruction=self._get_system_prompt(messages), - response_mime_type="text/plain", - tools=google_tools, - max_output_tokens=max_tokens, - ), - ): - if not ( - event.candidates - and event.candidates[0].content - and event.candidates[0].content.parts - ): - continue - - generated_contents.append(event.candidates[0].content) - - for each_part in event.candidates[0].content.parts: - if each_part.text: - yield each_part.text - - if each_part.function_call: - tool_calls.append( - GoogleToolCall( - id=each_part.function_call.id, - name=each_part.function_call.name, - arguments=each_part.function_call.args, - ) - ) - - if tool_calls: - tool_call_messages = await self.tool_calls_handler.handle_tool_calls_google( - tool_calls - ) - new_messages = [ - *messages, - *[ - GoogleAssistantMessage( - role="assistant", - content=each, - ) - for each in generated_contents - ], - *tool_call_messages, - ] - async for event in self._stream_google( - model=model, - messages=new_messages, - max_tokens=max_tokens, - tools=tools, - depth=depth + 1, - ): - yield event - - async def _stream_anthropic( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - depth: int = 0, - ): - client: AsyncAnthropic = self._client - - tool_calls: List[AnthropicToolCall] = [] - async with client.messages.stream( - model=model, - system=self._get_system_prompt(messages), - messages=[ - message.model_dump() - for message in self._get_anthropic_messages(messages) - ], - max_tokens=max_tokens or 4000, - tools=tools, - ) as stream: - async for event in stream: - event: AnthropicMessageStreamEvent = event - - if event.type == "text": - yield event.text - - if ( - event.type == "content_block_stop" - and event.content_block.type == "tool_use" - ): - tool_calls.append( - AnthropicToolCall( - id=event.content_block.id, - type=event.content_block.type, - name=event.content_block.name, - input=event.content_block.input, - ) - ) - - if tool_calls: - tool_call_messages = ( - await self.tool_calls_handler.handle_tool_calls_anthropic(tool_calls) - ) - new_messages = [ - *messages, - AnthropicAssistantMessage( - role="assistant", - content=[each.model_dump() for each in tool_calls], - ), - AnthropicUserMessage( - role="user", - content=[each.model_dump() for each in tool_call_messages], - ), - ] - async for event in self._stream_anthropic( - model=model, - messages=new_messages, - max_tokens=max_tokens, - tools=tools, - depth=depth + 1, - ): - yield event - - async def _stream_codex( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - depth: int = 0, - ) -> AsyncGenerator[str, None]: - """ - Stream plain text from Codex (Responses API). On tool calls, execute tools - and recurse, mirroring _stream_openai but using Responses events. - """ - _MAX_RECURSION_DEPTH = 5 - client: AsyncOpenAI = ( - self._get_codex_client() - if self.llm_provider == LLMProvider.CODEX - else self._client - ) - - # Flatten tools to Responses API format - responses_tools: Optional[List[dict]] = None - if tools: - responses_tools = [] - for tool in tools: - fn = (tool.get("function") or tool) if isinstance(tool, dict) else {} - if isinstance(fn, dict): - responses_tools.append( - { - "type": "function", - "name": fn.get("name", ""), - "description": fn.get("description", ""), - "parameters": fn.get("parameters", {}), - } - ) - else: - responses_tools.append(tool) - - # Build instructions + input (same shape as _generate_codex/_stream_codex_structured) - instructions = self._get_system_prompt(messages) or None - input_payload: List[Dict[str, Any]] = [] - for m in messages: - if isinstance(m, LLMSystemMessage): - continue - if isinstance(m, LLMUserMessage): - input_payload.append( - { - "role": "user", - "content": [{"type": "input_text", "text": m.content}], - } - ) - elif isinstance(m, OpenAIAssistantMessage): - text = m.content or "" - if text: - input_payload.append( - { - "role": "assistant", - "content": [{"type": "output_text", "text": text}], - } - ) - else: - text = getattr(m, "content", "") or "" - if text: - input_payload.append( - { - "role": "user", - "content": [{"type": "input_text", "text": text}], - } - ) - - create_kwargs: Dict[str, Any] = { - "model": model, - "store": False, - "stream": True, - "text": {"verbosity": "medium"}, - "include": ["reasoning.encrypted_content"], - "tool_choice": "auto", - "parallel_tool_calls": True, - } - if instructions: - create_kwargs["instructions"] = instructions - if input_payload: - create_kwargs["input"] = input_payload - if responses_tools: - create_kwargs["tools"] = responses_tools - if max_tokens is not None: - create_kwargs["max_output_tokens"] = max_tokens - - stream = await client.responses.create(**create_kwargs) - - def _event_dict(ev: Any) -> dict: - if hasattr(ev, "model_dump"): - return ev.model_dump() - return { - "type": getattr(ev, "type", None), - "delta": getattr(ev, "delta", None), - "item": getattr(ev, "item", None), - "message": getattr(ev, "message", None), - } - - tool_calls_by_id: Dict[str, Dict[str, Any]] = {} - - async for ev in stream: - event = _event_dict(ev) if not isinstance(ev, dict) else ev - event_type = event.get("type") or "" - - if event_type == "response.output_text.delta": - delta = event.get("delta") or "" - if delta: - yield delta - elif event_type == "response.output_item.done": - item = event.get("item") or {} - if item.get("type") == "function_call": - cid = item.get("call_id") or item.get("id", "") - tool_calls_by_id[cid] = item - elif event_type in ("response.error", "response.failed", "error"): - err = event.get("message") or event.get("error") or str(event) - raise HTTPException(status_code=502, detail=f"Codex stream error: {err}"[:400]) - - if tool_calls_by_id and responses_tools and depth < _MAX_RECURSION_DEPTH: - parsed_tool_calls = [ - OpenAIToolCall( - id=cid, - type="function", - function=OpenAIToolCallFunction( - name=data.get("name", ""), - arguments=data.get("arguments", ""), - ), - ) - for cid, data in tool_calls_by_id.items() - ] - tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai( - parsed_tool_calls - ) - new_messages = [ - *messages, - OpenAIAssistantMessage( - role="assistant", - content=None, - tool_calls=[tc.model_dump() for tc in parsed_tool_calls], - ), - *tool_call_messages, - ] - async for chunk in self._stream_codex( - model=model, - messages=new_messages, - max_tokens=max_tokens, - tools=tools, - depth=depth + 1, - ): - yield chunk - - def _stream_ollama( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - depth: int = 0, - ): - return self._stream_openai( - model=model, messages=messages, max_tokens=max_tokens, depth=depth - ) - - def _stream_custom( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - depth: int = 0, - ): - extra_body = {"enable_thinking": False} if self.disable_thinking() else None - return self._stream_openai( - model=model, - messages=messages, - max_tokens=max_tokens, - extra_body=extra_body, - depth=depth, - ) - - def stream( - self, - model: str, - messages: List[LLMMessage], - max_tokens: Optional[int] = None, - tools: Optional[List[type[LLMTool] | LLMDynamicTool]] = None, - ): - parsed_tools = self.tool_calls_handler.parse_tools(tools) - - match self.llm_provider: - case LLMProvider.OPENAI: - return self._stream_openai( - model=model, - messages=messages, - max_tokens=max_tokens, - tools=parsed_tools, - ) - case LLMProvider.CODEX: - return self._stream_codex( - model=model, - messages=messages, - max_tokens=max_tokens, - tools=parsed_tools, - ) - case LLMProvider.GOOGLE: - return self._stream_google( - model=model, - messages=messages, - max_tokens=max_tokens, - tools=parsed_tools, - ) - case LLMProvider.ANTHROPIC: - return self._stream_anthropic( - model=model, - messages=messages, - max_tokens=max_tokens, - tools=parsed_tools, - ) - case LLMProvider.OLLAMA: - return self._stream_ollama( - model=model, messages=messages, max_tokens=max_tokens - ) - case LLMProvider.CUSTOM: - return self._stream_custom( - model=model, messages=messages, max_tokens=max_tokens - ) - - # ? Stream Structured Content - async def _stream_openai_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - strict: bool = False, - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - extra_body: Optional[dict] = None, - depth: int = 0, - ) -> AsyncGenerator[str, None]: - client: AsyncOpenAI = self._client - - response_schema = response_format - all_tools = [*tools] if tools else None - - use_tool_calls_for_structured_output = ( - self.use_tool_calls_for_structured_output() - ) - if strict and depth == 0: - response_schema = ensure_strict_json_schema( - response_schema, - path=(), - root=response_schema, - ) - response_schema = ensure_array_schemas_have_items(response_schema) - - if use_tool_calls_for_structured_output and depth == 0: - if all_tools is None: - all_tools = [] - all_tools.append( - self.tool_calls_handler.parse_tool( - LLMDynamicTool( - name="ResponseSchema", - description="Provide response to the user", - parameters=response_schema, - handler=do_nothing_async, - ), - strict=strict, - ) - ) - - tool_calls: List[LLMToolCall] = [] - current_index = 0 - current_id = None - current_name = None - current_arguments = None - - has_response_schema_tool_call = False - async for event in await client.chat.completions.create( - model=model, - messages=[message.model_dump() for message in messages], - max_completion_tokens=max_tokens, - tools=all_tools, - response_format=( - { - "type": "json_schema", - "json_schema": ( - { - "name": "ResponseSchema", - "strict": strict, - "schema": response_schema, - } - ), - } - if not use_tool_calls_for_structured_output - else None - ), - extra_body=extra_body, - stream=True, - ): - event: OpenAIChatCompletionChunk = event - if not event.choices: - continue - - content_chunk = event.choices[0].delta.content - if content_chunk and not use_tool_calls_for_structured_output: - yield content_chunk - - tool_call_chunk = event.choices[0].delta.tool_calls - if tool_call_chunk: - tool_index = tool_call_chunk[0].index - tool_id = tool_call_chunk[0].id - tool_name = tool_call_chunk[0].function.name - tool_arguments = tool_call_chunk[0].function.arguments - - if current_index != tool_index: - tool_calls.append( - OpenAIToolCall( - id=current_id, - type="function", - function=OpenAIToolCallFunction( - name=current_name, - arguments=current_arguments, - ), - ) - ) - current_index = tool_index - current_id = tool_id - current_name = tool_name - current_arguments = tool_arguments - else: - current_name = tool_name or current_name - current_id = tool_id or current_id - if current_arguments is None: - current_arguments = tool_arguments - elif tool_arguments: - current_arguments += tool_arguments - - if current_name == "ResponseSchema": - if tool_arguments: - yield tool_arguments - has_response_schema_tool_call = True - - if current_id is not None: - tool_calls.append( - OpenAIToolCall( - id=current_id, - type="function", - function=OpenAIToolCallFunction( - name=current_name, - arguments=current_arguments, - ), - ) - ) - - if tool_calls and not has_response_schema_tool_call: - tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai( - tool_calls - ) - new_messages = [ - *messages, - OpenAIAssistantMessage( - role="assistant", - content=None, - tool_calls=[each.model_dump() for each in tool_calls], - ), - *tool_call_messages, - ] - async for event in self._stream_openai_structured( - model=model, - messages=new_messages, - max_tokens=max_tokens, - strict=strict, - tools=all_tools, - response_format=response_schema, - extra_body=extra_body, - depth=depth + 1, - ): - yield event - - - - async def _stream_codex_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - strict: bool = False, - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - depth: int = 0, - extra_body: Optional[dict] = None, - ) -> AsyncGenerator[str, None]: - """ - Stream structured responses using OpenAI's Responses API (Codex-style models). - - This implementation is intentionally separate from ChatCompletion-based streaming - because the Responses API uses a fundamentally different event model. - - Why this function exists: - - 1. The Responses API does NOT return `choices[].delta` like ChatCompletions. - Instead, it streams typed events such as: - - response.output_text.delta - - response.output_tool_call.delta - - response.completed - - response.error - - 2. Structured output can be achieved in two ways: - a) Native JSON schema enforcement via `response_format` - b) Tool-call-based structured output using a synthetic `ResponseSchema` tool - - This function supports both approaches. When tool-call structured mode is enabled, - a dynamic `ResponseSchema` tool is injected so the model returns structured data - as tool call arguments. - - 3. Tool calls must be accumulated incrementally. - The Responses API streams tool call arguments in chunks (`arguments_delta`), - so we reconstruct the full argument payload before executing the tool. - - 4. Recursive tool execution is supported. - If the model calls external tools (e.g., web search), we: - - Execute the tools asynchronously - - Append tool results as new messages - - Reinvoke the model recursively - This enables multi-step reasoning and grounding workflows. - - 5. Provider abstraction is preserved. - The Responses API event format is converted into our internal tool-call model - before being passed to the tool handler layer. This prevents SDK-specific - structures from leaking into business logic. - - 6. Strict schema enforcement (optional). - When `strict=True`, the provided JSON schema is hardened before being sent - to the model to reduce malformed outputs. - - Important architectural note: - This function MUST NOT assume ChatCompletion-style streaming fields like - `choices`, `delta.content`, or `delta.tool_calls`. It strictly follows the - Responses API event model. - - This separation ensures: - - Future compatibility with GPT-5 / Codex models - - Clean provider abstraction - - Streaming-safe structured JSON assembly - - Robust multi-tool recursive execution - """ - client: AsyncOpenAI = self._client - response_schema = response_format - # Apply strict schema once at root (includes array "items" fix at lines 135–155). - if strict and depth == 0: - response_schema = ensure_strict_json_schema( - response_schema, - path=(), - root=response_schema, - ) - # When we didn't run ensure_strict_json_schema, fix arrays for Codex API (strict=False or depth > 0). - else: - response_schema = ensure_array_schemas_have_items(response_schema) - - # Responses API tool format: flat {type, name, description, parameters} - response_schema_tool = { - "type": "function", - "name": "ResponseSchema", - "description": "Provide structured response", - "parameters": response_schema, - } - all_tools: List[dict] = [response_schema_tool] - if tools: - for tool in tools: - fn = (tool.get("function") or tool) if isinstance(tool, dict) else {} - if isinstance(fn, dict): - all_tools.append({ - "type": "function", - "name": fn.get("name", ""), - "description": fn.get("description", ""), - "parameters": fn.get("parameters", {}), - }) - else: - all_tools.append(tool) - - # Build instructions + input like Codex adapter (instructions from system; input_text/output_text) - instructions = self._get_system_prompt(messages) or None - input_payload: List[Dict[str, Any]] = [] - for m in messages: - if isinstance(m, LLMSystemMessage): - continue - if isinstance(m, LLMUserMessage): - input_payload.append({ - "role": "user", - "content": [{"type": "input_text", "text": m.content}], - }) - elif isinstance(m, OpenAIAssistantMessage): - text = m.content or "" - if text: - input_payload.append({ - "role": "assistant", - "content": [{"type": "output_text", "text": text}], - }) - else: - text = getattr(m, "content", "") or "" - if text: - input_payload.append({ - "role": "user", - "content": [{"type": "input_text", "text": text}], - }) - - # Force model to use ResponseSchema for structured output - tool_choice = {"type": "function", "name": "ResponseSchema"} - create_kwargs: Dict[str, Any] = { - "model": model, - "store": False, - "stream": True, - "text": {"verbosity": "medium"}, - "include": ["reasoning.encrypted_content"], - "tool_choice": tool_choice, - "parallel_tool_calls": True, - "tools": all_tools, - } - if instructions: - create_kwargs["instructions"] = instructions - if input_payload: - create_kwargs["input"] = input_payload - if max_tokens is not None: - create_kwargs["max_output_tokens"] = max_tokens - if extra_body: - create_kwargs.update(extra_body) - - stream = await client.responses.create(**create_kwargs) - - - def _event_dict(ev: Any) -> dict: - if hasattr(ev, "model_dump"): - return ev.model_dump() - return { - "type": getattr(ev, "type", None), - "delta": getattr(ev, "delta", None), - "arguments": getattr(ev, "arguments", None), - "arguments_delta": getattr(ev, "arguments_delta", None), - "item": getattr(ev, "item", None), - "id": getattr(ev, "id", None), - "name": getattr(ev, "name", None), - "error": getattr(ev, "error", None), - "message": getattr(ev, "message", None), - } - - tool_calls_by_id: Dict[str, Dict[str, Any]] = {} - current_call_id: Optional[str] = None - has_response_schema_tool_call = False - - async for ev in stream: - event = _event_dict(ev) if not isinstance(ev, dict) else ev - event_type = event.get("type") or "" - - if event_type == "response.output_item.added": - item = event.get("item") or {} - if item.get("type") == "function_call" and item.get("name") == "ResponseSchema": - current_call_id = item.get("call_id") or item.get("id") - - elif event_type == "response.function_call_arguments.delta": - if current_call_id: - delta = event.get("delta") or "" - if delta: - has_response_schema_tool_call = True - yield delta - - elif event_type == "response.function_call_arguments.done": - if event.get("name") == "ResponseSchema": - args = event.get("arguments") or "" - if args: - has_response_schema_tool_call = True - yield args - - elif event_type == "response.output_item.done": - item = event.get("item") or {} - if item.get("type") == "function_call": - cid = item.get("call_id") or item.get("id", "") - tool_calls_by_id[cid] = item - if item.get("name") == "ResponseSchema": - args = item.get("arguments") or "" - if args: - has_response_schema_tool_call = True - yield args - - elif event_type == "response.output_tool_call.delta": - call_id = event.get("id") - name = event.get("name") - arguments_delta = event.get("arguments_delta") or "" - if call_id and name: - if call_id not in tool_calls_by_id: - tool_calls_by_id[call_id] = {"name": name, "arguments": ""} - tool_calls_by_id[call_id]["arguments"] += arguments_delta - if name == "ResponseSchema" and arguments_delta: - has_response_schema_tool_call = True - yield arguments_delta - - elif event_type == "response.completed": - break - - elif event_type in ("response.error", "response.failed", "error"): - err = event.get("error") or event.get("message") or str(event) - raise RuntimeError(err) - - # ============================================ - # EXECUTE NON-STRUCTURED TOOL CALLS (RECURSIVE) - # ============================================ - - other_tool_calls = { - cid: data - for cid, data in tool_calls_by_id.items() - if data.get("name") != "ResponseSchema" - } - if other_tool_calls and not has_response_schema_tool_call: - parsed_tool_calls = [] - for call_id, data in other_tool_calls.items(): - args = data.get("arguments", "") if isinstance(data, dict) else "" - parsed_tool_calls.append( - OpenAIToolCall( - id=call_id, - type="function", - function=OpenAIToolCallFunction( - name=data.get("name", ""), - arguments=args, - ), - ) - ) - - tool_call_messages = await self.tool_calls_handler.handle_tool_calls_openai( - parsed_tool_calls - ) - - new_messages = [ - *messages, - OpenAIAssistantMessage( - role="assistant", - content=None, - tool_calls=[tc.model_dump() for tc in parsed_tool_calls], - ), - *tool_call_messages, - ] - - async for chunk in self._stream_codex_structured( - model=model, - messages=new_messages, - response_format=response_schema, - strict=strict, - max_tokens=max_tokens, - tools=tools, - extra_body=extra_body, - depth=depth + 1, - ): - yield chunk - - async def _stream_google_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - max_tokens: Optional[int] = None, - tools: Optional[List[dict]] = None, - depth: int = 0, - ) -> AsyncGenerator[str, None]: - - client: genai.Client = self._client - - google_tools = None - if tools: - google_tools = [GoogleTool(function_declarations=[tool]) for tool in tools] - google_tools.append( - GoogleTool( - function_declarations=[ - { - "name": "ResponseSchema", - "description": "Provide response to the user", - "parameters": remove_titles_from_schema( - flatten_json_schema(response_format) - ), - } - ] - ) - ) - - parsed_messages = self._get_google_messages(messages) - - generated_contents = [] - tool_calls: List[GoogleToolCall] = [] - has_response_schema_tool_call = False - async for event in iterator_to_async(client.models.generate_content_stream)( - model=model, - contents=parsed_messages, - config=GenerateContentConfig( - tools=google_tools, - tool_config=( - GoogleToolConfig( - function_calling_config=GoogleFunctionCallingConfig( - mode=GoogleFunctionCallingConfigMode.ANY, - ), - ) - if tools - else None - ), - system_instruction=self._get_system_prompt(messages), - response_mime_type="application/json" if not tools else None, - response_json_schema=response_format if not tools else None, - max_output_tokens=max_tokens, - ), - ): - if not ( - event.candidates - and event.candidates[0].content - and event.candidates[0].content.parts - ): - continue - - generated_contents.append(event.candidates[0].content) - - for each_part in event.candidates[0].content.parts: - if each_part.text and not google_tools: - yield each_part.text - - if each_part.function_call: - if each_part.function_call.name == "ResponseSchema": - has_response_schema_tool_call = True - if each_part.function_call.args: - yield json.dumps(each_part.function_call.args) - - tool_calls.append( - GoogleToolCall( - id=each_part.function_call.id, - name=each_part.function_call.name, - arguments=each_part.function_call.args, - ) - ) - - if tool_calls and not has_response_schema_tool_call: - tool_call_messages = await self.tool_calls_handler.handle_tool_calls_google( - tool_calls - ) - new_messages = [ - *messages, - *[ - GoogleAssistantMessage( - role="assistant", - content=each, - ) - for each in generated_contents - ], - *tool_call_messages, - ] - async for event in self._stream_google_structured( - model=model, - messages=new_messages, - max_tokens=max_tokens, - response_format=response_format, - tools=tools, - depth=depth + 1, - ): - yield event - - async def _stream_anthropic_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - tools: Optional[List[dict]] = None, - max_tokens: Optional[int] = None, - depth: int = 0, - ) -> AsyncGenerator[str, None]: - client: AsyncAnthropic = self._client - - tool_calls: List[AnthropicToolCall] = [] - has_response_schema_tool_call = False - async with client.messages.stream( - model=model, - system=self._get_system_prompt(messages), - messages=[ - message.model_dump() - for message in self._get_anthropic_messages(messages) - ], - max_tokens=max_tokens or 4000, - tools=[ - { - "name": "ResponseSchema", - "description": "A response to the user's message", - "input_schema": response_format, - }, - *(tools or []), - ], - ) as stream: - is_response_schema_tool_call_started = False - async for event in stream: - event: AnthropicMessageStreamEvent = event - - if ( - event.type == "content_block_start" - and event.content_block.type == "tool_use" - ): - if event.content_block.name == "ResponseSchema": - has_response_schema_tool_call = True - is_response_schema_tool_call_started = True - - if ( - event.type == "content_block_delta" - and event.delta.type == "input_json_delta" - and is_response_schema_tool_call_started - ): - yield event.delta.partial_json - - if ( - event.type == "content_block_stop" - and event.content_block.type == "tool_use" - ): - tool_calls.append( - AnthropicToolCall( - id=event.content_block.id, - type=event.content_block.type, - name=event.content_block.name, - input=event.content_block.input, - ) - ) - - if tool_calls and not has_response_schema_tool_call: - tool_call_messages = ( - await self.tool_calls_handler.handle_tool_calls_anthropic(tool_calls) - ) - new_messages = [ - *messages, - AnthropicAssistantMessage( - role="assistant", - content=[each.model_dump() for each in tool_calls], - ), - AnthropicUserMessage( - role="user", - content=[each.model_dump() for each in tool_call_messages], - ), - ] - async for event in self._stream_anthropic_structured( - model=model, - messages=new_messages, - max_tokens=max_tokens, - response_format=response_format, - tools=tools, - depth=depth + 1, - ): - yield event - - def _stream_ollama_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - strict: bool = False, - max_tokens: Optional[int] = None, - depth: int = 0, - ): - return self._stream_openai_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - max_tokens=max_tokens, - depth=depth, - ) - - def _stream_custom_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - strict: bool = False, - max_tokens: Optional[int] = None, - depth: int = 0, - ): - extra_body = {"enable_thinking": False} if self.disable_thinking() else None - return self._stream_openai_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - max_tokens=max_tokens, - extra_body=extra_body, - depth=depth, - ) - - def stream_structured( - self, - model: str, - messages: List[LLMMessage], - response_format: dict, - strict: bool = False, - tools: Optional[List[type[LLMTool] | LLMDynamicTool]] = None, - max_tokens: Optional[int] = None, - ): - parsed_tools = self.tool_calls_handler.parse_tools(tools) - - match self.llm_provider: - case LLMProvider.OPENAI: - return self._stream_openai_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - tools=parsed_tools, - max_tokens=max_tokens, - ) - case LLMProvider.CODEX: - return self._stream_codex_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - tools=parsed_tools, - max_tokens=max_tokens, - ) - case LLMProvider.GOOGLE: - return self._stream_google_structured( - model=model, - messages=messages, - response_format=response_format, - tools=parsed_tools, - max_tokens=max_tokens, - ) - case LLMProvider.ANTHROPIC: - return self._stream_anthropic_structured( - model=model, - messages=messages, - response_format=response_format, - tools=parsed_tools, - max_tokens=max_tokens, - ) - case LLMProvider.OLLAMA: - return self._stream_ollama_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - max_tokens=max_tokens, - ) - case LLMProvider.CUSTOM: - return self._stream_custom_structured( - model=model, - messages=messages, - response_format=response_format, - strict=strict, - max_tokens=max_tokens, - ) - - # ? Web search - async def _search_openai(self, query: str) -> str: - client: AsyncOpenAI = self._client - response = await client.responses.create( - model=get_model(), - tools=[ - { - "type": "web_search_preview", - } - ], - input=query, - ) - return response.output_text - - async def _search_google(self, query: str) -> str: - client: genai.Client = self._client - grounding_tool = GoogleTool(google_search=GoogleSearch()) - config = GenerateContentConfig(tools=[grounding_tool]) - - response = await asyncio.to_thread( - client.models.generate_content, - model=get_model(), - contents=query, - config=config, - ) - return response.text - - async def _search_anthropic(self, query: str) -> str: - client: AsyncAnthropic = self._client - - response = await client.messages.create( - model=get_model(), - max_tokens=4000, - messages=[{"role": "user", "content": query}], - tools=[ - {"type": "web_search_20250305", "name": "web_search", "max_uses": 1} - ], - ) - result = "\n".join( - [each.text for each in response.content if each.type == "text"] - ) - return result diff --git a/servers/fastapi/services/llm_tool_calls_handler.py b/servers/fastapi/services/llm_tool_calls_handler.py deleted file mode 100644 index 63476028..00000000 --- a/servers/fastapi/services/llm_tool_calls_handler.py +++ /dev/null @@ -1,211 +0,0 @@ -import asyncio -from datetime import datetime -import json -from typing import Any, Callable, Coroutine, List, Optional -from fastapi import HTTPException -from enums.llm_provider import LLMProvider -from models.llm_message import ( - AnthropicToolCallMessage, - GoogleToolCallMessage, - OpenAIToolCallMessage, -) -from models.llm_tool_call import AnthropicToolCall, GoogleToolCall, OpenAIToolCall -from models.llm_tools import LLMDynamicTool, LLMTool, SearchWebTool -from utils.schema_utils import ( - ensure_strict_json_schema, - flatten_json_schema, - remove_titles_from_schema, -) - - -class LLMToolCallsHandler: - def __init__(self, client): - from services.llm_client import LLMClient - - self.client: LLMClient = client - - self.tools_map: dict[str, Callable[..., Coroutine[Any, Any, str]]] = { - "SearchWebTool": self.search_web_tool_call_handler, - "GetCurrentDatetimeTool": self.get_current_datetime_tool_call_handler, - } - self.dynamic_tools: List[LLMDynamicTool] = [] - - def get_tool_handler( - self, tool_name: str - ) -> Callable[..., Coroutine[Any, Any, str]]: - handler = self.tools_map.get(tool_name) - if handler: - return handler - else: - dynamic_tools = list( - filter(lambda tool: tool.name == tool_name, self.dynamic_tools) - ) - if dynamic_tools: - return dynamic_tools[0].handler - raise HTTPException(status_code=500, detail=f"Tool {tool_name} not found") - - def parse_tools(self, tools: Optional[List[type[LLMTool] | LLMDynamicTool]] = None): - if tools is None: - return None - parsed_tools = map(self.parse_tool, tools) - return list(parsed_tools) - - def parse_tool(self, tool: type[LLMTool] | LLMDynamicTool, strict: bool = False): - if isinstance(tool, LLMDynamicTool): - self.dynamic_tools.append(tool) - - match self.client.llm_provider: - case LLMProvider.OPENAI | LLMProvider.OLLAMA | LLMProvider.CUSTOM: - return self.parse_tool_openai(tool, strict) - case LLMProvider.ANTHROPIC: - return self.parse_tool_anthropic(tool) - case LLMProvider.GOOGLE: - return self.parse_tool_google(tool) - case _: - raise ValueError( - f"LLM provider must be either openai, anthropic, or google" - ) - - def parse_tool_openai( - self, tool: type[LLMTool] | LLMDynamicTool, strict: bool = False - ): - if isinstance(tool, LLMDynamicTool): - name = tool.name - description = tool.description - parameters = tool.parameters - else: - name = tool.__name__ - description = tool.__doc__ or "" - parameters = tool.model_json_schema() - - if strict: - parameters = ensure_strict_json_schema(parameters, path=(), root=parameters) - - return { - "type": "function", - "function": { - "name": name, - "description": description, - "strict": strict, - "parameters": parameters, - }, - } - - def parse_tool_google(self, tool: type[LLMTool] | LLMDynamicTool): - parsed = self.parse_tool_openai(tool) - parsed["function"]["parameters"] = ( - remove_titles_from_schema( - flatten_json_schema(parsed["function"]["parameters"]) - ) - if parsed["function"]["parameters"] - else {} - ) - return { - "name": parsed["function"]["name"], - "description": parsed["function"]["description"], - "parameters": parsed["function"]["parameters"], - } - - def parse_tool_anthropic(self, tool: type[LLMTool] | LLMDynamicTool): - parsed = self.parse_tool_openai(tool) - input_schema = parsed["function"]["parameters"] - return { - "name": parsed["function"]["name"], - "description": parsed["function"]["description"], - "input_schema": {"type": "object"} if input_schema == {} else input_schema, - } - - async def handle_tool_calls_openai( - self, - tool_calls: List[OpenAIToolCall], - ) -> List[OpenAIToolCallMessage]: - async_tool_calls_tasks = [] - for tool_call in tool_calls: - tool_name = tool_call.function.name - tool_handler = self.get_tool_handler(tool_name) - async_tool_calls_tasks.append(tool_handler(tool_call.function.arguments)) - - tool_call_results: List[str] = await asyncio.gather(*async_tool_calls_tasks) - tool_call_messages = [ - OpenAIToolCallMessage( - content=result, - tool_call_id=tool_call.id, - ) - for tool_call, result in zip(tool_calls, tool_call_results) - ] - return tool_call_messages - - async def handle_tool_calls_google( - self, - tool_calls: List[GoogleToolCall], - ) -> List[GoogleToolCallMessage]: - async_tool_calls_tasks = [] - for tool_call in tool_calls: - tool_name = tool_call.name - tool_handler = self.get_tool_handler(tool_name) - async_tool_calls_tasks.append(tool_handler(json.dumps(tool_call.arguments))) - - tool_call_results: List[str] = await asyncio.gather(*async_tool_calls_tasks) - - tool_call_messages = [ - GoogleToolCallMessage( - id=tool_call.id, - name=tool_call.name, - response={"result": result}, - ) - for tool_call, result in zip(tool_calls, tool_call_results) - ] - return tool_call_messages - - async def handle_tool_calls_anthropic( - self, - tool_calls: List[AnthropicToolCall], - ) -> List[AnthropicToolCallMessage]: - async_tool_calls_tasks = [] - for tool_call in tool_calls: - tool_name = tool_call.name - tool_handler = self.get_tool_handler(tool_name) - async_tool_calls_tasks.append(tool_handler(json.dumps(tool_call.input))) - - tool_call_results: List[str] = await asyncio.gather(*async_tool_calls_tasks) - tool_call_messages = [ - AnthropicToolCallMessage( - content=result, - tool_use_id=tool_call.id, - ) - for tool_call, result in zip(tool_calls, tool_call_results) - ] - return tool_call_messages - - # ? Tool call handlers - # Search web tool call handler - async def search_web_tool_call_handler(self, arguments: str) -> str: - match self.client.llm_provider: - case LLMProvider.OPENAI: - return await self.search_web_tool_call_handler_openai(arguments) - case LLMProvider.ANTHROPIC: - return await self.search_web_tool_call_handler_anthropic(arguments) - case LLMProvider.GOOGLE: - return await self.search_web_tool_call_handler_google(arguments) - case _: - return ( - "Web search tool call handler not implemented for this LLM provider: " - + self.client.llm_provider.value - ) - - async def search_web_tool_call_handler_openai(self, arguments: str) -> str: - args = SearchWebTool.model_validate_json(arguments) - return await self.client._search_openai(args.query) - - async def search_web_tool_call_handler_google(self, arguments: str) -> str: - args = SearchWebTool.model_validate_json(arguments) - return await self.client._search_google(args.query) - - async def search_web_tool_call_handler_anthropic(self, arguments: str) -> str: - args = SearchWebTool.model_validate_json(arguments) - return await self.client._search_anthropic(args.query) - - # Get current datetime tool call handler - async def get_current_datetime_tool_call_handler(self, _) -> str: - current_time = datetime.now() - return f"{current_time.strftime('%A, %B %d, %Y')} at {current_time.strftime('%I:%M:%S %p')}" diff --git a/servers/fastapi/templates/providers.py b/servers/fastapi/templates/providers.py index 9e3a0ba8..1c7f4734 100644 --- a/servers/fastapi/templates/providers.py +++ b/servers/fastapi/templates/providers.py @@ -4,10 +4,17 @@ from dataclasses import dataclass import time from typing import Any, Awaitable, Callable, Optional -from anthropic import AsyncAnthropic from fastapi import HTTPException from google import genai from google.genai import types as google_types +from llmai import AnthropicClient +from llmai.shared import ( + AnthropicClientConfig, + ImageContentPart, + SystemMessage, + TextResponse, + UserMessage, +) from openai import AsyncOpenAI from enums.llm_provider import LLMProvider @@ -160,11 +167,28 @@ def _get_google_client() -> genai.Client: return genai.Client(api_key=api_key) -def _get_anthropic_client() -> AsyncAnthropic: +def _get_anthropic_client() -> AnthropicClient: api_key = get_anthropic_api_key_env() if not api_key: raise HTTPException(status_code=400, detail="ANTHROPIC_API_KEY is not set") - return AsyncAnthropic(api_key=api_key) + return AnthropicClient(config=AnthropicClientConfig(api_key=api_key)) + + +def _read_llmai_response_text(response: Any) -> str: + content = getattr(response, "content", None) + if isinstance(content, str): + return content + if isinstance(content, list): + parts: list[str] = [] + for part in content: + if isinstance(part, str): + parts.append(part) + continue + text = getattr(part, "text", None) + if isinstance(text, str): + parts.append(text) + return "".join(parts) + return getattr(content, "text", None) or "" async def _call_openai_like( @@ -308,28 +332,24 @@ async def _call_anthropic( media_type: str = "image/png", ) -> str: client = _get_anthropic_client() - content = [{"type": "text", "text": user_text}] + content: str | list[object] = user_text if image_bytes: - content.append( - { - "type": "image", - "source": { - "type": "base64", - "media_type": media_type, - "data": base64.b64encode(image_bytes).decode("utf-8"), - }, - } - ) + content = [ + user_text, + ImageContentPart(data=image_bytes, mime_type=media_type), + ] - response = await client.messages.create( + response = await asyncio.to_thread( + client.generate, model=model, + messages=[ + SystemMessage(content=system_prompt), + UserMessage(content=content), + ], + response_format=TextResponse(), max_tokens=8192, - system=system_prompt, - messages=[{"role": "user", "content": content}], - ) - output_text = "".join( - block.text for block in response.content if getattr(block, "type", None) == "text" ) + output_text = _read_llmai_response_text(response) if not output_text: raise HTTPException(status_code=500, detail="No output from template provider") return output_text diff --git a/servers/fastapi/utils/available_models.py b/servers/fastapi/utils/available_models.py index 539533ad..ff4ae3a7 100644 --- a/servers/fastapi/utils/available_models.py +++ b/servers/fastapi/utils/available_models.py @@ -1,4 +1,4 @@ -from anthropic import AsyncAnthropic +import aiohttp from openai import AsyncOpenAI from google import genai @@ -12,8 +12,21 @@ async def list_available_openai_compatible_models(url: str, api_key: str) -> lis async def list_available_anthropic_models(api_key: str) -> list[str]: - client = AsyncAnthropic(api_key=api_key) - return list(map(lambda x: x.id, (await client.models.list(limit=50)).data)) + async with aiohttp.ClientSession( + headers={ + "x-api-key": api_key, + "anthropic-version": "2023-06-01", + } + ) as session: + async with session.get( + "https://api.anthropic.com/v1/models", + params={"limit": 50}, + ) as response: + response.raise_for_status() + data = await response.json() + + models = data.get("data", []) + return [model.get("id") for model in models if model.get("id")] async def list_available_google_models(api_key: str) -> list[str]: diff --git a/servers/fastapi/utils/get_env.py b/servers/fastapi/utils/get_env.py index 5a940f78..ea111630 100644 --- a/servers/fastapi/utils/get_env.py +++ b/servers/fastapi/utils/get_env.py @@ -85,10 +85,6 @@ def get_pixabay_api_key_env(): return os.getenv("PIXABAY_API_KEY") -def get_tool_calls_env(): - return os.getenv("TOOL_CALLS") - - def get_disable_thinking_env(): return os.getenv("DISABLE_THINKING") diff --git a/servers/fastapi/utils/llm_calls/edit_slide.py b/servers/fastapi/utils/llm_calls/edit_slide.py index 99d46b6d..40f692e5 100644 --- a/servers/fastapi/utils/llm_calls/edit_slide.py +++ b/servers/fastapi/utils/llm_calls/edit_slide.py @@ -1,10 +1,14 @@ +import asyncio from datetime import datetime from typing import Optional -from models.llm_message import LLMSystemMessage, LLMUserMessage +from fastapi import HTTPException +from llmai import get_client +from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage from models.presentation_layout import SlideLayoutModel from models.sql.slide import SlideModel -from services.llm_client import LLMClient +from utils.llm_config import get_llm_config from utils.llm_client_error_handler import handle_llm_client_exceptions +from utils.llm_utils import extract_structured_content, get_generate_kwargs from utils.llm_provider import get_model from utils.schema_utils import add_field_in_schema, remove_fields_from_schema @@ -89,12 +93,12 @@ def get_messages( verbosity: Optional[str] = None, instructions: Optional[str] = None, memory_context: Optional[str] = None, -): +) -> list[Message]: return [ - LLMSystemMessage( + SystemMessage( content=get_system_prompt(tone, verbosity, instructions, memory_context), ), - LLMUserMessage( + UserMessage( content=get_user_prompt(prompt, slide_data, language), ), ] @@ -128,23 +132,40 @@ async def get_edited_slide_content( True, ) - client = LLMClient() + client = get_client(config=get_llm_config()) try: - response = await client.generate_structured( - model=model, - messages=get_messages( - prompt, - slide.content, - language, - tone, - verbosity, - instructions, - memory_context, - ), - response_format=response_schema, + response_format = JSONSchemaResponse( + name="response", + json_schema=response_schema, strict=False, ) - return response + messages = get_messages( + prompt, + slide.content, + language, + tone, + verbosity, + instructions, + memory_context, + ) + + for attempt in range(3): + response = await asyncio.to_thread( + client.generate, + **get_generate_kwargs( + model=model, + messages=messages, + response_format=response_format, + ), + ) + content = extract_structured_content(response.content) + if content is not None: + return content + + if attempt < 2: + await asyncio.sleep(0.5 * (attempt + 1)) + + raise HTTPException(status_code=400, detail="LLM did not return any content") except Exception as e: raise handle_llm_client_exceptions(e) diff --git a/servers/fastapi/utils/llm_calls/edit_slide_html.py b/servers/fastapi/utils/llm_calls/edit_slide_html.py index e74a01de..c0915386 100644 --- a/servers/fastapi/utils/llm_calls/edit_slide_html.py +++ b/servers/fastapi/utils/llm_calls/edit_slide_html.py @@ -1,7 +1,11 @@ +import asyncio from typing import Optional -from models.llm_message import LLMSystemMessage, LLMUserMessage -from services.llm_client import LLMClient +from fastapi import HTTPException +from llmai import get_client +from llmai.shared import SystemMessage, UserMessage +from utils.llm_config import get_llm_config from utils.llm_client_error_handler import handle_llm_client_exceptions +from utils.llm_utils import extract_text, get_generate_kwargs from utils.llm_provider import get_model system_prompt = """ @@ -59,18 +63,24 @@ async def get_edited_slide_html( ): model = get_model() - client = LLMClient() + client = get_client(config=get_llm_config()) try: - response = await client.generate( - model=model, - messages=[ - LLMSystemMessage(content=system_prompt), - LLMUserMessage( - content=get_user_prompt(prompt, html, memory_context) - ), - ], + response = await asyncio.to_thread( + client.generate, + **get_generate_kwargs( + model=model, + messages=[ + SystemMessage(content=system_prompt), + UserMessage( + content=get_user_prompt(prompt, html, memory_context) + ), + ], + ), ) - return extract_html_from_response(response) or html + response_text = extract_text(response.content) + if response_text is None: + raise HTTPException(status_code=400, detail="LLM did not return any content") + return extract_html_from_response(response_text) or html except Exception as e: raise handle_llm_client_exceptions(e) diff --git a/servers/fastapi/utils/llm_calls/generate_presentation_outlines.py b/servers/fastapi/utils/llm_calls/generate_presentation_outlines.py index 8ae47ae7..6f2f28ef 100644 --- a/servers/fastapi/utils/llm_calls/generate_presentation_outlines.py +++ b/servers/fastapi/utils/llm_calls/generate_presentation_outlines.py @@ -1,14 +1,26 @@ from datetime import datetime from typing import Optional -from enums.llm_provider import LLMProvider -from models.llm_message import LLMSystemMessage, LLMUserMessage +from llmai import get_client +from llmai.shared import ( + JSONSchemaResponse, + Message, + ResponseStreamCompletionChunk, + SystemMessage, + UserMessage, + WebSearchTool, +) + from models.presentation_outline_model import PresentationOutlineModel -from models.llm_tools import SearchWebTool -from services.llm_client import LLMClient from utils.get_dynamic_models import get_presentation_outline_model_with_n_slides +from utils.llm_config import enable_web_grounding, get_llm_config from utils.llm_client_error_handler import handle_llm_client_exceptions from utils.llm_provider import get_model +from utils.llm_utils import ( + get_generate_kwargs, + serialize_structured_content, + stream_generate_events, +) def get_system_prompt( @@ -125,9 +137,9 @@ def get_messages( instructions: Optional[str] = None, include_title_slide: bool = True, include_table_of_contents: bool = False, -): +) -> list[Message]: return [ - LLMSystemMessage( + SystemMessage( content=get_system_prompt( tone, verbosity, @@ -136,7 +148,7 @@ def get_messages( include_table_of_contents, ), ), - LLMUserMessage( + UserMessage( content=get_user_prompt( content, n_slides, @@ -170,36 +182,47 @@ async def generate_ppt_outline( else PresentationOutlineModel ) - client = LLMClient() - providers_with_search_tool = { - LLMProvider.OPENAI, - LLMProvider.ANTHROPIC, - LLMProvider.GOOGLE, - } - use_search_tool = ( - web_search - and client.enable_web_grounding() - and client.llm_provider in providers_with_search_tool - ) + client = get_client(config=get_llm_config()) + use_search_tool = web_search and enable_web_grounding() try: - async for chunk in client.stream_structured( - model, - get_messages( - content, - n_slides, - language, - additional_context, - tone, - verbosity, - instructions, - include_title_slide, - include_table_of_contents, - ), - response_model.model_json_schema(), + response_format = JSONSchemaResponse( + name="response", + json_schema=response_model.model_json_schema(), strict=True, - tools=([SearchWebTool] if use_search_tool else None), + ) + emitted_content = False + async for event in stream_generate_events( + client, + **get_generate_kwargs( + model=model, + messages=get_messages( + content, + n_slides, + language, + additional_context, + tone, + verbosity, + instructions, + include_title_slide, + include_table_of_contents, + ), + response_format=response_format, + tools=([WebSearchTool()] if use_search_tool else None), + stream=True, + ), ): - yield chunk + if getattr(event, "type", None) == "content": + chunk = getattr(event, "chunk", None) + if chunk: + emitted_content = True + yield chunk + elif ( + isinstance(event, ResponseStreamCompletionChunk) + and not emitted_content + ): + final_content = serialize_structured_content(event.content) + if final_content: + yield final_content except Exception as e: yield handle_llm_client_exceptions(e) diff --git a/servers/fastapi/utils/llm_calls/generate_presentation_structure.py b/servers/fastapi/utils/llm_calls/generate_presentation_structure.py index bbe26172..df890164 100644 --- a/servers/fastapi/utils/llm_calls/generate_presentation_structure.py +++ b/servers/fastapi/utils/llm_calls/generate_presentation_structure.py @@ -1,10 +1,14 @@ -from typing import Optional, Dict +import asyncio +from typing import Optional -from models.llm_message import LLMSystemMessage, LLMUserMessage +from fastapi import HTTPException +from llmai import get_client +from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage from models.presentation_layout import PresentationLayoutModel from models.presentation_outline_model import PresentationOutlineModel -from services.llm_client import LLMClient +from utils.llm_config import get_llm_config from utils.llm_client_error_handler import handle_llm_client_exceptions +from utils.llm_utils import extract_structured_content, get_generate_kwargs from utils.llm_provider import get_model from utils.get_dynamic_models import get_presentation_structure_model_with_n_slides from models.presentation_structure_model import PresentationStructureModel @@ -97,19 +101,21 @@ def get_messages( n_slides: int, data: str, instructions: Optional[str] = None, -): +) -> list[Message]: system_prompt = GET_MESSAGES_SYSTEM_PROMPT.format( user_instruction_header="# User Instruction:" if instructions else "", n_slides=n_slides, ) return [ - LLMSystemMessage(content=system_prompt), - LLMUserMessage(content=( - f"{presentation_layout.to_string()}\n\n" - "--------------------------------------\n\n" - f"{data}" - )), + SystemMessage(content=system_prompt), + UserMessage( + content=( + f"{presentation_layout.to_string()}\n\n" + "--------------------------------------\n\n" + f"{data}" + ) + ), ] @@ -118,20 +124,13 @@ def get_messages_for_slides_markdown( n_slides: int, data: str, instructions: Optional[str] = None, -): +) -> list[Message]: system_prompt = STRUCTURE_FROM_SLIDES_MARKDOWN_SYSTEM_PROMPT.format( user_instructions=instructions or "", presentation_layout=presentation_layout.to_string(with_schema=True), ) - return [ - LLMSystemMessage( - content=system_prompt - ), - LLMUserMessage( - content=data - ) - ] + return [SystemMessage(content=system_prompt), UserMessage(content=data)] async def generate_presentation_structure( @@ -140,34 +139,50 @@ async def generate_presentation_structure( instructions: Optional[str] = None, using_slides_markdown: bool = False, ) -> PresentationStructureModel: - - client = LLMClient() + client = get_client(config=get_llm_config()) model = get_model() response_model = get_presentation_structure_model_with_n_slides( len(presentation_outline.slides) ) try: - response = await client.generate_structured( - model=model, - messages=( - get_messages_for_slides_markdown( - presentation_layout, - len(presentation_outline.slides), - presentation_outline.to_string(), - instructions, - ) - if using_slides_markdown - else get_messages( - presentation_layout, - len(presentation_outline.slides), - presentation_outline.to_string(), - instructions, - ) - ), - response_format=response_model.model_json_schema(), + messages = ( + get_messages_for_slides_markdown( + presentation_layout, + len(presentation_outline.slides), + presentation_outline.to_string(), + instructions, + ) + if using_slides_markdown + else get_messages( + presentation_layout, + len(presentation_outline.slides), + presentation_outline.to_string(), + instructions, + ) + ) + response_format = JSONSchemaResponse( + name="response", + json_schema=response_model.model_json_schema(), strict=True, ) - return PresentationStructureModel(**response) + + for attempt in range(3): + response = await asyncio.to_thread( + client.generate, + **get_generate_kwargs( + model=model, + messages=messages, + response_format=response_format, + ), + ) + content = extract_structured_content(response.content) + if content is not None: + return PresentationStructureModel(**content) + + if attempt < 2: + await asyncio.sleep(0.5 * (attempt + 1)) + + raise HTTPException(status_code=400, detail="LLM did not return any content") except Exception as e: raise handle_llm_client_exceptions(e) diff --git a/servers/fastapi/utils/llm_calls/generate_slide_content.py b/servers/fastapi/utils/llm_calls/generate_slide_content.py index a5010cf2..532fd52d 100644 --- a/servers/fastapi/utils/llm_calls/generate_slide_content.py +++ b/servers/fastapi/utils/llm_calls/generate_slide_content.py @@ -1,11 +1,15 @@ +import asyncio from datetime import datetime import json from typing import Optional -from models.llm_message import LLMSystemMessage, LLMUserMessage +from fastapi import HTTPException +from llmai import get_client +from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage from models.presentation_layout import SlideLayoutModel from models.presentation_outline_model import SlideOutlineModel -from services.llm_client import LLMClient +from utils.llm_config import get_llm_config from utils.llm_client_error_handler import handle_llm_client_exceptions +from utils.llm_utils import extract_structured_content, get_generate_kwargs from utils.llm_provider import get_model from utils.schema_utils import add_field_in_schema, remove_fields_from_schema @@ -130,10 +134,10 @@ def get_messages( verbosity: Optional[str] = None, instructions: Optional[str] = None, response_schema: Optional[dict] = None, -): +) -> list[Message]: return [ - LLMSystemMessage( + SystemMessage( content=get_system_prompt( tone, verbosity, @@ -141,7 +145,7 @@ def get_messages( response_schema, ), ), - LLMUserMessage( + UserMessage( content=get_user_prompt(outline, language), ), ] @@ -155,7 +159,7 @@ async def get_slide_content_from_type_and_outline( verbosity: Optional[str] = None, instructions: Optional[str] = None, ): - client = LLMClient() + client = get_client(config=get_llm_config()) model = get_model() response_schema = remove_fields_from_schema( @@ -175,20 +179,37 @@ async def get_slide_content_from_type_and_outline( ) try: - response = await client.generate_structured( - model=model, - messages=get_messages( - outline.content, - language, - tone, - verbosity, - instructions, - response_schema, - ), - response_format=response_schema, + response_format = JSONSchemaResponse( + name="response", + json_schema=response_schema, strict=False, ) - return response + messages = get_messages( + outline.content, + language, + tone, + verbosity, + instructions, + response_schema, + ) + + for attempt in range(3): + response = await asyncio.to_thread( + client.generate, + **get_generate_kwargs( + model=model, + messages=messages, + response_format=response_format, + ), + ) + content = extract_structured_content(response.content) + if content is not None: + return content + + if attempt < 2: + await asyncio.sleep(0.5 * (attempt + 1)) + + raise HTTPException(status_code=400, detail="LLM did not return any content") except Exception as e: raise handle_llm_client_exceptions(e) diff --git a/servers/fastapi/utils/llm_calls/select_slide_type_on_edit.py b/servers/fastapi/utils/llm_calls/select_slide_type_on_edit.py index f12e7d07..8719b561 100644 --- a/servers/fastapi/utils/llm_calls/select_slide_type_on_edit.py +++ b/servers/fastapi/utils/llm_calls/select_slide_type_on_edit.py @@ -1,9 +1,13 @@ -from models.llm_message import LLMSystemMessage, LLMUserMessage +import asyncio +from fastapi import HTTPException +from llmai import get_client +from llmai.shared import JSONSchemaResponse, Message, SystemMessage, UserMessage from models.presentation_layout import PresentationLayoutModel, SlideLayoutModel from models.slide_layout_index import SlideLayoutIndex from models.sql.slide import SlideModel -from services.llm_client import LLMClient +from utils.llm_config import get_llm_config from utils.llm_client_error_handler import handle_llm_client_exceptions +from utils.llm_utils import extract_structured_content, get_generate_kwargs from utils.llm_provider import get_model @@ -13,7 +17,7 @@ def get_messages( layout: PresentationLayoutModel, current_slide_layout: int, memory_context: str = "", -): +) -> list[Message]: memory_block = ( f"\n # Retrieved Presentation Memory Context\n {memory_context}\n" if memory_context @@ -21,7 +25,7 @@ def get_messages( ) return [ - LLMSystemMessage( + SystemMessage( content=f""" Select a Slide Layout index based on provided user prompt and current slide data. {layout.to_string()} @@ -34,7 +38,7 @@ def get_messages( **Go through all notes and steps and make sure they are followed, including mentioned constraints** """, ), - LLMUserMessage( + UserMessage( content=f""" - User Prompt: {prompt} - Current Slide Data: {slide_data} @@ -50,27 +54,43 @@ async def get_slide_layout_from_prompt( slide: SlideModel, memory_context: str = "", ) -> SlideLayoutModel: - - client = LLMClient() + client = get_client(config=get_llm_config()) model = get_model() slide_layout_index = layout.get_slide_layout_index(slide.layout) try: - response = await client.generate_structured( - model=model, - messages=get_messages( - prompt, - slide.content, - layout, - slide_layout_index, - memory_context, - ), - response_format=SlideLayoutIndex.model_json_schema(), + response_format = JSONSchemaResponse( + name="response", + json_schema=SlideLayoutIndex.model_json_schema(), strict=True, ) - index = SlideLayoutIndex(**response).index - return layout.slides[index] + messages = get_messages( + prompt, + slide.content, + layout, + slide_layout_index, + memory_context, + ) + + for attempt in range(3): + response = await asyncio.to_thread( + client.generate, + **get_generate_kwargs( + model=model, + messages=messages, + response_format=response_format, + ), + ) + content = extract_structured_content(response.content) + if content is not None: + index = SlideLayoutIndex(**content).index + return layout.slides[index] + + if attempt < 2: + await asyncio.sleep(0.5 * (attempt + 1)) + + raise HTTPException(status_code=400, detail="LLM did not return any content") except Exception as e: raise handle_llm_client_exceptions(e) diff --git a/servers/fastapi/utils/llm_client_error_handler.py b/servers/fastapi/utils/llm_client_error_handler.py index 7e4c915b..ab27c300 100644 --- a/servers/fastapi/utils/llm_client_error_handler.py +++ b/servers/fastapi/utils/llm_client_error_handler.py @@ -1,18 +1,19 @@ from fastapi import HTTPException -from anthropic import APIError as AnthropicAPIError from openai import APIError as OpenAIAPIError from google.genai.errors import APIError as GoogleAPIError import traceback +from llmai.shared.errors import BaseError as LLMAIBaseError + def handle_llm_client_exceptions(e: Exception) -> HTTPException: traceback.print_exc() + if isinstance(e, HTTPException): + return e + if isinstance(e, LLMAIBaseError): + return HTTPException(status_code=e.status_code, detail=e.message) if isinstance(e, OpenAIAPIError): return HTTPException(status_code=500, detail=f"OpenAI API error: {e.message}") if isinstance(e, GoogleAPIError): return HTTPException(status_code=500, detail=f"Google API error: {e.message}") - if isinstance(e, AnthropicAPIError): - return HTTPException( - status_code=500, detail=f"Anthropic API error: {e.message}" - ) return HTTPException(status_code=500, detail=f"LLM API error: {e}") diff --git a/servers/fastapi/utils/llm_config.py b/servers/fastapi/utils/llm_config.py new file mode 100644 index 00000000..bef4dcda --- /dev/null +++ b/servers/fastapi/utils/llm_config.py @@ -0,0 +1,146 @@ +import time +from typing import Optional + +from fastapi import HTTPException +from llmai.shared import ( + AnthropicClientConfig, + ChatGPTClientConfig, + ClientConfig, + GoogleClientConfig, + OpenAIApiType, + OpenAIClientConfig, +) + +from enums.llm_provider import LLMProvider +from utils.get_env import ( + get_anthropic_api_key_env, + get_codex_access_token_env, + get_codex_account_id_env, + get_codex_refresh_token_env, + get_codex_token_expires_env, + get_custom_llm_api_key_env, + get_custom_llm_url_env, + get_disable_thinking_env, + get_google_api_key_env, + get_ollama_url_env, + get_openai_api_key_env, + get_web_grounding_env, +) +from utils.llm_provider import get_llm_provider +from utils.parsers import parse_bool_or_none +from utils.set_env import ( + set_codex_access_token_env, + set_codex_account_id_env, + set_codex_refresh_token_env, + set_codex_token_expires_env, +) + + +def enable_web_grounding() -> bool: + return parse_bool_or_none(get_web_grounding_env()) or False + + +def disable_thinking() -> bool: + return parse_bool_or_none(get_disable_thinking_env()) or False + + +def _get_codex_access_token() -> str: + access_token = get_codex_access_token_env() + if not access_token: + raise HTTPException( + status_code=400, + detail=( + "Codex OAuth access token is not set. Please authenticate via " + "/api/v1/ppt/codex/auth/initiate" + ), + ) + + expires_str = get_codex_token_expires_env() + if expires_str: + try: + expires_ms = int(expires_str) + now_ms = int(time.time() * 1000) + if now_ms >= expires_ms - 60_000: + refresh_token = get_codex_refresh_token_env() + if refresh_token: + from utils.oauth.openai_codex import ( + TokenSuccess, + get_account_id, + refresh_access_token, + ) + + result = refresh_access_token(refresh_token) + if isinstance(result, TokenSuccess): + set_codex_access_token_env(result.access) + set_codex_refresh_token_env(result.refresh) + set_codex_token_expires_env(str(result.expires)) + account_id = get_account_id(result.access) + if account_id: + set_codex_account_id_env(account_id) + access_token = result.access + except (TypeError, ValueError): + pass + + return access_token + + +def get_llm_config() -> ClientConfig: + llm_provider = get_llm_provider() + + match llm_provider: + case LLMProvider.OPENAI: + api_key = get_openai_api_key_env() + if not api_key: + raise HTTPException(status_code=400, detail="OpenAI API Key is not set") + return OpenAIClientConfig( + api_key=api_key, + api_type=OpenAIApiType.RESPONSES, + ) + case LLMProvider.GOOGLE: + api_key = get_google_api_key_env() + if not api_key: + raise HTTPException(status_code=400, detail="Google API Key is not set") + return GoogleClientConfig(api_key=api_key) + case LLMProvider.ANTHROPIC: + api_key = get_anthropic_api_key_env() + if not api_key: + raise HTTPException( + status_code=400, + detail="Anthropic API Key is not set", + ) + return AnthropicClientConfig(api_key=api_key) + case LLMProvider.OLLAMA: + return OpenAIClientConfig( + base_url=(get_ollama_url_env() or "http://localhost:11434") + "/v1", + api_key="ollama", + ) + case LLMProvider.CUSTOM: + base_url = get_custom_llm_url_env() + if not base_url: + raise HTTPException( + status_code=400, + detail="Custom LLM URL is not set", + ) + return OpenAIClientConfig( + base_url=base_url, + api_key=get_custom_llm_api_key_env() or "null", + ) + case LLMProvider.CODEX: + return ChatGPTClientConfig( + access_token=_get_codex_access_token(), + account_id=get_codex_account_id_env() or None, + ) + case _: + raise HTTPException( + status_code=400, + detail=( + "LLM Provider must be either openai, google, anthropic, " + "ollama, custom, or codex" + ), + ) + + +def get_extra_body() -> Optional[dict]: + if get_llm_provider() == LLMProvider.CUSTOM and disable_thinking(): + return {"enable_thinking": False} + return None diff --git a/servers/fastapi/utils/llm_utils.py b/servers/fastapi/utils/llm_utils.py new file mode 100644 index 00000000..c10a7341 --- /dev/null +++ b/servers/fastapi/utils/llm_utils.py @@ -0,0 +1,134 @@ +import asyncio +import json +from collections.abc import AsyncGenerator, Sequence +from typing import Any, Optional + +import dirtyjson +from llmai.shared import ( + LLMTool, + Message, + ResponseFormat, + normalize_content_parts, +) + +from utils.llm_config import get_extra_body + + +def get_generate_kwargs( + model: str, + messages: Sequence[Message], + max_tokens: Optional[int] = None, + tools: Optional[list[LLMTool]] = None, + response_format: Optional[ResponseFormat] = None, + stream: bool = False, +) -> dict[str, Any]: + kwargs: dict[str, Any] = { + "model": model, + "messages": list(messages), + "stream": stream, + } + if max_tokens is not None: + kwargs["max_tokens"] = max_tokens + if tools: + kwargs["tools"] = tools + if response_format is not None: + kwargs["response_format"] = response_format + + extra_body = get_extra_body() + if extra_body: + kwargs["extra_body"] = extra_body + + return kwargs + + +def extract_text(content: Any) -> Optional[str]: + if content is None: + return None + if isinstance(content, str): + return content + if isinstance(content, Sequence) and not isinstance(content, (bytes, bytearray)): + parts: list[str] = [] + for part in content: + if isinstance(part, str): + parts.append(part) + continue + text = getattr(part, "text", None) + if isinstance(text, str): + parts.append(text) + joined = "".join(parts) + return joined or None + text = getattr(content, "text", None) + if isinstance(text, str): + return text + return None + + +def extract_structured_content(content: Any) -> Optional[dict]: + if content is None: + return None + if isinstance(content, dict): + return content + if hasattr(content, "model_dump"): + dumped = content.model_dump(mode="json") + if isinstance(dumped, dict): + return dumped + + raw_text = extract_text(content) + if not raw_text: + return None + + try: + parsed = dirtyjson.loads(raw_text) + except Exception: + return None + + if isinstance(parsed, dict): + return dict(parsed) + return None + + +def serialize_structured_content(content: Any) -> Optional[str]: + parsed = extract_structured_content(content) + if parsed is not None: + return json.dumps(parsed, ensure_ascii=False) + + raw_text = extract_text(content) + if raw_text: + return raw_text + return None + + +def message_content_to_text(content: Sequence[Any] | str | None) -> Optional[str]: + joined = "".join( + part.text + for part in normalize_content_parts(content) + if isinstance(getattr(part, "text", None), str) + ) + return joined or None + + +async def stream_generate_events(client: Any, **kwargs) -> AsyncGenerator[Any, None]: + loop = asyncio.get_running_loop() + queue: asyncio.Queue[Any] = asyncio.Queue() + sentinel = object() + + def worker(): + try: + for event in client.generate(**kwargs): + loop.call_soon_threadsafe(queue.put_nowait, event) + except Exception as exc: + loop.call_soon_threadsafe(queue.put_nowait, exc) + finally: + loop.call_soon_threadsafe(queue.put_nowait, sentinel) + + worker_task = asyncio.create_task(asyncio.to_thread(worker)) + try: + while True: + item = await queue.get() + if item is sentinel: + break + if isinstance(item, Exception): + raise item + yield item + finally: + await worker_task diff --git a/servers/fastapi/utils/set_env.py b/servers/fastapi/utils/set_env.py index 1a367735..18456d8e 100644 --- a/servers/fastapi/utils/set_env.py +++ b/servers/fastapi/utils/set_env.py @@ -73,10 +73,6 @@ def set_disable_image_generation_env(value): os.environ["DISABLE_IMAGE_GENERATION"] = value -def set_tool_calls_env(value): - os.environ["TOOL_CALLS"] = value - - def set_disable_thinking_env(value): os.environ["DISABLE_THINKING"] = value diff --git a/servers/fastapi/utils/user_config.py b/servers/fastapi/utils/user_config.py index b7bfaab1..bc499075 100644 --- a/servers/fastapi/utils/user_config.py +++ b/servers/fastapi/utils/user_config.py @@ -22,7 +22,6 @@ from utils.get_env import ( get_openai_api_key_env, get_openai_model_env, get_pexels_api_key_env, - get_tool_calls_env, get_user_config_path_env, get_image_provider_env, get_pixabay_api_key_env, @@ -63,7 +62,6 @@ from utils.set_env import ( set_pexels_api_key_env, set_image_provider_env, set_pixabay_api_key_env, - set_tool_calls_env, set_web_grounding_env, set_codex_access_token_env, set_codex_refresh_token_env, @@ -118,11 +116,6 @@ def get_user_config(): DALL_E_3_QUALITY=existing_config.DALL_E_3_QUALITY or get_dall_e_3_quality_env(), GPT_IMAGE_1_5_QUALITY=existing_config.GPT_IMAGE_1_5_QUALITY or get_gpt_image_1_5_quality_env(), - TOOL_CALLS=( - existing_config.TOOL_CALLS - if existing_config.TOOL_CALLS is not None - else (parse_bool_or_none(get_tool_calls_env()) or False) - ), DISABLE_THINKING=( existing_config.DISABLE_THINKING if existing_config.DISABLE_THINKING is not None @@ -197,8 +190,6 @@ def update_env_with_user_config(): set_dall_e_3_quality_env(user_config.DALL_E_3_QUALITY) if user_config.GPT_IMAGE_1_5_QUALITY: set_gpt_image_1_5_quality_env(user_config.GPT_IMAGE_1_5_QUALITY) - if user_config.TOOL_CALLS is not None: - set_tool_calls_env(str(user_config.TOOL_CALLS)) if user_config.DISABLE_THINKING is not None: set_disable_thinking_env(str(user_config.DISABLE_THINKING)) if user_config.EXTENDED_REASONING is not None: diff --git a/servers/fastapi/uv.lock b/servers/fastapi/uv.lock index 2e7c3f0e..e0ded891 100644 --- a/servers/fastapi/uv.lock +++ b/servers/fastapi/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = "==3.11.*" [[package]] @@ -238,6 +238,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ea/44/b749f8777b020b420bceaaf60f66432fc30cc904ca5b69640ec9cbef11ed/blis-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:27f82b8633030f8d095d2b412dffa7eb6dbc8ee43813139909a20012e54422ea", size = 6171233, upload-time = "2025-11-17T12:27:41.921Z" }, ] +[[package]] +name = "boto3" +version = "1.42.94" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, + { name = "jmespath" }, + { name = "s3transfer" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/6a/6a/95302333208830de932ad1d0b69599ee13e936349a44981fb72632507861/boto3-1.42.94.tar.gz", hash = "sha256:5b6056a661c19e974aaea3cb97690ddbe30d10c31e4f887df3bff06574f34510", size = 113211, upload-time = "2026-04-22T20:36:19.167Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c4/6f/4e175604f3168befcb413c95bf45eada67d12042f92f76a9305d6a817ea9/boto3-1.42.94-py3-none-any.whl", hash = "sha256:56d53bce75629cc7c78a32da8b62de74cee3e2a3d54a2b60ba1a65f9f1b129da", size = 140555, upload-time = "2026-04-22T20:36:16.182Z" }, +] + +[[package]] +name = "botocore" +version = "1.42.94" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jmespath" }, + { name = "python-dateutil" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b7/90/1a4d0e81b325d38e37f81d907ceacac3b8f509ad38b495bb95086ecb609d/botocore-1.42.94.tar.gz", hash = "sha256:41c6b3b11b073221a41f52b222ba387be34459fb77cdc506e8b74cdaf24bdcce", size = 15260901, upload-time = "2026-04-22T20:36:00.853Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/61/73/313af9ee02ac0155247bcf3f04fcf54fcae2e33250bb437528c18aeefd81/botocore-1.42.94-py3-none-any.whl", hash = "sha256:a2143742132ed0f6cdb90204d667b89d0301068b1045e8bc099efa267bf1b348", size = 14942938, upload-time = "2026-04-22T20:35:55.663Z" }, +] + [[package]] name = "cachetools" version = "7.0.6" @@ -783,7 +811,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/c6/dba32cab7e3a625b011aa5647486e2d28423a48845a2998c126dd69c85e1/greenlet-3.4.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:805bebb4945094acbab757d34d6e1098be6de8966009ab9ca54f06ff492def58", size = 285504, upload-time = "2026-04-08T15:52:14.071Z" }, { url = "https://files.pythonhosted.org/packages/54/f4/7cb5c2b1feb9a1f50e038be79980dfa969aa91979e5e3a18fdbcfad2c517/greenlet-3.4.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:439fc2f12b9b512d9dfa681c5afe5f6b3232c708d13e6f02c845e0d9f4c2d8c6", size = 605476, upload-time = "2026-04-08T16:24:37.064Z" }, { url = "https://files.pythonhosted.org/packages/d6/af/b66ab0b2f9a4c5a867c136bf66d9599f34f21a1bcca26a2884a29c450bd9/greenlet-3.4.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a70ed1cb0295bee1df57b63bf7f46b4e56a5c93709eea769c1fec1bb23a95875", size = 618336, upload-time = "2026-04-08T16:30:56.59Z" }, + { url = "https://files.pythonhosted.org/packages/6d/31/56c43d2b5de476f77d36ceeec436328533bff960a4cba9a07616e93063ab/greenlet-3.4.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8c5696c42e6bb5cfb7c6ff4453789081c66b9b91f061e5e9367fa15792644e76", size = 625045, upload-time = "2026-04-08T16:40:37.111Z" }, { url = "https://files.pythonhosted.org/packages/e5/5c/8c5633ece6ba611d64bf2770219a98dd439921d6424e4e8cf16b0ac74ea5/greenlet-3.4.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c660bce1940a1acae5f51f0a064f1bc785d07ea16efcb4bc708090afc4d69e83", size = 613515, upload-time = "2026-04-08T15:56:32.478Z" }, + { url = "https://files.pythonhosted.org/packages/80/ca/704d4e2c90acb8bdf7ae593f5cbc95f58e82de95cc540fb75631c1054533/greenlet-3.4.0-cp311-cp311-manylinux_2_39_riscv64.whl", hash = "sha256:89995ce5ddcd2896d89615116dd39b9703bfa0c07b583b85b89bf1b5d6eddf81", size = 419745, upload-time = "2026-04-08T16:43:04.022Z" }, { url = "https://files.pythonhosted.org/packages/a9/df/950d15bca0d90a0e7395eb777903060504cdb509b7b705631e8fb69ff415/greenlet-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ee407d4d1ca9dc632265aee1c8732c4a2d60adff848057cdebfe5fe94eb2c8a2", size = 1574623, upload-time = "2026-04-08T16:26:18.596Z" }, { url = "https://files.pythonhosted.org/packages/1a/e7/0839afab829fcb7333c9ff6d80c040949510055d2d4d63251f0d1c7c804e/greenlet-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:956215d5e355fffa7c021d168728321fd4d31fd730ac609b1653b450f6a4bc71", size = 1639579, upload-time = "2026-04-08T15:57:29.231Z" }, { url = "https://files.pythonhosted.org/packages/d9/2b/b4482401e9bcaf9f5c97f67ead38db89c19520ff6d0d6699979c6efcc200/greenlet-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:5cb614ace7c27571270354e9c9f696554d073f8aa9319079dcba466bbdead711", size = 238233, upload-time = "2026-04-08T17:02:54.286Z" }, @@ -1057,6 +1087,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/99/8f/15e7741ff19e9bcd4d753f7ff22f988fd54592f134ca13701c13ea8c20e0/jiter-0.14.0-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e52c076f187405fc21523c746c04399c9af8ece566077ed147b2126f2bcba577", size = 351445, upload-time = "2026-04-10T14:28:33.093Z" }, ] +[[package]] +name = "jmespath" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, +] + [[package]] name = "joblib" version = "1.5.3" @@ -1146,6 +1185,28 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/db/e655086b7f3a705df045bf0933bdd9c2f79bb3c97bfef1384598bb79a217/keyring-25.7.0-py3-none-any.whl", hash = "sha256:be4a0b195f149690c166e850609a477c532ddbfbaed96a404d4e43f8d5e2689f", size = 39160, upload-time = "2025-11-16T16:26:08.402Z" }, ] +[[package]] +name = "llmai" +version = "0.1.8" +source = { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" } +dependencies = [ + { name = "anthropic" }, + { name = "boto3" }, + { name = "google-genai" }, + { name = "openai" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl", hash = "sha256:c4bae504dae928e88e8437bd3e2e5eb573f459d6df9ed8fc182671ee99b3cf1b" }, +] + +[package.metadata] +requires-dist = [ + { name = "anthropic", specifier = ">=0.79.0" }, + { name = "boto3", specifier = ">=1.42.89" }, + { name = "google-genai", specifier = ">=1.62.0" }, + { name = "openai", specifier = ">=2.18.0" }, +] + [[package]] name = "loguru" version = "0.7.3" @@ -1604,13 +1665,13 @@ dependencies = [ { name = "aiomysql" }, { name = "aiosqlite" }, { name = "alembic" }, - { name = "anthropic" }, { name = "asyncpg" }, { name = "dirtyjson" }, { name = "fastapi", extra = ["standard"] }, { name = "fastembed-vectorstore" }, { name = "fastmcp" }, { name = "google-genai" }, + { name = "llmai" }, { name = "mem0ai", extra = ["nlp"] }, { name = "nltk" }, { name = "openai" }, @@ -1626,13 +1687,13 @@ requires-dist = [ { name = "aiomysql", specifier = ">=0.2.0" }, { name = "aiosqlite", specifier = ">=0.21.0" }, { name = "alembic", specifier = ">=1.14.0" }, - { name = "anthropic", specifier = ">=0.60.0" }, { name = "asyncpg", specifier = ">=0.30.0" }, { name = "dirtyjson", specifier = ">=1.0.8" }, { name = "fastapi", extras = ["standard"], specifier = ">=0.116.1" }, { name = "fastembed-vectorstore", specifier = ">=0.5.2" }, { name = "fastmcp", specifier = ">=2.11.0" }, { name = "google-genai", specifier = ">=1.28.0" }, + { name = "llmai", url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" }, { name = "mem0ai", extras = ["nlp"], specifier = ">=0.1.115" }, { name = "nltk", specifier = ">=3.9.1" }, { name = "openai", specifier = ">=1.98.0" }, @@ -2200,6 +2261,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" }, ] +[[package]] +name = "s3transfer" +version = "0.16.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "botocore" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/46/29/af14f4ef3c11a50435308660e2cc68761c9a7742475e0585cd4396b91777/s3transfer-0.16.1.tar.gz", hash = "sha256:8e424355754b9ccb32467bdc568edf55be82692ef2002d934b1311dbb3b9e524", size = 154801, upload-time = "2026-04-22T20:36:06.475Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/03/19/90d7d4ed51932c022d53f1d02d564b62d10e272692a1f9b76425c1ad2a02/s3transfer-0.16.1-py3-none-any.whl", hash = "sha256:61bcd00ccb83b21a0fe7e91a553fff9729d46c83b4e0106e7c314a733891f7c2", size = 86825, upload-time = "2026-04-22T20:36:04.992Z" }, +] + [[package]] name = "secretstorage" version = "3.5.0" diff --git a/servers/nextjs/components/CustomConfig.tsx b/servers/nextjs/components/CustomConfig.tsx index f79557fa..9ac1cebc 100644 --- a/servers/nextjs/components/CustomConfig.tsx +++ b/servers/nextjs/components/CustomConfig.tsx @@ -20,7 +20,6 @@ interface CustomConfigProps { customLlmUrl: string; customLlmApiKey: string; customModel: string; - toolCalls: boolean; disableThinking: boolean; onInputChange: (value: string | boolean, field: string) => void; } @@ -29,7 +28,6 @@ export default function CustomConfig({ customLlmUrl, customLlmApiKey, customModel, - toolCalls, disableThinking, onInputChange, }: CustomConfigProps) { @@ -165,9 +163,8 @@ export default function CustomConfig({

- Important: Only models with function - calling capabilities (tool calls) or JSON schema support - will work. + Important: Only models with structured + JSON schema output support will work reliably.

)} - - {/* Tool Calls Toggle */} -
-
- - onInputChange(checked, "tool_calls")} - /> -
-

- - If enabled, Tool Calls will be used instead of JSON Schema for Structured Output. -

-
{/* Disable Thinking Toggle */}
@@ -266,4 +246,4 @@ export default function CustomConfig({
); -} \ No newline at end of file +} diff --git a/servers/nextjs/components/LLMSelection.tsx b/servers/nextjs/components/LLMSelection.tsx index 32ba272f..86682ea3 100644 --- a/servers/nextjs/components/LLMSelection.tsx +++ b/servers/nextjs/components/LLMSelection.tsx @@ -292,7 +292,6 @@ export default function LLMProviderSelection({ customLlmUrl={llmConfig.CUSTOM_LLM_URL || ""} customLlmApiKey={llmConfig.CUSTOM_LLM_API_KEY || ""} customModel={llmConfig.CUSTOM_MODEL || ""} - toolCalls={llmConfig.TOOL_CALLS || false} disableThinking={llmConfig.DISABLE_THINKING || false} onInputChange={input_field_changed} /> diff --git a/servers/nextjs/types/llm_config.ts b/servers/nextjs/types/llm_config.ts index 3559b065..976b77b1 100644 --- a/servers/nextjs/types/llm_config.ts +++ b/servers/nextjs/types/llm_config.ts @@ -42,7 +42,6 @@ export interface LLMConfig { GPT_IMAGE_1_5_QUALITY?: string; // Other Configs - TOOL_CALLS?: boolean; DISABLE_THINKING?: boolean; EXTENDED_REASONING?: boolean; WEB_GROUNDING?: boolean; diff --git a/servers/nextjs/utils/providerUtils.ts b/servers/nextjs/utils/providerUtils.ts index da23f138..92ec57ee 100644 --- a/servers/nextjs/utils/providerUtils.ts +++ b/servers/nextjs/utils/providerUtils.ts @@ -46,7 +46,6 @@ export const updateLLMConfig = ( image_provider: "IMAGE_PROVIDER", disable_image_generation: "DISABLE_IMAGE_GENERATION", use_custom_url: "USE_CUSTOM_URL", - tool_calls: "TOOL_CALLS", disable_thinking: "DISABLE_THINKING", extended_reasoning: "EXTENDED_REASONING", web_grounding: "WEB_GROUNDING", @@ -244,4 +243,4 @@ export const pullOllamaModel = async ( void pollOnce(); }, 1000); }); -}; \ No newline at end of file +}; diff --git a/start.js b/start.js index 4eddda4e..6e2425dc 100644 --- a/start.js +++ b/start.js @@ -175,7 +175,6 @@ const setupUserConfigFromEnv = () => { PIXABAY_API_KEY: process.env.PIXABAY_API_KEY || existingConfig.PIXABAY_API_KEY, IMAGE_PROVIDER: process.env.IMAGE_PROVIDER || existingConfig.IMAGE_PROVIDER, - TOOL_CALLS: process.env.TOOL_CALLS || existingConfig.TOOL_CALLS, DISABLE_THINKING: process.env.DISABLE_THINKING || existingConfig.DISABLE_THINKING, EXTENDED_REASONING: From 98d74057770834b4b7f146db6932e38cf0e471b2 Mon Sep 17 00:00:00 2001 From: sauravniraula Date: Thu, 23 Apr 2026 13:20:18 +0545 Subject: [PATCH 2/3] chore: version bump of llmai to 0.1.9 --- servers/fastapi/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/servers/fastapi/pyproject.toml b/servers/fastapi/pyproject.toml index 8fa45385..5eb3e431 100644 --- a/servers/fastapi/pyproject.toml +++ b/servers/fastapi/pyproject.toml @@ -25,7 +25,7 @@ dependencies = [ "pdfplumber>=0.11.7", "python-pptx>=1.0.2", "sqlmodel>=0.0.24", - "llmai==0.1.8", + "llmai==0.1.9", ] [tool.uv] From 1030f07ec7aaefeb10aecba1e165fc66592720f6 Mon Sep 17 00:00:00 2001 From: sauravniraula Date: Thu, 23 Apr 2026 13:33:35 +0545 Subject: [PATCH 3/3] chore: fixes chatgpt none content issue --- servers/fastapi/presenton_backend.egg-info/PKG-INFO | 2 +- .../fastapi/presenton_backend.egg-info/requires.txt | 2 +- servers/fastapi/pyproject.toml | 13 +++++++++---- servers/fastapi/uv.lock | 8 ++++---- 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/servers/fastapi/presenton_backend.egg-info/PKG-INFO b/servers/fastapi/presenton_backend.egg-info/PKG-INFO index c24c29ca..a2ae42d4 100644 --- a/servers/fastapi/presenton_backend.egg-info/PKG-INFO +++ b/servers/fastapi/presenton_backend.egg-info/PKG-INFO @@ -20,4 +20,4 @@ Requires-Dist: pathvalidate>=3.3.1 Requires-Dist: pdfplumber>=0.11.7 Requires-Dist: python-pptx>=1.0.2 Requires-Dist: sqlmodel>=0.0.24 -Requires-Dist: llmai==0.1.8 +Requires-Dist: llmai==0.1.9 diff --git a/servers/fastapi/presenton_backend.egg-info/requires.txt b/servers/fastapi/presenton_backend.egg-info/requires.txt index 87b670ce..b7f83600 100644 --- a/servers/fastapi/presenton_backend.egg-info/requires.txt +++ b/servers/fastapi/presenton_backend.egg-info/requires.txt @@ -15,4 +15,4 @@ pathvalidate>=3.3.1 pdfplumber>=0.11.7 python-pptx>=1.0.2 sqlmodel>=0.0.24 -llmai==0.1.8 +llmai==0.1.9 diff --git a/servers/fastapi/pyproject.toml b/servers/fastapi/pyproject.toml index 5eb3e431..b36d123b 100644 --- a/servers/fastapi/pyproject.toml +++ b/servers/fastapi/pyproject.toml @@ -31,9 +31,14 @@ dependencies = [ [tool.uv] index-strategy = "unsafe-best-match" -[tool.uv.sources] -llmai = { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" } - [tool.setuptools.packages.find] where = ["."] -include = ["api*", "enums*", "models*", "services*", "constants*", "utils*", "templates*"] +include = [ + "api*", + "enums*", + "models*", + "services*", + "constants*", + "utils*", + "templates*", +] diff --git a/servers/fastapi/uv.lock b/servers/fastapi/uv.lock index e0ded891..0a12daf7 100644 --- a/servers/fastapi/uv.lock +++ b/servers/fastapi/uv.lock @@ -1187,8 +1187,8 @@ wheels = [ [[package]] name = "llmai" -version = "0.1.8" -source = { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" } +version = "0.1.9" +source = { url = "https://files.pythonhosted.org/packages/c6/86/5dcfd77b634947cd570680b13217b40bc72cd7d9e7f04cc1a52ff5f549a0/llmai-0.1.9-py3-none-any.whl" } dependencies = [ { name = "anthropic" }, { name = "boto3" }, @@ -1196,7 +1196,7 @@ dependencies = [ { name = "openai" }, ] wheels = [ - { url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl", hash = "sha256:c4bae504dae928e88e8437bd3e2e5eb573f459d6df9ed8fc182671ee99b3cf1b" }, + { url = "https://files.pythonhosted.org/packages/c6/86/5dcfd77b634947cd570680b13217b40bc72cd7d9e7f04cc1a52ff5f549a0/llmai-0.1.9-py3-none-any.whl", hash = "sha256:dcd94502516586bbd6394fe2c9c610941ff4c19eae0f1316825435f35134cfb4" }, ] [package.metadata] @@ -1693,7 +1693,7 @@ requires-dist = [ { name = "fastembed-vectorstore", specifier = ">=0.5.2" }, { name = "fastmcp", specifier = ">=2.11.0" }, { name = "google-genai", specifier = ">=1.28.0" }, - { name = "llmai", url = "https://files.pythonhosted.org/packages/49/9e/64fb2453d9eace7fd50b25635ae267422d014c64861ac511a5e953884f85/llmai-0.1.8-py3-none-any.whl" }, + { name = "llmai", url = "https://files.pythonhosted.org/packages/c6/86/5dcfd77b634947cd570680b13217b40bc72cd7d9e7f04cc1a52ff5f549a0/llmai-0.1.9-py3-none-any.whl" }, { name = "mem0ai", extras = ["nlp"], specifier = ">=0.1.115" }, { name = "nltk", specifier = ">=3.9.1" }, { name = "openai", specifier = ">=1.98.0" },