diff --git a/servers/fastapi/api/routers/presentation/handlers/decompose_documents.py b/servers/fastapi/api/routers/presentation/handlers/decompose_documents.py
index 24e5519b..937bca28 100644
--- a/servers/fastapi/api/routers/presentation/handlers/decompose_documents.py
+++ b/servers/fastapi/api/routers/presentation/handlers/decompose_documents.py
@@ -1,5 +1,3 @@
-import asyncio
-from typing import List
import uuid
from api.models import LogMetadata
from api.routers.presentation.models import (
@@ -37,7 +35,7 @@ class DecomposeDocumentsHandler:
file_path = TEMP_FILE_SERVICE.create_temp_file_path(
f"{str(uuid.uuid4())}.txt", self.temp_dir
)
- parsed_doc = parsed_doc.page_content.replace("
", "\n")
+ parsed_doc = parsed_doc.replace("
", "\n")
with open(file_path, "w") as text_file:
text_file.write(parsed_doc)
document_paths.append(file_path)
diff --git a/servers/fastapi/api/routers/presentation/handlers/generate_outlines.py b/servers/fastapi/api/routers/presentation/handlers/generate_outlines.py
index e7a30bf7..326fe9c3 100644
--- a/servers/fastapi/api/routers/presentation/handlers/generate_outlines.py
+++ b/servers/fastapi/api/routers/presentation/handlers/generate_outlines.py
@@ -1,5 +1,4 @@
import uuid
-import re
from api.models import LogMetadata
from api.routers.presentation.models import GenerateOutlinesRequest
diff --git a/servers/fastapi/api/routers/presentation/handlers/generate_presentation.py b/servers/fastapi/api/routers/presentation/handlers/generate_presentation.py
index d94692ad..3123452e 100644
--- a/servers/fastapi/api/routers/presentation/handlers/generate_presentation.py
+++ b/servers/fastapi/api/routers/presentation/handlers/generate_presentation.py
@@ -27,10 +27,7 @@ from ppt_config_generator.ppt_outlines_generator import generate_ppt_content
from ppt_generator.generator import generate_presentation
from ppt_generator.models.llm_models import (
LLM_CONTENT_TYPE_MAPPING,
- LLMPresentationModel,
)
-from langchain_core.output_parsers import JsonOutputParser
-
from ppt_generator.models.slide_model import SlideModel
diff --git a/servers/fastapi/api/routers/presentation/handlers/generate_stream.py b/servers/fastapi/api/routers/presentation/handlers/generate_stream.py
index feebfed8..6a4bb663 100644
--- a/servers/fastapi/api/routers/presentation/handlers/generate_stream.py
+++ b/servers/fastapi/api/routers/presentation/handlers/generate_stream.py
@@ -31,7 +31,6 @@ from ppt_generator.models.llm_models import (
)
from ppt_generator.models.slide_model import SlideModel
from api.services.instances import TEMP_FILE_SERVICE
-from langchain_core.output_parsers import JsonOutputParser
from ppt_generator.slide_generator import get_slide_content_from_type_and_outline
diff --git a/servers/fastapi/document_processor/loader.py b/servers/fastapi/document_processor/loader.py
index a05e9b9e..a96553f1 100644
--- a/servers/fastapi/document_processor/loader.py
+++ b/servers/fastapi/document_processor/loader.py
@@ -1,11 +1,10 @@
+import asyncio
import mimetypes
import os
from typing import List, Tuple
from fastapi import HTTPException
-from langchain_community.document_loaders import TextLoader, PDFPlumberLoader
-from langchain_core.documents import Document
-from langchain_text_splitters import CharacterTextSplitter, MarkdownTextSplitter
from pptx import Presentation
+import pdfplumber
from docx import Document as DocxDocument
from image_processor.utils import get_page_images_from_pdf_async
@@ -30,23 +29,13 @@ class DocumentsLoader:
def __init__(self, documents: List[str]):
self._document_paths = documents
- self._documents: List[Document] = []
- self._splitted_documents: List[Document] = []
+ self._documents: List[str] = []
self._images: List[List[str]] = []
- self._markdown_splitter = MarkdownTextSplitter(chunk_size=500, chunk_overlap=50)
- self._text_splitter = CharacterTextSplitter(
- separator="/n", chunk_size=500, chunk_overlap=50
- )
-
@property
def documents(self):
return self._documents
- @property
- def splitted_documents(self):
- return self._splitted_documents
-
@property
def images(self):
return self._images
@@ -54,90 +43,69 @@ class DocumentsLoader:
async def load_documents(
self,
temp_dir: str,
- split_documents: bool = False,
- load_markdown: bool = True,
+ load_text: bool = True,
load_images: bool = False,
):
- documents: List[Document] = []
+ documents: List[str] = []
images: List[str] = []
- splitted_documents: List[Document] = []
for file_path in self._document_paths:
if not os.path.exists(file_path):
raise HTTPException(
status_code=404, detail=f"File {file_path} not found"
)
- docs = []
+ document = ""
imgs = []
mime_type = mimetypes.guess_type(file_path)[0]
if mime_type in PDF_MIME_TYPES:
- docs, imgs = await self.load_pdf(
- file_path, load_markdown, load_images, temp_dir
+ document, imgs = await self.load_pdf(
+ file_path, load_text, load_images, temp_dir
)
elif mime_type in TEXT_MIME_TYPES:
- docs = self.load_text(file_path)
+ document = await self.load_text(file_path)
elif mime_type in POWERPOINT_TYPES:
- docs = self.load_powerpoint(file_path)
+ document = self.load_powerpoint(file_path)
elif mime_type in WORD_TYPES:
- docs = self.load_msword(file_path)
+ document = self.load_msword(file_path)
- documents.extend(docs)
+ documents.append(document)
images.append(imgs)
- if split_documents:
- splitted_documents.extend(self.split_documents(docs, mime_type))
-
self._documents = documents
- self._splitted_documents = splitted_documents
self._images = images
- def split_documents(self, documents: List[Document], mime_type):
- return self._text_splitter.split_documents(documents)
-
- def clip_longer_documents(self, documents: List[Document], clip_after: int = 1200):
- for document in documents:
- document.page_content = document.page_content[:clip_after]
- return documents
-
async def load_pdf(
self,
file_path: str,
- load_markdown: bool,
+ load_text: bool,
load_images: bool,
temp_dir: str,
- ) -> Tuple[List[Document], List[str]]:
+ ) -> Tuple[str, List[str]]:
image_paths = []
- documents: List[Document] = []
+ document: str = ""
- if load_markdown:
- loader = PDFPlumberLoader(file_path)
- docs = loader.load()
- pdf_document = Document(page_content="")
- pdf_document.metadata = docs[0].metadata
- for doc in docs:
- pdf_document.page_content += doc.page_content
- documents.append(pdf_document)
+ if load_text:
+ with pdfplumber.open(file_path) as pdf:
+ for page in pdf.pages:
+ document += (await asyncio.to_thread(page.extract_text)) or ""  # guard: a page without extractable text may yield None (older pdfplumber); str += None would raise TypeError
if load_images:
image_paths = await get_page_images_from_pdf_async(file_path, temp_dir)
- return documents, image_paths
+ return document, image_paths
- async def decompose_pdf_to_markdown(self, document_path: str) -> str:
- raise Exception("Not Implemented")
+ async def load_text(self, file_path: str) -> str:
+ with open(file_path, "r") as file:
+ return await asyncio.to_thread(file.read)
- def load_text(self, file_path: str) -> List[Document]:
- loader = TextLoader(file_path)
- return loader.load()
-
- def load_msword(self, file_path: str) -> List[Document]:
+ def load_msword(self, file_path: str) -> str:
document = DocxDocument(file_path)
text = "\n".join([paragraph.text for paragraph in document.paragraphs])
- return [Document(page_content=text)]
+ return text
- def load_powerpoint(self, file_path: str) -> List[Document]:
+ def load_powerpoint(self, file_path: str) -> str:
presentation = Presentation(file_path)
extracted_text = ""
@@ -149,4 +117,4 @@ class DocumentsLoader:
extracted_text += f"{paragraph.text}\n"
extracted_text += "\n"
extracted_text += "\n\n"
- return [Document(page_content=extracted_text)]
+ return extracted_text
diff --git a/servers/fastapi/image_processor/icons_vectorstore_utils.py b/servers/fastapi/image_processor/icons_vectorstore_utils.py
index 6315d752..aca492bf 100644
--- a/servers/fastapi/image_processor/icons_vectorstore_utils.py
+++ b/servers/fastapi/image_processor/icons_vectorstore_utils.py
@@ -1,37 +1,34 @@
import json
import os
-from langchain_core.vectorstores import InMemoryVectorStore
+from fastembed import TextEmbedding
-from langchain_core.documents import Document
from api.utils.utils import get_resource
-from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
-
-# Pyinstaller
-import fastembed
def get_icons_vectorstore():
vector_store_path = get_resource("assets/icons_vectorstore.json")
- embeddings = FastEmbedEmbeddings()
+ embedding_model = TextEmbedding()
- if os.path.exists(vector_store_path):
- vector_store = InMemoryVectorStore.load(vector_store_path, embeddings)
- return vector_store
-
- vector_store = InMemoryVectorStore(embeddings)
-
- vector_store.dump(vector_store_path)
+ # NOTE(review): the load-from-cache shortcut is disabled, so every call
+ # re-embeds the bold icon names and rewrites vector_store_path below.
+ # TODO: reinstate a cached path once the on-disk JSON format is settled.
with open(get_resource("assets/icons.json"), "r") as f:
icons = json.load(f)
icon_names = [icon["name"] for icon in icons["icons"]]
- documents = []
+ bold_icon_names = []
for each in icon_names:
if each.split("-")[-1] == "bold":
- documents.append(Document(id=each, page_content=each))
+ bold_icon_names.append(each)
- vector_store.add_documents(documents)
- vector_store.dump(vector_store_path)
- return vector_store
+ documents_and_embeddings = {
+ "documents": bold_icon_names,
+ "embeddings": [vector.tolist() for vector in embedding_model.embed(bold_icon_names)],  # embed() yields numpy arrays lazily; json.dump needs plain lists
+ }
+
+ with open(vector_store_path, "w") as f:
+ json.dump(documents_and_embeddings, f)
+
+ return documents_and_embeddings
diff --git a/servers/fastapi/ppt_config_generator/document_summary_generator.py b/servers/fastapi/ppt_config_generator/document_summary_generator.py
index a699e72a..6d43f791 100644
--- a/servers/fastapi/ppt_config_generator/document_summary_generator.py
+++ b/servers/fastapi/ppt_config_generator/document_summary_generator.py
@@ -1,7 +1,5 @@
import asyncio
from typing import List
-from langchain_core.documents import Document
-from langchain_text_splitters import CharacterTextSplitter
from openai.types.chat.chat_completion import ChatCompletion
from api.utils.model_utils import get_llm_client, get_nano_model
@@ -23,16 +21,13 @@ Maintain as much information as possible.
"""
-async def generate_document_summary(documents: List[Document]):
+async def generate_document_summary(documents: List[str]):
client = get_llm_client()
model = get_nano_model()
- text_splitter = CharacterTextSplitter(chunk_size=200000, chunk_overlap=0)
-
coroutines = []
for document in documents:
- text = document.page_content
- truncated_text = text_splitter.split_text(text)[0]
+ truncated_text = document[:200000]
coroutine = client.chat.completions.create(
model=model,
messages=[
diff --git a/servers/fastapi/requirements.txt b/servers/fastapi/requirements.txt
index 45eb3f4d..4e78657f 100644
--- a/servers/fastapi/requirements.txt
+++ b/servers/fastapi/requirements.txt
@@ -44,14 +44,6 @@ Jinja2==3.1.6
jiter==0.9.0
jsonpatch==1.33
jsonpointer==3.0.0
-langchain==0.3.25
-langchain-community==0.3.24
-langchain-core==0.3.65
-langchain-google-genai==2.1.4
-langchain-ollama==0.3.3
-langchain-openai==0.3.16
-langchain-text-splitters==0.3.8
-langsmith==0.3.45
loguru==0.7.3
lxml==5.4.0
markdown-it-py==3.0.0
diff --git a/servers/fastapi/research_report/generator.py b/servers/fastapi/research_report/generator.py
index ffb8e520..f7e7462e 100644
--- a/servers/fastapi/research_report/generator.py
+++ b/servers/fastapi/research_report/generator.py
@@ -1,58 +1,35 @@
import os
from typing import Optional
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_google_genai import ChatGoogleGenerativeAI
-from langchain_openai import ChatOpenAI
-# search_tool = DuckDuckGoSearchRun(
-# api_wrapper=DuckDuckGoSearchAPIWrapper(max_results=50)
-# )
-prompt_template = ChatPromptTemplate.from_messages(
- [
- (
- "system",
- """
- Use provided prompt and search results to create an elaborate and up-to-date research report in mentioned language.
+def get_prompt_template():
+ return [
+ {
+ "role": "system",
+ "content": """
+ Use provided prompt and search results to create an elaborate and up-to-date research report in mentioned language.
- # Steps
- 1. Analyze the prompt and search results.
- 2. Extract topic of the report.
- 3. Generate a report in markdown format.
+ # Steps
+ 1. Analyze the prompt and search results.
+ 2. Extract topic of the report.
+ 3. Generate a report in markdown format.
- # Notes
- - If language is not mentioned, use language from prompt.
- - Format of report should be like *Research Report*.
- - Ignore formatting if mentioned in prompt.
+ # Notes
+ - If language is not mentioned, use language from prompt.
+ - Format of report should be like *Research Report*.
+ - Ignore formatting if mentioned in prompt.
""",
- ),
- (
- "human",
- """
- - Prompt: {prompt}
- - Language: {language}
- - Search Results: {search_results}
+ },
+ {
+ "role": "user",  # chat-completions role name; "human" was the LangChain alias used by the removed ChatPromptTemplate
+ "content": """
+ - Prompt: {prompt}
+ - Language: {language}
+ - Search Results: {search_results}
""",
- ),
+ },
]
-)
async def get_report(query: str, language: Optional[str]):
- model = (
- ChatOpenAI(model="gpt-4.1-nano")
- if os.getenv("LLM") == "openai"
- else ChatGoogleGenerativeAI(model="gemini-2.0-flash")
- )
- chain = prompt_template | model
-
- # search_results = await search_tool.ainvoke(query)
- # response = await chain.ainvoke(
- # {
- # "prompt": query,
- # "language": language,
- # "search_results": search_results,
- # }
- # )
- # return response.content
return "Research Report coming soon"