Merge branch 'main'

2025-08-03 16:34:57 +05:45 · 2025-08-03 16:34:57 +05:45 · 089e620482
commit 089e620482
parent f2e410639a e3779502bf
6 changed files with 3123 additions and 20 deletions
--- a/.gitignore
+++ b/.gitignore
@ -12,4 +12,5 @@ tmp
 debug
 .fastembed_cache
 my-doc.txt
-generated_models
+generated_models
+nltk
--- a/servers/fastapi/main.py
+++ b/servers/fastapi/main.py
@ -1,6 +0,0 @@
-def main():
-    print("Hello from fastapi!")
-
-
-if __name__ == "__main__":
-    main()
--- a/servers/fastapi/pyproject.toml
+++ b/servers/fastapi/pyproject.toml
@ -0,0 +1,27 @@
+[project]
+name = "presenton-backend"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.11,<3.12"
+dependencies = [
+    "aiohttp>=3.12.15",
+    "aiomysql>=0.2.0",
+    "aiosqlite>=0.21.0",
+    "anthropic>=0.60.0",
+    "asyncpg>=0.30.0",
+    "chromadb>=1.0.15",
+    "docling>=2.43.0",
+    "fastapi[standard]>=0.116.1",
+    "google-genai>=1.28.0",
+    "nltk>=3.9.1",
+    "openai>=1.98.0",
+    "pathvalidate>=3.3.1",
+    "pdfplumber>=0.11.7",
+    "python-pptx>=1.0.2",
+    "redis>=6.2.0",
+    "sqlmodel>=0.0.24",
+]
+
+[[tool.uv.index]]
+url = "https://download.pytorch.org/whl/cpu"
--- a/servers/fastapi/requirements.txt
+++ b/servers/fastapi/requirements.txt
@ -15,7 +15,7 @@ bcrypt==4.3.0
 beautifulsoup4==4.13.4
 build==1.3.0
 cachetools==5.5.2
-certifi==2025.7.14
+certifi==2025.8.3
 cffi==1.17.1
 charset-normalizer==3.4.2
 chromadb==1.0.15
@ -31,8 +31,8 @@ docling-ibm-models==3.9.0
 docling-parse==4.1.0
 durationpy==0.10
 easyocr==1.7.2
-email_validator==2.2.0
-et_xmlfile==2.0.0
+email-validator==2.2.0
+et-xmlfile==2.0.0
 fastapi==0.116.1
 fastapi-cli==0.0.8
 fastapi-cloud-cli==0.1.5
@ -55,21 +55,37 @@ huggingface-hub==0.34.3
 humanfriendly==10.0
 idna==3.10
 imageio==2.37.0
-importlib_metadata==8.7.0
-importlib_resources==6.5.2
-Jinja2==3.1.6
+importlib-metadata==8.7.0
+importlib-resources==6.5.2
+jinja2==3.1.6
 jiter==0.10.0
+joblib==1.5.1
 jsonlines==3.1.0
 jsonref==1.1.0
 jsonschema==4.25.0
 jsonschema-specifications==2025.4.1
 kubernetes==33.1.0
 latex2mathml==3.78.0
-lazy_loader==0.4
+lazy-loader==0.4
 lxml==5.4.0
 markdown-it-py==3.0.0
 marko==2.1.4
-MarkupSafe==3.0.2
+markupsafe==3.0.1
 mdurl==0.1.2
 mmh3==5.2.0
 mpire==2.10.2
@ -78,11 +94,20 @@ multidict==6.6.3
 multiprocess==0.70.18
 networkx==3.5
 ninja==1.11.1.4
+nltk==3.9.1
-numpy==2.2.6
+numpy==2.3.2
 oauthlib==3.3.1
 onnxruntime==1.22.1
 openai==1.98.0
-opencv-python-headless==4.12.0.88
+opencv-python-headless==4.11.0.86
 openpyxl==3.1.5
 opentelemetry-api==1.36.0
 opentelemetry-exporter-otlp-proto-common==1.36.0
@ -95,7 +120,7 @@ overrides==7.7.0
 packaging==25.0
 pandas==2.3.1
 pathvalidate==3.3.1
-pdfminer.six==20250506
+pdfminer-six==20250506
 pdfplumber==0.11.7
 pillow==11.3.0
 pluggy==1.6.0
@ -104,19 +129,19 @@ propcache==0.3.2
 protobuf==6.31.1
 psutil==7.0.0
 pyasn1==0.6.1
-pyasn1_modules==0.4.2
+pyasn1-modules==0.4.2
 pybase64==1.4.2
 pyclipper==1.3.0.post6
 pycparser==2.22
 pydantic==2.11.7
+pydantic-core==2.33.2
 pydantic-settings==2.10.1
-pydantic_core==2.33.2
-Pygments==2.19.2
+pygments==2.19.2
 pylatexenc==2.10
-PyMySQL==1.1.1
+pymysql==1.1.1
 pypdfium2==4.30.0
-PyPika==0.48.9
-pyproject_hooks==1.2.0
+pypika==0.48.9
+pyproject-hooks==1.2.0
 python-bidi==0.6.6
 python-dateutil==2.9.0.post0
 python-docx==1.2.0
@ -124,7 +149,7 @@ python-dotenv==1.1.1
 python-multipart==0.0.20
 python-pptx==1.0.2
 pytz==2025.2
-PyYAML==6.0.2
+pyyaml==6.0.2
 redis==6.2.0
 referencing==0.36.2
 regex==2025.7.34
@ -146,7 +171,7 @@ shellingham==1.5.4
 six==1.17.0
 sniffio==1.3.1
 soupsieve==2.7
-SQLAlchemy==2.0.42
+sqlalchemy==2.0.42
 sqlmodel==0.0.24
 starlette==0.47.2
 sympy==1.14.0
@ -160,8 +185,8 @@ torchvision==0.22.1+cpu
 tqdm==4.67.1
 transformers==4.54.1
 typer==0.16.0
+typing-extensions==4.14.1
 typing-inspection==0.4.1
-typing_extensions==4.14.1
 tzdata==2025.2
 urllib3==2.5.0
 uvicorn==0.35.0
--- a/servers/fastapi/services/score_based_chunker.py
+++ b/servers/fastapi/services/score_based_chunker.py
@ -0,0 +1,197 @@
+import asyncio
+from typing import List
+import nltk
+
+try:
+    nltk.data.find("tokenizers/punkt")
+except LookupError:
+    nltk.download("punkt", download_dir="./nltk")
+
+
+class ScoreBasedChunker:
+
+    def extract_sentences(self, text: str, min_sentences: int) -> List[str]:
+        sentences = self.extract_sentences_markdown(text)
+        if len(sentences) < min_sentences:
+            sentences = self.extract_sentences_nltk(text)
+        if len(sentences) < min_sentences:
+            sentences = self.extract_sentences_by_stop_words(text)
+        if len(sentences) < min_sentences:
+            sentences = self.extract_sentences_by_new_line(text)
+        if len(sentences) < min_sentences:
+            raise ValueError(
+                f"Only {len(sentences)} sentences found, requested {min_sentences}"
+            )
+        return sentences
+
+    def extract_sentences_markdown(self, text: str) -> List[str]:
+        lines = text.split("\n")
+        sentences = []
+
+        for line in lines:
+            line = line.strip()
+            if line:
+                if line.startswith("#"):
+                    sentences.append(line)
+                else:
+                    if line.endswith((".", "!", "?")):
+                        sentences.append(line)
+                    else:
+                        sentences.append(line)
+
+        return sentences
+
+    def extract_sentences_nltk(self, text: str) -> List[str]:
+        sentences = nltk.sent_tokenize(text)
+        return sentences
+
+    def extract_sentences_by_stop_words(self, text: str) -> List[str]:
+        sentences = []
+        current_sentence = ""
+
+        for char in text:
+            current_sentence += char
+            if char in ".!?":
+                sentences.append(current_sentence.strip())
+                current_sentence = ""
+
+        if current_sentence.strip():
+            sentences.append(current_sentence.strip())
+
+        return [s for s in sentences if s]
+
+    def extract_sentences_by_new_line(self, text: str) -> List[str]:
+        sentences = text.split("\n")
+        result = []
+        for i, sentence in enumerate(sentences):
+            if i < len(sentences) - 1:
+                result.append(sentence + "\n")
+            else:
+                result.append(sentence)
+        return result
+
+    def score_sentences_for_heading(self, sentences: List[str]) -> List[float]:
+        sentences_scores = []
+
+        last_heading_index = -1
+        first_heading_found = False
+
+        for i, sentence in enumerate(sentences):
+            score = 0.0
+
+            if sentence.strip().startswith("#"):
+                heading_level = len(sentence) - len(sentence.lstrip("#"))
+
+                if heading_level <= 3:
+                    score += 10.0 - (heading_level - 1) * 2.0
+                else:
+                    score += 4.0 - (heading_level - 4) * 0.5
+
+                if not first_heading_found:
+                    score += 5.0
+                    first_heading_found = True
+
+                if last_heading_index != -1:
+                    distance = i - last_heading_index
+                    distance_bonus = min(5.0, distance * 0.5)
+                    score += distance_bonus
+
+                last_heading_index = i
+
+            sentences_scores.append(score)
+
+        return sentences_scores
+
+    def get_chunks(
+        self, sentences: List[str], sentences_scores: List[float], top_k: int = 10
+    ) -> List[dict]:
+        if not sentences_scores:
+            sentences_scores = self.score_sentences_for_heading(sentences)
+
+        chunks = []
+        heading_scores = []
+
+        for i, score in enumerate(sentences_scores):
+            if score > 0:
+                heading_scores.append((i, score))
+
+        if len(heading_scores) == 0:
+            return chunks
+
+        heading_scores.sort(key=lambda x: (-x[1], x[0]))
+
+        if len(heading_scores) <= top_k:
+            selected_headings = [idx for idx, _ in heading_scores]
+            selected_headings.sort()
+        else:
+            score_groups = {}
+            for idx, score in heading_scores:
+                rounded_score = round(score)
+                if rounded_score not in score_groups:
+                    score_groups[rounded_score] = []
+                score_groups[rounded_score].append(idx)
+
+            sorted_groups = sorted(
+                score_groups.items(), key=lambda x: x[0], reverse=True
+            )
+
+            selected_headings = []
+
+            for score, headings in sorted_groups:
+                headings.sort()
+                remaining_needed = top_k - len(selected_headings)
+
+                if remaining_needed <= 0:
+                    break
+
+                if len(headings) <= remaining_needed:
+                    selected_headings.extend(headings)
+                else:
+                    if remaining_needed == 1:
+                        mid_idx = len(headings) // 2
+                        selected_headings.append(headings[mid_idx])
+                    elif remaining_needed == 2:
+                        selected_headings.append(headings[0])
+                        selected_headings.append(headings[-1])
+                    else:
+                        step = (len(headings) - 1) / (remaining_needed - 1)
+
+                        for i in range(remaining_needed):
+                            index = int(round(i * step))
+                            if index < len(headings):
+                                selected_headings.append(headings[index])
+
+            selected_headings.sort()
+
+        for i, heading_idx in enumerate(selected_headings):
+            heading = sentences[heading_idx]
+
+            if i + 1 < len(selected_headings):
+                next_heading_idx = selected_headings[i + 1]
+                content_end = next_heading_idx
+            else:
+                content_end = len(sentences)
+
+            content_sentences = sentences[heading_idx + 1 : content_end]
+            content = " ".join(content_sentences).strip()
+
+            chunk = {
+                "heading": heading,
+                "content": content,
+                "heading_index": heading_idx,
+                "score": sentences_scores[heading_idx],
+            }
+            chunks.append(chunk)
+        return chunks
+
+    async def get_n_chunks(self, text: str, n: int) -> List[dict]:
+        sentences = await asyncio.to_thread(self.extract_sentences, text, n)
+        sentences_scores = await asyncio.to_thread(
+            self.score_sentences_for_heading, sentences
+        )
+        chunks = await asyncio.to_thread(
+            self.get_chunks, sentences, sentences_scores, n
+        )
+        if len(chunks) < n:
+            raise ValueError(f"Only {len(chunks)} chunks found, requested {n}")
+        return chunks
--- a/servers/fastapi/uv.lock
+++ b/servers/fastapi/uv.lock