Merge branch 'main'

This commit is contained in:
sauravniraula 2025-08-03 16:34:57 +05:45
commit 089e620482
No known key found for this signature in database
GPG key ID: 60FCC1B5A5E83326
6 changed files with 3123 additions and 20 deletions

3
.gitignore vendored
View file

@ -12,4 +12,5 @@ tmp
debug
.fastembed_cache
my-doc.txt
generated_models
generated_models
nltk

View file

@ -1,6 +0,0 @@
def main():
print("Hello from fastapi!")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,27 @@
[project]
name = "presenton-backend"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.11,<3.12"
dependencies = [
"aiohttp>=3.12.15",
"aiomysql>=0.2.0",
"aiosqlite>=0.21.0",
"anthropic>=0.60.0",
"asyncpg>=0.30.0",
"chromadb>=1.0.15",
"docling>=2.43.0",
"fastapi[standard]>=0.116.1",
"google-genai>=1.28.0",
"nltk>=3.9.1",
"openai>=1.98.0",
"pathvalidate>=3.3.1",
"pdfplumber>=0.11.7",
"python-pptx>=1.0.2",
"redis>=6.2.0",
"sqlmodel>=0.0.24",
]
[[tool.uv.index]]
url = "https://download.pytorch.org/whl/cpu"

View file

@ -15,7 +15,7 @@ bcrypt==4.3.0
beautifulsoup4==4.13.4
build==1.3.0
cachetools==5.5.2
certifi==2025.7.14
certifi==2025.8.3
cffi==1.17.1
charset-normalizer==3.4.2
chromadb==1.0.15
@ -31,8 +31,8 @@ docling-ibm-models==3.9.0
docling-parse==4.1.0
durationpy==0.10
easyocr==1.7.2
email_validator==2.2.0
et_xmlfile==2.0.0
email-validator==2.2.0
et-xmlfile==2.0.0
fastapi==0.116.1
fastapi-cli==0.0.8
fastapi-cloud-cli==0.1.5
@ -55,21 +55,37 @@ huggingface-hub==0.34.3
humanfriendly==10.0
idna==3.10
imageio==2.37.0
<<<<<<< HEAD
importlib_metadata==8.7.0
importlib_resources==6.5.2
Jinja2==3.1.6
jiter==0.10.0
=======
importlib-metadata==8.7.0
importlib-resources==6.5.2
jinja2==3.1.6
jiter==0.10.0
joblib==1.5.1
>>>>>>> main
jsonlines==3.1.0
jsonref==1.1.0
jsonschema==4.25.0
jsonschema-specifications==2025.4.1
kubernetes==33.1.0
latex2mathml==3.78.0
<<<<<<< HEAD
lazy_loader==0.4
lxml==5.4.0
markdown-it-py==3.0.0
marko==2.1.4
MarkupSafe==3.0.2
=======
lazy-loader==0.4
lxml==5.4.0
markdown-it-py==3.0.0
marko==2.1.4
markupsafe==3.0.1
>>>>>>> main
mdurl==0.1.2
mmh3==5.2.0
mpire==2.10.2
@ -78,11 +94,20 @@ multidict==6.6.3
multiprocess==0.70.18
networkx==3.5
ninja==1.11.1.4
<<<<<<< HEAD
numpy==2.2.6
oauthlib==3.3.1
onnxruntime==1.22.1
openai==1.98.0
opencv-python-headless==4.12.0.88
=======
nltk==3.9.1
numpy==2.3.2
oauthlib==3.3.1
onnxruntime==1.22.1
openai==1.98.0
opencv-python-headless==4.11.0.86
>>>>>>> main
openpyxl==3.1.5
opentelemetry-api==1.36.0
opentelemetry-exporter-otlp-proto-common==1.36.0
@ -95,7 +120,7 @@ overrides==7.7.0
packaging==25.0
pandas==2.3.1
pathvalidate==3.3.1
pdfminer.six==20250506
pdfminer-six==20250506
pdfplumber==0.11.7
pillow==11.3.0
pluggy==1.6.0
@ -104,19 +129,19 @@ propcache==0.3.2
protobuf==6.31.1
psutil==7.0.0
pyasn1==0.6.1
pyasn1_modules==0.4.2
pyasn1-modules==0.4.2
pybase64==1.4.2
pyclipper==1.3.0.post6
pycparser==2.22
pydantic==2.11.7
pydantic-core==2.33.2
pydantic-settings==2.10.1
pydantic_core==2.33.2
Pygments==2.19.2
pygments==2.19.2
pylatexenc==2.10
PyMySQL==1.1.1
pymysql==1.1.1
pypdfium2==4.30.0
PyPika==0.48.9
pyproject_hooks==1.2.0
pypika==0.48.9
pyproject-hooks==1.2.0
python-bidi==0.6.6
python-dateutil==2.9.0.post0
python-docx==1.2.0
@ -124,7 +149,7 @@ python-dotenv==1.1.1
python-multipart==0.0.20
python-pptx==1.0.2
pytz==2025.2
PyYAML==6.0.2
pyyaml==6.0.2
redis==6.2.0
referencing==0.36.2
regex==2025.7.34
@ -146,7 +171,7 @@ shellingham==1.5.4
six==1.17.0
sniffio==1.3.1
soupsieve==2.7
SQLAlchemy==2.0.42
sqlalchemy==2.0.42
sqlmodel==0.0.24
starlette==0.47.2
sympy==1.14.0
@ -160,8 +185,8 @@ torchvision==0.22.1+cpu
tqdm==4.67.1
transformers==4.54.1
typer==0.16.0
typing-extensions==4.14.1
typing-inspection==0.4.1
typing_extensions==4.14.1
tzdata==2025.2
urllib3==2.5.0
uvicorn==0.35.0

View file

@ -0,0 +1,197 @@
import asyncio
from typing import List
import nltk
try:
nltk.data.find("tokenizers/punkt")
except LookupError:
nltk.download("punkt", download_dir="./nltk")
class ScoreBasedChunker:
def extract_sentences(self, text: str, min_sentences: int) -> List[str]:
sentences = self.extract_sentences_markdown(text)
if len(sentences) < min_sentences:
sentences = self.extract_sentences_nltk(text)
if len(sentences) < min_sentences:
sentences = self.extract_sentences_by_stop_words(text)
if len(sentences) < min_sentences:
sentences = self.extract_sentences_by_new_line(text)
if len(sentences) < min_sentences:
raise ValueError(
f"Only {len(sentences)} sentences found, requested {min_sentences}"
)
return sentences
def extract_sentences_markdown(self, text: str) -> List[str]:
lines = text.split("\n")
sentences = []
for line in lines:
line = line.strip()
if line:
if line.startswith("#"):
sentences.append(line)
else:
if line.endswith((".", "!", "?")):
sentences.append(line)
else:
sentences.append(line)
return sentences
def extract_sentences_nltk(self, text: str) -> List[str]:
sentences = nltk.sent_tokenize(text)
return sentences
def extract_sentences_by_stop_words(self, text: str) -> List[str]:
sentences = []
current_sentence = ""
for char in text:
current_sentence += char
if char in ".!?":
sentences.append(current_sentence.strip())
current_sentence = ""
if current_sentence.strip():
sentences.append(current_sentence.strip())
return [s for s in sentences if s]
def extract_sentences_by_new_line(self, text: str) -> List[str]:
sentences = text.split("\n")
result = []
for i, sentence in enumerate(sentences):
if i < len(sentences) - 1:
result.append(sentence + "\n")
else:
result.append(sentence)
return result
def score_sentences_for_heading(self, sentences: List[str]) -> List[float]:
sentences_scores = []
last_heading_index = -1
first_heading_found = False
for i, sentence in enumerate(sentences):
score = 0.0
if sentence.strip().startswith("#"):
heading_level = len(sentence) - len(sentence.lstrip("#"))
if heading_level <= 3:
score += 10.0 - (heading_level - 1) * 2.0
else:
score += 4.0 - (heading_level - 4) * 0.5
if not first_heading_found:
score += 5.0
first_heading_found = True
if last_heading_index != -1:
distance = i - last_heading_index
distance_bonus = min(5.0, distance * 0.5)
score += distance_bonus
last_heading_index = i
sentences_scores.append(score)
return sentences_scores
def get_chunks(
self, sentences: List[str], sentences_scores: List[float], top_k: int = 10
) -> List[dict]:
if not sentences_scores:
sentences_scores = self.score_sentences_for_heading(sentences)
chunks = []
heading_scores = []
for i, score in enumerate(sentences_scores):
if score > 0:
heading_scores.append((i, score))
if len(heading_scores) == 0:
return chunks
heading_scores.sort(key=lambda x: (-x[1], x[0]))
if len(heading_scores) <= top_k:
selected_headings = [idx for idx, _ in heading_scores]
selected_headings.sort()
else:
score_groups = {}
for idx, score in heading_scores:
rounded_score = round(score)
if rounded_score not in score_groups:
score_groups[rounded_score] = []
score_groups[rounded_score].append(idx)
sorted_groups = sorted(
score_groups.items(), key=lambda x: x[0], reverse=True
)
selected_headings = []
for score, headings in sorted_groups:
headings.sort()
remaining_needed = top_k - len(selected_headings)
if remaining_needed <= 0:
break
if len(headings) <= remaining_needed:
selected_headings.extend(headings)
else:
if remaining_needed == 1:
mid_idx = len(headings) // 2
selected_headings.append(headings[mid_idx])
elif remaining_needed == 2:
selected_headings.append(headings[0])
selected_headings.append(headings[-1])
else:
step = (len(headings) - 1) / (remaining_needed - 1)
for i in range(remaining_needed):
index = int(round(i * step))
if index < len(headings):
selected_headings.append(headings[index])
selected_headings.sort()
for i, heading_idx in enumerate(selected_headings):
heading = sentences[heading_idx]
if i + 1 < len(selected_headings):
next_heading_idx = selected_headings[i + 1]
content_end = next_heading_idx
else:
content_end = len(sentences)
content_sentences = sentences[heading_idx + 1 : content_end]
content = " ".join(content_sentences).strip()
chunk = {
"heading": heading,
"content": content,
"heading_index": heading_idx,
"score": sentences_scores[heading_idx],
}
chunks.append(chunk)
return chunks
async def get_n_chunks(self, text: str, n: int) -> List[dict]:
sentences = await asyncio.to_thread(self.extract_sentences, text, n)
sentences_scores = await asyncio.to_thread(
self.score_sentences_for_heading, sentences
)
chunks = await asyncio.to_thread(
self.get_chunks, sentences, sentences_scores, n
)
if len(chunks) < n:
raise ValueError(f"Only {len(chunks)} chunks found, requested {n}")
return chunks

2859
servers/fastapi/uv.lock generated Normal file

File diff suppressed because it is too large Load diff