- Step 10: Extended file upload for Excel/CSV/images/URLs (openpyxl, trafilatura) - Step 11: Content intelligence service with rule-based + LLM classification - Step 12: Slide mapping engine mapping content blocks to master deck layouts - Step 13: Chart data extractor, native PPTX chart service (bar/line/pie/gantt/waterfall), ChartDataEditor skeleton Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
183 lines
4.8 KiB
Python
183 lines
4.8 KiB
Python
"""Service for parsing non-document attachments: Excel, CSV, images, URLs."""
|
|
import csv
|
|
import io
|
|
import mimetypes
|
|
import os
|
|
from typing import Any, List, Optional
|
|
|
|
from pydantic import BaseModel
|
|
|
|
|
|
class TableData(BaseModel):
    """A single table extracted from a spreadsheet sheet or a CSV file."""

    # Optional display title. parse_excel sets it to the sheet name only when
    # the workbook has more than one sheet; parse_csv sets it to the file's
    # base name without extension.
    title: Optional[str] = None
    # Column labels taken from the header row of the source.
    headers: List[str]
    # Data rows following the header; each row is a list of cell values.
    rows: List[List[Any]]
    # Name of the originating worksheet (set by parse_excel, unset for CSV).
    sheet_name: Optional[str] = None
|
|
|
|
|
|
class ImageInfo(BaseModel):
    """Metadata describing an image file, as built by extract_images_metadata."""

    # Path of the image exactly as supplied by the caller.
    file_path: str
    # Base name of file_path.
    filename: str
    # MIME type guessed from the file name; "application/octet-stream" when
    # no guess is available.
    mime_type: str
    # Image width read from the file header; None when it could not be read.
    width: Optional[int] = None
    # Image height read from the file header; None when it could not be read.
    height: Optional[int] = None
|
|
|
|
|
|
def parse_excel(file_path: str) -> List[TableData]:
    """Parse an Excel workbook and return one TableData per non-empty sheet.

    The first row containing at least one value is used as the header row;
    every later row containing at least one value becomes a data row. Sheets
    with no values at all are skipped. ``title`` is only populated when the
    workbook has more than one sheet.

    NOTE(review): the workbook is read with openpyxl, which targets the
    OOXML formats (.xlsx/.xlsm); legacy .xls files are expected to fail
    here -- confirm how callers route them.

    Args:
        file_path: Path to the workbook on disk.

    Returns:
        Tables in workbook sheet order.

    Raises:
        Whatever ``openpyxl.load_workbook`` raises for unreadable files.
    """
    from openpyxl import load_workbook

    def _has_value(row) -> bool:
        # A row is "empty" when every cell is None (also true for zero cells).
        return any(cell is not None for cell in row)

    wb = load_workbook(file_path, read_only=True, data_only=True)
    results: List[TableData] = []
    try:
        sheet_names = wb.sheetnames
        multi_sheet = len(sheet_names) > 1

        for sheet_name in sheet_names:
            # Stream rows instead of materialising the whole sheet in memory.
            rows_iter = wb[sheet_name].iter_rows(values_only=True)

            # First non-empty row is treated as headers; leading blank rows
            # are skipped rather than producing an all-empty header.
            header_row = next(
                (row for row in rows_iter if _has_value(row)), None
            )
            if header_row is None:
                continue

            headers = [str(c) if c is not None else "" for c in header_row]
            data_rows = [
                [_serialize_cell(c) for c in row]
                for row in rows_iter
                if _has_value(row)
            ]

            if not data_rows and not any(headers):
                continue

            results.append(
                TableData(
                    title=sheet_name if multi_sheet else None,
                    headers=headers,
                    rows=data_rows,
                    sheet_name=sheet_name,
                )
            )
    finally:
        # Read-only workbooks keep the underlying file open until closed, so
        # release it even when a sheet fails to parse.
        wb.close()

    return results
|
|
|
|
|
|
def parse_csv(file_path: str) -> TableData:
    """Parse a CSV file and return a single TableData.

    The delimiter is sniffed from the first 4096 characters (candidates:
    comma, semicolon, tab, pipe), falling back to the standard Excel dialect
    when sniffing fails. The first row is used as the header; later rows
    whose cells are all blank are dropped.

    Args:
        file_path: Path to a UTF-8 (optionally BOM-prefixed) CSV file.

    Returns:
        A TableData titled with the file's base name (without extension), or
        an untitled empty TableData when the file contains no rows.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # newline="" hands raw line endings to the csv module, which is required
    # for quoted fields that contain embedded line breaks to parse correctly.
    with open(file_path, "r", encoding="utf-8-sig", newline="") as f:
        # Sniff delimiter
        sample = f.read(4096)
        f.seek(0)
        try:
            dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|")
        except csv.Error:
            dialect = csv.excel

        all_rows = list(csv.reader(f, dialect))

    if not all_rows:
        return TableData(headers=[], rows=[])

    headers = all_rows[0]
    data_rows = [
        [_serialize_cell(c) for c in row]
        for row in all_rows[1:]
        if any(c.strip() for c in row)
    ]

    return TableData(
        title=os.path.splitext(os.path.basename(file_path))[0],
        headers=headers,
        rows=data_rows,
    )
|
|
|
|
|
|
def extract_images_metadata(file_path: str) -> ImageInfo:
    """Build an ImageInfo (name, MIME type, dimensions) for an image file.

    The MIME type is guessed from the file name. Dimensions are read on a
    best-effort basis from the file header; if that fails for any reason
    both width and height are left as None.
    """
    guessed_type = mimetypes.guess_type(file_path)[0]

    dimensions = {"width": None, "height": None}
    try:
        # Lightweight header parsing keeps PIL out of the dependency list.
        dimensions["width"], dimensions["height"] = _read_image_dimensions(
            file_path
        )
    except Exception:
        # Dimensions are optional metadata; never fail the upload over them.
        dimensions = {"width": None, "height": None}

    return ImageInfo(
        file_path=file_path,
        filename=os.path.basename(file_path),
        mime_type=guessed_type or "application/octet-stream",
        **dimensions,
    )
|
|
|
|
|
|
async def parse_url(url: str) -> str:
    """Fetch a URL and extract its main article content as plain text.

    Downloading and extraction are blocking trafilatura calls, so they run
    in the event loop's default executor to avoid stalling other coroutines.

    NOTE(review): the URL is fetched as given; if it can come from untrusted
    users, confirm that internal/loopback targets are rejected upstream.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text (tables included; links and images omitted), or
        an empty string when the download or the extraction yields nothing.
    """
    import asyncio

    import trafilatura

    def _fetch_and_extract() -> str:
        downloaded = trafilatura.fetch_url(url)
        if not downloaded:
            return ""

        text = trafilatura.extract(
            downloaded,
            output_format="txt",
            include_tables=True,
            include_links=False,
            include_images=False,
        )
        return text or ""

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _fetch_and_extract)
|
|
|
|
|
|
# --- Helpers ---
|
|
|
|
|
|
def _serialize_cell(value: Any) -> Any:
    """Return *value* in a JSON-friendly form.

    None and plain numbers/booleans pass through untouched; anything else
    (dates, decimals, strings, ...) is converted with ``str``.
    """
    passes_through = value is None or isinstance(value, (bool, int, float))
    return value if passes_through else str(value)
|
|
|
|
|
|
def _read_image_dimensions(file_path: str) -> tuple:
    """Read image dimensions from the file header (PNG/JPEG/GIF/WEBP).

    Only the leading bytes of the file are inspected (plus a marker scan for
    JPEG); the image data itself is never decoded.

    Args:
        file_path: Path to the image file.

    Returns:
        ``(width, height)`` in pixels, or ``(None, None)`` when the format is
        not recognised or the header is truncated/malformed.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(file_path, "rb") as f:
        header = f.read(32)

        # PNG: 8-byte signature, then the IHDR chunk whose payload starts
        # with big-endian width and height at offsets 16 and 20.
        if header[:8] == b"\x89PNG\r\n\x1a\n":
            if len(header) < 24 or header[12:16] != b"IHDR":
                return None, None
            return (
                int.from_bytes(header[16:20], "big"),
                int.from_bytes(header[20:24], "big"),
            )

        # JPEG: dimensions live in a start-of-frame segment somewhere after
        # the SOI marker, so the segments have to be walked.
        if header[:2] == b"\xff\xd8":
            return _read_jpeg_dimensions(f)

        # GIF: little-endian logical screen width/height after the signature.
        if header[:6] in (b"GIF87a", b"GIF89a"):
            if len(header) < 10:
                return None, None
            return (
                int.from_bytes(header[6:8], "little"),
                int.from_bytes(header[8:10], "little"),
            )

        # WEBP: RIFF container whose first chunk identifies the bitstream.
        if header[:4] == b"RIFF" and header[8:12] == b"WEBP":
            return _read_webp_dimensions(header)

    return None, None


# Start-of-frame markers carry the frame size. 0xC4 (DHT), 0xC8 (JPG) and
# 0xCC (DAC) share the 0xCn range but are not frame headers.
_JPEG_SOF_MARKERS = frozenset(range(0xC0, 0xD0)) - {0xC4, 0xC8, 0xCC}


def _read_jpeg_dimensions(stream) -> tuple:
    """Walk JPEG segments on an open binary stream until a frame header."""
    stream.seek(2)  # skip the SOI marker
    while True:
        marker = stream.read(2)
        if len(marker) < 2 or marker[0] != 0xFF:
            return None, None

        kind = marker[1]
        # Any number of 0xFF fill bytes may precede the marker code.
        while kind == 0xFF:
            fill = stream.read(1)
            if not fill:
                return None, None
            kind = fill[0]

        if kind in _JPEG_SOF_MARKERS:
            # length (2) + sample precision (1) + height (2) + width (2)
            frame = stream.read(7)
            if len(frame) < 7:
                return None, None
            height = int.from_bytes(frame[3:5], "big")
            width = int.from_bytes(frame[5:7], "big")
            return width, height

        if kind == 0x01 or 0xD0 <= kind <= 0xD7:
            # Stand-alone markers (TEM, RSTn) have no length field.
            continue
        if kind in (0xD9, 0xDA):
            # End of image / start of scan: no frame header will follow.
            return None, None

        raw_length = stream.read(2)
        if len(raw_length) < 2:
            # Truncated file; without this check a zero length would seek
            # back onto the same marker and loop forever.
            return None, None
        length = int.from_bytes(raw_length, "big")
        if length < 2:
            return None, None
        stream.seek(length - 2, 1)


def _read_webp_dimensions(header: bytes) -> tuple:
    """Decode width/height from the first chunk of a RIFF/WEBP header."""
    fourcc = header[12:16]

    # Extended format: 24-bit little-endian (canvas size - 1) fields.
    if fourcc == b"VP8X" and len(header) >= 30:
        width = int.from_bytes(header[24:27], "little") + 1
        height = int.from_bytes(header[27:30], "little") + 1
        return width, height

    # Lossless: signature byte 0x2F, then 14-bit (width - 1), (height - 1).
    if fourcc == b"VP8L" and len(header) >= 25 and header[20] == 0x2F:
        bits = int.from_bytes(header[21:25], "little")
        return (bits & 0x3FFF) + 1, ((bits >> 14) & 0x3FFF) + 1

    # Lossy: 3-byte frame tag, start code 9D 01 2A, then 14-bit sizes
    # (the top two bits of each 16-bit field are scaling flags).
    if (
        fourcc == b"VP8 "
        and len(header) >= 30
        and header[23:26] == b"\x9d\x01\x2a"
    ):
        width = int.from_bytes(header[26:28], "little") & 0x3FFF
        height = int.from_bytes(header[28:30], "little") & 0x3FFF
        return width, height

    return None, None
|