Merge pull request #209 from presenton/fix/presentation-export-markdown-issue

fix(fastapi): solves issue on PPTX export where markdown content was not correctly parsed
This commit is contained in:
Saurav Niraula 2025-08-12 13:12:35 +05:45 committed by GitHub
commit 01d39d71be
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 107 additions and 75 deletions

View file

@ -57,6 +57,8 @@ class PptxFontModel(BaseModel):
italic: bool = False
color: str = "000000"
font_weight: Optional[int] = 400
underline: Optional[bool] = None
strike: Optional[bool] = None
class PptxFillModel(BaseModel):

View file

@ -0,0 +1,65 @@
from html.parser import HTMLParser
from typing import List, Optional
from models.pptx_models import PptxFontModel, PptxTextRunModel
class InlineHTMLToRunsParser(HTMLParser):
    """Turn inline HTML markup into a flat list of ``PptxTextRunModel`` runs.

    Every chunk of character data becomes one run whose font is the base
    font adjusted for the formatting tags that are open at that point.
    ``<br>`` becomes a run containing a single newline.
    """

    def __init__(self, base_font: PptxFontModel):
        # convert_charrefs=True makes the parser hand already-unescaped text
        # (e.g. "&amp;" -> "&") to handle_data.
        super().__init__(convert_charrefs=True)
        self.base_font = base_font
        # Names of the tags that are currently open, outermost first.
        self.tag_stack: List[str] = []
        # Runs collected so far, in document order.
        self.text_runs: List[PptxTextRunModel] = []

    def _current_font(self) -> PptxFontModel:
        """Return a copy of the base font styled by the currently open tags."""
        open_tags = set(self.tag_stack)
        font_json = self.base_font.model_dump()

        if not open_tags.isdisjoint(("strong", "b")):
            font_json["font_weight"] = 700
        if not open_tags.isdisjoint(("em", "i")):
            font_json["italic"] = True
        if "u" in open_tags:
            font_json["underline"] = True
        if not open_tags.isdisjoint(("s", "strike", "del")):
            font_json["strike"] = True
        if "code" in open_tags:
            font_json["name"] = "Courier New"

        return PptxFontModel(**font_json)

    def handle_starttag(self, tag, attrs):
        """Record an opening tag; ``<br>`` emits a newline run instead."""
        name = tag.lower()
        if name != "br":
            self.tag_stack.append(name)
            return
        # A line break is emitted as its own run and is never pushed on the
        # stack, since it has no closing tag.
        self.text_runs.append(PptxTextRunModel(text="\n"))

    def handle_endtag(self, tag):
        """Close the innermost open tag with this name, if there is one."""
        name = tag.lower()
        if name not in self.tag_stack:
            return
        # Position of the most recently opened matching tag.
        innermost = len(self.tag_stack) - 1 - self.tag_stack[::-1].index(name)
        del self.tag_stack[innermost]

    def handle_data(self, data):
        """Append non-empty character data as a run with the current font."""
        if data != "":
            run = PptxTextRunModel(text=data, font=self._current_font())
            self.text_runs.append(run)
def parse_html_text_to_text_runs(
    text: str, base_font: Optional[PptxFontModel] = None
) -> List[PptxTextRunModel]:
    """Convert text with inline HTML formatting into PPTX text runs.

    Line endings (``\\r\\n``, ``\\r``, ``\\n``) are rewritten to ``<br>`` so
    that the parser emits them as newline runs.

    Args:
        text: Plain text or inline HTML to convert.
        base_font: Font applied to every run before tag-specific overrides;
            a default ``PptxFontModel()`` is used when omitted.

    Returns:
        The runs produced by ``InlineHTMLToRunsParser``, in document order.
    """
    normalized_text = text.replace("\r\n", "\n").replace("\r", "\n")
    normalized_text = normalized_text.replace("\n", "<br>")

    parser = InlineHTMLToRunsParser(
        base_font if base_font is not None else PptxFontModel()
    )
    parser.feed(normalized_text)
    # feed() alone may hold back trailing text: with convert_charrefs=True the
    # parser waits for more input when the tail contains an "&" that is not
    # followed by whitespace or ";" (e.g. "AT&T"). close() forces that
    # buffered text through handle_data so it is not silently dropped.
    parser.close()
    return parser.text_runs

View file

@ -1,6 +1,9 @@
import os
from typing import List, Optional
from lxml import etree
from services.html_to_text_runs_service import (
parse_html_text_to_text_runs as parse_inline_html_to_runs,
)
from pptx import Presentation
from pptx.shapes.autoshape import Shape
@ -276,7 +279,7 @@ class PptxPresentationCreator:
text_runs = []
if paragraph_model.text:
text_runs = self.parse_markdown_text_to_text_runs(
text_runs = self.parse_html_text_to_text_runs(
paragraph_model.font, paragraph_model.text
)
elif paragraph_model.text_runs:
@ -286,78 +289,8 @@ class PptxPresentationCreator:
text_run = paragraph.add_run()
self.populate_text_run(text_run, text_run_model)
def parse_markdown_text_to_text_runs(self, font: PptxFontModel, text: str):
    """Split lightly marked-up text into styled ``PptxTextRunModel`` runs.

    Recognised inline markers, checked in this order at each position:
    ``***text***`` (bold + italic), ``**text**`` (bold) and ``__text__``
    (italic). A marker only counts when a matching closing marker exists
    later on the same line; otherwise it is kept as literal text.

    Args:
        font: Base font copied (and overridden) for each styled run and
            used as-is for plain runs.
        text: Source text; lines are separated by ``"\\n"``.

    Returns:
        A list of runs in reading order, with a ``"\\n"`` run between
        consecutive lines.
    """
    # Longest marker first so "***" is never mistaken for "**".
    marker_styles = (
        ("***", {"bold": True, "italic": True, "font_weight": 700}),
        ("**", {"bold": True, "font_weight": 700}),
        ("__", {"italic": True}),
    )

    text_runs = []
    lines = text.split("\n")
    last_index = len(lines) - 1

    for index, line in enumerate(lines):
        current_pos = 0
        while current_pos < len(line):
            for marker, overrides in marker_styles:
                if not line.startswith(marker, current_pos):
                    continue
                content_start = current_pos + len(marker)
                end_pos = line.find(marker, content_start)
                if end_pos == -1:
                    # Opening marker without a closing partner.
                    continue
                font_json = font.model_dump()
                font_json.update(overrides)
                text_runs.append(
                    PptxTextRunModel(
                        text=line[content_start:end_pos],
                        font=PptxFontModel(**font_json),
                    )
                )
                current_pos = end_pos + len(marker)
                break
            else:
                # Plain text up to the next marker. The search starts one
                # character past current_pos: if an unmatched marker sits at
                # current_pos, searching from current_pos itself would find
                # it again, yield an empty slice and loop forever.
                marker_positions = [
                    pos
                    for pos in (
                        line.find(marker, current_pos + 1)
                        for marker, _ in marker_styles
                    )
                    if pos != -1
                ]
                end_pos = min(marker_positions, default=len(line))
                text_runs.append(
                    PptxTextRunModel(text=line[current_pos:end_pos], font=font)
                )
                current_pos = end_pos

        # Separate lines by position, not by comparing line contents: an
        # earlier line that happens to equal the last one still needs its
        # newline.
        if index != last_index:
            text_runs.append(PptxTextRunModel(text="\n"))

    return text_runs
def parse_html_text_to_text_runs(self, font: Optional[PptxFontModel], text: str):
    """Delegate inline-HTML parsing to the shared service function.

    Note the argument order differs from the service: this method takes
    ``font`` first, the service takes ``text`` first.
    """
    text_runs = parse_inline_html_to_runs(text, base_font=font)
    return text_runs
def populate_text_run(self, text_run: _Run, text_run_model: PptxTextRunModel):
text_run.text = text_run_model.text
@ -527,6 +460,20 @@ class PptxPresentationCreator:
font.italic = font_model.italic
font.size = Pt(font_model.size)
font.bold = font_model.font_weight >= 600
if font_model.underline is not None:
font.underline = bool(font_model.underline)
if font_model.strike is not None:
self.apply_strike_to_font(font, font_model.strike)
def apply_strike_to_font(self, font: Font, strike: Optional[bool]):
    """Set or clear strikethrough on a font, best effort.

    The ``strike`` attribute is written directly on the font's underlying
    XML element: ``"sngStrike"`` for ``True``, ``"noStrike"`` for ``False``.
    Any other value leaves the attribute untouched. Failures are reported
    with ``print`` and never propagate to the caller.
    """
    try:
        run_properties = font._element
        if strike is True:
            strike_value = "sngStrike"
        elif strike is False:
            strike_value = "noStrike"
        else:
            return
        run_properties.set("strike", strike_value)
    except Exception as e:
        print(f"Could not apply strikethrough: {e}")
def save(self, path: str):
    """Save the presentation held in ``self._ppt`` to ``path``."""
    presentation = self._ppt
    presentation.save(path)

View file

@ -261,10 +261,29 @@ async function getAllChildElementsAttributes({ element, rootRect = null, depth =
};
}
// Ignore elements with no size (width or height)
if (attributes.position === undefined || attributes.position.width === undefined || attributes.position.height === undefined || attributes.position.width === 0 || attributes.position.height === 0) {
continue;
}
// If element is paragraph and contains only inline formatting tags, don't go deeper
if (attributes.tagName === 'p') {
const innerElementTagNames = await childElementHandle.evaluate((el) => {
return Array.from(el.querySelectorAll('*')).map((e) => e.tagName.toLowerCase());
});
const allowedInlineTags = new Set(['strong', 'u', 'em', 'code', 's']);
const hasOnlyAllowedInlineTags = innerElementTagNames.every((tag) => allowedInlineTags.has(tag));
if (innerElementTagNames.length > 0 && hasOnlyAllowedInlineTags) {
attributes.innerText = await childElementHandle.evaluate((el) => {
return el.innerHTML;
});
allResults.push({ attributes, depth });
continue;
}
}
if (attributes.tagName === 'svg' || attributes.tagName === 'canvas' || attributes.tagName === 'table') {
attributes.should_screenshot = true;
attributes.element = childElementHandle;
@ -272,12 +291,11 @@ async function getAllChildElementsAttributes({ element, rootRect = null, depth =
allResults.push({ attributes, depth });
//? If the element is a canvas, or table, we don't need to go deeper
// If the element is a canvas, or table, we don't need to go deeper
if (attributes.should_screenshot && attributes.tagName !== 'svg') {
continue;
}
const childResults = await getAllChildElementsAttributes({
element: childElementHandle,
rootRect: rootRect,