Merge pull request #209 from presenton/fix/presentation-export-markdown-issue
fix(fastapi): solves issue on PPTX export where markdown content was not correctly parsed
This commit is contained in:
commit
01d39d71be
4 changed files with 107 additions and 75 deletions
|
|
@ -57,6 +57,8 @@ class PptxFontModel(BaseModel):
|
|||
italic: bool = False
|
||||
color: str = "000000"
|
||||
font_weight: Optional[int] = 400
|
||||
underline: Optional[bool] = None
|
||||
strike: Optional[bool] = None
|
||||
|
||||
|
||||
class PptxFillModel(BaseModel):
|
||||
|
|
|
|||
65
servers/fastapi/services/html_to_text_runs_service.py
Normal file
65
servers/fastapi/services/html_to_text_runs_service.py
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
from html.parser import HTMLParser
|
||||
from typing import List, Optional
|
||||
|
||||
from models.pptx_models import PptxFontModel, PptxTextRunModel
|
||||
|
||||
|
||||
class InlineHTMLToRunsParser(HTMLParser):
    """Turn a string with inline HTML formatting tags into PPTX text runs.

    Every chunk of character data becomes one ``PptxTextRunModel`` whose font
    is the base font plus overrides derived from the tags open at that point.
    A ``<br>`` start tag becomes a run whose text is a newline and which is
    created without a font argument. Results accumulate in ``text_runs``.
    """

    def __init__(self, base_font: PptxFontModel):
        super().__init__(convert_charrefs=True)
        # Font that every emitted run starts from before overrides.
        self.base_font = base_font
        # Lower-cased names of the tags that are currently open.
        self.tag_stack: List[str] = []
        # Output, in document order.
        self.text_runs: List[PptxTextRunModel] = []

    def _current_font(self) -> PptxFontModel:
        """Return the base font with the overrides implied by the open tags."""
        open_tags = set(self.tag_stack)
        overrides = {}

        if open_tags & {"strong", "b"}:
            overrides["font_weight"] = 700
        if open_tags & {"em", "i"}:
            overrides["italic"] = True
        if "u" in open_tags:
            overrides["underline"] = True
        if open_tags & {"s", "strike", "del"}:
            overrides["strike"] = True
        if "code" in open_tags:
            overrides["name"] = "Courier New"

        return PptxFontModel(**{**self.base_font.model_dump(), **overrides})

    def handle_starttag(self, tag, attrs):
        """Record an opening tag; ``<br>`` emits a newline run instead."""
        name = tag.lower()
        if name != "br":
            self.tag_stack.append(name)
            return
        self.text_runs.append(PptxTextRunModel(text="\n"))

    def handle_endtag(self, tag):
        """Drop the most recently opened tag with this name, if there is one."""
        name = tag.lower()
        for index in reversed(range(len(self.tag_stack))):
            if self.tag_stack[index] == name:
                self.tag_stack.pop(index)
                return

    def handle_data(self, data):
        """Emit non-empty character data as a run styled by the open tags."""
        if data != "":
            self.text_runs.append(
                PptxTextRunModel(text=data, font=self._current_font())
            )
|
||||
|
||||
|
||||
def parse_html_text_to_text_runs(
    text: str, base_font: Optional[PptxFontModel] = None
) -> List[PptxTextRunModel]:
    """Convert text containing inline HTML tags into a list of text runs.

    Line breaks (``\\r\\n``, ``\\r`` or ``\\n``) are first rewritten to ``<br>``
    so the parser emits them as newline runs.

    Args:
        text: Source text, possibly containing inline formatting tags.
        base_font: Font each run starts from; a default ``PptxFontModel()``
            is used when omitted.

    Returns:
        The runs collected by ``InlineHTMLToRunsParser``, in document order.
    """
    normalized_text = text.replace("\r\n", "\n").replace("\r", "\n")
    normalized_text = normalized_text.replace("\n", "<br>")

    parser = InlineHTMLToRunsParser(
        base_font if base_font is not None else PptxFontModel()
    )
    parser.feed(normalized_text)
    # With convert_charrefs=True the parser holds back trailing text that
    # might contain a split character reference (e.g. text ending in "AT&T").
    # close() flushes that buffer; without it the tail is silently dropped.
    parser.close()
    return parser.text_runs
|
||||
|
||||
|
||||
|
|
@ -1,6 +1,9 @@
|
|||
import os
|
||||
from typing import List, Optional
|
||||
from lxml import etree
|
||||
from services.html_to_text_runs_service import (
|
||||
parse_html_text_to_text_runs as parse_inline_html_to_runs,
|
||||
)
|
||||
|
||||
from pptx import Presentation
|
||||
from pptx.shapes.autoshape import Shape
|
||||
|
|
@ -276,7 +279,7 @@ class PptxPresentationCreator:
|
|||
|
||||
text_runs = []
|
||||
if paragraph_model.text:
|
||||
text_runs = self.parse_markdown_text_to_text_runs(
|
||||
text_runs = self.parse_html_text_to_text_runs(
|
||||
paragraph_model.font, paragraph_model.text
|
||||
)
|
||||
elif paragraph_model.text_runs:
|
||||
|
|
@ -286,78 +289,8 @@ class PptxPresentationCreator:
|
|||
text_run = paragraph.add_run()
|
||||
self.populate_text_run(text_run, text_run_model)
|
||||
|
||||
def parse_markdown_text_to_text_runs(self, font: PptxFontModel, text: str):
    """Split lightly marked-up text into styled text runs.

    At each position the markers are tried in this order:
    ``***bold italic***``, ``**bold**`` and ``__italic__``. A marker is only
    treated as formatting when a matching closing marker exists later on the
    same line; otherwise it is emitted as literal text.

    Args:
        font: Base font; each styled run gets a copy with overrides applied.
        text: Source text whose lines are separated by newline characters.

    Returns:
        A list of ``PptxTextRunModel``. Consecutive lines are separated by a
        run whose text is a newline and which is created without a font.
    """
    # (marker, font overrides), highest priority first.
    styles = (
        ("***", {"bold": True, "italic": True, "font_weight": 700}),
        ("**", {"bold": True, "font_weight": 700}),
        ("__", {"italic": True}),
    )

    lines = text.split("\n")
    last_index = len(lines) - 1
    text_runs = []

    for index, line in enumerate(lines):
        current_pos = 0
        while current_pos < len(line):
            for marker, overrides in styles:
                if not line.startswith(marker, current_pos):
                    continue
                content_start = current_pos + len(marker)
                end_pos = line.find(marker, content_start)
                if end_pos == -1:
                    # No closing marker: not formatting, try the next style.
                    continue
                font_json = font.model_dump()
                font_json.update(overrides)
                text_runs.append(
                    PptxTextRunModel(
                        text=line[content_start:end_pos],
                        font=PptxFontModel(**font_json),
                    )
                )
                current_pos = end_pos + len(marker)
                break
            else:
                # Plain text up to the next marker. The search starts one
                # character ahead so an unmatched marker sitting exactly at
                # current_pos is consumed as literal text instead of leaving
                # the loop stuck at the same position forever.
                candidates = [
                    pos
                    for pos in (
                        line.find(marker, current_pos + 1) for marker, _ in styles
                    )
                    if pos != -1
                ]
                end_pos = min(candidates, default=len(line))
                text_runs.append(
                    PptxTextRunModel(text=line[current_pos:end_pos], font=font)
                )
                current_pos = end_pos

        # Compare positions, not contents: an earlier line that happens to
        # equal the last line must still be followed by a newline run.
        if index != last_index:
            text_runs.append(PptxTextRunModel(text="\n"))

    return text_runs
|
||||
def parse_html_text_to_text_runs(self, font: Optional[PptxFontModel], text: str):
    """Delegate to the inline-HTML run parser.

    Note the argument order: this method takes ``(font, text)`` while the
    underlying ``parse_inline_html_to_runs`` takes ``(text, base_font)``.
    """
    text_runs = parse_inline_html_to_runs(text, base_font=font)
    return text_runs
|
||||
|
||||
def populate_text_run(self, text_run: _Run, text_run_model: PptxTextRunModel):
|
||||
text_run.text = text_run_model.text
|
||||
|
|
@ -527,6 +460,20 @@ class PptxPresentationCreator:
|
|||
font.italic = font_model.italic
|
||||
font.size = Pt(font_model.size)
|
||||
font.bold = font_model.font_weight >= 600
|
||||
if font_model.underline is not None:
|
||||
font.underline = bool(font_model.underline)
|
||||
if font_model.strike is not None:
|
||||
self.apply_strike_to_font(font, font_model.strike)
|
||||
|
||||
def apply_strike_to_font(self, font: Font, strike: Optional[bool]):
    """Set or clear strikethrough on a font, best effort.

    Writes the ``strike`` attribute directly on ``font._element``
    (presumably the run-properties XML element -- confirm against
    python-pptx): ``"sngStrike"`` when ``strike`` is exactly ``True``,
    ``"noStrike"`` when it is exactly ``False``. Any other value leaves the
    element untouched. Errors are printed, never raised.
    """
    try:
        run_properties = font._element
        attr_value = (
            "sngStrike"
            if strike is True
            else "noStrike" if strike is False else None
        )
        if attr_value is not None:
            run_properties.set("strike", attr_value)
    except Exception as error:
        print(f"Could not apply strikethrough: {error}")
|
||||
|
||||
def save(self, path: str):
|
||||
self._ppt.save(path)
|
||||
|
|
|
|||
|
|
@ -261,10 +261,29 @@ async function getAllChildElementsAttributes({ element, rootRect = null, depth =
|
|||
};
|
||||
}
|
||||
|
||||
// Ignore elements with no size (width or height)
|
||||
if (attributes.position === undefined || attributes.position.width === undefined || attributes.position.height === undefined || attributes.position.width === 0 || attributes.position.height === 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// If element is paragraph and contains only inline formatting tags, don't go deeper
|
||||
if (attributes.tagName === 'p') {
|
||||
const innerElementTagNames = await childElementHandle.evaluate((el) => {
|
||||
return Array.from(el.querySelectorAll('*')).map((e) => e.tagName.toLowerCase());
|
||||
});
|
||||
|
||||
const allowedInlineTags = new Set(['strong', 'u', 'em', 'code', 's']);
|
||||
const hasOnlyAllowedInlineTags = innerElementTagNames.every((tag) => allowedInlineTags.has(tag));
|
||||
|
||||
if (innerElementTagNames.length > 0 && hasOnlyAllowedInlineTags) {
|
||||
attributes.innerText = await childElementHandle.evaluate((el) => {
|
||||
return el.innerHTML;
|
||||
});
|
||||
allResults.push({ attributes, depth });
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (attributes.tagName === 'svg' || attributes.tagName === 'canvas' || attributes.tagName === 'table') {
|
||||
attributes.should_screenshot = true;
|
||||
attributes.element = childElementHandle;
|
||||
|
|
@ -272,12 +291,11 @@ async function getAllChildElementsAttributes({ element, rootRect = null, depth =
|
|||
|
||||
allResults.push({ attributes, depth });
|
||||
|
||||
//? If the element is a canvas, or table, we don't need to go deeper
|
||||
// If the element is a canvas, or table, we don't need to go deeper
|
||||
if (attributes.should_screenshot && attributes.tagName !== 'svg') {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
const childResults = await getAllChildElementsAttributes({
|
||||
element: childElementHandle,
|
||||
rootRect: rootRect,
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue