Merge pull request #209 from presenton/fix/presentation-export-markdown-issue

fix(fastapi): resolve PPTX export issue where markdown content was not parsed correctly
2025-08-12 13:12:35 +05:45 · 2025-08-12 13:12:35 +05:45 · 01d39d71be
commit 01d39d71be
parent 6937885b47 362af45189
4 changed files with 107 additions and 75 deletions
--- a/servers/fastapi/models/pptx_models.py
+++ b/servers/fastapi/models/pptx_models.py
@ -57,6 +57,8 @@ class PptxFontModel(BaseModel):
    italic: bool = False
    color: str = "000000"
    font_weight: Optional[int] = 400
+    underline: Optional[bool] = None
+    strike: Optional[bool] = None


 class PptxFillModel(BaseModel):
--- a/servers/fastapi/services/html_to_text_runs_service.py
+++ b/servers/fastapi/services/html_to_text_runs_service.py
@ -0,0 +1,65 @@
+from html.parser import HTMLParser
+from typing import List, Optional
+
+from models.pptx_models import PptxFontModel, PptxTextRunModel
+
+
+class InlineHTMLToRunsParser(HTMLParser):
+    """HTMLParser subclass that turns inline HTML into PPTX text runs.
+
+    Every piece of text data becomes a ``PptxTextRunModel`` whose font is the
+    base font adjusted for the formatting tags that are open when the text is
+    seen. Each ``<br>`` start tag becomes a run containing only a newline.
+    """
+
+    def __init__(self, base_font: PptxFontModel):
+        # convert_charrefs=True: character references arrive already decoded
+        # in handle_data.
+        super().__init__(convert_charrefs=True)
+        self.base_font = base_font
+        self.tag_stack: List[str] = []
+        self.text_runs: List[PptxTextRunModel] = []
+
+    def _current_font(self) -> PptxFontModel:
+        """Build a font from the base font plus overrides for the open tags."""
+        open_tags = set(self.tag_stack)
+        overrides = {}
+
+        if open_tags & {"strong", "b"}:
+            overrides["font_weight"] = 700
+        if open_tags & {"em", "i"}:
+            overrides["italic"] = True
+        if "u" in open_tags:
+            overrides["underline"] = True
+        if open_tags & {"s", "strike", "del"}:
+            overrides["strike"] = True
+        if "code" in open_tags:
+            overrides["name"] = "Courier New"
+
+        return PptxFontModel(**{**self.base_font.model_dump(), **overrides})
+
+    def handle_starttag(self, tag, attrs):
+        """Record an opened tag, or emit a newline run for ``<br>``."""
+        name = tag.lower()
+        if name != "br":
+            self.tag_stack.append(name)
+            return
+        # <br> never goes on the stack; it only yields a line break run.
+        self.text_runs.append(PptxTextRunModel(text="\n"))
+
+    def handle_endtag(self, tag):
+        """Drop the most recently opened entry matching ``tag``, if any."""
+        name = tag.lower()
+        # Search from the top of the stack so the innermost match is removed.
+        for idx in reversed(range(len(self.tag_stack))):
+            if self.tag_stack[idx] == name:
+                self.tag_stack.pop(idx)
+                return
+
+    def handle_data(self, data):
+        """Append non-empty text as a run styled by the currently open tags."""
+        if data:
+            self.text_runs.append(
+                PptxTextRunModel(text=data, font=self._current_font())
+            )
+
+
+def parse_html_text_to_text_runs(
+    text: str, base_font: Optional[PptxFontModel] = None
+) -> List[PptxTextRunModel]:
+    """Parse text containing inline HTML into a list of text runs.
+
+    CRLF, CR and LF line endings are all rewritten to ``<br>`` before parsing,
+    so each line break ends up as its own newline run.
+
+    Args:
+        text: The text to parse; may contain inline formatting tags.
+        base_font: Font that every run starts from. A default
+            ``PptxFontModel()`` is used when this is not provided.
+
+    Returns:
+        The runs collected by ``InlineHTMLToRunsParser``, in document order.
+    """
+    normalized_text = text.replace("\r\n", "\n").replace("\r", "\n")
+    normalized_text = normalized_text.replace("\n", "<br>")
+
+    parser = InlineHTMLToRunsParser(base_font if base_font else PptxFontModel())
+    parser.feed(normalized_text)
+    # feed() may keep trailing text buffered when it could still be the start
+    # of a character reference or tag (for example text ending in "R&D").
+    # close() processes that remainder as if end-of-file had been reached, so
+    # it is not silently dropped from the result.
+    parser.close()
+    return parser.text_runs
+
+
--- a/servers/fastapi/services/pptx_presentation_creator.py
+++ b/servers/fastapi/services/pptx_presentation_creator.py
@ -1,6 +1,9 @@
 import os
 from typing import List, Optional
 from lxml import etree
+from services.html_to_text_runs_service import (
+    parse_html_text_to_text_runs as parse_inline_html_to_runs,
+)

 from pptx import Presentation
 from pptx.shapes.autoshape import Shape
@ -276,7 +279,7 @@ class PptxPresentationCreator:

        text_runs = []
        if paragraph_model.text:
-            text_runs = self.parse_markdown_text_to_text_runs(
+            text_runs = self.parse_html_text_to_text_runs(
                paragraph_model.font, paragraph_model.text
            )
        elif paragraph_model.text_runs:
@ -286,78 +289,8 @@ class PptxPresentationCreator:
            text_run = paragraph.add_run()
            self.populate_text_run(text_run, text_run_model)

-    def parse_markdown_text_to_text_runs(self, font: PptxFontModel, text: str):
-        text_runs = []
-        for line in text.split("\n"):
-            current_pos = 0
-            while current_pos < len(line):
-                # Check for bold and italic (***text***)
-                if (
-                    line[current_pos:].startswith("***")
-                    and "***" in line[current_pos + 3 :]
-                ):
-                    end_pos = line.find("***", current_pos + 3)
-                    text_content = line[current_pos + 3 : end_pos]
-                    font_json = font.model_dump()
-                    font_json["bold"] = True
-                    font_json["italic"] = True
-                    font_json["font_weight"] = 700  # Set font weight to bold
-                    text_runs.append(
-                        PptxTextRunModel(
-                            text=text_content, font=PptxFontModel(**font_json)
-                        )
-                    )
-                    current_pos = end_pos + 3
-                # Check for bold (**text**)
-                elif (
-                    line[current_pos:].startswith("**")
-                    and "**" in line[current_pos + 2 :]
-                ):
-                    end_pos = line.find("**", current_pos + 2)
-                    text_content = line[current_pos + 2 : end_pos]
-                    font_json = font.model_dump()
-                    font_json["bold"] = True
-                    font_json["font_weight"] = 700  # Set font weight to bold
-                    text_runs.append(
-                        PptxTextRunModel(
-                            text=text_content, font=PptxFontModel(**font_json)
-                        )
-                    )
-                    current_pos = end_pos + 2
-                # Check for italic (*text*)
-                elif (
-                    line[current_pos:].startswith("__")
-                    and "__" in line[current_pos + 2 :]
-                ):
-                    end_pos = line.find("__", current_pos + 2)
-                    text_content = line[current_pos + 2 : end_pos]
-                    font_json = font.model_dump()
-                    font_json["italic"] = True
-                    text_runs.append(
-                        PptxTextRunModel(
-                            text=text_content, font=PptxFontModel(**font_json)
-                        )
-                    )
-                    current_pos = end_pos + 2
-                else:
-                    # Find the next formatting marker or end of line
-                    next_marker = float("inf")
-                    for marker in ["***", "**", "__"]:
-                        pos = line.find(marker, current_pos)
-                        if pos != -1:
-                            next_marker = min(next_marker, pos)
-
-                    end_pos = next_marker if next_marker != float("inf") else len(line)
-                    text_content = line[current_pos:end_pos]
-                    if text_content:  # Only add non-empty text
-                        text_runs.append(PptxTextRunModel(text=text_content, font=font))
-                    current_pos = end_pos
-
-            # Add newline if not the last line
-            if line != text.split("\n")[-1]:
-                text_runs.append(PptxTextRunModel(text="\n"))
-
-        return text_runs
+    def parse_html_text_to_text_runs(self, font: Optional[PptxFontModel], text: str):
+        """Convert inline-HTML ``text`` into text runs based on ``font``.
+
+        Delegates to the imported ``parse_inline_html_to_runs`` helper. Note
+        that the helper takes the text first and the font second, the reverse
+        of this method's parameter order.
+        """
+        text_runs = parse_inline_html_to_runs(text, font)
+        return text_runs

    def populate_text_run(self, text_run: _Run, text_run_model: PptxTextRunModel):
        text_run.text = text_run_model.text
@ -527,6 +460,20 @@ class PptxPresentationCreator:
        font.italic = font_model.italic
        font.size = Pt(font_model.size)
        font.bold = font_model.font_weight >= 600
+        if font_model.underline is not None:
+            font.underline = bool(font_model.underline)
+        if font_model.strike is not None:
+            self.apply_strike_to_font(font, font_model.strike)
+
+    def apply_strike_to_font(self, font: Font, strike: Optional[bool]):
+        """Set the ``strike`` attribute on the font's underlying element.
+
+        ``True`` writes ``"sngStrike"`` and ``False`` writes ``"noStrike"``;
+        any other value leaves the attribute untouched. Failures are reported
+        with ``print`` and otherwise ignored (best effort).
+        """
+        try:
+            run_properties = font._element
+            # Identity comparison keeps non-bool values (e.g. 0 or 1) from
+            # being treated as False/True.
+            for flag, strike_value in ((True, "sngStrike"), (False, "noStrike")):
+                if strike is flag:
+                    run_properties.set("strike", strike_value)
+                    break
+        except Exception as e:
+            print(f"Could not apply strikethrough: {e}")

    def save(self, path: str):
        self._ppt.save(path)
--- a/servers/nextjs/app/api/presentation_to_pptx_model/route.ts
+++ b/servers/nextjs/app/api/presentation_to_pptx_model/route.ts
@ -261,10 +261,29 @@ async function getAllChildElementsAttributes({ element, rootRect = null, depth =
      };
    }

+    // Ignore elements with no size (width or height)
    if (attributes.position === undefined || attributes.position.width === undefined || attributes.position.height === undefined || attributes.position.width === 0 || attributes.position.height === 0) {
      continue;
    }

+    // If element is paragraph and contains only inline formatting tags, don't go deeper
+    if (attributes.tagName === 'p') {
+      const innerElementTagNames = await childElementHandle.evaluate((el) => {
+        return Array.from(el.querySelectorAll('*')).map((e) => e.tagName.toLowerCase());
+      });
+
+      const allowedInlineTags = new Set(['strong', 'u', 'em', 'code', 's']);
+      const hasOnlyAllowedInlineTags = innerElementTagNames.every((tag) => allowedInlineTags.has(tag));
+
+      if (innerElementTagNames.length > 0 && hasOnlyAllowedInlineTags) {
+        attributes.innerText = await childElementHandle.evaluate((el) => {
+          return el.innerHTML;
+        });
+        allResults.push({ attributes, depth });
+        continue;
+      }
+    }
+
    if (attributes.tagName === 'svg' || attributes.tagName === 'canvas' || attributes.tagName === 'table') {
      attributes.should_screenshot = true;
      attributes.element = childElementHandle;
@ -272,12 +291,11 @@ async function getAllChildElementsAttributes({ element, rootRect = null, depth =

    allResults.push({ attributes, depth });

-    //? If the element is a canvas, or table, we don't need to go deeper
+    // If the element is a canvas, or table, we don't need to go deeper
    if (attributes.should_screenshot && attributes.tagName !== 'svg') {
      continue;
    }

-
    const childResults = await getAllChildElementsAttributes({
      element: childElementHandle,
      rootRect: rootRect,