feat: add html edit with image endpoint

2025-07-30 22:51:11 +05:45 · 2025-07-30 22:51:11 +05:45 · 6722a91748
commit 6722a91748
parent 182c3d4c78
3 changed files with 224 additions and 4 deletions
--- a/servers/fastapi/api/v1/ppt/endpoints/slide_to_html.py
+++ b/servers/fastapi/api/v1/ppt/endpoints/slide_to_html.py
@ -1,7 +1,7 @@
 import os
 import base64
 from typing import Optional
-from fastapi import APIRouter, HTTPException
+from fastapi import APIRouter, HTTPException, File, UploadFile, Form
 from pydantic import BaseModel
 import anthropic
 from utils.asset_directory_utils import get_images_directory
@ -9,6 +9,7 @@ from utils.asset_directory_utils import get_images_directory
 # Create separate routers for each functionality
 SLIDE_TO_HTML_ROUTER = APIRouter(prefix="/slide-to-html", tags=["slide-to-html"])
 HTML_TO_REACT_ROUTER = APIRouter(prefix="/html-to-react", tags=["html-to-react"])
+HTML_EDIT_ROUTER = APIRouter(prefix="/html-edit", tags=["html-edit"])


 # Request/Response models for slide-to-html endpoint
@ -23,6 +24,13 @@ class SlideToHtmlResponse(BaseModel):
    message: Optional[str] = None


+# Request/Response models for html-edit endpoint
+class HtmlEditResponse(BaseModel):
+    success: bool
+    edited_html: str
+    message: Optional[str] = None
+
+
 # Request/Response models for html-to-react endpoint
 class HtmlToReactRequest(BaseModel):
    html: str   # HTML content to convert to React component
@ -227,6 +235,10 @@ const BulletWithIconsSlideLayout: React.FC<BulletWithIconsSlideLayoutProps> = ({
 export default BulletWithIconsSlideLayout 
 """

+HTML_EDIT_SYSTEM_PROMPT = """
+You need to edit given html with respect to the indication and sketch in the given UI. You'll be given the code for current UI which is in presentation size, along with its visualization in image form. Over that you'll also be given another image which has indications of what might change in form of sketch in the UI. You will have to return the edited html with tailwind with the changes as indicated on the image and through prompt. Make sure you think through the design before making the change and also make sure you don't change the non-indicated part. Try to follow the design style of current content for generated content. Only give out code and nothing else.
+"""
+

 async def generate_html_from_slide(base64_image: str, media_type: str, xml_content: str, api_key: str) -> str:
    """
@ -425,6 +437,119 @@ async def generate_react_component_from_html(html_content: str, api_key: str) ->
        )


+async def edit_html_with_images(current_ui_base64: str, sketch_base64: str, media_type: str, html_content: str, prompt: str, api_key: str) -> str:
+    """
+    Edit HTML content based on two images and a text prompt using Anthropic Claude API.
+    
+    Args:
+        current_ui_base64: Base64 encoded current UI image data
+        sketch_base64: Base64 encoded sketch/indication image data
+        media_type: MIME type of the images (e.g., 'image/png')
+        html_content: Current HTML content to edit
+        prompt: Text prompt describing the changes
+        api_key: Anthropic API key
+    
+    Returns:
+        Edited HTML content as string
+    
+    Raises:
+        HTTPException: If API call fails or no content is generated
+    """
+    try:
+        # Initialize Anthropic client
+        client = anthropic.Anthropic(api_key=api_key)
+        
+        print("Starting streaming request to Claude for HTML editing...")
+        
+        edited_html = ""
+        thinking_content = ""
+        
+        with client.messages.stream(
+            model="claude-sonnet-4-20250514",
+            max_tokens=64000,
+            temperature=1,
+            system=HTML_EDIT_SYSTEM_PROMPT,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"Current HTML to edit:\n\n{html_content}\n\nText prompt for changes: {prompt}"
+                        },
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "base64",
+                                "media_type": media_type,
+                                "data": current_ui_base64
+                            }
+                        },
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "base64",
+                                "media_type": media_type,
+                                "data": sketch_base64
+                            }
+                        }
+                    ]
+                }
+            ],
+            thinking={
+                "type": "enabled",
+                "budget_tokens": 16000
+            }
+        ) as stream:
+            print("Streaming started, collecting edited HTML response...")
+            
+            # Collect all streamed content
+            for event in stream:
+                if event.type == "content_block_delta":
+                    if event.delta.type == "thinking_delta":
+                        thinking_content += event.delta.thinking
+                        print(f"[HTML EDIT THINKING] {event.delta.thinking}", end="", flush=True)
+                    elif event.delta.type == "text_delta":
+                        edited_html += event.delta.text
+                        print(f"[HTML EDIT] {event.delta.text}", end="", flush=True)
+                elif event.type == "content_block_start":
+                    if hasattr(event.content_block, 'type'):
+                        print(f"\n[HTML EDIT BLOCK START] {event.content_block.type}")
+                elif event.type == "content_block_stop":
+                    print(f"\n[HTML EDIT BLOCK STOP] Index: {event.index}")
+                elif event.type == "message_start":
+                    print("[HTML EDIT MESSAGE START]")
+                elif event.type == "message_stop":
+                    print("\n[HTML EDIT MESSAGE STOP] - Streaming complete")
+        
+        print(f"\nCollected edited HTML content length: {len(edited_html)}")
+        print(f"Collected HTML edit thinking content length: {len(thinking_content)}")
+        
+        if not edited_html:
+            raise HTTPException(
+                status_code=500,
+                detail="No edited HTML content generated by Claude API"
+            )
+        
+        return edited_html
+        
+    except anthropic.APITimeoutError as e:
+        raise HTTPException(
+            status_code=408,
+            detail=f"Claude API timeout during HTML editing: {str(e)}"
+        )
+    except anthropic.APIConnectionError as e:
+        raise HTTPException(
+            status_code=503,
+            detail=f"Claude API connection error during HTML editing: {str(e)}"
+        )
+    except anthropic.APIError as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Anthropic API error during HTML editing: {str(e)}"
+        )
+
+
 # ENDPOINT 1: Slide to HTML conversion
@SLIDE_TO_HTML_ROUTER.post("/", response_model=SlideToHtmlResponse)
 async def convert_slide_to_html(request: SlideToHtmlRequest):
@ -494,7 +619,7 @@ async def convert_slide_to_html(request: SlideToHtmlRequest):
            media_type=media_type,
            xml_content=request.xml,
            api_key=api_key
-        )
+            )
        
        return SlideToHtmlResponse(
            success=True,
@ -564,4 +689,97 @@ async def convert_html_to_react(request: HtmlToReactRequest):
        raise HTTPException(
            status_code=500,
            detail=f"Error processing HTML to React: {str(e)}"
-        )
+        )
+
+
+# ENDPOINT 3: HTML editing with images
+@HTML_EDIT_ROUTER.post("/", response_model=HtmlEditResponse)
+async def edit_html_with_images_endpoint(
+    current_ui_image: UploadFile = File(..., description="Current UI image file"),
+    sketch_image: UploadFile = File(..., description="Sketch/indication image file"),
+    html: str = Form(..., description="Current HTML content to edit"),
+    prompt: str = Form(..., description="Text prompt describing the changes")
+):
+    """
+    Edit HTML content based on two uploaded images and a text prompt using Anthropic Claude API.
+    
+    Args:
+        current_ui_image: Uploaded current UI image file
+        sketch_image: Uploaded sketch/indication image file
+        html: Current HTML content to edit (form data)
+        prompt: Text prompt describing the changes (form data)
+    
+    Returns:
+        HtmlEditResponse with edited HTML
+    """
+    try:
+        # Get Anthropic API key from environment
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            raise HTTPException(
+                status_code=500, 
+                detail="ANTHROPIC_API_KEY environment variable not set"
+            )
+        
+        # Validate inputs
+        if not html or not html.strip():
+            raise HTTPException(
+                status_code=400,
+                detail="HTML content cannot be empty"
+            )
+        
+        if not prompt or not prompt.strip():
+            raise HTTPException(
+                status_code=400,
+                detail="Text prompt cannot be empty"
+            )
+        
+        # Validate image files
+        if not current_ui_image.content_type or not current_ui_image.content_type.startswith("image/"):
+            raise HTTPException(
+                status_code=400,
+                detail="Current UI file must be an image"
+            )
+        
+        if not sketch_image.content_type or not sketch_image.content_type.startswith("image/"):
+            raise HTTPException(
+                status_code=400,
+                detail="Sketch file must be an image"
+            )
+        
+        # Read and encode both images to base64
+        current_ui_content = await current_ui_image.read()
+        current_ui_base64 = base64.b64encode(current_ui_content).decode('utf-8')
+        
+        sketch_content = await sketch_image.read()
+        sketch_base64 = base64.b64encode(sketch_content).decode('utf-8')
+        
+        # Use the content type from the uploaded files
+        media_type = current_ui_image.content_type
+        
+        # Edit HTML using the function
+        edited_html = await edit_html_with_images(
+            current_ui_base64=current_ui_base64,
+            sketch_base64=sketch_base64,
+            media_type=media_type,
+            html_content=html,
+            prompt=prompt,
+            api_key=api_key
+        )
+        
+        return HtmlEditResponse(
+            success=True,
+            edited_html=edited_html,
+            message="HTML edited successfully"
+        )
+        
+    except HTTPException:
+        # Re-raise HTTP exceptions as-is
+        raise
+    except Exception as e:
+        # Log the full error for debugging
+        print(f"Unexpected error during HTML editing: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail=f"Error processing HTML editing: {str(e)}"
+        ) 
--- a/servers/fastapi/api/v1/ppt/router.py
+++ b/servers/fastapi/api/v1/ppt/router.py
@ -9,7 +9,7 @@ from api.v1.ppt.endpoints.outlines import OUTLINES_ROUTER
 from api.v1.ppt.endpoints.presentation import PRESENTATION_ROUTER
 from api.v1.ppt.endpoints.pptx_slides import PPTX_SLIDES_ROUTER
 from api.v1.ppt.endpoints.slide import SLIDE_ROUTER
-from api.v1.ppt.endpoints.slide_to_html import SLIDE_TO_HTML_ROUTER, HTML_TO_REACT_ROUTER
+from api.v1.ppt.endpoints.slide_to_html import SLIDE_TO_HTML_ROUTER, HTML_TO_REACT_ROUTER, HTML_EDIT_ROUTER


 API_V1_PPT_ROUTER = APIRouter(prefix="/api/v1/ppt")
@ -21,6 +21,7 @@ API_V1_PPT_ROUTER.include_router(PPTX_SLIDES_ROUTER)
 API_V1_PPT_ROUTER.include_router(SLIDE_ROUTER)
 API_V1_PPT_ROUTER.include_router(SLIDE_TO_HTML_ROUTER)
 API_V1_PPT_ROUTER.include_router(HTML_TO_REACT_ROUTER)
+API_V1_PPT_ROUTER.include_router(HTML_EDIT_ROUTER)
 API_V1_PPT_ROUTER.include_router(IMAGES_ROUTER)
 API_V1_PPT_ROUTER.include_router(ICONS_ROUTER)
 API_V1_PPT_ROUTER.include_router(OLLAMA_ROUTER)
--- a/servers/fastapi/requirements.txt
+++ b/servers/fastapi/requirements.txt
@ -2,6 +2,7 @@ aiohappyeyeballs==2.6.1
 aiohttp==3.12.14
 aiosignal==1.4.0
 annotated-types==0.7.0
+anthropic==0.60.0
 anyio==4.9.0
 async-timeout==5.0.1
 attrs==25.3.0