From cced7b942371b9410051b1dbaea71f291f13eddb Mon Sep 17 00:00:00 2001 From: Suraj Jha Date: Wed, 6 Aug 2025 13:26:45 +0545 Subject: [PATCH] update: use gemini for slide to html and edit --- .../fastapi/api/v1/ppt/endpoints/prompts.py | 27 +- .../api/v1/ppt/endpoints/slide_to_html.py | 403 ++++++++---------- 2 files changed, 195 insertions(+), 235 deletions(-) diff --git a/servers/fastapi/api/v1/ppt/endpoints/prompts.py b/servers/fastapi/api/v1/ppt/endpoints/prompts.py index 6554d6ec..ddf12a5f 100644 --- a/servers/fastapi/api/v1/ppt/endpoints/prompts.py +++ b/servers/fastapi/api/v1/ppt/endpoints/prompts.py @@ -1,19 +1,26 @@ GENERATE_HTML_SYSTEM_PROMPT = """ -You need to generate html and tailwind code for given presentation slide image. You need to think through each design elements and then decide where each element should go. +You need to generate html and tailwind code for given presentation slide image. Generated code will be used as template for different content. You need to think through each design elements and then decide where each element should go. Follow these rules strictly: - Make sure the design from html and tailwind is exact to the slide. - Make sure all components are in their own place. -- Make sure size of elements are exact. -- Smallest of elements should be noted of and should be added as it is. +- Make sure size of elements are exact. Check sizes of images and other elements from OXML and convert them to pixels. +- Make sure all components should be noted of and should be added as it is. - Image's and icons's size and position should be added exactly as it is. -- Read through the OXML data of slide and then match exact position ans size of elements. Make sure to convert between dimension and pixels. +- Read through the OXML data of slide and then match exact position ans size of elements. Make sure to convert between dimension and pixels. +- Make sure the vertical and horizonal spacing between elements are same as in the image. 
Try to get spacing from the OXML document as well. Make sure no element overflows because of high spacing. +- Do not use absolute position unless absolutely necessary. Use flex, grid and spacing to properly arrange components. +- First, lay out everything using flex or grid. Try to fit all the components using this layout. Finally, if you cannot lay out an element with flex or grid, only then use absolute positioning to place that element. +- Analyze each text's available space and its design, and give the minimum number of characters needed to fill the text for the space and context and the maximum number that the space can handle. Be conservative with how many characters a text space can handle. Make sure no text overflows and decide so as not to disrupt the slide. Do this for every text. +- Bullet elements or bullet cards (one with pointers) should be placed one after another and should be flexible to hold more or fewer bullet points than in the image. Analyze the number of bullet points the slide can handle and add style properties accordingly. Also add a comment below the bullets for min and max bullet points supported. Make sure the number you quote fits in the available space. Don't be too ambitious. +- For each text add font size and font family as tailwind property. Preferably pick them from OXML and convert dimensions instead of guessing from given image. +- Make sure that no elements overflow or exceed the slide bounds in any way. - Properly export shapes as exact SVG. - Add relevant font in tailwind to all texts. -- Wrap the output code inside these classes: \"relative w-full rounded-sm max-w-[1280px] shadow-lg max-h-[720px] aspect-video bg-white relative z-20 mx-auto overflow-hidden\". -- For image everywhere use /static/images/placeholder.jpg +- Wrap the output code inside these classes: \\\"relative w-full rounded-sm max-w-[1280px] shadow-lg max-h-[720px] aspect-video bg-white relative z-20 mx-auto overflow-hidden\\\". 
+- For image everywhere use https://images.pexels.com/photos/31527637/pexels-photo-31527637.jpeg - Image should never be inside of a SVG. -- Give out only HTML and Tailwind code. No other texts or explanations. - """ +- Give out only HTML and Tailwind code. No other texts or explanations. +""" HTML_TO_REACT_SYSTEM_PROMPT = """ Convert given static HTML and Tailwind slide to a TSX React component so that it can be dynamically populated. Follow these rules strictly while converting: @@ -27,6 +34,8 @@ Convert given static HTML and Tailwind slide to a TSX React component so that it 7) For image and icons schema should be compulsorily declared with two dunder fields for prompt and url separately. 8) Component name at the end should always yo 'dynamicSlideLayout'. 9) **Import or export statements should not be present in the output.** + - Don't give "import {React} from 'react'" + - Don't give "import {z} from 'zod'" 10) Always use double quotes for strings. 11) Layout Id, layout name and layout description should be declared and should describe the structure of the layout not its purpose. Do not describe numbers of any items in the layout. -layoutDescription should not have any purpose for elements in it, so use '...cards' instead of '...goal cards' and '...bullet points' instead of '...solution bullet points'. @@ -35,7 +44,7 @@ Convert given static HTML and Tailwind slide to a TSX React component so that it -Layout Id examples: header-description-bullet-points-slide, header-description-image-slide -Layout Name examples: HeaderDescriptionBulletPointsLayout, HeaderDescriptionImageLayout -Layout Description examples: A slide with a header, description, and bullet points and A slide with a header, description, and image - +12. Only give Code and nothing else. No other text or comments. For example: Input:

Effects of Global Warming

global warming effects on earth

Global warming triggers a cascade of effects on our planet. These changes impact everything from our oceans to our ecosystems.

sea level rising icon

Rising Sea Levels

Rising sea levels threaten coastal communities and ecosystems due to melting glaciers and thermal expansion.

heatwave icon

Intense Heatwaves

Heatwaves are becoming more frequent and intense, posing significant risks to human health and agriculture.

precipitation changes icon

Changes in Precipitation

Altered precipitation patterns lead to increased droughts in some regions and severe flooding in others, affecting water resources.

diff --git a/servers/fastapi/api/v1/ppt/endpoints/slide_to_html.py b/servers/fastapi/api/v1/ppt/endpoints/slide_to_html.py index 221b3aa4..d55f38a3 100644 --- a/servers/fastapi/api/v1/ppt/endpoints/slide_to_html.py +++ b/servers/fastapi/api/v1/ppt/endpoints/slide_to_html.py @@ -4,7 +4,8 @@ from datetime import datetime from typing import Optional, List, Dict from fastapi import APIRouter, HTTPException, File, UploadFile, Form, Depends from pydantic import BaseModel -import anthropic +from google import genai +from google.genai import types from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy import select, delete, func from utils.asset_directory_utils import get_images_directory @@ -94,13 +95,13 @@ class ErrorResponse(BaseModel): async def generate_html_from_slide(base64_image: str, media_type: str, xml_content: str, api_key: str) -> str: """ - Generate HTML content from slide image and XML using Anthropic Claude API. + Generate HTML content from slide image and XML using Google Gen AI API. 
Args: base64_image: Base64 encoded image data media_type: MIME type of the image (e.g., 'image/png') xml_content: OXML content as text - api_key: Anthropic API key + api_key: Google Gen AI API key Returns: Generated HTML content as string @@ -109,100 +110,85 @@ async def generate_html_from_slide(base64_image: str, media_type: str, xml_conte HTTPException: If API call fails or no content is generated """ try: - # Initialize Anthropic client - client = anthropic.Anthropic(api_key=api_key) + # Initialize Google Gen AI client + client = genai.Client(api_key=api_key) + + # Convert base64 to bytes + image_bytes = base64.b64decode(base64_image) # Use streaming to handle long requests - print("Starting streaming request to Claude for HTML generation...") + print("Starting streaming request to Google Gen AI for HTML generation...") html_content = "" - thinking_content = "" - with client.messages.stream( - model="claude-sonnet-4-20250514", - max_tokens=64000, - temperature=1, - system=GENERATE_HTML_SYSTEM_PROMPT, - messages=[ - { - "role": "user", - "content": [ - { - "type": "image", - "source": { - "type": "base64", - "media_type": media_type, - "data": base64_image - } - }, - { - "type": "text", - "text": f"\nOXML: \n\n{xml_content}" - } - ] - } - ], - thinking={ - "type": "enabled", - "budget_tokens": 55000 - } - ) as stream: - print("Streaming started, collecting HTML response...") - - # Collect all streamed content - for event in stream: - if event.type == "content_block_delta": - if event.delta.type == "thinking_delta": - thinking_content += event.delta.thinking - print(f"[HTML THINKING] {event.delta.thinking}", end="", flush=True) - elif event.delta.type == "text_delta": - html_content += event.delta.text - print(f"[HTML] {event.delta.text}", end="", flush=True) - elif event.type == "content_block_start": - if hasattr(event.content_block, 'type'): - print(f"\n[HTML BLOCK START] {event.content_block.type}") - elif event.type == "content_block_stop": - print(f"\n[HTML 
BLOCK STOP] Index: {event.index}") - elif event.type == "message_start": - print("[HTML MESSAGE START]") - elif event.type == "message_stop": - print("\n[HTML MESSAGE STOP] - Streaming complete") + # Create content with image and text + contents = [ + types.Part.from_bytes( + mime_type=media_type, + data=image_bytes, + ), + types.Part.from_text(text=f"\nOXML: \n\n{xml_content}"), + ] + + # Generate content config with thinking enabled + generate_content_config = types.GenerateContentConfig( + system_instruction=GENERATE_HTML_SYSTEM_PROMPT, + max_output_tokens=65536, + temperature=1.0, + thinking_config=types.ThinkingConfig( + thinking_budget=32768, + ), + ) + + print("Streaming started, collecting HTML response...") + + # Stream the response + for chunk in client.models.generate_content_stream( + model="gemini-2.5-pro", + contents=contents, + config=generate_content_config, + ): + if chunk.text: + html_content += chunk.text + print(f"[HTML] {chunk.text}", end="", flush=True) print(f"\nCollected HTML content length: {len(html_content)}") - print(f"Collected HTML thinking content length: {len(thinking_content)}") if not html_content: raise HTTPException( status_code=500, - detail="No HTML content generated by Claude API" + detail="No HTML content generated by Google Gen AI API" ) return html_content - except anthropic.APITimeoutError as e: - raise HTTPException( - status_code=408, - detail=f"Claude API timeout during HTML streaming: {str(e)}" - ) - except anthropic.APIConnectionError as e: - raise HTTPException( - status_code=503, - detail=f"Claude API connection error during HTML streaming: {str(e)}" - ) - except anthropic.APIError as e: - raise HTTPException( - status_code=500, - detail=f"Anthropic API error during HTML generation: {str(e)}" - ) + except Exception as e: + # Handle various API errors + error_msg = str(e) + if "timeout" in error_msg.lower(): + raise HTTPException( + status_code=408, + detail=f"Google Gen AI API timeout during HTML streaming: 
{error_msg}" + ) + elif "connection" in error_msg.lower(): + raise HTTPException( + status_code=503, + detail=f"Google Gen AI API connection error during HTML streaming: {error_msg}" + ) + else: + raise HTTPException( + status_code=500, + detail=f"Google Gen AI API error during HTML generation: {error_msg}" + ) async def generate_react_component_from_html(html_content: str, api_key: str) -> str: """ - Convert HTML content to TSX React component using Anthropic Claude API. + Convert HTML content to TSX React component using Google Gen AI API. Args: html_content: Generated HTML content - api_key: Anthropic API key + api_key: Google Gen AI API key Returns: Generated TSX React component code as string @@ -211,63 +197,44 @@ async def generate_react_component_from_html(html_content: str, api_key: str) -> HTTPException: If API call fails or no content is generated """ try: - # Initialize Anthropic client - client = anthropic.Anthropic(api_key=api_key) + # Initialize Google Gen AI client + client = genai.Client(api_key=api_key) - print("Starting streaming request to Claude for React component generation...") + print("Starting streaming request to Google Gen AI for React component generation...") react_content = "" - thinking_content = "" - with client.messages.stream( - model="claude-sonnet-4-20250514", - max_tokens=64000, - temperature=1, - system=HTML_TO_REACT_SYSTEM_PROMPT, - messages=[ - { - "role": "user", - "content": [ - { - "type": "text", - "text": html_content - } - ] - } - ], - thinking={ - "type": "enabled", - "budget_tokens": 25000 - } - ) as stream: - print("Streaming started, collecting React component response...") - - # Collect all streamed content - for event in stream: - if event.type == "content_block_delta": - if event.delta.type == "thinking_delta": - thinking_content += event.delta.thinking - print(f"[REACT THINKING] {event.delta.thinking}", end="", flush=True) - elif event.delta.type == "text_delta": - react_content += event.delta.text - 
print(f"[REACT] {event.delta.text}", end="", flush=True) - elif event.type == "content_block_start": - if hasattr(event.content_block, 'type'): - print(f"\n[REACT BLOCK START] {event.content_block.type}") - elif event.type == "content_block_stop": - print(f"\n[REACT BLOCK STOP] Index: {event.index}") - elif event.type == "message_start": - print("[REACT MESSAGE START]") - elif event.type == "message_stop": - print("\n[REACT MESSAGE STOP] - Streaming complete") + # Create content with text + contents = types.Part.from_text(text=html_content) + + # Generate content config with thinking enabled + generate_content_config = types.GenerateContentConfig( + system_instruction=HTML_TO_REACT_SYSTEM_PROMPT, + max_output_tokens=65536, + temperature=1.0, + thinking_config=types.ThinkingConfig( + thinking_budget=15000, + ), + ) + + print("Streaming started, collecting React component response...") + + # Stream the response + for chunk in client.models.generate_content_stream( + model="gemini-2.5-pro", + contents=contents, + config=generate_content_config, + ): + if chunk.text: + react_content += chunk.text + print(f"[REACT] {chunk.text}", end="", flush=True) print(f"\nCollected React content length: {len(react_content)}") - print(f"Collected React thinking content length: {len(thinking_content)}") if not react_content: raise HTTPException( status_code=500, - detail="No React component generated by Claude API" + detail="No React component generated by Google Gen AI API" ) # Filter out lines that start with import or export @@ -282,26 +249,29 @@ async def generate_react_component_from_html(html_content: str, api_key: str) -> return filtered_react_content - except anthropic.APITimeoutError as e: - raise HTTPException( - status_code=408, - detail=f"Claude API timeout during React generation: {str(e)}" - ) - except anthropic.APIConnectionError as e: - raise HTTPException( - status_code=503, - detail=f"Claude API connection error during React generation: {str(e)}" - ) - except 
anthropic.APIError as e: - raise HTTPException( - status_code=500, - detail=f"Anthropic API error during React generation: {str(e)}" - ) + except Exception as e: + # Handle various API errors + error_msg = str(e) + if "timeout" in error_msg.lower(): + raise HTTPException( + status_code=408, + detail=f"Google Gen AI API timeout during React generation: {error_msg}" + ) + elif "connection" in error_msg.lower(): + raise HTTPException( + status_code=503, + detail=f"Google Gen AI API connection error during React generation: {error_msg}" + ) + else: + raise HTTPException( + status_code=500, + detail=f"Google Gen AI API error during React generation: {error_msg}" + ) async def edit_html_with_images(current_ui_base64: str, sketch_base64: Optional[str], media_type: str, html_content: str, prompt: str, api_key: str) -> str: """ - Edit HTML content based on one or two images and a text prompt using Anthropic Claude API. + Edit HTML content based on one or two images and a text prompt using Google Gen AI API. 
Args: current_ui_base64: Base64 encoded current UI image data @@ -309,7 +279,7 @@ async def edit_html_with_images(current_ui_base64: str, sketch_base64: Optional[ media_type: MIME type of the images (e.g., 'image/png') html_content: Current HTML content to edit prompt: Text prompt describing the changes - api_key: Anthropic API key + api_key: Google Gen AI API key Returns: Edited HTML content as string @@ -318,104 +288,85 @@ async def edit_html_with_images(current_ui_base64: str, sketch_base64: Optional[ HTTPException: If API call fails or no content is generated """ try: - # Initialize Anthropic client - client = anthropic.Anthropic(api_key=api_key) + # Initialize Google Gen AI client + client = genai.Client(api_key=api_key) - print("Starting streaming request to Claude for HTML editing...") + print("Starting streaming request to Google Gen AI for HTML editing...") edited_html = "" - thinking_content = "" + + # Convert base64 images to bytes + current_ui_bytes = base64.b64decode(current_ui_base64) # Build content array - always include text and current UI image - content = [ - { - "type": "text", - "text": f"Current HTML to edit:\n\n{html_content}\n\nText prompt for changes: {prompt}" - }, - { - "type": "image", - "source": { - "type": "base64", - "media_type": media_type, - "data": current_ui_base64 - } - } + contents = [ + types.Part.from_text(text=f"Current HTML to edit:\n\n{html_content}\n\nText prompt for changes: {prompt}"), + types.Part.from_bytes( + mime_type=media_type, + data=current_ui_bytes, + ) ] # Only add sketch image if provided if sketch_base64: - content.append({ - "type": "image", - "source": { - "type": "base64", - "media_type": media_type, - "data": sketch_base64 - } - }) + sketch_bytes = base64.b64decode(sketch_base64) + contents.append( + types.Part.from_bytes( + mime_type=media_type, + data=sketch_bytes, + ) + ) - with client.messages.stream( - model="claude-sonnet-4-20250514", - max_tokens=64000, - temperature=1, - 
system=HTML_EDIT_SYSTEM_PROMPT, - messages=[ - { - "role": "user", - "content": content - } - ], - thinking={ - "type": "enabled", - "budget_tokens": 16000 - } - ) as stream: - print("Streaming started, collecting edited HTML response...") - - # Collect all streamed content - for event in stream: - if event.type == "content_block_delta": - if event.delta.type == "thinking_delta": - thinking_content += event.delta.thinking - print(f"[HTML EDIT THINKING] {event.delta.thinking}", end="", flush=True) - elif event.delta.type == "text_delta": - edited_html += event.delta.text - print(f"[HTML EDIT] {event.delta.text}", end="", flush=True) - elif event.type == "content_block_start": - if hasattr(event.content_block, 'type'): - print(f"\n[HTML EDIT BLOCK START] {event.content_block.type}") - elif event.type == "content_block_stop": - print(f"\n[HTML EDIT BLOCK STOP] Index: {event.index}") - elif event.type == "message_start": - print("[HTML EDIT MESSAGE START]") - elif event.type == "message_stop": - print("\n[HTML EDIT MESSAGE STOP] - Streaming complete") + # Generate content config with thinking enabled + generate_content_config = types.GenerateContentConfig( + system_instruction=HTML_EDIT_SYSTEM_PROMPT, + max_output_tokens=65536, + temperature=1.0, + thinking_config=types.ThinkingConfig( + thinking_budget=16000, + ), + ) + + print("Streaming started, collecting edited HTML response...") + + # Stream the response + for chunk in client.models.generate_content_stream( + model="gemini-2.5-pro", + contents=contents, + config=generate_content_config, + ): + if chunk.text: + edited_html += chunk.text + print(f"[HTML EDIT] {chunk.text}", end="", flush=True) print(f"\nCollected edited HTML content length: {len(edited_html)}") - print(f"Collected HTML edit thinking content length: {len(thinking_content)}") if not edited_html: raise HTTPException( status_code=500, - detail="No edited HTML content generated by Claude API" + detail="No edited HTML content generated by Google Gen AI 
API" ) return edited_html - except anthropic.APITimeoutError as e: - raise HTTPException( - status_code=408, - detail=f"Claude API timeout during HTML editing: {str(e)}" - ) - except anthropic.APIConnectionError as e: - raise HTTPException( - status_code=503, - detail=f"Claude API connection error during HTML editing: {str(e)}" - ) - except anthropic.APIError as e: - raise HTTPException( - status_code=500, - detail=f"Anthropic API error during HTML editing: {str(e)}" - ) + except Exception as e: + # Handle various API errors + error_msg = str(e) + if "timeout" in error_msg.lower(): + raise HTTPException( + status_code=408, + detail=f"Google Gen AI API timeout during HTML editing: {error_msg}" + ) + elif "connection" in error_msg.lower(): + raise HTTPException( + status_code=503, + detail=f"Google Gen AI API connection error during HTML editing: {error_msg}" + ) + else: + raise HTTPException( + status_code=500, + detail=f"Google Gen AI API error during HTML editing: {error_msg}" + ) # ENDPOINT 1: Slide to HTML conversion @@ -431,12 +382,12 @@ async def convert_slide_to_html(request: SlideToHtmlRequest): SlideToHtmlResponse with generated HTML """ try: - # Get Anthropic API key from environment - api_key = os.getenv("ANTHROPIC_API_KEY") + # Get Google Gen AI API key from environment + api_key = os.getenv("GOOGLE_API_KEY") if not api_key: raise HTTPException( status_code=500, - detail="ANTHROPIC_API_KEY environment variable not set" + detail="GOOGLE_API_KEY environment variable not set" ) # Resolve image path to actual file system path @@ -521,12 +472,12 @@ async def convert_html_to_react(request: HtmlToReactRequest): HtmlToReactResponse with generated React component """ try: - # Get Anthropic API key from environment - api_key = os.getenv("ANTHROPIC_API_KEY") + # Get Google Gen AI API key from environment + api_key = os.getenv("GOOGLE_API_KEY") if not api_key: raise HTTPException( status_code=500, - detail="ANTHROPIC_API_KEY environment variable not set" + 
detail="GOOGLE_API_KEY environment variable not set" ) # Validate HTML content @@ -583,12 +534,12 @@ async def edit_html_with_images_endpoint( HtmlEditResponse with edited HTML """ try: - # Get Anthropic API key from environment - api_key = os.getenv("ANTHROPIC_API_KEY") + # Get Google Gen AI API key from environment + api_key = os.getenv("GOOGLE_API_KEY") if not api_key: raise HTTPException( status_code=500, - detail="ANTHROPIC_API_KEY environment variable not set" + detail="GOOGLE_API_KEY environment variable not set" ) # Validate inputs