# volt-newsroom-scraper-report/summarizer.py

"""
AI summarization using Claude API.
Generates a concise title, 1-2 bullet points, and a "So What" implication statement for each article/post.
"""

from anthropic import Anthropic
from config import Config


class ContentSummarizer:
    """Summarize scraped articles and social posts with the Claude API.

    Every summary dictionary returned by ``summarize_content`` carries the
    same keys ('title', 'bullets', 'so_what', 'url', 'screenshot_path'),
    including the fallbacks used for empty content and for API failures, so
    callers can read them without checking for missing keys.
    """

    # Labels of the response format requested by the prompt and recognised
    # by the parser.
    _TITLE_LABEL = 'TITLE:'
    _BULLETS_LABEL = 'BULLETS:'
    _SO_WHAT_LABEL = 'SO WHAT:'

    # Only this many leading characters of an item are sent to the model.
    MAX_CONTENT_CHARS = 3000
    # Upper bound on bullets kept per summary (the prompt asks for 1-2).
    MAX_BULLETS = 2
    # Token budget for the model's reply.
    MAX_RESPONSE_TOKENS = 500

    def __init__(self):
        """Initialize Claude API client."""
        self.client = Anthropic(api_key=Config.ANTHROPIC_API_KEY)
        # Pinned Claude Sonnet 4.5 snapshot.
        self.model = "claude-sonnet-4-5-20250929"

    def summarize_content(self, content_item):
        """
        Summarize a single content item into a title, 1-2 bullet points and
        a "So What" statement.

        Never raises for API or parsing problems: any exception is printed
        and a placeholder summary is returned instead.

        Args:
            content_item (dict): Content to summarize with keys:
                - url (str)
                - title (str, optional)
                - content (str)
                - platform (str, optional for social media)
                - screenshot_path (optional, passed through unchanged)

        Returns:
            dict: Summarized content
                {
                    'title': str,
                    'bullets': list of str,
                    'so_what': str,
                    'url': str,
                    'screenshot_path': value from content_item or None
                }
        """
        # `or ''` also normalises keys that are present but set to None.
        content = content_item.get('content') or ''
        existing_title = content_item.get('title') or ''
        url = content_item.get('url', '')
        platform = content_item.get('platform') or ''
        screenshot_path = content_item.get('screenshot_path')

        # Handle empty content without spending an API call. The result has
        # the same keys as a real summary so downstream code stays uniform.
        if not isinstance(content, str) or not content.strip():
            return {
                'title': existing_title or 'Content unavailable',
                'bullets': ['Unable to extract content from this source.'],
                'so_what': 'Unable to determine implications.',
                'url': url,
                'screenshot_path': screenshot_path
            }

        try:
            prompt = self._create_summarization_prompt(content, existing_title, platform)

            message = self.client.messages.create(
                model=self.model,
                max_tokens=self.MAX_RESPONSE_TOKENS,
                messages=[
                    {"role": "user", "content": prompt}
                ]
            )

            response_text = message.content[0].text
            summary = self._parse_summary_response(response_text, existing_title)

            return {
                'title': summary['title'],
                'bullets': summary['bullets'],
                'so_what': summary['so_what'],
                'url': url,
                'screenshot_path': screenshot_path
            }

        except Exception as e:
            # Best-effort boundary: one failing item must not abort a batch.
            print(f"Error summarizing content for {url}: {e}")
            # Fall back to the existing title with placeholder text.
            return {
                'title': existing_title or 'Summary unavailable',
                'bullets': [
                    'Error generating summary.'
                ],
                'so_what': 'Unable to determine implications.',
                'url': url,
                'screenshot_path': screenshot_path
            }

    def summarize_batch(self, content_items):
        """
        Summarize multiple content items sequentially.

        Args:
            content_items (list): List of content dictionaries

        Returns:
            list: List of summarized content dictionaries, in input order
        """
        total = len(content_items)
        print(f"Summarizing {total} items with Claude API...")

        summaries = []
        for i, item in enumerate(content_items, 1):
            print(f"  Summarizing item {i}/{total}...")
            summaries.append(self.summarize_content(item))

        print(f"Completed {len(summaries)} summaries")
        return summaries

    def _create_summarization_prompt(self, content, existing_title, platform):
        """
        Create prompt for Claude API to summarize content.

        Args:
            content (str): Content to summarize; only the first
                MAX_CONTENT_CHARS characters are included
            existing_title (str): Existing title if available
            platform (str): Platform name for social media

        Returns:
            str: Formatted prompt
        """
        platform_context = f" from {platform}" if platform else ""
        title_line = 'Existing title: ' + existing_title if existing_title else ''
        excerpt = content[:self.MAX_CONTENT_CHARS]

        prompt = f"""You are summarizing content{platform_context} for a daily newsroom report for Molson Coors beverage company.

Please provide:
1. A clear, engaging title (if the existing title isn't good, create a better one)
2. 1-2 concise bullet points highlighting the key information
3. A brief "So What" statement explaining implications for Molson Coors and their beverage brands

Focus on actionable insights relevant to the beverage/alcohol industry. Be concise and professional.

{title_line}

Content to summarize:
{excerpt}

Please respond in this exact format:
TITLE: [your title here]
BULLETS:
- [bullet point 1]
- [bullet point 2 if needed]
SO WHAT: [1-2 sentence implication for Molson Coors - how this impacts their business, brands, or strategy]"""

        return prompt

    def _parse_summary_response(self, response_text, fallback_title):
        """
        Parse Claude's response into structured format.

        Lines are matched against the TITLE: / BULLETS: / SO WHAT: labels
        requested by the prompt. Unrecognised lines are ignored, except that
        lines directly following "SO WHAT:" (up to the next blank line) are
        treated as a continuation of that statement.

        Args:
            response_text (str): Response from Claude; a non-string value
                is treated as an empty response
            fallback_title (str): Fallback title if parsing fails

        Returns:
            dict: Parsed summary with 'title', 'bullets', and 'so_what' keys
                (all three are always present)
        """
        if not isinstance(response_text, str):
            response_text = ''

        title = fallback_title
        bullets = []
        so_what = ''
        # Section currently being read: None, 'bullets' or 'so_what'.
        section = None

        for raw_line in response_text.strip().split('\n'):
            line = raw_line.strip()

            if not line:
                # A blank line ends a wrapped "SO WHAT" statement; blank
                # lines between bullets are simply skipped.
                if section == 'so_what':
                    section = None
                continue

            if line.startswith(self._TITLE_LABEL):
                # Slice off only the leading label so a title that itself
                # contains "TITLE:" is left intact.
                title = line[len(self._TITLE_LABEL):].strip()
                section = None

            elif line.startswith(self._BULLETS_LABEL):
                section = 'bullets'

            elif line.startswith(self._SO_WHAT_LABEL):
                so_what = line[len(self._SO_WHAT_LABEL):].strip()
                section = 'so_what'

            elif section == 'bullets' and line.startswith(('-', '•')):
                bullet = line.lstrip('- •').strip()
                if bullet:
                    bullets.append(bullet)

            elif section == 'so_what':
                # The statement wrapped onto another line.
                so_what = f"{so_what} {line}".strip()

        # Ensure we have at least one bullet
        if not bullets:
            bullets = ['Summary could not be generated.']

        return {
            'title': title or fallback_title or 'Untitled',
            'bullets': bullets[:self.MAX_BULLETS],
            'so_what': so_what or 'No specific implications identified.'
        }