# assistant-extractor/assistant_extractor.py

#!/usr/bin/env python3
"""
OpenAI Assistant Data Extractor

This script extracts data from OpenAI assistants including:
- Assistant name and ID
- System instructions
- Attached vector stores and their IDs
- JSON schemas from function tools
- Response format schemas
- Exports data to CSV format
"""

import os
import csv
import json
from typing import List, Dict, Any
from openai import OpenAI


class AssistantExtractor:
    """Collect configuration data from OpenAI assistants and export it to CSV.

    For every assistant the extractor records its id, name, instructions,
    model, creation timestamp, attached vector stores, function tools and
    their parameter schemas, and any response-format JSON schema. Function
    schemas and response-format schemas are written to side files in the
    current working directory; the CSV row stores the file name.

    Fields with no value hold the literal string ``'None'`` (not the ``None``
    object); ``main`` and the CSV consumers compare against that string.
    """

    def __init__(self, api_key: str = None):
        """Initialize the extractor with an OpenAI API key.

        Args:
            api_key: Key used to build the client. When falsy, the
                ``OPENAI_API_KEY`` environment variable is used instead.
        """
        self.client = OpenAI(api_key=api_key or os.getenv('OPENAI_API_KEY'))
        # Not read anywhere in this class; kept so callers that touch the
        # attribute keep working.
        self.schema_counter = 0

    def list_all_assistants(self) -> List[Any]:
        """Return every assistant visible to the client, following pagination.

        Returns:
            The assistant objects as returned by the SDK. On an API error the
            error is printed and whatever was collected so far is returned
            (best effort), so the result may be empty or partial.
        """
        assistants = []
        try:
            page = self.client.beta.assistants.list(limit=100)
            assistants.extend(page.data)

            # Cursor-based pagination: the next page starts after the last id
            # seen. An empty page ends the loop instead of indexing data[-1].
            while page.has_more and page.data:
                page = self.client.beta.assistants.list(
                    limit=100,
                    after=page.data[-1].id,
                )
                assistants.extend(page.data)

        except Exception as e:
            print(f"Error listing assistants: {e}")

        return assistants

    @staticmethod
    def _response_format_schema(assistant) -> Any:
        """Return the JSON schema attached to ``assistant.response_format``.

        Returns ``None`` when the assistant has no response format, when the
        format carries no ``json_schema`` (e.g. the plain string ``"auto"``),
        or when the schema payload is empty.

        NOTE(review): in the openai SDK's pydantic models the payload is
        believed to live in ``schema_`` (aliased to "schema"), while
        ``.schema`` on such a model is pydantic's own model-schema method —
        confirm against the installed SDK version. For that reason a callable
        ``schema`` attribute is never treated as schema data.
        """
        response_format = getattr(assistant, 'response_format', None)
        json_schema = getattr(response_format, 'json_schema', None)
        if not json_schema:
            return None

        if isinstance(json_schema, dict):
            return json_schema.get('schema') or None

        if hasattr(json_schema, 'schema_'):
            return json_schema.schema_ or None

        schema = getattr(json_schema, 'schema', None)
        if callable(schema):
            return None
        return schema or None

    def extract_assistant_data(self, assistant) -> Dict[str, Any]:
        """Flatten one assistant object into a CSV-ready dict.

        Args:
            assistant: Object exposing ``id``, ``name``, ``instructions``,
                ``model``, ``created_at`` and ``tools``; ``tool_resources``
                and ``response_format`` are optional.

        Returns:
            Dict with the keys ``assistant_id``, ``assistant_name``,
            ``system_instructions``, ``vector_store_ids``, ``function_tools``,
            ``function_schemas``, ``response_format_schema_file``, ``model``
            and ``created_at``. List-like fields are joined into one string;
            empty ones hold the string ``'None'``.
        """
        vector_store_ids = []
        json_schemas = []
        function_tools = []

        for tool in assistant.tools or []:
            file_search = getattr(tool, 'file_search', None)
            function = getattr(tool, 'function', None)

            if file_search:
                vector_store_ids.extend(
                    getattr(file_search, 'vector_store_ids', None) or []
                )
            elif function:
                function_tools.append(function.name)
                parameters = getattr(function, 'parameters', None)
                if parameters:
                    # Compact separators keep each schema on one short line.
                    schema_str = json.dumps(parameters, separators=(',', ':'))
                    json_schemas.append(f"{function.name}: {schema_str}")

        # Vector stores may also be attached through tool_resources.
        tool_resources = getattr(assistant, 'tool_resources', None)
        resource_search = getattr(tool_resources, 'file_search', None)
        if resource_search:
            vector_store_ids.extend(
                getattr(resource_search, 'vector_store_ids', None) or []
            )

        # The same store can be listed in both places; keep first-seen order.
        vector_store_ids = list(dict.fromkeys(vector_store_ids))

        response_format_schema_ref = 'None'
        if self._response_format_schema(assistant) is not None:
            response_format_schema_ref = f"response_format_schema_{assistant.id}.json"

        return {
            'assistant_id': assistant.id,
            'assistant_name': assistant.name or 'Unnamed Assistant',
            'system_instructions': assistant.instructions or '',
            'vector_store_ids': ', '.join(vector_store_ids) if vector_store_ids else 'None',
            'function_tools': ', '.join(function_tools) if function_tools else 'None',
            'function_schemas': ' | '.join(json_schemas) if json_schemas else 'None',
            'response_format_schema_file': response_format_schema_ref,
            'model': assistant.model,
            'created_at': assistant.created_at,
        }

    def get_vector_store_details(self, vector_store_id: str) -> Dict[str, Any]:
        """Get details about a specific vector store.

        Args:
            vector_store_id: Id of the store to look up.

        Returns:
            Dict with ``id``, ``name`` and ``file_counts``. If the lookup
            fails the error is printed and a placeholder dict with the name
            ``'Error retrieving'`` is returned instead of raising.
        """
        try:
            # NOTE(review): newer openai SDK releases are believed to expose
            # vector stores at client.vector_stores, older ones only under
            # client.beta — confirm. Prefer the former, fall back to beta.
            stores_api = getattr(self.client, 'vector_stores', None)
            if stores_api is None:
                stores_api = self.client.beta.vector_stores

            vector_store = stores_api.retrieve(vector_store_id)
            return {
                'id': vector_store.id,
                'name': vector_store.name or 'Unnamed Store',
                'file_counts': getattr(vector_store, 'file_counts', {}),
            }
        except Exception as e:
            print(f"Error retrieving vector store {vector_store_id}: {e}")
            return {'id': vector_store_id, 'name': 'Error retrieving', 'file_counts': {}}

    def _save_function_schemas(self, assistant, data: Dict[str, Any]) -> None:
        """Write the joined function schemas to a file; store its name in ``data``."""
        if data['function_schemas'] == 'None':
            return

        filename = f"function_schemas_{assistant.id}.txt"
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(data['function_schemas'])
            data['function_schemas'] = filename
        except Exception as e:
            print(f"Error saving function schemas: {e}")
            data['function_schemas'] = f"Error: {str(e)}"

    def _save_response_format_schema(self, assistant, data: Dict[str, Any]) -> None:
        """Write the response-format schema to the file named in ``data``."""
        filename = data['response_format_schema_file']
        if filename == 'None':
            return

        try:
            schema_data = self._response_format_schema(assistant)
            with open(filename, 'w', encoding='utf-8') as f:
                if isinstance(schema_data, (dict, list)):
                    json.dump(schema_data, f, indent=2)
                else:
                    f.write(str(schema_data))
        except Exception as e:
            print(f"Error saving response format schema: {e}")
            data['response_format_schema_file'] = f"Error: {str(e)}"

    def _describe_vector_stores(self, joined_ids: str, cache: Dict[str, Any]) -> str:
        """Turn a ``', '``-joined id string into ``'name (id), ...'`` labels."""
        if joined_ids == 'None':
            return 'None'

        labels = []
        for store_id in joined_ids.split(', '):
            # Assistants often share stores; look each one up only once.
            if store_id not in cache:
                cache[store_id] = self.get_vector_store_details(store_id)
            labels.append(f"{cache[store_id]['name']} ({store_id})")
        return ', '.join(labels)

    def extract_all_data(self) -> List[Dict[str, Any]]:
        """Extract all assistant data including vector store details.

        Side effects: writes ``function_schemas_<id>.txt`` and
        ``response_format_schema_<id>.json`` files into the current working
        directory for assistants that have such schemas.

        Returns:
            One dict per assistant, shaped like the result of
            ``extract_assistant_data`` plus a ``vector_store_names`` key.
            ``function_schemas`` and ``response_format_schema_file`` hold the
            written file name, ``'None'``, or ``'Error: ...'`` if saving
            failed.
        """
        extracted_data = []
        store_cache: Dict[str, Any] = {}

        for assistant in self.list_all_assistants():
            data = self.extract_assistant_data(assistant)
            self._save_function_schemas(assistant, data)
            self._save_response_format_schema(assistant, data)
            data['vector_store_names'] = self._describe_vector_stores(
                data['vector_store_ids'], store_cache
            )
            extracted_data.append(data)

        return extracted_data

    def export_to_csv(self, data: List[Dict[str, Any]], filename: str = 'assistants_data.csv'):
        """Export extracted data to a CSV file.

        Args:
            data: Rows as produced by ``extract_all_data``. When empty, a
                message is printed and no file is written.
            filename: Destination path of the CSV file.
        """
        if not data:
            print("No data to export")
            return

        fieldnames = [
            'assistant_id',
            'assistant_name',
            'system_instructions',
            'vector_store_ids',
            'vector_store_names',
            'function_tools',
            'function_schemas',
            'response_format_schema_file',
            'model',
            'created_at',
        ]

        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)

        print(f"Data exported to {filename}")


def main():
    """Run the extractor end to end: check the key, extract, export, summarize.

    Reads the API key from the ``OPENAI_API_KEY`` environment variable and
    returns early with a message when it is not set. Otherwise extracts all
    assistants, writes them to CSV with the default file name and prints a
    per-assistant summary.
    """
    key = os.getenv('OPENAI_API_KEY')
    if not key:
        print("Please set your OPENAI_API_KEY environment variable")
        return

    extractor = AssistantExtractor(key)

    print("Extracting assistant data...")
    rows = extractor.extract_all_data()

    if not rows:
        print("No assistants found")
        return

    print(f"Found {len(rows)} assistants")
    extractor.export_to_csv(rows)

    # (row key, summary line builder); a line is printed only when the
    # field holds something other than the 'None' placeholder string.
    summary_lines = (
        ('vector_store_names', lambda value: f"  Vector Stores: {value}"),
        ('function_tools', lambda value: f"  Function Tools: {value}"),
        ('function_schemas', lambda value: "  Function Schemas: Yes"),
        ('response_format_schema_file', lambda value: f"  Response Format Schema: {value}"),
    )

    print("\nSummary:")
    for row in rows:
        print(f"- {row['assistant_name']} ({row['assistant_id']})")
        for field, render in summary_lines:
            value = row[field]
            if value != 'None':
                print(render(value))


# Run the extractor only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()