#!/usr/bin/env python3
"""
OpenAI Assistant Data Extractor

This script extracts data from OpenAI assistants including:
- Assistant name and ID
- System instructions
- Attached vector stores and their IDs
- JSON schemas from function tools
- Response format schemas
- Exports data to CSV format
"""

import os
import csv
import json
from typing import List, Dict, Any, Optional

try:
    from openai import OpenAI
except ImportError:
    # Deferred failure: the pure data-shaping helpers below stay importable
    # without the SDK; AssistantExtractor.__init__ raises if it is missing.
    OpenAI = None


# Column order of the exported CSV file.
CSV_FIELDNAMES = [
    'assistant_id',
    'assistant_name',
    'system_instructions',
    'vector_store_ids',
    'vector_store_names',
    'function_tools',
    'function_schemas',
    'response_format_schema_file',
    'model',
    'created_at',
]


class AssistantExtractor:
    """Collect assistant metadata through the OpenAI API and export it."""

    def __init__(self, api_key: Optional[str] = None):
        """Initialize the extractor with OpenAI API key.

        Args:
            api_key: Key passed to the client; when falsy, the
                ``OPENAI_API_KEY`` environment variable is used instead.

        Raises:
            ImportError: If the ``openai`` package is not installed.
        """
        if OpenAI is None:
            raise ImportError(
                "The 'openai' package is required. Install it with: pip install openai"
            )
        self.client = OpenAI(api_key=api_key or os.getenv('OPENAI_API_KEY'))
        # Not read anywhere in this file; kept so existing users of the
        # attribute keep working.
        self.schema_counter = 0

    def list_all_assistants(self) -> List[Any]:
        """List all assistants in the organization.

        Pages through ``client.beta.assistants.list`` 100 items at a time,
        using the last item's id as the ``after`` cursor.

        Returns:
            The assistant objects gathered so far. On an API error the error
            is printed and whatever was fetched before it is returned.
        """
        assistants: List[Any] = []
        cursor = None
        try:
            while True:
                params: Dict[str, Any] = {'limit': 100}
                if cursor is not None:
                    params['after'] = cursor
                page = self.client.beta.assistants.list(**params)
                assistants.extend(page.data)
                # Stop on the last page, and also on an empty page so that
                # page.data[-1] can never raise IndexError.
                if not getattr(page, 'has_more', False) or not page.data:
                    break
                cursor = page.data[-1].id
        except Exception as e:  # API boundary: report and return partial data
            print(f"Error listing assistants: {e}")
        return assistants

    @staticmethod
    def _vector_store_ids(assistant) -> List[str]:
        """Return the assistant's vector store IDs, de-duplicated, in first-seen order."""
        found: List[str] = []
        for tool in getattr(assistant, 'tools', None) or []:
            tool_file_search = getattr(tool, 'file_search', None)
            if tool_file_search:
                found.extend(getattr(tool_file_search, 'vector_store_ids', None) or [])
        resources = getattr(assistant, 'tool_resources', None)
        resource_file_search = getattr(resources, 'file_search', None) if resources else None
        if resource_file_search:
            found.extend(getattr(resource_file_search, 'vector_store_ids', None) or [])
        # The same store may be reachable through both paths; dict.fromkeys
        # drops repeats while keeping order.
        return list(dict.fromkeys(found))

    @staticmethod
    def _response_schema(response_format) -> Optional[Any]:
        """Return the JSON schema payload of a response format, or None.

        Accepts attribute-style objects as well as plain dicts. String
        values (e.g. ``'auto'``) and formats without a schema yield None.
        """
        if not response_format or isinstance(response_format, str):
            return None
        if isinstance(response_format, dict):
            json_schema = response_format.get('json_schema')
        else:
            json_schema = getattr(response_format, 'json_schema', None)
        if not json_schema:
            return None
        if isinstance(json_schema, dict):
            return json_schema.get('schema') or None
        # NOTE(review): SDK response models are presumably pydantic models
        # where `.schema` is a method and the payload lives under `schema_`
        # -- confirm against the installed openai version. Callables are
        # skipped so a method is never mistaken for the schema itself.
        for attr in ('schema_', 'schema'):
            candidate = getattr(json_schema, attr, None)
            if candidate and not callable(candidate):
                return candidate
        return None

    def extract_assistant_data(self, assistant) -> Dict[str, Any]:
        """Extract relevant data from an assistant object.

        Args:
            assistant: Object exposing ``id``, ``name``, ``instructions``,
                ``model``, ``created_at`` and optionally ``tools``,
                ``tool_resources`` and ``response_format``.

        Returns:
            A flat dict of CSV-ready values. Empty collections are rendered
            as the string ``'None'``.
        """
        function_tools: List[str] = []
        json_schemas: List[str] = []
        for tool in getattr(assistant, 'tools', None) or []:
            if getattr(tool, 'file_search', None):
                # file_search tools carry no function definition
                continue
            function = getattr(tool, 'function', None)
            if not function:
                continue
            function_tools.append(function.name)
            parameters = getattr(function, 'parameters', None)
            if parameters:
                schema_str = json.dumps(parameters, separators=(',', ':'))
                json_schemas.append(f"{function.name}: {schema_str}")

        vector_store_ids = self._vector_store_ids(assistant)

        response_format_schema_ref = 'None'
        if self._response_schema(getattr(assistant, 'response_format', None)):
            response_format_schema_ref = f"response_format_schema_{assistant.id}.json"

        return {
            'assistant_id': assistant.id,
            'assistant_name': assistant.name or 'Unnamed Assistant',
            'system_instructions': assistant.instructions or '',
            'vector_store_ids': ', '.join(vector_store_ids) if vector_store_ids else 'None',
            'function_tools': ', '.join(function_tools) if function_tools else 'None',
            'function_schemas': ' | '.join(json_schemas) if json_schemas else 'None',
            'response_format_schema_file': response_format_schema_ref,
            'model': assistant.model,
            'created_at': assistant.created_at,
        }

    def get_vector_store_details(self, vector_store_id: str) -> Dict[str, Any]:
        """Get details about a specific vector store.

        Returns:
            A dict with ``id``, ``name`` and ``file_counts``. On any failure
            the error is printed and a placeholder dict is returned.
        """
        try:
            # NOTE(review): newer SDK releases are believed to expose vector
            # stores at client.vector_stores rather than under client.beta;
            # try the top-level attribute first, then the beta one.
            stores = getattr(self.client, 'vector_stores', None)
            if stores is None:
                stores = self.client.beta.vector_stores
            vector_store = stores.retrieve(vector_store_id)
            return {
                'id': vector_store.id,
                'name': vector_store.name or 'Unnamed Store',
                'file_counts': getattr(vector_store, 'file_counts', {}),
            }
        except Exception as e:  # API boundary: keep the export going
            print(f"Error retrieving vector store {vector_store_id}: {e}")
            return {'id': vector_store_id, 'name': 'Error retrieving', 'file_counts': {}}

    def _save_function_schemas(self, assistant_id: str, data: Dict[str, Any]) -> None:
        """Write the joined function schemas to a file and store its name in ``data``."""
        if data['function_schemas'] == 'None':
            return
        function_schema_filename = f"function_schemas_{assistant_id}.txt"
        try:
            with open(function_schema_filename, 'w', encoding='utf-8') as f:
                f.write(data['function_schemas'])
            data['function_schemas'] = function_schema_filename
        except (OSError, ValueError) as e:
            print(f"Error saving function schemas: {e}")
            data['function_schemas'] = f"Error: {str(e)}"

    def _save_response_schema(self, assistant, data: Dict[str, Any]) -> None:
        """Write the response format schema to the file named in ``data``."""
        schema_filename = data['response_format_schema_file']
        if schema_filename == 'None':
            return
        try:
            schema_data = self._response_schema(getattr(assistant, 'response_format', None))
            with open(schema_filename, 'w', encoding='utf-8') as f:
                if isinstance(schema_data, (dict, list)):
                    json.dump(schema_data, f, indent=2)
                else:
                    f.write(str(schema_data))
        except (OSError, TypeError, ValueError) as e:
            print(f"Error saving response format schema: {e}")
            data['response_format_schema_file'] = f"Error: {str(e)}"

    def extract_all_data(self) -> List[Dict[str, Any]]:
        """Extract all assistant data including vector store details.

        For every assistant this writes the function schemas and the
        response format schema (when present) to files in the current
        working directory, and looks up the name of each attached vector
        store.

        Returns:
            One dict per assistant, ready for ``export_to_csv``.
        """
        extracted_data = []
        for assistant in self.list_all_assistants():
            data = self.extract_assistant_data(assistant)

            self._save_function_schemas(assistant.id, data)
            self._save_response_schema(assistant, data)

            # Work from the ID list itself rather than re-splitting the
            # comma-joined display string.
            store_ids = self._vector_store_ids(assistant)
            if store_ids:
                store_details = []
                for store_id in store_ids:
                    store_info = self.get_vector_store_details(store_id)
                    store_details.append(f"{store_info['name']} ({store_id})")
                data['vector_store_names'] = ', '.join(store_details)
            else:
                data['vector_store_names'] = 'None'

            extracted_data.append(data)
        return extracted_data

    def export_to_csv(self, data: List[Dict[str, Any]], filename: str = 'assistants_data.csv'):
        """Export extracted data to CSV file.

        Args:
            data: Rows as produced by ``extract_all_data``. When empty, a
                notice is printed and no file is written.
            filename: Destination path of the CSV file.
        """
        if not data:
            print("No data to export")
            return

        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=CSV_FIELDNAMES)
            writer.writeheader()
            writer.writerows(data)

        print(f"Data exported to {filename}")


def main():
    """Main function to run the assistant extractor."""
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        print("Please set your OPENAI_API_KEY environment variable")
        return

    extractor = AssistantExtractor(api_key)

    print("Extracting assistant data...")
    data = extractor.extract_all_data()

    if not data:
        print("No assistants found")
        return

    print(f"Found {len(data)} assistants")
    extractor.export_to_csv(data)

    print("\nSummary:")
    for item in data:
        print(f"- {item['assistant_name']} ({item['assistant_id']})")
        if item['vector_store_names'] != 'None':
            print(f" Vector Stores: {item['vector_store_names']}")
        if item['function_tools'] != 'None':
            print(f" Function Tools: {item['function_tools']}")
        if item['function_schemas'] != 'None':
            print(" Function Schemas: Yes")
        if item['response_format_schema_file'] != 'None':
            print(f" Response Format Schema: {item['response_format_schema_file']}")


if __name__ == "__main__":
    main()