assistant-extractor/assistant_extractor.py
DJP ee960c544f Initial commit: OpenAI Assistant Data Extractor
- Add Python script to extract assistant data via OpenAI API
- Extract names, IDs, system instructions, and vector stores
- Support for function tool schemas and response format schemas
- Export to CSV with separate schema files
- Handle pagination and error cases

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-02 16:43:01 -04:00

232 lines
No EOL
9.7 KiB
Python

#!/usr/bin/env python3
"""
OpenAI Assistant Data Extractor
This script extracts data from OpenAI assistants including:
- Assistant name and ID
- System instructions
- Attached vector stores and their IDs
- JSON schemas from function tools
- Response format schemas
- Exports data to CSV format
"""
import csv
import json
import os
from typing import Any, Dict, List, Optional

from openai import OpenAI
class AssistantExtractor:
    """Collect configuration data for OpenAI assistants and export it to CSV.

    Function-tool schemas and response-format schemas are written to separate
    files in the current working directory; the exported rows hold the names
    of those files rather than the schema text.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Create the API client.

        Args:
            api_key: OpenAI API key. When omitted or empty, the value of the
                ``OPENAI_API_KEY`` environment variable is passed instead.
        """
        self.client = OpenAI(api_key=api_key or os.getenv('OPENAI_API_KEY'))
        # Kept for backward compatibility; no method of this class reads it.
        self.schema_counter = 0

    def list_all_assistants(self) -> List[Any]:
        """Return every assistant object yielded by the assistants list call.

        Pages of up to 100 assistants are fetched until ``has_more`` is false.
        Errors are reported on stdout and end the listing early, so the result
        may be partial (or empty) after a failure.
        """
        assistants: List[Any] = []
        try:
            page = self.client.beta.assistants.list(limit=100)
            assistants.extend(page.data)
            # An empty page offers no cursor to continue from, so stop there
            # instead of indexing data[-1] on an empty list.
            while page.has_more and page.data:
                page = self.client.beta.assistants.list(
                    limit=100,
                    after=page.data[-1].id,
                )
                assistants.extend(page.data)
        except Exception as e:  # best-effort boundary around the remote API
            print(f"Error listing assistants: {e}")
        return assistants

    @staticmethod
    def _response_schema(assistant) -> Any:
        """Return the JSON schema attached to the response format, or None.

        Handles both attribute-style objects and plain dicts. A string
        response format (or a missing one) yields None.
        """
        response_format = getattr(assistant, 'response_format', None)
        if not response_format or isinstance(response_format, str):
            return None

        if isinstance(response_format, dict):
            wrapper = response_format.get('json_schema')
        else:
            wrapper = getattr(response_format, 'json_schema', None)
        if not wrapper:
            return None

        if isinstance(wrapper, dict):
            return wrapper.get('schema') or None

        # NOTE(review): on the SDK's pydantic model, ``schema`` is believed to
        # be the inherited (deprecated) BaseModel.schema() method and the
        # user's schema to live in ``schema_`` -- confirm against the installed
        # SDK. Callables are skipped so that method is never mistaken for data.
        for attr in ('schema_', 'schema'):
            candidate = getattr(wrapper, attr, None)
            if candidate and not callable(candidate):
                return candidate
        return None

    def extract_assistant_data(self, assistant) -> Dict[str, Any]:
        """Flatten one assistant object into a CSV-ready dict.

        Args:
            assistant: Object exposing ``id``, ``name``, ``instructions``,
                ``model`` and ``created_at``; ``tools``, ``tool_resources``
                and ``response_format`` are read when present.

        Returns:
            A dict of strings (plus the raw ``model`` and ``created_at``
            values). List-valued fields are joined into one string and are
            the literal ``'None'`` when empty.
        """
        vector_store_ids: List[str] = []
        json_schemas: List[str] = []
        function_tools: List[str] = []

        for tool in getattr(assistant, 'tools', None) or []:
            file_search = getattr(tool, 'file_search', None)
            function = getattr(tool, 'function', None)
            if file_search:
                vector_store_ids.extend(
                    getattr(file_search, 'vector_store_ids', None) or []
                )
            elif function:
                function_tools.append(function.name)
                parameters = getattr(function, 'parameters', None)
                if parameters:
                    schema_str = json.dumps(parameters, separators=(',', ':'))
                    json_schemas.append(f"{function.name}: {schema_str}")

        # Vector stores can also be attached through tool_resources.
        tool_resources = getattr(assistant, 'tool_resources', None)
        resource_search = getattr(tool_resources, 'file_search', None)
        if resource_search:
            vector_store_ids.extend(
                getattr(resource_search, 'vector_store_ids', None) or []
            )
        # The same store may be listed in both places; keep first occurrences.
        vector_store_ids = list(dict.fromkeys(vector_store_ids))

        # Only reference a schema file when there is a schema to write to it.
        response_format_schema_ref = 'None'
        if self._response_schema(assistant):
            response_format_schema_ref = (
                f"response_format_schema_{assistant.id}.json"
            )

        return {
            'assistant_id': assistant.id,
            'assistant_name': assistant.name or 'Unnamed Assistant',
            'system_instructions': assistant.instructions or '',
            'vector_store_ids': ', '.join(vector_store_ids) if vector_store_ids else 'None',
            'function_tools': ', '.join(function_tools) if function_tools else 'None',
            'function_schemas': ' | '.join(json_schemas) if json_schemas else 'None',
            'response_format_schema_file': response_format_schema_ref,
            'model': assistant.model,
            'created_at': assistant.created_at,
        }

    def get_vector_store_details(self, vector_store_id: str) -> Dict[str, Any]:
        """Fetch the name and file counts of one vector store.

        Returns:
            ``{'id', 'name', 'file_counts'}``. On any failure the error is
            printed and a placeholder with the name ``'Error retrieving'``
            and empty file counts is returned instead of raising.
        """
        try:
            # NOTE(review): newer SDK releases are believed to expose vector
            # stores at client.vector_stores and older ones only under
            # client.beta -- prefer the former, fall back to the latter.
            stores = getattr(self.client, 'vector_stores', None)
            if stores is None:
                stores = self.client.beta.vector_stores
            vector_store = stores.retrieve(vector_store_id)
            return {
                'id': vector_store.id,
                'name': vector_store.name or 'Unnamed Store',
                'file_counts': getattr(vector_store, 'file_counts', {}),
            }
        except Exception as e:  # best-effort boundary around the remote API
            print(f"Error retrieving vector store {vector_store_id}: {e}")
            return {'id': vector_store_id, 'name': 'Error retrieving', 'file_counts': {}}

    @staticmethod
    def _save_function_schemas(assistant_id: str, schemas_text: str) -> str:
        """Write the joined function schemas to a text file.

        Returns the file name on success, or an ``"Error: ..."`` string.
        """
        filename = f"function_schemas_{assistant_id}.txt"
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(schemas_text)
        except (OSError, ValueError) as e:
            print(f"Error saving function schemas: {e}")
            return f"Error: {str(e)}"
        return filename

    def _save_response_schema(self, assistant, filename: str) -> str:
        """Write the assistant's response-format schema to ``filename``.

        Returns the file name on success, or an ``"Error: ..."`` string.
        """
        schema_data = self._response_schema(assistant)
        try:
            # Serialize before opening so a failure leaves no partial file.
            if isinstance(schema_data, (dict, list)):
                text = json.dumps(schema_data, indent=2)
            else:
                text = str(schema_data)
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(text)
        except (OSError, TypeError, ValueError) as e:
            print(f"Error saving response format schema: {e}")
            return f"Error: {str(e)}"
        return filename

    def extract_all_data(self) -> List[Dict[str, Any]]:
        """Build one row per assistant, saving schemas and naming stores.

        Side effects: writes ``function_schemas_<id>.txt`` and
        ``response_format_schema_<id>.json`` files into the current working
        directory for assistants that have such schemas.

        Returns:
            The rows from ``extract_assistant_data`` with schema fields
            replaced by file names (or ``"Error: ..."``) and an added
            ``vector_store_names`` field.
        """
        extracted_data = []
        # Each store is looked up once per run, however many assistants use it.
        store_labels: Dict[str, str] = {}

        for assistant in self.list_all_assistants():
            data = self.extract_assistant_data(assistant)

            if data['function_schemas'] != 'None':
                data['function_schemas'] = self._save_function_schemas(
                    assistant.id, data['function_schemas']
                )

            if data['response_format_schema_file'] != 'None':
                data['response_format_schema_file'] = self._save_response_schema(
                    assistant, data['response_format_schema_file']
                )

            if data['vector_store_ids'] != 'None':
                labels = []
                for store_id in data['vector_store_ids'].split(', '):
                    if store_id not in store_labels:
                        store_info = self.get_vector_store_details(store_id)
                        store_labels[store_id] = f"{store_info['name']} ({store_id})"
                    labels.append(store_labels[store_id])
                data['vector_store_names'] = ', '.join(labels)
            else:
                data['vector_store_names'] = 'None'

            extracted_data.append(data)
        return extracted_data

    def export_to_csv(self, data: List[Dict[str, Any]], filename: str = 'assistants_data.csv'):
        """Write the rows to ``filename`` as UTF-8 CSV with a header line.

        Prints a notice and writes nothing when ``data`` is empty. Rows with
        keys outside the fixed column list make ``csv.DictWriter`` raise
        ``ValueError``.
        """
        if not data:
            print("No data to export")
            return

        fieldnames = [
            'assistant_id',
            'assistant_name',
            'system_instructions',
            'vector_store_ids',
            'vector_store_names',
            'function_tools',
            'function_schemas',
            'response_format_schema_file',
            'model',
            'created_at',
        ]
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)
        print(f"Data exported to {filename}")
def main():
    """Run the extractor end to end: check the key, extract, export, summarize.

    Returns None in every case; a missing ``OPENAI_API_KEY`` or an empty
    result is reported on stdout.
    """
    key = os.getenv('OPENAI_API_KEY')
    if not key:
        print("Please set your OPENAI_API_KEY environment variable")
        return

    extractor = AssistantExtractor(key)

    print("Extracting assistant data...")
    rows = extractor.extract_all_data()
    if not rows:
        print("No assistants found")
        return

    print(f"Found {len(rows)} assistants")
    extractor.export_to_csv(rows)

    print("\nSummary:")
    for row in rows:
        print(f"- {row['assistant_name']} ({row['assistant_id']})")
        # (field to test, line to print) pairs; a line is shown only when
        # its field is not the 'None' placeholder.
        optional_lines = (
            ('vector_store_names', f" Vector Stores: {row['vector_store_names']}"),
            ('function_tools', f" Function Tools: {row['function_tools']}"),
            ('function_schemas', " Function Schemas: Yes"),
            (
                'response_format_schema_file',
                f" Response Format Schema: {row['response_format_schema_file']}",
            ),
        )
        for field, line in optional_lines:
            if row[field] != 'None':
                print(line)


if __name__ == "__main__":
    main()