- Add Python script to extract assistant data via OpenAI API
- Extract names, IDs, system instructions, and vector stores
- Support for function tool schemas and response format schemas
- Export to CSV with separate schema files
- Handle pagination and error cases

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
232 lines
No EOL
9.7 KiB
Python
232 lines
No EOL
9.7 KiB
Python
#!/usr/bin/env python3
"""
OpenAI Assistant Data Extractor

This script extracts data from OpenAI assistants including:
- Assistant name and ID
- System instructions
- Attached vector stores and their IDs
- JSON schemas from function tools
- Response format schemas
- Exports data to CSV format
"""
|
|
|
|
import csv
import json
import os
from typing import Any, Dict, List, Optional

from openai import OpenAI
|
|
|
|
|
|
class AssistantExtractor:
    """Collect metadata about OpenAI assistants and export it to CSV.

    The extractor lists assistants through ``self.client``, flattens each one
    into a dict of CSV-friendly strings, writes function-tool schemas and
    response-format schemas to side files in the current working directory,
    and finally writes one CSV row per assistant.

    Empty values are represented by the literal string ``'None'`` in the
    flattened rows; callers compare against that sentinel.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Create the API client.

        Args:
            api_key: Key passed to ``OpenAI``. When falsy, the value of the
                ``OPENAI_API_KEY`` environment variable is used instead.
        """
        self.client = OpenAI(api_key=api_key or os.getenv('OPENAI_API_KEY'))
        self.schema_counter = 0

    def list_all_assistants(self) -> List[Any]:
        """Return every assistant object reachable through the list endpoint.

        Pages of up to 100 assistants are requested until the response stops
        reporting ``has_more``. Any exception is printed and whatever was
        collected so far is returned (best-effort).
        """
        assistants = []
        try:
            response = self.client.beta.assistants.list(limit=100)
            assistants.extend(response.data)

            # The cursor for the next page is the id of the last item, so an
            # empty page cannot be continued from; stop instead of indexing it.
            while response.has_more and response.data:
                response = self.client.beta.assistants.list(
                    limit=100,
                    after=response.data[-1].id,
                )
                assistants.extend(response.data)
        except Exception as e:
            print(f"Error listing assistants: {e}")

        return assistants

    @staticmethod
    def _response_format_schema(assistant):
        """Return the assistant's response-format JSON schema payload, or None.

        NOTE(review): the OpenAI SDK's pydantic model is understood to keep the
        payload in ``schema_`` (aliased to "schema"), because ``.schema`` on a
        pydantic model is a method that describes the model class itself.
        Verify against the installed SDK version.
        """
        response_format = getattr(assistant, 'response_format', None)
        json_schema = getattr(response_format, 'json_schema', None)
        if not json_schema:
            return None

        schema = getattr(json_schema, 'schema_', None)
        if schema is None:
            schema = getattr(json_schema, 'schema', None)
            if callable(schema):
                # A callable here is a method, not the user's schema payload.
                schema = None
        return schema or None

    def extract_assistant_data(self, assistant) -> Dict[str, Any]:
        """Flatten one assistant object into a dict of CSV-friendly values.

        Args:
            assistant: Object exposing ``id``, ``name``, ``instructions``,
                ``model``, ``created_at`` and ``tools``; ``response_format``
                and ``tool_resources`` are read when present.

        Returns:
            Dict with keys ``assistant_id``, ``assistant_name``,
            ``system_instructions``, ``vector_store_ids``, ``function_tools``,
            ``function_schemas``, ``response_format_schema_file``, ``model``
            and ``created_at``. List-like fields are joined into strings and
            use ``'None'`` when empty.
        """
        vector_store_ids = []
        json_schemas = []
        function_tools = []

        for tool in assistant.tools or []:
            file_search = getattr(tool, 'file_search', None)
            function = getattr(tool, 'function', None)

            if file_search:
                vector_store_ids.extend(
                    getattr(file_search, 'vector_store_ids', None) or []
                )
            elif function:
                function_tools.append(function.name)
                parameters = getattr(function, 'parameters', None)
                if parameters:
                    schema_str = json.dumps(parameters, separators=(',', ':'))
                    json_schemas.append(f"{function.name}: {schema_str}")

        # Only a file name is recorded here; extract_all_data writes the file.
        response_format_schema_ref = 'None'
        if self._response_format_schema(assistant) is not None:
            response_format_schema_ref = f"response_format_schema_{assistant.id}.json"

        # Vector stores can also be attached through tool_resources.
        tool_resources = getattr(assistant, 'tool_resources', None)
        resource_file_search = getattr(tool_resources, 'file_search', None)
        if resource_file_search:
            vector_store_ids.extend(
                getattr(resource_file_search, 'vector_store_ids', None) or []
            )

        # The same store may be reported by both sources; keep first-seen order.
        vector_store_ids = list(dict.fromkeys(vector_store_ids))

        return {
            'assistant_id': assistant.id,
            'assistant_name': assistant.name or 'Unnamed Assistant',
            'system_instructions': assistant.instructions or '',
            'vector_store_ids': ', '.join(vector_store_ids) if vector_store_ids else 'None',
            'function_tools': ', '.join(function_tools) if function_tools else 'None',
            'function_schemas': ' | '.join(json_schemas) if json_schemas else 'None',
            'response_format_schema_file': response_format_schema_ref,
            'model': assistant.model,
            'created_at': assistant.created_at,
        }

    def _vector_store_api(self):
        """Return the client's vector-store resource.

        ``client.beta.vector_stores`` is used when it exists; otherwise
        ``client.vector_stores`` is tried, for clients that expose the
        resource outside the ``beta`` namespace.
        """
        beta = getattr(self.client, 'beta', None)
        api = getattr(beta, 'vector_stores', None)
        return api if api is not None else self.client.vector_stores

    def get_vector_store_details(self, vector_store_id: str) -> Dict[str, Any]:
        """Look up one vector store.

        Returns:
            Dict with ``id``, ``name`` and ``file_counts``. On any failure the
            error is printed and a placeholder dict whose name is
            ``'Error retrieving'`` is returned instead of raising.
        """
        try:
            vector_store = self._vector_store_api().retrieve(vector_store_id)
            return {
                'id': vector_store.id,
                'name': vector_store.name or 'Unnamed Store',
                'file_counts': getattr(vector_store, 'file_counts', {}),
            }
        except Exception as e:
            print(f"Error retrieving vector store {vector_store_id}: {e}")
            return {'id': vector_store_id, 'name': 'Error retrieving', 'file_counts': {}}

    def _save_function_schemas(self, assistant_id: str, schemas: str) -> str:
        """Write the joined function schemas to a text file; return its name or an error string."""
        function_schema_filename = f"function_schemas_{assistant_id}.txt"
        try:
            with open(function_schema_filename, 'w', encoding='utf-8') as f:
                f.write(schemas)
        except Exception as e:  # best-effort: one bad file must not stop the export
            print(f"Error saving function schemas: {e}")
            return f"Error: {str(e)}"
        return function_schema_filename

    def _save_response_format_schema(self, assistant, schema_filename: str) -> str:
        """Write the response-format schema to ``schema_filename``; return the name or an error string."""
        try:
            schema_data = self._response_format_schema(assistant)
            with open(schema_filename, 'w', encoding='utf-8') as f:
                if isinstance(schema_data, (dict, list)):
                    json.dump(schema_data, f, indent=2)
                else:
                    f.write(str(schema_data))
        except Exception as e:  # best-effort: one bad schema must not stop the export
            print(f"Error saving response format schema: {e}")
            return f"Error: {str(e)}"
        return schema_filename

    def _describe_vector_stores(self, joined_ids: str) -> str:
        """Turn the ``', '``-joined id string into ``'name (id), ...'``, or ``'None'``."""
        if joined_ids == 'None':
            return 'None'
        store_details = []
        for store_id in joined_ids.split(', '):
            store_info = self.get_vector_store_details(store_id)
            store_details.append(f"{store_info['name']} ({store_id})")
        return ', '.join(store_details)

    def extract_all_data(self) -> List[Dict[str, Any]]:
        """Extract one row per assistant and write the schema side files.

        For each assistant the flattened row from ``extract_assistant_data``
        is post-processed:

        * ``function_schemas`` is replaced by the name of the text file the
          schemas were written to (or ``"Error: ..."`` on failure);
        * the response-format schema is written to the file named in
          ``response_format_schema_file`` (replaced by ``"Error: ..."`` on
          failure);
        * ``vector_store_names`` is added, resolved via
          ``get_vector_store_details``.

        Files are created in the current working directory.
        """
        extracted_data = []

        for assistant in self.list_all_assistants():
            data = self.extract_assistant_data(assistant)

            if data['function_schemas'] != 'None':
                data['function_schemas'] = self._save_function_schemas(
                    assistant.id, data['function_schemas']
                )

            if data['response_format_schema_file'] != 'None':
                data['response_format_schema_file'] = self._save_response_format_schema(
                    assistant, data['response_format_schema_file']
                )

            data['vector_store_names'] = self._describe_vector_stores(
                data['vector_store_ids']
            )

            extracted_data.append(data)

        return extracted_data

    def export_to_csv(self, data: List[Dict[str, Any]], filename: str = 'assistants_data.csv'):
        """Write the extracted rows to ``filename`` as UTF-8 CSV.

        Prints ``"No data to export"`` and returns without creating a file
        when ``data`` is empty.
        """
        if not data:
            print("No data to export")
            return

        fieldnames = [
            'assistant_id',
            'assistant_name',
            'system_instructions',
            'vector_store_ids',
            'vector_store_names',
            'function_tools',
            'function_schemas',
            'response_format_schema_file',
            'model',
            'created_at',
        ]

        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)

        print(f"Data exported to {filename}")
|
|
|
|
|
|
def main():
    """Run the extractor end to end and print a summary.

    Requires the ``OPENAI_API_KEY`` environment variable; when it is unset a
    hint is printed and the function returns without doing anything else.
    Otherwise all assistants are extracted, exported to CSV with the default
    file name, and listed on stdout together with their vector stores,
    function tools and schema files.
    """
    api_key = os.getenv('OPENAI_API_KEY')
    if not api_key:
        print("Please set your OPENAI_API_KEY environment variable")
        return

    extractor = AssistantExtractor(api_key)

    print("Extracting assistant data...")
    data = extractor.extract_all_data()

    if not data:
        print("No assistants found")
        return

    print(f"Found {len(data)} assistants")

    extractor.export_to_csv(data)

    print("\nSummary:")
    for item in data:
        print(f"- {item['assistant_name']} ({item['assistant_id']})")
        # 'None' is the string sentinel the extractor uses for empty fields.
        if item['vector_store_names'] != 'None':
            print(f" Vector Stores: {item['vector_store_names']}")
        if item['function_tools'] != 'None':
            print(f" Function Tools: {item['function_tools']}")
        if item['function_schemas'] != 'None':
            print(" Function Schemas: Yes")
        if item['response_format_schema_file'] != 'None':
            print(f" Response Format Schema: {item['response_format_schema_file']}")


if __name__ == "__main__":
    main()