init: add RAG test app with Excel/CSV export

- rag_test_app: OpenAI Assistants benchmark tool
- TEST_TO_RUN: Barclays test configs (Internal Banners, Social Posts, Display Banners, PPC)
- Added report.xlsx + report.csv export alongside HTML report

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Vadym Samoilenko 2026-05-10 13:28:27 +01:00
parent b31945ec22
commit ed040ea497
17 changed files with 2927 additions and 28 deletions

50
.gitignore vendored
View file

@ -1,46 +1,35 @@
# These are some examples of commonly ignored file patterns.
# You should customize this list as applicable to your project.
# Learn more about .gitignore:
# https://www.atlassian.com/git/tutorials/saving-changes/gitignore
# Python
__pycache__/
*.py[cod]
.venv/
venv/
*.egg-info/
# Node artifact files
# Test results (large, generated)
rag_test_app/results/*/
# Node
node_modules/
dist/
# Compiled Java class files
# Build artifacts
*.class
# Compiled Python bytecode
*.py[cod]
# Log files
*.log
# Package files
*.jar
# Maven
target/
dist/
# JetBrains IDE
# IDEs
.idea/
# Unit test reports
TEST*.xml
# Generated by MacOS
# macOS
.DS_Store
# Generated by Windows
# Windows
Thumbs.db
# Applications
*.app
*.exe
*.war
# Logs
*.log
# Large media files
# Large media
*.mp4
*.tiff
*.avi
@ -48,3 +37,8 @@ Thumbs.db
*.mov
*.wmv
# Claude Code local settings
.claude/
# Environment
.env

65
CLAUDE.md Normal file
View file

@ -0,0 +1,65 @@
# CLAUDE.md
This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
## What this repo is
A standalone Python CLI tool (`rag_test_app/`) that benchmarks OpenAI Assistants with RAG knowledge bases. Built for Barclays — tests banner-creation assistants against brand/compliance documents and produces scored HTML reports.
The `TEST_TO_RUN/` directory contains pre-built Barclays test configs (JSON) + document ZIPs for four campaign types: Internal Banners, Social Posts, Display Banners, PPC.
## Commands
All commands run from `rag_test_app/`:
```bash
cd rag_test_app
# Install dependencies
pip install -r requirements.txt
# Create a config template
python cli.py --create-config my_config.json
# Run a single test config
python cli.py --config my_config.json
# Batch-run all configs in a directory
python cli.py --config-dir ../TEST_TO_RUN/
# Generate questions only (no API calls to assistant)
python cli.py --config my_config.json --generate-only
# Reuse previously generated questions
python cli.py --config my_config.json --questions-file results/*/test_questions.json
# Run tests
pytest tests/
pytest tests/test_main.py::TestRAGTester::test_generate_questions
```
Required env var: `OPENAI_API_KEY` (alternatively set `api_key` in the JSON config, though env var is preferred).
## Architecture
Two files, one class:
**`main.py` — `RAGTester`**
1. `__init__`: Loads source documents (`.txt` or `.docx` via `docx2txt`). Concatenates multiple docs with `--- Document: filename ---` separators into a single `document_content` string.
2. `generate_test_questions()`: Sends the first 3000 chars of `document_content` to GPT-4o with one of three prompt templates (`task-based` / `content-based` / `scenario-based`). Returns parsed JSON question list. Falls back to `gpt-3.5-turbo` if JSON response format fails.
3. `run_tests()`: Runs every question × every iteration against the OpenAI Assistants API (`beta.threads`) using `ThreadPoolExecutor`. Saves partial results every 5 completions to avoid data loss.
4. `evaluate_results()`: Groups results by question, calls GPT-4o to score each group on quality / consistency / accuracy / completeness (1–10). On a parse failure, falls back to a default score of 5.
5. `generate_report()`: Produces `report.html` + four matplotlib PNGs (`scores_by_question`, `response_times`, `score_distribution`, `radar_chart`) all saved into the timestamped output dir.
**`cli.py` — CLI wrapper**
Handles argument parsing, config file merging (CLI args override config file values), and batch mode (`--config-dir`). Creates timestamped output dirs: `{output_dir}/{config_name}_{YYYYMMDD_HHMMSS}/`.
## Key behaviours to know
- **`batch_size` vs `parallel`**: `batch_size` controls memory; `parallel` controls concurrency. Recommended: `batch_size = 2-3× parallel`. High parallelism hits OpenAI rate limits — start at `parallel: 5`.
- **Output never overwrites**: every run creates a new timestamped directory.
- **Evaluation is expensive**: GPT-4o is called once per question for evaluation, in addition to the assistant calls. A 20-question × 3-iteration run costs at least ~81 API calls: 60 assistant runs + 20 evaluation calls + 1 question-generation call.
- **Document truncation in prompts**: question generation only sends the first 3000 characters of `document_content` to GPT, regardless of total document size. The full content is not summarised or chunked further.
- **Supported document formats**: `.txt` and `.docx` only. PDF/XLS/PPT are silently skipped.

View file

@ -0,0 +1,19 @@
{
"assistant_id": "asst_vlFx0Uud1BKtp7j77Vp0pi8H",
"documents": [
"/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Consumer Duty summary.docx",
"/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Consumer Understanding.docx",
"/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Internal Banner Examples 23102024 -Markdown.docx",
"/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Our Regulators and developing marketing material Chatai.docx"
],
"api_key": "",
"output_dir": "results",
"num_questions": 20,
"iterations": 3,
"questions_file": "",
"generate_only": false,
"verbose": true,
"model": "gpt-4o",
"parallel": 10,
"batch_size": 20
}

Binary file not shown.

View file

@ -0,0 +1,19 @@
{
"assistant_id": "asst_MT0qKXI57m8Y2RVllqwFUqBe",
"documents": [
"/Users/daveporter/Python-Enviroments/RAG-TEST-BAIC/DIsplay-Banner-Docs/BUK Social Media Playbook.docx",
"/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Consumer Understanding.docx",
"/Users/daveporter/Python-Enviroments/RAG-TEST-BAIC/DIsplay-Banner-Docs/OMD Barclays Social 101.docx",
"/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Our Regulators and developing marketing material Chatai.docx"
],
"api_key": "",
"output_dir": "results",
"num_questions": 20,
"iterations": 3,
"questions_file": "",
"generate_only": false,
"verbose": true,
"model": "gpt-4o",
"parallel": 10,
"batch_size": 20
}

Binary file not shown.

View file

@ -0,0 +1,18 @@
{
"assistant_id": "asst_eAsIXFpSGiy7jQzyF8p0IRDA",
"documents": [
"/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Consumer Duty summary.docx",
"/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Consumer Understanding.docx",
"/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Our Regulators and developing marketing material Chatai.docx"
],
"api_key": "",
"output_dir": "results",
"num_questions": 20,
"iterations": 3,
"questions_file": "",
"generate_only": false,
"verbose": true,
"model": "gpt-4o",
"parallel": 10,
"batch_size": 20
}

Binary file not shown.

16
TEST_TO_RUN/PPC.json Normal file
View file

@ -0,0 +1,16 @@
{
"assistant_id": "asst_Pz7uhnK7aOoYykl7KalyirY9",
"documents": [
"/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Our Regulators and developing marketing material Chatai.docx"
],
"api_key": "",
"output_dir": "results",
"num_questions": 20,
"iterations": 3,
"questions_file": "",
"generate_only": false,
"verbose": true,
"model": "gpt-4o",
"parallel": 10,
"batch_size": 20
}

BIN
TEST_TO_RUN/PPC.zip Normal file

Binary file not shown.

794
rag_test_app/README.md Normal file
View file

@ -0,0 +1,794 @@
# RAG Testing Application
A comprehensive Python application for automatically testing and evaluating OpenAI assistants with Retrieval-Augmented Generation (RAG) capabilities.
## Table of Contents
- [Overview](#overview)
- [Features](#features)
- [Installation](#installation)
- [Quick Start Guide](#quick-start-guide)
- [Complete User Guide](#complete-user-guide)
- [Configuration File Reference](#configuration-file-reference)
- [Prompt Types](#prompt-types)
- [Batch Processing](#batch-processing)
- [Output Directory Structure](#output-directory-structure)
- [Command Line Reference](#command-line-reference)
- [Advanced Usage Examples](#advanced-usage-examples)
- [Understanding the Results](#understanding-the-results)
- [Troubleshooting](#troubleshooting)
## Overview
This tool helps you evaluate and benchmark OpenAI assistants by:
1. **Generating test prompts** from your source documents (with multiple prompt styles)
2. **Running prompts** against your assistant multiple times to test consistency
3. **Evaluating responses** for quality, consistency, accuracy, and completeness
4. **Generating detailed reports** with visualizations and metrics
**Perfect for:** Testing assistants that create content (banners, copy, documents) using RAG knowledge bases.
## Features
- ✅ **Multi-document support**: Test with individual documents, directories, or specified sets of documents
- ✅ **Multiple prompt types**: Generate realistic user tasks, knowledge questions, or business scenarios
- ✅ **Batch processing**: Run multiple test configurations in sequence automatically
- ✅ **Timestamped results**: Each test run creates a unique timestamped directory - no more overwriting!
- ✅ **Support for DOCX files**: Works with both text and Microsoft Word files
- ✅ **Optimized performance**: Parallel processing and batch execution for significantly faster testing
- ✅ **Comprehensive evaluation**: Assesses responses for quality, accuracy, consistency, and completeness
- ✅ **Interactive reporting**: Generates professional HTML reports with detailed visualizations
- ✅ **Performance tracking**: Measures and analyzes response times and other key metrics
- ✅ **Data export**: Saves all results as JSON for further analysis
- ✅ **Config-based workflow**: Easy to set up and customize via configuration files
## Installation
1. Clone this repository:
```bash
git clone https://github.com/yourusername/rag-test-app.git
cd rag-test-app
```
2. Install the required packages:
```bash
pip install -r requirements.txt
```
3. Set up your OpenAI API key:
```bash
export OPENAI_API_KEY="your-api-key-here"
```
## Quick Start Guide
### 1. Create a Configuration File
The easiest way to get started:
```bash
python cli.py --create-config my_test_config.json
```
This creates a template configuration file.
### 2. Edit Your Configuration
Open `my_test_config.json` and update:
- `assistant_id`: Your OpenAI Assistant ID (e.g., "asst_abc123...")
- `documents`: Paths to your RAG documents
- `api_key`: Your OpenAI API key (or use environment variable)
### 3. Run Your First Test
```bash
python cli.py --config my_test_config.json
```
### 4. View Results
Open the generated `results/your_test_YYYYMMDD_HHMMSS/report.html` in your browser!
## Complete User Guide
### Configuration File Reference
Here's a complete configuration file with ALL available options:
```json
{
"assistant_id": "asst_YourAssistantIdHere",
"documents": [
"/path/to/your/document1.txt",
"/path/to/your/document2.docx"
],
"api_key": "YOUR_OPENAI_API_KEY",
"output_dir": "results",
"num_questions": 20,
"iterations": 3,
"questions_file": "",
"generate_only": false,
"verbose": true,
"model": "gpt-4o",
"prompt_type": "task-based",
"parallel": 10,
"batch_size": 30
}
```
#### Configuration Options Explained
| Option | Type | Default | Description |
|--------|------|---------|-------------|
| **`assistant_id`** | string | (required) | Your OpenAI Assistant ID starting with "asst_" |
| **`documents`** | array | null | List of document file paths (preferred method) |
| **`document`** | string | null | Single document or directory path (alternative to `documents`) |
| **`api_key`** | string | env var | OpenAI API key (can also use `OPENAI_API_KEY` environment variable) |
| **`output_dir`** | string | "results" | Base directory for saving results |
| **`num_questions`** | integer | 20 | Number of test prompts to generate |
| **`iterations`** | integer | 3 | How many times to test each prompt (for consistency checking) |
| **`questions_file`** | string | null | Path to pre-generated questions JSON file |
| **`generate_only`** | boolean | false | Only generate questions, don't run tests |
| **`verbose`** | boolean | false | Enable detailed logging for debugging |
| **`model`** | string | "gpt-4o" | GPT model for question generation and evaluation |
| **`prompt_type`** | string | "task-based" | Type of prompts to generate (see below) |
| **`parallel`** | integer | 5 | Number of parallel workers for running tests |
| **`batch_size`** | integer | same as parallel | Questions per batch (set to 2-3x parallel for best performance) |
### Prompt Types
**NEW FEATURE:** Choose how test prompts are generated to match your testing needs.
#### 1. **`task-based`** (Default - Recommended)
Generates realistic user task requests that emulate how real users interact with your assistant.
**Best for:** Testing assistants that create content (banners, copy, ads, documents)
**Example prompts:**
- "Create a banner for our new credit card offer with 0% APR"
- "Write copy for a savings account promotion targeting young professionals"
- "Generate headlines for our mobile banking app launch"
- "Design promotional text for a balance transfer campaign"
**When to use:**
- Testing content creation assistants
- Simulating real user interactions
- Evaluating practical usability
**Configuration:**
```json
{
"prompt_type": "task-based"
}
```
**Command line:**
```bash
python cli.py --config myconfig.json --prompt-type task-based
```
#### 2. **`content-based`** (Original)
Generates knowledge questions about the content in your RAG documents.
**Best for:** Testing document understanding and knowledge retrieval
**Example prompts:**
- "What is the FCA Consumer Duty requirement?"
- "Explain the principles of clear customer communication"
- "What are the considerations for vulnerable customers?"
- "List the regulatory guidelines for financial advertising"
**When to use:**
- Verifying RAG knowledge accuracy
- Testing document comprehension
- Auditing information retrieval
**Configuration:**
```json
{
"prompt_type": "content-based"
}
```
**Command line:**
```bash
python cli.py --config myconfig.json --prompt-type content-based
```
#### 3. **`scenario-based`**
Generates realistic business scenarios that combine tasks with context and requirements.
**Best for:** Testing complex real-world use cases with constraints
**Example prompts:**
- "We're launching a new credit card for students. Create FCA-compliant banner copy that's clear and accessible"
- "Our vulnerable customer initiative needs promotional materials. Write banner text that follows Consumer Duty guidelines"
- "Create an internal banner for our mobile banking upgrade targeting existing customers aged 50+"
- "We have a new savings product for first-time buyers. Generate compliant promotional copy"
**When to use:**
- Testing compliance and constraints
- Simulating real business workflows
- Evaluating context handling
**Configuration:**
```json
{
"prompt_type": "scenario-based"
}
```
**Command line:**
```bash
python cli.py --config myconfig.json --prompt-type scenario-based
```
#### Comparing Prompt Types
| Prompt Type | Use Case | Complexity | Best For |
|-------------|----------|------------|----------|
| **task-based** | Simple user requests | Low | Daily user interactions |
| **content-based** | Knowledge questions | Medium | RAG accuracy testing |
| **scenario-based** | Business scenarios | High | Real-world workflows |
### Batch Processing
**NEW FEATURE:** Run multiple test configurations automatically in sequence.
Instead of running tests one at a time, point to a directory containing multiple config files and run them all at once!
#### Setting Up Batch Tests
1. **Create a directory with multiple configs:**
```bash
mkdir my_test_suite
```
2. **Add multiple configuration files:**
```
my_test_suite/
├── test_credit_cards.json
├── test_savings.json
├── test_loans.json
└── test_mobile_banking.json
```
3. **Run all tests:**
```bash
python cli.py --config-dir my_test_suite
```
#### What Happens
```
============================================================
BATCH PROCESSING MODE
Found 4 configuration file(s)
============================================================
• test_credit_cards.json
• test_savings.json
• test_loans.json
• test_mobile_banking.json
>>> Processing 1/4
============================================================
Processing config: test_credit_cards.json
============================================================
[Running tests...]
>>> Processing 2/4
============================================================
Processing config: test_savings.json
============================================================
[Running tests...]
... and so on ...
============================================================
BATCH PROCESSING COMPLETE
============================================================
✓ Successful: 4
Total time: 45.2 minutes
```
#### Batch Processing Benefits
- ✅ Run comprehensive test suites overnight
- ✅ Compare results across different assistants
- ✅ Test multiple prompt types automatically
- ✅ Automated CI/CD testing pipelines
- ✅ Progress tracking and error reporting
#### Command Line Options with Batch
You can override settings for all configs:
```bash
# Run all configs but use content-based prompts
python cli.py --config-dir my_test_suite --prompt-type content-based
# Run with higher parallelization
python cli.py --config-dir my_test_suite --parallel 15 --batch-size 45
# Generate questions only (no testing)
python cli.py --config-dir my_test_suite --generate-only
```
### Output Directory Structure
**NEW FEATURE:** Each test run creates a unique timestamped directory - no more overwriting!
#### Directory Naming
Results are saved as:
```
{output_dir}/{config_name}_{timestamp}/
```
**Example:**
```
results/
├── test_credit_cards_20251112_143022/
│ ├── report.html
│ ├── test_results.json
│ ├── evaluation.json
│ ├── test_questions.json
│ └── *.png (charts)
├── test_credit_cards_20251112_153045/
│ ├── report.html
│ └── ...
└── test_savings_20251112_160112/
├── report.html
└── ...
```
#### Benefits
- ✅ **Never lose results** - each run is preserved
- ✅ **Easy comparison** - compare results across test runs
- ✅ **Audit trail** - complete history of all tests
- ✅ **Organized** - group results by test name and time
#### Customizing Output Location
**In config file:**
```json
{
"output_dir": "my_results"
}
```
**Command line:**
```bash
python cli.py --config myconfig.json --output-dir my_results
```
Results will be saved to:
```
my_results/{config_name}_{timestamp}/
```
## Command Line Reference
### Full Command Syntax
```
usage: cli.py [-h] [--config CONFIG] [--config-dir CONFIG_DIR]
[--create-config OUTPUT_PATH] [--api-key API_KEY]
[--assistant-id ASSISTANT_ID] [--document DOCUMENT]
[--documents DOCUMENTS [DOCUMENTS ...]] [--output-dir OUTPUT_DIR]
[--num-questions NUM_QUESTIONS] [--iterations ITERATIONS]
[--questions-file QUESTIONS_FILE] [--generate-only] [--verbose]
[--model MODEL] [--prompt-type {task-based,content-based,scenario-based}]
[--parallel PARALLEL] [--batch-size BATCH_SIZE]
```
### Common Commands
```bash
# Get help
python cli.py --help
# Create a config template
python cli.py --create-config my_config.json
# Run single test with config file
python cli.py --config my_config.json
# Run batch tests
python cli.py --config-dir my_test_suite/
# Run without config (all command line)
python cli.py --assistant-id asst_abc123 --document myfile.txt
# Generate questions only
python cli.py --config my_config.json --generate-only
# Use pre-generated questions
python cli.py --config my_config.json --questions-file results/my_config_YYYYMMDD_HHMMSS/test_questions.json
# Change prompt type
python cli.py --config my_config.json --prompt-type scenario-based
# High performance mode
python cli.py --config my_config.json --parallel 15 --batch-size 45
```
## Advanced Usage Examples
### Example 1: Complete Testing Workflow
```bash
# Step 1: Create config
python cli.py --create-config banner_test.json
# Step 2: Edit banner_test.json with your settings
# Step 3: Generate questions first to review
python cli.py --config banner_test.json --generate-only --num-questions 50
# Step 4: Review generated questions in results/*/test_questions.json
# Step 5: Run the full test
python cli.py --config banner_test.json --questions-file results/banner_test_*/test_questions.json
# Step 6: Open report.html to view results
```
### Example 2: Testing Multiple Prompt Types
```bash
# Create base config
cat > base_config.json << EOF
{
"assistant_id": "asst_abc123",
"documents": ["docs/guidelines.docx"],
"num_questions": 30,
"iterations": 5
}
EOF
# Test with task-based prompts
python cli.py --config base_config.json --prompt-type task-based
# Test with content-based prompts
python cli.py --config base_config.json --prompt-type content-based
# Test with scenario-based prompts
python cli.py --config base_config.json --prompt-type scenario-based
# Compare the three result directories!
```
### Example 3: High-Volume Testing
```bash
# For testing with many questions and high parallelization
python cli.py --config my_config.json \
--num-questions 100 \
--iterations 10 \
--parallel 20 \
--batch-size 60 \
--verbose
```
### Example 4: Continuous Integration
```bash
#!/bin/bash
# run_tests.sh - Automated testing script
# Set environment
export OPENAI_API_KEY="your-key"
# Run test suite
python cli.py --config-dir test_configs/
# Check exit code
if [ $? -eq 0 ]; then
echo "All tests passed!"
else
echo "Some tests failed!"
exit 1
fi
```
### Example 5: A/B Testing Different Assistants
```json
// config_assistant_v1.json
{
"assistant_id": "asst_v1_abc123",
"documents": ["docs/guidelines.docx"],
"questions_file": "shared_questions.json",
"num_questions": 50
}
// config_assistant_v2.json
{
"assistant_id": "asst_v2_def456",
"documents": ["docs/guidelines.docx"],
"questions_file": "shared_questions.json",
"num_questions": 50
}
```
```bash
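# Generate questions once. Leave "questions_file" empty in the config for this
# step (pointing it at a file that does not exist yet is an error), then copy
# the generated file to the shared location:
python cli.py --config config_assistant_v1.json --generate-only
cp results/config_assistant_v1_*/test_questions.json shared_questions.json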
# Test both assistants with same questions
python cli.py --config config_assistant_v1.json
python cli.py --config config_assistant_v2.json
# Compare the results!
```
### Example 6: Multi-Document Testing
```json
{
"assistant_id": "asst_abc123",
"documents": [
"/path/to/consumer_duty.docx",
"/path/to/fca_guidelines.docx",
"/path/to/brand_guidelines.txt",
"/path/to/product_specs.docx"
],
"num_questions": 40,
"prompt_type": "scenario-based"
}
```
## Understanding the Results
### HTML Report
After tests complete, open `report.html` to see:
#### 1. Summary Metrics
- Overall quality score
- Average consistency score
- Average accuracy score
- Average completeness score
- Average response time
- Total tests run
#### 2. Performance Charts
- **Scores by Question**: Bar chart showing all metric scores for each question
- **Response Times**: How fast the assistant responds
- **Score Distribution**: Histogram of score ranges
- **Radar Chart**: Visual comparison of quality, consistency, accuracy, and completeness
#### 3. Question-by-Question Analysis
For each test prompt:
- Question text
- Individual scores (quality, consistency, accuracy, completeness)
- Evaluation notes and feedback
- All response iterations (collapsible)
### Evaluation Metrics
Each response is scored 1-10 on four dimensions:
#### Quality Score (1-10)
- Clarity and coherence
- Professional tone
- No hallucinations
- Grammar and readability
#### Consistency Score (1-10)
- Similar answers across iterations
- Consistent facts and details
- No contradictions
- Stable level of detail
#### Accuracy Score (1-10)
- Information matches documents
- Correct facts and numbers
- No misrepresentations
- Proper context interpretation
#### Completeness Score (1-10)
- Addresses all aspects of the question
- Includes necessary context
- Sufficient detail
- No significant omissions
### JSON Output Files
#### `test_questions.json`
```json
{
"questions": [
"Create a banner for...",
"Write copy for...",
...
]
}
```
#### `test_results.json`
```json
{
"results": [
{
"question_id": 0,
"question": "Create a banner...",
"iteration": 0,
"response": "Here's your banner: ...",
"response_time": 2.34,
"status": "completed"
},
...
]
}
```
#### `evaluation.json`
```json
{
"summary": {
"total_questions": 20,
"total_iterations": 60,
"average_quality": 8.5,
"average_consistency": 9.2,
"average_accuracy": 8.8,
"average_completeness": 8.6,
"average_response_time": 2.1
},
"by_question": [...]
}
```
## Troubleshooting
### Common Issues
#### 1. Assistant Not Found Error
```
ERROR: No assistant found with id 'asst_...'
```
**Solution:** Check your assistant ID on https://platform.openai.com/assistants
#### 2. API Rate Limits
```
Error: Rate limit exceeded
```
**Solution:** Reduce parallel workers:
```bash
python cli.py --config my_config.json --parallel 3
```
#### 3. Document Loading Errors
```
Warning: No content loaded from documents
```
**Solutions:**
- Check file paths are correct
- For `.docx` files: `pip install docx2txt`
- Verify files are readable (not corrupted)
- Supported formats: `.txt`, `.docx` only
#### 4. Memory Issues
```
MemoryError: ...
```
**Solution:** Reduce batch size:
```bash
python cli.py --config my_config.json --batch-size 10
```
#### 5. Missing API Key
```
Error: No OpenAI API key provided
```
**Solutions:**
```bash
# Option 1: Environment variable
export OPENAI_API_KEY="your-key"
# Option 2: In config file
{
"api_key": "your-key"
}
# Option 3: Command line
python cli.py --api-key "your-key" ...
```
### Debug Mode
Enable verbose output for detailed logging:
```bash
python cli.py --config my_config.json --verbose
```
Or in config:
```json
{
"verbose": true
}
```
### Performance Tips
1. **Optimize Parallelization**
- Start with `parallel: 5`
- Increase gradually if no rate limits
- Set `batch_size` to 2-3x `parallel`
2. **Balance Speed vs. Cost**
- More parallel workers = faster but higher API costs
- More iterations = better consistency data but more tests
3. **Question Generation**
- Generate questions once, reuse with `questions_file`
- Save API calls on repeated tests
## Supported File Types
- ✅ **Text files** (`.txt`): Plain text with UTF-8 encoding
- ✅ **Word documents** (`.docx`): Microsoft Word files (requires `docx2txt`)
- ❌ **PDF files**: Not currently supported
- ❌ **Excel/PowerPoint**: Not currently supported
## Best Practices
### 1. Start Small
```bash
# Test with few questions first
python cli.py --config my_config.json --num-questions 5 --iterations 2
```
### 2. Use Configuration Files
- Easier to track and version
- Reusable across tests
- Less prone to typos
### 3. Organize Your Tests
```
my_project/
├── configs/
│ ├── test_suite_1/
│ │ ├── credit_cards.json
│ │ └── loans.json
│ └── test_suite_2/
│ └── mobile_banking.json
├── results/
└── docs/
```
### 4. Version Control Your Configs
```bash
git add configs/
git commit -m "Add test configurations"
```
### 5. Archive Important Results
```bash
# Save important test results
cp -r results/important_test_20251112_143022 archived_results/
```
## License
MIT
---
## Need Help?
- 📖 **Documentation**: You're reading it!
- 🐛 **Issues**: Report bugs on GitHub
- 💡 **Feature Requests**: Open an issue with your idea
- 📧 **Contact**: [your-email@example.com]
---
**Happy Testing! 🚀**

481
rag_test_app/cli.py Normal file
View file

@ -0,0 +1,481 @@
#!/usr/bin/env python3
import argparse
import os
import sys
import json
import time
from datetime import datetime
from main import RAGTester
from rich.console import Console
from typing import Dict, Any, List
# Shared rich console; every function in this module prints its status and
# error messages through it.
console = Console()
def load_config(config_path: str) -> Dict[str, Any]:
    """
    Load configuration from a JSON file.

    Args:
        config_path: Path to the JSON configuration file.

    Returns:
        The parsed configuration as a dictionary.

    Exits:
        Prints an error and terminates the process with status 1 (via
        ``sys.exit``) when the file cannot be read, is not valid JSON, or its
        top-level value is not a JSON object.
    """
    try:
        # JSON files are UTF-8; do not rely on the platform default encoding.
        # "utf-8-sig" also accepts files that start with a byte-order mark.
        with open(config_path, 'r', encoding='utf-8-sig') as f:
            config = json.load(f)
        # Callers use config.get(...), so anything but an object is unusable.
        if not isinstance(config, dict):
            raise ValueError(
                f"top-level JSON value must be an object, got {type(config).__name__}"
            )
    except Exception as e:
        # CLI boundary: report any load failure uniformly and stop.
        console.print(f"[bold red]Error loading config file: {str(e)}[/bold red]")
        sys.exit(1)
    console.print(f"[green]Loaded configuration from {config_path}[/green]")
    return config
def process_single_config(config_path: str, args: argparse.Namespace) -> bool:
    """
    Process a single configuration file.

    Resolves the effective settings, validates them, then drives a
    ``RAGTester`` through question loading/generation, test execution,
    evaluation and report generation. Results go to a fresh directory named
    ``{output_dir}/{config_name}_{YYYYMMDD_HHMMSS}``.

    Setting resolution:
        * String/flag options (``api_key``, ``assistant_id``, ``document``,
          ``documents``, ``questions_file``, ``generate_only``, ``verbose``,
          ``batch_size``): the command-line value is used when truthy,
          otherwise the config-file value. ``api_key`` finally falls back to
          the ``OPENAI_API_KEY`` environment variable.
        * ``output_dir``, ``num_questions``, ``iterations``, ``model``,
          ``prompt_type``, ``parallel``: the command-line value is used only
          when it differs from the literal it is compared against below
          (presumably the argparse default — confirm in ``main``); otherwise
          the config-file value, then that same literal.

    Args:
        config_path: Path to the JSON configuration file.
        args: Parsed command-line arguments.

    Returns:
        True if successful (including a ``generate_only`` run), False
        otherwise. Failures are printed, never raised.
    """
    console.print(f"\n[bold cyan]{'='*60}[/bold cyan]")
    console.print(f"[bold cyan]Processing config: {os.path.basename(config_path)}[/bold cyan]")
    console.print(f"[bold cyan]{'='*60}[/bold cyan]\n")

    try:
        # Load config file
        if not os.path.exists(config_path):
            console.print(f"[bold red]Error: Config file not found: {config_path}[/bold red]")
            return False

        try:
            config = load_config(config_path)
        except SystemExit:
            # load_config() prints its own error and calls sys.exit(1).
            # SystemExit is not an Exception subclass, so the handler at the
            # bottom would not see it; convert it to a normal failure so one
            # unreadable config does not terminate the whole process.
            return False

        # Merge command-line arguments with config file (command-line takes precedence)
        api_key = args.api_key or config.get("api_key") or os.environ.get("OPENAI_API_KEY")
        assistant_id = args.assistant_id or config.get("assistant_id")
        document_path = args.document or config.get("document")
        documents = args.documents or config.get("documents")
        output_dir = args.output_dir if args.output_dir != "results" else config.get("output_dir", "results")
        num_questions = args.num_questions if args.num_questions != 20 else config.get("num_questions", 20)
        iterations = args.iterations if args.iterations != 3 else config.get("iterations", 3)
        questions_file = args.questions_file or config.get("questions_file")
        generate_only = args.generate_only or config.get("generate_only", False)
        verbose = args.verbose or config.get("verbose", False)
        model = args.model if args.model != "gpt-4o" else config.get("model", "gpt-4o")
        prompt_type = args.prompt_type if args.prompt_type != "task-based" else config.get("prompt_type", "task-based")
        parallel = args.parallel if args.parallel != 5 else config.get("parallel", 5)
        batch_size = args.batch_size or config.get("batch_size")

        # Create unique timestamped output directory
        config_name = os.path.splitext(os.path.basename(config_path))[0]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = os.path.join(output_dir, f"{config_name}_{timestamp}")

        # Check for required parameters
        if not api_key:
            console.print("[bold red]Error: No OpenAI API key provided.[/bold red]")
            console.print("Please provide an API key using --api-key, config file, or set the OPENAI_API_KEY environment variable.")
            return False

        if not assistant_id:
            console.print("[bold red]Error: No assistant ID provided.[/bold red]")
            console.print("Please provide an assistant ID using --assistant-id or in the config file.")
            return False

        # Check if at least one document source is provided
        if not documents and not document_path:
            console.print("[bold red]Error: No documents provided.[/bold red]")
            console.print("Please provide documents using --documents, --document, or in the config file.")
            return False

        # Check if document paths exist. A missing entry in the list is only
        # a warning; a missing single document/directory is fatal.
        if documents:
            for doc in documents:
                if not os.path.exists(doc):
                    console.print(f"[bold red]Warning: Document not found: {doc}[/bold red]")
        elif document_path and not os.path.exists(document_path):
            console.print(f"[bold red]Error: Document or directory not found: {document_path}[/bold red]")
            return False

        # Print startup banner
        console.print("\n[bold green]╔══════════════════════════════════════════════╗[/bold green]")
        console.print("[bold green]║ RAG Testing Application ║[/bold green]")
        console.print("[bold green]╚══════════════════════════════════════════════╝[/bold green]\n")

        if verbose:
            console.print("[bold yellow]Verbose mode enabled[/bold yellow]")

        # Create tester; the explicit list wins over the single path
        tester = RAGTester(
            api_key=api_key,
            assistant_id=assistant_id,
            document_paths=documents,
            document_path=None if documents else document_path,
            output_dir=output_dir,
            verbose=verbose,
            model=model,
            prompt_type=prompt_type
        )

        # Load or generate questions
        if questions_file:
            if not os.path.exists(questions_file):
                console.print(f"[bold red]Error: Questions file not found: {questions_file}[/bold red]")
                return False
            console.print(f"Loading questions from {questions_file}")
            tester.load_questions_from_file(questions_file)
        else:
            console.print(f"Generating {num_questions} test questions")
            tester.generate_test_questions(num_questions)

        # Exit if generate-only
        if generate_only:
            console.print("[green]Questions generated and saved. Exiting.[/green]")
            return True

        # Run tests with parallelization
        tester.run_tests(iterations=iterations, max_workers=parallel, batch_size=batch_size)

        # Evaluate results (the return value is not needed here)
        tester.evaluate_results()

        # Generate report
        tester.generate_report()

        console.print(f"[bold green]Testing complete! Results saved to {output_dir}[/bold green]")
        return True

    except Exception as e:
        import traceback
        # Exception text and tracebacks can contain square brackets (e.g.
        # "results[i]"), which rich would read as markup tags. Apply the
        # colour through `style` and disable markup parsing instead.
        console.print(
            f"Error processing config {config_path}: {str(e)}",
            style="bold red",
            markup=False,
        )
        console.print(traceback.format_exc(), style="red", markup=False)
        return False
def create_config_template(output_path: str) -> None:
    """
    Write a starter JSON configuration file to ``output_path``.

    Parent directories are created when missing. After a successful write a
    short usage hint is printed. If anything goes wrong, the error is printed
    and the process exits with status 1.
    """
    # Keyword order below is the key order of the generated JSON file.
    placeholder_documents = [
        "/path/to/your/document1.txt",
        "/path/to/your/document2.txt",
    ]
    template = dict(
        assistant_id="asst_YourAssistantIdHere",
        documents=placeholder_documents,
        api_key="YOUR_OPENAI_API_KEY",
        output_dir="results",
        num_questions=20,
        iterations=3,
        questions_file="",  # Leave empty to generate new questions
        generate_only=False,
        verbose=True,
        model="gpt-4o",
        prompt_type="task-based",  # Options: "task-based", "content-based", "scenario-based"
        parallel=10,
        batch_size=30,
    )

    try:
        # The target folder may not exist yet; abspath also handles a bare file name.
        parent_dir = os.path.dirname(os.path.abspath(output_path))
        os.makedirs(parent_dir, exist_ok=True)

        # Pretty-print so the file is comfortable to edit by hand.
        with open(output_path, 'w') as handle:
            json.dump(template, handle, indent=2)

        for line in (
            f"[green]Created configuration template at: {output_path}[/green]",
            "[cyan]Edit this file with your settings and then run:[/cyan]",
            f"[cyan] python cli.py --config {output_path}[/cyan]",
        ):
            console.print(line)
    except Exception as err:
        console.print(f"[bold red]Error creating configuration template: {err}[/bold red]")
        sys.exit(1)
def _build_arg_parser():
    """Build the argument parser for the RAG Testing CLI."""
    parser = argparse.ArgumentParser(
        description="Test OpenAI assistants with RAG capabilities",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Config-file driven modes
    config_group = parser.add_argument_group('Configuration')
    config_group.add_argument(
        "--config",
        type=str,
        help="Path to a JSON configuration file"
    )
    config_group.add_argument(
        "--config-dir",
        type=str,
        help="Path to a directory containing multiple JSON configuration files. All configs will be run in sequence."
    )
    config_group.add_argument(
        "--create-config",
        type=str,
        metavar="OUTPUT_PATH",
        help="Create a configuration template file at the specified path and exit"
    )

    # Direct command-line settings
    parser.add_argument(
        "--api-key",
        type=str,
        help="OpenAI API key. If not provided, will use OPENAI_API_KEY environment variable."
    )
    parser.add_argument(
        "--assistant-id",
        type=str,
        help="ID of the OpenAI assistant to test"
    )
    parser.add_argument(
        "--document",
        type=str,
        help="Path to the document or directory of documents to use for testing"
    )
    parser.add_argument(
        "--documents",
        nargs='+',
        help="List of document paths to use for testing (takes precedence over --document)"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="results",
        help="Directory to save test results"
    )
    parser.add_argument(
        "--num-questions",
        type=int,
        default=20,
        help="Number of test questions to generate"
    )
    parser.add_argument(
        "--iterations",
        type=int,
        default=3,
        help="Number of times to test each question"
    )
    parser.add_argument(
        "--questions-file",
        type=str,
        help="Path to a JSON file with pre-generated questions"
    )
    parser.add_argument(
        "--generate-only",
        action="store_true",
        help="Only generate questions, don't run tests"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose output for debugging"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="gpt-4o",
        help="OpenAI model to use for question generation and evaluation"
    )
    parser.add_argument(
        "--prompt-type",
        type=str,
        choices=["task-based", "content-based", "scenario-based"],
        default="task-based",
        help="Type of prompts to generate: 'task-based' (real user tasks like 'create a banner'), "
             "'content-based' (knowledge questions about documents), "
             "'scenario-based' (realistic business scenarios). Default: task-based"
    )
    parser.add_argument(
        "--parallel",
        type=int,
        default=5,
        help="Number of parallel workers for running tests (default: 5)"
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        help="Number of questions to process in a batch (defaults to same as --parallel if not specified)"
    )
    return parser


def _run_config_batch(args):
    """Run every ``*.json`` config in ``args.config_dir`` in sequence.

    Returns the process exit code: 0 when every config succeeded, 1 when the
    directory is missing, is not a directory, holds no JSON files, or at least
    one config failed.
    """
    config_dir = args.config_dir
    if not os.path.exists(config_dir):
        console.print(f"[bold red]Error: Config directory not found: {config_dir}[/bold red]")
        return 1
    if not os.path.isdir(config_dir):
        console.print(f"[bold red]Error: {config_dir} is not a directory[/bold red]")
        return 1

    # Sorted so the run order is stable from one invocation to the next.
    config_files = sorted(
        os.path.join(config_dir, name)
        for name in os.listdir(config_dir)
        if name.endswith('.json')
    )
    if not config_files:
        console.print(f"[bold red]Error: No JSON config files found in {config_dir}[/bold red]")
        return 1

    console.print(f"\n[bold magenta]{'='*60}[/bold magenta]")
    console.print("[bold magenta]BATCH PROCESSING MODE[/bold magenta]")
    console.print(f"[bold magenta]Found {len(config_files)} configuration file(s)[/bold magenta]")
    console.print(f"[bold magenta]{'='*60}[/bold magenta]\n")
    for config_file in config_files:
        console.print(f"{os.path.basename(config_file)}")
    console.print()

    # A failing config does not stop the batch; failures are tallied for the summary.
    successful = 0
    failed = 0
    start_time = time.time()
    for idx, config_file in enumerate(config_files, 1):
        console.print(f"\n[bold yellow]>>> Processing {idx}/{len(config_files)}[/bold yellow]")
        if process_single_config(config_file, args):
            successful += 1
        else:
            failed += 1
            console.print(f"[bold red]Failed to process {os.path.basename(config_file)}[/bold red]")

    elapsed_time = time.time() - start_time
    console.print(f"\n[bold magenta]{'='*60}[/bold magenta]")
    console.print("[bold magenta]BATCH PROCESSING COMPLETE[/bold magenta]")
    console.print(f"[bold magenta]{'='*60}[/bold magenta]")
    console.print(f"[bold green]✓ Successful: {successful}[/bold green]")
    if failed > 0:
        console.print(f"[bold red]✗ Failed: {failed}[/bold red]")
    console.print(f"[cyan]Total time: {elapsed_time/60:.1f} minutes[/cyan]")
    console.print()
    return 0 if failed == 0 else 1


def _run_from_cli_args(args):
    """Run one test session configured purely from command-line flags.

    Exits with status 1 when the API key, assistant ID, document source or
    questions file is missing, and with status 0 after question generation
    when ``--generate-only`` is set.
    """
    api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
    assistant_id = args.assistant_id
    document_path = args.document
    documents = args.documents
    questions_file = args.questions_file
    verbose = args.verbose
    # An unset (or zero) --batch-size is forwarded as None, as before.
    batch_size = args.batch_size or None

    # Each run gets its own timestamped folder so earlier results are never overwritten.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = os.path.join(args.output_dir, f"test_{timestamp}")

    # Check for required parameters
    if not api_key:
        console.print("[bold red]Error: No OpenAI API key provided.[/bold red]")
        console.print("Please provide an API key using --api-key, config file, or set the OPENAI_API_KEY environment variable.")
        sys.exit(1)
    if not assistant_id:
        console.print("[bold red]Error: No assistant ID provided.[/bold red]")
        console.print("Please provide an assistant ID using --assistant-id or in the config file.")
        sys.exit(1)
    if not documents and not document_path:
        console.print("[bold red]Error: No documents provided.[/bold red]")
        console.print("Please provide documents using --documents, --document, or in the config file.")
        sys.exit(1)

    # A missing entry in --documents is only a warning; a missing --document is fatal.
    if documents:
        for doc in documents:
            if not os.path.exists(doc):
                console.print(f"[bold red]Warning: Document not found: {doc}[/bold red]")
    elif document_path and not os.path.exists(document_path):
        console.print(f"[bold red]Error: Document or directory not found: {document_path}[/bold red]")
        sys.exit(1)

    # Print startup banner
    console.print("\n[bold green]╔══════════════════════════════════════════════╗[/bold green]")
    console.print("[bold green]║ RAG Testing Application ║[/bold green]")
    console.print("[bold green]╚══════════════════════════════════════════════╝[/bold green]\n")
    if verbose:
        console.print("[bold yellow]Verbose mode enabled[/bold yellow]")

    tester = RAGTester(
        api_key=api_key,
        assistant_id=assistant_id,
        document_paths=documents,
        # --documents takes precedence over --document
        document_path=None if documents else document_path,
        output_dir=output_dir,
        verbose=verbose,
        model=args.model,
        # Previously omitted here, so --prompt-type had no effect in this mode.
        prompt_type=args.prompt_type
    )

    # Load or generate questions
    if questions_file:
        if not os.path.exists(questions_file):
            console.print(f"[bold red]Error: Questions file not found: {questions_file}[/bold red]")
            sys.exit(1)
        console.print(f"Loading questions from {questions_file}")
        tester.load_questions_from_file(questions_file)
    else:
        console.print(f"Generating {args.num_questions} test questions")
        tester.generate_test_questions(args.num_questions)

    if args.generate_only:
        console.print("[green]Questions generated and saved. Exiting.[/green]")
        sys.exit(0)

    # Run tests with parallelization, then evaluate and report
    tester.run_tests(iterations=args.iterations, max_workers=args.parallel, batch_size=batch_size)
    tester.evaluate_results()
    tester.generate_report()
    console.print(f"[bold green]Testing complete! Results saved to {output_dir}[/bold green]")


def main():
    """Main entry point for the RAG Testing CLI

    Modes are checked in this order and the first match wins:

    1. ``--create-config``: write a config template and exit.
    2. ``--config-dir``: run every JSON config in the directory and exit.
    3. ``--config``: run a single JSON config and exit.
    4. Otherwise: run one session from the plain command-line flags.
    """
    args = _build_arg_parser().parse_args()

    if args.create_config:
        create_config_template(args.create_config)
        sys.exit(0)

    if args.config_dir:
        sys.exit(_run_config_batch(args))

    if args.config:
        if not os.path.exists(args.config):
            console.print(f"[bold red]Error: Config file not found: {args.config}[/bold red]")
            sys.exit(1)
        success = process_single_config(args.config, args)
        sys.exit(0 if success else 1)

    _run_from_cli_args(args)
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

1326
rag_test_app/main.py Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,12 @@
openai>=1.12.0
pandas>=2.0.0
numpy>=1.24.0
langchain>=0.1.0
scikit-learn>=1.2.0
tiktoken>=0.5.0
sentence-transformers>=2.2.0
pytest>=7.0.0
matplotlib>=3.7.0
rich>=13.0.0
docx2txt>=0.8
openpyxl>=3.1.0

Binary file not shown.

31
rag_test_app/setup.py Normal file
View file

@ -0,0 +1,31 @@
"""Packaging script for the RAG test app; installs the ``rag-test`` console command."""
from pathlib import Path

from setuptools import setup, find_packages

# Resolve data files relative to this script so a build works from any working directory.
HERE = Path(__file__).resolve().parent

long_description = (HERE / "README.md").read_text(encoding="utf-8")

# Drop blank lines and comments so they are not passed on as requirement specifiers.
requirements = [
    line.strip()
    for line in (HERE / "requirements.txt").read_text(encoding="utf-8").splitlines()
    if line.strip() and not line.strip().startswith("#")
]

setup(
    name="rag-test-app",
    version="0.1.0",
    author="Your Name",
    author_email="your.email@example.com",
    description="A tool for testing OpenAI assistants with RAG capabilities",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/yourusername/rag-test-app",
    packages=find_packages(),
    # find_packages() only discovers packages (directories with __init__.py).
    # The "rag-test" entry point imports the top-level module ``cli``, so the
    # single-file modules must be listed explicitly to be installed.
    # NOTE(review): assumes cli.py and main.py sit next to this setup.py - confirm.
    py_modules=["cli", "main"],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.8",
    install_requires=requirements,
    entry_points={
        "console_scripts": [
            "rag-test=cli:main",
        ],
    },
)

View file

@ -0,0 +1,124 @@
import os
import sys
import json
import pytest
from unittest.mock import patch, MagicMock
# Add parent directory to path
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from main import RAGTester
# Sample test data
# Text written to the temporary document that the tester fixture points RAGTester at.
SAMPLE_DOCUMENT = "This is a test document about machine learning."
# Questions returned (as JSON) by the mocked chat completion and used as the test question set.
SAMPLE_QUESTIONS = ["What is machine learning?", "Explain supervised learning."]
# The first entry is the text of the mocked assistant reply.
SAMPLE_RESPONSES = ["Machine learning is...", "Supervised learning involves..."]
class TestRAGTester:
    """Tests for RAGTester, run with ``main.OpenAI`` patched out by a MagicMock client."""

    @pytest.fixture
    def mock_openai(self):
        """Patch ``main.OpenAI`` and yield the stub client that it returns."""
        with patch('main.OpenAI') as openai_factory:
            client = MagicMock()
            openai_factory.return_value = client

            # chat.completions.create: one choice whose message holds the sample questions as JSON
            generated = MagicMock(content=json.dumps({"questions": SAMPLE_QUESTIONS}))
            client.chat.completions.create.return_value = MagicMock(
                choices=[MagicMock(message=generated)]
            )

            # beta.threads.create: a thread with a fixed id
            client.beta.threads.create.return_value = MagicMock(id="thread_123")

            # beta.threads.messages.list: a single assistant message carrying the first sample response
            reply_part = MagicMock(text=MagicMock(value=SAMPLE_RESPONSES[0]))
            reply = MagicMock(role="assistant", content=[reply_part])
            client.beta.threads.messages.list.return_value = MagicMock(data=[reply])

            # beta.threads.runs: create and retrieve both return the same already-completed run
            finished_run = MagicMock(id="run_123", status="completed")
            client.beta.threads.runs.create.return_value = finished_run
            client.beta.threads.runs.retrieve.return_value = finished_run

            yield client

    @pytest.fixture
    def tester(self, tmp_path, mock_openai):
        """Return a RAGTester wired to a temporary document and results directory."""
        source_doc = tmp_path / "test_doc.txt"
        source_doc.write_text(SAMPLE_DOCUMENT)

        results_dir = tmp_path / "results"
        results_dir.mkdir()

        return RAGTester(
            api_key="sk-test",
            assistant_id="asst_123",
            document_path=str(source_doc),
            output_dir=str(results_dir),
        )

    def test_initialization(self, tester):
        """A new tester keeps its credentials, has loaded the document, and holds no questions or results"""
        assert tester.api_key == "sk-test"
        assert tester.assistant_id == "asst_123"
        assert tester.document_content == SAMPLE_DOCUMENT
        assert len(tester.questions) == 0
        assert len(tester.results) == 0

    def test_generate_questions(self, tester, mock_openai):
        """Generated questions are returned and also written to test_questions.json"""
        generated = tester.generate_test_questions(num_questions=2)

        # Returned value comes from the mocked chat completion
        assert len(generated) == 2
        assert generated == SAMPLE_QUESTIONS
        assert mock_openai.chat.completions.create.called

        # The same questions must have been saved in the output directory
        saved_path = os.path.join(tester.output_dir, "test_questions.json")
        assert os.path.exists(saved_path)
        with open(saved_path, 'r') as handle:
            on_disk = json.load(handle)
        assert on_disk["questions"] == SAMPLE_QUESTIONS

    def test_run_tests(self, tester, mock_openai):
        """One iteration over two preset questions yields two results and calls the threads API"""
        # Use the sample questions directly instead of generating them
        tester.questions = SAMPLE_QUESTIONS

        outcome = tester.run_tests(iterations=1)

        # 2 questions x 1 iteration
        assert len(outcome) == 2
        first = outcome[0]
        assert first["question"] == SAMPLE_QUESTIONS[0]
        assert first["response"] == SAMPLE_RESPONSES[0]
        assert "response_time" in first

        # Every mocked threads endpoint should have been called
        threads_api = mock_openai.beta.threads
        assert threads_api.create.called
        assert threads_api.messages.create.called
        assert threads_api.runs.create.called
        assert threads_api.runs.retrieve.called
        assert threads_api.messages.list.called