diff --git a/.gitignore b/.gitignore index b24d71e..6643a86 100644 --- a/.gitignore +++ b/.gitignore @@ -1,46 +1,35 @@ -# These are some examples of commonly ignored file patterns. -# You should customize this list as applicable to your project. -# Learn more about .gitignore: -# https://www.atlassian.com/git/tutorials/saving-changes/gitignore +# Python +__pycache__/ +*.py[cod] +.venv/ +venv/ +*.egg-info/ -# Node artifact files +# Test results (large, generated) +rag_test_app/results/*/ + +# Node node_modules/ dist/ -# Compiled Java class files +# Build artifacts *.class - -# Compiled Python bytecode -*.py[cod] - -# Log files -*.log - -# Package files *.jar - -# Maven target/ -dist/ -# JetBrains IDE +# IDEs .idea/ -# Unit test reports -TEST*.xml - -# Generated by MacOS +# macOS .DS_Store -# Generated by Windows +# Windows Thumbs.db -# Applications -*.app -*.exe -*.war +# Logs +*.log -# Large media files +# Large media *.mp4 *.tiff *.avi @@ -48,3 +37,8 @@ Thumbs.db *.mov *.wmv +# Claude Code local settings +.claude/ + +# Environment +.env diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..b38bc08 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,65 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## What this repo is + +A standalone Python CLI tool (`rag_test_app/`) that benchmarks OpenAI Assistants with RAG knowledge bases. Built for Barclays — tests banner-creation assistants against brand/compliance documents and produces scored HTML reports. + +The `TEST_TO_RUN/` directory contains pre-built Barclays test configs (JSON) + document ZIPs for four campaign types: Internal Banners, Social Posts, Display Banners, PPC. 
+ +## Commands + +All commands run from `rag_test_app/`: + +```bash +cd rag_test_app + +# Install dependencies +pip install -r requirements.txt + +# Create a config template +python cli.py --create-config my_config.json + +# Run a single test config +python cli.py --config my_config.json + +# Batch-run all configs in a directory +python cli.py --config-dir ../TEST_TO_RUN/ + +# Generate questions only (no API calls to assistant) +python cli.py --config my_config.json --generate-only + +# Reuse previously generated questions +python cli.py --config my_config.json --questions-file results/*/test_questions.json + +# Run tests +pytest tests/ +pytest tests/test_main.py::TestRAGTester::test_generate_questions +``` + +Required env var: `OPENAI_API_KEY` (alternatively set `api_key` in the JSON config, though env var is preferred). + +## Architecture + +Two files, one class: + +**`main.py` — `RAGTester`** + +1. `__init__`: Loads source documents (`.txt` or `.docx` via `docx2txt`). Concatenates multiple docs with `--- Document: filename ---` separators into a single `document_content` string. +2. `generate_test_questions()`: Sends the first 3000 chars of `document_content` to GPT-4o with one of three prompt templates (`task-based` / `content-based` / `scenario-based`). Returns parsed JSON question list. Falls back to `gpt-3.5-turbo` if JSON response format fails. +3. `run_tests()`: Runs every question × every iteration against the OpenAI Assistants API (`beta.threads`) using `ThreadPoolExecutor`. Saves partial results every 5 completions to avoid data loss. +4. `evaluate_results()`: Groups results by question, calls GPT-4o to score each group on quality / consistency / accuracy / completeness (1–10). Falls back on parse failure to default score of 5. +5. `generate_report()`: Produces `report.html` + four matplotlib PNGs (`scores_by_question`, `response_times`, `score_distribution`, `radar_chart`) all saved into the timestamped output dir. 
+ +**`cli.py` — CLI wrapper** + +Handles argument parsing, config file merging (CLI args override config file values), and batch mode (`--config-dir`). Creates timestamped output dirs: `{output_dir}/{config_name}_{YYYYMMDD_HHMMSS}/`. + +## Key behaviours to know + +- **`batch_size` vs `parallel`**: `batch_size` controls memory; `parallel` controls concurrency. Recommended: `batch_size = 2-3× parallel`. High parallelism hits OpenAI rate limits — start at `parallel: 5`. +- **Output never overwrites**: every run creates a new timestamped directory. +- **Evaluation is expensive**: GPT-4o is called once per question for evaluation, in addition to the assistant calls. A 20-question × 3-iteration run costs at least 81 API calls (60 assistant runs + 20 evaluation calls + 1 question-generation call). +- **Document truncation in prompts**: question generation only sends the first 3000 characters of `document_content` to GPT, regardless of total document size. The full content is not summarised or chunked further. +- **Supported document formats**: `.txt` and `.docx` only. PDF/XLS/PPT are silently skipped. 
diff --git a/TEST_TO_RUN/Barclays_Internal_Banners.json b/TEST_TO_RUN/Barclays_Internal_Banners.json new file mode 100644 index 0000000..5d6f9d8 --- /dev/null +++ b/TEST_TO_RUN/Barclays_Internal_Banners.json @@ -0,0 +1,19 @@ +{ + "assistant_id": "asst_vlFx0Uud1BKtp7j77Vp0pi8H", + "documents": [ + "/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Consumer Duty summary.docx", + "/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Consumer Understanding.docx", + "/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Internal Banner Examples 23102024 -Markdown.docx", + "/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Our Regulators and developing marketing material Chatai.docx" + ], + "api_key": "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", + "output_dir": "results", + "num_questions": 20, + "iterations": 3, + "questions_file": "", + "generate_only": false, + "verbose": true, + "model": "gpt-4o", + "parallel": 10, + "batch_size": 20 +} \ No newline at end of file diff --git a/TEST_TO_RUN/Barclays_Internal_Banners.zip b/TEST_TO_RUN/Barclays_Internal_Banners.zip new file mode 100644 index 0000000..3b6cd85 Binary files /dev/null and b/TEST_TO_RUN/Barclays_Internal_Banners.zip differ diff --git a/TEST_TO_RUN/Barclays_Social_Posts.json b/TEST_TO_RUN/Barclays_Social_Posts.json new file mode 100644 index 0000000..90a0ee1 --- /dev/null +++ b/TEST_TO_RUN/Barclays_Social_Posts.json @@ -0,0 +1,19 @@ +{ + "assistant_id": "asst_MT0qKXI57m8Y2RVllqwFUqBe", + "documents": [ + "/Users/daveporter/Python-Enviroments/RAG-TEST-BAIC/DIsplay-Banner-Docs/BUK Social Media Playbook.docx", + "/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Consumer 
Understanding.docx", + "/Users/daveporter/Python-Enviroments/RAG-TEST-BAIC/DIsplay-Banner-Docs/OMD Barclays Social 101.docx", + "/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Our Regulators and developing marketing material Chatai.docx" + ], + "api_key": "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", + "output_dir": "results", + "num_questions": 20, + "iterations": 3, + "questions_file": "", + "generate_only": false, + "verbose": true, + "model": "gpt-4o", + "parallel": 10, + "batch_size": 20 +} \ No newline at end of file diff --git a/TEST_TO_RUN/Barclays_Social_Posts.zip b/TEST_TO_RUN/Barclays_Social_Posts.zip new file mode 100644 index 0000000..6036fdb Binary files /dev/null and b/TEST_TO_RUN/Barclays_Social_Posts.zip differ diff --git a/TEST_TO_RUN/Display_banners.json b/TEST_TO_RUN/Display_banners.json new file mode 100644 index 0000000..817ed6b --- /dev/null +++ b/TEST_TO_RUN/Display_banners.json @@ -0,0 +1,18 @@ +{ + "assistant_id": "asst_eAsIXFpSGiy7jQzyF8p0IRDA", + "documents": [ + "/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Consumer Duty summary.docx", + "/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Consumer Understanding.docx", + "/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Our Regulators and developing marketing material Chatai.docx" + ], + "api_key": "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", + "output_dir": "results", + "num_questions": 20, + "iterations": 3, + "questions_file": "", + "generate_only": false, + "verbose": true, + "model": "gpt-4o", + "parallel": 10, + "batch_size": 20 +} \ No newline at end of file 
diff --git a/TEST_TO_RUN/Display_banners.zip b/TEST_TO_RUN/Display_banners.zip new file mode 100644 index 0000000..f083d22 Binary files /dev/null and b/TEST_TO_RUN/Display_banners.zip differ diff --git a/TEST_TO_RUN/PPC.json b/TEST_TO_RUN/PPC.json new file mode 100644 index 0000000..4120984 --- /dev/null +++ b/TEST_TO_RUN/PPC.json @@ -0,0 +1,16 @@ +{ + "assistant_id": "asst_Pz7uhnK7aOoYykl7KalyirY9", + "documents": [ + "/Users/daveporter/Python-Enviroments/ASSISTANT-TEST-BAIC/TESTFILES-BANNER_ASSISTANT/RAG Docs/Our Regulators and developing marketing material Chatai.docx" + ], + "api_key": "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", + "output_dir": "results", + "num_questions": 20, + "iterations": 3, + "questions_file": "", + "generate_only": false, + "verbose": true, + "model": "gpt-4o", + "parallel": 10, + "batch_size": 20 +} \ No newline at end of file diff --git a/TEST_TO_RUN/PPC.zip b/TEST_TO_RUN/PPC.zip new file mode 100644 index 0000000..43a2cc4 Binary files /dev/null and b/TEST_TO_RUN/PPC.zip differ diff --git a/rag_test_app/README.md b/rag_test_app/README.md new file mode 100644 index 0000000..df93f8c --- /dev/null +++ b/rag_test_app/README.md @@ -0,0 +1,794 @@ +# RAG Testing Application + +A comprehensive Python application for automatically testing and evaluating OpenAI assistants with Retrieval-Augmented Generation (RAG) capabilities. 
+ +## Table of Contents + +- [Overview](#overview) +- [Features](#features) +- [Installation](#installation) +- [Quick Start Guide](#quick-start-guide) +- [Complete User Guide](#complete-user-guide) + - [Configuration File Reference](#configuration-file-reference) + - [Prompt Types](#prompt-types) + - [Batch Processing](#batch-processing) + - [Output Directory Structure](#output-directory-structure) +- [Command Line Reference](#command-line-reference) +- [Advanced Usage Examples](#advanced-usage-examples) +- [Understanding the Results](#understanding-the-results) +- [Troubleshooting](#troubleshooting) + +## Overview + +This tool helps you evaluate and benchmark OpenAI assistants by: + +1. **Generating test prompts** from your source documents (with multiple prompt styles) +2. **Running prompts** against your assistant multiple times to test consistency +3. **Evaluating responses** for quality, consistency, accuracy, and completeness +4. **Generating detailed reports** with visualizations and metrics + +**Perfect for:** Testing assistants that create content (banners, copy, documents) using RAG knowledge bases. + +## Features + +- ✅ **Multi-document support**: Test with individual documents, directories, or specified sets of documents +- ✅ **Multiple prompt types**: Generate realistic user tasks, knowledge questions, or business scenarios +- ✅ **Batch processing**: Run multiple test configurations in sequence automatically +- ✅ **Timestamped results**: Each test run creates a unique timestamped directory - no more overwriting! 
+- ✅ **Support for DOCX files**: Works with both text and Microsoft Word files +- ✅ **Optimized performance**: Parallel processing and batch execution for significantly faster testing +- ✅ **Comprehensive evaluation**: Assesses responses for quality, accuracy, consistency, and completeness +- ✅ **Interactive reporting**: Generates professional HTML reports with detailed visualizations +- ✅ **Performance tracking**: Measures and analyzes response times and other key metrics +- ✅ **Data export**: Saves all results as JSON for further analysis +- ✅ **Config-based workflow**: Easy to set up and customize via configuration files + +## Installation + +1. Clone this repository: +```bash +git clone https://github.com/yourusername/rag-test-app.git +cd rag-test-app +``` + +2. Install the required packages: +```bash +pip install -r requirements.txt +``` + +3. Set up your OpenAI API key: +```bash +export OPENAI_API_KEY="your-api-key-here" +``` + +## Quick Start Guide + +### 1. Create a Configuration File + +The easiest way to get started: + +```bash +python cli.py --create-config my_test_config.json +``` + +This creates a template configuration file. + +### 2. Edit Your Configuration + +Open `my_test_config.json` and update: +- `assistant_id`: Your OpenAI Assistant ID (e.g., "asst_abc123...") +- `documents`: Paths to your RAG documents +- `api_key`: Your OpenAI API key (or use environment variable) + +### 3. Run Your First Test + +```bash +python cli.py --config my_test_config.json +``` + +### 4. View Results + +Open the generated `results/your_test_YYYYMMDD_HHMMSS/report.html` in your browser! 
+ +## Complete User Guide + +### Configuration File Reference + +Here's a complete configuration file with ALL available options: + +```json +{ + "assistant_id": "asst_YourAssistantIdHere", + "documents": [ + "/path/to/your/document1.txt", + "/path/to/your/document2.docx" + ], + "api_key": "YOUR_OPENAI_API_KEY", + "output_dir": "results", + "num_questions": 20, + "iterations": 3, + "questions_file": "", + "generate_only": false, + "verbose": true, + "model": "gpt-4o", + "prompt_type": "task-based", + "parallel": 10, + "batch_size": 30 +} +``` + +#### Configuration Options Explained + +| Option | Type | Default | Description | +|--------|------|---------|-------------| +| **`assistant_id`** | string | (required) | Your OpenAI Assistant ID starting with "asst_" | +| **`documents`** | array | null | List of document file paths (preferred method) | +| **`document`** | string | null | Single document or directory path (alternative to `documents`) | +| **`api_key`** | string | env var | OpenAI API key (can also use `OPENAI_API_KEY` environment variable) | +| **`output_dir`** | string | "results" | Base directory for saving results | +| **`num_questions`** | integer | 20 | Number of test prompts to generate | +| **`iterations`** | integer | 3 | How many times to test each prompt (for consistency checking) | +| **`questions_file`** | string | null | Path to pre-generated questions JSON file | +| **`generate_only`** | boolean | false | Only generate questions, don't run tests | +| **`verbose`** | boolean | false | Enable detailed logging for debugging | +| **`model`** | string | "gpt-4o" | GPT model for question generation and evaluation | +| **`prompt_type`** | string | "task-based" | Type of prompts to generate (see below) | +| **`parallel`** | integer | 5 | Number of parallel workers for running tests | +| **`batch_size`** | integer | same as parallel | Questions per batch (set to 2-3x parallel for best performance) | + +### Prompt Types + +**NEW FEATURE:** Choose how 
test prompts are generated to match your testing needs. + +#### 1. **`task-based`** (Default - Recommended) + +Generates realistic user task requests that emulate how real users interact with your assistant. + +**Best for:** Testing assistants that create content (banners, copy, ads, documents) + +**Example prompts:** +- "Create a banner for our new credit card offer with 0% APR" +- "Write copy for a savings account promotion targeting young professionals" +- "Generate headlines for our mobile banking app launch" +- "Design promotional text for a balance transfer campaign" + +**When to use:** +- Testing content creation assistants +- Simulating real user interactions +- Evaluating practical usability + +**Configuration:** +```json +{ + "prompt_type": "task-based" +} +``` + +**Command line:** +```bash +python cli.py --config myconfig.json --prompt-type task-based +``` + +#### 2. **`content-based`** (Original) + +Generates knowledge questions about the content in your RAG documents. + +**Best for:** Testing document understanding and knowledge retrieval + +**Example prompts:** +- "What is the FCA Consumer Duty requirement?" +- "Explain the principles of clear customer communication" +- "What are the considerations for vulnerable customers?" +- "List the regulatory guidelines for financial advertising" + +**When to use:** +- Verifying RAG knowledge accuracy +- Testing document comprehension +- Auditing information retrieval + +**Configuration:** +```json +{ + "prompt_type": "content-based" +} +``` + +**Command line:** +```bash +python cli.py --config myconfig.json --prompt-type content-based +``` + +#### 3. **`scenario-based`** + +Generates realistic business scenarios that combine tasks with context and requirements. + +**Best for:** Testing complex real-world use cases with constraints + +**Example prompts:** +- "We're launching a new credit card for students. 
Create FCA-compliant banner copy that's clear and accessible" +- "Our vulnerable customer initiative needs promotional materials. Write banner text that follows Consumer Duty guidelines" +- "Create an internal banner for our mobile banking upgrade targeting existing customers aged 50+" +- "We have a new savings product for first-time buyers. Generate compliant promotional copy" + +**When to use:** +- Testing compliance and constraints +- Simulating real business workflows +- Evaluating context handling + +**Configuration:** +```json +{ + "prompt_type": "scenario-based" +} +``` + +**Command line:** +```bash +python cli.py --config myconfig.json --prompt-type scenario-based +``` + +#### Comparing Prompt Types + +| Prompt Type | Use Case | Complexity | Best For | +|-------------|----------|------------|----------| +| **task-based** | Simple user requests | Low | Daily user interactions | +| **content-based** | Knowledge questions | Medium | RAG accuracy testing | +| **scenario-based** | Business scenarios | High | Real-world workflows | + +### Batch Processing + +**NEW FEATURE:** Run multiple test configurations automatically in sequence. + +Instead of running tests one at a time, point to a directory containing multiple config files and run them all at once! + +#### Setting Up Batch Tests + +1. **Create a directory with multiple configs:** +```bash +mkdir my_test_suite +``` + +2. **Add multiple configuration files:** +``` +my_test_suite/ +├── test_credit_cards.json +├── test_savings.json +├── test_loans.json +└── test_mobile_banking.json +``` + +3. 
**Run all tests:** +```bash +python cli.py --config-dir my_test_suite +``` + +#### What Happens + +``` +============================================================ +BATCH PROCESSING MODE +Found 4 configuration file(s) +============================================================ + • test_credit_cards.json + • test_savings.json + • test_loans.json + • test_mobile_banking.json + +>>> Processing 1/4 +============================================================ +Processing config: test_credit_cards.json +============================================================ +[Running tests...] + +>>> Processing 2/4 +============================================================ +Processing config: test_savings.json +============================================================ +[Running tests...] + +... and so on ... + +============================================================ +BATCH PROCESSING COMPLETE +============================================================ +✓ Successful: 4 +Total time: 45.2 minutes +``` + +#### Batch Processing Benefits + +- ✅ Run comprehensive test suites overnight +- ✅ Compare results across different assistants +- ✅ Test multiple prompt types automatically +- ✅ Automated CI/CD testing pipelines +- ✅ Progress tracking and error reporting + +#### Command Line Options with Batch + +You can override settings for all configs: + +```bash +# Run all configs but use content-based prompts +python cli.py --config-dir my_test_suite --prompt-type content-based + +# Run with higher parallelization +python cli.py --config-dir my_test_suite --parallel 15 --batch-size 45 + +# Generate questions only (no testing) +python cli.py --config-dir my_test_suite --generate-only +``` + +### Output Directory Structure + +**NEW FEATURE:** Each test run creates a unique timestamped directory - no more overwriting! 
+ +#### Directory Naming + +Results are saved as: +``` +{output_dir}/{config_name}_{timestamp}/ +``` + +**Example:** +``` +results/ +├── test_credit_cards_20251112_143022/ +│ ├── report.html +│ ├── test_results.json +│ ├── evaluation.json +│ ├── test_questions.json +│ └── *.png (charts) +├── test_credit_cards_20251112_153045/ +│ ├── report.html +│ └── ... +└── test_savings_20251112_160112/ + ├── report.html + └── ... +``` + +#### Benefits + +- ✅ **Never lose results** - each run is preserved +- ✅ **Easy comparison** - compare results across test runs +- ✅ **Audit trail** - complete history of all tests +- ✅ **Organized** - group results by test name and time + +#### Customizing Output Location + +**In config file:** +```json +{ + "output_dir": "my_results" +} +``` + +**Command line:** +```bash +python cli.py --config myconfig.json --output-dir my_results +``` + +Results will be saved to: +``` +my_results/{config_name}_{timestamp}/ +``` + +## Command Line Reference + +### Full Command Syntax + +``` +usage: cli.py [-h] [--config CONFIG] [--config-dir CONFIG_DIR] + [--create-config OUTPUT_PATH] [--api-key API_KEY] + [--assistant-id ASSISTANT_ID] [--document DOCUMENT] + [--documents DOCUMENTS [DOCUMENTS ...]] [--output-dir OUTPUT_DIR] + [--num-questions NUM_QUESTIONS] [--iterations ITERATIONS] + [--questions-file QUESTIONS_FILE] [--generate-only] [--verbose] + [--model MODEL] [--prompt-type {task-based,content-based,scenario-based}] + [--parallel PARALLEL] [--batch-size BATCH_SIZE] +``` + +### Common Commands + +```bash +# Get help +python cli.py --help + +# Create a config template +python cli.py --create-config my_config.json + +# Run single test with config file +python cli.py --config my_config.json + +# Run batch tests +python cli.py --config-dir my_test_suite/ + +# Run without config (all command line) +python cli.py --assistant-id asst_abc123 --document myfile.txt + +# Generate questions only +python cli.py --config my_config.json --generate-only + +# Use 
pre-generated questions +python cli.py --config my_config.json --questions-file results/test_questions.json + +# Change prompt type +python cli.py --config my_config.json --prompt-type scenario-based + +# High performance mode +python cli.py --config my_config.json --parallel 15 --batch-size 45 +``` + +## Advanced Usage Examples + +### Example 1: Complete Testing Workflow + +```bash +# Step 1: Create config +python cli.py --create-config banner_test.json + +# Step 2: Edit banner_test.json with your settings + +# Step 3: Generate questions first to review +python cli.py --config banner_test.json --generate-only --num-questions 50 + +# Step 4: Review generated questions in results/*/test_questions.json + +# Step 5: Run the full test +python cli.py --config banner_test.json --questions-file results/banner_test_*/test_questions.json + +# Step 6: Open report.html to view results +``` + +### Example 2: Testing Multiple Prompt Types + +```bash +# Create base config +cat > base_config.json << EOF +{ + "assistant_id": "asst_abc123", + "documents": ["docs/guidelines.docx"], + "num_questions": 30, + "iterations": 5 +} +EOF + +# Test with task-based prompts +python cli.py --config base_config.json --prompt-type task-based + +# Test with content-based prompts +python cli.py --config base_config.json --prompt-type content-based + +# Test with scenario-based prompts +python cli.py --config base_config.json --prompt-type scenario-based + +# Compare the three result directories! +``` + +### Example 3: High-Volume Testing + +```bash +# For testing with many questions and high parallelization +python cli.py --config my_config.json \ + --num-questions 100 \ + --iterations 10 \ + --parallel 20 \ + --batch-size 60 \ + --verbose +``` + +### Example 4: Continuous Integration + +```bash +#!/bin/bash +# run_tests.sh - Automated testing script + +# Set environment +export OPENAI_API_KEY="your-key" + +# Run test suite +python cli.py --config-dir test_configs/ + +# Check exit code +if [ $? 
-eq 0 ]; then + echo "All tests passed!" +else + echo "Some tests failed!" + exit 1 +fi +``` + +### Example 5: A/B Testing Different Assistants + +```json +// config_assistant_v1.json +{ + "assistant_id": "asst_v1_abc123", + "documents": ["docs/guidelines.docx"], + "questions_file": "shared_questions.json", + "num_questions": 50 +} + +// config_assistant_v2.json +{ + "assistant_id": "asst_v2_def456", + "documents": ["docs/guidelines.docx"], + "questions_file": "shared_questions.json", + "num_questions": 50 +} +``` + +```bash +# Generate questions once +python cli.py --config config_assistant_v1.json --generate-only + +# Test both assistants with same questions +python cli.py --config config_assistant_v1.json +python cli.py --config config_assistant_v2.json + +# Compare the results! +``` + +### Example 6: Multi-Document Testing + +```json +{ + "assistant_id": "asst_abc123", + "documents": [ + "/path/to/consumer_duty.docx", + "/path/to/fca_guidelines.docx", + "/path/to/brand_guidelines.txt", + "/path/to/product_specs.docx" + ], + "num_questions": 40, + "prompt_type": "scenario-based" +} +``` + +## Understanding the Results + +### HTML Report + +After tests complete, open `report.html` to see: + +#### 1. Summary Metrics +- Overall quality score +- Average consistency score +- Average accuracy score +- Average completeness score +- Average response time +- Total tests run + +#### 2. Performance Charts +- **Scores by Question**: Bar chart showing all metric scores for each question +- **Response Times**: How fast the assistant responds +- **Score Distribution**: Histogram of score ranges +- **Radar Chart**: Visual comparison of quality, consistency, accuracy, and completeness + +#### 3. 
Question-by-Question Analysis +For each test prompt: +- Question text +- Individual scores (quality, consistency, accuracy, completeness) +- Evaluation notes and feedback +- All response iterations (collapsible) + +### Evaluation Metrics + +Each response is scored 1-10 on four dimensions: + +#### Quality Score (1-10) +- Clarity and coherence +- Professional tone +- No hallucinations +- Grammar and readability + +#### Consistency Score (1-10) +- Similar answers across iterations +- Consistent facts and details +- No contradictions +- Stable level of detail + +#### Accuracy Score (1-10) +- Information matches documents +- Correct facts and numbers +- No misrepresentations +- Proper context interpretation + +#### Completeness Score (1-10) +- Addresses all aspects of the question +- Includes necessary context +- Sufficient detail +- No significant omissions + +### JSON Output Files + +#### `test_questions.json` +```json +{ + "questions": [ + "Create a banner for...", + "Write copy for...", + ... + ] +} +``` + +#### `test_results.json` +```json +{ + "results": [ + { + "question_id": 0, + "question": "Create a banner...", + "iteration": 0, + "response": "Here's your banner: ...", + "response_time": 2.34, + "status": "completed" + }, + ... + ] +} +``` + +#### `evaluation.json` +```json +{ + "summary": { + "total_questions": 20, + "total_iterations": 60, + "average_quality": 8.5, + "average_consistency": 9.2, + "average_accuracy": 8.8, + "average_completeness": 8.6, + "average_response_time": 2.1 + }, + "by_question": [...] +} +``` + +## Troubleshooting + +### Common Issues + +#### 1. Assistant Not Found Error +``` +ERROR: No assistant found with id 'asst_...' +``` +**Solution:** Check your assistant ID on https://platform.openai.com/assistants + +#### 2. API Rate Limits +``` +Error: Rate limit exceeded +``` +**Solution:** Reduce parallel workers: +```bash +python cli.py --config my_config.json --parallel 3 +``` + +#### 3. 
Document Loading Errors +``` +Warning: No content loaded from documents +``` +**Solutions:** +- Check file paths are correct +- For `.docx` files: `pip install docx2txt` +- Verify files are readable (not corrupted) +- Supported formats: `.txt`, `.docx` only + +#### 4. Memory Issues +``` +MemoryError: ... +``` +**Solution:** Reduce batch size: +```bash +python cli.py --config my_config.json --batch-size 10 +``` + +#### 5. Missing API Key +``` +Error: No OpenAI API key provided +``` +**Solutions:** +```bash +# Option 1: Environment variable +export OPENAI_API_KEY="your-key" + +# Option 2: In config file +{ + "api_key": "your-key" +} + +# Option 3: Command line +python cli.py --api-key "your-key" ... +``` + +### Debug Mode + +Enable verbose output for detailed logging: + +```bash +python cli.py --config my_config.json --verbose +``` + +Or in config: +```json +{ + "verbose": true +} +``` + +### Performance Tips + +1. **Optimize Parallelization** + - Start with `parallel: 5` + - Increase gradually if no rate limits + - Set `batch_size` to 2-3x `parallel` + +2. **Balance Speed vs. Cost** + - More parallel workers = faster but higher API costs + - More iterations = better consistency data but more tests + +3. **Question Generation** + - Generate questions once, reuse with `questions_file` + - Save API calls on repeated tests + +## Supported File Types + +- ✅ **Text files** (`.txt`): Plain text with UTF-8 encoding +- ✅ **Word documents** (`.docx`): Microsoft Word files (requires `docx2txt`) +- ❌ **PDF files**: Not currently supported +- ❌ **Excel/PowerPoint**: Not currently supported + +## Best Practices + +### 1. Start Small +```bash +# Test with few questions first +python cli.py --config my_config.json --num-questions 5 --iterations 2 +``` + +### 2. Use Configuration Files +- Easier to track and version +- Reusable across tests +- Less prone to typos + +### 3. 
Organize Your Tests +``` +my_project/ +├── configs/ +│ ├── test_suite_1/ +│ │ ├── credit_cards.json +│ │ └── loans.json +│ └── test_suite_2/ +│ └── mobile_banking.json +├── results/ +└── docs/ +``` + +### 4. Version Control Your Configs +```bash +git add configs/ +git commit -m "Add test configurations" +``` + +### 5. Archive Important Results +```bash +# Save important test results +cp -r results/important_test_20251112_143022 archived_results/ +``` + +## License + +MIT + +--- + +## Need Help? + +- 📖 **Documentation**: You're reading it! +- 🐛 **Issues**: Report bugs on GitHub +- 💡 **Feature Requests**: Open an issue with your idea +- 📧 **Contact**: [your-email@example.com] + +--- + +**Happy Testing! 🚀** diff --git a/rag_test_app/cli.py b/rag_test_app/cli.py new file mode 100644 index 0000000..6914db0 --- /dev/null +++ b/rag_test_app/cli.py @@ -0,0 +1,481 @@ +#!/usr/bin/env python3 +import argparse +import os +import sys +import json +import time +from datetime import datetime +from main import RAGTester +from rich.console import Console +from typing import Dict, Any, List + +console = Console() + +def load_config(config_path: str) -> Dict[str, Any]: + """ + Load configuration from a JSON file + """ + try: + with open(config_path, 'r') as f: + config = json.load(f) + console.print(f"[green]Loaded configuration from {config_path}[/green]") + return config + except Exception as e: + console.print(f"[bold red]Error loading config file: {str(e)}[/bold red]") + sys.exit(1) + +def process_single_config(config_path: str, args: argparse.Namespace) -> bool: + """ + Process a single configuration file. + Returns True if successful, False otherwise. 
def process_single_config(config_path: str, args: argparse.Namespace) -> bool:
    """
    Process a single configuration file.

    Loads the JSON config, merges it with the command-line arguments,
    validates the required settings, then generates (or loads) questions,
    runs the tests, evaluates them and writes the report.

    Args:
        config_path: Path to the JSON configuration file.
        args: Parsed command-line arguments. A command-line value overrides
            the config file only when it differs from the parser default.

    Returns:
        True if successful, False otherwise. Failures are printed, never raised.
    """
    console.print(f"\n[bold cyan]{'='*60}[/bold cyan]")
    console.print(f"[bold cyan]Processing config: {os.path.basename(config_path)}[/bold cyan]")
    console.print(f"[bold cyan]{'='*60}[/bold cyan]\n")

    try:
        # Load config file
        if not os.path.exists(config_path):
            console.print(f"[bold red]Error: Config file not found: {config_path}[/bold red]")
            return False
        try:
            config = load_config(config_path)
        except SystemExit:
            # load_config() reports the error and calls sys.exit(). SystemExit is not an
            # Exception subclass, so without this guard one malformed file would abort
            # an entire --config-dir batch instead of being counted as a failure.
            return False

        # Merge command-line arguments with config file (command-line takes precedence)
        api_key = args.api_key or config.get("api_key") or os.environ.get("OPENAI_API_KEY")
        assistant_id = args.assistant_id or config.get("assistant_id")
        document_path = args.document or config.get("document")
        documents = args.documents or config.get("documents")
        output_dir = args.output_dir if args.output_dir != "results" else config.get("output_dir", "results")
        num_questions = args.num_questions if args.num_questions != 20 else config.get("num_questions", 20)
        iterations = args.iterations if args.iterations != 3 else config.get("iterations", 3)
        questions_file = args.questions_file or config.get("questions_file")
        generate_only = args.generate_only or config.get("generate_only", False)
        verbose = args.verbose or config.get("verbose", False)
        model = args.model if args.model != "gpt-4o" else config.get("model", "gpt-4o")
        prompt_type = args.prompt_type if args.prompt_type != "task-based" else config.get("prompt_type", "task-based")
        parallel = args.parallel if args.parallel != 5 else config.get("parallel", 5)
        batch_size = args.batch_size or config.get("batch_size")

        # Create unique timestamped output directory
        config_name = os.path.splitext(os.path.basename(config_path))[0]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = os.path.join(output_dir, f"{config_name}_{timestamp}")

        # Check for required parameters
        if not api_key:
            console.print("[bold red]Error: No OpenAI API key provided.[/bold red]")
            console.print("Please provide an API key using --api-key, config file, or set the OPENAI_API_KEY environment variable.")
            return False

        if not assistant_id:
            console.print("[bold red]Error: No assistant ID provided.[/bold red]")
            console.print("Please provide an assistant ID using --assistant-id or in the config file.")
            return False

        # Check if at least one document source is provided
        if not documents and not document_path:
            console.print("[bold red]Error: No documents provided.[/bold red]")
            console.print("Please provide documents using --documents, --document, or in the config file.")
            return False

        # Check if document paths exist. A missing entry in a document list is only a
        # warning (the remaining documents are still used); a missing single document is fatal.
        if documents:
            for doc in documents:
                if not os.path.exists(doc):
                    console.print(f"[bold red]Warning: Document not found: {doc}[/bold red]")
        elif document_path and not os.path.exists(document_path):
            console.print(f"[bold red]Error: Document or directory not found: {document_path}[/bold red]")
            return False

        # Print startup banner
        console.print("\n[bold green]╔══════════════════════════════════════════════╗[/bold green]")
        console.print("[bold green]║ RAG Testing Application ║[/bold green]")
        console.print("[bold green]╚══════════════════════════════════════════════╝[/bold green]\n")

        if verbose:
            console.print("[bold yellow]Verbose mode enabled[/bold yellow]")

        # Create tester
        tester = RAGTester(
            api_key=api_key,
            assistant_id=assistant_id,
            document_paths=documents,
            document_path=None if documents else document_path,
            output_dir=output_dir,
            verbose=verbose,
            model=model,
            prompt_type=prompt_type
        )

        # Load or generate questions
        if questions_file:
            if not os.path.exists(questions_file):
                console.print(f"[bold red]Error: Questions file not found: {questions_file}[/bold red]")
                return False

            console.print(f"Loading questions from {questions_file}")
            tester.load_questions_from_file(questions_file)
        else:
            console.print(f"Generating {num_questions} test questions")
            tester.generate_test_questions(num_questions)

        # Exit if generate-only
        if generate_only:
            console.print("[green]Questions generated and saved. Exiting.[/green]")
            return True

        # Run tests with parallelization
        tester.run_tests(iterations=iterations, max_workers=parallel, batch_size=batch_size)

        # Evaluate results (the evaluation is kept on the tester; the return value is not needed here)
        tester.evaluate_results()

        # Generate report
        tester.generate_report()

        console.print(f"[bold green]Testing complete! Results saved to {output_dir}[/bold green]")
        return True

    except Exception as e:
        # Top-level boundary for one config: report with traceback and let the caller continue.
        console.print(f"[bold red]Error processing config {config_path}: {str(e)}[/bold red]")
        import traceback
        console.print(f"[red]{traceback.format_exc()}[/red]")
        return False
def create_config_template(output_path: str) -> None:
    """
    Create a configuration template file at the specified path.

    Args:
        output_path: Destination of the JSON template. Missing parent
            directories are created.

    Note:
        On an I/O failure an error is printed and the process exits with status 1.
    """
    template = {
        "assistant_id": "asst_YourAssistantIdHere",
        "documents": [
            "/path/to/your/document1.txt",
            "/path/to/your/document2.txt"
        ],
        # Left empty on purpose: the CLI resolves the key as
        # --api-key, then config "api_key", then OPENAI_API_KEY. A non-empty
        # placeholder here would shadow the environment variable.
        "api_key": "",
        "output_dir": "results",
        "num_questions": 20,
        "iterations": 3,
        "questions_file": "",  # Leave empty to generate new questions
        "generate_only": False,
        "verbose": True,
        "model": "gpt-4o",
        "prompt_type": "task-based",  # Options: "task-based", "content-based", "scenario-based"
        "parallel": 10,
        "batch_size": 30
    }

    try:
        # Ensure directory exists
        os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

        # Write the template with pretty formatting
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(template, f, indent=2)
    except OSError as e:
        console.print(f"[bold red]Error creating configuration template: {str(e)}[/bold red]")
        sys.exit(1)

    console.print(f"[green]Created configuration template at: {output_path}[/green]")
    console.print("[cyan]Edit this file with your settings and then run:[/cyan]")
    console.print(f"[cyan] python cli.py --config {output_path}[/cyan]")
def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the RAG Testing CLI."""
    parser = argparse.ArgumentParser(
        description="Test OpenAI assistants with RAG capabilities",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )

    # Add config group
    config_group = parser.add_argument_group('Configuration')
    config_group.add_argument(
        "--config",
        type=str,
        help="Path to a JSON configuration file"
    )
    config_group.add_argument(
        "--config-dir",
        type=str,
        help="Path to a directory containing multiple JSON configuration files. All configs will be run in sequence."
    )
    config_group.add_argument(
        "--create-config",
        type=str,
        metavar="OUTPUT_PATH",
        help="Create a configuration template file at the specified path and exit"
    )

    parser.add_argument(
        "--api-key",
        type=str,
        help="OpenAI API key. If not provided, will use OPENAI_API_KEY environment variable."
    )
    parser.add_argument(
        "--assistant-id",
        type=str,
        help="ID of the OpenAI assistant to test"
    )
    parser.add_argument(
        "--document",
        type=str,
        help="Path to the document or directory of documents to use for testing"
    )
    parser.add_argument(
        "--documents",
        nargs='+',
        help="List of document paths to use for testing (takes precedence over --document)"
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="results",
        help="Directory to save test results"
    )
    parser.add_argument(
        "--num-questions",
        type=int,
        default=20,
        help="Number of test questions to generate"
    )
    parser.add_argument(
        "--iterations",
        type=int,
        default=3,
        help="Number of times to test each question"
    )
    parser.add_argument(
        "--questions-file",
        type=str,
        help="Path to a JSON file with pre-generated questions"
    )
    parser.add_argument(
        "--generate-only",
        action="store_true",
        help="Only generate questions, don't run tests"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose output for debugging"
    )
    parser.add_argument(
        "--model",
        type=str,
        default="gpt-4o",
        help="OpenAI model to use for question generation and evaluation"
    )
    parser.add_argument(
        "--prompt-type",
        type=str,
        choices=["task-based", "content-based", "scenario-based"],
        default="task-based",
        help="Type of prompts to generate: 'task-based' (real user tasks like 'create a banner'), "
             "'content-based' (knowledge questions about documents), "
             "'scenario-based' (realistic business scenarios). Default: task-based"
    )
    parser.add_argument(
        "--parallel",
        type=int,
        default=5,
        help="Number of parallel workers for running tests (default: 5)"
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        help="Number of questions to process in a batch (defaults to same as --parallel if not specified)"
    )
    return parser


def _run_config_dir(args: argparse.Namespace) -> None:
    """Run every *.json config found in args.config_dir in sorted order, then exit (0 only if all succeeded)."""
    if not os.path.exists(args.config_dir):
        console.print(f"[bold red]Error: Config directory not found: {args.config_dir}[/bold red]")
        sys.exit(1)

    if not os.path.isdir(args.config_dir):
        console.print(f"[bold red]Error: {args.config_dir} is not a directory[/bold red]")
        sys.exit(1)

    # Find all JSON config files in the directory
    config_files = sorted(
        os.path.join(args.config_dir, f)
        for f in os.listdir(args.config_dir)
        if f.endswith('.json')
    )

    if not config_files:
        console.print(f"[bold red]Error: No JSON config files found in {args.config_dir}[/bold red]")
        sys.exit(1)

    console.print(f"\n[bold magenta]{'='*60}[/bold magenta]")
    console.print("[bold magenta]BATCH PROCESSING MODE[/bold magenta]")
    console.print(f"[bold magenta]Found {len(config_files)} configuration file(s)[/bold magenta]")
    console.print(f"[bold magenta]{'='*60}[/bold magenta]\n")

    for config_file in config_files:
        console.print(f" • {os.path.basename(config_file)}")

    console.print()

    # Process each config file
    successful = 0
    failed = 0
    start_time = time.time()

    for idx, config_file in enumerate(config_files, 1):
        console.print(f"\n[bold yellow]>>> Processing {idx}/{len(config_files)}[/bold yellow]")
        if process_single_config(config_file, args):
            successful += 1
        else:
            failed += 1
            console.print(f"[bold red]Failed to process {os.path.basename(config_file)}[/bold red]")

    # Print summary
    elapsed_time = time.time() - start_time
    console.print(f"\n[bold magenta]{'='*60}[/bold magenta]")
    console.print("[bold magenta]BATCH PROCESSING COMPLETE[/bold magenta]")
    console.print(f"[bold magenta]{'='*60}[/bold magenta]")
    console.print(f"[bold green]✓ Successful: {successful}[/bold green]")
    if failed > 0:
        console.print(f"[bold red]✗ Failed: {failed}[/bold red]")
    console.print(f"[cyan]Total time: {elapsed_time/60:.1f} minutes[/cyan]")
    console.print()

    sys.exit(0 if failed == 0 else 1)


def _run_from_cli_args(args: argparse.Namespace) -> None:
    """Run one test session configured purely from command-line arguments (no config file)."""
    api_key = args.api_key or os.environ.get("OPENAI_API_KEY")
    assistant_id = args.assistant_id
    document_path = args.document
    documents = args.documents
    questions_file = args.questions_file
    verbose = args.verbose
    # A --batch-size of 0 is treated as "not specified" (RAGTester then uses the worker count).
    batch_size = args.batch_size or None

    # Create unique timestamped output directory; there is no config file to name it after.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = os.path.join(args.output_dir, f"test_{timestamp}")

    # Check for required parameters
    if not api_key:
        console.print("[bold red]Error: No OpenAI API key provided.[/bold red]")
        console.print("Please provide an API key using --api-key, config file, or set the OPENAI_API_KEY environment variable.")
        sys.exit(1)

    if not assistant_id:
        console.print("[bold red]Error: No assistant ID provided.[/bold red]")
        console.print("Please provide an assistant ID using --assistant-id or in the config file.")
        sys.exit(1)

    # Check if at least one document source is provided
    if not documents and not document_path:
        console.print("[bold red]Error: No documents provided.[/bold red]")
        console.print("Please provide documents using --documents, --document, or in the config file.")
        sys.exit(1)

    # Check if document paths exist
    if documents:
        for doc in documents:
            if not os.path.exists(doc):
                console.print(f"[bold red]Warning: Document not found: {doc}[/bold red]")
    elif document_path and not os.path.exists(document_path):
        console.print(f"[bold red]Error: Document or directory not found: {document_path}[/bold red]")
        sys.exit(1)

    # Print startup banner
    console.print("\n[bold green]╔══════════════════════════════════════════════╗[/bold green]")
    console.print("[bold green]║ RAG Testing Application ║[/bold green]")
    console.print("[bold green]╚══════════════════════════════════════════════╝[/bold green]\n")

    if verbose:
        console.print("[bold yellow]Verbose mode enabled[/bold yellow]")

    # Create tester. prompt_type must be forwarded here as well; otherwise
    # --prompt-type is silently ignored whenever no config file is used.
    tester = RAGTester(
        api_key=api_key,
        assistant_id=assistant_id,
        document_paths=documents,
        document_path=None if documents else document_path,
        output_dir=output_dir,
        verbose=verbose,
        model=args.model,
        prompt_type=args.prompt_type
    )

    # Load or generate questions
    if questions_file:
        if not os.path.exists(questions_file):
            console.print(f"[bold red]Error: Questions file not found: {questions_file}[/bold red]")
            sys.exit(1)

        console.print(f"Loading questions from {questions_file}")
        tester.load_questions_from_file(questions_file)
    else:
        console.print(f"Generating {args.num_questions} test questions")
        tester.generate_test_questions(args.num_questions)

    # Exit if generate-only
    if args.generate_only:
        console.print("[green]Questions generated and saved. Exiting.[/green]")
        sys.exit(0)

    # Run tests with parallelization
    tester.run_tests(iterations=args.iterations, max_workers=args.parallel, batch_size=batch_size)

    # Evaluate results
    tester.evaluate_results()

    # Generate report
    tester.generate_report()

    console.print(f"[bold green]Testing complete! Results saved to {output_dir}[/bold green]")


def main():
    """
    Main entry point for the RAG Testing CLI.

    Modes, checked in this order:
      1. --create-config: write a config template and exit.
      2. --config-dir: run every JSON config in the directory and exit.
      3. --config: run a single config file and exit.
      4. Otherwise: run one session from the command-line arguments alone.
    """
    args = _build_arg_parser().parse_args()

    # Check if we need to create a config template
    if args.create_config:
        create_config_template(args.create_config)
        sys.exit(0)

    # Handle batch processing of multiple config files
    if args.config_dir:
        _run_config_dir(args)

    # Single config file mode
    if args.config:
        if not os.path.exists(args.config):
            console.print(f"[bold red]Error: Config file not found: {args.config}[/bold red]")
            sys.exit(1)

        success = process_single_config(args.config, args)
        sys.exit(0 if success else 1)

    # Original command-line mode (no config file)
    _run_from_cli_args(args)
def __init__(self,
             api_key: str,
             assistant_id: str,
             document_paths: Optional[List[str]] = None,
             document_path: Optional[str] = None,
             output_dir: str = "results",
             verbose: bool = False,
             model: str = "gpt-4o",
             prompt_type: str = "task-based"):
    """
    Initialize the RAG tester.

    Creates the output directory and loads the source documents into
    ``self.document_content``. Documents given as a list or found in a
    directory are concatenated, each preceded by a
    ``--- Document: <filename> ---`` separator; a single document is stored
    as-is. Files that cannot be read are reported and skipped.

    Args:
        api_key: OpenAI API key
        assistant_id: ID of the assistant to test
        document_paths: List of paths to documents to use for testing (preferred over document_path)
        document_path: Path to a single document or directory with documents to use for testing
        output_dir: Directory to save results
        verbose: Whether to print verbose output
        model: The OpenAI model to use for generating questions and evaluations
        prompt_type: Type of prompts to generate ("task-based", "content-based", "scenario-based")
    """
    self.console = Console()

    # docx2txt is optional; without it .docx files are skipped.
    try:
        import docx2txt  # noqa: F401 -- availability probe only
        self.docx2txt_available = True
    except ImportError:
        self.docx2txt_available = False
        self.console.print("[yellow]Warning: docx2txt not installed. Will not be able to read .docx files.[/yellow]")
        self.console.print("[yellow]Install with: pip install docx2txt[/yellow]")

    self.api_key = api_key
    self.assistant_id = assistant_id
    self.document_paths = document_paths or []
    self.document_path = document_path
    self.output_dir = output_dir
    self.verbose = verbose
    self.model = model
    self.prompt_type = prompt_type
    self.client = OpenAI(api_key=api_key)

    self.console.print("[bold blue]Initializing RAG Tester:[/bold blue]")
    self.console.print(f" [cyan]Assistant ID:[/cyan] {assistant_id}")
    if document_paths:
        self.console.print(f" [cyan]Documents:[/cyan] {len(document_paths)} files specified")
    elif document_path:
        self.console.print(f" [cyan]Document/Directory:[/cyan] {document_path}")
    self.console.print(f" [cyan]Output Directory:[/cyan] {output_dir}")
    self.console.print(f" [cyan]Model:[/cyan] {model}")

    os.makedirs(output_dir, exist_ok=True)
    self.console.print(f"[green]Created output directory: {output_dir}[/green]")

    unsupported_extensions = ('.pdf', '.xls', '.xlsx', '.ppt', '.pptx')

    def read_text(path: str) -> Tuple[Optional[str], str]:
        """Return (text, kind); kind is "docx", "text", "docx2txt-missing" or "unsupported" (text is None for the last two)."""
        extension = os.path.splitext(path)[1].lower()
        if extension == '.docx':
            if not self.docx2txt_available:
                return None, "docx2txt-missing"
            # Use docx2txt to extract text from .docx files
            import docx2txt
            return docx2txt.process(path), "docx"
        if extension in unsupported_extensions:
            return None, "unsupported"
        # Default text file reading; undecodable bytes are replaced rather than failing the load
        with open(path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read(), "text"

    # Character count of every document that was actually loaded
    loaded_sizes: List[int] = []

    def append_document(path: str, label: str) -> None:
        """Load one file and append it to document_content behind a separator; *label* names the file in error output."""
        try:
            filename = os.path.basename(path)
            content, kind = read_text(path)
            if kind == "docx2txt-missing":
                self.console.print(f" [yellow]Skipping {filename}: docx2txt not installed[/yellow]")
                return
            if kind == "unsupported":
                self.console.print(f" [yellow]Skipping unsupported file type: {filename}[/yellow]")
                return
            if kind == "docx":
                self.console.print(f" [green]Loaded DOCX: {filename}[/green]")

            # Add the content to our document collection
            self.document_content += f"\n\n--- Document: {filename} ---\n\n{content}"
            file_size = len(content)
            loaded_sizes.append(file_size)
            self.console.print(f" [green]Loaded: {filename} ({file_size} characters)[/green]")
        except Exception as e:
            # Best effort: one unreadable file must not prevent loading the others.
            self.console.print(f" [yellow]Could not load {label}: {str(e)}[/yellow]")

    # Load document content - can be from multiple sources
    self.document_content = ""

    try:
        # Case 1: List of document paths specified
        if document_paths:
            self.console.print("[cyan]Loading specified documents...[/cyan]")

            for doc_path in document_paths:
                if not os.path.exists(doc_path):
                    self.console.print(f" [yellow]Warning: Document not found: {doc_path}[/yellow]")
                    continue
                append_document(doc_path, doc_path)

        # Case 2: Directory specified
        elif document_path and os.path.isdir(document_path):
            self.console.print(f"[cyan]Loading documents from directory: {document_path}[/cyan]")

            # os.listdir() order is arbitrary; sort so the concatenated content (and the
            # prefix of it that is sent to the model) is the same on every run.
            for filename in sorted(os.listdir(document_path)):
                file_path = os.path.join(document_path, filename)

                # Skip directories
                if os.path.isdir(file_path):
                    continue
                append_document(file_path, filename)

        # Case 3: Single document specified (stored without a separator header)
        elif document_path:
            filename = os.path.basename(document_path)

            try:
                content, kind = read_text(document_path)
                if kind == "docx2txt-missing":
                    self.console.print(f"[yellow]Cannot load {filename}: docx2txt not installed[/yellow]")
                elif kind == "unsupported":
                    self.console.print(f"[yellow]Unsupported file type: {filename}[/yellow]")
                else:
                    if kind == "docx":
                        self.console.print(f"[green]Loaded DOCX document: {filename}[/green]")
                    self.document_content = content

                    doc_size = len(self.document_content)
                    doc_preview = self.document_content[:100] + "..." if doc_size > 100 else self.document_content
                    self.console.print(f"[green]Loaded document ({doc_size} characters)[/green]")
                    # Only a document that was really read counts as loaded.
                    loaded_sizes.append(doc_size)
                    if self.verbose:
                        self.console.print(f"[dim]Document preview: {doc_preview}[/dim]")
            except Exception as e:
                self.console.print(f"[bold red]Error loading document: {str(e)}[/bold red]")
                self.document_content = ""
        else:
            self.console.print("[bold red]No documents specified![/bold red]")

        # Report on loaded documents
        file_count = len(loaded_sizes)
        total_size = sum(loaded_sizes)
        if file_count > 0:
            self.console.print(f"[green]Successfully loaded {file_count} document(s) (total {total_size} characters)[/green]")

        # Check if we have any content
        if not self.document_content:
            self.console.print("[bold red]Warning: No content loaded from documents[/bold red]")

    except Exception as e:
        self.console.print(f"[bold red]Error loading document(s): {str(e)}[/bold red]")
        raise

    self.questions = []
    self.results = []
def log(self, message, level="info"):
    """
    Print *message* on the console, styled according to *level*.

    "debug" messages are shown only in verbose mode; every other known level
    ("info", "warning", "error", "success") is always shown. A level that is
    not one of these prints nothing.
    """
    # Debug output is the only kind that verbose mode gates.
    if level == "debug" and not self.verbose:
        return

    # "info" passes the message through untouched (no markup wrapper).
    if level == "info":
        self.console.print(message)
        return

    styled_levels = (
        ("debug", "[dim]", "[/dim]"),
        ("warning", "[yellow]", "[/yellow]"),
        ("error", "[bold red]", "[/bold red]"),
        ("success", "[green]", "[/green]"),
    )
    for name, opening, closing in styled_levels:
        if level == name:
            self.console.print(f"{opening}{message}{closing}")
            return
def generate_test_questions(self, num_questions: int = 20) -> List[str]:
    """
    Generate test questions from the document.

    Builds a prompt from the template selected by ``self.prompt_type``
    (an unknown type uses the "task-based" template), asks ``self.model`` for a
    JSON object with a ``questions`` array, stores the result in
    ``self.questions`` and writes it to ``<output_dir>/test_questions.json``.

    If the first request or its parsing fails for any reason, a second
    request is sent to a fallback model without ``response_format`` and the
    answer is parsed leniently (fenced JSON, first ``{...}`` span, list items,
    and finally lines containing a question mark).

    Args:
        num_questions: Number of questions to generate

    Returns:
        List of generated questions

    Raises:
        ValueError: If the fallback response cannot be parsed and contains no
            line with a question mark.
    """
    self.console.print("[bold blue]Generating test questions from document...[/bold blue]")
    self.log(f"Requesting {num_questions} questions using model: {self.model}", "info")
    self.log(f"Using prompt type: {self.prompt_type}", "info")

    # Define different prompt templates based on prompt_type.
    # "task-based" and "scenario-based" embed only the first 3000 characters of the
    # document; "content-based" embeds the whole document.
    prompt_templates = {
        "task-based": {
            "system": "You are a helpful assistant that generates realistic user task requests that someone would ask a digital banner creation assistant.",
            "user": f"""Generate {num_questions} diverse realistic user requests that someone would ask when using a digital banner creation assistant.

The requests should sound like natural user tasks, such as:
- "Create a banner for our new credit card offer"
- "Write copy for a savings account promotion"
- "Generate headlines for our mobile banking app"
- "Design text for a balance transfer campaign"

Important:
- Make them sound like REAL user requests, not questions about the documents
- Vary the products: credit cards, loans, savings, banking services, financial tools
- Include different banner types: promotional, informational, awareness campaigns
- Keep them concise and action-oriented
- Some should mention specific requirements like target audience or compliance needs

Context from documents to inform realistic requests:
{self.document_content[:3000]}

Return the requests as a JSON array of strings named 'questions'.
Format: {{"questions": ["request 1", "request 2", ...]}}"""
        },
        "content-based": {
            "system": "You are a helpful assistant that generates diverse test questions from a document.",
            "user": f"""Generate {num_questions} diverse questions based on the following document.
The questions should test different aspects and levels of understanding.
Return the questions as a JSON array of strings named 'questions'.

{self.document_content}"""
        },
        "scenario-based": {
            "system": "You are a helpful assistant that generates realistic business scenario requests for a digital banner creation assistant.",
            "user": f"""Generate {num_questions} diverse realistic business scenarios that combine a specific banner creation task with business context.

The scenarios should sound like real business requests, such as:
- "We're launching a new credit card for students. Create banner copy that's compliant with FCA Consumer Duty guidelines"
- "Our vulnerable customer initiative needs promotional materials. Write banner text that's clear and accessible"
- "Create an internal banner for our mobile banking upgrade, targeting existing customers"
- "We have a new savings product for first-time buyers. Generate compliant promotional copy"

Important:
- Make them sound like REAL business scenarios with context
- Include specific target audiences (students, vulnerable customers, first-time buyers, etc.)
- Mention compliance or regulatory considerations when relevant
- Vary the products and campaign types
- Include both external and internal communications
- Keep them realistic but concise

Context from documents to inform realistic scenarios:
{self.document_content[:3000]}

Return the scenarios as a JSON array of strings named 'questions'.
Format: {{"questions": ["scenario 1", "scenario 2", ...]}}"""
        }
    }

    # Get the appropriate prompt template
    prompt_template = prompt_templates.get(self.prompt_type, prompt_templates["task-based"])

    try:
        # First try with response_format (newer models support this)
        self.log("Attempting to generate questions with JSON response format", "debug")

        response = self.client.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": prompt_template["system"]},
                {"role": "user", "content": prompt_template["user"]}
            ],
            response_format={"type": "json_object"}
        )

        self.log("Successfully received response with JSON format", "debug")
        if self.verbose:
            self.log(f"Raw response: {response.choices[0].message.content}", "debug")

        questions_json = json.loads(response.choices[0].message.content)
        self.questions = questions_json.get("questions", [])

        self.log(f"Extracted {len(self.questions)} questions from JSON response", "success")

    except Exception as e:
        # Fallback method without response_format.
        # NOTE: any exception from the block above lands here (request errors as well
        # as JSON/shape errors), not only an unsupported response_format.
        self.log(f"JSON response format failed: {str(e)}", "warning")
        self.log("Trying fallback method without response_format", "info")

        fallback_model = "gpt-3.5-turbo" if self.model != "gpt-3.5-turbo" else "gpt-4"
        self.log(f"Using fallback model: {fallback_model}", "debug")

        # Add JSON format instruction to the user prompt for fallback
        # (only the "content-based" template lacks a "Format:" line)
        fallback_user_prompt = prompt_template["user"]
        if "Format:" not in fallback_user_prompt:
            fallback_user_prompt += "\n\nReturn ONLY a JSON object with a 'questions' key containing an array of strings. Format: {'questions': ['item 1', 'item 2', ...]}"

        response = self.client.chat.completions.create(
            model=fallback_model,
            messages=[
                {"role": "system", "content": prompt_template["system"]},
                {"role": "user", "content": fallback_user_prompt}
            ]
        )

        self.log("Received fallback response, attempting to parse", "debug")
        if self.verbose:
            self.log(f"Raw fallback response: {response.choices[0].message.content}", "debug")

        # Try to parse the JSON from the response
        try:
            content = response.choices[0].message.content
            self.log("Looking for JSON in response", "debug")

            # Extract JSON if it's wrapped in code blocks or other text
            import re
            json_match = re.search(r'```json\s*(.*?)\s*```', content, re.DOTALL)
            if json_match:
                self.log("Found JSON in code block", "debug")
                content = json_match.group(1)
            else:
                # Try to find anything that looks like JSON
                # (greedy: spans from the first "{" to the last "}")
                json_match = re.search(r'\{.*\}', content, re.DOTALL)
                if json_match:
                    self.log("Found JSON object in text", "debug")
                    content = json_match.group(0)

            self.log("Attempting to parse JSON content", "debug")
            questions_json = json.loads(content)
            self.questions = questions_json.get("questions", [])

            if self.questions:
                self.log(f"Successfully extracted {len(self.questions)} questions from JSON", "success")
            else:
                self.log("No questions found in JSON, trying to parse from text", "warning")

            # If we couldn't find questions in JSON format, try to parse them from the text
            if not self.questions:
                # Look for numbered or bulleted list items
                self.log("Looking for numbered or bulleted lists", "debug")
                questions = re.findall(r'(?:^|\n)(?:\d+\.\s*|\*\s*|-\s*)(.+?)(?=(?:\n\d+\.|\n\*|\n-|\n\n|$))', content)
                if questions:
                    self.log(f"Found {len(questions)} questions in list format", "success")
                    self.questions = [q.strip() for q in questions]
        except Exception as json_error:
            self.log(f"Error parsing questions: {str(json_error)}", "error")
            if self.verbose:
                self.log(f"Content that failed to parse: {content}", "debug")

            # Last resort: try to extract questions line by line
            # (only lines containing "?" qualify; capped at num_questions)
            self.log("Attempting last resort method: extract lines with question marks", "warning")
            lines = response.choices[0].message.content.split('\n')
            potential_questions = [line for line in lines if '?' in line]
            if potential_questions:
                self.log(f"Found {len(potential_questions)} lines with question marks", "success")
                self.questions = potential_questions[:num_questions]
            else:
                self.log("Could not extract any questions, giving up", "error")
                raise ValueError("Could not generate or parse questions from the model's response")

    # Print the questions for verification
    if self.questions:
        self.log("Generated questions:", "info")
        for i, q in enumerate(self.questions[:5]):  # Show first 5 questions
            self.console.print(f" [cyan]{i+1}.[/cyan] {q}")
        if len(self.questions) > 5:
            self.console.print(f" ... and {len(self.questions) - 5} more questions")
    else:
        self.log("No questions were generated!", "error")

    # Save questions to file (written even when the list is empty)
    with open(f"{self.output_dir}/test_questions.json", "w") as f:
        json.dump({"questions": self.questions}, f, indent=2)

    self.console.print(f"[green]Generated {len(self.questions)} test questions[/green]")
    return self.questions
def load_questions_from_file(self, file_path: str) -> List[str]:
    """
    Load questions from a JSON file into ``self.questions``.

    Two layouts are accepted: an object with a ``"questions"`` array (the
    format written by ``generate_test_questions``) or a bare JSON array of
    questions. A missing or null ``"questions"`` entry yields an empty list.

    Args:
        file_path: Path to the JSON file.

    Returns:
        The loaded list of questions.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON or its top-level value is
            neither an object nor an array.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    if isinstance(data, list):
        questions = data
    elif isinstance(data, dict):
        # "or []" also covers an explicit null value
        questions = data.get("questions") or []
    else:
        raise ValueError(
            f"Questions file must contain a JSON object or array, got {type(data).__name__}"
        )

    self.questions = questions
    return self.questions
def _run_single_test(self, question_data: Tuple[int, str, int], timeout: float = 600.0) -> Dict[str, Any]:
    """
    Run a single test for a question.

    Creates a new thread, posts the question, starts a run of the assistant
    and polls once per second until the run stops progressing or *timeout*
    seconds have passed. Exceptions are never propagated; they are reported
    in the returned dictionary with ``status == "error"``.

    Args:
        question_data: Tuple containing (question_index, question_text, iteration)
        timeout: Maximum number of seconds to poll the run before giving up
            with ``status == "timeout"``.

    Returns:
        Dictionary with test results: question_id, question, iteration,
        response, response_time, thread_id, run_id, timestamp, status.
    """
    i, question, iteration = question_data

    # Each call uses its own client instance (calls run on worker threads)
    client = OpenAI(api_key=self.api_key)

    # Statuses after which polling cannot make progress. "incomplete" is final, and
    # "requires_action" waits for tool outputs this tester never submits; without them
    # in the set the loop below would spin needlessly or forever.
    stop_statuses = ("completed", "failed", "cancelled", "expired", "incomplete", "requires_action")

    start_time = time.time()
    thread_id = ""
    run_id = ""

    try:
        # Create a thread and run it
        thread = client.beta.threads.create()
        thread_id = thread.id

        # Add a message to the thread
        client.beta.threads.messages.create(
            thread_id=thread_id,
            role="user",
            content=question
        )

        # Run the assistant
        run = client.beta.threads.runs.create(
            thread_id=thread_id,
            assistant_id=self.assistant_id
        )
        run_id = run.id

        # Wait for the run to complete, bounded by the timeout
        status = "queued"
        deadline = time.monotonic() + timeout

        while status not in stop_statuses:
            if time.monotonic() >= deadline:
                status = "timeout"
                break
            time.sleep(1)
            run = client.beta.threads.runs.retrieve(
                thread_id=thread_id,
                run_id=run_id
            )
            status = run.status

        # Get the response
        messages = client.beta.threads.messages.list(
            thread_id=thread_id
        )

        # Get the assistant's response: the first assistant message in the listing.
        # Only text parts are used, so image or empty content does not raise.
        response = None
        for msg in messages.data:
            if msg.role == "assistant":
                text_parts = []
                for part in msg.content:
                    text = getattr(part, "text", None)
                    if text is not None:
                        text_parts.append(text.value)
                if text_parts:
                    response = "\n".join(text_parts)
                break

        response_time = time.time() - start_time

        # Store results
        result = {
            "question_id": i,
            "question": question,
            "iteration": iteration,
            "response": response,
            "response_time": response_time,
            "thread_id": thread_id,
            "run_id": run_id,
            "timestamp": time.time(),
            "status": status
        }

    except Exception as e:
        response_time = time.time() - start_time

        # thread_id / run_id keep whatever was obtained before the failure ("" otherwise)
        result = {
            "question_id": i,
            "question": question,
            "iteration": iteration,
            "response": f"ERROR: {str(e)}",
            "response_time": response_time,
            "thread_id": thread_id,
            "run_id": run_id,
            "timestamp": time.time(),
            "status": "error"
        }

    return result
+ + Args: + iterations: Number of times to test each question + max_workers: Maximum number of parallel threads (default=5, adjust based on your rate limits) + batch_size: Number of questions to process in a batch (defaults to max_workers if None) + Higher values increase throughput at the cost of more memory usage + + Returns: + List of test results + """ + # If batch_size is not specified, use max_workers as default + if batch_size is None: + batch_size = max_workers + if not self.questions: + self.console.print("[bold red]No questions available. Generate or load questions first.[/bold red]") + return [] + + self.results = [] + total_tests = len(self.questions) * iterations + + self.console.print(f"[bold blue]Running {total_tests} tests ({iterations} iterations for {len(self.questions)} questions) with parallelization...[/bold blue]") + self.log(f"Using assistant ID: {self.assistant_id}", "info") + self.log(f"Running with {max_workers} parallel workers", "info") + + # Prepare all question-iteration combinations + test_items = [] + for i, question in enumerate(self.questions): + for iteration in range(iterations): + test_items.append((i, question, iteration)) + + # Setup progress bar + with Progress() as progress: + task = progress.add_task("[cyan]Running tests...", total=total_tests) + + # Process test items in batches for better throughput and memory management + remaining_items = test_items + + while remaining_items: + # Get the next batch of items + current_batch = remaining_items[:batch_size] + remaining_items = remaining_items[batch_size:] + + # Run the current batch in parallel + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # Submit batch tasks + future_to_test = { + executor.submit(self._run_single_test, item): item for item in current_batch + } + + # Process results as they complete + for future in concurrent.futures.as_completed(future_to_test): + test_item = future_to_test[future] + i, question, iteration = 
def evaluate_results(self) -> Dict[str, Any]:
    """
    Score the collected responses per question with an LLM judge and aggregate.

    For every question, all recorded responses are sent to ``self.model`` with a
    request for a JSON verdict. If that request or its parsing fails for any
    reason, a second request is sent to a fallback model without
    ``response_format`` and the JSON is extracted from free text. If the
    fallback reply cannot be parsed either, default scores of 5 are used.
    Individual scores that are missing or not numeric are also replaced by 5 so
    that averaging cannot fail.

    The result is written to ``<output_dir>/evaluation.json``.

    Returns:
        Dictionary with evaluation metrics, or an empty dict when there are no
        results to evaluate.

    Raises:
        Whatever the fallback API request raises; only its parsing is guarded.
    """
    import ast
    import math
    import re

    if not self.results:
        self.console.print("[bold red]No results available. Run tests first.[/bold red]")
        return {}

    self.console.print("[bold blue]Evaluating test results...[/bold blue]")

    score_keys = ("quality_score", "consistency_score", "accuracy_score", "completeness_score")
    default_score = 5

    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that evaluates the quality and consistency of responses."
    }
    rubric = (
        "\n\nEvaluate these responses according to these metrics:\n\n"
        "1. Quality (1-10): Overall quality of responses, including clarity, coherence, professional tone, "
        "lack of hallucinations, and grammatical correctness.\n\n"
        "2. Consistency (1-10): How consistent responses are across multiple iterations, including similarity "
        "of answers, consistency of core facts, level of detail, and lack of contradictions.\n\n"
        "3. Accuracy (1-10): Factual accuracy compared to source documents, including correct representation of "
        "information, quotes, numbers, facts, and proper interpretation of context.\n\n"
        "4. Completeness (1-10): How thoroughly responses answer the question, including addressing all aspects "
        "of the question, providing important context, sufficient detail, and lack of significant omissions.\n\n"
    )
    field_list = (
        "- quality_score: 1-10 rating of overall response quality\n"
        "- consistency_score: 1-10 rating of consistency between responses\n"
        "- accuracy_score: 1-10 rating of factual accuracy\n"
        "- completeness_score: 1-10 rating of how completely the responses answer the question\n"
        "- explanation: Brief explanation of scores"
    )
    format_hint = (
        "Format: {'quality_score': X, 'consistency_score': Y, 'accuracy_score': Z, "
        "'completeness_score': W, 'explanation': 'text'}"
    )

    def build_messages(question, responses, strict):
        """Build the chat messages; ``strict`` selects the fallback wording."""
        numbered = "\n".join(f"Response {n + 1}: {text}" for n, text in enumerate(responses))
        prompt = f"Question: {question}\n\nResponses:\n" + numbered + rubric
        if strict:
            prompt += "Return ONLY a JSON object with these fields:\n" + field_list + "\n\n" + format_hint
        else:
            prompt += "Return a JSON object with these fields:\n" + field_list
        return [system_message, {"role": "user", "content": prompt}]

    def parse_free_text(content):
        """Pull a JSON object out of a free-text reply and decode it."""
        fenced = re.search(r'```(?:json)?\s*(.*?)\s*```', content, re.DOTALL)
        if fenced:
            self.log("Found JSON in code block", "debug")
            candidate = fenced.group(1)
        else:
            braced = re.search(r'\{.*\}', content, re.DOTALL)
            if braced:
                self.log("Found JSON object in text", "debug")
                candidate = braced.group(0)
            else:
                candidate = content

        self.log("Attempting to parse JSON evaluation", "debug")
        try:
            return json.loads(candidate)
        except ValueError:
            # The fallback prompt shows a single-quoted dict, which models tend
            # to mirror; literal_eval reads that form without executing code.
            return ast.literal_eval(candidate)

    def normalise_score(value):
        """Return ``value`` as a finite number, or the default score if it is not one."""
        if isinstance(value, bool):
            return default_score
        if isinstance(value, (int, float)):
            number = value
        else:
            try:
                number = float(str(value).strip())
            except (TypeError, ValueError):
                return default_score
            if math.isfinite(number) and number.is_integer():
                number = int(number)
        return number if math.isfinite(number) else default_score

    def default_evaluation():
        """Neutral verdict used when the judge's reply cannot be interpreted."""
        verdict = {key: default_score for key in score_keys}
        verdict["explanation"] = "Default scores used due to parsing error"
        return verdict

    # Group results by question
    results_by_question = {}
    for result in self.results:
        results_by_question.setdefault(result["question_id"], []).append(result)

    evaluation = {
        "total_questions": len(results_by_question),
        "total_tests": len(self.results),
        "avg_response_time": float(np.mean([r["response_time"] for r in self.results])),
        "question_metrics": []
    }

    for q_id, q_results in results_by_question.items():
        responses = [r["response"] for r in q_results]
        question = q_results[0]["question"]
        response_times = [r["response_time"] for r in q_results]

        self.log(f"Evaluating responses for question: '{question[:50]}...'", "info")
        self.log(f"Using model {self.model} for evaluation", "debug")

        try:
            # First try with response_format (newer models support this)
            self.log("Attempting evaluation with JSON response format", "debug")
            eval_response = self.client.chat.completions.create(
                model=self.model,
                messages=build_messages(question, responses, strict=False),
                response_format={"type": "json_object"}
            )

            self.log("Successfully received JSON format evaluation", "debug")
            raw_reply = eval_response.choices[0].message.content
            if self.verbose:
                self.log(f"Raw evaluation response: {raw_reply}", "debug")

            evaluation_data = json.loads(raw_reply)
            if not isinstance(evaluation_data, dict):
                raise ValueError("evaluation payload is not a JSON object")
            self.log(f"Evaluation scores: Quality={evaluation_data.get('quality_score')}, " +
                     f"Consistency={evaluation_data.get('consistency_score')}, " +
                     f"Accuracy={evaluation_data.get('accuracy_score')}, " +
                     f"Completeness={evaluation_data.get('completeness_score')}", "success")

        except Exception as e:
            # Any failure of the first attempt (API or parsing) routes to the fallback
            self.log(f"JSON response format failed: {str(e)}", "warning")
            self.log("Using fallback method for evaluation", "info")

            fallback_model = "gpt-3.5-turbo" if self.model != "gpt-3.5-turbo" else "gpt-4"
            self.log(f"Using fallback model: {fallback_model}", "debug")

            eval_response = self.client.chat.completions.create(
                model=fallback_model,
                messages=build_messages(question, responses, strict=True)
            )

            self.log("Received fallback evaluation, attempting to parse", "debug")
            # Bound before the try so the error handler below can always log it
            content = None
            try:
                content = eval_response.choices[0].message.content
                if self.verbose:
                    self.log(f"Raw fallback evaluation: {content}", "debug")

                self.log("Looking for JSON in evaluation response", "debug")
                evaluation_data = parse_free_text(content)
                if not isinstance(evaluation_data, dict):
                    raise ValueError("evaluation payload is not a JSON object")
                self.log(f"Successfully parsed evaluation data", "success")

                if self.verbose:
                    self.log(f"Parsed evaluation data: {evaluation_data}", "debug")

            except Exception as json_error:
                self.log(f"Error parsing evaluation: {str(json_error)}", "error")
                if self.verbose:
                    self.log(f"Content that failed to parse: {content}", "debug")

                self.log("Using default evaluation scores due to parsing error", "warning")
                evaluation_data = default_evaluation()

        q_metrics = {
            "question_id": q_id,
            "question": question,
            "avg_response_time": float(np.mean(response_times)),
            "std_response_time": float(np.std(response_times)),
        }
        for key in score_keys:
            # Guard the aggregation below against null / textual scores
            q_metrics[key] = normalise_score(evaluation_data.get(key))
        q_metrics["explanation"] = evaluation_data.get("explanation") or ""

        evaluation["question_metrics"].append(q_metrics)

    # Overall averages, one per score
    for key in score_keys:
        evaluation[f"avg_{key}"] = float(np.mean([q[key] for q in evaluation["question_metrics"]]))

    with open(f"{self.output_dir}/evaluation.json", "w", encoding="utf-8") as f:
        json.dump(evaluation, f, indent=2)

    self.console.print("[green]Evaluation complete[/green]")
    return evaluation
self._generate_excel_report() + + self.console.print(f"[green]Report generated in {self.output_dir}/report.html[/green]") + self.console.print(f"[green]Excel export saved to {self.output_dir}/report.xlsx[/green]") + self.console.print(f"[green]CSV export saved to {self.output_dir}/report.csv[/green]") + + def _generate_excel_report(self) -> None: + """Export evaluation data to Excel (.xlsx) and CSV for client reporting.""" + summary_rows = [ + {"Metric": "Total Questions", "Value": self.evaluation["total_questions"]}, + {"Metric": "Total Tests", "Value": self.evaluation["total_tests"]}, + {"Metric": "Avg Response Time (s)", "Value": round(self.evaluation["avg_response_time"], 2)}, + {"Metric": "Avg Quality Score", "Value": round(self.evaluation["avg_quality_score"], 2)}, + {"Metric": "Avg Consistency Score", "Value": round(self.evaluation["avg_consistency_score"], 2)}, + {"Metric": "Avg Accuracy Score", "Value": round(self.evaluation["avg_accuracy_score"], 2)}, + {"Metric": "Avg Completeness Score", "Value": round(self.evaluation["avg_completeness_score"], 2)}, + ] + summary_df = pd.DataFrame(summary_rows) + + detail_rows = [] + for q in self.evaluation["question_metrics"]: + avg_score = ( + q["quality_score"] + q["consistency_score"] + + q["accuracy_score"] + q["completeness_score"] + ) / 4 + detail_rows.append({ + "Question #": q["question_id"] + 1, + "Question": q["question"], + "Quality": q["quality_score"], + "Consistency": q["consistency_score"], + "Accuracy": q["accuracy_score"], + "Completeness": q["completeness_score"], + "Average Score": round(avg_score, 2), + "Avg Response Time (s)": round(q["avg_response_time"], 2), + "Evaluation Notes": q["explanation"], + }) + detail_df = pd.DataFrame(detail_rows) + + # Excel: two sheets + excel_path = f"{self.output_dir}/report.xlsx" + with pd.ExcelWriter(excel_path, engine="openpyxl") as writer: + summary_df.to_excel(writer, sheet_name="Summary", index=False) + detail_df.to_excel(writer, sheet_name="Details", 
def _create_visualizations(self) -> None:
    """
    Render the four report charts as PNG files in ``self.output_dir``.

    Reads ``self.evaluation`` (per-question metrics and overall averages) and
    writes ``scores_by_question.png``, ``response_times.png``,
    ``score_distribution.png`` and ``radar_chart.png``. Every figure created
    here is closed before returning, including when plotting fails, so
    repeated runs in one process do not accumulate open figures.

    Note: this sets the global ``plt.rcParams`` font family and size.
    """
    # Set Montserrat as the default font for all plots
    plt.rcParams['font.family'] = 'Montserrat'
    plt.rcParams['font.size'] = 12

    # Use a professional color palette
    colors = ['#3498db', '#2ecc71', '#e74c3c', '#f39c12']

    # Prepare data
    metrics = self.evaluation["question_metrics"]
    question_ids = [q["question_id"] for q in metrics]
    quality_scores = [q["quality_score"] for q in metrics]
    consistency_scores = [q["consistency_score"] for q in metrics]
    accuracy_scores = [q["accuracy_score"] for q in metrics]
    completeness_scores = [q["completeness_score"] for q in metrics]
    response_times = [q["avg_response_time"] for q in metrics]

    # Figures opened by this call; all are closed in the finally clause below
    figures = []

    try:
        # Create score comparison by question
        figures.append(plt.figure(figsize=(14, 9), facecolor='white'))
        bar_width = 0.2
        x = np.arange(len(question_ids))

        plt.bar(x - 1.5*bar_width, quality_scores, bar_width, label='Quality', color=colors[0], alpha=0.8)
        plt.bar(x - 0.5*bar_width, consistency_scores, bar_width, label='Consistency', color=colors[1], alpha=0.8)
        plt.bar(x + 0.5*bar_width, accuracy_scores, bar_width, label='Accuracy', color=colors[2], alpha=0.8)
        plt.bar(x + 1.5*bar_width, completeness_scores, bar_width, label='Completeness', color=colors[3], alpha=0.8)

        plt.xlabel('Question ID', fontweight='bold')
        plt.ylabel('Score (1-10)', fontweight='bold')
        plt.title('Performance Scores by Question', fontsize=16, fontweight='bold', pad=20)
        plt.xticks(x, question_ids)
        plt.ylim(0, 10)
        plt.grid(axis='y', linestyle='--', alpha=0.3)
        plt.legend(frameon=True, framealpha=0.9, shadow=True)
        # Add background
        plt.gca().set_facecolor('#f8f9fa')
        plt.tight_layout()
        plt.savefig(f"{self.output_dir}/scores_by_question.png", dpi=300, bbox_inches='tight')

        # Create response time chart
        figures.append(plt.figure(figsize=(14, 7), facecolor='white'))
        plt.bar(question_ids, response_times, color='#2980b9', alpha=0.8)
        plt.xlabel('Question ID', fontweight='bold')
        plt.ylabel('Average Response Time (seconds)', fontweight='bold')
        plt.title('Response Time by Question', fontsize=16, fontweight='bold', pad=20)
        plt.grid(axis='y', linestyle='--', alpha=0.3)
        # Add average line
        avg_time = np.mean(response_times)
        plt.axhline(y=avg_time, color='#e74c3c', linestyle='--',
                    label=f'Average: {avg_time:.2f}s')
        plt.legend(frameon=True)
        # Add background
        plt.gca().set_facecolor('#f8f9fa')
        plt.tight_layout()
        plt.savefig(f"{self.output_dir}/response_times.png", dpi=300, bbox_inches='tight')

        # Create score distribution histogram
        figures.append(plt.figure(figsize=(12, 7), facecolor='white'))
        all_scores = quality_scores + consistency_scores + accuracy_scores + completeness_scores
        # Bin edges at -0.5, 0.5, ..., 9.5 so integer scores sit mid-bin
        bins = np.arange(0, 11, 1) - 0.5
        plt.hist(all_scores, bins=bins, alpha=0.8, color='#8e44ad',
                 rwidth=0.85, edgecolor='white')

        plt.xlabel('Score (1-10)', fontweight='bold')
        plt.ylabel('Frequency', fontweight='bold')
        plt.title('Distribution of All Scores', fontsize=16, fontweight='bold', pad=20)
        plt.xticks(range(11))
        plt.grid(axis='y', linestyle='--', alpha=0.3)

        # Add mean score line
        mean_score = np.mean(all_scores)
        plt.axvline(x=mean_score, color='#e74c3c', linestyle='--',
                    label=f'Mean Score: {mean_score:.2f}')
        plt.legend(frameon=True)

        # Add background
        plt.gca().set_facecolor('#f8f9fa')
        plt.tight_layout()
        plt.savefig(f"{self.output_dir}/score_distribution.png", dpi=300, bbox_inches='tight')

        # Create radar chart for average scores
        categories = ['Quality', 'Consistency', 'Accuracy', 'Completeness']
        values = [
            self.evaluation["avg_quality_score"],
            self.evaluation["avg_consistency_score"],
            self.evaluation["avg_accuracy_score"],
            self.evaluation["avg_completeness_score"]
        ]

        angles = np.linspace(0, 2*np.pi, len(categories), endpoint=False).tolist()
        # Repeat the first point so the polygon closes
        values = values + [values[0]]
        angles = angles + [angles[0]]
        categories = categories + [categories[0]]

        fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(polar=True), facecolor='white')
        figures.append(fig)
        ax.plot(angles, values, 'o-', linewidth=3, color='#3498db')
        ax.fill(angles, values, color='#3498db', alpha=0.25)
        ax.set_thetagrids(np.degrees(angles[:-1]), categories[:-1], fontweight='bold')
        ax.set_ylim(0, 10)

        # Add circular gridlines
        ax.set_rticks([2, 4, 6, 8, 10])
        ax.set_rlabel_position(0)
        ax.grid(True)
        ax.tick_params(colors='#333333')

        plt.title('Average Scores by Category', y=1.1, fontsize=16, fontweight='bold')
        plt.savefig(f"{self.output_dir}/radar_chart.png", dpi=300, bbox_inches='tight')

    finally:
        # pyplot keeps every figure alive until it is closed explicitly
        for figure in figures:
            plt.close(figure)

RAG Testing Report

+
+

For Assistant: {self.assistant_id}

+

Generated on: {time.strftime("%B %d, %Y at %H:%M:%S")}

+
+ +
+

Assistant Information

+

Assistant ID: {self.assistant_id}

+

Documents: {f"{len(self.document_paths)} files" if self.document_paths else os.path.basename(self.document_path) if self.document_path else "None"}

+

Test Configuration: {self.evaluation["total_questions"]} questions, {self.evaluation["total_tests"] // self.evaluation["total_questions"]} iterations per question

+
+ +
+

Performance Summary

+ +
+
+
Quality
+
{self.evaluation["avg_quality_score"]:.1f}
+
out of 10
+
+
+
Consistency
+
{self.evaluation["avg_consistency_score"]:.1f}
+
out of 10
+
+
+
Accuracy
+
{self.evaluation["avg_accuracy_score"]:.1f}
+
out of 10
+
+
+
Completeness
+
{self.evaluation["avg_completeness_score"]:.1f}
+
out of 10
+
+
+ + + + + + + + + + +
MetricValue
Total Questions{self.evaluation["total_questions"]}
Total Tests{self.evaluation["total_tests"]}
Avg Response Time{self.evaluation["avg_response_time"]:.2f} seconds
Avg Quality Score{self.evaluation["avg_quality_score"]:.2f}/10
Avg Consistency Score{self.evaluation["avg_consistency_score"]:.2f}/10
Avg Accuracy Score{self.evaluation["avg_accuracy_score"]:.2f}/10
Avg Completeness Score{self.evaluation["avg_completeness_score"]:.2f}/10
+
+ +
+
+

Scores by Question

+ Scores by Question +
+
+

Response Times

+ Response Times +
+
+

Score Distribution

+ Score Distribution +
+
+

Average Scores by Category

+ Average Scores by Category +
+
+ +

Detailed Question Analysis

+ """ + + # Add question-by-question analysis + for q_metric in self.evaluation["question_metrics"]: + # Create a color for the question card based on average score + avg_score = (q_metric["quality_score"] + q_metric["consistency_score"] + + q_metric["accuracy_score"] + q_metric["completeness_score"]) / 4 + + if avg_score >= 8: + card_color = "#e3f2fd" # Light blue for high scores + border_color = "#2196f3" + elif avg_score >= 6: + card_color = "#e8f5e9" # Light green for good scores + border_color = "#4caf50" + elif avg_score >= 4: + card_color = "#fff3e0" # Light orange for medium scores + border_color = "#ff9800" + else: + card_color = "#ffebee" # Light red for low scores + border_color = "#f44336" + + # Find all responses for this question + q_responses = [] + for result in self.results: + if result["question_id"] == q_metric["question_id"]: + q_responses.append(result) + + # Sort responses by iteration + q_responses.sort(key=lambda x: x["iteration"]) + + html_content += f""" +
+

Question {q_metric["question_id"] + 1}

+

Question: {q_metric["question"]}

+ +
+
+
Quality
+
{q_metric["quality_score"]}/10
+
+
+
Consistency
+
{q_metric["consistency_score"]}/10
+
+
+
Accuracy
+
{q_metric["accuracy_score"]}/10
+
+
+
Completeness
+
{q_metric["completeness_score"]}/10
+
+
+
Response Time
+
{q_metric["avg_response_time"]:.2f}s
+
+
+ +
+

Evaluation Notes

+

{q_metric["explanation"]}

+
+ +
+
+ + + + + View All Responses ({len(q_responses)} iterations) + +
+
+ """ + + # Add each response + for i, response in enumerate(q_responses): + response_text = response["response"] or "No response received" + response_time = response["response_time"] + + html_content += f""" +
+
+ Response {i+1} (Response time: {response_time:.2f}s) +
+
+ {response_text} +
+
+ """ + + html_content += """ +
+
+
+
+
+ """ + + # Add footer + html_content += f""" +
+

Generated by RAG Testing App on {time.strftime("%B %d, %Y at %H:%M:%S")}

+

Assistant ID: {self.assistant_id}

+
+ + + """ + + with open(f"{self.output_dir}/report.html", "w") as f: + f.write(html_content) \ No newline at end of file diff --git a/rag_test_app/requirements.txt b/rag_test_app/requirements.txt new file mode 100644 index 0000000..61dde84 --- /dev/null +++ b/rag_test_app/requirements.txt @@ -0,0 +1,12 @@ +openai>=1.12.0 +pandas>=2.0.0 +numpy>=1.24.0 +langchain>=0.1.0 +scikit-learn>=1.2.0 +tiktoken>=0.5.0 +sentence-transformers>=2.2.0 +pytest>=7.0.0 +matplotlib>=3.7.0 +rich>=13.0.0 +docx2txt>=0.8 +openpyxl>=3.1.0 \ No newline at end of file diff --git a/rag_test_app/results/Archive.zip b/rag_test_app/results/Archive.zip new file mode 100644 index 0000000..c74f2a9 Binary files /dev/null and b/rag_test_app/results/Archive.zip differ diff --git a/rag_test_app/setup.py b/rag_test_app/setup.py new file mode 100644 index 0000000..b6b22a3 --- /dev/null +++ b/rag_test_app/setup.py @@ -0,0 +1,31 @@ +from setuptools import setup, find_packages + +with open("README.md", "r", encoding="utf-8") as fh: + long_description = fh.read() + +with open("requirements.txt", "r", encoding="utf-8") as fh: + requirements = fh.read().splitlines() + +setup( + name="rag-test-app", + version="0.1.0", + author="Your Name", + author_email="your.email@example.com", + description="A tool for testing OpenAI assistants with RAG capabilities", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/yourusername/rag-test-app", + packages=find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires=">=3.8", + install_requires=requirements, + entry_points={ + "console_scripts": [ + "rag-test=cli:main", + ], + }, +) \ No newline at end of file diff --git a/rag_test_app/tests/test_main.py b/rag_test_app/tests/test_main.py new file mode 100644 index 0000000..48be259 --- /dev/null +++ b/rag_test_app/tests/test_main.py @@ -0,0 +1,124 @@ 
+import os +import sys +import json +import pytest +from unittest.mock import patch, MagicMock + +# Add parent directory to path +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from main import RAGTester + +# Sample test data +SAMPLE_DOCUMENT = "This is a test document about machine learning." +SAMPLE_QUESTIONS = ["What is machine learning?", "Explain supervised learning."] +SAMPLE_RESPONSES = ["Machine learning is...", "Supervised learning involves..."] + +class TestRAGTester: + + @pytest.fixture + def mock_openai(self): + with patch('main.OpenAI') as mock_openai: + # Mock client + mock_client = MagicMock() + mock_openai.return_value = mock_client + + # Mock chat.completions.create + mock_completion = MagicMock() + mock_choice = MagicMock() + mock_message = MagicMock() + mock_message.content = json.dumps({"questions": SAMPLE_QUESTIONS}) + mock_choice.message = mock_message + mock_completion.choices = [mock_choice] + mock_client.chat.completions.create.return_value = mock_completion + + # Mock threads + mock_thread = MagicMock() + mock_thread.id = "thread_123" + mock_client.beta.threads.create.return_value = mock_thread + + # Mock messages + mock_message = MagicMock() + mock_message.role = "assistant" + mock_content = MagicMock() + mock_content.text = MagicMock() + mock_content.text.value = SAMPLE_RESPONSES[0] + mock_message.content = [mock_content] + mock_messages = MagicMock() + mock_messages.data = [mock_message] + mock_client.beta.threads.messages.list.return_value = mock_messages + + # Mock runs + mock_run = MagicMock() + mock_run.id = "run_123" + mock_run.status = "completed" + mock_client.beta.threads.runs.create.return_value = mock_run + mock_client.beta.threads.runs.retrieve.return_value = mock_run + + yield mock_client + + @pytest.fixture + def tester(self, tmp_path, mock_openai): + # Create a temporary document + doc_path = tmp_path / "test_doc.txt" + doc_path.write_text(SAMPLE_DOCUMENT) + + # Create output directory + 
output_dir = tmp_path / "results" + output_dir.mkdir() + + # Create tester + tester = RAGTester( + api_key="sk-test", + assistant_id="asst_123", + document_path=str(doc_path), + output_dir=str(output_dir) + ) + + return tester + + def test_initialization(self, tester): + """Test that the tester initializes correctly""" + assert tester.api_key == "sk-test" + assert tester.assistant_id == "asst_123" + assert tester.document_content == SAMPLE_DOCUMENT + assert len(tester.questions) == 0 + assert len(tester.results) == 0 + + def test_generate_questions(self, tester, mock_openai): + """Test question generation""" + questions = tester.generate_test_questions(num_questions=2) + + # Check that questions were generated and saved + assert len(questions) == 2 + assert questions == SAMPLE_QUESTIONS + assert mock_openai.chat.completions.create.called + + # Check that questions were saved to file + questions_file = os.path.join(tester.output_dir, "test_questions.json") + assert os.path.exists(questions_file) + + with open(questions_file, 'r') as f: + saved_questions = json.load(f) + assert saved_questions["questions"] == SAMPLE_QUESTIONS + + def test_run_tests(self, tester, mock_openai): + """Test running tests""" + # First generate questions + tester.questions = SAMPLE_QUESTIONS + + # Run tests + results = tester.run_tests(iterations=1) + + # Check results + assert len(results) == 2 # 2 questions x 1 iteration + assert results[0]["question"] == SAMPLE_QUESTIONS[0] + assert results[0]["response"] == SAMPLE_RESPONSES[0] + assert "response_time" in results[0] + + # Check that API calls were made + assert mock_openai.beta.threads.create.called + assert mock_openai.beta.threads.messages.create.called + assert mock_openai.beta.threads.runs.create.called + assert mock_openai.beta.threads.runs.retrieve.called + assert mock_openai.beta.threads.messages.list.called \ No newline at end of file