video-query/extract_user_logs_robust.sh
2025-09-18 14:25:24 -05:00

175 lines
No EOL
5.5 KiB
Bash
Executable file

#!/bin/bash
# Enhanced script to extract user emails and prompts from veo-video-generator systemd logs
# Usage: ./extract_user_logs_robust.sh [output_file.csv] [service_name] [date_range]
# Examples:
# ./extract_user_logs_robust.sh usage_report.csv
# ./extract_user_logs_robust.sh usage_report.csv veo-video-generator "--since=2024-06-01"
# Positional arguments; each falls back to its default when omitted or empty:
#   $1 -> OUTPUT_FILE   CSV file to write
#   $2 -> SERVICE_NAME  systemd unit whose journal is read
#   $3 -> DATE_RANGE    extra text appended to the journalctl command line
DATE_RANGE=${3:-}
SERVICE_NAME=${2:-veo-video-generator}
OUTPUT_FILE=${1:-video_generation_usage.csv}
# ANSI colour codes used by the print_* helpers. They are stored as literal
# backslash sequences (single-quoted), so they only turn into real escape
# characters when the helper that prints them expands backslash escapes.
NC='\033[0m' # reset / no colour
RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m'
# Print an informational message on stdout, prefixed with a green "[INFO]" tag.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message text, printed verbatim
print_status() {
  # %b expands the backslash sequences stored in the colour variables; the
  # message goes through %s so backslashes inside it (Windows-style paths,
  # JSON fragments, ...) are shown as-is instead of being turned into
  # newlines or tabs.
  printf '%b[INFO]%b %s\n' "${GREEN}" "${NC}" "$1"
}
# Print a warning on stderr, prefixed with a yellow "[WARN]" tag.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message text, printed verbatim
print_warning() {
  # Warnings must not go to stdout: callers redirect stdout into the CSV
  # output file, and a warning written there would become a bogus record.
  # %b expands only the colour codes; the message itself is printed through
  # %s so backslashes in it (e.g. in a quoted JSON payload) are left alone.
  printf '%b[WARN]%b %s\n' "${YELLOW}" "${NC}" "$1" >&2
}
# Print an error on stderr, prefixed with a red "[ERROR]" tag.
# Globals:   RED, NC (read)
# Arguments: $1 - message text, printed verbatim
print_error() {
  # Errors are diagnostics, so they go to stderr and never mix with data
  # written on stdout. %b expands only the colour codes; the message is
  # printed through %s so backslashes in it are not interpreted.
  printf '%b[ERROR]%b %s\n' "${RED}" "${NC}" "$1" >&2
}
# Verify that the external tools this script relies on can be found on PATH.
# Reports every missing tool through print_error and terminates the script
# with status 1; returns 0 when nothing is missing.
check_dependencies() {
  local -a absent=()
  local tool

  for tool in journalctl jq; do
    if command -v "$tool" >/dev/null 2>&1; then
      continue
    fi
    # journalctl is reported under the name of the package family it ships with.
    case "$tool" in
      journalctl) absent+=("systemd (journalctl)") ;;
      *) absent+=("$tool") ;;
    esac
  done

  if [ "${#absent[@]}" -eq 0 ]; then
    return 0
  fi

  print_error "Missing required dependencies: ${absent[*]}"
  print_error "Please install missing dependencies and try again"
  exit 1
}
# Validate one JSON payload and emit it as a single CSV row.
#
# Arguments:
#   $1 - JSON text taken from a log line
#   $2 - timestamp to place in the first column
# Outputs:
#   stdout - exactly one row when the payload is usable:
#            "timestamp","user_email","prompt","video_length_sec",
#            "aspect_ratio","person_generation"
#   stderr - a warning when the payload is not valid JSON
# Returns:
#   0 when a row was written; 1 when the JSON is invalid or carries no
#   usable user_email (missing, null, false, empty or the string "null").
extract_json_fields() {
  local json_string="$1"
  local timestamp="$2"
  local row

  # A single jq pass both validates the payload and builds the row, instead
  # of spawning one jq process per field. Inside the jq program:
  #   * swap  - plain string replacement via split/join (no regex needed)
  #   * cell  - turns a value into one CSV cell body: null/false become "",
  #             non-strings are stringified, CR is dropped and LF becomes a
  #             space so a record never spans several lines, and embedded
  #             double quotes are doubled ("" is the CSV escape, not \").
  #   * commas in the prompt are still replaced by ";" so that consumers
  #     which split rows on "," keep working.
  #   * valid JSON that is not an object produces no row (and no warning).
  if ! row=$(printf '%s' "$json_string" | jq -r --arg ts "$timestamp" '
      def swap(a; b): split(a) | join(b);
      def cell: (. // "") | tostring
        | swap("\r"; "") | swap("\n"; " ") | swap("\""; "\"\"");
      if type != "object" then empty
      else
        (.user_email | cell) as $email
        | if ($email == "" or $email == "null") then empty
          else
            [ ($ts | cell),
              $email,
              (.prompt | cell | swap(","; ";")),
              (.video_length_sec | cell),
              (.aspect_ratio | cell),
              (.person_generation | cell) ]
            | map("\"" + . + "\"") | join(",")
          end
      end
    ' 2>/dev/null); then
    # The warning is forced to stderr: this function's stdout is the CSV
    # data stream and must contain nothing but rows.
    print_warning "Invalid JSON found at $timestamp: $json_string" >&2
    return 1
  fi

  # Valid JSON without the essential field: nothing to record.
  if [ -z "$row" ]; then
    return 1
  fi

  printf '%s\n' "$row"
  return 0
}
# --- Startup: announce settings, verify tooling, prepare the output file ---
print_status "Starting log extraction..."
print_status "Service: $SERVICE_NAME"
print_status "Output file: $OUTPUT_FILE"
if [ -n "$DATE_RANGE" ]; then
  print_status "Date range: $DATE_RANGE"
fi

# Check dependencies (terminates the script if journalctl or jq is missing)
check_dependencies

# Best-effort check that systemd knows the unit; this only warns, it never
# aborts. The name is accepted with or without a trailing ".service" and is
# matched as a fixed string: as a regex, the "." would match any character
# and other metacharacters in the name would be interpreted.
unit_name="${SERVICE_NAME%.service}.service"
if ! systemctl list-units --full -a | grep -qF -- "$unit_name"; then
  print_warning "Service '$SERVICE_NAME' not found in systemctl list-units"
  print_warning "This might be normal if the service is not currently loaded"
fi

# Create CSV header (this truncates any existing file). If the file cannot
# be written there is no point in reading the journal, so stop right away.
if ! echo "timestamp,user_email,prompt,video_length_sec,aspect_ratio,person_generation" > "$OUTPUT_FILE"; then
  print_error "Cannot write to output file: $OUTPUT_FILE"
  exit 1
fi
# Build the journalctl invocation as an argument array instead of a string
# passed to eval: the service name and date range come from the command line,
# and eval would execute any shell syntax embedded in them.
journal_args=(-u "$SERVICE_NAME" --no-pager --output=short-iso)
if [ -n "$DATE_RANGE" ]; then
  # The date range may hold several options and quoted values containing
  # spaces (e.g. --since "2024-06-01 10:00"). xargs splits the text into
  # words honouring quotes and backslashes, but performs no expansion or
  # command substitution, so nothing in it is executed.
  if ! date_tokens=$(printf '%s\n' "$DATE_RANGE" | xargs printf '%s\n'); then
    print_error "Could not parse date range: $DATE_RANGE"
    exit 1
  fi
  while IFS= read -r date_arg; do
    if [ -n "$date_arg" ]; then
      journal_args+=("$date_arg")
    fi
  done <<< "$date_tokens"
fi

print_status "Extracting logs... (this may take a while for large log files)"

# Counters for processing
processed_lines=0
valid_records=0

# Process logs. The loop reads from a process substitution rather than sitting
# at the end of a pipeline, so it runs in the current shell and the counters
# are still set after the loop finishes.
while IFS= read -r line; do
  processed_lines=$((processed_lines + 1))

  # Show progress every 100 lines
  if [ $((processed_lines % 100)) -eq 0 ]; then
    print_status "Processed $processed_lines log lines..."
  fi

  # Timestamp is the first whitespace-separated field of the log line.
  read -r timestamp _ <<< "$line"

  # Everything after the marker is the JSON payload; lines where nothing
  # follows the marker are skipped.
  if [[ "$line" =~ Raw\ JSON\ data\ received:\ (.+)$ ]]; then
    json_part="${BASH_REMATCH[1]}"
  else
    continue
  fi

  if extract_json_fields "$json_part" "$timestamp" >> "$OUTPUT_FILE"; then
    valid_records=$((valid_records + 1))
  fi
done < <(journalctl "${journal_args[@]}" | grep -F -- "Raw JSON data received:")

print_status "Matched $processed_lines log lines; $valid_records produced a CSV record"
# Derive the record total from the output file itself: count the lines that
# follow the header row.
record_count=$(tail -n +2 "$OUTPUT_FILE" | wc -l)

print_status "Processing complete!"
print_status "Total valid records extracted: $record_count"
print_status "Output saved to: $OUTPUT_FILE"

# An empty result is treated as a failure; list the likely causes and stop.
if (( record_count == 0 )); then
  for hint in \
    "No records found. This could mean:" \
    " - No logs exist for the specified service/date range" \
    " - The log format has changed" \
    " - The service name is incorrect"; do
    print_warning "$hint"
  done
  exit 1
fi
# Print one column of the CSV body (header row skipped) with every double
# quote removed. Rows are split on each comma, quoted or not.
#   $1 - 1-based column number
csv_column() {
  tail -n +2 "$OUTPUT_FILE" | cut -d',' -f"$1" | sed 's/"//g'
}

# --- Summary statistics ---
echo ""
print_status "=== SUMMARY REPORT ==="

# Request count per user, busiest first
echo "Unique users found:"
csv_column 2 | sort | uniq -c | sort -nr

# Earliest and latest timestamp (plain text sort of the first column)
echo ""
echo "Date range of requests:"
sorted_timestamps=$(csv_column 1 | sort)
head -1 <<< "$sorted_timestamps" | xargs -I {} echo "First: {}"
tail -1 <<< "$sorted_timestamps" | xargs -I {} echo "Last: {}"

# Same per-user ranking, limited to the first five entries
echo ""
echo "Top 5 most active users:"
csv_column 2 | sort | uniq -c | sort -nr | head -5

print_status "Report generation complete!"