semblance/backend/app/services/bulk_persona_export_service.py
2025-12-19 19:26:16 +00:00

751 lines
No EOL
34 KiB
Python
Executable file

"""
Bulk Persona Export Service
Handles bulk export of persona profiles to various formats (markdown, JSON, CSV)
with real-time progress tracking via WebSocket events.
"""
import os
import json
import logging
import zipfile
import tempfile
import uuid
import asyncio
from typing import Dict, List, Any, Optional, Tuple
from datetime import datetime
# Removed PersonaExportService dependency - using direct conversion
from app.models.persona import Persona
from app.websocket_manager_async import get_async_websocket_manager
from app.services.task_manager import CancellableTask
# Configure logging
# NOTE(review): logging.basicConfig() runs at import time and sets up the root
# logger (when it has no handlers yet) for the whole process; confirm this is
# intended here rather than in the application entry point.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
class BulkPersonaExportService:
"""Service for bulk exporting persona profiles with progress tracking."""
def __init__(self):
    """Create the service and attach the async WebSocket manager.

    The manager returned by ``get_async_websocket_manager()`` is stored on
    ``self.websocket_manager`` and used for all progress/status events.
    """
    manager = get_async_websocket_manager()
    self.websocket_manager = manager
def _create_temp_directory(self) -> str:
    """Create and return a fresh, uniquely named directory for one export.

    The directory is named ``export_<uuid4>`` and lives inside a ``temp``
    folder two levels above this module's directory. Both the ``temp``
    folder and the export directory are created if they do not exist.

    Returns:
        Path of the newly created export directory.
    """
    module_dir = os.path.dirname(__file__)
    temp_root = os.path.join(module_dir, "..", "..", "temp")
    os.makedirs(temp_root, exist_ok=True)

    # A random UUID in the name gives every export its own subdirectory.
    export_dir = os.path.join(temp_root, f"export_{uuid.uuid4()}")
    os.makedirs(export_dir, exist_ok=True)
    return export_dir
def _sanitize_filename(self, filename: str) -> str:
"""Sanitize filename for safe file system use."""
# Remove or replace invalid characters
invalid_chars = '<>:"/\\|?*'
for char in invalid_chars:
filename = filename.replace(char, '_')
# Limit length and ensure it's not empty
filename = filename[:200].strip()
if not filename:
filename = "persona"
return filename
async def _emit_progress(self, user_id: str, task_id: str, progress: int,
                         current_item: str, completed_count: int, total_count: int,
                         current_persona_name: Optional[str] = None):
    """Send a ``bulk_export_progress`` event to ``user_id`` over WebSocket.

    This is best-effort: if ``self.websocket_manager`` is falsy a warning is
    logged and nothing is sent, and any ``Exception`` raised while emitting
    is logged and swallowed so a progress failure does not abort the export.

    Args:
        user_id: Recipient of the event.
        task_id: Identifier included in the payload as ``task_id``.
        progress: Progress value placed in the payload (logged as a percent).
        current_item: Human-readable description of the current step.
        completed_count: Number of items finished so far.
        total_count: Total number of items in the export.
        current_persona_name: Optional name of the persona being processed.
    """
    try:
        manager = self.websocket_manager
        if not manager:
            logger.warning("WebSocket manager not available for progress updates")
            return

        payload = {
            'task_id': task_id,
            'task_type': 'bulk_persona_export',
            'progress': progress,
            'current_item': current_item,
            'completed_count': completed_count,
            'total_count': total_count,
            'current_persona_name': current_persona_name
        }
        await manager.emit_to_user(user_id, 'bulk_export_progress', payload)
        logger.debug(f"Emitted progress: {progress}% - {current_item}")
    except Exception as e:
        logger.error(f"Failed to emit progress update: {e}")
def _create_markdown_table(self, data: List[tuple], headers: List[str] = None) -> List[str]:
"""Create a markdown table from data tuples."""
if not data:
return []
# Use default headers if none provided
if not headers:
headers = ["Field", "Value"]
# Create table
table_lines = []
# Headers
header_line = "| " + " | ".join(headers) + " |"
separator_line = "|" + "|".join(["-" * (len(h) + 2) for h in headers]) + "|"
table_lines.append(header_line)
table_lines.append(separator_line)
# Data rows
for row in data:
# Ensure all values are strings and escape pipe characters
escaped_row = []
for value in row:
str_value = str(value) if value is not None else ""
# Escape pipe characters in cell content
str_value = str_value.replace("|", "\\|")
# Replace newlines with spaces for table formatting
str_value = str_value.replace("\n", " ").replace("\r", " ")
escaped_row.append(str_value)
row_line = "| " + " | ".join(escaped_row) + " |"
table_lines.append(row_line)
return table_lines
def _create_comprehensive_markdown(self, persona_data: Dict[str, Any]) -> str:
"""Create comprehensive markdown from persona data with all fields included."""
try:
name = persona_data.get('name', 'Unknown Persona')
# Build comprehensive markdown from all available data
markdown_parts = [f"# {name} - Complete Persona Profile\n"]
# AI-Generated Summary (if available) - put at top as overview
if 'aiSynthesizedBio' in persona_data and persona_data['aiSynthesizedBio']:
markdown_parts.append("## Overview")
markdown_parts.append(persona_data['aiSynthesizedBio'])
markdown_parts.append("")
# Core Demographics Section - Table Format
demo_fields = [
('age', 'Age'), ('gender', 'Gender'), ('occupation', 'Occupation'),
('education', 'Education'), ('location', 'Location'), ('ethnicity', 'Ethnicity'),
('householdIncome', 'Household Income'), ('householdComposition', 'Household Composition'),
('socialGrade', 'Social Grade')
]
demographics_data = []
for field, label in demo_fields:
if field in persona_data and persona_data[field]:
demographics_data.append((label, persona_data[field]))
if demographics_data:
markdown_parts.append("## Demographics")
table_lines = self._create_markdown_table(demographics_data, ["Attribute", "Value"])
markdown_parts.extend(table_lines)
markdown_parts.append("")
# Behavioral Profile & Preferences Section - Table Format
behavioral_fields = [
('techSavviness', 'Tech Savviness'), ('personality', 'Personality'),
('brandLoyalty', 'Brand Loyalty'), ('priceConsciousness', 'Price Consciousness'),
('environmentalConcern', 'Environmental Concern'), ('interests', 'Interests'),
('shoppingHabits', 'Shopping Habits'), ('mediaConsumption', 'Media Consumption'),
('deviceUsage', 'Device Usage'), ('brandPreferences', 'Brand Preferences'),
('hasPurchasingPower', 'Has Purchasing Power'), ('hasChildren', 'Has Children')
]
behavioral_data = []
for field, label in behavioral_fields:
if field in persona_data and persona_data[field] is not None:
value = persona_data[field]
if isinstance(value, bool):
value = "Yes" if value else "No"
behavioral_data.append((label, value))
if behavioral_data:
markdown_parts.append("## Behavioral Profile & Preferences")
table_lines = self._create_markdown_table(behavioral_data, ["Attribute", "Value"])
markdown_parts.extend(table_lines)
markdown_parts.append("")
# Goals, Motivations & Aspirations
goal_sections = [
("Goals", "goals"), ("Motivations", "motivations"),
("Frustrations", "frustrations"), ("Fears", "fears"),
("Scenarios", "scenarios")
]
for section_name, field in goal_sections:
if field in persona_data and persona_data[field]:
items = persona_data[field]
if isinstance(items, list) and items:
markdown_parts.append(f"## {section_name}")
for item in items:
if item and item.strip():
markdown_parts.append(f"- {item}")
markdown_parts.append("")
# Think, Feel, Do Psychology Framework - 3-Column Table
if 'thinkFeelDo' in persona_data and persona_data['thinkFeelDo']:
tfd = persona_data['thinkFeelDo']
markdown_parts.append("## Psychological Profile - Think, Feel, Do")
# Get the lists for each category
thinks = tfd.get('thinks', []) if isinstance(tfd.get('thinks'), list) else []
feels = tfd.get('feels', []) if isinstance(tfd.get('feels'), list) else []
does = tfd.get('does', []) if isinstance(tfd.get('does'), list) else []
# Create table data by combining the three lists
max_items = max(len(thinks), len(feels), len(does))
if max_items > 0:
tfd_data = []
for i in range(max_items):
think_item = thinks[i] if i < len(thinks) else ""
feel_item = feels[i] if i < len(feels) else ""
do_item = does[i] if i < len(does) else ""
# Only add row if at least one cell has content
if think_item or feel_item or do_item:
tfd_data.append((think_item, feel_item, do_item))
if tfd_data:
table_lines = self._create_markdown_table(tfd_data, ["Thinks", "Feels", "Does"])
markdown_parts.extend(table_lines)
markdown_parts.append("")
# OCEAN Personality Traits (Big Five) - Enhanced Table Format
if 'oceanTraits' in persona_data and persona_data['oceanTraits']:
ocean = persona_data['oceanTraits']
markdown_parts.append("## OCEAN Personality Traits (Big Five)")
trait_descriptions = {
'openness': 'Openness to Experience',
'conscientiousness': 'Conscientiousness',
'extraversion': 'Extraversion',
'agreeableness': 'Agreeableness',
'neuroticism': 'Neuroticism'
}
def get_level_description(score):
"""Get descriptive level for OCEAN trait score."""
if score < 0.3:
return "Low"
elif score < 0.7:
return "Moderate"
else:
return "High"
ocean_data = []
for trait, score in ocean.items():
if score is not None:
trait_name = trait_descriptions.get(trait, trait.title())
# Handle different score formats - scores are already decimal (0.0-1.0)
if isinstance(score, (int, float)):
# If score is already 0-1 range, multiply by 100; if it's 0-100 range, use as is
if score <= 1.0:
percentage = f"{round(float(score) * 100)}%"
else:
percentage = f"{round(float(score))}%"
level = get_level_description(float(score) if score <= 1.0 else float(score) / 100)
else:
percentage = f"{score}%"
level = "N/A"
ocean_data.append((trait_name, percentage, level))
if ocean_data:
table_lines = self._create_markdown_table(ocean_data, ["Trait", "Score", "Level"])
markdown_parts.extend(table_lines)
markdown_parts.append("")
# Top Personality Traits
if 'topPersonalityTraits' in persona_data and persona_data['topPersonalityTraits']:
traits = persona_data['topPersonalityTraits']
if isinstance(traits, list) and traits:
markdown_parts.append("## Top Personality Traits")
markdown_parts.append(", ".join([trait for trait in traits if trait]))
markdown_parts.append("")
# Qualitative Attributes
if 'qualitativeAttributes' in persona_data and persona_data['qualitativeAttributes']:
attrs = persona_data['qualitativeAttributes']
if isinstance(attrs, list) and attrs:
markdown_parts.append("## Key Qualitative Attributes")
markdown_parts.append(", ".join([attr for attr in attrs if attr]))
markdown_parts.append("")
# Lifestyle & Consumer Behavior - Table Format
lifestyle_fields = [
('coreValues', 'Core Values'), ('lifestyleChoices', 'Lifestyle Choices'),
('socialActivities', 'Social Activities'), ('categoryKnowledge', 'Category Knowledge'),
('paymentMethods', 'Payment Methods'), ('purchaseBehaviour', 'Purchase Behavior'),
('decisionInfluences', 'Decision Influences'), ('painPoints', 'Pain Points'),
('journeyContext', 'Journey Context')
]
lifestyle_data = []
for field, label in lifestyle_fields:
if field in persona_data and persona_data[field]:
lifestyle_data.append((label, persona_data[field]))
if lifestyle_data:
markdown_parts.append("## Lifestyle & Consumer Behavior")
table_lines = self._create_markdown_table(lifestyle_data, ["Attribute", "Value"])
markdown_parts.extend(table_lines)
markdown_parts.append("")
# Generation Context & Research - Table Format
context_data = []
if 'audience_brief' in persona_data and persona_data['audience_brief']:
context_data.append(("Audience Brief", persona_data['audience_brief']))
if 'research_objective' in persona_data and persona_data['research_objective']:
context_data.append(("Research Objective", persona_data['research_objective']))
if context_data:
markdown_parts.append("## Generation Context")
table_lines = self._create_markdown_table(context_data, ["Type", "Description"])
markdown_parts.extend(table_lines)
markdown_parts.append("")
# Additional Data Fields (catch any remaining fields) - Table Format
processed_fields = {
'name', 'aiSynthesizedBio', 'age', 'gender', 'occupation', 'education', 'location',
'ethnicity', 'householdIncome', 'householdComposition', 'socialGrade', 'techSavviness',
'personality', 'brandLoyalty', 'priceConsciousness', 'environmentalConcern', 'interests',
'shoppingHabits', 'mediaConsumption', 'deviceUsage', 'brandPreferences', 'hasPurchasingPower',
'hasChildren', 'goals', 'motivations', 'frustrations', 'fears', 'scenarios', 'thinkFeelDo',
'oceanTraits', 'topPersonalityTraits', 'qualitativeAttributes', 'coreValues', 'lifestyleChoices',
'socialActivities', 'categoryKnowledge', 'paymentMethods', 'purchaseBehaviour', 'decisionInfluences',
'painPoints', 'journeyContext', 'audience_brief', 'research_objective', '_id', 'created_at',
'created_by', 'updated_at', 'folder_ids'
}
additional_data = []
for key, value in persona_data.items():
if key not in processed_fields and value is not None:
if isinstance(value, list):
if value: # Non-empty list
formatted_value = ", ".join([str(v) for v in value if v])
if formatted_value:
additional_data.append((key.replace('_', ' ').title(), formatted_value))
elif isinstance(value, dict):
if value: # Non-empty dict
formatted_dict = ", ".join([f"{k}: {v}" for k, v in value.items() if v is not None])
if formatted_dict:
additional_data.append((key.replace('_', ' ').title(), formatted_dict))
else:
if str(value).strip(): # Non-empty value
additional_data.append((key.replace('_', ' ').title(), str(value)))
if additional_data:
markdown_parts.append("## Additional Data")
table_lines = self._create_markdown_table(additional_data, ["Attribute", "Value"])
markdown_parts.extend(table_lines)
markdown_parts.append("")
# Metadata - Table Format
metadata_data = []
if 'created_at' in persona_data and persona_data['created_at']:
metadata_data.append(("Created", persona_data['created_at']))
if 'updated_at' in persona_data and persona_data['updated_at']:
metadata_data.append(("Last Updated", persona_data['updated_at']))
if 'created_by' in persona_data and persona_data['created_by']:
metadata_data.append(("Created By", persona_data['created_by']))
if 'folder_ids' in persona_data and persona_data['folder_ids']:
folder_count = len(persona_data['folder_ids'])
metadata_data.append(("Folder Assignments", f"{folder_count} folder(s)"))
if metadata_data:
markdown_parts.append("## Metadata")
table_lines = self._create_markdown_table(metadata_data, ["Field", "Value"])
markdown_parts.extend(table_lines)
return "\n".join(markdown_parts)
except Exception as e:
logger.error(f"Failed to create comprehensive markdown: {e}")
return f"# {persona_data.get('name', 'Unknown Persona')}\n\nError generating profile.\n\n```json\n{json.dumps(persona_data, indent=2, default=str)}\n```"
async def export_personas_bulk(
    self,
    persona_ids: List[str],
    user_id: str,
    export_format: str = 'markdown'
) -> Tuple[bool, str, Optional[str]]:
    """
    Export multiple personas to specified format with progress tracking.

    The export format is validated before any work is done. The export then
    runs inside a ``CancellableTask``: a temporary export directory is
    created, each persona is loaded with ``Persona.find_by_id`` (missing ones
    are logged and skipped), and the format-specific helper builds the ZIP.
    The temporary directory is removed on every failure path (no personas
    found, helper reported failure, cancellation, unexpected error); on
    success it is left in place because it contains the returned ZIP.

    Args:
        persona_ids: List of persona IDs to export
        user_id: ID of user requesting export
        export_format: Format for export ('markdown', 'json', 'csv')

    Returns:
        Tuple of (success, file_path_or_error_message, task_id)
    """
    import shutil  # local import: only needed for temp-directory cleanup

    task_id = str(uuid.uuid4())
    export_dir = None

    # Map each supported format to the name of the method that produces it.
    exporter_names = {
        'markdown': '_export_as_markdown_zip',
        'json': '_export_as_json_zip',
        'csv': '_export_as_csv_zip',
    }
    exporter_name = exporter_names.get(export_format) if isinstance(export_format, str) else None
    if exporter_name is None:
        # Reject early: nothing has been created or fetched yet.
        return False, f"Unsupported export format: {export_format}", task_id

    def discard_export_dir():
        """Remove the temp export directory if it was created."""
        if export_dir and os.path.exists(export_dir):
            shutil.rmtree(export_dir, ignore_errors=True)

    try:
        async with CancellableTask("bulk_persona_export", user_id, {"export_format": export_format}) as registered_task_id:
            task_id = registered_task_id or task_id
            logger.info(f"Starting bulk export for {len(persona_ids)} personas (user: {user_id}, format: {export_format})")

            # Create temp directory
            export_dir = self._create_temp_directory()

            # Emit initial progress
            await self._emit_progress(
                user_id, task_id, 0,
                "Initializing export...", 0, len(persona_ids)
            )

            # Fetch all personas
            await self._emit_progress(
                user_id, task_id, 5,
                "Fetching persona data...", 0, len(persona_ids)
            )
            # NOTE(review): personas are loaded by ID only; no check that they
            # belong to user_id is visible here - confirm callers enforce access.
            personas = []
            for persona_id in persona_ids:
                persona = await Persona.find_by_id(persona_id)
                if persona:
                    personas.append(persona)
                else:
                    logger.warning(f"Persona not found: {persona_id}")

            if not personas:
                await self._emit_progress(
                    user_id, task_id, 100,
                    "No valid personas found", 0, len(persona_ids)
                )
                discard_export_dir()
                return False, "No valid personas found for export", task_id

            # Process personas based on format
            exporter = getattr(self, exporter_name)
            result = await exporter(personas, user_id, task_id, export_dir)
            if not result[0]:
                # The helper reported failure; do not leave partial files behind.
                discard_export_dir()
            return result
    except asyncio.CancelledError:
        logger.info(f"Bulk export cancelled by user: {user_id}")
        discard_export_dir()
        if self.websocket_manager:
            await self.websocket_manager.emit_to_user(
                user_id,
                'task_cancelled',
                {
                    'task_id': task_id,
                    'message': 'Export cancelled successfully'
                }
            )
        return False, "Export cancelled by user", task_id
    except Exception as e:
        logger.error(f"Bulk export error: {e}")
        discard_export_dir()
        if self.websocket_manager:
            await self.websocket_manager.emit_to_user(
                user_id,
                'task_failed',
                {
                    'task_id': task_id,
                    'message': f'Export failed: {str(e)}'
                }
            )
        return False, f"Export failed: {str(e)}", task_id
async def _export_as_markdown_zip(
    self,
    personas: List[Dict[str, Any]],
    user_id: str,
    task_id: str,
    export_dir: str
) -> Tuple[bool, str, str]:
    """Export personas as markdown files in a ZIP archive.

    Each persona becomes one ``<name>.md`` member. Member names are made
    unique (``name``, ``name_2``, ...) so personas that share a name do not
    overwrite each other when the archive is extracted. Progress is reported
    through ``_emit_progress`` and a ``task_completed`` event is emitted on
    success.

    Args:
        personas: Persona records; each must support ``.get('name')``.
        user_id: Recipient of progress/completion events.
        task_id: Task identifier included in every event.
        export_dir: Existing directory in which the ZIP is created.

    Returns:
        ``(True, zip_path, task_id)`` on success, otherwise
        ``(False, error_message, task_id)``. Cancellation is not caught here
        and propagates to the caller.
    """
    try:
        # Imported locally as in the original code (presumably to avoid a
        # circular import - TODO confirm), but only once instead of per persona.
        from app.routes.personas import make_serializable

        zip_path = os.path.join(export_dir, f"persona_profiles_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.zip")
        total_personas = len(personas)
        used_names = set()  # lower-cased member names already written

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for i, persona in enumerate(personas):
                # Cancellation checkpoint: Task.cancelled() is never True for
                # the task that is currently running, so yield to the event
                # loop instead and let a pending CancelledError be raised here.
                await asyncio.sleep(0)

                # A present-but-null name must fall back too, not only a missing key.
                persona_name = str(persona.get('name') or f'Persona_{i+1}')

                # Update progress
                progress = int(10 + (i / total_personas) * 80)  # 10-90%
                await self._emit_progress(
                    user_id, task_id, progress,
                    f"Processing persona {i+1} of {total_personas}",
                    i, total_personas, persona_name
                )

                # Make persona data serializable and convert directly to markdown
                serializable_persona = make_serializable(persona)
                markdown_content = self._create_comprehensive_markdown(serializable_persona)

                # Create a safe filename that is unique within this archive
                # (compared case-insensitively for case-insensitive file systems).
                base_name = self._sanitize_filename(persona_name)
                unique_name = base_name
                suffix = 1
                while unique_name.lower() in used_names:
                    suffix += 1
                    unique_name = f"{base_name}_{suffix}"
                used_names.add(unique_name.lower())
                filename = f"{unique_name}.md"

                # Add to ZIP
                zipf.writestr(filename, markdown_content.encode('utf-8'))
                logger.info(f"Added {filename} to ZIP ({len(markdown_content)} chars)")

            # Final progress update
            await self._emit_progress(
                user_id, task_id, 95,
                "Finalizing ZIP file...", total_personas, total_personas
            )

        # Verify ZIP was created
        if not os.path.exists(zip_path):
            return False, "Failed to create ZIP file", task_id
        file_size = os.path.getsize(zip_path)
        logger.info(f"Created ZIP file: {zip_path} ({file_size} bytes)")

        # Success notification
        await self._emit_progress(
            user_id, task_id, 100,
            f"Export completed! {total_personas} personas exported.",
            total_personas, total_personas
        )
        if self.websocket_manager:
            await self.websocket_manager.emit_to_user(
                user_id,
                'task_completed',
                {
                    'task_id': task_id,
                    'message': f'Successfully exported {total_personas} persona profiles',
                    'file_path': zip_path,
                    'file_size': file_size
                }
            )
        return True, zip_path, task_id
    except Exception as e:
        logger.error(f"Markdown ZIP export error: {e}")
        return False, f"Markdown export failed: {str(e)}", task_id
async def _export_as_json_zip(
    self,
    personas: List[Dict[str, Any]],
    user_id: str,
    task_id: str,
    export_dir: str
) -> Tuple[bool, str, str]:
    """Export personas as JSON files in a ZIP archive.

    Each persona becomes one ``<name>.json`` member containing its
    serializable data (UTF-8, indented, non-ASCII kept as-is). Member names
    are made unique (``name``, ``name_2``, ...) so personas that share a name
    do not overwrite each other when the archive is extracted.

    Args:
        personas: Persona records; each must support ``.get('name')``.
        user_id: Recipient of progress/completion events.
        task_id: Task identifier included in every event.
        export_dir: Existing directory in which the ZIP is created.

    Returns:
        ``(True, zip_path, task_id)`` on success, otherwise
        ``(False, error_message, task_id)``. Cancellation is not caught here
        and propagates to the caller.
    """
    try:
        # Imported locally as in the original code (presumably to avoid a
        # circular import - TODO confirm), but only once instead of per persona.
        from app.routes.personas import make_serializable

        zip_path = os.path.join(export_dir, f"persona_data_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.zip")
        total_personas = len(personas)
        used_names = set()  # lower-cased member names already written

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for i, persona in enumerate(personas):
                # Cancellation checkpoint: Task.cancelled() is never True for
                # the task that is currently running, so yield to the event
                # loop instead and let a pending CancelledError be raised here.
                await asyncio.sleep(0)

                # A present-but-null name must fall back too, not only a missing key.
                persona_name = str(persona.get('name') or f'Persona_{i+1}')

                # Update progress
                progress = int(10 + (i / total_personas) * 80)  # 10-90%
                await self._emit_progress(
                    user_id, task_id, progress,
                    f"Processing persona {i+1} of {total_personas}",
                    i, total_personas, persona_name
                )

                # Make persona data serializable and convert to JSON
                serializable_persona = make_serializable(persona)
                json_content = json.dumps(serializable_persona, indent=2, ensure_ascii=False, default=str)

                # Create a safe filename that is unique within this archive
                # (compared case-insensitively for case-insensitive file systems).
                base_name = self._sanitize_filename(persona_name)
                unique_name = base_name
                suffix = 1
                while unique_name.lower() in used_names:
                    suffix += 1
                    unique_name = f"{base_name}_{suffix}"
                used_names.add(unique_name.lower())
                filename = f"{unique_name}.json"

                # Add to ZIP
                zipf.writestr(filename, json_content.encode('utf-8'))

        # The archive is closed at this point, so its size on disk is final.
        await self._emit_progress(
            user_id, task_id, 100,
            f"Export completed! {total_personas} personas exported.",
            total_personas, total_personas
        )
        if self.websocket_manager:
            await self.websocket_manager.emit_to_user(
                user_id,
                'task_completed',
                {
                    'task_id': task_id,
                    'message': f'Successfully exported {total_personas} persona JSON files',
                    'file_path': zip_path,
                    'file_size': os.path.getsize(zip_path)
                }
            )
        return True, zip_path, task_id
    except Exception as e:
        logger.error(f"JSON ZIP export error: {e}")
        return False, f"JSON export failed: {str(e)}", task_id
async def _export_as_csv_zip(
    self,
    personas: List[Dict[str, Any]],
    user_id: str,
    task_id: str,
    export_dir: str
) -> Tuple[bool, str, str]:
    """Export personas as individual CSV files in a ZIP archive.

    Each persona becomes one ``<name>.csv`` member with a header row and a
    single data row. Nested dicts are flattened one level into
    ``<key>_<subkey>`` columns, lists of strings are joined with ``"; "``,
    other non-empty lists are JSON-encoded, and ``None`` becomes an empty
    cell. Member names are made unique (``name``, ``name_2``, ...) so
    personas that share a name do not overwrite each other on extraction.

    Args:
        personas: Persona records; each must support ``.get('name')``.
        user_id: Recipient of progress/completion events.
        task_id: Task identifier included in every event.
        export_dir: Existing directory in which the ZIP is created.

    Returns:
        ``(True, zip_path, task_id)`` on success, otherwise
        ``(False, error_message, task_id)``. Cancellation is not caught here
        and propagates to the caller.
    """
    try:
        import csv
        import io
        # Imported locally as in the original code (presumably to avoid a
        # circular import - TODO confirm), but only once instead of per persona.
        from app.routes.personas import make_serializable

        zip_path = os.path.join(export_dir, f"persona_csvs_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}.zip")
        total_personas = len(personas)
        used_names = set()  # lower-cased member names already written

        with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
            for i, persona in enumerate(personas):
                # Cancellation checkpoint: Task.cancelled() is never True for
                # the task that is currently running, so yield to the event
                # loop instead and let a pending CancelledError be raised here.
                await asyncio.sleep(0)

                # A present-but-null name must fall back too, not only a missing key.
                persona_name = str(persona.get('name') or f'Persona_{i+1}')

                # Update progress
                progress = int(10 + (i / total_personas) * 80)  # 10-90%
                await self._emit_progress(
                    user_id, task_id, progress,
                    f"Processing persona {i+1} of {total_personas}",
                    i, total_personas, persona_name
                )

                # Make persona data serializable and flatten for CSV
                serializable_persona = make_serializable(persona)
                flat_persona = {}
                for key, value in serializable_persona.items():
                    if isinstance(value, dict):
                        # Flatten nested objects: {"oceanTraits": {"openness": 0.7}} -> {"oceanTraits_openness": 0.7}
                        for subkey, subvalue in value.items():
                            flat_persona[f"{key}_{subkey}"] = subvalue
                    elif isinstance(value, list):
                        if not value:
                            flat_persona[key] = ""
                        elif isinstance(value[0], str):
                            # Convert arrays to semicolon-separated strings
                            flat_persona[key] = "; ".join(str(v) for v in value if v)
                        else:
                            flat_persona[key] = json.dumps(value)
                    else:
                        flat_persona[key] = value if value is not None else ""

                # Create CSV content using built-in csv module
                output = io.StringIO()
                if flat_persona:  # Only proceed if we have data
                    writer = csv.DictWriter(output, fieldnames=list(flat_persona.keys()))
                    writer.writeheader()
                    writer.writerow(flat_persona)
                csv_content = output.getvalue()
                output.close()

                # Create a safe filename that is unique within this archive
                # (compared case-insensitively for case-insensitive file systems).
                base_name = self._sanitize_filename(persona_name)
                unique_name = base_name
                suffix = 1
                while unique_name.lower() in used_names:
                    suffix += 1
                    unique_name = f"{base_name}_{suffix}"
                used_names.add(unique_name.lower())
                filename = f"{unique_name}.csv"

                # Add to ZIP
                zipf.writestr(filename, csv_content.encode('utf-8'))
                logger.info(f"Added {filename} to ZIP ({len(csv_content)} chars)")

            # Final progress update
            await self._emit_progress(
                user_id, task_id, 95,
                "Finalizing ZIP file...", total_personas, total_personas
            )

        # Verify ZIP was created
        if not os.path.exists(zip_path):
            return False, "Failed to create ZIP file", task_id
        file_size = os.path.getsize(zip_path)
        logger.info(f"Created CSV ZIP file: {zip_path} ({file_size} bytes)")

        # Success notification
        await self._emit_progress(
            user_id, task_id, 100,
            f"Export completed! {total_personas} personas exported.",
            total_personas, total_personas
        )
        if self.websocket_manager:
            await self.websocket_manager.emit_to_user(
                user_id,
                'task_completed',
                {
                    'task_id': task_id,
                    'message': f'Successfully exported {total_personas} persona CSV files',
                    'file_path': zip_path,
                    'file_size': file_size
                }
            )
        return True, zip_path, task_id
    except Exception as e:
        logger.error(f"CSV ZIP export error: {e}")
        return False, f"CSV export failed: {str(e)}", task_id