Add environment configuration and local development documentation
- Parameterize CORS_ORIGINS in docker-compose.swarm.yml
- Add .env.example with configuration options
- Create comprehensive LOCAL_DEVELOPMENT.md guide
- Update README.md with environment variable documentation
- Provide alternatives for local development without production domain

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
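The CORS wiring itself is not part of the diff shown below. As a rough sketch of what a parameterized `CORS_ORIGINS` implies on the backend side (assuming a FastAPI app and a comma-separated origin list; both are assumptions, only the variable name comes from the commit message):

```python
# Hypothetical sketch, not taken from this commit: read CORS_ORIGINS from the
# environment and pass it to FastAPI's CORS middleware.
import os

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

app = FastAPI()

# e.g. CORS_ORIGINS="http://localhost:3000,https://hive.example.com"
cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ORIGINS", "http://localhost:3000").split(",")
    if origin.strip()
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=cors_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
```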
backend/app/core/performance_monitor.py (new file, 664 lines)

@@ -0,0 +1,664 @@
"""
Performance Monitoring and Optimization System
Real-time monitoring and automatic optimization for distributed workflows
"""

import asyncio
import time
import logging
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from collections import defaultdict, deque
import json
import statistics
import psutil
import aiofiles

from prometheus_client import (
    Counter, Histogram, Gauge, Summary,
    CollectorRegistry, generate_latest, CONTENT_TYPE_LATEST
)

logger = logging.getLogger(__name__)

@dataclass
class PerformanceMetric:
    """Individual performance metric"""
    timestamp: datetime
    agent_id: str
    metric_type: str
    value: float
    metadata: Dict[str, Any] = field(default_factory=dict)

@dataclass
class AgentPerformanceProfile:
    """Performance profile for a cluster agent"""
    agent_id: str
    avg_response_time: float = 0.0
    task_throughput: float = 0.0  # tasks per minute
    success_rate: float = 1.0
    current_load: float = 0.0
    memory_usage: float = 0.0
    gpu_utilization: float = 0.0
    last_updated: datetime = field(default_factory=datetime.now)

    # Historical data (keep last 100 measurements)
    response_times: deque = field(default_factory=lambda: deque(maxlen=100))
    task_completions: deque = field(default_factory=lambda: deque(maxlen=100))
    error_count: int = 0
    total_tasks: int = 0

@dataclass
class WorkflowPerformanceData:
    """Performance data for a workflow"""
    workflow_id: str
    start_time: datetime
    end_time: Optional[datetime] = None
    total_tasks: int = 0
    completed_tasks: int = 0
    failed_tasks: int = 0
    avg_task_duration: float = 0.0
    bottleneck_agents: List[str] = field(default_factory=list)
    optimization_suggestions: List[str] = field(default_factory=list)

class PerformanceMonitor:
    """Real-time performance monitoring and optimization system"""

    def __init__(self, monitoring_interval: int = 30):
        self.monitoring_interval = monitoring_interval
        self.agent_profiles: Dict[str, AgentPerformanceProfile] = {}
        self.workflow_data: Dict[str, WorkflowPerformanceData] = {}
        self.metrics_history: deque = deque(maxlen=10000)  # Keep last 10k metrics

        # Performance thresholds
        self.thresholds = {
            'response_time_warning': 30.0,  # seconds
            'response_time_critical': 60.0,  # seconds
            'success_rate_warning': 0.9,
            'success_rate_critical': 0.8,
            'utilization_warning': 0.8,
            'utilization_critical': 0.95,
            'queue_depth_warning': 10,
            'queue_depth_critical': 25
        }

        # Optimization rules
        self.optimization_rules = {
            'load_balancing': True,
            'auto_scaling': True,
            'performance_tuning': True,
            'bottleneck_detection': True,
            'predictive_optimization': True
        }

        # Prometheus metrics
        self.setup_prometheus_metrics()

        # Background tasks
        self.monitoring_task: Optional[asyncio.Task] = None
        self.optimization_task: Optional[asyncio.Task] = None

        # Performance alerts
        self.active_alerts: Dict[str, Dict] = {}
        self.alert_history: List[Dict] = []

    def setup_prometheus_metrics(self):
        """Setup Prometheus metrics for monitoring"""
        self.registry = CollectorRegistry()

        # Task metrics
        self.task_duration = Histogram(
            'hive_task_duration_seconds',
            'Task execution duration',
            ['agent_id', 'task_type'],
            registry=self.registry
        )

        self.task_counter = Counter(
            'hive_tasks_total',
            'Total tasks processed',
            ['agent_id', 'task_type', 'status'],
            registry=self.registry
        )

        # Agent metrics
        self.agent_response_time = Histogram(
            'hive_agent_response_time_seconds',
            'Agent response time',
            ['agent_id'],
            registry=self.registry
        )

        self.agent_utilization = Gauge(
            'hive_agent_utilization_ratio',
            'Agent utilization ratio',
            ['agent_id'],
            registry=self.registry
        )

        self.agent_queue_depth = Gauge(
            'hive_agent_queue_depth',
            'Number of queued tasks per agent',
            ['agent_id'],
            registry=self.registry
        )

        # Workflow metrics
        self.workflow_duration = Histogram(
            'hive_workflow_duration_seconds',
            'Workflow completion time',
            ['workflow_type'],
            registry=self.registry
        )

        self.workflow_success_rate = Gauge(
            'hive_workflow_success_rate',
            'Workflow success rate',
            registry=self.registry
        )

        # System metrics
        self.system_cpu_usage = Gauge(
            'hive_system_cpu_usage_percent',
            'System CPU usage percentage',
            registry=self.registry
        )

        self.system_memory_usage = Gauge(
            'hive_system_memory_usage_percent',
            'System memory usage percentage',
            registry=self.registry
        )

    async def start_monitoring(self):
        """Start the performance monitoring system"""
        logger.info("Starting performance monitoring system")

        # Start monitoring tasks
        self.monitoring_task = asyncio.create_task(self._monitoring_loop())
        self.optimization_task = asyncio.create_task(self._optimization_loop())

        logger.info("Performance monitoring system started")

    async def stop_monitoring(self):
        """Stop the performance monitoring system"""
        logger.info("Stopping performance monitoring system")

        # Cancel background tasks
        if self.monitoring_task:
            self.monitoring_task.cancel()
            try:
                await self.monitoring_task
            except asyncio.CancelledError:
                pass

        if self.optimization_task:
            self.optimization_task.cancel()
            try:
                await self.optimization_task
            except asyncio.CancelledError:
                pass

        logger.info("Performance monitoring system stopped")

    async def _monitoring_loop(self):
        """Main monitoring loop"""
        while True:
            try:
                await self._collect_system_metrics()
                await self._update_agent_metrics()
                await self._detect_performance_issues()
                await self._update_prometheus_metrics()

                await asyncio.sleep(self.monitoring_interval)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in monitoring loop: {e}")
                await asyncio.sleep(self.monitoring_interval)

    async def _optimization_loop(self):
        """Main optimization loop"""
        while True:
            try:
                await self._optimize_load_balancing()
                await self._optimize_agent_parameters()
                await self._generate_optimization_recommendations()
                await self._cleanup_old_data()

                await asyncio.sleep(self.monitoring_interval * 2)  # Run less frequently

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in optimization loop: {e}")
                await asyncio.sleep(self.monitoring_interval * 2)

    async def _collect_system_metrics(self):
        """Collect system-level metrics"""
        try:
            # CPU usage
            cpu_percent = psutil.cpu_percent(interval=1)
            self.system_cpu_usage.set(cpu_percent)

            # Memory usage
            memory = psutil.virtual_memory()
            memory_percent = memory.percent
            self.system_memory_usage.set(memory_percent)

            # Log critical system metrics
            if cpu_percent > 90:
                logger.warning(f"High system CPU usage: {cpu_percent:.1f}%")
            if memory_percent > 90:
                logger.warning(f"High system memory usage: {memory_percent:.1f}%")

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    async def _update_agent_metrics(self):
        """Update agent performance metrics"""
        for agent_id, profile in self.agent_profiles.items():
            try:
                # Calculate current metrics
                if profile.response_times:
                    profile.avg_response_time = statistics.mean(profile.response_times)

                # Calculate task throughput (tasks per minute)
                recent_completions = [
                    timestamp for timestamp in profile.task_completions
                    if timestamp > datetime.now() - timedelta(minutes=5)
                ]
                profile.task_throughput = len(recent_completions) / 5.0  # tasks per minute over a 5-minute window

                # Calculate success rate
                if profile.total_tasks > 0:
                    profile.success_rate = 1.0 - (profile.error_count / profile.total_tasks)

                # Update Prometheus metrics
                self.agent_response_time.labels(agent_id=agent_id).observe(profile.avg_response_time)
                self.agent_utilization.labels(agent_id=agent_id).set(profile.current_load)

                profile.last_updated = datetime.now()

            except Exception as e:
                logger.error(f"Error updating metrics for agent {agent_id}: {e}")

    async def _detect_performance_issues(self):
        """Detect performance issues and generate alerts"""
        current_time = datetime.now()

        for agent_id, profile in self.agent_profiles.items():
            alerts = []

            # Response time alerts
            if profile.avg_response_time > self.thresholds['response_time_critical']:
                alerts.append({
                    'type': 'critical',
                    'metric': 'response_time',
                    'value': profile.avg_response_time,
                    'threshold': self.thresholds['response_time_critical'],
                    'message': f"Agent {agent_id} has critical response time: {profile.avg_response_time:.2f}s"
                })
            elif profile.avg_response_time > self.thresholds['response_time_warning']:
                alerts.append({
                    'type': 'warning',
                    'metric': 'response_time',
                    'value': profile.avg_response_time,
                    'threshold': self.thresholds['response_time_warning'],
                    'message': f"Agent {agent_id} has high response time: {profile.avg_response_time:.2f}s"
                })

            # Success rate alerts
            if profile.success_rate < self.thresholds['success_rate_critical']:
                alerts.append({
                    'type': 'critical',
                    'metric': 'success_rate',
                    'value': profile.success_rate,
                    'threshold': self.thresholds['success_rate_critical'],
                    'message': f"Agent {agent_id} has critical success rate: {profile.success_rate:.2%}"
                })
            elif profile.success_rate < self.thresholds['success_rate_warning']:
                alerts.append({
                    'type': 'warning',
                    'metric': 'success_rate',
                    'value': profile.success_rate,
                    'threshold': self.thresholds['success_rate_warning'],
                    'message': f"Agent {agent_id} has low success rate: {profile.success_rate:.2%}"
                })

            # Process alerts
            for alert in alerts:
                alert_key = f"{agent_id}_{alert['metric']}"
                alert['agent_id'] = agent_id
                alert['timestamp'] = current_time.isoformat()

                # Add to active alerts
                self.active_alerts[alert_key] = alert
                self.alert_history.append(alert)

                # Log alert
                if alert['type'] == 'critical':
                    logger.error(alert['message'])
                else:
                    logger.warning(alert['message'])

    async def _update_prometheus_metrics(self):
        """Update Prometheus metrics"""
        try:
            # Update workflow success rate
            total_workflows = len(self.workflow_data)
            if total_workflows > 0:
                successful_workflows = sum(
                    1 for workflow in self.workflow_data.values()
                    if workflow.end_time and workflow.failed_tasks == 0
                )
                success_rate = successful_workflows / total_workflows
                self.workflow_success_rate.set(success_rate)

        except Exception as e:
            logger.error(f"Error updating Prometheus metrics: {e}")

    async def _optimize_load_balancing(self):
        """Optimize load balancing across agents"""
        if not self.optimization_rules['load_balancing']:
            return

        try:
            # Calculate load distribution
            agent_loads = {
                agent_id: profile.current_load / profile.total_tasks if profile.total_tasks > 0 else 0
                for agent_id, profile in self.agent_profiles.items()
            }

            if not agent_loads:
                return

            # Identify overloaded and underloaded agents
            avg_load = statistics.mean(agent_loads.values())
            overloaded_agents = [
                agent_id for agent_id, load in agent_loads.items()
                if load > avg_load * 1.5
            ]
            underloaded_agents = [
                agent_id for agent_id, load in agent_loads.items()
                if load < avg_load * 0.5
            ]

            # Log load balancing opportunities
            if overloaded_agents and underloaded_agents:
                logger.info("Load balancing opportunity detected:")
                logger.info(f"  Overloaded: {overloaded_agents}")
                logger.info(f"  Underloaded: {underloaded_agents}")

        except Exception as e:
            logger.error(f"Error in load balancing optimization: {e}")

    async def _optimize_agent_parameters(self):
        """Optimize agent parameters based on performance"""
        if not self.optimization_rules['performance_tuning']:
            return

        try:
            for agent_id, profile in self.agent_profiles.items():
                optimizations = []

                # Optimize based on response time
                if profile.avg_response_time > self.thresholds['response_time_warning']:
                    if profile.current_load > 0.8:
                        optimizations.append("Reduce max_concurrent tasks")
                    optimizations.append("Consider model quantization")
                    optimizations.append("Enable connection pooling")

                # Optimize based on throughput
                if profile.task_throughput < 5:  # Less than 5 tasks per minute
                    optimizations.append("Increase task batching")
                    optimizations.append("Optimize prompt templates")

                # Optimize based on success rate
                if profile.success_rate < self.thresholds['success_rate_warning']:
                    optimizations.append("Review error handling")
                    optimizations.append("Increase timeout limits")
                    optimizations.append("Check agent health")

                if optimizations:
                    logger.info(f"Optimization recommendations for {agent_id}:")
                    for opt in optimizations:
                        logger.info(f"  - {opt}")

        except Exception as e:
            logger.error(f"Error in agent parameter optimization: {e}")

    async def _generate_optimization_recommendations(self):
        """Generate system-wide optimization recommendations"""
        try:
            recommendations = []

            # Analyze overall system performance
            if self.agent_profiles:
                avg_response_time = statistics.mean(
                    profile.avg_response_time for profile in self.agent_profiles.values()
                )
                avg_success_rate = statistics.mean(
                    profile.success_rate for profile in self.agent_profiles.values()
                )

                if avg_response_time > 30:
                    recommendations.append({
                        'type': 'performance',
                        'priority': 'high',
                        'recommendation': 'Consider adding more GPU capacity to the cluster',
                        'impact': 'Reduce average response time'
                    })

                if avg_success_rate < 0.9:
                    recommendations.append({
                        'type': 'reliability',
                        'priority': 'high',
                        'recommendation': 'Investigate and resolve agent stability issues',
                        'impact': 'Improve workflow success rate'
                    })

            # Analyze task distribution
            task_counts = [profile.total_tasks for profile in self.agent_profiles.values()]
            if task_counts and max(task_counts) > min(task_counts) * 3:
                recommendations.append({
                    'type': 'load_balancing',
                    'priority': 'medium',
                    'recommendation': 'Rebalance task distribution across agents',
                    'impact': 'Improve cluster utilization'
                })

            # Log recommendations
            if recommendations:
                logger.info("System optimization recommendations:")
                for rec in recommendations:
                    logger.info(f"  [{rec['priority'].upper()}] {rec['recommendation']}")

        except Exception as e:
            logger.error(f"Error generating optimization recommendations: {e}")

    async def _cleanup_old_data(self):
        """Clean up old performance data"""
        try:
            cutoff_time = datetime.now() - timedelta(hours=24)

            # Clean up old metrics
            self.metrics_history = deque(
                [metric for metric in self.metrics_history if metric.timestamp > cutoff_time],
                maxlen=10000
            )

            # Clean up old alerts
            self.alert_history = [
                alert for alert in self.alert_history
                if datetime.fromisoformat(alert['timestamp']) > cutoff_time
            ]

            # Clean up completed workflows older than 24 hours
            old_workflows = [
                workflow_id for workflow_id, workflow in self.workflow_data.items()
                if workflow.end_time and workflow.end_time < cutoff_time
            ]

            for workflow_id in old_workflows:
                del self.workflow_data[workflow_id]

            if old_workflows:
                logger.info(f"Cleaned up {len(old_workflows)} old workflow records")

        except Exception as e:
            logger.error(f"Error in data cleanup: {e}")

    def record_task_start(self, agent_id: str, task_id: str, task_type: str):
        """Record the start of a task"""
        if agent_id not in self.agent_profiles:
            self.agent_profiles[agent_id] = AgentPerformanceProfile(agent_id=agent_id)

        profile = self.agent_profiles[agent_id]
        profile.current_load += 1
        profile.total_tasks += 1

        # Record metric
        metric = PerformanceMetric(
            timestamp=datetime.now(),
            agent_id=agent_id,
            metric_type='task_start',
            value=1.0,
            metadata={'task_id': task_id, 'task_type': task_type}
        )
        self.metrics_history.append(metric)

    def record_task_completion(self, agent_id: str, task_id: str, duration: float, success: bool):
        """Record the completion of a task"""
        if agent_id not in self.agent_profiles:
            return

        profile = self.agent_profiles[agent_id]
        profile.current_load = max(0, profile.current_load - 1)
        profile.response_times.append(duration)
        profile.task_completions.append(datetime.now())

        if not success:
            profile.error_count += 1

        # Update Prometheus metrics
        status = 'success' if success else 'failure'
        self.task_counter.labels(agent_id=agent_id, task_type='unknown', status=status).inc()
        self.task_duration.labels(agent_id=agent_id, task_type='unknown').observe(duration)

        # Record metric
        metric = PerformanceMetric(
            timestamp=datetime.now(),
            agent_id=agent_id,
            metric_type='task_completion',
            value=duration,
            metadata={'task_id': task_id, 'success': success}
        )
        self.metrics_history.append(metric)

    def record_workflow_start(self, workflow_id: str, total_tasks: int):
        """Record the start of a workflow"""
        self.workflow_data[workflow_id] = WorkflowPerformanceData(
            workflow_id=workflow_id,
            start_time=datetime.now(),
            total_tasks=total_tasks
        )

    def record_workflow_completion(self, workflow_id: str, completed_tasks: int, failed_tasks: int):
        """Record the completion of a workflow"""
        if workflow_id not in self.workflow_data:
            return

        workflow = self.workflow_data[workflow_id]
        workflow.end_time = datetime.now()
        workflow.completed_tasks = completed_tasks
        workflow.failed_tasks = failed_tasks

        # Calculate workflow duration
        if workflow.start_time:
            duration = (workflow.end_time - workflow.start_time).total_seconds()
            self.workflow_duration.labels(workflow_type='standard').observe(duration)

    def get_performance_summary(self) -> Dict[str, Any]:
        """Get a comprehensive performance summary"""
        summary = {
            'timestamp': datetime.now().isoformat(),
            'cluster_overview': {
                'total_agents': len(self.agent_profiles),
                'healthy_agents': sum(
                    1 for profile in self.agent_profiles.values()
                    if profile.success_rate > 0.8
                ),
                'avg_response_time': statistics.mean(
                    profile.avg_response_time for profile in self.agent_profiles.values()
                ) if self.agent_profiles else 0.0,
                'avg_success_rate': statistics.mean(
                    profile.success_rate for profile in self.agent_profiles.values()
                ) if self.agent_profiles else 1.0,
                'total_tasks_processed': sum(
                    profile.total_tasks for profile in self.agent_profiles.values()
                )
            },
            'agent_performance': {
                agent_id: {
                    'avg_response_time': profile.avg_response_time,
                    'task_throughput': profile.task_throughput,
                    'success_rate': profile.success_rate,
                    'current_load': profile.current_load,
                    'total_tasks': profile.total_tasks,
                    'error_count': profile.error_count
                }
                for agent_id, profile in self.agent_profiles.items()
            },
            'workflow_statistics': {
                'total_workflows': len(self.workflow_data),
                'completed_workflows': sum(
                    1 for workflow in self.workflow_data.values()
                    if workflow.end_time is not None
                ),
                'successful_workflows': sum(
                    1 for workflow in self.workflow_data.values()
                    if workflow.end_time and workflow.failed_tasks == 0
                ),
                'avg_workflow_duration': statistics.mean([
                    (workflow.end_time - workflow.start_time).total_seconds()
                    for workflow in self.workflow_data.values()
                    if workflow.end_time
                ]) if any(w.end_time for w in self.workflow_data.values()) else 0.0
            },
            'active_alerts': list(self.active_alerts.values()),
            'recent_alerts': self.alert_history[-10:],  # Last 10 alerts
            'system_health': {
                'metrics_collected': len(self.metrics_history),
                'monitoring_active': self.monitoring_task is not None and not self.monitoring_task.done(),
                'optimization_active': self.optimization_task is not None and not self.optimization_task.done()
            }
        }

        return summary

    async def export_prometheus_metrics(self) -> str:
        """Export Prometheus metrics"""
        return generate_latest(self.registry).decode('utf-8')

    async def save_performance_report(self, filename: str):
        """Save a detailed performance report to file"""
        summary = self.get_performance_summary()

        async with aiofiles.open(filename, 'w') as f:
            await f.write(json.dumps(summary, indent=2, default=str))

        logger.info(f"Performance report saved to {filename}")


# Global performance monitor instance
performance_monitor: Optional[PerformanceMonitor] = None

def get_performance_monitor() -> PerformanceMonitor:
    """Get the global performance monitor instance"""
    global performance_monitor
    if performance_monitor is None:
        performance_monitor = PerformanceMonitor()
    return performance_monitor
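For orientation, a minimal usage sketch of the monitor added in this commit. The import path, agent IDs, and task names are assumptions for illustration; only the `PerformanceMonitor` methods come from the file above.

```python
# Hypothetical usage sketch, not part of the commit.
import asyncio

from app.core.performance_monitor import get_performance_monitor  # import path assumed


async def main():
    monitor = get_performance_monitor()
    await monitor.start_monitoring()

    # Record one task lifecycle for an example agent.
    monitor.record_task_start(agent_id="agent-1", task_id="task-42", task_type="inference")
    await asyncio.sleep(1)  # stand-in for real work
    monitor.record_task_completion(agent_id="agent-1", task_id="task-42", duration=1.0, success=True)

    # Inspect aggregated state and the Prometheus exposition output.
    print(monitor.get_performance_summary()["cluster_overview"])
    print(await monitor.export_prometheus_metrics())

    await monitor.stop_monitoring()


if __name__ == "__main__":
    asyncio.run(main())
```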