Refactor UnifiedCoordinator to follow Single Responsibility Principle
- Create dedicated service classes for separated concerns:
  * AgentService: Agent management and health monitoring
  * WorkflowService: Workflow parsing and execution tracking
  * PerformanceService: Metrics and load balancing
  * BackgroundService: Background processes and cleanup
  * TaskService: Database persistence (already existed)
- Refactor UnifiedCoordinator into UnifiedCoordinatorRefactored
  * Clean separation of responsibilities
  * Improved maintainability and testability
  * Dependency injection pattern for services
  * Clear service boundaries and interfaces
- Maintain backward compatibility through re-exports
- Update main.py to use refactored coordinator

🚀 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
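To illustrate the dependency-injection pattern described above, the sketch below shows one way the services could be composed. UnifiedCoordinatorRefactored itself is not part of this diff, so the class and method names here (other than the five services) are hypothetical, not the committed implementation.

# Illustrative wiring only; coordinator name and constructor details are assumptions.
from app.services.agent_service import AgentService
from app.services.background_service import BackgroundService
from app.services.performance_service import PerformanceService
from app.services.workflow_service import WorkflowService


class CoordinatorSketch:
    """Hypothetical coordinator composing the extracted services."""

    def __init__(self, task_service):
        # Each concern lives in its own service; the coordinator only wires them.
        self.agent_service = AgentService()
        self.workflow_service = WorkflowService()
        self.performance_service = PerformanceService()
        self.background_service = BackgroundService()
        self.task_service = task_service  # pre-existing persistence service

    async def start(self):
        await self.agent_service.initialize()
        self.workflow_service.initialize()
        self.performance_service.initialize()
        self.background_service.initialize(
            self.agent_service,
            self.task_service,
            self.workflow_service,
            self.performance_service,
        )
        await self.background_service.start()

    async def stop(self):
        await self.background_service.shutdown()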
300  backend/app/services/agent_service.py  Normal file
@@ -0,0 +1,300 @@
"""
Agent Management Service

Handles agent registration, health monitoring, and connectivity management.
"""

import asyncio
import aiohttp
import time
import logging
from typing import Dict, List, Optional, Set, Any
from dataclasses import dataclass, field
from sqlalchemy.orm import Session
from enum import Enum

from ..models.agent import Agent as ORMAgent
from ..core.database import SessionLocal
from ..cli_agents.cli_agent_manager import get_cli_agent_manager

logger = logging.getLogger(__name__)


class AgentType(Enum):
    """Unified agent types supporting both original and distributed workflows"""
    # Original agent types
    KERNEL_DEV = "kernel_dev"
    PYTORCH_DEV = "pytorch_dev"
    PROFILER = "profiler"
    DOCS_WRITER = "docs_writer"
    TESTER = "tester"
    CLI_GEMINI = "cli_gemini"
    GENERAL_AI = "general_ai"
    REASONING = "reasoning"

    # Distributed workflow types
    CODE_GENERATION = "code_generation"
    CODE_REVIEW = "code_review"
    TESTING = "testing"
    COMPILATION = "compilation"
    OPTIMIZATION = "optimization"
    DOCUMENTATION = "documentation"
    DEPLOYMENT = "deployment"


@dataclass
class Agent:
    """Unified agent representation supporting both Ollama and CLI agents"""
    id: str
    endpoint: str
    model: str
    specialty: AgentType
    max_concurrent: int = 2
    current_tasks: int = 0
    agent_type: str = "ollama"  # "ollama" or "cli"
    cli_config: Optional[Dict[str, Any]] = None

    # Enhanced fields for distributed workflows
    gpu_type: str = "unknown"
    capabilities: Set[str] = field(default_factory=set)
    performance_history: List[float] = field(default_factory=list)
    specializations: List[AgentType] = field(default_factory=list)
    last_heartbeat: float = field(default_factory=time.time)

    def __post_init__(self):
        if self.specializations:
            self.capabilities.update([spec.value for spec in self.specializations])


class AgentService:
    """Service for managing agents in the Hive cluster"""

    def __init__(self):
        self.agents: Dict[str, Agent] = {}
        self.cli_agent_manager = None
        self._initialized = False

    async def initialize(self):
        """Initialize the agent service"""
        if self._initialized:
            return

        try:
            # Initialize CLI agent manager
            self.cli_agent_manager = get_cli_agent_manager()

            # Load agents from database
            await self._load_database_agents()

            # Initialize predefined cluster agents
            self._initialize_cluster_agents()

            # Test initial connectivity
            await self._test_initial_connectivity()

            self._initialized = True
            logger.info("✅ Agent Service initialized successfully")

        except Exception as e:
            logger.error(f"❌ Failed to initialize agent service: {e}")
            raise

    def add_agent(self, agent: Agent):
        """Add an agent to the service"""
        self.agents[agent.id] = agent
        logger.info(f"✅ Added agent: {agent.id} ({agent.specialty.value})")

    def get_agent(self, agent_id: str) -> Optional[Agent]:
        """Get agent by ID"""
        return self.agents.get(agent_id)

    def get_all_agents(self) -> Dict[str, Agent]:
        """Get all agents"""
        return self.agents.copy()

    def get_agents_by_specialty(self, specialty: AgentType) -> List[Agent]:
        """Get agents by specialty"""
        return [
            agent for agent in self.agents.values()
            if agent.specialty == specialty or specialty in agent.specializations
        ]

    def get_available_agents(self, specialty: Optional[AgentType] = None) -> List[Agent]:
        """Get available agents, optionally filtered by specialty"""
        available = [
            agent for agent in self.agents.values()
            if agent.current_tasks < agent.max_concurrent
        ]

        if specialty:
            available = [
                agent for agent in available
                if agent.specialty == specialty or specialty in agent.specializations
            ]

        return available

    def get_optimal_agent(self, specialty: AgentType, load_balancer=None) -> Optional[Agent]:
        """Get the optimal agent for a task type"""
        available_agents = [
            agent for agent in self.agents.values()
            if (agent.specialty == specialty or specialty in agent.specializations)
            and agent.current_tasks < agent.max_concurrent
        ]

        if not available_agents:
            # Fallback to general AI agents
            available_agents = [
                agent for agent in self.agents.values()
                if agent.specialty == AgentType.GENERAL_AI
                and agent.current_tasks < agent.max_concurrent
            ]

        if available_agents:
            if load_balancer:
                return min(available_agents, key=lambda a: load_balancer.get_weight(a.id))
            else:
                # Simple round-robin based on current tasks
                return min(available_agents, key=lambda a: a.current_tasks)

        return None

    def increment_agent_tasks(self, agent_id: str):
        """Increment current task count for an agent"""
        if agent_id in self.agents:
            self.agents[agent_id].current_tasks += 1

    def decrement_agent_tasks(self, agent_id: str):
        """Decrement current task count for an agent"""
        if agent_id in self.agents:
            self.agents[agent_id].current_tasks = max(0, self.agents[agent_id].current_tasks - 1)

    def update_agent_heartbeat(self, agent_id: str):
        """Update agent heartbeat timestamp"""
        if agent_id in self.agents:
            self.agents[agent_id].last_heartbeat = time.time()

    async def _load_database_agents(self):
        """Load agents from database"""
        try:
            db = SessionLocal()
            orm_agents = db.query(ORMAgent).all()

            for orm_agent in orm_agents:
                specialty = AgentType(orm_agent.specialty) if orm_agent.specialty else AgentType.GENERAL_AI
                agent = Agent(
                    id=orm_agent.id,
                    endpoint=orm_agent.endpoint,
                    model=orm_agent.model or "unknown",
                    specialty=specialty,
                    max_concurrent=orm_agent.max_concurrent,
                    current_tasks=orm_agent.current_tasks,
                    agent_type=orm_agent.agent_type,
                    cli_config=orm_agent.cli_config
                )
                self.add_agent(agent)

            db.close()
            logger.info(f"📊 Loaded {len(orm_agents)} agents from database")

        except Exception as e:
            logger.error(f"❌ Failed to load agents from database: {e}")

    def _initialize_cluster_agents(self):
        """Initialize predefined cluster agents"""
        cluster_agents = [
            Agent(
                id="walnut-codellama",
                endpoint="http://walnut.local:11434",
                model="codellama:34b",
                specialty=AgentType.KERNEL_DEV
            ),
            Agent(
                id="oak-gemma",
                endpoint="http://oak.local:11434",
                model="gemma2:27b",
                specialty=AgentType.PYTORCH_DEV
            ),
            Agent(
                id="ironwood-llama",
                endpoint="http://ironwood.local:11434",
                model="llama3.1:70b",
                specialty=AgentType.GENERAL_AI
            )
        ]

        for agent in cluster_agents:
            if agent.id not in self.agents:
                self.add_agent(agent)

    async def _test_initial_connectivity(self):
        """Test connectivity to all agents"""
        logger.info("🔍 Testing agent connectivity...")

        for agent in self.agents.values():
            try:
                if agent.agent_type == "cli":
                    # Test CLI agent
                    if self.cli_agent_manager:
                        await self.cli_agent_manager.test_agent(agent.id)
                else:
                    # Test Ollama agent
                    async with aiohttp.ClientSession() as session:
                        async with session.get(
                            f"{agent.endpoint}/api/tags",
                            timeout=aiohttp.ClientTimeout(total=5)
                        ) as response:
                            if response.status == 200:
                                logger.info(f"✅ Agent {agent.id} is responsive")
                            else:
                                logger.warning(f"⚠️ Agent {agent.id} returned HTTP {response.status}")
            except Exception as e:
                logger.warning(f"⚠️ Agent {agent.id} is not responsive: {e}")

    async def check_agent_health(self, agent: Agent) -> bool:
        """Check individual agent health"""
        try:
            if agent.agent_type == "cli":
                # CLI agent health check
                if self.cli_agent_manager:
                    return await self.cli_agent_manager.test_agent(agent.id)
                return False
            else:
                # Ollama agent health check
                async with aiohttp.ClientSession() as session:
                    async with session.get(
                        f"{agent.endpoint}/api/tags",
                        timeout=aiohttp.ClientTimeout(total=10)
                    ) as response:
                        return response.status == 200

        except Exception as e:
            logger.warning(f"⚠️ Agent {agent.id} health check error: {e}")
            return False

    async def health_monitor_cycle(self):
        """Single cycle of health monitoring for all agents"""
        try:
            for agent in self.agents.values():
                is_healthy = await self.check_agent_health(agent)
                if is_healthy:
                    agent.last_heartbeat = time.time()
                else:
                    logger.warning(f"⚠️ Agent {agent.id} health check failed")
        except Exception as e:
            logger.error(f"❌ Health monitor cycle error: {e}")

    def get_agent_status(self) -> Dict[str, Dict]:
        """Get status of all agents"""
        agent_status = {}
        for agent_id, agent in self.agents.items():
            agent_status[agent_id] = {
                "type": agent.agent_type,
                "model": agent.model,
                "specialty": agent.specialty.value,
                "current_tasks": agent.current_tasks,
                "max_concurrent": agent.max_concurrent,
                "last_heartbeat": agent.last_heartbeat,
                "utilization": agent.current_tasks / agent.max_concurrent if agent.max_concurrent > 0 else 0
            }
        return agent_status
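The snippet below is a minimal usage sketch for AgentService, assuming the backend package is importable; the agent id, endpoint, and model are illustrative, not real cluster nodes.

from app.services.agent_service import Agent, AgentService, AgentType

service = AgentService()
# Register an agent directly; initialize() would also load agents from the
# database and probe connectivity, which needs a live cluster.
service.add_agent(Agent(
    id="example-agent",                      # illustrative id
    endpoint="http://example.local:11434",   # illustrative Ollama endpoint
    model="codellama:34b",
    specialty=AgentType.KERNEL_DEV,
))

agent = service.get_optimal_agent(AgentType.KERNEL_DEV)
if agent:
    service.increment_agent_tasks(agent.id)
    # ... dispatch work against agent.endpoint ...
    service.decrement_agent_tasks(agent.id)

print(service.get_agent_status())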
163  backend/app/services/background_service.py  Normal file
@@ -0,0 +1,163 @@
"""
Background Processing Service

Handles background tasks, cleanup, monitoring, and maintenance operations.
"""

import asyncio
import logging
from typing import Set, Optional, Callable
from concurrent.futures import ThreadPoolExecutor

logger = logging.getLogger(__name__)


class BackgroundService:
    """Service for managing background tasks and processes"""

    def __init__(self):
        self.running = False
        self.executor = ThreadPoolExecutor(max_workers=4)
        self._background_tasks: Set[asyncio.Task] = set()
        self._initialized = False

        # Service references (injected)
        self.agent_service = None
        self.task_service = None
        self.workflow_service = None
        self.performance_service = None

    def initialize(self, agent_service, task_service, workflow_service, performance_service):
        """Initialize the background service with dependencies"""
        if self._initialized:
            return

        self.agent_service = agent_service
        self.task_service = task_service
        self.workflow_service = workflow_service
        self.performance_service = performance_service

        self._initialized = True
        logger.info("✅ Background Service initialized successfully")

    async def start(self):
        """Start background processes"""
        if not self._initialized:
            raise Exception("Background service not initialized")

        self.running = True

        # Start background tasks
        self._background_tasks.add(asyncio.create_task(self._health_monitor()))
        self._background_tasks.add(asyncio.create_task(self._performance_optimizer()))
        self._background_tasks.add(asyncio.create_task(self._cleanup_manager()))

        logger.info("🚀 Background Service processes started")

    async def shutdown(self):
        """Shutdown background processes"""
        logger.info("🛑 Shutting down Background Service...")

        self.running = False

        # Cancel background tasks
        for task in self._background_tasks:
            task.cancel()

        # Wait for tasks to complete
        if self._background_tasks:
            await asyncio.gather(*self._background_tasks, return_exceptions=True)

        # Shutdown executor
        self.executor.shutdown(wait=True)

        logger.info("✅ Background Service shutdown complete")

    async def _health_monitor(self):
        """Background health monitoring"""
        while self.running:
            try:
                if self.agent_service:
                    await self.agent_service.health_monitor_cycle()
                await asyncio.sleep(30)  # Check every 30 seconds
            except Exception as e:
                logger.error(f"❌ Health monitor error: {e}")
                await asyncio.sleep(60)

    async def _performance_optimizer(self):
        """Background performance optimization"""
        while self.running:
            try:
                if self.performance_service and self.agent_service:
                    await self.performance_service.optimization_cycle(
                        self.agent_service.get_all_agents()
                    )
                await asyncio.sleep(300)  # Optimize every 5 minutes
            except Exception as e:
                logger.error(f"❌ Performance optimizer error: {e}")
                await asyncio.sleep(600)

    async def _cleanup_manager(self):
        """Background cleanup management"""
        while self.running:
            try:
                # Cleanup completed tasks
                if self.task_service:
                    cleaned_count = await self._cleanup_completed_tasks()
                    if cleaned_count > 0:
                        logger.info(f"🧹 Cleaned up {cleaned_count} old tasks")

                # Cleanup workflows
                if self.workflow_service:
                    workflow_cleaned = self.workflow_service.cleanup_completed_workflows(max_age_hours=24)
                    if workflow_cleaned > 0:
                        logger.info(f"🧹 Cleaned up {workflow_cleaned} old workflows")

                await asyncio.sleep(3600)  # Cleanup every hour
            except Exception as e:
                logger.error(f"❌ Cleanup manager error: {e}")
                await asyncio.sleep(1800)  # Retry in 30 minutes

    async def _cleanup_completed_tasks(self) -> int:
        """Clean up old completed tasks"""
        try:
            # Clean up database tasks (older ones)
            db_cleaned_count = self.task_service.cleanup_completed_tasks(max_age_hours=24)
            return db_cleaned_count
        except Exception as e:
            logger.error(f"❌ Failed to cleanup completed tasks: {e}")
            return 0

    def add_background_task(self, coro):
        """Add a custom background task"""
        if self.running:
            task = asyncio.create_task(coro)
            self._background_tasks.add(task)

            # Clean up completed tasks
            task.add_done_callback(self._background_tasks.discard)

            return task
        return None

    def schedule_periodic_task(self, coro_func: Callable, interval_seconds: int):
        """Schedule a periodic task"""
        async def periodic_wrapper():
            while self.running:
                try:
                    await coro_func()
                    await asyncio.sleep(interval_seconds)
                except Exception as e:
                    logger.error(f"❌ Periodic task error: {e}")
                    await asyncio.sleep(interval_seconds)

        return self.add_background_task(periodic_wrapper())

    def get_status(self) -> dict:
        """Get background service status"""
        return {
            "running": self.running,
            "active_tasks": len([t for t in self._background_tasks if not t.done()]),
            "total_tasks": len(self._background_tasks),
            "executor_threads": self.executor._threads if hasattr(self.executor, '_threads') else 0
        }
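As a sketch of the scheduling hook above, a caller could attach its own periodic job once the service is running; heartbeat_sync, the 120-second interval, and the background_service variable are made-up examples, not names from this commit.

# Assumes background_service has been initialized and started elsewhere.
async def heartbeat_sync():
    # hypothetical maintenance job body
    print("heartbeat sync tick")

background_service.schedule_periodic_task(heartbeat_sync, interval_seconds=120)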
173  backend/app/services/performance_service.py  Normal file
@@ -0,0 +1,173 @@
"""
Performance Monitoring and Optimization Service

Handles performance metrics, load balancing, and system optimization.
"""

import time
import logging
from typing import Dict, List, Optional
from prometheus_client import Counter, Histogram, Gauge

logger = logging.getLogger(__name__)

# Performance Metrics
TASK_COUNTER = Counter('hive_tasks_total', 'Total tasks processed', ['task_type', 'agent'])
TASK_DURATION = Histogram('hive_task_duration_seconds', 'Task execution time', ['task_type', 'agent'])
ACTIVE_TASKS = Gauge('hive_active_tasks', 'Currently active tasks', ['agent'])
AGENT_UTILIZATION = Gauge('hive_agent_utilization', 'Agent utilization percentage', ['agent'])


class AdaptiveLoadBalancer:
    """Adaptive load balancer for optimal agent selection"""

    def __init__(self):
        self.weights: Dict[str, float] = {}
        self.performance_history: Dict[str, List[float]] = {}
        self.max_history = 100  # Keep last 100 performance measurements

    def update_weight(self, agent_id: str, performance_metric: float):
        """Update agent weight based on performance (lower is better)"""
        # Inverse relationship: better performance = lower weight
        self.weights[agent_id] = performance_metric

        # Update performance history
        if agent_id not in self.performance_history:
            self.performance_history[agent_id] = []

        self.performance_history[agent_id].append(performance_metric)

        # Keep only recent history
        if len(self.performance_history[agent_id]) > self.max_history:
            self.performance_history[agent_id] = self.performance_history[agent_id][-self.max_history:]

    def get_weight(self, agent_id: str) -> float:
        """Get agent weight (lower = more preferred)"""
        return self.weights.get(agent_id, 1.0)

    def get_average_performance(self, agent_id: str) -> float:
        """Get average performance for an agent"""
        history = self.performance_history.get(agent_id, [])
        if not history:
            return 1.0
        return sum(history) / len(history)

    def get_performance_stats(self) -> Dict[str, Dict[str, float]]:
        """Get performance statistics for all agents"""
        stats = {}
        for agent_id in self.weights:
            history = self.performance_history.get(agent_id, [])
            if history:
                stats[agent_id] = {
                    "current_weight": self.weights[agent_id],
                    "average_time": sum(history) / len(history),
                    "min_time": min(history),
                    "max_time": max(history),
                    "sample_count": len(history)
                }
        return stats


class PerformanceService:
    """Service for performance monitoring and optimization"""

    def __init__(self):
        self.load_balancer = AdaptiveLoadBalancer()
        self._initialized = False

    def initialize(self):
        """Initialize the performance service"""
        if self._initialized:
            return

        self._initialized = True
        logger.info("✅ Performance Service initialized successfully")

    def record_task_start(self, agent_id: str):
        """Record task start for metrics"""
        ACTIVE_TASKS.labels(agent=agent_id).inc()

    def record_task_completion(self, agent_id: str, task_type: str, execution_time: float):
        """Record task completion metrics"""
        TASK_COUNTER.labels(task_type=task_type, agent=agent_id).inc()
        TASK_DURATION.labels(task_type=task_type, agent=agent_id).observe(execution_time)
        ACTIVE_TASKS.labels(agent=agent_id).dec()

        # Update load balancer
        self.load_balancer.update_weight(agent_id, execution_time)

    def record_task_failure(self, agent_id: str):
        """Record task failure for metrics"""
        ACTIVE_TASKS.labels(agent=agent_id).dec()

    def update_agent_utilization(self, agent_id: str, current_tasks: int, max_concurrent: int):
        """Update agent utilization metrics"""
        utilization = current_tasks / max_concurrent if max_concurrent > 0 else 0
        AGENT_UTILIZATION.labels(agent=agent_id).set(utilization)

    def get_load_balancer(self) -> AdaptiveLoadBalancer:
        """Get the load balancer instance"""
        return self.load_balancer

    async def optimization_cycle(self, agents: Dict):
        """Single cycle of performance optimization"""
        try:
            # Update utilization metrics for all agents
            for agent in agents.values():
                utilization = agent.current_tasks / agent.max_concurrent if agent.max_concurrent > 0 else 0
                AGENT_UTILIZATION.labels(agent=agent.id).set(utilization)

            # Additional optimization logic could go here
            # - Dynamic scaling recommendations
            # - Agent rebalancing suggestions
            # - Performance alerts

        except Exception as e:
            logger.error(f"❌ Performance optimization cycle error: {e}")

    def get_performance_metrics(self) -> Dict:
        """Get current performance metrics"""
        return {
            "load_balancer_stats": self.load_balancer.get_performance_stats(),
            "prometheus_available": True
        }

    async def get_prometheus_metrics(self):
        """Get Prometheus metrics"""
        from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
        return generate_latest()

    def generate_performance_report(self, agents: Dict, tasks: Dict) -> Dict:
        """Generate comprehensive performance report"""
        from .workflow_service import TaskStatus

        # Agent performance
        agent_stats = {}
        for agent_id, agent in agents.items():
            agent_stats[agent_id] = {
                "current_tasks": agent.current_tasks,
                "max_concurrent": agent.max_concurrent,
                "utilization": agent.current_tasks / agent.max_concurrent if agent.max_concurrent > 0 else 0,
                "average_performance": self.load_balancer.get_average_performance(agent_id),
                "weight": self.load_balancer.get_weight(agent_id)
            }

        # Task statistics
        total_tasks = len(tasks)
        completed_tasks = len([t for t in tasks.values() if t.status == TaskStatus.COMPLETED])
        failed_tasks = len([t for t in tasks.values() if t.status == TaskStatus.FAILED])
        active_tasks = len([t for t in tasks.values() if t.status == TaskStatus.IN_PROGRESS])

        return {
            "timestamp": time.time(),
            "task_statistics": {
                "total": total_tasks,
                "completed": completed_tasks,
                "failed": failed_tasks,
                "active": active_tasks,
                "success_rate": completed_tasks / total_tasks if total_tasks > 0 else 0
            },
            "agent_performance": agent_stats,
            "active_agents": len([a for a in agents.values() if a.current_tasks > 0]),
            "load_balancer": self.load_balancer.get_performance_stats()
        }
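A minimal sketch of the metrics flow above; the agent id and task type are illustrative, and timing the task is left to the caller.

import time
from app.services.performance_service import PerformanceService

perf = PerformanceService()
perf.initialize()

start = time.time()
perf.record_task_start("example-agent")        # illustrative agent id
# ... run the task ...
perf.record_task_completion("example-agent", "kernel_dev", time.time() - start)

# Completion also feeds the load balancer, so faster agents get lower weights
# and are preferred when a load balancer is passed to AgentService.get_optimal_agent().
print(perf.get_load_balancer().get_weight("example-agent"))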
263  backend/app/services/workflow_service.py  Normal file
@@ -0,0 +1,263 @@
"""
Workflow Management Service

Handles workflow parsing, scheduling, dependency tracking, and execution management.
"""

import time
import logging
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
from enum import Enum

# Import shared types
from .agent_service import AgentType

logger = logging.getLogger(__name__)


class TaskStatus(Enum):
    """Task status tracking"""
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"


@dataclass
class Task:
    """Unified task representation"""
    id: str
    type: AgentType
    priority: int = 3
    status: TaskStatus = TaskStatus.PENDING
    context: Dict[str, Any] = field(default_factory=dict)
    payload: Dict[str, Any] = field(default_factory=dict)
    assigned_agent: Optional[str] = None
    result: Optional[Dict] = None
    created_at: float = field(default_factory=time.time)
    completed_at: Optional[float] = None

    # Workflow support
    workflow_id: Optional[str] = None
    dependencies: List[str] = field(default_factory=list)

    def cache_key(self) -> str:
        """Generate cache key for task result"""
        import hashlib
        import json
        payload_hash = hashlib.md5(json.dumps(self.payload, sort_keys=True).encode()).hexdigest()
        return f"task_result:{self.type.value}:{payload_hash}"


@dataclass
class WorkflowExecution:
    """Represents a workflow execution instance"""
    workflow_id: str
    execution_id: str
    tasks: List[Task]
    created_at: float
    completed_at: Optional[float] = None
    status: str = "running"
    metadata: Dict[str, Any] = None

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}


class WorkflowService:
    """Service for managing workflows and their execution"""

    def __init__(self):
        self.workflow_tasks: Dict[str, List[Task]] = {}
        self.workflow_executions: Dict[str, WorkflowExecution] = {}
        self._initialized = False

    def initialize(self):
        """Initialize the workflow service"""
        if self._initialized:
            return

        self._initialized = True
        logger.info("✅ Workflow Service initialized successfully")

    async def submit_workflow(self, workflow: Dict[str, Any]) -> str:
        """Submit a workflow for execution"""
        workflow_id = f"workflow_{int(time.time())}"
        execution_id = f"exec_{workflow_id}"

        tasks = self._parse_workflow_to_tasks(workflow, workflow_id)

        # Create workflow execution record
        execution = WorkflowExecution(
            workflow_id=workflow_id,
            execution_id=execution_id,
            tasks=tasks,
            created_at=time.time(),
            metadata=workflow.get('metadata', {})
        )

        self.workflow_tasks[workflow_id] = tasks
        self.workflow_executions[execution_id] = execution

        logger.info(f"🔄 Submitted workflow: {workflow_id} with {len(tasks)} tasks")
        return workflow_id

    def _parse_workflow_to_tasks(self, workflow: Dict[str, Any], workflow_id: str) -> List[Task]:
        """Parse workflow definition into tasks"""
        tasks = []
        base_tasks = workflow.get('tasks', [])

        for i, task_def in enumerate(base_tasks):
            task_id = f"{workflow_id}_task_{i}"
            task_type = AgentType(task_def.get('type', 'general_ai'))

            task = Task(
                id=task_id,
                type=task_type,
                workflow_id=workflow_id,
                context=task_def.get('context', {}),
                payload=task_def.get('payload', {}),
                dependencies=task_def.get('dependencies', []),
                priority=task_def.get('priority', 3)
            )
            tasks.append(task)

        return tasks

    def get_ready_workflow_tasks(self, all_tasks: Dict[str, Task]) -> List[Task]:
        """Get workflow tasks that are ready to execute (dependencies satisfied)"""
        ready_tasks = []

        for workflow_id, workflow_tasks in self.workflow_tasks.items():
            for task in workflow_tasks:
                if (task.status == TaskStatus.PENDING and
                        self._dependencies_satisfied(task, all_tasks)):
                    ready_tasks.append(task)

        return ready_tasks

    def _dependencies_satisfied(self, task: Task, all_tasks: Dict[str, Task]) -> bool:
        """Check if task dependencies are satisfied"""
        for dep_id in task.dependencies:
            dep_task = all_tasks.get(dep_id)
            if not dep_task or dep_task.status != TaskStatus.COMPLETED:
                return False
        return True

    def handle_task_completion(self, task: Task):
        """Handle completion of a workflow task"""
        if not task.workflow_id:
            return

        # Check if workflow is complete
        workflow_tasks = self.workflow_tasks.get(task.workflow_id, [])
        completed_tasks = [t for t in workflow_tasks if t.status == TaskStatus.COMPLETED]
        failed_tasks = [t for t in workflow_tasks if t.status == TaskStatus.FAILED]

        # Update workflow execution status
        for execution in self.workflow_executions.values():
            if execution.workflow_id == task.workflow_id:
                if len(failed_tasks) > 0:
                    execution.status = "failed"
                    execution.completed_at = time.time()
                    logger.info(f"❌ Workflow {task.workflow_id} failed")
                elif len(completed_tasks) == len(workflow_tasks):
                    execution.status = "completed"
                    execution.completed_at = time.time()
                    logger.info(f"🎉 Workflow {task.workflow_id} completed")
                break

    def get_workflow_status(self, workflow_id: str) -> Dict[str, Any]:
        """Get workflow execution status"""
        workflow_tasks = self.workflow_tasks.get(workflow_id, [])

        if not workflow_tasks:
            return {"error": "Workflow not found"}

        status_counts = {}
        for status in TaskStatus:
            status_counts[status.value] = len([t for t in workflow_tasks if t.status == status])

        # Find execution record
        execution = None
        for exec_record in self.workflow_executions.values():
            if exec_record.workflow_id == workflow_id:
                execution = exec_record
                break

        return {
            "workflow_id": workflow_id,
            "execution_id": execution.execution_id if execution else None,
            "total_tasks": len(workflow_tasks),
            "status_breakdown": status_counts,
            "completed": status_counts.get("completed", 0) == len(workflow_tasks),
            "status": execution.status if execution else "unknown",
            "created_at": execution.created_at if execution else None,
            "completed_at": execution.completed_at if execution else None
        }

    def get_workflow_tasks(self, workflow_id: str) -> List[Task]:
        """Get all tasks for a workflow"""
        return self.workflow_tasks.get(workflow_id, [])

    def get_all_workflows(self) -> Dict[str, List[Task]]:
        """Get all workflows"""
        return self.workflow_tasks.copy()

    def get_workflow_executions(self, workflow_id: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get workflow execution history"""
        executions = []

        for execution in self.workflow_executions.values():
            if workflow_id is None or execution.workflow_id == workflow_id:
                executions.append({
                    "workflow_id": execution.workflow_id,
                    "execution_id": execution.execution_id,
                    "status": execution.status,
                    "task_count": len(execution.tasks),
                    "created_at": execution.created_at,
                    "completed_at": execution.completed_at,
                    "metadata": execution.metadata
                })

        # Sort by creation time, newest first
        executions.sort(key=lambda x: x["created_at"], reverse=True)
        return executions

    def cleanup_completed_workflows(self, max_age_hours: int = 24):
        """Clean up old completed workflow executions"""
        cutoff_time = time.time() - (max_age_hours * 3600)

        # Find completed executions older than cutoff
        to_remove = []
        for execution_id, execution in self.workflow_executions.items():
            if (execution.status in ["completed", "failed"] and
                    execution.completed_at and
                    execution.completed_at < cutoff_time):
                to_remove.append(execution_id)

        # Remove old executions and their associated workflow tasks
        removed_count = 0
        for execution_id in to_remove:
            execution = self.workflow_executions[execution_id]
            workflow_id = execution.workflow_id

            # Remove workflow tasks if this is the only execution for this workflow
            other_executions = [
                e for e in self.workflow_executions.values()
                if e.workflow_id == workflow_id and e.execution_id != execution_id
            ]

            if not other_executions:
                self.workflow_tasks.pop(workflow_id, None)

            # Remove execution record
            del self.workflow_executions[execution_id]
            removed_count += 1

        if removed_count > 0:
            logger.info(f"🧹 Cleaned up {removed_count} old workflow executions")

        return removed_count
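The sketch below shows the workflow dict shape that submit_workflow() and _parse_workflow_to_tasks() accept, with illustrative names and payloads; the "type" values must be valid AgentType strings.

import asyncio
from app.services.workflow_service import WorkflowService

service = WorkflowService()
service.initialize()

# Illustrative workflow definition; payload contents are made up.
workflow = {
    "metadata": {"name": "example"},
    "tasks": [
        {"type": "code_generation", "priority": 2,
         "payload": {"prompt": "write a helper"}},
        {"type": "testing", "priority": 3,
         "payload": {"target": "helper"}},
    ],
}

workflow_id = asyncio.run(service.submit_workflow(workflow))
print(service.get_workflow_status(workflow_id))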