Refactor UnifiedCoordinator to follow Single Responsibility Principle
- Create dedicated service classes for separated concerns:
  * AgentService: Agent management and health monitoring
  * WorkflowService: Workflow parsing and execution tracking
  * PerformanceService: Metrics and load balancing
  * BackgroundService: Background processes and cleanup
  * TaskService: Database persistence (already existed)
- Refactor UnifiedCoordinator into UnifiedCoordinatorRefactored
  * Clean separation of responsibilities
  * Improved maintainability and testability
  * Dependency injection pattern for services
  * Clear service boundaries and interfaces
- Maintain backward compatibility through re-exports
- Update main.py to use refactored coordinator

🚀 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
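To illustrate the dependency-injection pattern described above, the sketch below shows one way the services could be composed. UnifiedCoordinatorRefactored itself is not part of this diff, so the class and method names here (other than the five services) are hypothetical, not the committed implementation.

# Illustrative wiring only; coordinator name and constructor details are assumptions.
from app.services.agent_service import AgentService
from app.services.background_service import BackgroundService
from app.services.performance_service import PerformanceService
from app.services.workflow_service import WorkflowService


class CoordinatorSketch:
    """Hypothetical coordinator composing the extracted services."""

    def __init__(self, task_service):
        # Each concern lives in its own service; the coordinator only wires them.
        self.agent_service = AgentService()
        self.workflow_service = WorkflowService()
        self.performance_service = PerformanceService()
        self.background_service = BackgroundService()
        self.task_service = task_service  # pre-existing persistence service

    async def start(self):
        await self.agent_service.initialize()
        self.workflow_service.initialize()
        self.performance_service.initialize()
        self.background_service.initialize(
            self.agent_service,
            self.task_service,
            self.workflow_service,
            self.performance_service,
        )
        await self.background_service.start()

    async def stop(self):
        await self.background_service.shutdown()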
300  backend/app/services/agent_service.py  Normal file
@@ -0,0 +1,300 @@
"""
Agent Management Service

Handles agent registration, health monitoring, and connectivity management.
"""

import asyncio
import aiohttp
import time
import logging
from typing import Dict, List, Optional, Set, Any
from dataclasses import dataclass, field
from sqlalchemy.orm import Session
from enum import Enum

from ..models.agent import Agent as ORMAgent
from ..core.database import SessionLocal
from ..cli_agents.cli_agent_manager import get_cli_agent_manager

logger = logging.getLogger(__name__)


class AgentType(Enum):
    """Unified agent types supporting both original and distributed workflows"""
    # Original agent types
    KERNEL_DEV = "kernel_dev"
    PYTORCH_DEV = "pytorch_dev"
    PROFILER = "profiler"
    DOCS_WRITER = "docs_writer"
    TESTER = "tester"
    CLI_GEMINI = "cli_gemini"
    GENERAL_AI = "general_ai"
    REASONING = "reasoning"

    # Distributed workflow types
    CODE_GENERATION = "code_generation"
    CODE_REVIEW = "code_review"
    TESTING = "testing"
    COMPILATION = "compilation"
    OPTIMIZATION = "optimization"
    DOCUMENTATION = "documentation"
    DEPLOYMENT = "deployment"


@dataclass
class Agent:
    """Unified agent representation supporting both Ollama and CLI agents"""
    id: str
    endpoint: str
    model: str
    specialty: AgentType
    max_concurrent: int = 2
    current_tasks: int = 0
    agent_type: str = "ollama"  # "ollama" or "cli"
    cli_config: Optional[Dict[str, Any]] = None

    # Enhanced fields for distributed workflows
    gpu_type: str = "unknown"
    capabilities: Set[str] = field(default_factory=set)
    performance_history: List[float] = field(default_factory=list)
    specializations: List[AgentType] = field(default_factory=list)
    last_heartbeat: float = field(default_factory=time.time)

    def __post_init__(self):
        if self.specializations:
            self.capabilities.update([spec.value for spec in self.specializations])


class AgentService:
    """Service for managing agents in the Hive cluster"""

    def __init__(self):
        self.agents: Dict[str, Agent] = {}
        self.cli_agent_manager = None
        self._initialized = False

    async def initialize(self):
        """Initialize the agent service"""
        if self._initialized:
            return

        try:
            # Initialize CLI agent manager
            self.cli_agent_manager = get_cli_agent_manager()

            # Load agents from database
            await self._load_database_agents()

            # Initialize predefined cluster agents
            self._initialize_cluster_agents()

            # Test initial connectivity
            await self._test_initial_connectivity()

            self._initialized = True
            logger.info("✅ Agent Service initialized successfully")

        except Exception as e:
            logger.error(f"❌ Failed to initialize agent service: {e}")
            raise

    def add_agent(self, agent: Agent):
        """Add an agent to the service"""
        self.agents[agent.id] = agent
        logger.info(f"✅ Added agent: {agent.id} ({agent.specialty.value})")

    def get_agent(self, agent_id: str) -> Optional[Agent]:
        """Get agent by ID"""
        return self.agents.get(agent_id)

    def get_all_agents(self) -> Dict[str, Agent]:
        """Get all agents"""
        return self.agents.copy()

    def get_agents_by_specialty(self, specialty: AgentType) -> List[Agent]:
        """Get agents by specialty"""
        return [
            agent for agent in self.agents.values()
            if agent.specialty == specialty or specialty in agent.specializations
        ]

    def get_available_agents(self, specialty: Optional[AgentType] = None) -> List[Agent]:
        """Get available agents, optionally filtered by specialty"""
        available = [
            agent for agent in self.agents.values()
            if agent.current_tasks < agent.max_concurrent
        ]

        if specialty:
            available = [
                agent for agent in available
                if agent.specialty == specialty or specialty in agent.specializations
            ]

        return available

    def get_optimal_agent(self, specialty: AgentType, load_balancer=None) -> Optional[Agent]:
        """Get the optimal agent for a task type"""
        available_agents = [
            agent for agent in self.agents.values()
            if (agent.specialty == specialty or specialty in agent.specializations)
            and agent.current_tasks < agent.max_concurrent
        ]

        if not available_agents:
            # Fallback to general AI agents
            available_agents = [
                agent for agent in self.agents.values()
                if agent.specialty == AgentType.GENERAL_AI
                and agent.current_tasks < agent.max_concurrent
            ]

        if available_agents:
            if load_balancer:
                return min(available_agents, key=lambda a: load_balancer.get_weight(a.id))
            else:
                # Simple round-robin based on current tasks
                return min(available_agents, key=lambda a: a.current_tasks)

        return None

    def increment_agent_tasks(self, agent_id: str):
        """Increment current task count for an agent"""
        if agent_id in self.agents:
            self.agents[agent_id].current_tasks += 1

    def decrement_agent_tasks(self, agent_id: str):
        """Decrement current task count for an agent"""
        if agent_id in self.agents:
            self.agents[agent_id].current_tasks = max(0, self.agents[agent_id].current_tasks - 1)

    def update_agent_heartbeat(self, agent_id: str):
        """Update agent heartbeat timestamp"""
        if agent_id in self.agents:
            self.agents[agent_id].last_heartbeat = time.time()

    async def _load_database_agents(self):
        """Load agents from database"""
        try:
            db = SessionLocal()
            orm_agents = db.query(ORMAgent).all()

            for orm_agent in orm_agents:
                specialty = AgentType(orm_agent.specialty) if orm_agent.specialty else AgentType.GENERAL_AI
                agent = Agent(
                    id=orm_agent.id,
                    endpoint=orm_agent.endpoint,
                    model=orm_agent.model or "unknown",
                    specialty=specialty,
                    max_concurrent=orm_agent.max_concurrent,
                    current_tasks=orm_agent.current_tasks,
                    agent_type=orm_agent.agent_type,
                    cli_config=orm_agent.cli_config
                )
                self.add_agent(agent)

            db.close()
            logger.info(f"📊 Loaded {len(orm_agents)} agents from database")

        except Exception as e:
            logger.error(f"❌ Failed to load agents from database: {e}")

    def _initialize_cluster_agents(self):
        """Initialize predefined cluster agents"""
        cluster_agents = [
            Agent(
                id="walnut-codellama",
                endpoint="http://walnut.local:11434",
                model="codellama:34b",
                specialty=AgentType.KERNEL_DEV
            ),
            Agent(
                id="oak-gemma",
                endpoint="http://oak.local:11434",
                model="gemma2:27b",
                specialty=AgentType.PYTORCH_DEV
            ),
            Agent(
                id="ironwood-llama",
                endpoint="http://ironwood.local:11434",
                model="llama3.1:70b",
                specialty=AgentType.GENERAL_AI
            )
        ]

        for agent in cluster_agents:
            if agent.id not in self.agents:
                self.add_agent(agent)

    async def _test_initial_connectivity(self):
        """Test connectivity to all agents"""
        logger.info("🔍 Testing agent connectivity...")

        for agent in self.agents.values():
            try:
                if agent.agent_type == "cli":
                    # Test CLI agent
                    if self.cli_agent_manager:
                        await self.cli_agent_manager.test_agent(agent.id)
                else:
                    # Test Ollama agent
                    async with aiohttp.ClientSession() as session:
                        async with session.get(
                            f"{agent.endpoint}/api/tags",
                            timeout=aiohttp.ClientTimeout(total=5)
                        ) as response:
                            if response.status == 200:
                                logger.info(f"✅ Agent {agent.id} is responsive")
                            else:
                                logger.warning(f"⚠️ Agent {agent.id} returned HTTP {response.status}")
            except Exception as e:
                logger.warning(f"⚠️ Agent {agent.id} is not responsive: {e}")

    async def check_agent_health(self, agent: Agent) -> bool:
        """Check individual agent health"""
        try:
            if agent.agent_type == "cli":
                # CLI agent health check
                if self.cli_agent_manager:
                    return await self.cli_agent_manager.test_agent(agent.id)
                return False
            else:
                # Ollama agent health check
                async with aiohttp.ClientSession() as session:
                    async with session.get(
                        f"{agent.endpoint}/api/tags",
                        timeout=aiohttp.ClientTimeout(total=10)
                    ) as response:
                        return response.status == 200

        except Exception as e:
            logger.warning(f"⚠️ Agent {agent.id} health check error: {e}")
            return False

    async def health_monitor_cycle(self):
        """Single cycle of health monitoring for all agents"""
        try:
            for agent in self.agents.values():
                is_healthy = await self.check_agent_health(agent)
                if is_healthy:
                    agent.last_heartbeat = time.time()
                else:
                    logger.warning(f"⚠️ Agent {agent.id} health check failed")
        except Exception as e:
            logger.error(f"❌ Health monitor cycle error: {e}")

    def get_agent_status(self) -> Dict[str, Dict]:
        """Get status of all agents"""
        agent_status = {}
        for agent_id, agent in self.agents.items():
            agent_status[agent_id] = {
                "type": agent.agent_type,
                "model": agent.model,
                "specialty": agent.specialty.value,
                "current_tasks": agent.current_tasks,
                "max_concurrent": agent.max_concurrent,
                "last_heartbeat": agent.last_heartbeat,
                "utilization": agent.current_tasks / agent.max_concurrent if agent.max_concurrent > 0 else 0
            }
        return agent_status
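The snippet below is a minimal usage sketch for AgentService, assuming the backend package is importable; the agent id, endpoint, and model are illustrative, not real cluster nodes.

from app.services.agent_service import Agent, AgentService, AgentType

service = AgentService()
# Register an agent directly; initialize() would also load agents from the
# database and probe connectivity, which needs a live cluster.
service.add_agent(Agent(
    id="example-agent",                      # illustrative id
    endpoint="http://example.local:11434",   # illustrative Ollama endpoint
    model="codellama:34b",
    specialty=AgentType.KERNEL_DEV,
))

agent = service.get_optimal_agent(AgentType.KERNEL_DEV)
if agent:
    service.increment_agent_tasks(agent.id)
    # ... dispatch work against agent.endpoint ...
    service.decrement_agent_tasks(agent.id)

print(service.get_agent_status())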
163  backend/app/services/background_service.py  Normal file
@@ -0,0 +1,163 @@
"""
Background Processing Service

Handles background tasks, cleanup, monitoring, and maintenance operations.
"""

import asyncio
import logging
from typing import Set, Optional, Callable
from concurrent.futures import ThreadPoolExecutor

logger = logging.getLogger(__name__)


class BackgroundService:
    """Service for managing background tasks and processes"""

    def __init__(self):
        self.running = False
        self.executor = ThreadPoolExecutor(max_workers=4)
        self._background_tasks: Set[asyncio.Task] = set()
        self._initialized = False

        # Service references (injected)
        self.agent_service = None
        self.task_service = None
        self.workflow_service = None
        self.performance_service = None

    def initialize(self, agent_service, task_service, workflow_service, performance_service):
        """Initialize the background service with dependencies"""
        if self._initialized:
            return

        self.agent_service = agent_service
        self.task_service = task_service
        self.workflow_service = workflow_service
        self.performance_service = performance_service

        self._initialized = True
        logger.info("✅ Background Service initialized successfully")

    async def start(self):
        """Start background processes"""
        if not self._initialized:
            raise Exception("Background service not initialized")

        self.running = True

        # Start background tasks
        self._background_tasks.add(asyncio.create_task(self._health_monitor()))
        self._background_tasks.add(asyncio.create_task(self._performance_optimizer()))
        self._background_tasks.add(asyncio.create_task(self._cleanup_manager()))

        logger.info("🚀 Background Service processes started")

    async def shutdown(self):
        """Shutdown background processes"""
        logger.info("🛑 Shutting down Background Service...")

        self.running = False

        # Cancel background tasks
        for task in self._background_tasks:
            task.cancel()

        # Wait for tasks to complete
        if self._background_tasks:
            await asyncio.gather(*self._background_tasks, return_exceptions=True)

        # Shutdown executor
        self.executor.shutdown(wait=True)

        logger.info("✅ Background Service shutdown complete")

    async def _health_monitor(self):
        """Background health monitoring"""
        while self.running:
            try:
                if self.agent_service:
                    await self.agent_service.health_monitor_cycle()
                await asyncio.sleep(30)  # Check every 30 seconds
            except Exception as e:
                logger.error(f"❌ Health monitor error: {e}")
                await asyncio.sleep(60)

    async def _performance_optimizer(self):
        """Background performance optimization"""
        while self.running:
            try:
                if self.performance_service and self.agent_service:
                    await self.performance_service.optimization_cycle(
                        self.agent_service.get_all_agents()
                    )
                await asyncio.sleep(300)  # Optimize every 5 minutes
            except Exception as e:
                logger.error(f"❌ Performance optimizer error: {e}")
                await asyncio.sleep(600)

    async def _cleanup_manager(self):
        """Background cleanup management"""
        while self.running:
            try:
                # Cleanup completed tasks
                if self.task_service:
                    cleaned_count = await self._cleanup_completed_tasks()
                    if cleaned_count > 0:
                        logger.info(f"🧹 Cleaned up {cleaned_count} old tasks")

                # Cleanup workflows
                if self.workflow_service:
                    workflow_cleaned = self.workflow_service.cleanup_completed_workflows(max_age_hours=24)
                    if workflow_cleaned > 0:
                        logger.info(f"🧹 Cleaned up {workflow_cleaned} old workflows")

                await asyncio.sleep(3600)  # Cleanup every hour
            except Exception as e:
                logger.error(f"❌ Cleanup manager error: {e}")
                await asyncio.sleep(1800)  # Retry in 30 minutes

    async def _cleanup_completed_tasks(self) -> int:
        """Clean up old completed tasks"""
        try:
            # Clean up database tasks (older ones)
            db_cleaned_count = self.task_service.cleanup_completed_tasks(max_age_hours=24)
            return db_cleaned_count
        except Exception as e:
            logger.error(f"❌ Failed to cleanup completed tasks: {e}")
            return 0

    def add_background_task(self, coro):
        """Add a custom background task"""
        if self.running:
            task = asyncio.create_task(coro)
            self._background_tasks.add(task)

            # Clean up completed tasks
            task.add_done_callback(self._background_tasks.discard)

            return task
        return None

    def schedule_periodic_task(self, coro_func: Callable, interval_seconds: int):
        """Schedule a periodic task"""
        async def periodic_wrapper():
            while self.running:
                try:
                    await coro_func()
                    await asyncio.sleep(interval_seconds)
                except Exception as e:
                    logger.error(f"❌ Periodic task error: {e}")
                    await asyncio.sleep(interval_seconds)

        return self.add_background_task(periodic_wrapper())

    def get_status(self) -> dict:
        """Get background service status"""
        return {
            "running": self.running,
            "active_tasks": len([t for t in self._background_tasks if not t.done()]),
            "total_tasks": len(self._background_tasks),
            "executor_threads": self.executor._threads if hasattr(self.executor, '_threads') else 0
        }
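As a sketch of the scheduling hook above, a caller could attach its own periodic job once the service is running; heartbeat_sync, the 120-second interval, and the background_service variable are made-up examples, not names from this commit.

# Assumes background_service has been initialized and started elsewhere.
async def heartbeat_sync():
    # hypothetical maintenance job body
    print("heartbeat sync tick")

background_service.schedule_periodic_task(heartbeat_sync, interval_seconds=120)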
173  backend/app/services/performance_service.py  Normal file
@@ -0,0 +1,173 @@
"""
Performance Monitoring and Optimization Service

Handles performance metrics, load balancing, and system optimization.
"""

import time
import logging
from typing import Dict, List, Optional
from prometheus_client import Counter, Histogram, Gauge

logger = logging.getLogger(__name__)

# Performance Metrics
TASK_COUNTER = Counter('hive_tasks_total', 'Total tasks processed', ['task_type', 'agent'])
TASK_DURATION = Histogram('hive_task_duration_seconds', 'Task execution time', ['task_type', 'agent'])
ACTIVE_TASKS = Gauge('hive_active_tasks', 'Currently active tasks', ['agent'])
AGENT_UTILIZATION = Gauge('hive_agent_utilization', 'Agent utilization percentage', ['agent'])


class AdaptiveLoadBalancer:
    """Adaptive load balancer for optimal agent selection"""

    def __init__(self):
        self.weights: Dict[str, float] = {}
        self.performance_history: Dict[str, List[float]] = {}
        self.max_history = 100  # Keep last 100 performance measurements

    def update_weight(self, agent_id: str, performance_metric: float):
        """Update agent weight based on performance (lower is better)"""
        # Inverse relationship: better performance = lower weight
        self.weights[agent_id] = performance_metric

        # Update performance history
        if agent_id not in self.performance_history:
            self.performance_history[agent_id] = []

        self.performance_history[agent_id].append(performance_metric)

        # Keep only recent history
        if len(self.performance_history[agent_id]) > self.max_history:
            self.performance_history[agent_id] = self.performance_history[agent_id][-self.max_history:]

    def get_weight(self, agent_id: str) -> float:
        """Get agent weight (lower = more preferred)"""
        return self.weights.get(agent_id, 1.0)

    def get_average_performance(self, agent_id: str) -> float:
        """Get average performance for an agent"""
        history = self.performance_history.get(agent_id, [])
        if not history:
            return 1.0
        return sum(history) / len(history)

    def get_performance_stats(self) -> Dict[str, Dict[str, float]]:
        """Get performance statistics for all agents"""
        stats = {}
        for agent_id in self.weights:
            history = self.performance_history.get(agent_id, [])
            if history:
                stats[agent_id] = {
                    "current_weight": self.weights[agent_id],
                    "average_time": sum(history) / len(history),
                    "min_time": min(history),
                    "max_time": max(history),
                    "sample_count": len(history)
                }
        return stats


class PerformanceService:
    """Service for performance monitoring and optimization"""

    def __init__(self):
        self.load_balancer = AdaptiveLoadBalancer()
        self._initialized = False

    def initialize(self):
        """Initialize the performance service"""
        if self._initialized:
            return

        self._initialized = True
        logger.info("✅ Performance Service initialized successfully")

    def record_task_start(self, agent_id: str):
        """Record task start for metrics"""
        ACTIVE_TASKS.labels(agent=agent_id).inc()

    def record_task_completion(self, agent_id: str, task_type: str, execution_time: float):
        """Record task completion metrics"""
        TASK_COUNTER.labels(task_type=task_type, agent=agent_id).inc()
        TASK_DURATION.labels(task_type=task_type, agent=agent_id).observe(execution_time)
        ACTIVE_TASKS.labels(agent=agent_id).dec()

        # Update load balancer
        self.load_balancer.update_weight(agent_id, execution_time)

    def record_task_failure(self, agent_id: str):
        """Record task failure for metrics"""
        ACTIVE_TASKS.labels(agent=agent_id).dec()

    def update_agent_utilization(self, agent_id: str, current_tasks: int, max_concurrent: int):
        """Update agent utilization metrics"""
        utilization = current_tasks / max_concurrent if max_concurrent > 0 else 0
        AGENT_UTILIZATION.labels(agent=agent_id).set(utilization)

    def get_load_balancer(self) -> AdaptiveLoadBalancer:
        """Get the load balancer instance"""
        return self.load_balancer

    async def optimization_cycle(self, agents: Dict):
        """Single cycle of performance optimization"""
        try:
            # Update utilization metrics for all agents
            for agent in agents.values():
                utilization = agent.current_tasks / agent.max_concurrent if agent.max_concurrent > 0 else 0
                AGENT_UTILIZATION.labels(agent=agent.id).set(utilization)

            # Additional optimization logic could go here
            # - Dynamic scaling recommendations
            # - Agent rebalancing suggestions
            # - Performance alerts

        except Exception as e:
            logger.error(f"❌ Performance optimization cycle error: {e}")

    def get_performance_metrics(self) -> Dict:
        """Get current performance metrics"""
        return {
            "load_balancer_stats": self.load_balancer.get_performance_stats(),
            "prometheus_available": True
        }

    async def get_prometheus_metrics(self):
        """Get Prometheus metrics"""
        from prometheus_client import generate_latest, CONTENT_TYPE_LATEST
        return generate_latest()

    def generate_performance_report(self, agents: Dict, tasks: Dict) -> Dict:
        """Generate comprehensive performance report"""
        from .workflow_service import TaskStatus

        # Agent performance
        agent_stats = {}
        for agent_id, agent in agents.items():
            agent_stats[agent_id] = {
                "current_tasks": agent.current_tasks,
                "max_concurrent": agent.max_concurrent,
                "utilization": agent.current_tasks / agent.max_concurrent if agent.max_concurrent > 0 else 0,
                "average_performance": self.load_balancer.get_average_performance(agent_id),
                "weight": self.load_balancer.get_weight(agent_id)
            }

        # Task statistics
        total_tasks = len(tasks)
        completed_tasks = len([t for t in tasks.values() if t.status == TaskStatus.COMPLETED])
        failed_tasks = len([t for t in tasks.values() if t.status == TaskStatus.FAILED])
        active_tasks = len([t for t in tasks.values() if t.status == TaskStatus.IN_PROGRESS])

        return {
            "timestamp": time.time(),
            "task_statistics": {
                "total": total_tasks,
                "completed": completed_tasks,
                "failed": failed_tasks,
                "active": active_tasks,
                "success_rate": completed_tasks / total_tasks if total_tasks > 0 else 0
            },
            "agent_performance": agent_stats,
            "active_agents": len([a for a in agents.values() if a.current_tasks > 0]),
            "load_balancer": self.load_balancer.get_performance_stats()
        }
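A minimal sketch of the metrics flow above; the agent id and task type are illustrative, and timing the task is left to the caller.

import time
from app.services.performance_service import PerformanceService

perf = PerformanceService()
perf.initialize()

start = time.time()
perf.record_task_start("example-agent")        # illustrative agent id
# ... run the task ...
perf.record_task_completion("example-agent", "kernel_dev", time.time() - start)

# Completion also feeds the load balancer, so faster agents get lower weights
# and are preferred when a load balancer is passed to AgentService.get_optimal_agent().
print(perf.get_load_balancer().get_weight("example-agent"))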
263  backend/app/services/workflow_service.py  Normal file
@@ -0,0 +1,263 @@
"""
Workflow Management Service

Handles workflow parsing, scheduling, dependency tracking, and execution management.
"""

import time
import logging
from typing import Dict, List, Optional, Any
from dataclasses import dataclass, field
from enum import Enum

# Import shared types
from .agent_service import AgentType

logger = logging.getLogger(__name__)


class TaskStatus(Enum):
    """Task status tracking"""
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"


@dataclass
class Task:
    """Unified task representation"""
    id: str
    type: AgentType
    priority: int = 3
    status: TaskStatus = TaskStatus.PENDING
    context: Dict[str, Any] = field(default_factory=dict)
    payload: Dict[str, Any] = field(default_factory=dict)
    assigned_agent: Optional[str] = None
    result: Optional[Dict] = None
    created_at: float = field(default_factory=time.time)
    completed_at: Optional[float] = None

    # Workflow support
    workflow_id: Optional[str] = None
    dependencies: List[str] = field(default_factory=list)

    def cache_key(self) -> str:
        """Generate cache key for task result"""
        import hashlib
        import json
        payload_hash = hashlib.md5(json.dumps(self.payload, sort_keys=True).encode()).hexdigest()
        return f"task_result:{self.type.value}:{payload_hash}"


@dataclass
class WorkflowExecution:
    """Represents a workflow execution instance"""
    workflow_id: str
    execution_id: str
    tasks: List[Task]
    created_at: float
    completed_at: Optional[float] = None
    status: str = "running"
    metadata: Dict[str, Any] = None

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}


class WorkflowService:
    """Service for managing workflows and their execution"""

    def __init__(self):
        self.workflow_tasks: Dict[str, List[Task]] = {}
        self.workflow_executions: Dict[str, WorkflowExecution] = {}
        self._initialized = False

    def initialize(self):
        """Initialize the workflow service"""
        if self._initialized:
            return

        self._initialized = True
        logger.info("✅ Workflow Service initialized successfully")

    async def submit_workflow(self, workflow: Dict[str, Any]) -> str:
        """Submit a workflow for execution"""
        workflow_id = f"workflow_{int(time.time())}"
        execution_id = f"exec_{workflow_id}"

        tasks = self._parse_workflow_to_tasks(workflow, workflow_id)

        # Create workflow execution record
        execution = WorkflowExecution(
            workflow_id=workflow_id,
            execution_id=execution_id,
            tasks=tasks,
            created_at=time.time(),
            metadata=workflow.get('metadata', {})
        )

        self.workflow_tasks[workflow_id] = tasks
        self.workflow_executions[execution_id] = execution

        logger.info(f"🔄 Submitted workflow: {workflow_id} with {len(tasks)} tasks")
        return workflow_id

    def _parse_workflow_to_tasks(self, workflow: Dict[str, Any], workflow_id: str) -> List[Task]:
        """Parse workflow definition into tasks"""
        tasks = []
        base_tasks = workflow.get('tasks', [])

        for i, task_def in enumerate(base_tasks):
            task_id = f"{workflow_id}_task_{i}"
            task_type = AgentType(task_def.get('type', 'general_ai'))

            task = Task(
                id=task_id,
                type=task_type,
                workflow_id=workflow_id,
                context=task_def.get('context', {}),
                payload=task_def.get('payload', {}),
                dependencies=task_def.get('dependencies', []),
                priority=task_def.get('priority', 3)
            )
            tasks.append(task)

        return tasks

    def get_ready_workflow_tasks(self, all_tasks: Dict[str, Task]) -> List[Task]:
        """Get workflow tasks that are ready to execute (dependencies satisfied)"""
        ready_tasks = []

        for workflow_id, workflow_tasks in self.workflow_tasks.items():
            for task in workflow_tasks:
                if (task.status == TaskStatus.PENDING and
                        self._dependencies_satisfied(task, all_tasks)):
                    ready_tasks.append(task)

        return ready_tasks

    def _dependencies_satisfied(self, task: Task, all_tasks: Dict[str, Task]) -> bool:
        """Check if task dependencies are satisfied"""
        for dep_id in task.dependencies:
            dep_task = all_tasks.get(dep_id)
            if not dep_task or dep_task.status != TaskStatus.COMPLETED:
                return False
        return True

    def handle_task_completion(self, task: Task):
        """Handle completion of a workflow task"""
        if not task.workflow_id:
            return

        # Check if workflow is complete
        workflow_tasks = self.workflow_tasks.get(task.workflow_id, [])
        completed_tasks = [t for t in workflow_tasks if t.status == TaskStatus.COMPLETED]
        failed_tasks = [t for t in workflow_tasks if t.status == TaskStatus.FAILED]

        # Update workflow execution status
        for execution in self.workflow_executions.values():
            if execution.workflow_id == task.workflow_id:
                if len(failed_tasks) > 0:
                    execution.status = "failed"
                    execution.completed_at = time.time()
                    logger.info(f"❌ Workflow {task.workflow_id} failed")
                elif len(completed_tasks) == len(workflow_tasks):
                    execution.status = "completed"
                    execution.completed_at = time.time()
                    logger.info(f"🎉 Workflow {task.workflow_id} completed")
                break

    def get_workflow_status(self, workflow_id: str) -> Dict[str, Any]:
        """Get workflow execution status"""
        workflow_tasks = self.workflow_tasks.get(workflow_id, [])

        if not workflow_tasks:
            return {"error": "Workflow not found"}

        status_counts = {}
        for status in TaskStatus:
            status_counts[status.value] = len([t for t in workflow_tasks if t.status == status])

        # Find execution record
        execution = None
        for exec_record in self.workflow_executions.values():
            if exec_record.workflow_id == workflow_id:
                execution = exec_record
                break

        return {
            "workflow_id": workflow_id,
            "execution_id": execution.execution_id if execution else None,
            "total_tasks": len(workflow_tasks),
            "status_breakdown": status_counts,
            "completed": status_counts.get("completed", 0) == len(workflow_tasks),
            "status": execution.status if execution else "unknown",
            "created_at": execution.created_at if execution else None,
            "completed_at": execution.completed_at if execution else None
        }

    def get_workflow_tasks(self, workflow_id: str) -> List[Task]:
        """Get all tasks for a workflow"""
        return self.workflow_tasks.get(workflow_id, [])

    def get_all_workflows(self) -> Dict[str, List[Task]]:
        """Get all workflows"""
        return self.workflow_tasks.copy()

    def get_workflow_executions(self, workflow_id: Optional[str] = None) -> List[Dict[str, Any]]:
        """Get workflow execution history"""
        executions = []

        for execution in self.workflow_executions.values():
            if workflow_id is None or execution.workflow_id == workflow_id:
                executions.append({
                    "workflow_id": execution.workflow_id,
                    "execution_id": execution.execution_id,
                    "status": execution.status,
                    "task_count": len(execution.tasks),
                    "created_at": execution.created_at,
                    "completed_at": execution.completed_at,
                    "metadata": execution.metadata
                })

        # Sort by creation time, newest first
        executions.sort(key=lambda x: x["created_at"], reverse=True)
        return executions

    def cleanup_completed_workflows(self, max_age_hours: int = 24):
        """Clean up old completed workflow executions"""
        cutoff_time = time.time() - (max_age_hours * 3600)

        # Find completed executions older than cutoff
        to_remove = []
        for execution_id, execution in self.workflow_executions.items():
            if (execution.status in ["completed", "failed"] and
                    execution.completed_at and
                    execution.completed_at < cutoff_time):
                to_remove.append(execution_id)

        # Remove old executions and their associated workflow tasks
        removed_count = 0
        for execution_id in to_remove:
            execution = self.workflow_executions[execution_id]
            workflow_id = execution.workflow_id

            # Remove workflow tasks if this is the only execution for this workflow
            other_executions = [
                e for e in self.workflow_executions.values()
                if e.workflow_id == workflow_id and e.execution_id != execution_id
            ]

            if not other_executions:
                self.workflow_tasks.pop(workflow_id, None)

            # Remove execution record
            del self.workflow_executions[execution_id]
            removed_count += 1

        if removed_count > 0:
            logger.info(f"🧹 Cleaned up {removed_count} old workflow executions")

        return removed_count
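The sketch below shows the workflow dict shape that submit_workflow() and _parse_workflow_to_tasks() accept, with illustrative names and payloads; the "type" values must be valid AgentType strings.

import asyncio
from app.services.workflow_service import WorkflowService

service = WorkflowService()
service.initialize()

# Illustrative workflow definition; payload contents are made up.
workflow = {
    "metadata": {"name": "example"},
    "tasks": [
        {"type": "code_generation", "priority": 2,
         "payload": {"prompt": "write a helper"}},
        {"type": "testing", "priority": 3,
         "payload": {"target": "helper"}},
    ],
}

workflow_id = asyncio.run(service.submit_workflow(workflow))
print(service.get_workflow_status(workflow_id))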