Fix critical in-memory task storage with database persistence

Major architectural improvement to replace in-memory task storage with database-backed persistence while maintaining backward compatibility. Changes: - Created Task SQLAlchemy model matching database schema - Added Workflow and Execution SQLAlchemy models - Created TaskService for database CRUD operations - Updated UnifiedCoordinator to use database persistence - Modified task APIs to leverage database storage - Added task loading from database on coordinator initialization - Implemented status change persistence during task execution - Enhanced task cleanup with database support - Added comprehensive task statistics from database Benefits: - Tasks persist across application restarts - Better scalability and reliability - Historical task data retention - Comprehensive task filtering and querying - Maintains in-memory cache for performance 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-11 08:52:44 +10:00
parent 4de45bf450
commit 59a59f8869
9 changed files with 594 additions and 59 deletions
--- a/backend/app/core/unified_coordinator.py
+++ b/backend/app/core/unified_coordinator.py
@@ -21,6 +21,7 @@ from prometheus_client import Counter, Histogram, Gauge
 from ..models.agent import Agent as ORMAgent
 from ..core.database import SessionLocal
 from ..cli_agents.cli_agent_manager import get_cli_agent_manager
+from ..services.task_service import TaskService

 logger = logging.getLogger(__name__)

@@ -120,10 +121,13 @@ class UnifiedCoordinator:
    def __init__(self, redis_url: str = "redis://localhost:6379"):
        # Core state
        self.agents: Dict[str, Agent] = {}
-        self.tasks: Dict[str, Task] = {}
+        self.tasks: Dict[str, Task] = {}  # In-memory cache for active tasks
        self.task_queue: List[Task] = []
        self.is_initialized = False
        
+        # Database persistence
+        self.task_service = TaskService()
+        
        # CLI agent support
        self.cli_agent_manager = None
        
@@ -163,6 +167,9 @@ class UnifiedCoordinator:
            # Load agents from database
            await self._load_database_agents()
            
+            # Load existing tasks from database
+            await self._load_database_tasks()
+            
            # Initialize cluster agents
            self._initialize_cluster_agents()
            
@@ -249,6 +256,31 @@ class UnifiedCoordinator:
        except Exception as e:
            logger.error(f"❌ Failed to load agents from database: {e}")

+    async def _load_database_tasks(self):
+        """Load pending and in-progress tasks from database"""
+        try:
+            # Load pending tasks
+            pending_orm_tasks = self.task_service.get_tasks(status='pending', limit=100)
+            for orm_task in pending_orm_tasks:
+                coordinator_task = self.task_service.coordinator_task_from_orm(orm_task)
+                self.tasks[coordinator_task.id] = coordinator_task
+                self.task_queue.append(coordinator_task)
+            
+            # Load in-progress tasks
+            in_progress_orm_tasks = self.task_service.get_tasks(status='in_progress', limit=100)
+            for orm_task in in_progress_orm_tasks:
+                coordinator_task = self.task_service.coordinator_task_from_orm(orm_task)
+                self.tasks[coordinator_task.id] = coordinator_task
+                # In-progress tasks are not added to task_queue as they're already being processed
+            
+            # Sort task queue by priority
+            self.task_queue.sort(key=lambda t: t.priority)
+            
+            logger.info(f"📊 Loaded {len(pending_orm_tasks)} pending and {len(in_progress_orm_tasks)} in-progress tasks from database")
+            
+        except Exception as e:
+            logger.error(f"❌ Failed to load tasks from database: {e}")
+
    def _initialize_cluster_agents(self):
        """Initialize predefined cluster agents"""
        # This maintains compatibility with the original HiveCoordinator
@@ -292,6 +324,14 @@ class UnifiedCoordinator:
            payload=context  # For compatibility
        )
        
+        # Persist to database
+        try:
+            self.task_service.create_task(task)
+            logger.info(f"💾 Task {task_id} persisted to database")
+        except Exception as e:
+            logger.error(f"❌ Failed to persist task {task_id} to database: {e}")
+        
+        # Add to in-memory structures
        self.tasks[task_id] = task
        self.task_queue.append(task)
        
@@ -416,6 +456,13 @@ class UnifiedCoordinator:
            task.assigned_agent = agent.id
            agent.current_tasks += 1
            
+            # Persist status change to database
+            try:
+                self.task_service.update_task(task.id, task)
+                logger.debug(f"💾 Updated task {task.id} status to IN_PROGRESS in database")
+            except Exception as e:
+                logger.error(f"❌ Failed to update task {task.id} status in database: {e}")
+            
            ACTIVE_TASKS.labels(agent=agent.id).inc()
            start_time = time.time()
            
@@ -435,6 +482,13 @@ class UnifiedCoordinator:
            task.status = TaskStatus.COMPLETED
            task.completed_at = time.time()
            
+            # Persist completion to database
+            try:
+                self.task_service.update_task(task.id, task)
+                logger.debug(f"💾 Updated task {task.id} status to COMPLETED in database")
+            except Exception as e:
+                logger.error(f"❌ Failed to update completed task {task.id} in database: {e}")
+            
            # Update agent
            agent.current_tasks -= 1
            self.load_balancer.update_weight(agent.id, execution_time)
@@ -450,6 +504,14 @@ class UnifiedCoordinator:
        except Exception as e:
            task.status = TaskStatus.FAILED
            task.result = {"error": str(e)}
+            
+            # Persist failure to database
+            try:
+                self.task_service.update_task(task.id, task)
+                logger.debug(f"💾 Updated task {task.id} status to FAILED in database")
+            except Exception as db_e:
+                logger.error(f"❌ Failed to update failed task {task.id} in database: {db_e}")
+                
            agent.current_tasks -= 1
            ACTIVE_TASKS.labels(agent=agent.id).dec()
            logger.error(f"❌ Task {task.id} failed: {e}")
@@ -622,18 +684,31 @@ Please complete this task based on the provided context and requirements.

    async def _cleanup_completed_tasks(self):
        """Clean up old completed tasks"""
-        cutoff_time = time.time() - 3600  # 1 hour ago
-        
-        completed_tasks = [
-            task_id for task_id, task in self.tasks.items()
-            if task.status == TaskStatus.COMPLETED and (task.completed_at or 0) < cutoff_time
-        ]
-        
-        for task_id in completed_tasks:
-            del self.tasks[task_id]
+        try:
+            # Clean up in-memory tasks (keep only active ones)
+            cutoff_time = time.time() - 3600  # 1 hour ago
            
-        if completed_tasks:
-            logger.info(f"🧹 Cleaned up {len(completed_tasks)} old completed tasks")
+            completed_tasks = [
+                task_id for task_id, task in self.tasks.items()
+                if task.status == TaskStatus.COMPLETED and (task.completed_at or 0) < cutoff_time
+            ]
+            
+            for task_id in completed_tasks:
+                del self.tasks[task_id]
+                
+            # Clean up database tasks (older ones)
+            try:
+                db_cleaned_count = self.task_service.cleanup_completed_tasks(max_age_hours=24)
+                if db_cleaned_count > 0:
+                    logger.info(f"🧹 Cleaned up {db_cleaned_count} old tasks from database")
+            except Exception as e:
+                logger.error(f"❌ Failed to cleanup database tasks: {e}")
+            
+            if completed_tasks:
+                logger.info(f"🧹 Cleaned up {len(completed_tasks)} old completed tasks from memory")
+                
+        except Exception as e:
+            logger.error(f"❌ Failed to cleanup completed tasks: {e}")

    # =========================================================================
    # STATUS & METRICS
@@ -641,11 +716,39 @@ Please complete this task based on the provided context and requirements.

    def get_task_status(self, task_id: str) -> Optional[Task]:
        """Get status of a specific task"""
-        return self.tasks.get(task_id)
+        # First check in-memory cache
+        task = self.tasks.get(task_id)
+        if task:
+            return task
+        
+        # If not in memory, check database
+        try:
+            orm_task = self.task_service.get_task(task_id)
+            if orm_task:
+                return self.task_service.coordinator_task_from_orm(orm_task)
+        except Exception as e:
+            logger.error(f"❌ Failed to get task {task_id} from database: {e}")
+        
+        return None

-    def get_completed_tasks(self) -> List[Task]:
+    def get_completed_tasks(self, limit: int = 50) -> List[Task]:
        """Get all completed tasks"""
-        return [task for task in self.tasks.values() if task.status == TaskStatus.COMPLETED]
+        # Get from in-memory cache first
+        memory_completed = [task for task in self.tasks.values() if task.status == TaskStatus.COMPLETED]
+        
+        # Get additional from database if needed
+        try:
+            if len(memory_completed) < limit:
+                db_completed = self.task_service.get_tasks(status='completed', limit=limit)
+                db_tasks = [self.task_service.coordinator_task_from_orm(orm_task) for orm_task in db_completed]
+                
+                # Combine and deduplicate
+                all_tasks = {task.id: task for task in memory_completed + db_tasks}
+                return list(all_tasks.values())[:limit]
+        except Exception as e:
+            logger.error(f"❌ Failed to get completed tasks from database: {e}")
+        
+        return memory_completed[:limit]

    async def get_health_status(self):
        """Get coordinator health status"""
@@ -660,13 +763,21 @@ Please complete this task based on the provided context and requirements.
                "last_heartbeat": agent.last_heartbeat
            }
        
+        # Get comprehensive task statistics from database
+        try:
+            db_stats = self.task_service.get_task_statistics()
+        except Exception as e:
+            logger.error(f"❌ Failed to get task statistics from database: {e}")
+            db_stats = {}
+        
        return {
            "status": "operational" if self.is_initialized else "initializing",
            "agents": agent_status,
            "total_agents": len(self.agents),
            "active_tasks": len([t for t in self.tasks.values() if t.status == TaskStatus.IN_PROGRESS]),
            "pending_tasks": len(self.task_queue),
-            "completed_tasks": len([t for t in self.tasks.values() if t.status == TaskStatus.COMPLETED])
+            "completed_tasks": len([t for t in self.tasks.values() if t.status == TaskStatus.COMPLETED]),
+            "database_statistics": db_stats
        }

    async def get_comprehensive_status(self):