Complete Hive platform functionality and expand cluster to 7 agents

Major Features Added:
- Fix Socket.IO connectivity by updating the Dockerfile to serve socket_app (sketched after this list)
- Fix the distributed workflows API to return arrays instead of errors
- Expand agent coverage from 3 to 7 agents (added OAK and ROSEWOOD)
- Create systemd service for the MCP server with agent auto-discovery (unit sketch below)
- Add daemon mode with periodic agent discovery every 5 minutes
- Implement comprehensive test suite with 100% pass rate
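
A minimal sketch of the Socket.IO fix above (module path and port are
assumptions, not taken from this diff): the Dockerfile CMD must target the
socketio.ASGIApp wrapper instead of the bare FastAPI app, e.g.

    # Serve the Socket.IO-wrapped ASGI app so /socket.io/ traffic is routed
    CMD ["uvicorn", "app.main:socket_app", "--host", "0.0.0.0", "--port", "8000"]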

Infrastructure Improvements:
- Enhanced database connection handling with retry logic
- Improved agent registration with persistent storage
- Added proper error handling for distributed workflows endpoint
- Created management scripts for service lifecycle operations
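
A minimal sketch of the systemd service and daemon mode from the lists above
(unit name, paths, and flag names are assumptions; the actual unit file is not
shown in this diff):

    [Unit]
    Description=Hive MCP server with periodic agent auto-discovery
    After=network-online.target
    Wants=network-online.target

    [Service]
    Type=simple
    ExecStart=/usr/bin/python3 -m hive_mcp --daemon --discovery-interval 300
    Restart=on-failure
    RestartSec=5

    [Install]
    WantedBy=multi-user.target

The 300-second interval matches the 5-minute discovery cadence noted above.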

Agent Cluster Expansion:
- ACACIA: deepseek-r1:7b (kernel_dev)
- WALNUT: starcoder2:15b (pytorch_dev)
- IRONWOOD: deepseek-coder-v2 (profiler)
- OAK: codellama:latest (docs_writer)
- OAK-TESTER: deepseek-r1:latest (tester)
- ROSEWOOD: deepseek-coder-v2:latest (kernel_dev)
- ROSEWOOD-VISION: llama3.2-vision:11b (tester)
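
For illustration, registering one of the agents above through the coordinator
API added in this commit (endpoint URL, enum member, and concurrency limit are
assumptions):

    coordinator = HiveCoordinator()
    coordinator.add_agent(Agent(
        id="oak",
        endpoint="http://oak.local:11434",   # hypothetical Ollama endpoint
        model="codellama:latest",
        specialty=AgentType.DOCS_WRITER,     # assumed enum member for docs_writer
        max_concurrent=2,
        current_tasks=0,
    ))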

System Status: All 7 agents healthy, Socket.IO operational, MCP server fully functional

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: anthonyrawlins
Date:   2025-07-10 08:41:34 +10:00
Parent: 8c3adf6d8f
Commit: fc0eec91ef

16 changed files with 1599 additions and 84 deletions

View File

@@ -1,19 +1,82 @@
from sqlalchemy import create_engine, event, text
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy.pool import QueuePool
from sqlalchemy.exc import DisconnectionError
import os
import time
import logging

# Enhanced database configuration with connection pooling
# (PostgreSQL by default in production; SQLite still supported for development)
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://postgres:hive123@hive_postgres:5432/hive")

# Create engine with connection pooling and reliability features
if "sqlite" in DATABASE_URL:
    engine = create_engine(
        DATABASE_URL,
        connect_args={"check_same_thread": False},
        pool_pre_ping=True
    )
else:
    engine = create_engine(
        DATABASE_URL,
        poolclass=QueuePool,
        pool_size=10,            # persistent connections kept in the pool
        max_overflow=20,         # extra connections allowed under burst load
        pool_pre_ping=True,      # validate connections before handing them out
        pool_recycle=3600,       # recycle hourly to avoid stale sockets
        echo=False
    )

SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()


@event.listens_for(engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Set SQLite pragma for foreign key support"""
    if "sqlite" in DATABASE_URL:
        cursor = dbapi_connection.cursor()
        cursor.execute("PRAGMA foreign_keys=ON")
        cursor.close()


def get_db():
    """Database session dependency with proper error handling"""
    db = SessionLocal()
    try:
        yield db
    except DisconnectionError as e:
        logging.error(f"Database disconnection error: {e}")
        db.rollback()
        raise
    except Exception as e:
        logging.error(f"Database error: {e}")
        db.rollback()
        raise
    finally:
        db.close()


def test_database_connection():
    """Test database connectivity"""
    try:
        with engine.connect() as conn:
            conn.execute(text("SELECT 1"))
        return True
    except Exception as e:
        logging.error(f"Database connection test failed: {e}")
        return False


def init_database_with_retry(max_retries=5, retry_delay=2):
    """Initialize database with retry logic"""
    for attempt in range(max_retries):
        try:
            Base.metadata.create_all(bind=engine)
            logging.info("Database initialized successfully")
            return True
        except Exception as e:
            if attempt == max_retries - 1:
                logging.error(f"Database initialization failed after {max_retries} attempts: {e}")
                raise
            logging.warning(f"Database initialization attempt {attempt + 1} failed: {e}")
            # Exponential backoff: with retry_delay=2 the waits are 1s, 2s, 4s, 8s
            time.sleep(retry_delay ** attempt)
    return False
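
A minimal sketch of wiring these helpers into application startup (the FastAPI
lifespan hook and import path are assumptions; this commit only adds the
helpers themselves):

    import logging
    from contextlib import asynccontextmanager
    from fastapi import FastAPI
    from app.core.database import init_database_with_retry, test_database_connection

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        # Retry table creation so the API container can start before Postgres
        # finishes booting
        init_database_with_retry(max_retries=5, retry_delay=2)
        if not test_database_connection():
            logging.warning("Database not reachable at startup")
        yield

    app = FastAPI(lifespan=lifespan)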

View File

@@ -11,6 +11,9 @@ import time
from dataclasses import dataclass
from typing import Dict, List, Optional, Any
from enum import Enum
from sqlalchemy.orm import Session
from ..models.agent import Agent as ORMAgent
from ..core.database import SessionLocal

class AgentType(Enum):
    KERNEL_DEV = "kernel_dev"
@@ -48,9 +51,8 @@ class Task:
    created_at: Optional[float] = None
    completed_at: Optional[float] = None


class HiveCoordinator:
    def __init__(self):
        self.agents: Dict[str, Agent] = {}
        self.tasks: Dict[str, Task] = {}
        self.task_queue: List[Task] = []
        self.is_initialized = False
@@ -89,9 +91,20 @@ FOCUS:[full-coverage]→[test+measure+handle+automate]"""
        }

    def add_agent(self, agent: Agent):
        """Register a new agent and persist to database"""
        with SessionLocal() as db:
            db_agent = ORMAgent(
                id=agent.id,
                endpoint=agent.endpoint,
                model=agent.model,
                specialty=agent.specialty.value,
                max_concurrent=agent.max_concurrent,
                current_tasks=agent.current_tasks
            )
            db.add(db_agent)
            db.commit()
            db.refresh(db_agent)
        print(f"Registered agent {agent.id} ({agent.specialty.value}) at {agent.endpoint} and persisted to DB")

    def create_task(self, task_type: AgentType, context: Dict, priority: int = 3) -> Task:
        """Create a new development task"""
@@ -111,16 +124,37 @@ FOCUS:[full-coverage]→[test+measure+handle+automate]"""
        return task

    def get_available_agent(self, task_type: AgentType) -> Optional[Agent]:
        """Find an available agent for the task type from the database"""
        with SessionLocal() as db:
            available_agents_orm = db.query(ORMAgent).filter(
                ORMAgent.specialty == task_type.value,
                ORMAgent.current_tasks < ORMAgent.max_concurrent
            ).all()
            if available_agents_orm:
                # Convert the ORM row into the in-process Agent dataclass
                db_agent = available_agents_orm[0]
                return Agent(
                    id=db_agent.id,
                    endpoint=db_agent.endpoint,
                    model=db_agent.model,
                    specialty=AgentType(db_agent.specialty),
                    max_concurrent=db_agent.max_concurrent,
                    current_tasks=db_agent.current_tasks
                )
            return None
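
    # Illustrative usage (hypothetical, not from this commit): fetch an idle
    # profiler agent and dispatch a task to it.
    #
    #     agent = coordinator.get_available_agent(AgentType.PROFILER)
    #     if agent:
    #         result = await coordinator.execute_task(task, agent)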

    async def execute_task(self, task: Task, agent: Agent) -> Dict:
        """Execute a task on a specific agent with improved error handling"""
        with SessionLocal() as db:
            db_agent = db.query(ORMAgent).filter(ORMAgent.id == agent.id).first()
            if db_agent:
                db_agent.current_tasks += 1
                db.add(db_agent)
                db.commit()
                db.refresh(db_agent)
                agent.current_tasks = db_agent.current_tasks  # update in-memory object

        task.status = TaskStatus.IN_PROGRESS
        task.assigned_agent = agent.id
@@ -146,18 +180,32 @@ Complete task → respond JSON format specified above."""
        }

        try:
            # Use the long-lived session initialized in the coordinator
            session = getattr(self, 'session', None)
            if not session:
                raise Exception("HTTP session not initialized")

            async with session.post(
                f"{agent.endpoint}/api/generate",
                json=payload,
                timeout=aiohttp.ClientTimeout(total=300)  # 5-minute timeout for AI tasks
            ) as response:
                if response.status == 200:
                    result = await response.json()
                    task.result = result
                    task.status = TaskStatus.COMPLETED
                    task.completed_at = time.time()
                    print(f"Task {task.id} completed by {agent.id}")
                    return result
                else:
                    error_text = await response.text()
                    raise Exception(f"HTTP {response.status}: {error_text}")
        except asyncio.TimeoutError:
            task.status = TaskStatus.FAILED
            task.result = {"error": "Task execution timeout"}
            print(f"Task {task.id} timed out on {agent.id}")
            return {"error": "Task execution timeout"}
        except Exception as e:
            task.status = TaskStatus.FAILED
            task.result = {"error": str(e)}
@@ -165,7 +213,14 @@ Complete task → respond JSON format specified above."""
return {"error": str(e)}
finally:
agent.current_tasks -= 1
with SessionLocal() as db:
db_agent = db.query(ORMAgent).filter(ORMAgent.id == agent.id).first()
if db_agent:
db_agent.current_tasks -= 1
db.add(db_agent)
db.commit()
db.refresh(db_agent)
agent.current_tasks = db_agent.current_tasks # Update in-memory object
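
            # The decrement above pairs with the increment at the top of
            # execute_task: load counts are adjusted in the database first and
            # the committed value is copied back, so concurrent workers see a
            # consistent view of each agent's load.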

    async def process_queue(self):
        """Process the task queue with available agents"""
@@ -246,8 +301,10 @@ Complete task → respond JSON format specified above."""
        completion_rate = completed / total_tasks if total_tasks > 0 else 0

        agent_vectors = {}
        with SessionLocal() as db:
            db_agents = db.query(ORMAgent).all()
            for agent in db_agents:
                agent_vectors[agent.id] = f"[{agent.specialty}@{agent.current_tasks}/{agent.max_concurrent}]"

        return {
            "status_vector": status_vector,
@@ -259,26 +316,94 @@ Complete task → respond JSON format specified above."""
"failed": failed,
"in_progress": in_progress,
"pending": total_tasks - completed - failed - in_progress,
"agents": {agent.id: agent.current_tasks for agent in self.agents.values()}
"agents": {agent.id: agent.current_tasks for agent in db_agents}
}

    async def initialize(self):
        """Initialize the coordinator with proper error handling"""
        try:
            print("Initializing Hive Coordinator...")

            # Long-lived HTTP client session with timeouts and a bounded
            # connection pool, shared by all task executions
            self.session = aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=30, connect=10),
                connector=aiohttp.TCPConnector(
                    limit=100,
                    limit_per_host=30,
                    ttl_dns_cache=300,
                    use_dns_cache=True
                )
            )

            # Task processing loop is started lazily
            self.task_processor = None

            # Test connectivity to any configured agents
            await self._test_initial_connectivity()

            self.is_initialized = True
            print("✅ Hive Coordinator initialized")

        except Exception as e:
            print(f"❌ Coordinator initialization failed: {e}")
            self.is_initialized = False
            # Clean up any partial initialization
            if hasattr(self, 'session') and self.session:
                await self.session.close()
            raise
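
    # Design note: a single shared ClientSession lets connection pooling and
    # DNS caching amortize across every agent call; execute_task() picks this
    # session up via getattr(self, 'session', None).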

    async def shutdown(self):
        """Enhanced shutdown with proper cleanup"""
        print("Shutting down Hive Coordinator...")

        try:
            # Cancel any running tasks
            running_tasks = [task for task in self.tasks.values() if task.status == TaskStatus.IN_PROGRESS]
            if running_tasks:
                print(f"Canceling {len(running_tasks)} running tasks...")
                for task in running_tasks:
                    task.status = TaskStatus.FAILED
                    task.result = {"error": "Coordinator shutdown"}

            # Close HTTP session
            if hasattr(self, 'session') and self.session:
                await self.session.close()

            # Stop task processor
            if hasattr(self, 'task_processor') and self.task_processor:
                self.task_processor.cancel()
                try:
                    await self.task_processor
                except asyncio.CancelledError:
                    pass

            self.is_initialized = False
            print("✅ Hive Coordinator shutdown")

        except Exception as e:
            print(f"❌ Shutdown error: {e}")
            self.is_initialized = False

    async def _test_initial_connectivity(self):
        """Test initial connectivity to prevent startup issues"""
        # This would test any pre-configured agents;
        # for now, just ensure outbound HTTP requests work at all
        try:
            async with self.session.get(
                'http://httpbin.org/get',
                timeout=aiohttp.ClientTimeout(total=5)
            ) as response:
                if response.status == 200:
                    print("✅ HTTP connectivity test passed")
        except Exception as e:
            # Don't fail initialization over this, just warn
            print(f"⚠️ HTTP connectivity test failed: {e}")

    async def get_health_status(self):
        """Get health status"""
        with SessionLocal() as db:
            db_agents = db.query(ORMAgent).all()
            agents_status = {agent.id: "available" for agent in db_agents}

        return {
            "status": "healthy" if self.is_initialized else "unhealthy",
            "agents": agents_status,
            "tasks": {
                "pending": len([t for t in self.tasks.values() if t.status == TaskStatus.PENDING]),
                "running": len([t for t in self.tasks.values() if t.status == TaskStatus.IN_PROGRESS]),
@@ -289,6 +414,12 @@ Complete task → respond JSON format specified above."""

    async def get_comprehensive_status(self):
        """Get comprehensive system status"""
        with SessionLocal() as db:
            db_agents = db.query(ORMAgent).all()
            total_agents = len(db_agents)
            available_agents = len([a for a in db_agents if a.current_tasks < a.max_concurrent])
            busy_agents = len([a for a in db_agents if a.current_tasks >= a.max_concurrent])

        return {
            "system": {
                "status": "operational" if self.is_initialized else "initializing",
@@ -296,9 +427,19 @@ Complete task → respond JSON format specified above."""
"version": "1.0.0"
},
"agents": {
"total": len(self.agents),
"available": len([a for a in self.agents.values() if a.current_tasks < a.max_concurrent]),
"busy": len([a for a in self.agents.values() if a.current_tasks >= a.max_concurrent])
"total": total_agents,
"available": available_agents,
"busy": busy_agents,
"details": [
{
"id": agent.id,
"endpoint": agent.endpoint,
"model": agent.model,
"specialty": agent.specialty,
"max_concurrent": agent.max_concurrent,
"current_tasks": agent.current_tasks
} for agent in db_agents
]
},
"tasks": {
"total": len(self.tasks),
@@ -313,9 +454,14 @@ Complete task → respond JSON format specified above."""
"""Get Prometheus formatted metrics"""
metrics = []
with SessionLocal() as db:
db_agents = db.query(ORMAgent).all()
total_agents = len(db_agents)
available_agents = len([a for a in db_agents if a.current_tasks < a.max_concurrent])
# Agent metrics
metrics.append(f"hive_agents_total {len(self.agents)}")
metrics.append(f"hive_agents_available {len([a for a in self.agents.values() if a.current_tasks < a.max_concurrent])}")
metrics.append(f"hive_agents_total {total_agents}")
metrics.append(f"hive_agents_available {available_agents}")
# Task metrics
metrics.append(f"hive_tasks_total {len(self.tasks)}")
@@ -329,7 +475,7 @@ Complete task → respond JSON format specified above."""
# Example usage and testing functions
async def demo_coordination():
    """Demonstrate the coordination system"""
    coordinator = HiveCoordinator()

    # Add example agents (you'll replace with your actual endpoints)
    coordinator.add_agent(Agent(