# hive/backend/app/api/agents.py

"""
Hive API - Agent Management Endpoints

This module provides comprehensive API endpoints for managing Ollama-based AI agents
in the Hive distributed orchestration platform. It handles agent registration,
status monitoring, and lifecycle management.

Key Features:
- Agent registration and validation
- Real-time status monitoring
- Comprehensive error handling
- Detailed API documentation
- Authentication and authorization
"""

from fastapi import APIRouter, HTTPException, Request, Depends, status
from typing import List, Dict, Any
import time
import logging
from ..models.agent import Agent
from ..models.responses import (
    AgentListResponse,
    AgentRegistrationResponse,
    AgentRegistrationRequest,
    ErrorResponse,
    AgentModel
)
from ..core.auth_deps import get_current_user_context

# Router that the endpoint decorators in this module attach their routes to.
router = APIRouter()

# NOTE(review): ``models.agent.Agent`` is imported relatively near the top of
# this file as ``Agent`` and again here, through the absolute ``app`` package
# path, as ``ORMAgent``. Presumably both names refer to the same class —
# confirm before relying on them being distinct.
from app.core.database import SessionLocal
from app.models.agent import Agent as ORMAgent
from ..services.agent_service import AgentType

# Module-level logger named after this module.
logger = logging.getLogger(__name__)


@router.get(
    "/agents",
    response_model=AgentListResponse,
    status_code=status.HTTP_200_OK,
    summary="List all registered agents",
    description="""
    Retrieve a comprehensive list of all registered agents in the Hive cluster.

    This endpoint returns detailed information about each agent including:
    - Agent identification and endpoint information
    - Current status and utilization metrics
    - Specialization and capacity limits
    - Health and heartbeat information

    **Use Cases:**
    - Monitor cluster capacity and agent health
    - Identify available agents for task assignment
    - Track agent utilization and performance
    - Debug agent connectivity issues

    **Response Notes:**
    - Agents are returned in registration order
    - Status reflects real-time agent availability
    - Utilization is calculated as current_tasks / max_concurrent
    """,
    responses={
        200: {"description": "List of agents retrieved successfully"},
        500: {"model": ErrorResponse, "description": "Internal server error"}
    }
)
async def get_agents(
    request: Request,
    current_user: Dict[str, Any] = Depends(get_current_user_context)
) -> AgentListResponse:
    """
    Return every agent row in the database as an ``AgentListResponse``.

    Each row is converted to an ``AgentModel``. ``status`` is "available"
    while ``current_tasks`` is below ``max_concurrent`` and "busy" otherwise;
    ``utilization`` is ``current_tasks / max_concurrent``, or 0.0 when
    ``max_concurrent`` is not positive.

    Args:
        request: FastAPI request object (not read by this handler).
        current_user: Value produced by the ``get_current_user_context``
            dependency (not read by this handler).

    Returns:
        AgentListResponse: The converted agents, their count and a summary
        message.

    Raises:
        HTTPException: 500 wrapping any exception raised while querying or
            converting rows.
    """
    copied_columns = (
        "id", "endpoint", "model", "specialty", "max_concurrent", "current_tasks"
    )

    def _as_model(row):
        # Copy the stored columns, then add the two derived load figures.
        fields = {column: getattr(row, column) for column in copied_columns}
        load = fields["current_tasks"]
        limit = fields["max_concurrent"]
        fields["status"] = "available" if load < limit else "busy"
        fields["utilization"] = load / limit if limit > 0 else 0.0
        return AgentModel(**fields)

    try:
        with SessionLocal() as session:
            agents = [_as_model(row) for row in session.query(ORMAgent).all()]

        count = len(agents)
        return AgentListResponse(
            agents=agents,
            total=count,
            message=f"Retrieved {count} registered agents"
        )
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve agents: {str(e)}"
        )


@router.post(
    "/agents",
    response_model=AgentRegistrationResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Register a new Ollama agent",
    description="""
    Register a new Ollama-based AI agent with the Hive cluster.

    This endpoint allows you to add new Ollama agents to the distributed AI network.
    The agent will be validated for connectivity and model availability before registration.

    **Agent Registration Process:**
    1. Validate agent connectivity and model availability
    2. Add agent to the coordinator's active agent pool
    3. Store agent configuration in the database
    4. Perform initial health check
    5. Return registration confirmation with agent details

    **Supported Agent Specializations:**
    - `kernel_dev`: Linux kernel development and debugging
    - `pytorch_dev`: PyTorch model development and optimization
    - `profiler`: Performance profiling and optimization
    - `docs_writer`: Documentation generation and technical writing
    - `tester`: Automated testing and quality assurance
    - `general_ai`: General-purpose AI assistance
    - `reasoning`: Complex reasoning and problem-solving tasks

    **Requirements:**
    - Agent endpoint must be accessible from the Hive cluster
    - Specified model must be available on the target Ollama instance
    - Agent ID must be unique across the cluster
    """,
    responses={
        201: {"description": "Agent registered successfully"},
        400: {"model": ErrorResponse, "description": "Invalid agent configuration"},
        409: {"model": ErrorResponse, "description": "Agent ID already exists"},
        503: {"model": ErrorResponse, "description": "Agent endpoint unreachable"}
    }
)
async def register_agent(
    agent_data: AgentRegistrationRequest,
    request: Request,
    current_user: Dict[str, Any] = Depends(get_current_user_context)
) -> AgentRegistrationResponse:
    """
    Register a new Ollama agent in the Hive cluster.

    The handler resolves the coordinator, rejects IDs that already exist in
    the database, builds an ``Agent`` from the request and hands it to
    ``hive_coordinator.add_agent``.

    Args:
        agent_data: Agent configuration and registration details
        request: FastAPI request object for accessing app state
        current_user: Current authenticated user context

    Returns:
        AgentRegistrationResponse: Registration confirmation with agent details

    Raises:
        HTTPException: 503 if no coordinator is available, 409 if the agent
            ID already exists, 400 if a ``ValueError`` is raised while
            building or adding the agent, 500 for any other failure.
    """
    # Prefer the coordinator stored on the application state.
    hive_coordinator = getattr(request.app.state, 'hive_coordinator', None)
    if not hive_coordinator:
        # Fallback to global coordinator if app state not available
        from ..main import unified_coordinator
        hive_coordinator = unified_coordinator

    if not hive_coordinator:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coordinator service unavailable"
        )

    try:
        # Check if agent ID already exists
        with SessionLocal() as db:
            existing_agent = db.query(ORMAgent).filter(ORMAgent.id == agent_data.id).first()
            if existing_agent:
                raise HTTPException(
                    status_code=status.HTTP_409_CONFLICT,
                    detail=f"Agent with ID '{agent_data.id}' already exists"
                )

        # Create agent instance
        agent = Agent(
            id=agent_data.id,
            endpoint=agent_data.endpoint,
            model=agent_data.model,
            specialty=AgentType(agent_data.specialty.value),
            max_concurrent=agent_data.max_concurrent,
        )

        # Add agent to coordinator
        hive_coordinator.add_agent(agent)

        return AgentRegistrationResponse(
            agent_id=agent.id,
            endpoint=agent.endpoint,
            message=f"Agent '{agent.id}' registered successfully with specialty '{agent_data.specialty}'"
        )

    except HTTPException:
        # Deliberate HTTP errors (the 409 above) must reach the client as-is;
        # without this clause the generic handler below turned them into 500s.
        raise
    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Invalid agent configuration: {str(e)}"
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to register agent: {str(e)}"
        ) from e


@router.get(
    "/agents/{agent_id}",
    response_model=AgentModel,
    status_code=status.HTTP_200_OK,
    summary="Get specific agent details",
    description="""
    Retrieve detailed information about a specific agent by its ID.

    This endpoint provides comprehensive status information for a single agent,
    including real-time metrics, health status, and configuration details.

    **Returned Information:**
    - Agent identification and configuration
    - Current status and utilization
    - Recent activity and performance metrics
    - Health check results and connectivity status

    **Use Cases:**
    - Monitor specific agent performance
    - Debug agent connectivity issues
    - Verify agent configuration
    - Check agent availability for task assignment
    """,
    responses={
        200: {"description": "Agent details retrieved successfully"},
        404: {"model": ErrorResponse, "description": "Agent not found"},
        500: {"model": ErrorResponse, "description": "Internal server error"}
    }
)
async def get_agent(
    agent_id: str,
    request: Request,
    current_user: Dict[str, Any] = Depends(get_current_user_context)
) -> AgentModel:
    """
    Look up one agent by ID and return it as an ``AgentModel``.

    ``status`` is "available" while ``current_tasks`` is below
    ``max_concurrent`` and "busy" otherwise; ``utilization`` is
    ``current_tasks / max_concurrent``, or 0.0 when ``max_concurrent`` is not
    positive.

    Args:
        agent_id: Identifier of the agent row to load.
        request: FastAPI request object (not read by this handler).
        current_user: Value produced by the ``get_current_user_context``
            dependency (not read by this handler).

    Returns:
        AgentModel: The stored columns plus the derived status/utilization.

    Raises:
        HTTPException: 404 when no row matches ``agent_id``; 500 wrapping any
            other exception raised during the lookup.
    """
    try:
        with SessionLocal() as session:
            lookup = session.query(ORMAgent).filter(ORMAgent.id == agent_id)
            row = lookup.first()
            if not row:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=f"Agent with ID '{agent_id}' not found"
                )

            load = row.current_tasks
            limit = row.max_concurrent

            if load < limit:
                state = "available"
            else:
                state = "busy"

            if limit > 0:
                ratio = load / limit
            else:
                ratio = 0.0

            return AgentModel(
                id=row.id,
                endpoint=row.endpoint,
                model=row.model,
                specialty=row.specialty,
                max_concurrent=limit,
                current_tasks=load,
                status=state,
                utilization=ratio
            )

    except HTTPException:
        # Let the 404 above through unchanged.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve agent: {str(e)}"
        )


@router.delete(
    "/agents/{agent_id}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Unregister an agent",
    description="""
    Remove an agent from the Hive cluster.

    This endpoint safely removes an agent from the cluster by:
    1. Checking for active tasks and optionally waiting for completion
    2. Removing the agent from the coordinator's active pool
    3. Cleaning up database records
    4. Confirming successful removal

    **Safety Measures:**
    - Active tasks are checked before removal
    - Graceful shutdown procedures are followed
    - Database consistency is maintained
    - Error handling for cleanup failures

    **Use Cases:**
    - Remove offline or problematic agents
    - Scale down cluster capacity
    - Perform maintenance on agent nodes
    - Clean up test or temporary agents
    """,
    responses={
        204: {"description": "Agent unregistered successfully"},
        404: {"model": ErrorResponse, "description": "Agent not found"},
        409: {"model": ErrorResponse, "description": "Agent has active tasks"},
        500: {"model": ErrorResponse, "description": "Internal server error"}
    }
)
async def unregister_agent(
    agent_id: str,
    request: Request,
    force: bool = False,
    current_user: Dict[str, Any] = Depends(get_current_user_context)
):
    """
    Remove an agent from the coordinator and delete its database row.

    Args:
        agent_id: Identifier of the agent to remove.
        request: FastAPI request object; its app state is checked for a
            ``hive_coordinator``.
        force: When true, the active-task check is skipped.
        current_user: Value produced by the ``get_current_user_context``
            dependency (not read by this handler).

    Raises:
        HTTPException: 503 when no coordinator is available, 404 when the
            agent row does not exist, 409 when the agent has active tasks and
            ``force`` is false, 500 wrapping any other failure.
    """
    # Resolve the coordinator: app state first, module-level global second.
    coordinator = getattr(request.app.state, 'hive_coordinator', None)
    if not coordinator:
        from ..main import unified_coordinator as coordinator

    if not coordinator:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coordinator service unavailable"
        )

    try:
        with SessionLocal() as session:
            row = session.query(ORMAgent).filter(ORMAgent.id == agent_id).first()
            if not row:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=f"Agent with ID '{agent_id}' not found"
                )

            # Refuse to drop a working agent unless the caller insists.
            if not force:
                pending = row.current_tasks
                if pending > 0:
                    raise HTTPException(
                        status_code=status.HTTP_409_CONFLICT,
                        detail=f"Agent '{agent_id}' has {pending} active tasks. Use force=true to override."
                    )

            # The coordinator is updated first, then the row is deleted.
            coordinator.remove_agent(agent_id)
            session.delete(row)
            session.commit()

    except HTTPException:
        # 404/409 raised above go to the client untouched.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to unregister agent: {str(e)}"
        )


@router.post(
    "/agents/heartbeat",
    status_code=status.HTTP_200_OK,
    summary="Agent heartbeat update",
    description="""
    Update agent status and maintain registration through periodic heartbeat.

    This endpoint allows agents to:
    - Confirm they are still online and responsive
    - Update their current status and metrics
    - Report any capability or configuration changes
    - Maintain their registration in the cluster

    Agents should call this endpoint every 30-60 seconds to maintain
    their active status in the Hive cluster.
    """,
    responses={
        200: {"description": "Heartbeat received successfully"},
        404: {"model": ErrorResponse, "description": "Agent not registered"},
        400: {"model": ErrorResponse, "description": "Invalid heartbeat data"}
    }
)
async def agent_heartbeat(
    heartbeat_data: Dict[str, Any],
    request: Request
):
    """
    Record a heartbeat for an agent and optionally update its task count.

    Args:
        heartbeat_data: JSON body; ``agent_id`` is required and
            ``current_tasks`` is optional.
        request: FastAPI request object; its app state is checked for a
            ``hive_coordinator``.

    Returns:
        A dict with ``status``, ``message`` and a ``timestamp`` taken from
        ``time.time()``.

    Raises:
        HTTPException: 400 when ``agent_id`` is missing or empty, 503 when no
            coordinator is available, 500 wrapping any other failure.
    """
    # NOTE(review): a 404 response is documented on the route, but nothing in
    # this handler raises one — confirm whether that is intended.
    agent_id = heartbeat_data.get("agent_id")
    if not agent_id:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Missing agent_id in heartbeat data"
        )

    # Resolve the coordinator: app state first, module-level global second.
    coordinator = getattr(request.app.state, 'hive_coordinator', None)
    if not coordinator:
        from ..main import unified_coordinator as coordinator

    if not coordinator:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coordinator service unavailable"
        )

    try:
        # Refresh the heartbeat through the agent service when there is one.
        service = coordinator.agent_service
        if service:
            service.update_agent_heartbeat(agent_id)

        # The task count is written with raw SQL to avoid the role column.
        # A failed write is only logged; the heartbeat still succeeds.
        if "current_tasks" in heartbeat_data:
            update_params = {
                "current_tasks": heartbeat_data["current_tasks"],
                "agent_id": agent_id
            }
            try:
                with SessionLocal() as session:
                    from sqlalchemy import text
                    statement = text(
                        "UPDATE agents SET current_tasks = :current_tasks, last_seen = NOW() WHERE id = :agent_id"
                    )
                    session.execute(statement, update_params)
                    session.commit()
            except Exception as e:
                logger.warning(f"Could not update agent tasks: {e}")

        acknowledgement = {
            "status": "success",
            "message": f"Heartbeat received from agent '{agent_id}'",
            "timestamp": time.time()
        }
        return acknowledgement

    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to process heartbeat: {str(e)}"
        )


def _infer_specialty(models: List[Any], hostname: str) -> AgentType:
    """Pick an agent specialty from the advertised model names and hostname."""
    models_text = str(models).lower()
    if "codellama" in models_text or "code" in hostname.lower():
        return AgentType.KERNEL_DEV
    if "gemma" in models_text:
        return AgentType.PYTORCH_DEV
    # llama-family models and anything unrecognised use the general default.
    return AgentType.GENERAL_AI


@router.post(
    "/agents/auto-register",
    response_model=AgentRegistrationResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Automatic agent registration",
    description="""
    Register an agent automatically with capability detection.

    This endpoint is designed for Bzzz agents running as systemd services
    to automatically register themselves with the Hive coordinator.

    Features:
    - Automatic capability detection based on available models
    - Network discovery support
    - Retry-friendly for service startup scenarios
    - Health validation before registration
    """,
    responses={
        201: {"description": "Agent auto-registered successfully"},
        400: {"model": ErrorResponse, "description": "Invalid agent configuration"},
        409: {"model": ErrorResponse, "description": "Agent already registered"},
        503: {"model": ErrorResponse, "description": "Agent endpoint unreachable"}
    }
)
async def auto_register_agent(
    agent_data: Dict[str, Any],
    request: Request
) -> AgentRegistrationResponse:
    """
    Automatically register a Bzzz agent with the Hive coordinator.

    If a row with the given ID already exists, only its endpoint and
    ``last_seen`` are refreshed. Otherwise the model list is taken from the
    request (or fetched from ``{endpoint}/api/tags`` when absent), a
    specialty is inferred from it, and the agent row is upserted.

    Args:
        agent_data: JSON body. ``agent_id`` and ``endpoint`` are required;
            ``hostname``, ``models`` and ``max_concurrent`` are optional.
        request: FastAPI request object; its app state is checked for a
            ``hive_coordinator``.

    Returns:
        AgentRegistrationResponse: Registration confirmation

    Raises:
        HTTPException: 400 when a required field is missing, 503 when no
            coordinator is available, 500 when the database write or any
            other step fails.
    """
    # Extract required fields
    agent_id = agent_data.get("agent_id")
    endpoint = agent_data.get("endpoint")
    # hostname is optional: normalise a missing/null value to "" so the
    # specialty heuristic can call .lower() on it without crashing.
    hostname = str(agent_data.get("hostname") or "")

    if not agent_id or not endpoint:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Missing required fields: agent_id, endpoint"
        )

    # Access coordinator (only its availability is checked in this handler)
    hive_coordinator = getattr(request.app.state, 'hive_coordinator', None)
    if not hive_coordinator:
        from ..main import unified_coordinator
        hive_coordinator = unified_coordinator

    if not hive_coordinator:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coordinator service unavailable"
        )

    try:
        # Check if agent already exists - use basic query to avoid role column
        try:
            with SessionLocal() as db:
                from sqlalchemy import text
                existing_agent = db.execute(text(
                    "SELECT id, endpoint FROM agents WHERE id = :agent_id LIMIT 1"
                ), {"agent_id": agent_id}).fetchone()
                if existing_agent:
                    # Update existing agent
                    db.execute(text(
                        "UPDATE agents SET endpoint = :endpoint, last_seen = NOW() WHERE id = :agent_id"
                    ), {"endpoint": endpoint, "agent_id": agent_id})
                    db.commit()

                    return AgentRegistrationResponse(
                        agent_id=agent_id,
                        endpoint=endpoint,
                        message=f"Agent '{agent_id}' registration updated successfully"
                    )
        except Exception as e:
            # Best effort: the upsert below still handles an existing row.
            logger.warning(f"Could not check existing agent: {e}")

        # Detect capabilities and models. An explicit null becomes an empty
        # list and a bare string becomes a one-element list, so the code
        # below can always treat ``models`` as a list of names.
        models = agent_data.get("models") or []
        if isinstance(models, str):
            models = [models]
        if not models:
            # Try to detect models from endpoint
            # NOTE(review): ``endpoint`` is caller-supplied and this route
            # declares no auth dependency, so this is a server-side request
            # to an arbitrary URL — confirm that is acceptable.
            try:
                import aiohttp
                async with aiohttp.ClientSession() as session:
                    async with session.get(f"{endpoint}/api/tags", timeout=aiohttp.ClientTimeout(total=5)) as response:
                        if response.status == 200:
                            tags_data = await response.json()
                            models = [model["name"] for model in tags_data.get("models", [])]
            except Exception as e:
                logger.warning(f"Could not detect models for {agent_id}: {e}")

        # Determine specialty based on models or hostname
        specialty = _infer_specialty(models, hostname)

        # Insert agent directly into database
        try:
            with SessionLocal() as db:
                from sqlalchemy import text
                # Insert new agent using raw SQL to avoid role column issues
                db.execute(text("""
                    INSERT INTO agents (id, name, endpoint, model, specialty, max_concurrent, current_tasks, status, created_at, last_seen)
                    VALUES (:agent_id, :name, :endpoint, :model, :specialty, :max_concurrent, 0, 'active', NOW(), NOW())
                    ON CONFLICT (id) DO UPDATE SET
                        endpoint = EXCLUDED.endpoint,
                        model = EXCLUDED.model,
                        specialty = EXCLUDED.specialty,
                        max_concurrent = EXCLUDED.max_concurrent,
                        last_seen = NOW()
                """), {
                    "agent_id": agent_id,
                    "name": agent_id,  # Use agent_id as name
                    "endpoint": endpoint,
                    "model": str(models[0]) if models else "unknown",
                    "specialty": specialty.value,
                    "max_concurrent": agent_data.get("max_concurrent", 2)
                })
                db.commit()

                return AgentRegistrationResponse(
                    agent_id=agent_id,
                    endpoint=endpoint,
                    message=f"Agent '{agent_id}' auto-registered successfully with specialty '{specialty.value}'"
                )
        except Exception as e:
            logger.error(f"Database insert failed: {e}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Failed to register agent in database: {str(e)}"
            ) from e

    except HTTPException:
        # Keep the specific error raised above instead of re-wrapping it in
        # a second, generic 500 with a doubled-up detail message.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to auto-register agent: {str(e)}"
        ) from e