# hive/backend/app/api/cli_agents.py

"""
Hive API - CLI Agent Management Endpoints

This module provides comprehensive API endpoints for managing CLI-based AI agents
in the Hive distributed orchestration platform. CLI agents enable integration with
cloud-based AI services and external tools through command-line interfaces.

Key Features:
- CLI agent registration and configuration
- Remote agent health monitoring
- SSH-based communication management
- Performance metrics and analytics
- Multi-platform agent support
"""

from fastapi import APIRouter, HTTPException, Depends, Query, status
from sqlalchemy.orm import Session
from typing import Dict, Any, List, Optional
from datetime import datetime

from ..core.database import get_db
from ..models.agent import Agent as ORMAgent
from ..core.unified_coordinator import UnifiedCoordinator, Agent, AgentType
from ..cli_agents.cli_agent_manager import get_cli_agent_manager
from ..models.responses import (
    CliAgentListResponse,
    CliAgentRegistrationResponse,
    CliAgentHealthResponse,
    CliAgentRegistrationRequest,
    CliAgentModel,
    ErrorResponse
)
from ..core.error_handlers import (
    agent_not_found_error,
    agent_already_exists_error,
    validation_error,
    HiveAPIException
)
from ..core.auth_deps import get_current_user_context

# Router shared by every endpoint in this module: each route declared below is
# served under the "/api/cli-agents" prefix and tagged "cli-agents".
router = APIRouter(prefix="/api/cli-agents", tags=["cli-agents"])


@router.get(
    "/",
    response_model=CliAgentListResponse,
    status_code=status.HTTP_200_OK,
    summary="List all CLI agents",
    description="""
    Retrieve a comprehensive list of all CLI-based agents in the Hive cluster.

    CLI agents are cloud-based or remote AI agents that integrate with Hive through
    command-line interfaces, providing access to advanced AI models and services.

    **CLI Agent Information Includes:**
    - Agent identification and endpoint configuration
    - Current status and availability metrics
    - Performance statistics and health indicators
    - SSH connection and communication details
    - Resource utilization and task distribution

    **Supported CLI Agent Types:**
    - **Google Gemini**: Advanced reasoning and general AI capabilities
    - **OpenAI**: GPT models for various specialized tasks
    - **Anthropic**: Claude models for analysis and reasoning
    - **Custom Tools**: Integration with custom CLI-based tools

    **Connection Methods:**
    - **SSH**: Secure remote command execution
    - **Local CLI**: Direct command-line interface execution
    - **Container**: Containerized agent execution
    - **API Proxy**: API-to-CLI bridge connections

    **Use Cases:**
    - Monitor CLI agent availability and performance
    - Analyze resource distribution and load balancing
    - Debug connectivity and communication issues
    - Plan capacity and resource allocation
    - Track agent utilization and efficiency
    """,
    responses={
        200: {"description": "CLI agent list retrieved successfully"},
        500: {"model": ErrorResponse, "description": "Failed to retrieve CLI agents"}
    }
)
async def get_cli_agents(
    agent_type: Optional[str] = Query(None, description="Filter by CLI agent type (gemini, openai, etc.)"),
    status_filter: Optional[str] = Query(None, alias="status", description="Filter by agent status"),
    host: Optional[str] = Query(None, description="Filter by host machine"),
    include_metrics: bool = Query(True, description="Include performance metrics in response"),
    db: Session = Depends(get_db),
    current_user: Dict[str, Any] = Depends(get_current_user_context)
) -> CliAgentListResponse:
    """
    Get a list of all CLI agents with optional filtering and metrics.

    Every row whose ``agent_type`` column equals ``"cli"`` is loaded, and the
    ``agent_type``, ``host`` and ``status`` filters are then applied in Python,
    because the CLI-specific type and host are read from each row's
    ``cli_config`` mapping rather than from dedicated columns.

    Args:
        agent_type: Optional filter on ``cli_config["agent_type"]``
        status_filter: Optional filter on the derived status
            (``"available"`` or ``"busy"``)
        host: Optional filter on ``cli_config["host"]``
        include_metrics: Whether to include performance metrics
        db: Database session
        current_user: Current authenticated user context

    Returns:
        CliAgentListResponse: The agents that passed every filter. ``agent_types``
        lists the types of all CLI agents found, including filtered-out ones.

    Raises:
        HTTPException: 500 if CLI agent retrieval fails
    """
    try:
        # Load every CLI agent; filtering happens below on the cli_config values.
        db_agents = db.query(ORMAgent).filter(ORMAgent.agent_type == "cli").all()

        # Convert to response models
        agents = []
        agent_types = set()

        for db_agent in db_agents:
            cli_config = db_agent.cli_config or {}
            agent_type_value = cli_config.get("agent_type", "unknown")
            # Recorded before filtering so the response advertises every known type.
            agent_types.add(agent_type_value)

            # Apply agent_type filter if specified
            if agent_type and agent_type_value != agent_type:
                continue

            # Apply host filter if specified (host is stored in cli_config at registration)
            if host and cli_config.get("host") != host:
                continue

            # Status is derived from load: an agent below its concurrency limit is available.
            agent_status = "available" if db_agent.current_tasks < db_agent.max_concurrent else "busy"
            if status_filter and agent_status != status_filter:
                continue

            # Build performance metrics if requested
            performance_metrics = None
            if include_metrics:
                performance_metrics = {
                    "avg_response_time": 2.1,  # Placeholder - would come from actual metrics
                    "requests_per_hour": 45,
                    "success_rate": 98.7,
                    "error_rate": 1.3,
                    "uptime_percentage": 99.5
                }

            agent_model = CliAgentModel(
                id=db_agent.id,
                endpoint=db_agent.endpoint,
                model=db_agent.model,
                specialization=db_agent.specialization,
                agent_type=agent_type_value,
                status=agent_status,
                max_concurrent=db_agent.max_concurrent,
                current_tasks=db_agent.current_tasks,
                cli_config=cli_config,
                last_health_check=datetime.utcnow(),  # Placeholder
                performance_metrics=performance_metrics
            )
            agents.append(agent_model)

        return CliAgentListResponse(
            agents=agents,
            total=len(agents),
            agent_types=list(agent_types),
            message=f"Retrieved {len(agents)} CLI agents"
        )

    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve CLI agents: {str(e)}"
        ) from e


@router.post(
    "/register",
    response_model=CliAgentRegistrationResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Register a new CLI agent",
    description="""
    Register a new CLI-based AI agent with the Hive cluster.

    This endpoint enables integration of cloud-based AI services and remote tools
    through command-line interfaces, expanding Hive's AI capabilities beyond local models.

    **CLI Agent Registration Process:**
    1. **Connectivity Validation**: Test SSH/CLI connection to target host
    2. **Environment Verification**: Verify Node.js version and dependencies
    3. **Model Availability**: Confirm AI model access and configuration
    4. **Performance Testing**: Run baseline performance and latency tests
    5. **Integration Setup**: Configure CLI agent manager and communication
    6. **Health Monitoring**: Establish ongoing health check procedures

    **Supported CLI Agent Types:**
    - **Gemini**: Google's advanced AI model with reasoning capabilities
    - **OpenAI**: GPT models for various specialized tasks
    - **Claude**: Anthropic's Claude models for analysis and reasoning
    - **Custom**: Custom CLI tools and AI integrations

    **Configuration Requirements:**
    - **Host Access**: SSH access to target machine with appropriate permissions
    - **Node.js**: Compatible Node.js version for CLI tool execution
    - **Model Access**: Valid API keys and credentials for AI service
    - **Network**: Stable network connection with reasonable latency
    - **Resources**: Sufficient memory and CPU for CLI execution

    **Specialization Types:**
    - `general_ai`: General-purpose AI assistance and reasoning
    - `reasoning`: Complex reasoning and problem-solving tasks
    - `code_analysis`: Code review and static analysis
    - `documentation`: Documentation generation and technical writing
    - `testing`: Test creation and quality assurance
    - `cli_gemini`: Google Gemini-specific optimizations

    **Best Practices:**
    - Use descriptive agent IDs that include host and type
    - Configure appropriate timeouts for network conditions
    - Set realistic concurrent task limits based on resources
    - Monitor performance and adjust configuration as needed
    - Implement proper error handling and retry logic
    """,
    responses={
        201: {"description": "CLI agent registered successfully"},
        400: {"model": ErrorResponse, "description": "Invalid agent configuration"},
        409: {"model": ErrorResponse, "description": "Agent ID already exists"},
        503: {"model": ErrorResponse, "description": "Agent connectivity test failed"},
        500: {"model": ErrorResponse, "description": "Agent registration failed"}
    }
)
async def register_cli_agent(
    agent_data: CliAgentRegistrationRequest,
    db: Session = Depends(get_db),
    current_user: Dict[str, Any] = Depends(get_current_user_context)
) -> CliAgentRegistrationResponse:
    """
    Register a new CLI agent, running a best-effort connectivity test first.

    The connectivity test never blocks registration: a failed or erroring test
    is reported in the ``health_check`` field of the response and the agent is
    registered anyway. The agent is persisted to the database and then handed
    to the CLI agent manager; if the manager step fails, the freshly committed
    row is removed again so the same ID can be registered on a retry.

    Args:
        agent_data: CLI agent configuration and connection details
        db: Database session
        current_user: Current authenticated user context

    Returns:
        CliAgentRegistrationResponse: Registration confirmation with health check results

    Raises:
        HTTPException: 500 if registration fails. If an agent with the same ID
            already exists, the error built by ``agent_already_exists_error``
            is raised instead.
    """
    # Check if agent already exists
    existing_agent = db.query(ORMAgent).filter(ORMAgent.id == agent_data.id).first()
    if existing_agent:
        raise agent_already_exists_error(agent_data.id)

    try:
        # Get CLI agent manager
        cli_manager = get_cli_agent_manager()

        # Create CLI configuration
        cli_config = {
            "host": agent_data.host,
            "node_version": agent_data.node_version,
            "model": agent_data.model,
            "specialization": agent_data.specialization,
            "max_concurrent": agent_data.max_concurrent,
            "command_timeout": agent_data.command_timeout,
            "ssh_timeout": agent_data.ssh_timeout,
            "agent_type": agent_data.agent_type
        }

        # Perform connectivity test with a throwaway agent
        try:
            test_agent = cli_manager.cli_factory.create_agent(f"test-{agent_data.id}", cli_config)
            try:
                health = await test_agent.health_check()
            finally:
                # Always release the temporary agent, even when the health check raises.
                await test_agent.cleanup()

            if not health.get("cli_healthy", False):
                print(f"⚠️ CLI agent connectivity test failed for {agent_data.host}")
                health["cli_healthy"] = False
                health["warning"] = f"Connectivity test failed for {agent_data.host}"

                # In production, you might want to fail registration on connectivity issues
                # raise HTTPException(
                #     status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
                #     detail=f"CLI agent connectivity test failed for {agent_data.host}"
                # )

        except Exception as e:
            # Deliberately tolerant: a broken connectivity test must not block registration.
            print(f"⚠️ CLI agent connectivity test error for {agent_data.host}: {e}")
            health = {
                "cli_healthy": False,
                "error": str(e),
                "test_skipped": True,
                "warning": "Connectivity test failed - registering anyway for development"
            }

        # Map specialization to Hive AgentType
        specialization_mapping = {
            "general_ai": AgentType.GENERAL_AI,
            "reasoning": AgentType.REASONING,
            "code_analysis": AgentType.PROFILER,
            "documentation": AgentType.DOCS_WRITER,
            "testing": AgentType.TESTER,
            "cli_gemini": AgentType.CLI_GEMINI
        }

        # Unknown specializations fall back to the general-purpose type.
        hive_specialty = specialization_mapping.get(agent_data.specialization, AgentType.GENERAL_AI)

        # Create Hive Agent object
        hive_agent = Agent(
            id=agent_data.id,
            endpoint=f"cli://{agent_data.host}",
            model=agent_data.model,
            specialty=hive_specialty,
            max_concurrent=agent_data.max_concurrent,
            current_tasks=0,
            agent_type="cli",
            cli_config=cli_config
        )

        # Store in database
        db_agent = ORMAgent(
            id=hive_agent.id,
            name=f"{agent_data.host}-{agent_data.agent_type}",
            endpoint=hive_agent.endpoint,
            model=hive_agent.model,
            specialty=hive_agent.specialty.value,
            specialization=hive_agent.specialty.value,
            max_concurrent=hive_agent.max_concurrent,
            current_tasks=hive_agent.current_tasks,
            agent_type=hive_agent.agent_type,
            cli_config=hive_agent.cli_config
        )

        db.add(db_agent)
        db.commit()
        db.refresh(db_agent)

        # Register with CLI manager
        try:
            cli_manager.create_cli_agent(agent_data.id, cli_config)
        except Exception:
            # The row is already committed, so the rollback in the outer handler
            # cannot undo it. Remove it explicitly; otherwise the caller gets a
            # 500 now and an "already exists" error on every retry.
            try:
                db.delete(db_agent)
                db.commit()
            except Exception as cleanup_error:
                db.rollback()
                print(f"⚠️ Failed to remove CLI agent '{agent_data.id}' after registration error: {cleanup_error}")
            raise

        return CliAgentRegistrationResponse(
            agent_id=agent_data.id,
            endpoint=hive_agent.endpoint,
            health_check=health,
            message=f"CLI agent '{agent_data.id}' registered successfully on host '{agent_data.host}'"
        )

    except HTTPException:
        raise
    except Exception as e:
        db.rollback()
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to register CLI agent: {str(e)}"
        ) from e


@router.post(
    "/register-predefined",
    status_code=status.HTTP_201_CREATED,
    summary="Register predefined CLI agents",
    description="""
    Register a set of predefined CLI agents for common Hive cluster configurations.

    This endpoint provides a convenient way to quickly set up standard CLI agents
    for typical Hive deployments, including common host configurations.

    **Predefined Agent Sets:**
    - **Standard Gemini**: walnut-gemini and ironwood-gemini agents
    - **Development**: Local development CLI agents for testing
    - **Production**: Production-optimized CLI agent configurations
    - **Research**: High-performance agents for research workloads

    **Default Configurations:**
    - Walnut host with Gemini 2.5 Pro model
    - Ironwood host with Gemini 2.5 Pro model
    - Standard timeouts and resource limits
    - General AI specialization with reasoning capabilities

    **Use Cases:**
    - Quick cluster setup and initialization
    - Standard development environment configuration
    - Testing and evaluation deployments
    - Template-based agent provisioning
    """,
    responses={
        201: {"description": "Predefined CLI agents registered successfully"},
        400: {"model": ErrorResponse, "description": "Configuration conflict or validation error"},
        500: {"model": ErrorResponse, "description": "Failed to register predefined agents"}
    }
)
async def register_predefined_cli_agents(
    db: Session = Depends(get_db),
    current_user: Dict[str, Any] = Depends(get_current_user_context)
):
    """
    Register the built-in set of CLI agents (walnut-gemini and ironwood-gemini).

    Each preset is registered independently through ``register_cli_agent``; a
    failure for one preset is recorded in the results and does not stop the
    remaining presets from being attempted.

    Args:
        db: Database session
        current_user: Current authenticated user context

    Returns:
        Dict with a per-agent ``results`` list (status ``success``, ``skipped``
        or ``failed``), the attempted and successful counts, and a timestamp.

    Raises:
        HTTPException: 500 if an error escapes the per-agent handling
    """
    try:
        presets = [
            {
                "id": "walnut-gemini",
                "host": "walnut",
                "node_version": "v20.11.0",
                "model": "gemini-2.5-pro",
                "specialization": "general_ai",
                "agent_type": "gemini"
            },
            {
                "id": "ironwood-gemini",
                "host": "ironwood",
                "node_version": "v20.11.0",
                "model": "gemini-2.5-pro",
                "specialization": "reasoning",
                "agent_type": "gemini"
            }
        ]

        results = []

        for preset in presets:
            # One outcome record per preset; the branches below fill in the rest.
            outcome = {"agent_id": preset["id"]}
            try:
                request = CliAgentRegistrationRequest(**preset)
                registration = await register_cli_agent(request, db, current_user)
                details = registration.dict()
                outcome.update(status="success", details=details)
            except HTTPException as http_error:
                # A 409 means the preset is already registered, which is not a failure.
                if http_error.status_code == 409:
                    outcome.update(status="skipped", reason="Agent already exists")
                else:
                    outcome.update(status="failed", error=str(http_error.detail))
            except Exception as error:
                outcome.update(status="failed", error=str(error))
            results.append(outcome)

        succeeded = sum(1 for outcome in results if outcome["status"] == "success")

        return {
            "status": "completed",
            "message": f"Registered {succeeded} predefined CLI agents",
            "results": results,
            "total_attempted": len(presets),
            "successful": succeeded,
            "timestamp": datetime.utcnow().isoformat()
        }

    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to register predefined CLI agents: {str(e)}"
        )


@router.post(
    "/{agent_id}/health-check",
    response_model=CliAgentHealthResponse,
    status_code=status.HTTP_200_OK,
    summary="Perform CLI agent health check",
    description="""
    Perform a comprehensive health check on a specific CLI agent.

    This endpoint tests CLI agent connectivity, performance, and functionality
    to ensure optimal operation and identify potential issues.

    **Health Check Components:**
    - **Connectivity**: SSH connection and CLI tool accessibility
    - **Performance**: Response time and throughput measurements
    - **Resource Usage**: Memory, CPU, and disk utilization
    - **Model Access**: AI model availability and response quality
    - **Configuration**: Validation of agent settings and parameters

    **Performance Metrics:**
    - Average response time for standard requests
    - Success rate over recent operations
    - Error rate and failure analysis
    - Resource utilization trends
    - Network latency and stability

    **Health Status Indicators:**
    - `healthy`: Agent fully operational and performing well
    - `degraded`: Agent operational but with performance issues
    - `unhealthy`: Agent experiencing significant problems
    - `offline`: Agent not responding or inaccessible

    **Use Cases:**
    - Troubleshoot connectivity and performance issues
    - Monitor agent health for alerting and automation
    - Validate configuration changes and updates
    - Gather performance data for optimization
    - Verify agent readiness for task assignment
    """,
    responses={
        200: {"description": "Health check completed successfully"},
        404: {"model": ErrorResponse, "description": "CLI agent not found"},
        503: {"model": ErrorResponse, "description": "CLI agent unhealthy or unreachable"},
        500: {"model": ErrorResponse, "description": "Health check failed"}
    }
)
async def health_check_cli_agent(
    agent_id: str,
    deep_check: bool = Query(False, description="Perform deep health check with extended testing"),
    db: Session = Depends(get_db),
    current_user: Dict[str, Any] = Depends(get_current_user_context)
) -> CliAgentHealthResponse:
    """
    Perform a health check on a specific CLI agent.

    Without ``deep_check`` the response consists of fixed baseline values plus
    the agent's stored ``node_version``. With ``deep_check`` a temporary agent
    is created from the stored configuration, its ``health_check()`` result is
    merged over the baseline, and the temporary agent is always cleaned up. A
    failing deep check is reported in the response instead of failing the request.

    Args:
        agent_id: Unique identifier of the CLI agent to check
        deep_check: Whether to perform extended deep health checking
        db: Database session
        current_user: Current authenticated user context

    Returns:
        CliAgentHealthResponse: Health status and performance metrics

    Raises:
        HTTPException: 500 if the health check fails. If no CLI agent has this
            ID, the error built by ``agent_not_found_error`` is raised instead.
    """
    # Verify agent exists
    db_agent = db.query(ORMAgent).filter(
        ORMAgent.id == agent_id,
        ORMAgent.agent_type == "cli"
    ).first()

    if not db_agent:
        raise agent_not_found_error(agent_id)

    try:
        # Get CLI agent manager
        cli_manager = get_cli_agent_manager()

        # An agent stored without CLI configuration must not crash the check.
        cli_config = db_agent.cli_config or {}

        # Baseline health values (static apart from node_version and the timestamp)
        health_status = {
            "cli_healthy": True,
            "connectivity": "excellent",
            "response_time": 1.2,
            "node_version": cli_config.get("node_version", "unknown"),
            "memory_usage": "245MB",
            "cpu_usage": "12%",
            "last_check": datetime.utcnow().isoformat()
        }

        performance_metrics = {
            "avg_response_time": 2.1,
            "requests_per_hour": 45,
            "success_rate": 98.7,
            "error_rate": 1.3,
            "uptime_percentage": 99.5,
            "total_requests": 1250,
            "failed_requests": 16
        }

        # If deep check requested, perform additional testing
        if deep_check:
            try:
                # Create temporary test agent for deep checking
                test_agent = cli_manager.cli_factory.create_agent(f"health-{agent_id}", cli_config)
                try:
                    detailed_health = await test_agent.health_check()
                finally:
                    # Release the temporary agent even when the health check raises.
                    await test_agent.cleanup()

                # Merge detailed health results
                health_status.update(detailed_health)
                health_status["deep_check_performed"] = True

            except Exception as e:
                # Best effort: report the failure in the payload, keep the baseline result.
                health_status["deep_check_error"] = str(e)
                health_status["deep_check_performed"] = False

        return CliAgentHealthResponse(
            agent_id=agent_id,
            health_status=health_status,
            performance_metrics=performance_metrics,
            message=f"Health check completed for CLI agent '{agent_id}'"
        )

    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Health check failed for CLI agent '{agent_id}': {str(e)}"
        ) from e


@router.delete(
    "/{agent_id}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Unregister a CLI agent",
    description="""
    Unregister and remove a CLI agent from the Hive cluster.

    This endpoint safely removes a CLI agent by stopping active tasks,
    cleaning up resources, and removing configuration data.

    **Unregistration Process:**
    1. **Task Validation**: Check for active tasks and handle appropriately
    2. **Graceful Shutdown**: Allow running tasks to complete or cancel safely
    3. **Resource Cleanup**: Clean up SSH connections and temporary resources
    4. **Configuration Removal**: Remove agent configuration and metadata
    5. **Audit Logging**: Log unregistration event for compliance

    **Safety Measures:**
    - Active tasks are checked and handled appropriately
    - Graceful shutdown procedures for running operations
    - Resource cleanup to prevent connection leaks
    - Audit trail maintenance for operational history

    **Use Cases:**
    - Remove offline or problematic CLI agents
    - Scale down cluster capacity
    - Perform maintenance on remote hosts
    - Clean up test or temporary agents
    - Reorganize cluster configuration
    """,
    responses={
        204: {"description": "CLI agent unregistered successfully"},
        404: {"model": ErrorResponse, "description": "CLI agent not found"},
        409: {"model": ErrorResponse, "description": "CLI agent has active tasks"},
        500: {"model": ErrorResponse, "description": "CLI agent unregistration failed"}
    }
)
async def unregister_cli_agent(
    agent_id: str,
    force: bool = Query(False, description="Force unregistration even with active tasks"),
    db: Session = Depends(get_db),
    current_user: Dict[str, Any] = Depends(get_current_user_context)
):
    """
    Unregister a CLI agent from the Hive cluster.

    The agent is refused with a 409 while it has active tasks unless ``force``
    is set. Cleanup in the CLI agent manager is best effort: a failure there
    is printed as a warning and the database row is still deleted.

    Args:
        agent_id: Unique identifier of the CLI agent to unregister
        force: Whether to force removal even with active tasks
        db: Database session
        current_user: Current authenticated user context

    Raises:
        HiveAPIException: 409 if the agent has active tasks and ``force`` is false
        HTTPException: 500 if removing the agent from the database fails. If no
            CLI agent has this ID, the error built by ``agent_not_found_error``
            is raised instead.
    """
    # Verify agent exists
    db_agent = db.query(ORMAgent).filter(
        ORMAgent.id == agent_id,
        ORMAgent.agent_type == "cli"
    ).first()

    if not db_agent:
        raise agent_not_found_error(agent_id)

    # Checked outside the generic try block so the conflict can never be
    # re-wrapped as a 500; a missing task count is treated as "no active tasks".
    active_tasks = db_agent.current_tasks or 0
    if not force and active_tasks > 0:
        raise HiveAPIException(
            status_code=status.HTTP_409_CONFLICT,
            detail=f"CLI agent '{agent_id}' has {active_tasks} active tasks. Use force=true to override.",
            error_code="AGENT_HAS_ACTIVE_TASKS",
            details={"agent_id": agent_id, "active_tasks": active_tasks}
        )

    try:
        # Get CLI agent manager and clean up
        try:
            cli_manager = get_cli_agent_manager()
            # Clean up CLI agent resources
            await cli_manager.remove_cli_agent(agent_id)
        except Exception as e:
            # Best effort: a manager-side failure must not prevent removing the row.
            print(f"Warning: Failed to cleanup CLI agent resources: {e}")

        # Remove from database
        db.delete(db_agent)
        db.commit()

    except HTTPException:
        raise
    except Exception as e:
        db.rollback()
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to unregister CLI agent: {str(e)}"
        ) from e