- Agent roles integration progress - Various backend and frontend updates - Storybook cache cleanup 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
632 lines
23 KiB
Python
632 lines
23 KiB
Python
"""
|
|
Hive API - Agent Management Endpoints
|
|
|
|
This module provides comprehensive API endpoints for managing Ollama-based AI agents
|
|
in the Hive distributed orchestration platform. It handles agent registration,
|
|
status monitoring, and lifecycle management.
|
|
|
|
Key Features:
|
|
- Agent registration and validation
|
|
- Real-time status monitoring
|
|
- Comprehensive error handling
|
|
- Detailed API documentation
|
|
- Authentication and authorization
|
|
"""
|
|
|
|
from fastapi import APIRouter, HTTPException, Request, Depends, status
|
|
from typing import List, Dict, Any
|
|
import time
|
|
import logging
|
|
from ..models.agent import Agent
|
|
from ..models.responses import (
|
|
AgentListResponse,
|
|
AgentRegistrationResponse,
|
|
AgentRegistrationRequest,
|
|
ErrorResponse,
|
|
AgentModel
|
|
)
|
|
from ..core.auth_deps import get_current_user_context
|
|
|
|
router = APIRouter()
|
|
|
|
from app.core.database import SessionLocal
|
|
from app.models.agent import Agent as ORMAgent
|
|
from ..services.agent_service import AgentType
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@router.get(
    "/agents",
    response_model=AgentListResponse,
    status_code=status.HTTP_200_OK,
    summary="List all registered agents",
    description="""
    Retrieve a comprehensive list of all registered agents in the Hive cluster.

    This endpoint returns detailed information about each agent including:
    - Agent identification and endpoint information
    - Current status and utilization metrics
    - Specialization and capacity limits
    - Health and heartbeat information

    **Use Cases:**
    - Monitor cluster capacity and agent health
    - Identify available agents for task assignment
    - Track agent utilization and performance
    - Debug agent connectivity issues

    **Response Notes:**
    - Agents are returned in registration order
    - Status reflects real-time agent availability
    - Utilization is calculated as current_tasks / max_concurrent
    """,
    responses={
        200: {"description": "List of agents retrieved successfully"},
        500: {"model": ErrorResponse, "description": "Internal server error"}
    }
)
async def get_agents(
    request: Request,
    current_user: Dict[str, Any] = Depends(get_current_user_context)
) -> AgentListResponse:
    """
    Return every agent row stored in the database as an API model.

    Each ORM row is mapped onto an ``AgentModel``. ``status`` is "available"
    while ``current_tasks`` is below ``max_concurrent`` and "busy" otherwise;
    ``utilization`` is ``current_tasks / max_concurrent``, or 0.0 when
    ``max_concurrent`` is not positive.

    Args:
        request: FastAPI request object (not read by this handler).
        current_user: Authenticated user context supplied by the dependency;
            it is required but not otherwise inspected here.

    Returns:
        AgentListResponse: The converted agents, their count, and a summary message.

    Raises:
        HTTPException: 500 carrying the error text if the query or the
            conversion fails for any reason.
    """
    try:
        with SessionLocal() as session:
            # Build the response models while the session is still open.
            agents_list = [
                AgentModel(
                    id=row.id,
                    endpoint=row.endpoint,
                    model=row.model,
                    specialty=row.specialty,
                    max_concurrent=row.max_concurrent,
                    current_tasks=row.current_tasks,
                    status="available" if row.current_tasks < row.max_concurrent else "busy",
                    # Guard against division by zero for agents with no capacity.
                    utilization=row.current_tasks / row.max_concurrent if row.max_concurrent > 0 else 0.0
                )
                for row in session.query(ORMAgent).all()
            ]

        return AgentListResponse(
            agents=agents_list,
            total=len(agents_list),
            message=f"Retrieved {len(agents_list)} registered agents"
        )
    except Exception as e:
        # Any failure (database or model validation) is reported as a 500.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve agents: {str(e)}"
        )
|
|
|
|
|
|
@router.post(
    "/agents",
    response_model=AgentRegistrationResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Register a new Ollama agent",
    description="""
    Register a new Ollama-based AI agent with the Hive cluster.

    This endpoint allows you to add new Ollama agents to the distributed AI network.
    The agent will be validated for connectivity and model availability before registration.

    **Agent Registration Process:**
    1. Validate agent connectivity and model availability
    2. Add agent to the coordinator's active agent pool
    3. Store agent configuration in the database
    4. Perform initial health check
    5. Return registration confirmation with agent details

    **Supported Agent Specializations:**
    - `kernel_dev`: Linux kernel development and debugging
    - `pytorch_dev`: PyTorch model development and optimization
    - `profiler`: Performance profiling and optimization
    - `docs_writer`: Documentation generation and technical writing
    - `tester`: Automated testing and quality assurance
    - `general_ai`: General-purpose AI assistance
    - `reasoning`: Complex reasoning and problem-solving tasks

    **Requirements:**
    - Agent endpoint must be accessible from the Hive cluster
    - Specified model must be available on the target Ollama instance
    - Agent ID must be unique across the cluster
    """,
    responses={
        201: {"description": "Agent registered successfully"},
        400: {"model": ErrorResponse, "description": "Invalid agent configuration"},
        409: {"model": ErrorResponse, "description": "Agent ID already exists"},
        503: {"model": ErrorResponse, "description": "Agent endpoint unreachable"}
    }
)
async def register_agent(
    agent_data: AgentRegistrationRequest,
    request: Request,
    current_user: Dict[str, Any] = Depends(get_current_user_context)
) -> AgentRegistrationResponse:
    """
    Register a new Ollama agent in the Hive cluster.

    The handler resolves the coordinator, rejects duplicate agent IDs, builds
    an ``Agent`` from the request and hands it to ``hive_coordinator.add_agent``.
    This handler does not write to the database itself.
    NOTE(review): persistence presumably happens inside ``add_agent`` - confirm.

    Args:
        agent_data: Agent configuration and registration details
        request: FastAPI request object for accessing app state
        current_user: Current authenticated user context (required, not inspected)

    Returns:
        AgentRegistrationResponse: Registration confirmation with agent details

    Raises:
        HTTPException:
            - 503 if no coordinator is available
            - 409 if an agent with the same ID already exists
            - 400 if building or adding the agent raises ``ValueError``
            - 500 for any other failure
    """
    # Prefer the coordinator stored on the application state.
    hive_coordinator = getattr(request.app.state, 'hive_coordinator', None)
    if not hive_coordinator:
        # Fallback to global coordinator if app state not available
        from ..main import unified_coordinator
        hive_coordinator = unified_coordinator

    if not hive_coordinator:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coordinator service unavailable"
        )

    try:
        # Check if agent ID already exists
        with SessionLocal() as db:
            existing_agent = db.query(ORMAgent).filter(ORMAgent.id == agent_data.id).first()
            if existing_agent:
                raise HTTPException(
                    status_code=status.HTTP_409_CONFLICT,
                    detail=f"Agent with ID '{agent_data.id}' already exists"
                )

        # Create agent instance
        agent = Agent(
            id=agent_data.id,
            endpoint=agent_data.endpoint,
            model=agent_data.model,
            # An unknown specialty value raises ValueError here -> 400 below.
            specialty=AgentType(agent_data.specialty.value),
            max_concurrent=agent_data.max_concurrent,
        )

        # Add agent to coordinator
        hive_coordinator.add_agent(agent)

        return AgentRegistrationResponse(
            agent_id=agent.id,
            endpoint=agent.endpoint,
            message=f"Agent '{agent.id}' registered successfully with specialty '{agent_data.specialty}'"
        )

    except HTTPException:
        # Let deliberate HTTP errors (e.g. the 409 above) through unchanged;
        # without this the generic handler below would turn them into a 500.
        raise
    except ValueError as e:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Invalid agent configuration: {str(e)}"
        ) from e
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to register agent: {str(e)}"
        ) from e
|
|
|
|
|
|
@router.get(
    "/agents/{agent_id}",
    response_model=AgentModel,
    status_code=status.HTTP_200_OK,
    summary="Get specific agent details",
    description="""
    Retrieve detailed information about a specific agent by its ID.

    This endpoint provides comprehensive status information for a single agent,
    including real-time metrics, health status, and configuration details.

    **Returned Information:**
    - Agent identification and configuration
    - Current status and utilization
    - Recent activity and performance metrics
    - Health check results and connectivity status

    **Use Cases:**
    - Monitor specific agent performance
    - Debug agent connectivity issues
    - Verify agent configuration
    - Check agent availability for task assignment
    """,
    responses={
        200: {"description": "Agent details retrieved successfully"},
        404: {"model": ErrorResponse, "description": "Agent not found"},
        500: {"model": ErrorResponse, "description": "Internal server error"}
    }
)
async def get_agent(
    agent_id: str,
    request: Request,
    current_user: Dict[str, Any] = Depends(get_current_user_context)
) -> AgentModel:
    """
    Look up one agent row by ID and return it as an ``AgentModel``.

    ``status`` is "available" while ``current_tasks`` is below
    ``max_concurrent`` and "busy" otherwise; ``utilization`` is
    ``current_tasks / max_concurrent``, or 0.0 when ``max_concurrent`` is not
    positive.

    Args:
        agent_id: Unique identifier of the agent to retrieve
        request: FastAPI request object (not read by this handler)
        current_user: Authenticated user context (required, not inspected)

    Returns:
        AgentModel: The stored agent fields plus the derived status and utilization

    Raises:
        HTTPException: 404 if no row matches ``agent_id``; 500 for any other failure
    """
    try:
        with SessionLocal() as session:
            matches = session.query(ORMAgent).filter(ORMAgent.id == agent_id)
            record = matches.first()

            # Guard clause: unknown IDs are a client error, not a server error.
            if not record:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=f"Agent with ID '{agent_id}' not found"
                )

            if record.current_tasks < record.max_concurrent:
                availability = "available"
            else:
                availability = "busy"

            # Avoid dividing by zero for agents configured with no capacity.
            if record.max_concurrent > 0:
                load = record.current_tasks / record.max_concurrent
            else:
                load = 0.0

            return AgentModel(
                id=record.id,
                endpoint=record.endpoint,
                model=record.model,
                specialty=record.specialty,
                max_concurrent=record.max_concurrent,
                current_tasks=record.current_tasks,
                status=availability,
                utilization=load
            )

    except HTTPException:
        # Preserve the 404 raised above instead of wrapping it as a 500.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to retrieve agent: {str(e)}"
        )
|
|
|
|
|
|
@router.delete(
    "/agents/{agent_id}",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Unregister an agent",
    description="""
    Remove an agent from the Hive cluster.

    This endpoint safely removes an agent from the cluster by:
    1. Checking for active tasks and optionally waiting for completion
    2. Removing the agent from the coordinator's active pool
    3. Cleaning up database records
    4. Confirming successful removal

    **Safety Measures:**
    - Active tasks are checked before removal
    - Graceful shutdown procedures are followed
    - Database consistency is maintained
    - Error handling for cleanup failures

    **Use Cases:**
    - Remove offline or problematic agents
    - Scale down cluster capacity
    - Perform maintenance on agent nodes
    - Clean up test or temporary agents
    """,
    responses={
        204: {"description": "Agent unregistered successfully"},
        404: {"model": ErrorResponse, "description": "Agent not found"},
        409: {"model": ErrorResponse, "description": "Agent has active tasks"},
        500: {"model": ErrorResponse, "description": "Internal server error"}
    }
)
async def unregister_agent(
    agent_id: str,
    request: Request,
    force: bool = False,
    current_user: Dict[str, Any] = Depends(get_current_user_context)
):
    """
    Remove an agent from the coordinator and delete its database row.

    The agent is first removed from the coordinator via ``remove_agent`` and
    then deleted from the database in the same session that looked it up.

    Args:
        agent_id: Unique identifier of the agent to remove
        request: FastAPI request object, used to reach the coordinator on app state
        force: When true, skip the active-task check and remove the agent anyway
        current_user: Authenticated user context (required, not inspected)

    Raises:
        HTTPException:
            - 503 if no coordinator is available
            - 404 if no row matches ``agent_id``
            - 409 if the agent reports active tasks and ``force`` is false
            - 500 for any other failure
    """
    # Resolve the coordinator: app state first, module-level fallback second.
    hive_coordinator = getattr(request.app.state, 'hive_coordinator', None)
    if not hive_coordinator:
        from ..main import unified_coordinator
        hive_coordinator = unified_coordinator

    if not hive_coordinator:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coordinator service unavailable"
        )

    try:
        with SessionLocal() as session:
            lookup = session.query(ORMAgent).filter(ORMAgent.id == agent_id)
            db_agent = lookup.first()
            if not db_agent:
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=f"Agent with ID '{agent_id}' not found"
                )

            # Busy agents are protected unless the caller explicitly forces removal.
            if not force:
                if db_agent.current_tasks > 0:
                    raise HTTPException(
                        status_code=status.HTTP_409_CONFLICT,
                        detail=f"Agent '{agent_id}' has {db_agent.current_tasks} active tasks. Use force=true to override."
                    )

            # Coordinator first, then the persistent record.
            hive_coordinator.remove_agent(agent_id)
            session.delete(db_agent)
            session.commit()

    except HTTPException:
        # 404 / 409 raised above must reach the client unchanged.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to unregister agent: {str(e)}"
        )
|
|
|
|
|
|
@router.post(
    "/agents/heartbeat",
    status_code=status.HTTP_200_OK,
    summary="Agent heartbeat update",
    description="""
    Update agent status and maintain registration through periodic heartbeat.

    This endpoint allows agents to:
    - Confirm they are still online and responsive
    - Update their current status and metrics
    - Report any capability or configuration changes
    - Maintain their registration in the cluster

    Agents should call this endpoint every 30-60 seconds to maintain
    their active status in the Hive cluster.
    """,
    responses={
        200: {"description": "Heartbeat received successfully"},
        404: {"model": ErrorResponse, "description": "Agent not registered"},
        400: {"model": ErrorResponse, "description": "Invalid heartbeat data"}
    }
)
async def agent_heartbeat(
    heartbeat_data: Dict[str, Any],
    request: Request
):
    """
    Record a heartbeat sent by an agent.

    The coordinator's ``agent_service`` (when present) is told about the
    heartbeat. If the payload carries ``current_tasks``, the agent's row is
    updated with that value and a fresh ``last_seen``. That update is
    best-effort: a failure is logged as a warning and the heartbeat still
    succeeds.

    Note: this handler never raises the 404 listed in ``responses``; it does
    not check whether ``agent_id`` is registered.

    Args:
        heartbeat_data: JSON payload; ``agent_id`` is required and
            ``current_tasks`` is optional. Other keys are ignored.
        request: FastAPI request object, used to reach the coordinator on app state

    Returns:
        dict: ``status``, ``message`` and a ``timestamp`` from ``time.time()``

    Raises:
        HTTPException: 400 if ``agent_id`` is missing or empty, 503 if no
            coordinator is available, 500 for any other failure
    """
    agent_id = heartbeat_data.get("agent_id")
    if not agent_id:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Missing agent_id in heartbeat data"
        )

    # Resolve the coordinator: app state first, module-level fallback second.
    hive_coordinator = getattr(request.app.state, 'hive_coordinator', None)
    if not hive_coordinator:
        from ..main import unified_coordinator
        hive_coordinator = unified_coordinator

    if not hive_coordinator:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coordinator service unavailable"
        )

    try:
        # Let the agent service (if any) record the heartbeat.
        heartbeat_sink = hive_coordinator.agent_service
        if heartbeat_sink:
            heartbeat_sink.update_agent_heartbeat(agent_id)

        # Raw SQL is used for the task count so the ORM mapping (and its
        # role column) is not involved.
        if "current_tasks" in heartbeat_data:
            reported_tasks = heartbeat_data["current_tasks"]
            try:
                with SessionLocal() as session:
                    from sqlalchemy import text
                    update_stmt = text(
                        "UPDATE agents SET current_tasks = :current_tasks, last_seen = NOW() WHERE id = :agent_id"
                    )
                    update_params = {
                        "current_tasks": reported_tasks,
                        "agent_id": agent_id
                    }
                    session.execute(update_stmt, update_params)
                    session.commit()
            except Exception as e:
                # Best-effort: the heartbeat itself still counts.
                logger.warning(f"Could not update agent tasks: {e}")

        return {
            "status": "success",
            "message": f"Heartbeat received from agent '{agent_id}'",
            "timestamp": time.time()
        }

    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to process heartbeat: {str(e)}"
        )
|
|
|
|
|
|
@router.post(
    "/agents/auto-register",
    response_model=AgentRegistrationResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Automatic agent registration",
    description="""
    Register an agent automatically with capability detection.

    This endpoint is designed for Bzzz agents running as systemd services
    to automatically register themselves with the Hive coordinator.

    Features:
    - Automatic capability detection based on available models
    - Network discovery support
    - Retry-friendly for service startup scenarios
    - Health validation before registration
    """,
    responses={
        201: {"description": "Agent auto-registered successfully"},
        400: {"model": ErrorResponse, "description": "Invalid agent configuration"},
        409: {"model": ErrorResponse, "description": "Agent already registered"},
        503: {"model": ErrorResponse, "description": "Agent endpoint unreachable"}
    }
)
async def auto_register_agent(
    agent_data: Dict[str, Any],
    request: Request
) -> AgentRegistrationResponse:
    """
    Automatically register (or refresh) a Bzzz agent from a self-reported payload.

    Steps, as implemented below:
      1. Require ``agent_id`` and ``endpoint`` in the payload.
      2. Require a coordinator (app state or the ``..main`` fallback). It is
         only checked for availability; the registration itself is written
         straight to the database.
      3. If a row with this ID already exists, update its endpoint and
         ``last_seen`` and return. A failure of this lookup is logged and the
         handler falls through to the upsert.
      4. Use ``models`` from the payload, or ask ``{endpoint}/api/tags``
         (5 second timeout) when none were supplied. Detection is best-effort.
      5. Choose a specialty from the model names / hostname and upsert the row.

    An already-registered agent is updated rather than rejected, so this
    handler never raises the 409 listed in ``responses``.

    Args:
        agent_data: Agent configuration. ``agent_id`` and ``endpoint`` are
            required; ``hostname``, ``models`` and ``max_concurrent``
            (default 2) are optional.
        request: FastAPI request object, used to reach the coordinator on app state

    Returns:
        AgentRegistrationResponse: Registration confirmation

    Raises:
        HTTPException: 400 if a required field is missing, 503 if no
            coordinator is available, 500 if the database write or any other
            step fails
    """
    # Extract required fields
    agent_id = agent_data.get("agent_id")
    endpoint = agent_data.get("endpoint")
    # hostname is optional: normalise a missing/None value to "" so the
    # specialty heuristic below cannot fail with AttributeError.
    hostname = str(agent_data.get("hostname") or "")

    if not agent_id or not endpoint:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Missing required fields: agent_id, endpoint"
        )

    # Access coordinator
    hive_coordinator = getattr(request.app.state, 'hive_coordinator', None)
    if not hive_coordinator:
        from ..main import unified_coordinator
        hive_coordinator = unified_coordinator

    if not hive_coordinator:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coordinator service unavailable"
        )

    try:
        # Check if agent already exists - use basic query to avoid role column
        try:
            with SessionLocal() as db:
                from sqlalchemy import text
                existing_agent = db.execute(text(
                    "SELECT id, endpoint FROM agents WHERE id = :agent_id LIMIT 1"
                ), {"agent_id": agent_id}).fetchone()
                if existing_agent:
                    # Update existing agent
                    db.execute(text(
                        "UPDATE agents SET endpoint = :endpoint, last_seen = NOW() WHERE id = :agent_id"
                    ), {"endpoint": endpoint, "agent_id": agent_id})
                    db.commit()

                    return AgentRegistrationResponse(
                        agent_id=agent_id,
                        endpoint=endpoint,
                        message=f"Agent '{agent_id}' registration updated successfully"
                    )
        except Exception as e:
            # Best-effort: the upsert below also copes with an existing row.
            logger.warning(f"Could not check existing agent: {e}")

        # Detect capabilities and models
        models = agent_data.get("models", [])
        if not models:
            # Try to detect models from endpoint
            try:
                import aiohttp
                async with aiohttp.ClientSession() as session:
                    async with session.get(f"{endpoint}/api/tags", timeout=aiohttp.ClientTimeout(total=5)) as response:
                        if response.status == 200:
                            tags_data = await response.json()
                            models = [model["name"] for model in tags_data.get("models", [])]
            except Exception as e:
                # Detection is optional; registration continues with no models.
                logger.warning(f"Could not detect models for {agent_id}: {e}")

        # Determine specialty based on models or hostname.
        # Anything that is not code- or gemma-flavoured (including plain
        # llama models) falls back to the general-purpose specialty.
        models_text = str(models).lower()
        if "codellama" in models_text or "code" in hostname.lower():
            specialty = AgentType.KERNEL_DEV
        elif "gemma" in models_text:
            specialty = AgentType.PYTORCH_DEV
        else:
            specialty = AgentType.GENERAL_AI

        # Insert agent directly into database
        try:
            with SessionLocal() as db:
                from sqlalchemy import text
                # Insert new agent using raw SQL to avoid role column issues
                db.execute(text("""
                    INSERT INTO agents (id, name, endpoint, model, specialty, max_concurrent, current_tasks, status, created_at, last_seen)
                    VALUES (:agent_id, :name, :endpoint, :model, :specialty, :max_concurrent, 0, 'active', NOW(), NOW())
                    ON CONFLICT (id) DO UPDATE SET
                        endpoint = EXCLUDED.endpoint,
                        model = EXCLUDED.model,
                        specialty = EXCLUDED.specialty,
                        max_concurrent = EXCLUDED.max_concurrent,
                        last_seen = NOW()
                """), {
                    "agent_id": agent_id,
                    "name": agent_id,  # Use agent_id as name
                    "endpoint": endpoint,
                    "model": models[0] if models else "unknown",
                    "specialty": specialty.value,
                    "max_concurrent": agent_data.get("max_concurrent", 2)
                })
                db.commit()

                return AgentRegistrationResponse(
                    agent_id=agent_id,
                    endpoint=endpoint,
                    message=f"Agent '{agent_id}' auto-registered successfully with specialty '{specialty.value}'"
                )
        except Exception as e:
            logger.error(f"Database insert failed: {e}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Failed to register agent in database: {str(e)}"
            ) from e

    except HTTPException:
        # Keep the specific error raised above; the generic handler below
        # would otherwise re-wrap it and garble the detail message.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to auto-register agent: {str(e)}"
        ) from e