WIP: Save current work before CHORUS rebrand

- Agent roles integration progress
- Various backend and frontend updates
- Storybook cache cleanup

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
anthonyrawlins
2025-08-01 02:20:56 +10:00
parent 1e81daaf18
commit b6bff318d9
740 changed files with 90022 additions and 279523 deletions

View File

@@ -15,6 +15,8 @@ Key Features:
from fastapi import APIRouter, HTTPException, Request, Depends, status
from typing import List, Dict, Any
import time
import logging
from ..models.agent import Agent
from ..models.responses import (
AgentListResponse,
@@ -29,6 +31,9 @@ router = APIRouter()
from app.core.database import SessionLocal
from app.models.agent import Agent as ORMAgent
from ..services.agent_service import AgentType
logger = logging.getLogger(__name__)
@router.get(
@@ -384,4 +389,244 @@ async def unregister_agent(
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to unregister agent: {str(e)}"
)
@router.post(
"/agents/heartbeat",
status_code=status.HTTP_200_OK,
summary="Agent heartbeat update",
description="""
Update agent status and maintain registration through periodic heartbeat.
This endpoint allows agents to:
- Confirm they are still online and responsive
- Update their current status and metrics
- Report any capability or configuration changes
- Maintain their registration in the cluster
Agents should call this endpoint every 30-60 seconds to maintain
their active status in the Hive cluster.
""",
responses={
200: {"description": "Heartbeat received successfully"},
404: {"model": ErrorResponse, "description": "Agent not registered"},
400: {"model": ErrorResponse, "description": "Invalid heartbeat data"}
}
)
async def agent_heartbeat(
heartbeat_data: Dict[str, Any],
request: Request
):
"""
Process agent heartbeat to maintain registration.
Args:
heartbeat_data: Agent status and metrics data
request: FastAPI request object
Returns:
Success confirmation and any coordinator updates
"""
agent_id = heartbeat_data.get("agent_id")
if not agent_id:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Missing agent_id in heartbeat data"
)
# Access coordinator
hive_coordinator = getattr(request.app.state, 'hive_coordinator', None)
if not hive_coordinator:
from ..main import unified_coordinator
hive_coordinator = unified_coordinator
if not hive_coordinator:
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Coordinator service unavailable"
)
try:
# Update agent heartbeat timestamp
agent_service = hive_coordinator.agent_service
if agent_service:
agent_service.update_agent_heartbeat(agent_id)
# Update current tasks if provided - use raw SQL to avoid role column
if "current_tasks" in heartbeat_data:
current_tasks = heartbeat_data["current_tasks"]
try:
with SessionLocal() as db:
from sqlalchemy import text
db.execute(text(
"UPDATE agents SET current_tasks = :current_tasks, last_seen = NOW() WHERE id = :agent_id"
), {
"current_tasks": current_tasks,
"agent_id": agent_id
})
db.commit()
except Exception as e:
logger.warning(f"Could not update agent tasks: {e}")
return {
"status": "success",
"message": f"Heartbeat received from agent '{agent_id}'",
"timestamp": time.time()
}
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to process heartbeat: {str(e)}"
)
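# Illustrative only (not part of this router): a minimal agent-side caller for the
# heartbeat endpoint above, assuming httpx is available and the agents router is
# mounted under /api as in main.py. The base URL and agent_id are placeholders.
async def example_agent_heartbeat_loop(hive_api: str = "http://localhost:8000", agent_id: str = "example-agent"):
    import asyncio
    import httpx  # assumed dependency for the example, not required by this module

    async with httpx.AsyncClient() as client:
        while True:
            try:
                resp = await client.post(
                    f"{hive_api}/api/agents/heartbeat",
                    json={"agent_id": agent_id, "current_tasks": 0},
                    timeout=10.0,
                )
                resp.raise_for_status()
            except httpx.HTTPError as exc:
                # Transient failures are expected; retry on the next tick.
                logger.warning(f"Heartbeat failed: {exc}")
            await asyncio.sleep(45)  # within the documented 30-60 second window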
@router.post(
"/agents/auto-register",
response_model=AgentRegistrationResponse,
status_code=status.HTTP_201_CREATED,
summary="Automatic agent registration",
description="""
Register an agent automatically with capability detection.
This endpoint is designed for Bzzz agents running as systemd services
to automatically register themselves with the Hive coordinator.
Features:
- Automatic capability detection based on available models
- Network discovery support
- Retry-friendly for service startup scenarios
- Health validation before registration
""",
responses={
201: {"description": "Agent auto-registered successfully"},
400: {"model": ErrorResponse, "description": "Invalid agent configuration"},
409: {"model": ErrorResponse, "description": "Agent already registered"},
503: {"model": ErrorResponse, "description": "Agent endpoint unreachable"}
}
)
async def auto_register_agent(
agent_data: Dict[str, Any],
request: Request
) -> AgentRegistrationResponse:
"""
Automatically register a Bzzz agent with the Hive coordinator.
Args:
agent_data: Agent configuration including endpoint, models, etc.
request: FastAPI request object
Returns:
AgentRegistrationResponse: Registration confirmation
"""
# Extract required fields
agent_id = agent_data.get("agent_id")
endpoint = agent_data.get("endpoint")
hostname = agent_data.get("hostname")
if not agent_id or not endpoint:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail="Missing required fields: agent_id, endpoint"
)
# Access coordinator
hive_coordinator = getattr(request.app.state, 'hive_coordinator', None)
if not hive_coordinator:
from ..main import unified_coordinator
hive_coordinator = unified_coordinator
if not hive_coordinator:
raise HTTPException(
status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
detail="Coordinator service unavailable"
)
try:
# Check if agent already exists - use basic query to avoid role column
try:
with SessionLocal() as db:
from sqlalchemy import text
existing_agent = db.execute(text(
"SELECT id, endpoint FROM agents WHERE id = :agent_id LIMIT 1"
), {"agent_id": agent_id}).fetchone()
if existing_agent:
# Update existing agent
db.execute(text(
"UPDATE agents SET endpoint = :endpoint, last_seen = NOW() WHERE id = :agent_id"
), {"endpoint": endpoint, "agent_id": agent_id})
db.commit()
return AgentRegistrationResponse(
agent_id=agent_id,
endpoint=endpoint,
message=f"Agent '{agent_id}' registration updated successfully"
)
except Exception as e:
logger.warning(f"Could not check existing agent: {e}")
# Detect capabilities and models
models = agent_data.get("models", [])
if not models:
# Try to detect models from endpoint
try:
import aiohttp
async with aiohttp.ClientSession() as session:
async with session.get(f"{endpoint}/api/tags", timeout=aiohttp.ClientTimeout(total=5)) as response:
if response.status == 200:
tags_data = await response.json()
models = [model["name"] for model in tags_data.get("models", [])]
except Exception as e:
logger.warning(f"Could not detect models for {agent_id}: {e}")
# Determine specialty based on models or hostname
specialty = AgentType.GENERAL_AI # Default
if "codellama" in str(models).lower() or "code" in hostname.lower():
specialty = AgentType.KERNEL_DEV
elif "gemma" in str(models).lower():
specialty = AgentType.PYTORCH_DEV
elif any(model for model in models if "llama" in model.lower()):
specialty = AgentType.GENERAL_AI
# Insert agent directly into database
try:
with SessionLocal() as db:
from sqlalchemy import text
# Insert new agent using raw SQL to avoid role column issues
db.execute(text("""
INSERT INTO agents (id, name, endpoint, model, specialty, max_concurrent, current_tasks, status, created_at, last_seen)
VALUES (:agent_id, :name, :endpoint, :model, :specialty, :max_concurrent, 0, 'active', NOW(), NOW())
ON CONFLICT (id) DO UPDATE SET
endpoint = EXCLUDED.endpoint,
model = EXCLUDED.model,
specialty = EXCLUDED.specialty,
max_concurrent = EXCLUDED.max_concurrent,
last_seen = NOW()
"""), {
"agent_id": agent_id,
"name": agent_id, # Use agent_id as name
"endpoint": endpoint,
"model": models[0] if models else "unknown",
"specialty": specialty.value,
"max_concurrent": agent_data.get("max_concurrent", 2)
})
db.commit()
return AgentRegistrationResponse(
agent_id=agent_id,
endpoint=endpoint,
message=f"Agent '{agent_id}' auto-registered successfully with specialty '{specialty.value}'"
)
except Exception as e:
logger.error(f"Database insert failed: {e}")
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to register agent in database: {str(e)}"
)
except Exception as e:
raise HTTPException(
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
detail=f"Failed to auto-register agent: {str(e)}"
)
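# Illustrative only: the shape of an auto-registration call a Bzzz agent might make
# at service startup. Field names mirror what auto_register_agent() reads above; the
# endpoint URL and model list are placeholder assumptions.
EXAMPLE_AUTO_REGISTRATION_PAYLOAD = {
    "agent_id": "walnut-bzzz",
    "endpoint": "http://walnut.local:11434",  # Ollama-style endpoint exposing /api/tags
    "hostname": "walnut",
    "models": ["codellama:13b"],              # optional; detected from /api/tags when omitted
    "max_concurrent": 2,
}
# POST this dict as JSON to /api/agents/auto-register: 201 on first registration, or a
# success message when the agent row already exists and is simply refreshed.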

View File

@@ -0,0 +1,287 @@
"""
Bzzz hypercore/hyperswarm log streaming API endpoints.
Provides real-time access to agent communication logs from the Bzzz network.
"""
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, HTTPException, Query
from fastapi.responses import StreamingResponse
from typing import List, Optional, Dict, Any
import asyncio
import json
import logging
import httpx
import time
from datetime import datetime, timedelta
router = APIRouter()
logger = logging.getLogger(__name__)
# Keep track of active WebSocket connections
active_connections: List[WebSocket] = []
class BzzzLogEntry:
"""Represents a Bzzz hypercore log entry"""
def __init__(self, data: Dict[str, Any]):
self.index = data.get("index", 0)
self.timestamp = data.get("timestamp", "")
self.author = data.get("author", "")
self.log_type = data.get("type", "")
self.message_data = data.get("data", {})
self.hash_value = data.get("hash", "")
self.prev_hash = data.get("prev_hash", "")
def to_chat_message(self) -> Dict[str, Any]:
"""Convert hypercore log entry to chat message format"""
# Extract message details from the log data
msg_data = self.message_data
return {
"id": f"log-{self.index}",
"senderId": msg_data.get("from_short", self.author),
"senderName": msg_data.get("from_short", self.author),
"content": self._format_message_content(),
"timestamp": self.timestamp,
"messageType": self._determine_message_type(),
"channel": msg_data.get("topic", "unknown"),
"swarmId": f"swarm-{msg_data.get('topic', 'unknown')}",
"isDelivered": True,
"isRead": True,
"logType": self.log_type,
"hash": self.hash_value
}
def _format_message_content(self) -> str:
"""Format the log entry into a readable message"""
msg_data = self.message_data
message_type = msg_data.get("message_type", self.log_type)
if message_type == "availability_broadcast":
status = msg_data.get("data", {}).get("status", "unknown")
current_tasks = msg_data.get("data", {}).get("current_tasks", 0)
max_tasks = msg_data.get("data", {}).get("max_tasks", 0)
return f"Status: {status} ({current_tasks}/{max_tasks} tasks)"
elif message_type == "capability_broadcast":
capabilities = msg_data.get("data", {}).get("capabilities", [])
models = msg_data.get("data", {}).get("models", [])
return f"Updated capabilities: {', '.join(capabilities[:3])}{'...' if len(capabilities) > 3 else ''}"
elif message_type == "task_announced":
task_data = msg_data.get("data", {})
return f"Task announced: {task_data.get('title', 'Unknown task')}"
elif message_type == "task_claimed":
task_data = msg_data.get("data", {})
return f"Task claimed: {task_data.get('title', 'Unknown task')}"
elif message_type == "role_announcement":
role = msg_data.get("data", {}).get("role", "unknown")
return f"Role announcement: {role}"
elif message_type == "collaboration":
return f"Collaboration: {msg_data.get('data', {}).get('content', 'Agent discussion')}"
elif self.log_type == "peer_joined":
return "Agent joined the network"
elif self.log_type == "peer_left":
return "Agent left the network"
else:
# Generic fallback
return f"{message_type}: {json.dumps(msg_data.get('data', {}))[:100]}{'...' if len(str(msg_data.get('data', {}))) > 100 else ''}"
def _determine_message_type(self) -> str:
"""Determine if this is a sent, received, or system message"""
msg_data = self.message_data
# System messages
if self.log_type in ["peer_joined", "peer_left", "network_event"]:
return "system"
# For now, treat all as received since we're monitoring
# In a real implementation, you'd check if the author is the current node
return "received"
class BzzzLogStreamer:
"""Manages streaming of Bzzz hypercore logs"""
def __init__(self):
self.agent_endpoints = {}
self.last_indices = {} # Track last seen index per agent
async def discover_bzzz_agents(self) -> List[Dict[str, str]]:
"""Discover active Bzzz agents from the Hive agents API"""
try:
# This would typically query the actual agents database
# For now, return known endpoints based on cluster nodes
return [
{"agent_id": "acacia-bzzz", "endpoint": "http://acacia.local:8080"},
{"agent_id": "walnut-bzzz", "endpoint": "http://walnut.local:8080"},
{"agent_id": "ironwood-bzzz", "endpoint": "http://ironwood.local:8080"},
{"agent_id": "rosewood-bzzz", "endpoint": "http://rosewood.local:8080"},
]
except Exception as e:
logger.error(f"Failed to discover Bzzz agents: {e}")
return []
async def fetch_agent_logs(self, agent_endpoint: str, since_index: int = 0) -> List[BzzzLogEntry]:
"""Fetch hypercore logs from a specific Bzzz agent"""
try:
# This would call the actual Bzzz agent's HTTP API
# For now, return mock data structure that matches hypercore format
async with httpx.AsyncClient() as client:
response = await client.get(
f"{agent_endpoint}/api/hypercore/logs",
params={"since": since_index},
timeout=5.0
)
if response.status_code == 200:
logs_data = response.json()
return [BzzzLogEntry(log) for log in logs_data.get("entries", [])]
else:
logger.warning(f"Failed to fetch logs from {agent_endpoint}: {response.status_code}")
return []
except httpx.ConnectError:
logger.debug(f"Agent at {agent_endpoint} is not reachable")
return []
except Exception as e:
logger.error(f"Error fetching logs from {agent_endpoint}: {e}")
return []
async def get_recent_logs(self, limit: int = 100) -> List[Dict[str, Any]]:
"""Get recent logs from all agents"""
agents = await self.discover_bzzz_agents()
all_messages = []
for agent in agents:
logs = await self.fetch_agent_logs(agent["endpoint"])
for log in logs[-limit:]: # Get recent entries
message = log.to_chat_message()
message["agent_id"] = agent["agent_id"]
all_messages.append(message)
# Sort by timestamp
all_messages.sort(key=lambda x: x["timestamp"])
return all_messages[-limit:]
async def stream_new_logs(self):
"""Continuously stream new logs from all agents"""
while True:
try:
agents = await self.discover_bzzz_agents()
new_messages = []
for agent in agents:
agent_id = agent["agent_id"]
last_index = self.last_indices.get(agent_id, 0)
logs = await self.fetch_agent_logs(agent["endpoint"], last_index)
for log in logs:
if log.index > last_index:
message = log.to_chat_message()
message["agent_id"] = agent_id
new_messages.append(message)
self.last_indices[agent_id] = log.index
# Send new messages to all connected WebSocket clients
if new_messages and active_connections:
message_data = {
"type": "new_messages",
"messages": new_messages
}
# Remove disconnected clients
disconnected = []
for connection in active_connections:
try:
await connection.send_text(json.dumps(message_data))
except Exception:
disconnected.append(connection)
for conn in disconnected:
active_connections.remove(conn)
await asyncio.sleep(2) # Poll every 2 seconds
except Exception as e:
logger.error(f"Error in log streaming: {e}")
await asyncio.sleep(5)
# Global log streamer instance
log_streamer = BzzzLogStreamer()
@router.get("/bzzz/logs")
async def get_bzzz_logs(
limit: int = Query(default=100, le=1000),
agent_id: Optional[str] = None
):
"""Get recent Bzzz hypercore logs"""
try:
logs = await log_streamer.get_recent_logs(limit)
if agent_id:
logs = [log for log in logs if log.get("agent_id") == agent_id]
return {
"logs": logs,
"count": len(logs),
"timestamp": datetime.utcnow().isoformat()
}
except Exception as e:
logger.error(f"Error fetching Bzzz logs: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.get("/bzzz/agents")
async def get_bzzz_agents():
"""Get list of discovered Bzzz agents"""
try:
agents = await log_streamer.discover_bzzz_agents()
return {"agents": agents}
except Exception as e:
logger.error(f"Error discovering Bzzz agents: {e}")
raise HTTPException(status_code=500, detail=str(e))
@router.websocket("/bzzz/logs/stream")
async def websocket_bzzz_logs(websocket: WebSocket):
"""WebSocket endpoint for real-time Bzzz log streaming"""
await websocket.accept()
active_connections.append(websocket)
try:
# Send initial recent logs
recent_logs = await log_streamer.get_recent_logs(50)
await websocket.send_text(json.dumps({
"type": "initial_logs",
"messages": recent_logs
}))
# Keep connection alive and handle client messages
while True:
try:
# Wait for client messages (ping, filters, etc.)
message = await asyncio.wait_for(websocket.receive_text(), timeout=30)
client_data = json.loads(message)
if client_data.get("type") == "ping":
await websocket.send_text(json.dumps({"type": "pong"}))
except asyncio.TimeoutError:
# Send periodic heartbeat
await websocket.send_text(json.dumps({"type": "heartbeat"}))
except WebSocketDisconnect:
active_connections.remove(websocket)
except Exception as e:
logger.error(f"WebSocket error: {e}")
if websocket in active_connections:
active_connections.remove(websocket)
# Start the log streaming background task
@router.on_event("startup")
async def start_log_streaming():
"""Start the background log streaming task"""
asyncio.create_task(log_streamer.stream_new_logs())
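# Illustrative only (not part of the API module): a minimal client for the
# /bzzz/logs/stream WebSocket above, assuming the `websockets` package and a Hive
# backend reachable at ws://localhost:8000 with the /api prefix from main.py.
async def example_log_stream_client(url: str = "ws://localhost:8000/api/bzzz/logs/stream"):
    import websockets  # assumed dependency for the example

    async with websockets.connect(url) as ws:
        while True:
            frame = json.loads(await ws.recv())
            if frame.get("type") == "initial_logs":
                print(f"received {len(frame['messages'])} historical messages")
            elif frame.get("type") == "new_messages":
                for msg in frame["messages"]:
                    print(f"[{msg['channel']}] {msg['senderName']}: {msg['content']}")
            elif frame.get("type") == "heartbeat":
                await ws.send(json.dumps({"type": "ping"}))  # server answers with pong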

View File

@@ -0,0 +1,434 @@
"""
Cluster Registration API endpoints
Handles registration-based cluster management for Hive-Bzzz integration.
"""
from fastapi import APIRouter, HTTPException, Request, Depends
from pydantic import BaseModel, Field
from typing import Dict, Any, List, Optional
import logging
import os
from ..services.cluster_registration_service import (
ClusterRegistrationService,
RegistrationRequest,
HeartbeatRequest
)
logger = logging.getLogger(__name__)
router = APIRouter()
# Initialize service
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://hive:hivepass@localhost:5432/hive")
cluster_registration_service = ClusterRegistrationService(DATABASE_URL)
# Pydantic models for API
class NodeRegistrationRequest(BaseModel):
token: str = Field(..., description="Cluster registration token")
node_id: str = Field(..., description="Unique node identifier")
hostname: str = Field(..., description="Node hostname")
system_info: Dict[str, Any] = Field(..., description="System hardware and OS information")
client_version: Optional[str] = Field(None, description="Bzzz client version")
services: Optional[Dict[str, Any]] = Field(None, description="Available services")
capabilities: Optional[Dict[str, Any]] = Field(None, description="Node capabilities")
ports: Optional[Dict[str, Any]] = Field(None, description="Service ports")
metadata: Optional[Dict[str, Any]] = Field(None, description="Additional metadata")
class NodeHeartbeatRequest(BaseModel):
node_id: str = Field(..., description="Node identifier")
status: str = Field("online", description="Node status")
cpu_usage: Optional[float] = Field(None, ge=0, le=100, description="CPU usage percentage")
memory_usage: Optional[float] = Field(None, ge=0, le=100, description="Memory usage percentage")
disk_usage: Optional[float] = Field(None, ge=0, le=100, description="Disk usage percentage")
gpu_usage: Optional[float] = Field(None, ge=0, le=100, description="GPU usage percentage")
services_status: Optional[Dict[str, Any]] = Field(None, description="Service status information")
network_metrics: Optional[Dict[str, Any]] = Field(None, description="Network metrics")
custom_metrics: Optional[Dict[str, Any]] = Field(None, description="Custom node metrics")
class TokenCreateRequest(BaseModel):
description: str = Field(..., description="Token description")
expires_in_days: Optional[int] = Field(None, gt=0, description="Token expiration in days")
max_registrations: Optional[int] = Field(None, gt=0, description="Maximum number of registrations")
allowed_ip_ranges: Optional[List[str]] = Field(None, description="Allowed IP CIDR ranges")
# Helper function to get client IP
def get_client_ip(request: Request) -> str:
"""Extract client IP address from request."""
# Check for X-Forwarded-For header (proxy/load balancer)
forwarded_for = request.headers.get("X-Forwarded-For")
if forwarded_for:
# Take the first IP in the chain (original client)
return forwarded_for.split(",")[0].strip()
# Check for X-Real-IP header (nginx)
real_ip = request.headers.get("X-Real-IP")
if real_ip:
return real_ip.strip()
# Fall back to direct connection IP
return request.client.host if request.client else "unknown"
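# For example (illustrative header values): with
#   X-Forwarded-For: "203.0.113.7, 10.0.0.2" and X-Real-IP: "10.0.0.2",
# get_client_ip() returns "203.0.113.7", the left-most entry, i.e. the original client
# as seen by the first proxy. Only when neither header is present does it fall back to
# request.client.host.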
# Registration endpoints
@router.post("/cluster/register")
async def register_node(
registration: NodeRegistrationRequest,
request: Request
) -> Dict[str, Any]:
"""
Register a new node in the cluster.
This endpoint allows Bzzz clients to register themselves with the Hive coordinator
using a valid cluster token. Similar to `docker swarm join`.
"""
try:
client_ip = get_client_ip(request)
logger.info(f"Node registration attempt: {registration.node_id} from {client_ip}")
# Convert to service request
reg_request = RegistrationRequest(
token=registration.token,
node_id=registration.node_id,
hostname=registration.hostname,
ip_address=client_ip,
system_info=registration.system_info,
client_version=registration.client_version,
services=registration.services,
capabilities=registration.capabilities,
ports=registration.ports,
metadata=registration.metadata
)
result = await cluster_registration_service.register_node(reg_request, client_ip)
logger.info(f"Node {registration.node_id} registered successfully")
return result
except ValueError as e:
logger.warning(f"Registration failed for {registration.node_id}: {e}")
raise HTTPException(status_code=400, detail=str(e))
except Exception as e:
logger.error(f"Registration error for {registration.node_id}: {e}")
raise HTTPException(status_code=500, detail="Registration failed")
@router.post("/cluster/heartbeat")
async def node_heartbeat(heartbeat: NodeHeartbeatRequest) -> Dict[str, Any]:
"""
Update node heartbeat and status.
Registered nodes should call this endpoint periodically (every 30 seconds)
to maintain their registration and report current status/metrics.
"""
try:
heartbeat_request = HeartbeatRequest(
node_id=heartbeat.node_id,
status=heartbeat.status,
cpu_usage=heartbeat.cpu_usage,
memory_usage=heartbeat.memory_usage,
disk_usage=heartbeat.disk_usage,
gpu_usage=heartbeat.gpu_usage,
services_status=heartbeat.services_status,
network_metrics=heartbeat.network_metrics,
custom_metrics=heartbeat.custom_metrics
)
result = await cluster_registration_service.update_heartbeat(heartbeat_request)
return result
except ValueError as e:
logger.warning(f"Heartbeat failed for {heartbeat.node_id}: {e}")
raise HTTPException(status_code=404, detail=str(e))
except Exception as e:
logger.error(f"Heartbeat error for {heartbeat.node_id}: {e}")
raise HTTPException(status_code=500, detail="Heartbeat update failed")
# Node management endpoints
@router.get("/cluster/nodes/registered")
async def get_registered_nodes(include_offline: bool = True) -> Dict[str, Any]:
"""
Get all registered cluster nodes.
Returns detailed information about all nodes that have registered
with the cluster, including their hardware specs and current status.
"""
try:
nodes = await cluster_registration_service.get_registered_nodes(include_offline)
# Convert to API response format
nodes_data = []
for node in nodes:
# Convert dataclass to dict and handle datetime serialization
node_dict = {
"id": node.id,
"node_id": node.node_id,
"hostname": node.hostname,
"ip_address": node.ip_address,
"status": node.status,
"hardware": {
"cpu": node.cpu_info or {},
"memory": node.memory_info or {},
"gpu": node.gpu_info or {},
"disk": node.disk_info or {},
"os": node.os_info or {},
"platform": node.platform_info or {}
},
"services": node.services or {},
"capabilities": node.capabilities or {},
"ports": node.ports or {},
"client_version": node.client_version,
"first_registered": node.first_registered.isoformat(),
"last_heartbeat": node.last_heartbeat.isoformat(),
"registration_metadata": node.registration_metadata or {}
}
nodes_data.append(node_dict)
return {
"nodes": nodes_data,
"total_count": len(nodes_data),
"online_count": len([n for n in nodes if n.status == "online"]),
"offline_count": len([n for n in nodes if n.status == "offline"])
}
except Exception as e:
logger.error(f"Failed to get registered nodes: {e}")
raise HTTPException(status_code=500, detail="Failed to retrieve registered nodes")
@router.get("/cluster/nodes/{node_id}")
async def get_node_details(node_id: str) -> Dict[str, Any]:
"""Get detailed information about a specific registered node."""
try:
node = await cluster_registration_service.get_node_details(node_id)
if not node:
raise HTTPException(status_code=404, detail="Node not found")
return {
"id": node.id,
"node_id": node.node_id,
"hostname": node.hostname,
"ip_address": node.ip_address,
"status": node.status,
"hardware": {
"cpu": node.cpu_info or {},
"memory": node.memory_info or {},
"gpu": node.gpu_info or {},
"disk": node.disk_info or {},
"os": node.os_info or {},
"platform": node.platform_info or {}
},
"services": node.services or {},
"capabilities": node.capabilities or {},
"ports": node.ports or {},
"client_version": node.client_version,
"first_registered": node.first_registered.isoformat(),
"last_heartbeat": node.last_heartbeat.isoformat(),
"registration_metadata": node.registration_metadata or {}
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Failed to get node details for {node_id}: {e}")
raise HTTPException(status_code=500, detail="Failed to retrieve node details")
@router.delete("/cluster/nodes/{node_id}")
async def remove_node(node_id: str) -> Dict[str, Any]:
"""
Remove a node from the cluster.
This will unregister the node and stop accepting its heartbeats.
The node will need to re-register to rejoin the cluster.
"""
try:
success = await cluster_registration_service.remove_node(node_id)
if not success:
raise HTTPException(status_code=404, detail="Node not found")
return {
"node_id": node_id,
"status": "removed",
"message": "Node successfully removed from cluster"
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Failed to remove node {node_id}: {e}")
raise HTTPException(status_code=500, detail="Failed to remove node")
# Token management endpoints
@router.post("/cluster/tokens")
async def create_cluster_token(token_request: TokenCreateRequest) -> Dict[str, Any]:
"""
Create a new cluster registration token.
Tokens are used by Bzzz clients to authenticate and register with the cluster.
Only administrators should have access to this endpoint.
"""
try:
# For now, use a default admin user ID
# TODO: Extract from JWT token or session
admin_user_id = "admin" # This should come from authentication
token = await cluster_registration_service.generate_cluster_token(
description=token_request.description,
created_by_user_id=admin_user_id,
expires_in_days=token_request.expires_in_days,
max_registrations=token_request.max_registrations,
allowed_ip_ranges=token_request.allowed_ip_ranges
)
return {
"id": token.id,
"token": token.token,
"description": token.description,
"created_at": token.created_at.isoformat(),
"expires_at": token.expires_at.isoformat() if token.expires_at else None,
"is_active": token.is_active,
"max_registrations": token.max_registrations,
"current_registrations": token.current_registrations,
"allowed_ip_ranges": token.allowed_ip_ranges
}
except Exception as e:
logger.error(f"Failed to create cluster token: {e}")
raise HTTPException(status_code=500, detail="Failed to create token")
@router.get("/cluster/tokens")
async def list_cluster_tokens() -> Dict[str, Any]:
"""
List all cluster registration tokens.
Returns information about all tokens including their usage statistics.
Only administrators should have access to this endpoint.
"""
try:
tokens = await cluster_registration_service.list_tokens()
tokens_data = []
for token in tokens:
tokens_data.append({
"id": token.id,
"token": token.token[:20] + "..." if len(token.token) > 20 else token.token, # Partial token for security
"description": token.description,
"created_at": token.created_at.isoformat(),
"expires_at": token.expires_at.isoformat() if token.expires_at else None,
"is_active": token.is_active,
"max_registrations": token.max_registrations,
"current_registrations": token.current_registrations,
"allowed_ip_ranges": token.allowed_ip_ranges
})
return {
"tokens": tokens_data,
"total_count": len(tokens_data)
}
except Exception as e:
logger.error(f"Failed to list cluster tokens: {e}")
raise HTTPException(status_code=500, detail="Failed to list tokens")
@router.delete("/cluster/tokens/{token}")
async def revoke_cluster_token(token: str) -> Dict[str, Any]:
"""
Revoke a cluster registration token.
This will prevent new registrations using this token, but won't affect
nodes that are already registered.
"""
try:
success = await cluster_registration_service.revoke_token(token)
if not success:
raise HTTPException(status_code=404, detail="Token not found")
return {
"token": token[:20] + "..." if len(token) > 20 else token,
"status": "revoked",
"message": "Token successfully revoked"
}
except HTTPException:
raise
except Exception as e:
logger.error(f"Failed to revoke token {token}: {e}")
raise HTTPException(status_code=500, detail="Failed to revoke token")
# Cluster statistics and monitoring
@router.get("/cluster/statistics")
async def get_cluster_statistics() -> Dict[str, Any]:
"""
Get cluster health and usage statistics.
Returns information about node counts, token usage, and overall cluster health.
"""
try:
stats = await cluster_registration_service.get_cluster_statistics()
return stats
except Exception as e:
logger.error(f"Failed to get cluster statistics: {e}")
raise HTTPException(status_code=500, detail="Failed to retrieve cluster statistics")
# Maintenance endpoints
@router.post("/cluster/maintenance/cleanup-offline")
async def cleanup_offline_nodes(offline_threshold_minutes: int = 10) -> Dict[str, Any]:
"""
Mark nodes as offline if they haven't sent heartbeats recently.
This maintenance endpoint should be called periodically to keep
the cluster status accurate.
"""
try:
count = await cluster_registration_service.cleanup_offline_nodes(offline_threshold_minutes)
return {
"nodes_marked_offline": count,
"threshold_minutes": offline_threshold_minutes,
"message": f"Marked {count} nodes as offline"
}
except Exception as e:
logger.error(f"Failed to cleanup offline nodes: {e}")
raise HTTPException(status_code=500, detail="Failed to cleanup offline nodes")
@router.post("/cluster/maintenance/cleanup-heartbeats")
async def cleanup_old_heartbeats(retention_days: int = 30) -> Dict[str, Any]:
"""
Remove old heartbeat data to manage database size.
This maintenance endpoint should be called periodically to prevent
the heartbeat table from growing too large.
"""
try:
count = await cluster_registration_service.cleanup_old_heartbeats(retention_days)
return {
"heartbeats_deleted": count,
"retention_days": retention_days,
"message": f"Deleted {count} old heartbeat records"
}
except Exception as e:
logger.error(f"Failed to cleanup old heartbeats: {e}")
raise HTTPException(status_code=500, detail="Failed to cleanup old heartbeats")
# Health check endpoint
@router.get("/cluster/health")
async def cluster_registration_health() -> Dict[str, Any]:
"""
Health check for the cluster registration system.
"""
try:
# Test database connection
stats = await cluster_registration_service.get_cluster_statistics()
return {
"status": "healthy",
"database_connected": True,
"cluster_health": stats.get("cluster_health", {}),
"timestamp": stats.get("last_updated")
}
except Exception as e:
logger.error(f"Cluster registration health check failed: {e}")
return {
"status": "unhealthy",
"database_connected": False,
"error": str(e),
"timestamp": None
}
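# Illustrative only: one way to drive the maintenance endpoints above on a schedule.
# Assumes httpx and a coordinator on localhost:8000; in production this could just as
# easily be a cron job issuing the same two POSTs.
async def example_maintenance_tick(base_url: str = "http://localhost:8000/api"):
    import httpx  # assumed dependency for the example

    async with httpx.AsyncClient() as client:
        # Mark nodes offline after 10 minutes without a heartbeat (endpoint default).
        await client.post(f"{base_url}/cluster/maintenance/cleanup-offline")
        # Trim heartbeat rows older than 30 days (endpoint default).
        await client.post(f"{base_url}/cluster/maintenance/cleanup-heartbeats")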

backend/app/api/feedback.py (new file, +474 lines)
View File

@@ -0,0 +1,474 @@
"""
Context Feedback API endpoints for RL Context Curator integration
"""
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session
from typing import List, Optional, Dict, Any
from datetime import datetime, timedelta
from pydantic import BaseModel, Field
from ..core.database import get_db
from ..models.context_feedback import ContextFeedback, AgentPermissions, PromotionRuleHistory
from ..models.task import Task
from ..models.agent import Agent
from ..services.auth import get_current_user
from ..models.responses import StatusResponse
router = APIRouter(prefix="/api/feedback", tags=["Context Feedback"])
# Pydantic models for API
class ContextFeedbackRequest(BaseModel):
"""Request model for context feedback"""
context_id: str = Field(..., description="HCFS context ID")
feedback_type: str = Field(..., description="Type of feedback: upvote, downvote, forgetfulness, task_success, task_failure")
confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence in feedback")
reason: Optional[str] = Field(None, description="Optional reason for feedback")
usage_context: Optional[str] = Field(None, description="Context of usage")
directory_scope: Optional[str] = Field(None, description="Directory where context was used")
task_type: Optional[str] = Field(None, description="Type of task being performed")
class TaskOutcomeFeedbackRequest(BaseModel):
"""Request model for task outcome feedback"""
task_id: str = Field(..., description="Task ID")
outcome: str = Field(..., description="Task outcome: completed, failed, abandoned")
completion_time: Optional[int] = Field(None, description="Time to complete in seconds")
errors_encountered: int = Field(0, description="Number of errors during execution")
follow_up_questions: int = Field(0, description="Number of follow-up questions")
context_used: Optional[List[str]] = Field(None, description="Context IDs used in task")
context_relevance_score: Optional[float] = Field(None, ge=0.0, le=1.0, description="Average relevance of used context")
outcome_confidence: Optional[float] = Field(None, ge=0.0, le=1.0, description="Confidence in outcome classification")
class AgentPermissionsRequest(BaseModel):
"""Request model for agent permissions"""
agent_id: str = Field(..., description="Agent ID")
role: str = Field(..., description="Agent role")
directory_patterns: List[str] = Field(..., description="Directory patterns for this role")
task_types: List[str] = Field(..., description="Task types this agent can handle")
context_weight: float = Field(1.0, ge=0.1, le=2.0, description="Weight for context relevance")
class ContextFeedbackResponse(BaseModel):
"""Response model for context feedback"""
id: int
context_id: str
agent_id: str
task_id: Optional[str]
feedback_type: str
role: str
confidence: float
reason: Optional[str]
usage_context: Optional[str]
directory_scope: Optional[str]
task_type: Optional[str]
timestamp: datetime
class FeedbackStatsResponse(BaseModel):
"""Response model for feedback statistics"""
total_feedback: int
feedback_by_type: Dict[str, int]
feedback_by_role: Dict[str, int]
average_confidence: float
recent_feedback_count: int
top_contexts: List[Dict[str, Any]]
@router.post("/context/{context_id}", response_model=StatusResponse)
async def submit_context_feedback(
context_id: str,
request: ContextFeedbackRequest,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db),
current_user: dict = Depends(get_current_user)
):
"""
Submit feedback for a specific context
"""
try:
# Get agent information
agent = db.query(Agent).filter(Agent.id == current_user.get("agent_id", "unknown")).first()
if not agent:
raise HTTPException(status_code=404, detail="Agent not found")
# Validate feedback type
valid_types = ["upvote", "downvote", "forgetfulness", "task_success", "task_failure"]
if request.feedback_type not in valid_types:
raise HTTPException(status_code=400, detail=f"Invalid feedback type. Must be one of: {valid_types}")
# Create feedback record
feedback = ContextFeedback(
context_id=request.context_id,
agent_id=agent.id,
feedback_type=request.feedback_type,
role=agent.role if agent.role else "general",
confidence=request.confidence,
reason=request.reason,
usage_context=request.usage_context,
directory_scope=request.directory_scope,
task_type=request.task_type
)
db.add(feedback)
db.commit()
db.refresh(feedback)
# Send feedback to RL Context Curator in background
background_tasks.add_task(
send_feedback_to_rl_curator,
feedback.id,
request.context_id,
request.feedback_type,
agent.id,
agent.role if agent.role else "general",
request.confidence
)
return StatusResponse(
status="success",
message="Context feedback submitted successfully",
data={"feedback_id": feedback.id, "context_id": request.context_id}
)
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=f"Failed to submit feedback: {str(e)}")
@router.post("/task-outcome/{task_id}", response_model=StatusResponse)
async def submit_task_outcome_feedback(
task_id: str,
request: TaskOutcomeFeedbackRequest,
background_tasks: BackgroundTasks,
db: Session = Depends(get_db),
current_user: dict = Depends(get_current_user)
):
"""
Submit task outcome feedback for RL learning
"""
try:
# Get task
task = db.query(Task).filter(Task.id == task_id).first()
if not task:
raise HTTPException(status_code=404, detail="Task not found")
# Update task with outcome metrics
task.task_outcome = request.outcome
task.completion_time = request.completion_time
task.errors_encountered = request.errors_encountered
task.follow_up_questions = request.follow_up_questions
task.context_relevance_score = request.context_relevance_score
task.outcome_confidence = request.outcome_confidence
task.feedback_collected = True
if request.context_used:
task.context_used = request.context_used
if request.outcome in ["completed", "failed", "abandoned"] and not task.completed_at:
task.completed_at = datetime.utcnow()
# Calculate success rate
if request.outcome == "completed":
task.success_rate = 1.0 - (request.errors_encountered * 0.1) # Simple calculation
task.success_rate = max(0.0, min(1.0, task.success_rate))
else:
task.success_rate = 0.0
db.commit()
# Create feedback events for used contexts
if request.context_used and task.assigned_agent_id:
agent = db.query(Agent).filter(Agent.id == task.assigned_agent_id).first()
if agent:
feedback_type = "task_success" if request.outcome == "completed" else "task_failure"
for context_id in request.context_used:
feedback = ContextFeedback(
context_id=context_id,
agent_id=agent.id,
task_id=task.id,
feedback_type=feedback_type,
role=agent.role if agent.role else "general",
confidence=request.outcome_confidence or 0.8,
reason=f"Task {request.outcome}",
usage_context=f"task_execution_{request.outcome}",
task_type=request.task_type
)
db.add(feedback)
db.commit()
return StatusResponse(
status="success",
message="Task outcome feedback submitted successfully",
data={"task_id": task_id, "outcome": request.outcome}
)
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=f"Failed to submit task outcome: {str(e)}")
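# Worked example of the success_rate heuristic above (values are illustrative): a task
# reported as "completed" with errors_encountered=3 yields
#   success_rate = 1.0 - (3 * 0.1) = 0.7,
# clamped to the [0.0, 1.0] range, while any "failed" or "abandoned" outcome is
# recorded as 0.0 regardless of the error count.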
@router.get("/stats", response_model=FeedbackStatsResponse)
async def get_feedback_stats(
days: int = 7,
role: Optional[str] = None,
db: Session = Depends(get_db),
current_user: dict = Depends(get_current_user)
):
"""
Get feedback statistics for analysis
"""
try:
# Base query
query = db.query(ContextFeedback)
# Filter by date range
if days > 0:
since_date = datetime.utcnow() - timedelta(days=days)
query = query.filter(ContextFeedback.timestamp >= since_date)
# Filter by role if specified
if role:
query = query.filter(ContextFeedback.role == role)
feedback_records = query.all()
# Calculate statistics
total_feedback = len(feedback_records)
feedback_by_type = {}
feedback_by_role = {}
confidence_values = []
context_usage = {}
for feedback in feedback_records:
# Count by type
feedback_by_type[feedback.feedback_type] = feedback_by_type.get(feedback.feedback_type, 0) + 1
# Count by role
feedback_by_role[feedback.role] = feedback_by_role.get(feedback.role, 0) + 1
# Collect confidence values
confidence_values.append(feedback.confidence)
# Count context usage
context_usage[feedback.context_id] = context_usage.get(feedback.context_id, 0) + 1
# Calculate average confidence
average_confidence = sum(confidence_values) / len(confidence_values) if confidence_values else 0.0
# Get recent feedback count (last 24 hours)
recent_since = datetime.utcnow() - timedelta(days=1)
recent_count = db.query(ContextFeedback).filter(
ContextFeedback.timestamp >= recent_since
).count()
# Get top contexts by usage
top_contexts = [
{"context_id": ctx_id, "usage_count": count}
for ctx_id, count in sorted(context_usage.items(), key=lambda x: x[1], reverse=True)[:10]
]
return FeedbackStatsResponse(
total_feedback=total_feedback,
feedback_by_type=feedback_by_type,
feedback_by_role=feedback_by_role,
average_confidence=average_confidence,
recent_feedback_count=recent_count,
top_contexts=top_contexts
)
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to get feedback stats: {str(e)}")
@router.get("/recent", response_model=List[ContextFeedbackResponse])
async def get_recent_feedback(
limit: int = 50,
feedback_type: Optional[str] = None,
role: Optional[str] = None,
db: Session = Depends(get_db),
current_user: dict = Depends(get_current_user)
):
"""
Get recent feedback events
"""
try:
query = db.query(ContextFeedback).order_by(ContextFeedback.timestamp.desc())
if feedback_type:
query = query.filter(ContextFeedback.feedback_type == feedback_type)
if role:
query = query.filter(ContextFeedback.role == role)
feedback_records = query.limit(limit).all()
return [
ContextFeedbackResponse(
id=fb.id,
context_id=fb.context_id,
agent_id=fb.agent_id,
task_id=str(fb.task_id) if fb.task_id else None,
feedback_type=fb.feedback_type,
role=fb.role,
confidence=fb.confidence,
reason=fb.reason,
usage_context=fb.usage_context,
directory_scope=fb.directory_scope,
task_type=fb.task_type,
timestamp=fb.timestamp
)
for fb in feedback_records
]
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to get recent feedback: {str(e)}")
@router.post("/agent-permissions", response_model=StatusResponse)
async def set_agent_permissions(
request: AgentPermissionsRequest,
db: Session = Depends(get_db),
current_user: dict = Depends(get_current_user)
):
"""
Set or update agent permissions for context filtering
"""
try:
# Check if permissions already exist
existing = db.query(AgentPermissions).filter(
AgentPermissions.agent_id == request.agent_id,
AgentPermissions.role == request.role
).first()
if existing:
# Update existing permissions
existing.directory_patterns = ",".join(request.directory_patterns)
existing.task_types = ",".join(request.task_types)
existing.context_weight = request.context_weight
existing.updated_at = datetime.utcnow()
else:
# Create new permissions
permissions = AgentPermissions(
agent_id=request.agent_id,
role=request.role,
directory_patterns=",".join(request.directory_patterns),
task_types=",".join(request.task_types),
context_weight=request.context_weight
)
db.add(permissions)
db.commit()
return StatusResponse(
status="success",
message="Agent permissions updated successfully",
data={"agent_id": request.agent_id, "role": request.role}
)
except Exception as e:
db.rollback()
raise HTTPException(status_code=500, detail=f"Failed to set agent permissions: {str(e)}")
@router.get("/agent-permissions/{agent_id}")
async def get_agent_permissions(
agent_id: str,
db: Session = Depends(get_db),
current_user: dict = Depends(get_current_user)
):
"""
Get agent permissions for context filtering
"""
try:
permissions = db.query(AgentPermissions).filter(
AgentPermissions.agent_id == agent_id,
AgentPermissions.active == "true"
).all()
return [
{
"id": perm.id,
"agent_id": perm.agent_id,
"role": perm.role,
"directory_patterns": perm.directory_patterns.split(",") if perm.directory_patterns else [],
"task_types": perm.task_types.split(",") if perm.task_types else [],
"context_weight": perm.context_weight,
"created_at": perm.created_at,
"updated_at": perm.updated_at
}
for perm in permissions
]
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to get agent permissions: {str(e)}")
async def send_feedback_to_rl_curator(
feedback_id: int,
context_id: str,
feedback_type: str,
agent_id: str,
role: str,
confidence: float
):
"""
Background task to send feedback to RL Context Curator
"""
try:
import httpx
import json
from datetime import datetime
# Prepare feedback event in Bzzz format
feedback_event = {
"bzzz_type": "feedback_event",
"timestamp": datetime.utcnow().isoformat(),
"origin": {
"node_id": "hive",
"agent_id": agent_id,
"task_id": f"hive-feedback-{feedback_id}",
"workspace": "hive://context-feedback",
"directory": "/feedback/"
},
"feedback": {
"type": feedback_type,
"category": "general", # Could be enhanced with category detection
"role": role,
"context_id": context_id,
"reason": f"Feedback from Hive agent {agent_id}",
"confidence": confidence,
"usage_context": "hive_platform"
},
"task_outcome": {
"completed": feedback_type in ["upvote", "task_success"],
"completion_time": 0,
"errors_encountered": 0,
"follow_up_questions": 0
}
}
# Send to HCFS RL Tuner Service
async with httpx.AsyncClient() as client:
try:
response = await client.post(
"http://localhost:8001/api/feedback",
json=feedback_event,
timeout=10.0
)
if response.status_code == 200:
print(f"✅ Feedback sent to RL Curator: {feedback_id}")
else:
print(f"⚠️ RL Curator responded with status {response.status_code}")
except httpx.ConnectError:
print(f"⚠️ Could not connect to RL Curator service (feedback {feedback_id})")
except Exception as e:
print(f"❌ Error sending feedback to RL Curator: {e}")
except Exception as e:
print(f"❌ Background feedback task failed: {e}")

View File

@@ -47,6 +47,37 @@ async def get_project_tasks(project_id: str, current_user: Dict[str, Any] = Depe
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.put("/projects/{project_id}")
async def update_project(project_id: str, project_data: Dict[str, Any], current_user: Dict[str, Any] = Depends(get_current_user_context)) -> Dict[str, Any]:
"""Update a project configuration."""
try:
updated_project = project_service.update_project(project_id, project_data)
if not updated_project:
raise HTTPException(status_code=404, detail="Project not found")
return updated_project
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.post("/projects")
async def create_project(project_data: Dict[str, Any], current_user: Dict[str, Any] = Depends(get_current_user_context)) -> Dict[str, Any]:
"""Create a new project."""
try:
new_project = project_service.create_project(project_data)
return new_project
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
@router.delete("/projects/{project_id}")
async def delete_project(project_id: str, current_user: Dict[str, Any] = Depends(get_current_user_context)) -> Dict[str, Any]:
"""Delete a project."""
try:
result = project_service.delete_project(project_id)
if not result:
raise HTTPException(status_code=404, detail="Project not found")
return {"success": True, "message": "Project deleted successfully"}
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
# === Bzzz Integration Endpoints ===
@bzzz_router.get("/active-repos")

View File

@@ -11,7 +11,7 @@ from typing import Dict, Any, Optional
from dataclasses import asdict
# Add CCLI source to path
ccli_path = os.path.join(os.path.dirname(__file__), '../../../ccli_src')
ccli_path = os.path.join(os.path.dirname(__file__), '../../ccli_src')
sys.path.insert(0, ccli_path)
from agents.gemini_cli_agent import GeminiCliAgent, GeminiCliConfig, TaskRequest as CliTaskRequest, TaskResult as CliTaskResult

View File

@@ -273,7 +273,6 @@ def create_token_response(user_id: int, user_data: Dict[str, Any]) -> Dict[str,
"refresh_token": refresh_token,
"token_type": "bearer",
"expires_in": ACCESS_TOKEN_EXPIRE_MINUTES * 60, # seconds
"user": user_data,
}

View File

@@ -174,6 +174,10 @@ app = FastAPI(
"name": "cluster",
"description": "Cluster-wide operations and coordination"
},
{
"name": "cluster-registration",
"description": "Dynamic cluster node registration and management"
},
{
"name": "distributed-workflows",
"description": "Advanced distributed workflow management"
@@ -206,7 +210,7 @@ def get_coordinator() -> UnifiedCoordinator:
return unified_coordinator
# Import API routers
from .api import agents, workflows, executions, monitoring, projects, tasks, cluster, distributed_workflows, cli_agents, auth
from .api import agents, workflows, executions, monitoring, projects, tasks, cluster, distributed_workflows, cli_agents, auth, bzzz_logs, cluster_registration
# Import error handlers and response models
from .core.error_handlers import (
@@ -239,8 +243,10 @@ app.include_router(projects.router, prefix="/api", tags=["projects"])
app.include_router(projects.bzzz_router, prefix="/api", tags=["bzzz-integration"])
app.include_router(tasks.router, prefix="/api", tags=["tasks"])
app.include_router(cluster.router, prefix="/api", tags=["cluster"])
app.include_router(cluster_registration.router, prefix="/api", tags=["cluster-registration"])
app.include_router(distributed_workflows.router, tags=["distributed-workflows"])
app.include_router(cli_agents.router, tags=["cli-agents"])
app.include_router(bzzz_logs.router, prefix="/api", tags=["bzzz-logs"])
# Override dependency functions in API modules with our coordinator instance
agents.get_coordinator = get_coordinator
@@ -528,16 +534,6 @@ async def root():
# Removed duplicate /health endpoint - using the enhanced one above
@app.get("/api/health", response_model=None)
async def health_check():
"""Simple health check endpoint"""
return {
"status": "healthy",
"timestamp": datetime.now().isoformat(),
"version": "1.0.0",
"message": "Hive API is operational"
}
@app.get("/api/status")
async def get_system_status():
"""Get comprehensive system status"""

View File

@@ -2,4 +2,5 @@ from . import agent
from . import agent_role
from . import project
from . import task
from . import context_feedback
from . import sqlalchemy_models

View File

@@ -34,6 +34,8 @@ class Agent(Base):
# Relationships
tasks = relationship("Task", back_populates="assigned_agent")
context_feedback = relationship("ContextFeedback", back_populates="agent")
permissions = relationship("AgentPermissions", back_populates="agent")
def to_dict(self):
return {

View File

@@ -0,0 +1,85 @@
"""
Context Feedback model for RL Context Curator integration
"""
from sqlalchemy import Column, String, Text, Integer, DateTime, ForeignKey, UUID as SqlUUID, Float
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from ..core.database import Base
import uuid
class ContextFeedback(Base):
__tablename__ = "context_feedback"
# Primary identification
id = Column(Integer, primary_key=True, index=True)
# Context and agent information
context_id = Column(String(255), nullable=False, index=True) # HCFS context ID
agent_id = Column(String(255), ForeignKey("agents.id"), nullable=False)
task_id = Column(SqlUUID(as_uuid=True), ForeignKey("tasks.id"), nullable=True)
# Feedback details
feedback_type = Column(String(50), nullable=False) # upvote, downvote, forgetfulness, task_success, task_failure
role = Column(String(100), nullable=False) # Agent role when feedback was given
confidence = Column(Float, nullable=False) # Confidence in feedback (0.0 to 1.0)
reason = Column(Text, nullable=True) # Optional reason for feedback
usage_context = Column(String(255), nullable=True) # Context of usage (debugging, coding, etc.)
# Additional metadata
directory_scope = Column(String(500), nullable=True) # Directory where context was used
task_type = Column(String(100), nullable=True) # Type of task being performed
# Timestamps
timestamp = Column(DateTime(timezone=True), server_default=func.now())
# Relationships
agent = relationship("Agent", back_populates="context_feedback")
task = relationship("Task", backref="context_feedback")
class AgentPermissions(Base):
__tablename__ = "agent_permissions"
# Primary identification
id = Column(Integer, primary_key=True, index=True)
# Agent and role information
agent_id = Column(String(255), ForeignKey("agents.id"), nullable=False, index=True)
role = Column(String(100), nullable=False)
# Permission details
directory_patterns = Column(Text, nullable=True) # JSON array of path patterns
task_types = Column(Text, nullable=True) # JSON array of allowed task types
context_weight = Column(Float, default=1.0) # Weight for context relevance
# Status
active = Column(String(10), default='true') # String to match existing boolean patterns
# Timestamps
created_at = Column(DateTime(timezone=True), server_default=func.now())
updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())
# Relationships
agent = relationship("Agent", back_populates="permissions")
class PromotionRuleHistory(Base):
__tablename__ = "promotion_rule_history"
# Primary identification
id = Column(Integer, primary_key=True, index=True)
# Rule information
rule_version = Column(String(50), nullable=False)
category = Column(String(100), nullable=False)
role = Column(String(100), nullable=False)
weight_value = Column(Float, nullable=False)
# Change information
change_reason = Column(Text, nullable=True)
previous_value = Column(Float, nullable=True)
# Timestamps
timestamp = Column(DateTime(timezone=True), server_default=func.now())

View File

@@ -2,7 +2,7 @@
Task model for SQLAlchemy ORM
"""
from sqlalchemy import Column, String, Text, Integer, DateTime, ForeignKey, UUID as SqlUUID
from sqlalchemy import Column, String, Text, Integer, DateTime, ForeignKey, UUID as SqlUUID, Float, Boolean
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
@@ -30,6 +30,17 @@ class Task(Base):
# Task metadata (includes context and payload)
task_metadata = Column("metadata", JSONB, nullable=True)
# RL Context Curator outcome tracking fields
completion_time = Column(Integer, nullable=True) # Time to complete in seconds
errors_encountered = Column(Integer, default=0) # Number of errors during execution
follow_up_questions = Column(Integer, default=0) # Number of follow-up questions
success_rate = Column(Float, nullable=True) # Success rate (0.0 to 1.0)
context_used = Column(JSONB, nullable=True) # Context IDs used in this task
context_relevance_score = Column(Float, nullable=True) # Average relevance of used context
feedback_collected = Column(Boolean, default=False) # Whether feedback was collected
task_outcome = Column(String(50), nullable=True) # completed, failed, abandoned
outcome_confidence = Column(Float, nullable=True) # Confidence in outcome classification
# Timestamps
created_at = Column(DateTime(timezone=True), server_default=func.now())
started_at = Column(DateTime(timezone=True), nullable=True)

View File

@@ -0,0 +1,522 @@
"""
Cluster Registration Service
Handles registration-based cluster management for Hive-Bzzz integration.
"""
import asyncpg
import secrets
import json
import socket
from datetime import datetime, timedelta
from typing import Dict, List, Optional, Any
from dataclasses import dataclass
from ipaddress import IPv4Network, IPv6Network, ip_address
import logging
logger = logging.getLogger(__name__)
@dataclass
class ClusterToken:
id: int
token: str
description: str
created_at: datetime
expires_at: Optional[datetime]
is_active: bool
max_registrations: Optional[int]
current_registrations: int
allowed_ip_ranges: Optional[List[str]]
@dataclass
class ClusterNode:
id: int
node_id: str
hostname: str
ip_address: str
registration_token: str
cpu_info: Optional[Dict[str, Any]]
memory_info: Optional[Dict[str, Any]]
gpu_info: Optional[Dict[str, Any]]
disk_info: Optional[Dict[str, Any]]
os_info: Optional[Dict[str, Any]]
platform_info: Optional[Dict[str, Any]]
status: str
last_heartbeat: datetime
first_registered: datetime
services: Optional[Dict[str, Any]]
capabilities: Optional[Dict[str, Any]]
ports: Optional[Dict[str, Any]]
client_version: Optional[str]
registration_metadata: Optional[Dict[str, Any]]
@dataclass
class RegistrationRequest:
token: str
node_id: str
hostname: str
ip_address: str
system_info: Dict[str, Any]
client_version: Optional[str] = None
services: Optional[Dict[str, Any]] = None
capabilities: Optional[Dict[str, Any]] = None
ports: Optional[Dict[str, Any]] = None
metadata: Optional[Dict[str, Any]] = None
@dataclass
class HeartbeatRequest:
node_id: str
status: str = "online"
cpu_usage: Optional[float] = None
memory_usage: Optional[float] = None
disk_usage: Optional[float] = None
gpu_usage: Optional[float] = None
services_status: Optional[Dict[str, Any]] = None
network_metrics: Optional[Dict[str, Any]] = None
custom_metrics: Optional[Dict[str, Any]] = None
class ClusterRegistrationService:
def __init__(self, database_url: str):
self.database_url = database_url
self._conn_cache = None
async def get_connection(self) -> asyncpg.Connection:
"""Get database connection with caching."""
if not self._conn_cache or self._conn_cache.is_closed():
try:
self._conn_cache = await asyncpg.connect(self.database_url)
except Exception as e:
logger.error(f"Failed to connect to database: {e}")
raise
return self._conn_cache
async def close_connection(self):
"""Close database connection."""
if self._conn_cache and not self._conn_cache.is_closed():
await self._conn_cache.close()
# Token Management
async def generate_cluster_token(
self,
description: str,
created_by_user_id: str,
expires_in_days: Optional[int] = None,
max_registrations: Optional[int] = None,
allowed_ip_ranges: Optional[List[str]] = None
) -> ClusterToken:
"""Generate a new cluster registration token."""
conn = await self.get_connection()
# Generate secure token
token = f"hive_cluster_{secrets.token_urlsafe(32)}"
expires_at = datetime.now() + timedelta(days=expires_in_days) if expires_in_days else None
try:
result = await conn.fetchrow("""
INSERT INTO cluster_tokens (
token, description, created_by, expires_at,
max_registrations, allowed_ip_ranges
) VALUES ($1, $2, $3, $4, $5, $6)
RETURNING id, token, description, created_at, expires_at,
is_active, max_registrations, current_registrations, allowed_ip_ranges
""", token, description, created_by_user_id, expires_at, max_registrations, allowed_ip_ranges)
return ClusterToken(**dict(result))
except Exception as e:
logger.error(f"Failed to generate cluster token: {e}")
raise
async def validate_token(self, token: str, client_ip: str) -> Optional[ClusterToken]:
"""Validate a cluster registration token."""
conn = await self.get_connection()
try:
result = await conn.fetchrow("""
SELECT id, token, description, created_at, expires_at,
is_active, max_registrations, current_registrations, allowed_ip_ranges
FROM cluster_tokens
WHERE token = $1 AND is_active = true
""", token)
if not result:
return None
cluster_token = ClusterToken(**dict(result))
# Check expiration
if cluster_token.expires_at and datetime.now() > cluster_token.expires_at:
logger.warning(f"Token {token[:20]}... has expired")
return None
# Check registration limit
if (cluster_token.max_registrations and
cluster_token.current_registrations >= cluster_token.max_registrations):
logger.warning(f"Token {token[:20]}... has reached registration limit")
return None
# Check IP restrictions
if cluster_token.allowed_ip_ranges:
client_ip_obj = ip_address(client_ip)
allowed = False
for ip_range in cluster_token.allowed_ip_ranges:
try:
network = IPv4Network(ip_range, strict=False) if ':' not in ip_range else IPv6Network(ip_range, strict=False)
if client_ip_obj in network:
allowed = True
break
except Exception as e:
logger.warning(f"Invalid IP range {ip_range}: {e}")
if not allowed:
logger.warning(f"IP {client_ip} not allowed for token {token[:20]}...")
return None
return cluster_token
except Exception as e:
logger.error(f"Failed to validate token: {e}")
return None
async def list_tokens(self) -> List[ClusterToken]:
"""List all cluster tokens."""
conn = await self.get_connection()
try:
results = await conn.fetch("""
SELECT id, token, description, created_at, expires_at,
is_active, max_registrations, current_registrations, allowed_ip_ranges
FROM cluster_tokens
ORDER BY created_at DESC
""")
return [ClusterToken(**dict(result)) for result in results]
except Exception as e:
logger.error(f"Failed to list tokens: {e}")
raise
async def revoke_token(self, token: str) -> bool:
"""Revoke a cluster token."""
conn = await self.get_connection()
try:
result = await conn.execute("""
UPDATE cluster_tokens
SET is_active = false
WHERE token = $1
""", token)
return result != "UPDATE 0"
except Exception as e:
logger.error(f"Failed to revoke token: {e}")
return False
# Node Registration
async def register_node(self, request: RegistrationRequest, client_ip: str) -> Dict[str, Any]:
"""Register a new cluster node."""
conn = await self.get_connection()
try:
# Validate token
token_info = await self.validate_token(request.token, client_ip)
if not token_info:
await self._log_registration_attempt(
client_ip, request.token, request.node_id,
request.hostname, False, "Invalid or expired token", request.metadata
)
raise ValueError("Invalid or expired registration token")
# Extract system info components
system_info = request.system_info or {}
cpu_info = system_info.get('cpu', {})
memory_info = system_info.get('memory', {})
gpu_info = system_info.get('gpu', {})
disk_info = system_info.get('disk', {})
os_info = system_info.get('os', {})
platform_info = system_info.get('platform', {})
# Register or update node
result = await conn.fetchrow("""
INSERT INTO cluster_nodes (
node_id, hostname, ip_address, registration_token,
cpu_info, memory_info, gpu_info, disk_info, os_info, platform_info,
services, capabilities, ports, client_version, registration_metadata
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
ON CONFLICT (node_id) DO UPDATE SET
hostname = EXCLUDED.hostname,
ip_address = EXCLUDED.ip_address,
cpu_info = EXCLUDED.cpu_info,
memory_info = EXCLUDED.memory_info,
gpu_info = EXCLUDED.gpu_info,
disk_info = EXCLUDED.disk_info,
os_info = EXCLUDED.os_info,
platform_info = EXCLUDED.platform_info,
services = EXCLUDED.services,
capabilities = EXCLUDED.capabilities,
ports = EXCLUDED.ports,
client_version = EXCLUDED.client_version,
registration_metadata = EXCLUDED.registration_metadata,
status = 'online',
last_heartbeat = NOW()
RETURNING id, node_id, hostname, ip_address, first_registered
""",
request.node_id, request.hostname, request.ip_address, request.token,
json.dumps(cpu_info) if cpu_info else None,
json.dumps(memory_info) if memory_info else None,
json.dumps(gpu_info) if gpu_info else None,
json.dumps(disk_info) if disk_info else None,
json.dumps(os_info) if os_info else None,
json.dumps(platform_info) if platform_info else None,
json.dumps(request.services) if request.services else None,
json.dumps(request.capabilities) if request.capabilities else None,
json.dumps(request.ports) if request.ports else None,
request.client_version,
json.dumps(request.metadata) if request.metadata else None
)
logger.info(f"Node {request.node_id} registered successfully from {client_ip}")
return {
"node_id": result["node_id"],
"registration_status": "success",
"heartbeat_interval": 30, # seconds
"registered_at": result["first_registered"].isoformat(),
"cluster_info": {
"coordinator_version": "1.0.0",
"features": ["heartbeat", "dynamic_scaling", "service_discovery"]
}
}
except Exception as e:
logger.error(f"Failed to register node {request.node_id}: {e}")
await self._log_registration_attempt(
client_ip, request.token, request.node_id,
request.hostname, False, str(e), request.metadata
)
raise
async def update_heartbeat(self, request: HeartbeatRequest) -> Dict[str, Any]:
"""Update node heartbeat and metrics."""
conn = await self.get_connection()
try:
# Update node status and heartbeat
result = await conn.fetchrow("""
UPDATE cluster_nodes
SET status = $2, last_heartbeat = NOW()
WHERE node_id = $1
RETURNING node_id, status, last_heartbeat
""", request.node_id, request.status)
if not result:
raise ValueError(f"Node {request.node_id} not found")
# Record heartbeat metrics
await conn.execute("""
INSERT INTO node_heartbeats (
node_id, cpu_usage, memory_usage, disk_usage, gpu_usage,
services_status, network_metrics, custom_metrics
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
""",
request.node_id, request.cpu_usage, request.memory_usage,
request.disk_usage, request.gpu_usage,
json.dumps(request.services_status) if request.services_status else None,
json.dumps(request.network_metrics) if request.network_metrics else None,
json.dumps(request.custom_metrics) if request.custom_metrics else None
)
return {
"node_id": result["node_id"],
"status": result["status"],
"heartbeat_received": result["last_heartbeat"].isoformat(),
"next_heartbeat_in": 30, # seconds
"commands": [] # Future: cluster management commands
}
except Exception as e:
logger.error(f"Failed to update heartbeat for {request.node_id}: {e}")
raise
async def get_registered_nodes(self, include_offline: bool = True) -> List[ClusterNode]:
"""Get all registered cluster nodes."""
conn = await self.get_connection()
try:
query = """
SELECT id, node_id, hostname, ip_address, registration_token,
cpu_info, memory_info, gpu_info, disk_info, os_info, platform_info,
status, last_heartbeat, first_registered,
services, capabilities, ports, client_version, registration_metadata
FROM cluster_nodes
"""
if not include_offline:
query += " WHERE status != 'offline'"
query += " ORDER BY first_registered DESC"
results = await conn.fetch(query)
nodes = []
for result in results:
node_dict = dict(result)
# Parse JSON fields
for json_field in ['cpu_info', 'memory_info', 'gpu_info', 'disk_info',
'os_info', 'platform_info', 'services', 'capabilities',
'ports', 'registration_metadata']:
if node_dict[json_field]:
try:
node_dict[json_field] = json.loads(node_dict[json_field])
except json.JSONDecodeError:
node_dict[json_field] = None
nodes.append(ClusterNode(**node_dict))
return nodes
except Exception as e:
logger.error(f"Failed to get registered nodes: {e}")
raise
async def get_node_details(self, node_id: str) -> Optional[ClusterNode]:
"""Get detailed information about a specific node."""
nodes = await self.get_registered_nodes()
return next((node for node in nodes if node.node_id == node_id), None)
async def remove_node(self, node_id: str) -> bool:
"""Remove a node from the cluster."""
conn = await self.get_connection()
try:
result = await conn.execute("""
DELETE FROM cluster_nodes WHERE node_id = $1
""", node_id)
if result != "DELETE 0":
logger.info(f"Node {node_id} removed from cluster")
return True
return False
except Exception as e:
logger.error(f"Failed to remove node {node_id}: {e}")
return False
# Maintenance and Monitoring
async def cleanup_offline_nodes(self, offline_threshold_minutes: int = 10) -> int:
"""Mark nodes as offline if they haven't sent heartbeats."""
conn = await self.get_connection()
try:
result = await conn.execute("""
UPDATE cluster_nodes
SET status = 'offline'
WHERE status = 'online'
AND last_heartbeat < NOW() - INTERVAL '%s minutes'
""" % offline_threshold_minutes)
# Extract number from result like "UPDATE 3"
count = int(result.split()[-1]) if result.split()[-1].isdigit() else 0
if count > 0:
logger.info(f"Marked {count} nodes as offline due to missing heartbeats")
return count
except Exception as e:
logger.error(f"Failed to cleanup offline nodes: {e}")
return 0
async def cleanup_old_heartbeats(self, retention_days: int = 30) -> int:
"""Remove old heartbeat data for storage management."""
conn = await self.get_connection()
try:
result = await conn.execute("""
DELETE FROM node_heartbeats
WHERE heartbeat_time < NOW() - INTERVAL '%s days'
""" % retention_days)
count = int(result.split()[-1]) if result.split()[-1].isdigit() else 0
if count > 0:
logger.info(f"Cleaned up {count} old heartbeat records")
return count
except Exception as e:
logger.error(f"Failed to cleanup old heartbeats: {e}")
return 0
async def _log_registration_attempt(
self,
ip_address: str,
token: str,
node_id: str,
hostname: str,
success: bool,
failure_reason: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None
):
"""Log registration attempts for security monitoring."""
conn = await self.get_connection()
try:
await conn.execute("""
INSERT INTO node_registration_attempts (
ip_address, token_used, node_id, hostname,
success, failure_reason, request_metadata
) VALUES ($1, $2, $3, $4, $5, $6, $7)
""", ip_address, token, node_id, hostname, success, failure_reason,
json.dumps(metadata) if metadata else None)
except Exception as e:
logger.error(f"Failed to log registration attempt: {e}")
async def get_cluster_statistics(self) -> Dict[str, Any]:
"""Get cluster statistics and health metrics."""
conn = await self.get_connection()
try:
# Node statistics
node_stats = await conn.fetchrow("""
SELECT
COUNT(*) as total_nodes,
COUNT(*) FILTER (WHERE status = 'online') as online_nodes,
COUNT(*) FILTER (WHERE status = 'offline') as offline_nodes,
COUNT(*) FILTER (WHERE status = 'maintenance') as maintenance_nodes
FROM cluster_nodes
""")
# Token statistics
token_stats = await conn.fetchrow("""
SELECT
COUNT(*) as total_tokens,
COUNT(*) FILTER (WHERE is_active = true) as active_tokens,
COUNT(*) FILTER (WHERE expires_at IS NOT NULL AND expires_at < NOW()) as expired_tokens
FROM cluster_tokens
""")
return {
"cluster_health": {
"total_nodes": node_stats["total_nodes"],
"online_nodes": node_stats["online_nodes"],
"offline_nodes": node_stats["offline_nodes"],
"maintenance_nodes": node_stats["maintenance_nodes"],
"health_percentage": (node_stats["online_nodes"] / max(node_stats["total_nodes"], 1)) * 100
},
"token_management": {
"total_tokens": token_stats["total_tokens"],
"active_tokens": token_stats["active_tokens"],
"expired_tokens": token_stats["expired_tokens"]
},
"last_updated": datetime.now().isoformat()
}
except Exception as e:
logger.error(f"Failed to get cluster statistics: {e}")
return {
"error": str(e),
"last_updated": datetime.now().isoformat()
}
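
For reference, a minimal end-to-end sketch of driving the service above: issue a token, register a node, and send one heartbeat. It assumes the cluster_tokens / cluster_nodes / node_registration_attempts / node_heartbeats tables already exist and that DATABASE_URL points at a reachable PostgreSQL instance; all literal values are placeholders.

import asyncio

DATABASE_URL = "postgresql://hive:hive@localhost:5432/hive"  # placeholder DSN


async def demo() -> None:
    service = ClusterRegistrationService(DATABASE_URL)
    try:
        # 1. Operator issues a scoped registration token
        token = await service.generate_cluster_token(
            description="lab onboarding",
            created_by_user_id="admin",
            expires_in_days=7,
            max_registrations=5,
            allowed_ip_ranges=["192.168.1.0/24"],
        )
        # 2. A node presents that token to join the cluster
        result = await service.register_node(
            RegistrationRequest(
                token=token.token,
                node_id="ironwood",
                hostname="ironwood",
                ip_address="192.168.1.113",
                system_info={"cpu": {"model": "AMD Threadripper 2920X"}},
            ),
            client_ip="192.168.1.113",
        )
        # 3. The node then reports in on the returned heartbeat_interval
        await service.update_heartbeat(
            HeartbeatRequest(node_id=result["node_id"], cpu_usage=12.5)
        )
    finally:
        await service.close_connection()

asyncio.run(demo())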

View File

@@ -26,7 +26,7 @@ class ClusterService:
"ip": "192.168.1.113",
"hostname": "ironwood",
"role": "worker",
"gpu": "NVIDIA RTX 3070",
"gpu": "NVIDIA RTX 2080S",
"memory": "128GB",
"cpu": "AMD Threadripper 2920X",
"ollama_port": 11434,
@@ -57,6 +57,66 @@ class ClusterService:
self.n8n_api_base = "https://n8n.home.deepblack.cloud/api/v1"
self.n8n_api_key = self._get_n8n_api_key()
def _get_live_hardware_info(self, hostname: str, ip: str) -> Dict[str, str]:
"""Get live hardware information from a remote node via SSH."""
hardware = {
"cpu": "Unknown",
"memory": "Unknown",
"gpu": "Unknown"
}
try:
# Try to get GPU info via SSH
print(f"🔍 SSH GPU command for {hostname}: ssh tony@{ip} 'nvidia-smi || lspci | grep -i vga'")
gpu_result = subprocess.run([
"ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
f"tony@{ip}", "nvidia-smi --query-gpu=name --format=csv,noheader,nounits || lspci | grep -i 'vga\\|3d\\|display'"
], capture_output=True, text=True, timeout=10)
print(f"📊 GPU command result for {hostname}: returncode={gpu_result.returncode}, stdout='{gpu_result.stdout.strip()}', stderr='{gpu_result.stderr.strip()}'")
if gpu_result.returncode == 0 and gpu_result.stdout.strip():
gpu_info = gpu_result.stdout.strip().split('\n')[0]
if "NVIDIA" in gpu_info or "RTX" in gpu_info or "GTX" in gpu_info:
hardware["gpu"] = gpu_info.strip()
elif "VGA" in gpu_info or "Display" in gpu_info:
# Parse lspci output for GPU info
if "NVIDIA" in gpu_info:
parts = gpu_info.split("NVIDIA")
if len(parts) > 1:
gpu_name = "NVIDIA" + parts[1].split('[')[0].strip()
hardware["gpu"] = gpu_name
elif "AMD" in gpu_info or "Radeon" in gpu_info:
parts = gpu_info.split(":")
if len(parts) > 2:
gpu_name = parts[2].strip()
hardware["gpu"] = gpu_name
# Try to get memory info via SSH
mem_result = subprocess.run([
"ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
f"tony@{ip}", "free -h | grep '^Mem:' | awk '{print $2}'"
], capture_output=True, text=True, timeout=10)
if mem_result.returncode == 0 and mem_result.stdout.strip():
memory_info = mem_result.stdout.strip()
hardware["memory"] = memory_info
# Try to get CPU info via SSH
cpu_result = subprocess.run([
"ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
f"tony@{ip}", "lscpu | grep 'Model name:' | cut -d':' -f2- | xargs"
], capture_output=True, text=True, timeout=10)
if cpu_result.returncode == 0 and cpu_result.stdout.strip():
cpu_info = cpu_result.stdout.strip()
hardware["cpu"] = cpu_info
except Exception as e:
print(f"Error getting live hardware info for {hostname}: {e}")
return hardware
def _get_n8n_api_key(self) -> Optional[str]:
"""Get n8n API key from secrets."""
try:
@@ -136,17 +196,35 @@ class ClusterService:
except Exception:
pass
# Try to get live hardware info if node is online
hardware_info = {
"cpu": node_info["cpu"],
"memory": node_info["memory"],
"gpu": node_info["gpu"]
}
if status == "online":
try:
print(f"🔍 Getting live hardware info for {node_id} ({node_info['ip']})")
live_hardware = self._get_live_hardware_info(node_info["hostname"], node_info["ip"])
print(f"📊 Live hardware detected for {node_id}: {live_hardware}")
# Use live data if available, fallback to hardcoded values
for key in ["cpu", "memory", "gpu"]:
if live_hardware[key] != "Unknown":
print(f"✅ Using live {key} for {node_id}: {live_hardware[key]}")
hardware_info[key] = live_hardware[key]
else:
print(f"⚠️ Using fallback {key} for {node_id}: {hardware_info[key]}")
except Exception as e:
print(f"❌ Failed to get live hardware info for {node_id}: {e}")
return {
"id": node_id,
"hostname": node_info["hostname"],
"ip": node_info["ip"],
"status": status,
"role": node_info["role"],
"hardware": {
"cpu": node_info["cpu"],
"memory": node_info["memory"],
"gpu": node_info["gpu"]
},
"hardware": hardware_info,
"model_count": model_count,
"models": [{"name": m["name"], "size": m.get("size", 0)} for m in models],
"metrics": {

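The merge step above prefers live SSH readings but only when the probe actually returned a value; the hardcoded entries remain the fallback. A small standalone illustration of that precedence rule (merge_hardware is a hypothetical helper, not part of ClusterService):

# Hypothetical illustration of the fallback rule used above: live values win
# only when the SSH probe produced something other than "Unknown".
def merge_hardware(fallback: dict, live: dict) -> dict:
    merged = dict(fallback)
    for key in ("cpu", "memory", "gpu"):
        if live.get(key, "Unknown") != "Unknown":
            merged[key] = live[key]
    return merged

print(merge_hardware(
    {"cpu": "AMD Threadripper 2920X", "memory": "128GB", "gpu": "NVIDIA RTX 2080S"},
    {"cpu": "Unknown", "memory": "125Gi", "gpu": "NVIDIA GeForce RTX 2080 SUPER"},
))
# live memory and gpu override the fallbacks; cpu keeps the hardcoded value
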
View File

@@ -689,4 +689,58 @@ class ProjectService:
# Handle escalation status
if status == "escalated":
print(f"Task escalated for human review: {metadata}")
# TODO: Trigger N8N webhook for human escalation
# TODO: Trigger N8N webhook for human escalation
def update_project(self, project_id: str, project_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""Update a project configuration."""
try:
# For now, projects are read-only from the filesystem
# This could be extended to update project metadata files
project = self.get_project_by_id(project_id)
if not project:
return None
# Update project metadata in a local JSON file if needed
# For now, just return the existing project as projects are filesystem-based
print(f"Project update request for {project_id}: {project_data}")
return project
except Exception as e:
print(f"Error updating project {project_id}: {e}")
return None
def create_project(self, project_data: Dict[str, Any]) -> Dict[str, Any]:
"""Create a new project."""
try:
# For now, projects are filesystem-based and read-only
# This could be extended to create new project directories
print(f"Project creation request: {project_data}")
# Return a mock project for now
project_id = project_data.get("name", "new-project").lower().replace(" ", "-")
return {
"id": project_id,
"name": project_data.get("name", "New Project"),
"description": project_data.get("description", ""),
"status": "created",
"created_at": datetime.now().isoformat(),
"updated_at": datetime.now().isoformat()
}
except Exception as e:
print(f"Error creating project: {e}")
raise
def delete_project(self, project_id: str) -> bool:
"""Delete a project."""
try:
# For now, projects are filesystem-based and read-only
# This could be extended to archive or remove project directories
project = self.get_project_by_id(project_id)
if not project:
return False
print(f"Project deletion request for {project_id}")
# Return success for now (projects are read-only)
return True
except Exception as e:
print(f"Error deleting project {project_id}: {e}")
return False
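
The three methods above are intentionally thin while projects remain filesystem-backed: they log the request and return the existing record, a mock record, or a success flag. A short sketch of how a caller sees that behaviour today (constructor arguments, if any, are assumed):

# Illustrative only: exercises the placeholder CRUD methods added above.
service = ProjectService()  # assumes the service's usual constructor

created = service.create_project({"name": "Edge Telemetry", "description": "demo"})
print(created["id"])                         # "edge-telemetry"

updated = service.update_project(created["id"], {"description": "revised"})
print(updated)                               # existing project echoed back, or None if unknown

print(service.delete_project("missing-id"))  # False when the project cannot be found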