WIP: Save current work before CHORUS rebrand
- Agent roles integration progress
- Various backend and frontend updates
- Storybook cache cleanup

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
backend/Dockerfile.dev (new file, 34 lines)
@@ -0,0 +1,34 @@
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y \
    curl \
    git \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir watchdog  # For hot reload

# Copy source code
COPY . .

# Create non-root user
RUN useradd -m -u 1001 appuser && chown -R appuser:appuser /app
USER appuser

# Expose port
EXPOSE 8000

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=10s --retries=3 \
    CMD curl -f http://localhost:8000/api/health || exit 1

# Start development server with hot reload
CMD ["python", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
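The HEALTHCHECK above polls `/api/health` with `curl -f`, so the backend only needs to answer with a 2xx status. A minimal sketch of such a route (hypothetical handler; the actual backend may already expose one with a richer payload):

```python
# Hypothetical sketch of a health route matching the Dockerfile HEALTHCHECK.
from fastapi import APIRouter

router = APIRouter()

@router.get("/api/health")
async def health() -> dict:
    # curl -f only cares about the status code; the body is informational.
    return {"status": "ok"}
```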
@@ -15,6 +15,8 @@ Key Features:

from fastapi import APIRouter, HTTPException, Request, Depends, status
from typing import List, Dict, Any
import time
import logging
from ..models.agent import Agent
from ..models.responses import (
    AgentListResponse,
@@ -29,6 +31,9 @@ router = APIRouter()

from app.core.database import SessionLocal
from app.models.agent import Agent as ORMAgent
from ..services.agent_service import AgentType

logger = logging.getLogger(__name__)


@router.get(
@@ -384,4 +389,244 @@ async def unregister_agent(
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to unregister agent: {str(e)}"
        )


@router.post(
    "/agents/heartbeat",
    status_code=status.HTTP_200_OK,
    summary="Agent heartbeat update",
    description="""
    Update agent status and maintain registration through periodic heartbeat.

    This endpoint allows agents to:
    - Confirm they are still online and responsive
    - Update their current status and metrics
    - Report any capability or configuration changes
    - Maintain their registration in the cluster

    Agents should call this endpoint every 30-60 seconds to maintain
    their active status in the Hive cluster.
    """,
    responses={
        200: {"description": "Heartbeat received successfully"},
        404: {"model": ErrorResponse, "description": "Agent not registered"},
        400: {"model": ErrorResponse, "description": "Invalid heartbeat data"}
    }
)
async def agent_heartbeat(
    heartbeat_data: Dict[str, Any],
    request: Request
):
    """
    Process agent heartbeat to maintain registration.

    Args:
        heartbeat_data: Agent status and metrics data
        request: FastAPI request object

    Returns:
        Success confirmation and any coordinator updates
    """
    agent_id = heartbeat_data.get("agent_id")
    if not agent_id:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Missing agent_id in heartbeat data"
        )

    # Access coordinator
    hive_coordinator = getattr(request.app.state, 'hive_coordinator', None)
    if not hive_coordinator:
        from ..main import unified_coordinator
        hive_coordinator = unified_coordinator

    if not hive_coordinator:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coordinator service unavailable"
        )

    try:
        # Update agent heartbeat timestamp
        agent_service = hive_coordinator.agent_service
        if agent_service:
            agent_service.update_agent_heartbeat(agent_id)

        # Update current tasks if provided - use raw SQL to avoid role column
        if "current_tasks" in heartbeat_data:
            current_tasks = heartbeat_data["current_tasks"]
            try:
                with SessionLocal() as db:
                    from sqlalchemy import text
                    db.execute(text(
                        "UPDATE agents SET current_tasks = :current_tasks, last_seen = NOW() WHERE id = :agent_id"
                    ), {
                        "current_tasks": current_tasks,
                        "agent_id": agent_id
                    })
                    db.commit()
            except Exception as e:
                logger.warning(f"Could not update agent tasks: {e}")

        return {
            "status": "success",
            "message": f"Heartbeat received from agent '{agent_id}'",
            "timestamp": time.time()
        }

    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to process heartbeat: {str(e)}"
        )

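On the agent side, the periodic call could look like the following sketch. The `/api` prefix and base URL are assumptions (matching how other routers in this commit are mounted); the payload keys mirror what `agent_heartbeat` reads.

```python
# Hypothetical agent-side heartbeat loop against POST /api/agents/heartbeat.
import asyncio
import httpx

async def heartbeat_loop(agent_id: str, hive_url: str = "http://localhost:8000") -> None:
    async with httpx.AsyncClient(timeout=10.0) as client:
        while True:
            payload = {"agent_id": agent_id, "current_tasks": 0}  # keys read by agent_heartbeat
            resp = await client.post(f"{hive_url}/api/agents/heartbeat", json=payload)
            resp.raise_for_status()
            await asyncio.sleep(45)  # docstring suggests every 30-60 seconds

# asyncio.run(heartbeat_loop("walnut-bzzz"))
```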
@router.post(
    "/agents/auto-register",
    response_model=AgentRegistrationResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Automatic agent registration",
    description="""
    Register an agent automatically with capability detection.

    This endpoint is designed for Bzzz agents running as systemd services
    to automatically register themselves with the Hive coordinator.

    Features:
    - Automatic capability detection based on available models
    - Network discovery support
    - Retry-friendly for service startup scenarios
    - Health validation before registration
    """,
    responses={
        201: {"description": "Agent auto-registered successfully"},
        400: {"model": ErrorResponse, "description": "Invalid agent configuration"},
        409: {"model": ErrorResponse, "description": "Agent already registered"},
        503: {"model": ErrorResponse, "description": "Agent endpoint unreachable"}
    }
)
async def auto_register_agent(
    agent_data: Dict[str, Any],
    request: Request
) -> AgentRegistrationResponse:
    """
    Automatically register a Bzzz agent with the Hive coordinator.

    Args:
        agent_data: Agent configuration including endpoint, models, etc.
        request: FastAPI request object

    Returns:
        AgentRegistrationResponse: Registration confirmation
    """
    # Extract required fields
    agent_id = agent_data.get("agent_id")
    endpoint = agent_data.get("endpoint")
    hostname = agent_data.get("hostname")

    if not agent_id or not endpoint:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="Missing required fields: agent_id, endpoint"
        )

    # Access coordinator
    hive_coordinator = getattr(request.app.state, 'hive_coordinator', None)
    if not hive_coordinator:
        from ..main import unified_coordinator
        hive_coordinator = unified_coordinator

    if not hive_coordinator:
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail="Coordinator service unavailable"
        )

    try:
        # Check if agent already exists - use basic query to avoid role column
        try:
            with SessionLocal() as db:
                from sqlalchemy import text
                existing_agent = db.execute(text(
                    "SELECT id, endpoint FROM agents WHERE id = :agent_id LIMIT 1"
                ), {"agent_id": agent_id}).fetchone()
                if existing_agent:
                    # Update existing agent
                    db.execute(text(
                        "UPDATE agents SET endpoint = :endpoint, last_seen = NOW() WHERE id = :agent_id"
                    ), {"endpoint": endpoint, "agent_id": agent_id})
                    db.commit()

                    return AgentRegistrationResponse(
                        agent_id=agent_id,
                        endpoint=endpoint,
                        message=f"Agent '{agent_id}' registration updated successfully"
                    )
        except Exception as e:
            logger.warning(f"Could not check existing agent: {e}")

        # Detect capabilities and models
        models = agent_data.get("models", [])
        if not models:
            # Try to detect models from endpoint
            try:
                import aiohttp
                async with aiohttp.ClientSession() as session:
                    async with session.get(f"{endpoint}/api/tags", timeout=aiohttp.ClientTimeout(total=5)) as response:
                        if response.status == 200:
                            tags_data = await response.json()
                            models = [model["name"] for model in tags_data.get("models", [])]
            except Exception as e:
                logger.warning(f"Could not detect models for {agent_id}: {e}")

        # Determine specialty based on models or hostname
        specialty = AgentType.GENERAL_AI  # Default
        if "codellama" in str(models).lower() or "code" in hostname.lower():
            specialty = AgentType.KERNEL_DEV
        elif "gemma" in str(models).lower():
            specialty = AgentType.PYTORCH_DEV
        elif any(model for model in models if "llama" in model.lower()):
            specialty = AgentType.GENERAL_AI

        # Insert agent directly into database
        try:
            with SessionLocal() as db:
                from sqlalchemy import text
                # Insert new agent using raw SQL to avoid role column issues
                db.execute(text("""
                    INSERT INTO agents (id, name, endpoint, model, specialty, max_concurrent, current_tasks, status, created_at, last_seen)
                    VALUES (:agent_id, :name, :endpoint, :model, :specialty, :max_concurrent, 0, 'active', NOW(), NOW())
                    ON CONFLICT (id) DO UPDATE SET
                        endpoint = EXCLUDED.endpoint,
                        model = EXCLUDED.model,
                        specialty = EXCLUDED.specialty,
                        max_concurrent = EXCLUDED.max_concurrent,
                        last_seen = NOW()
                """), {
                    "agent_id": agent_id,
                    "name": agent_id,  # Use agent_id as name
                    "endpoint": endpoint,
                    "model": models[0] if models else "unknown",
                    "specialty": specialty.value,
                    "max_concurrent": agent_data.get("max_concurrent", 2)
                })
                db.commit()

                return AgentRegistrationResponse(
                    agent_id=agent_id,
                    endpoint=endpoint,
                    message=f"Agent '{agent_id}' auto-registered successfully with specialty '{specialty.value}'"
                )
        except Exception as e:
            logger.error(f"Database insert failed: {e}")
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Failed to register agent in database: {str(e)}"
            )

    except Exception as e:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Failed to auto-register agent: {str(e)}"
        )
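For reference, a minimal auto-registration payload matching the fields `auto_register_agent` reads; values are illustrative, and only `agent_id` and `endpoint` are required by the handler.

```python
# Illustrative body for POST /api/agents/auto-register (field names taken from the handler above).
example_registration = {
    "agent_id": "walnut-bzzz",                # required
    "endpoint": "http://walnut.local:11434",  # required; probed at {endpoint}/api/tags if models omitted
    "hostname": "walnut",
    "models": ["codellama:34b"],              # optional; drives specialty detection
    "max_concurrent": 2,
}
```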
backend/app/api/bzzz_logs.py (new file, 287 lines)
@@ -0,0 +1,287 @@
"""
Bzzz hypercore/hyperswarm log streaming API endpoints.
Provides real-time access to agent communication logs from the Bzzz network.
"""

from fastapi import APIRouter, WebSocket, WebSocketDisconnect, HTTPException, Query
from fastapi.responses import StreamingResponse
from typing import List, Optional, Dict, Any
import asyncio
import json
import logging
import httpx
import time
from datetime import datetime, timedelta

router = APIRouter()
logger = logging.getLogger(__name__)

# Keep track of active WebSocket connections
active_connections: List[WebSocket] = []

class BzzzLogEntry:
    """Represents a Bzzz hypercore log entry"""
    def __init__(self, data: Dict[str, Any]):
        self.index = data.get("index", 0)
        self.timestamp = data.get("timestamp", "")
        self.author = data.get("author", "")
        self.log_type = data.get("type", "")
        self.message_data = data.get("data", {})
        self.hash_value = data.get("hash", "")
        self.prev_hash = data.get("prev_hash", "")

    def to_chat_message(self) -> Dict[str, Any]:
        """Convert hypercore log entry to chat message format"""
        # Extract message details from the log data
        msg_data = self.message_data

        return {
            "id": f"log-{self.index}",
            "senderId": msg_data.get("from_short", self.author),
            "senderName": msg_data.get("from_short", self.author),
            "content": self._format_message_content(),
            "timestamp": self.timestamp,
            "messageType": self._determine_message_type(),
            "channel": msg_data.get("topic", "unknown"),
            "swarmId": f"swarm-{msg_data.get('topic', 'unknown')}",
            "isDelivered": True,
            "isRead": True,
            "logType": self.log_type,
            "hash": self.hash_value
        }

    def _format_message_content(self) -> str:
        """Format the log entry into a readable message"""
        msg_data = self.message_data
        message_type = msg_data.get("message_type", self.log_type)

        if message_type == "availability_broadcast":
            status = msg_data.get("data", {}).get("status", "unknown")
            current_tasks = msg_data.get("data", {}).get("current_tasks", 0)
            max_tasks = msg_data.get("data", {}).get("max_tasks", 0)
            return f"Status: {status} ({current_tasks}/{max_tasks} tasks)"

        elif message_type == "capability_broadcast":
            capabilities = msg_data.get("data", {}).get("capabilities", [])
            models = msg_data.get("data", {}).get("models", [])
            return f"Updated capabilities: {', '.join(capabilities[:3])}{'...' if len(capabilities) > 3 else ''}"

        elif message_type == "task_announced":
            task_data = msg_data.get("data", {})
            return f"Task announced: {task_data.get('title', 'Unknown task')}"

        elif message_type == "task_claimed":
            task_data = msg_data.get("data", {})
            return f"Task claimed: {task_data.get('title', 'Unknown task')}"

        elif message_type == "role_announcement":
            role = msg_data.get("data", {}).get("role", "unknown")
            return f"Role announcement: {role}"

        elif message_type == "collaboration":
            return f"Collaboration: {msg_data.get('data', {}).get('content', 'Agent discussion')}"

        elif self.log_type == "peer_joined":
            return "Agent joined the network"

        elif self.log_type == "peer_left":
            return "Agent left the network"

        else:
            # Generic fallback
            return f"{message_type}: {json.dumps(msg_data.get('data', {}))[:100]}{'...' if len(str(msg_data.get('data', {}))) > 100 else ''}"

    def _determine_message_type(self) -> str:
        """Determine if this is a sent, received, or system message"""
        msg_data = self.message_data

        # System messages
        if self.log_type in ["peer_joined", "peer_left", "network_event"]:
            return "system"

        # For now, treat all as received since we're monitoring
        # In a real implementation, you'd check if the author is the current node
        return "received"

class BzzzLogStreamer:
    """Manages streaming of Bzzz hypercore logs"""

    def __init__(self):
        self.agent_endpoints = {}
        self.last_indices = {}  # Track last seen index per agent

    async def discover_bzzz_agents(self) -> List[Dict[str, str]]:
        """Discover active Bzzz agents from the Hive agents API"""
        try:
            # This would typically query the actual agents database
            # For now, return known endpoints based on cluster nodes
            return [
                {"agent_id": "acacia-bzzz", "endpoint": "http://acacia.local:8080"},
                {"agent_id": "walnut-bzzz", "endpoint": "http://walnut.local:8080"},
                {"agent_id": "ironwood-bzzz", "endpoint": "http://ironwood.local:8080"},
                {"agent_id": "rosewood-bzzz", "endpoint": "http://rosewood.local:8080"},
            ]
        except Exception as e:
            logger.error(f"Failed to discover Bzzz agents: {e}")
            return []

    async def fetch_agent_logs(self, agent_endpoint: str, since_index: int = 0) -> List[BzzzLogEntry]:
        """Fetch hypercore logs from a specific Bzzz agent"""
        try:
            # This would call the actual Bzzz agent's HTTP API
            # For now, return mock data structure that matches hypercore format
            async with httpx.AsyncClient() as client:
                response = await client.get(
                    f"{agent_endpoint}/api/hypercore/logs",
                    params={"since": since_index},
                    timeout=5.0
                )

                if response.status_code == 200:
                    logs_data = response.json()
                    return [BzzzLogEntry(log) for log in logs_data.get("entries", [])]
                else:
                    logger.warning(f"Failed to fetch logs from {agent_endpoint}: {response.status_code}")
                    return []

        except httpx.ConnectError:
            logger.debug(f"Agent at {agent_endpoint} is not reachable")
            return []
        except Exception as e:
            logger.error(f"Error fetching logs from {agent_endpoint}: {e}")
            return []

    async def get_recent_logs(self, limit: int = 100) -> List[Dict[str, Any]]:
        """Get recent logs from all agents"""
        agents = await self.discover_bzzz_agents()
        all_messages = []

        for agent in agents:
            logs = await self.fetch_agent_logs(agent["endpoint"])
            for log in logs[-limit:]:  # Get recent entries
                message = log.to_chat_message()
                message["agent_id"] = agent["agent_id"]
                all_messages.append(message)

        # Sort by timestamp
        all_messages.sort(key=lambda x: x["timestamp"])
        return all_messages[-limit:]

    async def stream_new_logs(self):
        """Continuously stream new logs from all agents"""
        while True:
            try:
                agents = await self.discover_bzzz_agents()
                new_messages = []

                for agent in agents:
                    agent_id = agent["agent_id"]
                    last_index = self.last_indices.get(agent_id, 0)

                    logs = await self.fetch_agent_logs(agent["endpoint"], last_index)

                    for log in logs:
                        if log.index > last_index:
                            message = log.to_chat_message()
                            message["agent_id"] = agent_id
                            new_messages.append(message)
                            self.last_indices[agent_id] = log.index

                # Send new messages to all connected WebSocket clients
                if new_messages and active_connections:
                    message_data = {
                        "type": "new_messages",
                        "messages": new_messages
                    }

                    # Remove disconnected clients
                    disconnected = []
                    for connection in active_connections:
                        try:
                            await connection.send_text(json.dumps(message_data))
                        except:
                            disconnected.append(connection)

                    for conn in disconnected:
                        active_connections.remove(conn)

                await asyncio.sleep(2)  # Poll every 2 seconds

            except Exception as e:
                logger.error(f"Error in log streaming: {e}")
                await asyncio.sleep(5)

# Global log streamer instance
log_streamer = BzzzLogStreamer()

@router.get("/bzzz/logs")
async def get_bzzz_logs(
    limit: int = Query(default=100, le=1000),
    agent_id: Optional[str] = None
):
    """Get recent Bzzz hypercore logs"""
    try:
        logs = await log_streamer.get_recent_logs(limit)

        if agent_id:
            logs = [log for log in logs if log.get("agent_id") == agent_id]

        return {
            "logs": logs,
            "count": len(logs),
            "timestamp": datetime.utcnow().isoformat()
        }
    except Exception as e:
        logger.error(f"Error fetching Bzzz logs: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@router.get("/bzzz/agents")
async def get_bzzz_agents():
    """Get list of discovered Bzzz agents"""
    try:
        agents = await log_streamer.discover_bzzz_agents()
        return {"agents": agents}
    except Exception as e:
        logger.error(f"Error discovering Bzzz agents: {e}")
        raise HTTPException(status_code=500, detail=str(e))

@router.websocket("/bzzz/logs/stream")
async def websocket_bzzz_logs(websocket: WebSocket):
    """WebSocket endpoint for real-time Bzzz log streaming"""
    await websocket.accept()
    active_connections.append(websocket)

    try:
        # Send initial recent logs
        recent_logs = await log_streamer.get_recent_logs(50)
        await websocket.send_text(json.dumps({
            "type": "initial_logs",
            "messages": recent_logs
        }))

        # Keep connection alive and handle client messages
        while True:
            try:
                # Wait for client messages (ping, filters, etc.)
                message = await asyncio.wait_for(websocket.receive_text(), timeout=30)
                client_data = json.loads(message)

                if client_data.get("type") == "ping":
                    await websocket.send_text(json.dumps({"type": "pong"}))

            except asyncio.TimeoutError:
                # Send periodic heartbeat
                await websocket.send_text(json.dumps({"type": "heartbeat"}))

    except WebSocketDisconnect:
        active_connections.remove(websocket)
    except Exception as e:
        logger.error(f"WebSocket error: {e}")
        if websocket in active_connections:
            active_connections.remove(websocket)

# Start the log streaming background task
@router.on_event("startup")
async def start_log_streaming():
    """Start the background log streaming task"""
    asyncio.create_task(log_streamer.stream_new_logs())
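A sketch of a consumer for the `/bzzz/logs/stream` WebSocket above, assuming the `/api` prefix used elsewhere in the backend and the third-party `websockets` package; the message types mirror the handler (`initial_logs`, `new_messages`, `heartbeat`, `ping`/`pong`).

```python
# Hypothetical client for the Bzzz log stream (URL prefix and port are assumptions).
import asyncio
import json
import websockets  # third-party package, assumed available

async def follow_bzzz_logs(url: str = "ws://localhost:8000/api/bzzz/logs/stream") -> None:
    async with websockets.connect(url) as ws:
        async for raw in ws:
            event = json.loads(raw)
            if event.get("type") in ("initial_logs", "new_messages"):
                for msg in event.get("messages", []):
                    print(msg["timestamp"], msg["senderName"], msg["content"])
            elif event.get("type") == "heartbeat":
                # Answer the server heartbeat so the connection stays active.
                await ws.send(json.dumps({"type": "ping"}))

# asyncio.run(follow_bzzz_logs())
```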
backend/app/api/cluster_registration.py (new file, 434 lines)
@@ -0,0 +1,434 @@
"""
Cluster Registration API endpoints
Handles registration-based cluster management for Hive-Bzzz integration.
"""
from fastapi import APIRouter, HTTPException, Request, Depends
from pydantic import BaseModel, Field
from typing import Dict, Any, List, Optional
import logging
import os
from ..services.cluster_registration_service import (
    ClusterRegistrationService,
    RegistrationRequest,
    HeartbeatRequest
)

logger = logging.getLogger(__name__)

router = APIRouter()

# Initialize service
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://hive:hivepass@localhost:5432/hive")
cluster_registration_service = ClusterRegistrationService(DATABASE_URL)

# Pydantic models for API
class NodeRegistrationRequest(BaseModel):
    token: str = Field(..., description="Cluster registration token")
    node_id: str = Field(..., description="Unique node identifier")
    hostname: str = Field(..., description="Node hostname")
    system_info: Dict[str, Any] = Field(..., description="System hardware and OS information")
    client_version: Optional[str] = Field(None, description="Bzzz client version")
    services: Optional[Dict[str, Any]] = Field(None, description="Available services")
    capabilities: Optional[Dict[str, Any]] = Field(None, description="Node capabilities")
    ports: Optional[Dict[str, Any]] = Field(None, description="Service ports")
    metadata: Optional[Dict[str, Any]] = Field(None, description="Additional metadata")

class NodeHeartbeatRequest(BaseModel):
    node_id: str = Field(..., description="Node identifier")
    status: str = Field("online", description="Node status")
    cpu_usage: Optional[float] = Field(None, ge=0, le=100, description="CPU usage percentage")
    memory_usage: Optional[float] = Field(None, ge=0, le=100, description="Memory usage percentage")
    disk_usage: Optional[float] = Field(None, ge=0, le=100, description="Disk usage percentage")
    gpu_usage: Optional[float] = Field(None, ge=0, le=100, description="GPU usage percentage")
    services_status: Optional[Dict[str, Any]] = Field(None, description="Service status information")
    network_metrics: Optional[Dict[str, Any]] = Field(None, description="Network metrics")
    custom_metrics: Optional[Dict[str, Any]] = Field(None, description="Custom node metrics")

class TokenCreateRequest(BaseModel):
    description: str = Field(..., description="Token description")
    expires_in_days: Optional[int] = Field(None, gt=0, description="Token expiration in days")
    max_registrations: Optional[int] = Field(None, gt=0, description="Maximum number of registrations")
    allowed_ip_ranges: Optional[List[str]] = Field(None, description="Allowed IP CIDR ranges")

# Helper function to get client IP
def get_client_ip(request: Request) -> str:
    """Extract client IP address from request."""
    # Check for X-Forwarded-For header (proxy/load balancer)
    forwarded_for = request.headers.get("X-Forwarded-For")
    if forwarded_for:
        # Take the first IP in the chain (original client)
        return forwarded_for.split(",")[0].strip()

    # Check for X-Real-IP header (nginx)
    real_ip = request.headers.get("X-Real-IP")
    if real_ip:
        return real_ip.strip()

    # Fall back to direct connection IP
    return request.client.host if request.client else "unknown"

# Registration endpoints
@router.post("/cluster/register")
async def register_node(
    registration: NodeRegistrationRequest,
    request: Request
) -> Dict[str, Any]:
    """
    Register a new node in the cluster.

    This endpoint allows Bzzz clients to register themselves with the Hive coordinator
    using a valid cluster token. Similar to `docker swarm join`.
    """
    try:
        client_ip = get_client_ip(request)
        logger.info(f"Node registration attempt: {registration.node_id} from {client_ip}")

        # Convert to service request
        reg_request = RegistrationRequest(
            token=registration.token,
            node_id=registration.node_id,
            hostname=registration.hostname,
            ip_address=client_ip,
            system_info=registration.system_info,
            client_version=registration.client_version,
            services=registration.services,
            capabilities=registration.capabilities,
            ports=registration.ports,
            metadata=registration.metadata
        )

        result = await cluster_registration_service.register_node(reg_request, client_ip)
        logger.info(f"Node {registration.node_id} registered successfully")

        return result

    except ValueError as e:
        logger.warning(f"Registration failed for {registration.node_id}: {e}")
        raise HTTPException(status_code=400, detail=str(e))
    except Exception as e:
        logger.error(f"Registration error for {registration.node_id}: {e}")
        raise HTTPException(status_code=500, detail="Registration failed")

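The register flow is meant to feel like `docker swarm join`; the client side could look like the following sketch. Field names follow `NodeRegistrationRequest`; the token value, `system_info` contents, and the `/api` prefix are illustrative assumptions.

```python
# Hypothetical Bzzz-client join call against POST /api/cluster/register.
import httpx

def join_cluster(hive_url: str, token: str) -> dict:
    payload = {
        "token": token,                      # issued by an admin via the token endpoints below
        "node_id": "rosewood-bzzz",
        "hostname": "rosewood",
        "system_info": {"cpu": {"cores": 16}, "memory": {"total_gb": 64}},
        "client_version": "0.1.0",
    }
    resp = httpx.post(f"{hive_url}/api/cluster/register", json=payload, timeout=10.0)
    resp.raise_for_status()
    return resp.json()
```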
@router.post("/cluster/heartbeat")
async def node_heartbeat(heartbeat: NodeHeartbeatRequest) -> Dict[str, Any]:
    """
    Update node heartbeat and status.

    Registered nodes should call this endpoint periodically (every 30 seconds)
    to maintain their registration and report current status/metrics.
    """
    try:
        heartbeat_request = HeartbeatRequest(
            node_id=heartbeat.node_id,
            status=heartbeat.status,
            cpu_usage=heartbeat.cpu_usage,
            memory_usage=heartbeat.memory_usage,
            disk_usage=heartbeat.disk_usage,
            gpu_usage=heartbeat.gpu_usage,
            services_status=heartbeat.services_status,
            network_metrics=heartbeat.network_metrics,
            custom_metrics=heartbeat.custom_metrics
        )

        result = await cluster_registration_service.update_heartbeat(heartbeat_request)
        return result

    except ValueError as e:
        logger.warning(f"Heartbeat failed for {heartbeat.node_id}: {e}")
        raise HTTPException(status_code=404, detail=str(e))
    except Exception as e:
        logger.error(f"Heartbeat error for {heartbeat.node_id}: {e}")
        raise HTTPException(status_code=500, detail="Heartbeat update failed")

# Node management endpoints
@router.get("/cluster/nodes/registered")
async def get_registered_nodes(include_offline: bool = True) -> Dict[str, Any]:
    """
    Get all registered cluster nodes.

    Returns detailed information about all nodes that have registered
    with the cluster, including their hardware specs and current status.
    """
    try:
        nodes = await cluster_registration_service.get_registered_nodes(include_offline)

        # Convert to API response format
        nodes_data = []
        for node in nodes:
            # Convert dataclass to dict and handle datetime serialization
            node_dict = {
                "id": node.id,
                "node_id": node.node_id,
                "hostname": node.hostname,
                "ip_address": node.ip_address,
                "status": node.status,
                "hardware": {
                    "cpu": node.cpu_info or {},
                    "memory": node.memory_info or {},
                    "gpu": node.gpu_info or {},
                    "disk": node.disk_info or {},
                    "os": node.os_info or {},
                    "platform": node.platform_info or {}
                },
                "services": node.services or {},
                "capabilities": node.capabilities or {},
                "ports": node.ports or {},
                "client_version": node.client_version,
                "first_registered": node.first_registered.isoformat(),
                "last_heartbeat": node.last_heartbeat.isoformat(),
                "registration_metadata": node.registration_metadata or {}
            }
            nodes_data.append(node_dict)

        return {
            "nodes": nodes_data,
            "total_count": len(nodes_data),
            "online_count": len([n for n in nodes if n.status == "online"]),
            "offline_count": len([n for n in nodes if n.status == "offline"])
        }

    except Exception as e:
        logger.error(f"Failed to get registered nodes: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve registered nodes")

@router.get("/cluster/nodes/{node_id}")
async def get_node_details(node_id: str) -> Dict[str, Any]:
    """Get detailed information about a specific registered node."""
    try:
        node = await cluster_registration_service.get_node_details(node_id)
        if not node:
            raise HTTPException(status_code=404, detail="Node not found")

        return {
            "id": node.id,
            "node_id": node.node_id,
            "hostname": node.hostname,
            "ip_address": node.ip_address,
            "status": node.status,
            "hardware": {
                "cpu": node.cpu_info or {},
                "memory": node.memory_info or {},
                "gpu": node.gpu_info or {},
                "disk": node.disk_info or {},
                "os": node.os_info or {},
                "platform": node.platform_info or {}
            },
            "services": node.services or {},
            "capabilities": node.capabilities or {},
            "ports": node.ports or {},
            "client_version": node.client_version,
            "first_registered": node.first_registered.isoformat(),
            "last_heartbeat": node.last_heartbeat.isoformat(),
            "registration_metadata": node.registration_metadata or {}
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to get node details for {node_id}: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve node details")

@router.delete("/cluster/nodes/{node_id}")
async def remove_node(node_id: str) -> Dict[str, Any]:
    """
    Remove a node from the cluster.

    This will unregister the node and stop accepting its heartbeats.
    The node will need to re-register to rejoin the cluster.
    """
    try:
        success = await cluster_registration_service.remove_node(node_id)
        if not success:
            raise HTTPException(status_code=404, detail="Node not found")

        return {
            "node_id": node_id,
            "status": "removed",
            "message": "Node successfully removed from cluster"
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to remove node {node_id}: {e}")
        raise HTTPException(status_code=500, detail="Failed to remove node")

# Token management endpoints
@router.post("/cluster/tokens")
async def create_cluster_token(token_request: TokenCreateRequest) -> Dict[str, Any]:
    """
    Create a new cluster registration token.

    Tokens are used by Bzzz clients to authenticate and register with the cluster.
    Only administrators should have access to this endpoint.
    """
    try:
        # For now, use a default admin user ID
        # TODO: Extract from JWT token or session
        admin_user_id = "admin"  # This should come from authentication

        token = await cluster_registration_service.generate_cluster_token(
            description=token_request.description,
            created_by_user_id=admin_user_id,
            expires_in_days=token_request.expires_in_days,
            max_registrations=token_request.max_registrations,
            allowed_ip_ranges=token_request.allowed_ip_ranges
        )

        return {
            "id": token.id,
            "token": token.token,
            "description": token.description,
            "created_at": token.created_at.isoformat(),
            "expires_at": token.expires_at.isoformat() if token.expires_at else None,
            "is_active": token.is_active,
            "max_registrations": token.max_registrations,
            "current_registrations": token.current_registrations,
            "allowed_ip_ranges": token.allowed_ip_ranges
        }

    except Exception as e:
        logger.error(f"Failed to create cluster token: {e}")
        raise HTTPException(status_code=500, detail="Failed to create token")

@router.get("/cluster/tokens")
async def list_cluster_tokens() -> Dict[str, Any]:
    """
    List all cluster registration tokens.

    Returns information about all tokens including their usage statistics.
    Only administrators should have access to this endpoint.
    """
    try:
        tokens = await cluster_registration_service.list_tokens()

        tokens_data = []
        for token in tokens:
            tokens_data.append({
                "id": token.id,
                "token": token.token[:20] + "..." if len(token.token) > 20 else token.token,  # Partial token for security
                "description": token.description,
                "created_at": token.created_at.isoformat(),
                "expires_at": token.expires_at.isoformat() if token.expires_at else None,
                "is_active": token.is_active,
                "max_registrations": token.max_registrations,
                "current_registrations": token.current_registrations,
                "allowed_ip_ranges": token.allowed_ip_ranges
            })

        return {
            "tokens": tokens_data,
            "total_count": len(tokens_data)
        }

    except Exception as e:
        logger.error(f"Failed to list cluster tokens: {e}")
        raise HTTPException(status_code=500, detail="Failed to list tokens")

@router.delete("/cluster/tokens/{token}")
async def revoke_cluster_token(token: str) -> Dict[str, Any]:
    """
    Revoke a cluster registration token.

    This will prevent new registrations using this token, but won't affect
    nodes that are already registered.
    """
    try:
        success = await cluster_registration_service.revoke_token(token)
        if not success:
            raise HTTPException(status_code=404, detail="Token not found")

        return {
            "token": token[:20] + "..." if len(token) > 20 else token,
            "status": "revoked",
            "message": "Token successfully revoked"
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Failed to revoke token {token}: {e}")
        raise HTTPException(status_code=500, detail="Failed to revoke token")

# Cluster statistics and monitoring
@router.get("/cluster/statistics")
async def get_cluster_statistics() -> Dict[str, Any]:
    """
    Get cluster health and usage statistics.

    Returns information about node counts, token usage, and overall cluster health.
    """
    try:
        stats = await cluster_registration_service.get_cluster_statistics()
        return stats

    except Exception as e:
        logger.error(f"Failed to get cluster statistics: {e}")
        raise HTTPException(status_code=500, detail="Failed to retrieve cluster statistics")

# Maintenance endpoints
@router.post("/cluster/maintenance/cleanup-offline")
async def cleanup_offline_nodes(offline_threshold_minutes: int = 10) -> Dict[str, Any]:
    """
    Mark nodes as offline if they haven't sent heartbeats recently.

    This maintenance endpoint should be called periodically to keep
    the cluster status accurate.
    """
    try:
        count = await cluster_registration_service.cleanup_offline_nodes(offline_threshold_minutes)
        return {
            "nodes_marked_offline": count,
            "threshold_minutes": offline_threshold_minutes,
            "message": f"Marked {count} nodes as offline"
        }

    except Exception as e:
        logger.error(f"Failed to cleanup offline nodes: {e}")
        raise HTTPException(status_code=500, detail="Failed to cleanup offline nodes")

@router.post("/cluster/maintenance/cleanup-heartbeats")
async def cleanup_old_heartbeats(retention_days: int = 30) -> Dict[str, Any]:
    """
    Remove old heartbeat data to manage database size.

    This maintenance endpoint should be called periodically to prevent
    the heartbeat table from growing too large.
    """
    try:
        count = await cluster_registration_service.cleanup_old_heartbeats(retention_days)
        return {
            "heartbeats_deleted": count,
            "retention_days": retention_days,
            "message": f"Deleted {count} old heartbeat records"
        }

    except Exception as e:
        logger.error(f"Failed to cleanup old heartbeats: {e}")
        raise HTTPException(status_code=500, detail="Failed to cleanup old heartbeats")

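Both maintenance endpoints above expect an external scheduler to call them periodically; a sketch of such a caller (the base URL, `/api` prefix, and intervals are assumptions):

```python
# Hypothetical maintenance loop hitting the cleanup endpoints on a schedule.
import asyncio
import httpx

async def run_cluster_maintenance(hive_url: str = "http://localhost:8000") -> None:
    async with httpx.AsyncClient(timeout=30.0) as client:
        while True:
            await client.post(f"{hive_url}/api/cluster/maintenance/cleanup-offline",
                              params={"offline_threshold_minutes": 10})
            await client.post(f"{hive_url}/api/cluster/maintenance/cleanup-heartbeats",
                              params={"retention_days": 30})
            await asyncio.sleep(600)  # every 10 minutes
```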
# Health check endpoint
@router.get("/cluster/health")
async def cluster_registration_health() -> Dict[str, Any]:
    """
    Health check for the cluster registration system.
    """
    try:
        # Test database connection
        stats = await cluster_registration_service.get_cluster_statistics()

        return {
            "status": "healthy",
            "database_connected": True,
            "cluster_health": stats.get("cluster_health", {}),
            "timestamp": stats.get("last_updated")
        }

    except Exception as e:
        logger.error(f"Cluster registration health check failed: {e}")
        return {
            "status": "unhealthy",
            "database_connected": False,
            "error": str(e),
            "timestamp": None
        }
backend/app/api/feedback.py (new file, 474 lines)
@@ -0,0 +1,474 @@
"""
Context Feedback API endpoints for RL Context Curator integration
"""

from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session
from typing import List, Optional, Dict, Any
from datetime import datetime, timedelta
from pydantic import BaseModel, Field

from ..core.database import get_db
from ..models.context_feedback import ContextFeedback, AgentPermissions, PromotionRuleHistory
from ..models.task import Task
from ..models.agent import Agent
from ..services.auth import get_current_user
from ..models.responses import StatusResponse

router = APIRouter(prefix="/api/feedback", tags=["Context Feedback"])


# Pydantic models for API
class ContextFeedbackRequest(BaseModel):
    """Request model for context feedback"""
    context_id: str = Field(..., description="HCFS context ID")
    feedback_type: str = Field(..., description="Type of feedback: upvote, downvote, forgetfulness, task_success, task_failure")
    confidence: float = Field(..., ge=0.0, le=1.0, description="Confidence in feedback")
    reason: Optional[str] = Field(None, description="Optional reason for feedback")
    usage_context: Optional[str] = Field(None, description="Context of usage")
    directory_scope: Optional[str] = Field(None, description="Directory where context was used")
    task_type: Optional[str] = Field(None, description="Type of task being performed")


class TaskOutcomeFeedbackRequest(BaseModel):
    """Request model for task outcome feedback"""
    task_id: str = Field(..., description="Task ID")
    outcome: str = Field(..., description="Task outcome: completed, failed, abandoned")
    completion_time: Optional[int] = Field(None, description="Time to complete in seconds")
    errors_encountered: int = Field(0, description="Number of errors during execution")
    follow_up_questions: int = Field(0, description="Number of follow-up questions")
    context_used: Optional[List[str]] = Field(None, description="Context IDs used in task")
    context_relevance_score: Optional[float] = Field(None, ge=0.0, le=1.0, description="Average relevance of used context")
    outcome_confidence: Optional[float] = Field(None, ge=0.0, le=1.0, description="Confidence in outcome classification")


class AgentPermissionsRequest(BaseModel):
    """Request model for agent permissions"""
    agent_id: str = Field(..., description="Agent ID")
    role: str = Field(..., description="Agent role")
    directory_patterns: List[str] = Field(..., description="Directory patterns for this role")
    task_types: List[str] = Field(..., description="Task types this agent can handle")
    context_weight: float = Field(1.0, ge=0.1, le=2.0, description="Weight for context relevance")


class ContextFeedbackResponse(BaseModel):
    """Response model for context feedback"""
    id: int
    context_id: str
    agent_id: str
    task_id: Optional[str]
    feedback_type: str
    role: str
    confidence: float
    reason: Optional[str]
    usage_context: Optional[str]
    directory_scope: Optional[str]
    task_type: Optional[str]
    timestamp: datetime


class FeedbackStatsResponse(BaseModel):
    """Response model for feedback statistics"""
    total_feedback: int
    feedback_by_type: Dict[str, int]
    feedback_by_role: Dict[str, int]
    average_confidence: float
    recent_feedback_count: int
    top_contexts: List[Dict[str, Any]]


@router.post("/context/{context_id}", response_model=StatusResponse)
async def submit_context_feedback(
    context_id: str,
    request: ContextFeedbackRequest,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
    current_user: dict = Depends(get_current_user)
):
    """
    Submit feedback for a specific context
    """
    try:
        # Get agent information
        agent = db.query(Agent).filter(Agent.id == current_user.get("agent_id", "unknown")).first()
        if not agent:
            raise HTTPException(status_code=404, detail="Agent not found")

        # Validate feedback type
        valid_types = ["upvote", "downvote", "forgetfulness", "task_success", "task_failure"]
        if request.feedback_type not in valid_types:
            raise HTTPException(status_code=400, detail=f"Invalid feedback type. Must be one of: {valid_types}")

        # Create feedback record
        feedback = ContextFeedback(
            context_id=request.context_id,
            agent_id=agent.id,
            feedback_type=request.feedback_type,
            role=agent.role if agent.role else "general",
            confidence=request.confidence,
            reason=request.reason,
            usage_context=request.usage_context,
            directory_scope=request.directory_scope,
            task_type=request.task_type
        )

        db.add(feedback)
        db.commit()
        db.refresh(feedback)

        # Send feedback to RL Context Curator in background
        background_tasks.add_task(
            send_feedback_to_rl_curator,
            feedback.id,
            request.context_id,
            request.feedback_type,
            agent.id,
            agent.role if agent.role else "general",
            request.confidence
        )

        return StatusResponse(
            status="success",
            message="Context feedback submitted successfully",
            data={"feedback_id": feedback.id, "context_id": request.context_id}
        )

    except Exception as e:
        db.rollback()
        raise HTTPException(status_code=500, detail=f"Failed to submit feedback: {str(e)}")


@router.post("/task-outcome/{task_id}", response_model=StatusResponse)
async def submit_task_outcome_feedback(
    task_id: str,
    request: TaskOutcomeFeedbackRequest,
    background_tasks: BackgroundTasks,
    db: Session = Depends(get_db),
    current_user: dict = Depends(get_current_user)
):
    """
    Submit task outcome feedback for RL learning
    """
    try:
        # Get task
        task = db.query(Task).filter(Task.id == task_id).first()
        if not task:
            raise HTTPException(status_code=404, detail="Task not found")

        # Update task with outcome metrics
        task.task_outcome = request.outcome
        task.completion_time = request.completion_time
        task.errors_encountered = request.errors_encountered
        task.follow_up_questions = request.follow_up_questions
        task.context_relevance_score = request.context_relevance_score
        task.outcome_confidence = request.outcome_confidence
        task.feedback_collected = True

        if request.context_used:
            task.context_used = request.context_used

        if request.outcome in ["completed", "failed", "abandoned"] and not task.completed_at:
            task.completed_at = datetime.utcnow()

        # Calculate success rate
        if request.outcome == "completed":
            task.success_rate = 1.0 - (request.errors_encountered * 0.1)  # Simple calculation
            task.success_rate = max(0.0, min(1.0, task.success_rate))
        else:
            task.success_rate = 0.0

        db.commit()

        # Create feedback events for used contexts
        if request.context_used and task.assigned_agent_id:
            agent = db.query(Agent).filter(Agent.id == task.assigned_agent_id).first()
            if agent:
                feedback_type = "task_success" if request.outcome == "completed" else "task_failure"

                for context_id in request.context_used:
                    feedback = ContextFeedback(
                        context_id=context_id,
                        agent_id=agent.id,
                        task_id=task.id,
                        feedback_type=feedback_type,
                        role=agent.role if agent.role else "general",
                        confidence=request.outcome_confidence or 0.8,
                        reason=f"Task {request.outcome}",
                        usage_context=f"task_execution_{request.outcome}",
                        task_type=request.task_type
                    )
                    db.add(feedback)

                db.commit()

        return StatusResponse(
            status="success",
            message="Task outcome feedback submitted successfully",
            data={"task_id": task_id, "outcome": request.outcome}
        )

    except Exception as e:
        db.rollback()
        raise HTTPException(status_code=500, detail=f"Failed to submit task outcome: {str(e)}")


@router.get("/stats", response_model=FeedbackStatsResponse)
async def get_feedback_stats(
    days: int = 7,
    role: Optional[str] = None,
    db: Session = Depends(get_db),
    current_user: dict = Depends(get_current_user)
):
    """
    Get feedback statistics for analysis
    """
    try:
        # Base query
        query = db.query(ContextFeedback)

        # Filter by date range
        if days > 0:
            since_date = datetime.utcnow() - timedelta(days=days)
            query = query.filter(ContextFeedback.timestamp >= since_date)

        # Filter by role if specified
        if role:
            query = query.filter(ContextFeedback.role == role)

        feedback_records = query.all()

        # Calculate statistics
        total_feedback = len(feedback_records)

        feedback_by_type = {}
        feedback_by_role = {}
        confidence_values = []
        context_usage = {}

        for feedback in feedback_records:
            # Count by type
            feedback_by_type[feedback.feedback_type] = feedback_by_type.get(feedback.feedback_type, 0) + 1

            # Count by role
            feedback_by_role[feedback.role] = feedback_by_role.get(feedback.role, 0) + 1

            # Collect confidence values
            confidence_values.append(feedback.confidence)

            # Count context usage
            context_usage[feedback.context_id] = context_usage.get(feedback.context_id, 0) + 1

        # Calculate average confidence
        average_confidence = sum(confidence_values) / len(confidence_values) if confidence_values else 0.0

        # Get recent feedback count (last 24 hours)
        recent_since = datetime.utcnow() - timedelta(days=1)
        recent_count = db.query(ContextFeedback).filter(
            ContextFeedback.timestamp >= recent_since
        ).count()

        # Get top contexts by usage
        top_contexts = [
            {"context_id": ctx_id, "usage_count": count}
            for ctx_id, count in sorted(context_usage.items(), key=lambda x: x[1], reverse=True)[:10]
        ]

        return FeedbackStatsResponse(
            total_feedback=total_feedback,
            feedback_by_type=feedback_by_type,
            feedback_by_role=feedback_by_role,
            average_confidence=average_confidence,
            recent_feedback_count=recent_count,
            top_contexts=top_contexts
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get feedback stats: {str(e)}")


@router.get("/recent", response_model=List[ContextFeedbackResponse])
async def get_recent_feedback(
    limit: int = 50,
    feedback_type: Optional[str] = None,
    role: Optional[str] = None,
    db: Session = Depends(get_db),
    current_user: dict = Depends(get_current_user)
):
    """
    Get recent feedback events
    """
    try:
        query = db.query(ContextFeedback).order_by(ContextFeedback.timestamp.desc())

        if feedback_type:
            query = query.filter(ContextFeedback.feedback_type == feedback_type)

        if role:
            query = query.filter(ContextFeedback.role == role)

        feedback_records = query.limit(limit).all()

        return [
            ContextFeedbackResponse(
                id=fb.id,
                context_id=fb.context_id,
                agent_id=fb.agent_id,
                task_id=str(fb.task_id) if fb.task_id else None,
                feedback_type=fb.feedback_type,
                role=fb.role,
                confidence=fb.confidence,
                reason=fb.reason,
                usage_context=fb.usage_context,
                directory_scope=fb.directory_scope,
                task_type=fb.task_type,
                timestamp=fb.timestamp
            )
            for fb in feedback_records
        ]

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get recent feedback: {str(e)}")


@router.post("/agent-permissions", response_model=StatusResponse)
async def set_agent_permissions(
    request: AgentPermissionsRequest,
    db: Session = Depends(get_db),
    current_user: dict = Depends(get_current_user)
):
    """
    Set or update agent permissions for context filtering
    """
    try:
        # Check if permissions already exist
        existing = db.query(AgentPermissions).filter(
            AgentPermissions.agent_id == request.agent_id,
            AgentPermissions.role == request.role
        ).first()

        if existing:
            # Update existing permissions
            existing.directory_patterns = ",".join(request.directory_patterns)
            existing.task_types = ",".join(request.task_types)
            existing.context_weight = request.context_weight
            existing.updated_at = datetime.utcnow()
        else:
            # Create new permissions
            permissions = AgentPermissions(
                agent_id=request.agent_id,
                role=request.role,
                directory_patterns=",".join(request.directory_patterns),
                task_types=",".join(request.task_types),
                context_weight=request.context_weight
            )
            db.add(permissions)

        db.commit()

        return StatusResponse(
            status="success",
            message="Agent permissions updated successfully",
            data={"agent_id": request.agent_id, "role": request.role}
        )

    except Exception as e:
        db.rollback()
        raise HTTPException(status_code=500, detail=f"Failed to set agent permissions: {str(e)}")


@router.get("/agent-permissions/{agent_id}")
async def get_agent_permissions(
    agent_id: str,
    db: Session = Depends(get_db),
    current_user: dict = Depends(get_current_user)
):
    """
    Get agent permissions for context filtering
    """
    try:
        permissions = db.query(AgentPermissions).filter(
            AgentPermissions.agent_id == agent_id,
            AgentPermissions.active == "true"
        ).all()

        return [
            {
                "id": perm.id,
                "agent_id": perm.agent_id,
                "role": perm.role,
                "directory_patterns": perm.directory_patterns.split(",") if perm.directory_patterns else [],
                "task_types": perm.task_types.split(",") if perm.task_types else [],
                "context_weight": perm.context_weight,
                "created_at": perm.created_at,
                "updated_at": perm.updated_at
            }
            for perm in permissions
        ]

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to get agent permissions: {str(e)}")


async def send_feedback_to_rl_curator(
    feedback_id: int,
    context_id: str,
    feedback_type: str,
    agent_id: str,
    role: str,
    confidence: float
):
    """
    Background task to send feedback to RL Context Curator
    """
    try:
        import httpx
        import json
        from datetime import datetime

        # Prepare feedback event in Bzzz format
        feedback_event = {
            "bzzz_type": "feedback_event",
            "timestamp": datetime.utcnow().isoformat(),
            "origin": {
                "node_id": "hive",
                "agent_id": agent_id,
                "task_id": f"hive-feedback-{feedback_id}",
                "workspace": "hive://context-feedback",
                "directory": "/feedback/"
            },
            "feedback": {
                "type": feedback_type,
                "category": "general",  # Could be enhanced with category detection
                "role": role,
                "context_id": context_id,
                "reason": f"Feedback from Hive agent {agent_id}",
                "confidence": confidence,
                "usage_context": "hive_platform"
            },
            "task_outcome": {
                "completed": feedback_type in ["upvote", "task_success"],
                "completion_time": 0,
                "errors_encountered": 0,
                "follow_up_questions": 0
            }
        }

        # Send to HCFS RL Tuner Service
        async with httpx.AsyncClient() as client:
            try:
                response = await client.post(
                    "http://localhost:8001/api/feedback",
                    json=feedback_event,
                    timeout=10.0
                )
                if response.status_code == 200:
                    print(f"✅ Feedback sent to RL Curator: {feedback_id}")
                else:
                    print(f"⚠️ RL Curator responded with status {response.status_code}")
            except httpx.ConnectError:
                print(f"⚠️ Could not connect to RL Curator service (feedback {feedback_id})")
            except Exception as e:
                print(f"❌ Error sending feedback to RL Curator: {e}")

    except Exception as e:
        print(f"❌ Background feedback task failed: {e}")
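A sketch of submitting context feedback from an agent, using the request shape defined by `ContextFeedbackRequest`; authentication handling is elided, and the bearer token and base URL are assumptions.

```python
# Hypothetical call against POST /api/feedback/context/{context_id}.
import httpx

def upvote_context(context_id: str, token: str, base_url: str = "http://localhost:8000") -> None:
    payload = {
        "context_id": context_id,
        "feedback_type": "upvote",   # one of the valid_types checked by the endpoint
        "confidence": 0.9,
        "reason": "Context resolved the task on the first attempt",
        "task_type": "code_review",
    }
    resp = httpx.post(f"{base_url}/api/feedback/context/{context_id}",
                      json=payload,
                      headers={"Authorization": f"Bearer {token}"},
                      timeout=10.0)
    resp.raise_for_status()
```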
@@ -47,6 +47,37 @@ async def get_project_tasks(project_id: str, current_user: Dict[str, Any] = Depe
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.put("/projects/{project_id}")
|
||||
async def update_project(project_id: str, project_data: Dict[str, Any], current_user: Dict[str, Any] = Depends(get_current_user_context)) -> Dict[str, Any]:
|
||||
"""Update a project configuration."""
|
||||
try:
|
||||
updated_project = project_service.update_project(project_id, project_data)
|
||||
if not updated_project:
|
||||
raise HTTPException(status_code=404, detail="Project not found")
|
||||
return updated_project
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.post("/projects")
|
||||
async def create_project(project_data: Dict[str, Any], current_user: Dict[str, Any] = Depends(get_current_user_context)) -> Dict[str, Any]:
|
||||
"""Create a new project."""
|
||||
try:
|
||||
new_project = project_service.create_project(project_data)
|
||||
return new_project
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.delete("/projects/{project_id}")
|
||||
async def delete_project(project_id: str, current_user: Dict[str, Any] = Depends(get_current_user_context)) -> Dict[str, Any]:
|
||||
"""Delete a project."""
|
||||
try:
|
||||
result = project_service.delete_project(project_id)
|
||||
if not result:
|
||||
raise HTTPException(status_code=404, detail="Project not found")
|
||||
return {"success": True, "message": "Project deleted successfully"}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
# === Bzzz Integration Endpoints ===
|
||||
|
||||
@bzzz_router.get("/active-repos")

@@ -11,7 +11,7 @@ from typing import Dict, Any, Optional
from dataclasses import asdict

# Add CCLI source to path
ccli_path = os.path.join(os.path.dirname(__file__), '../../../ccli_src')
ccli_path = os.path.join(os.path.dirname(__file__), '../../ccli_src')
sys.path.insert(0, ccli_path)

from agents.gemini_cli_agent import GeminiCliAgent, GeminiCliConfig, TaskRequest as CliTaskRequest, TaskResult as CliTaskResult

@@ -273,7 +273,6 @@ def create_token_response(user_id: int, user_data: Dict[str, Any]) -> Dict[str,
        "refresh_token": refresh_token,
        "token_type": "bearer",
        "expires_in": ACCESS_TOKEN_EXPIRE_MINUTES * 60,  # seconds
        "user": user_data,
    }


@@ -174,6 +174,10 @@ app = FastAPI(
        "name": "cluster",
        "description": "Cluster-wide operations and coordination"
    },
    {
        "name": "cluster-registration",
        "description": "Dynamic cluster node registration and management"
    },
    {
        "name": "distributed-workflows",
        "description": "Advanced distributed workflow management"
@@ -206,7 +210,7 @@ def get_coordinator() -> UnifiedCoordinator:
    return unified_coordinator

# Import API routers
from .api import agents, workflows, executions, monitoring, projects, tasks, cluster, distributed_workflows, cli_agents, auth
from .api import agents, workflows, executions, monitoring, projects, tasks, cluster, distributed_workflows, cli_agents, auth, bzzz_logs, cluster_registration

# Import error handlers and response models
from .core.error_handlers import (
@@ -239,8 +243,10 @@ app.include_router(projects.router, prefix="/api", tags=["projects"])
app.include_router(projects.bzzz_router, prefix="/api", tags=["bzzz-integration"])
app.include_router(tasks.router, prefix="/api", tags=["tasks"])
app.include_router(cluster.router, prefix="/api", tags=["cluster"])
app.include_router(cluster_registration.router, prefix="/api", tags=["cluster-registration"])
app.include_router(distributed_workflows.router, tags=["distributed-workflows"])
app.include_router(cli_agents.router, tags=["cli-agents"])
app.include_router(bzzz_logs.router, prefix="/api", tags=["bzzz-logs"])

# Override dependency functions in API modules with our coordinator instance
agents.get_coordinator = get_coordinator
@@ -528,16 +534,6 @@ async def root():

# Removed duplicate /health endpoint - using the enhanced one above

@app.get("/api/health", response_model=None)
async def health_check():
    """Simple health check endpoint"""
    return {
        "status": "healthy",
        "timestamp": datetime.now().isoformat(),
        "version": "1.0.0",
        "message": "Hive API is operational"
    }

@app.get("/api/status")
async def get_system_status():
    """Get comprehensive system status"""

@@ -2,4 +2,5 @@ from . import agent
from . import agent_role
from . import project
from . import task
from . import context_feedback
from . import sqlalchemy_models
@@ -34,6 +34,8 @@ class Agent(Base):

    # Relationships
    tasks = relationship("Task", back_populates="assigned_agent")
    context_feedback = relationship("ContextFeedback", back_populates="agent")
    permissions = relationship("AgentPermissions", back_populates="agent")

    def to_dict(self):
        return {

85
backend/app/models/context_feedback.py
Normal file
@@ -0,0 +1,85 @@
"""
Context Feedback model for RL Context Curator integration
"""

from sqlalchemy import Column, String, Text, Integer, DateTime, ForeignKey, UUID as SqlUUID, Float
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from ..core.database import Base
import uuid


class ContextFeedback(Base):
    __tablename__ = "context_feedback"

    # Primary identification
    id = Column(Integer, primary_key=True, index=True)

    # Context and agent information
    context_id = Column(String(255), nullable=False, index=True)  # HCFS context ID
    agent_id = Column(String(255), ForeignKey("agents.id"), nullable=False)
    task_id = Column(SqlUUID(as_uuid=True), ForeignKey("tasks.id"), nullable=True)

    # Feedback details
    feedback_type = Column(String(50), nullable=False)  # upvote, downvote, forgetfulness, task_success, task_failure
    role = Column(String(100), nullable=False)  # Agent role when feedback was given
    confidence = Column(Float, nullable=False)  # Confidence in feedback (0.0 to 1.0)
    reason = Column(Text, nullable=True)  # Optional reason for feedback
    usage_context = Column(String(255), nullable=True)  # Context of usage (debugging, coding, etc.)

    # Additional metadata
    directory_scope = Column(String(500), nullable=True)  # Directory where context was used
    task_type = Column(String(100), nullable=True)  # Type of task being performed

    # Timestamps
    timestamp = Column(DateTime(timezone=True), server_default=func.now())

    # Relationships
    agent = relationship("Agent", back_populates="context_feedback")
    task = relationship("Task", backref="context_feedback")


class AgentPermissions(Base):
    __tablename__ = "agent_permissions"

    # Primary identification
    id = Column(Integer, primary_key=True, index=True)

    # Agent and role information
    agent_id = Column(String(255), ForeignKey("agents.id"), nullable=False, index=True)
    role = Column(String(100), nullable=False)

    # Permission details
    directory_patterns = Column(Text, nullable=True)  # Comma-separated directory patterns
    task_types = Column(Text, nullable=True)  # Comma-separated allowed task types
    context_weight = Column(Float, default=1.0)  # Weight for context relevance

    # Status
    active = Column(String(10), default='true')  # String to match existing boolean patterns

    # Timestamps
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(DateTime(timezone=True), server_default=func.now(), onupdate=func.now())

    # Relationships
    agent = relationship("Agent", back_populates="permissions")


class PromotionRuleHistory(Base):
    __tablename__ = "promotion_rule_history"

    # Primary identification
    id = Column(Integer, primary_key=True, index=True)

    # Rule information
    rule_version = Column(String(50), nullable=False)
    category = Column(String(100), nullable=False)
    role = Column(String(100), nullable=False)
    weight_value = Column(Float, nullable=False)

    # Change information
    change_reason = Column(Text, nullable=True)
    previous_value = Column(Float, nullable=True)

    # Timestamps
    timestamp = Column(DateTime(timezone=True), server_default=func.now())
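
A minimal sketch of how a feedback row might be recorded with these models, assuming the usual SessionLocal factory in app.core.database; the agent id and context id are placeholders:

from app.core.database import SessionLocal
from app.models.context_feedback import ContextFeedback

db = SessionLocal()
try:
    fb = ContextFeedback(
        context_id="hcfs-ctx-1234",          # hypothetical HCFS context id
        agent_id="walnut-agent",             # hypothetical registered agent
        feedback_type="upvote",              # one of the five types listed above
        role="backend_developer",
        confidence=0.8,
        reason="Context matched the failing module",
        usage_context="debugging",
    )
    db.add(fb)
    db.commit()
finally:
    db.close()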

@@ -2,7 +2,7 @@
Task model for SQLAlchemy ORM
"""

from sqlalchemy import Column, String, Text, Integer, DateTime, ForeignKey, UUID as SqlUUID
from sqlalchemy import Column, String, Text, Integer, DateTime, ForeignKey, UUID as SqlUUID, Float, Boolean
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
@@ -30,6 +30,17 @@ class Task(Base):
    # Task metadata (includes context and payload)
    task_metadata = Column("metadata", JSONB, nullable=True)

    # RL Context Curator outcome tracking fields
    completion_time = Column(Integer, nullable=True)  # Time to complete in seconds
    errors_encountered = Column(Integer, default=0)  # Number of errors during execution
    follow_up_questions = Column(Integer, default=0)  # Number of follow-up questions
    success_rate = Column(Float, nullable=True)  # Success rate (0.0 to 1.0)
    context_used = Column(JSONB, nullable=True)  # Context IDs used in this task
    context_relevance_score = Column(Float, nullable=True)  # Average relevance of used context
    feedback_collected = Column(Boolean, default=False)  # Whether feedback was collected
    task_outcome = Column(String(50), nullable=True)  # completed, failed, abandoned
    outcome_confidence = Column(Float, nullable=True)  # Confidence in outcome classification

    # Timestamps
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    started_at = Column(DateTime(timezone=True), nullable=True)
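
A hedged sketch of how a coordinator might stamp these new outcome-tracking columns after a run, assuming the Task model lives at app/models/task.py and the SessionLocal factory in app.core.database; the helper itself is illustrative, not part of this commit:

from app.core.database import SessionLocal
from app.models.task import Task

def record_task_outcome(task_id, *, seconds, errors, outcome):
    # Hypothetical post-run bookkeeping for the RL Context Curator fields.
    db = SessionLocal()
    try:
        task = db.query(Task).filter(Task.id == task_id).first()
        if task is None:
            return
        task.completion_time = seconds
        task.errors_encountered = errors
        task.task_outcome = outcome                   # completed / failed / abandoned
        task.success_rate = 1.0 if outcome == "completed" else 0.0
        task.feedback_collected = False               # feedback loop runs separately
        db.commit()
    finally:
        db.close()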

522
backend/app/services/cluster_registration_service.py
Normal file
@@ -0,0 +1,522 @@
|
||||
"""
|
||||
Cluster Registration Service
|
||||
Handles registration-based cluster management for Hive-Bzzz integration.
|
||||
"""
|
||||
import asyncpg
|
||||
import secrets
|
||||
import json
|
||||
import socket
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, List, Optional, Any
|
||||
from dataclasses import dataclass
|
||||
from ipaddress import IPv4Network, IPv6Network, ip_address
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@dataclass
|
||||
class ClusterToken:
|
||||
id: int
|
||||
token: str
|
||||
description: str
|
||||
created_at: datetime
|
||||
expires_at: Optional[datetime]
|
||||
is_active: bool
|
||||
max_registrations: Optional[int]
|
||||
current_registrations: int
|
||||
allowed_ip_ranges: Optional[List[str]]
|
||||
|
||||
@dataclass
|
||||
class ClusterNode:
|
||||
id: int
|
||||
node_id: str
|
||||
hostname: str
|
||||
ip_address: str
|
||||
registration_token: str
|
||||
cpu_info: Optional[Dict[str, Any]]
|
||||
memory_info: Optional[Dict[str, Any]]
|
||||
gpu_info: Optional[Dict[str, Any]]
|
||||
disk_info: Optional[Dict[str, Any]]
|
||||
os_info: Optional[Dict[str, Any]]
|
||||
platform_info: Optional[Dict[str, Any]]
|
||||
status: str
|
||||
last_heartbeat: datetime
|
||||
first_registered: datetime
|
||||
services: Optional[Dict[str, Any]]
|
||||
capabilities: Optional[Dict[str, Any]]
|
||||
ports: Optional[Dict[str, Any]]
|
||||
client_version: Optional[str]
|
||||
registration_metadata: Optional[Dict[str, Any]]
|
||||
|
||||
@dataclass
|
||||
class RegistrationRequest:
|
||||
token: str
|
||||
node_id: str
|
||||
hostname: str
|
||||
ip_address: str
|
||||
system_info: Dict[str, Any]
|
||||
client_version: Optional[str] = None
|
||||
services: Optional[Dict[str, Any]] = None
|
||||
capabilities: Optional[Dict[str, Any]] = None
|
||||
ports: Optional[Dict[str, Any]] = None
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
|
||||
@dataclass
|
||||
class HeartbeatRequest:
|
||||
node_id: str
|
||||
status: str = "online"
|
||||
cpu_usage: Optional[float] = None
|
||||
memory_usage: Optional[float] = None
|
||||
disk_usage: Optional[float] = None
|
||||
gpu_usage: Optional[float] = None
|
||||
services_status: Optional[Dict[str, Any]] = None
|
||||
network_metrics: Optional[Dict[str, Any]] = None
|
||||
custom_metrics: Optional[Dict[str, Any]] = None
|
||||
|
||||
class ClusterRegistrationService:
|
||||
def __init__(self, database_url: str):
|
||||
self.database_url = database_url
|
||||
self._conn_cache = None
|
||||
|
||||
async def get_connection(self) -> asyncpg.Connection:
|
||||
"""Get database connection with caching."""
|
||||
if not self._conn_cache or self._conn_cache.is_closed():
|
||||
try:
|
||||
self._conn_cache = await asyncpg.connect(self.database_url)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to connect to database: {e}")
|
||||
raise
|
||||
return self._conn_cache
|
||||
|
||||
async def close_connection(self):
|
||||
"""Close database connection."""
|
||||
if self._conn_cache and not self._conn_cache.is_closed():
|
||||
await self._conn_cache.close()
|
||||
|
||||
# Token Management
|
||||
async def generate_cluster_token(
|
||||
self,
|
||||
description: str,
|
||||
created_by_user_id: str,
|
||||
expires_in_days: Optional[int] = None,
|
||||
max_registrations: Optional[int] = None,
|
||||
allowed_ip_ranges: Optional[List[str]] = None
|
||||
) -> ClusterToken:
|
||||
"""Generate a new cluster registration token."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
# Generate secure token
|
||||
token = f"hive_cluster_{secrets.token_urlsafe(32)}"
|
||||
expires_at = datetime.now() + timedelta(days=expires_in_days) if expires_in_days else None
|
||||
|
||||
try:
|
||||
result = await conn.fetchrow("""
|
||||
INSERT INTO cluster_tokens (
|
||||
token, description, created_by, expires_at,
|
||||
max_registrations, allowed_ip_ranges
|
||||
) VALUES ($1, $2, $3, $4, $5, $6)
|
||||
RETURNING id, token, description, created_at, expires_at,
|
||||
is_active, max_registrations, current_registrations, allowed_ip_ranges
|
||||
""", token, description, created_by_user_id, expires_at, max_registrations, allowed_ip_ranges)
|
||||
|
||||
return ClusterToken(**dict(result))
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to generate cluster token: {e}")
|
||||
raise
|
||||
|
||||
async def validate_token(self, token: str, client_ip: str) -> Optional[ClusterToken]:
|
||||
"""Validate a cluster registration token."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
try:
|
||||
result = await conn.fetchrow("""
|
||||
SELECT id, token, description, created_at, expires_at,
|
||||
is_active, max_registrations, current_registrations, allowed_ip_ranges
|
||||
FROM cluster_tokens
|
||||
WHERE token = $1 AND is_active = true
|
||||
""", token)
|
||||
|
||||
if not result:
|
||||
return None
|
||||
|
||||
cluster_token = ClusterToken(**dict(result))
|
||||
|
||||
# Check expiration
|
||||
if cluster_token.expires_at and datetime.now() > cluster_token.expires_at:
|
||||
logger.warning(f"Token {token[:20]}... has expired")
|
||||
return None
|
||||
|
||||
# Check registration limit
|
||||
if (cluster_token.max_registrations and
|
||||
cluster_token.current_registrations >= cluster_token.max_registrations):
|
||||
logger.warning(f"Token {token[:20]}... has reached registration limit")
|
||||
return None
|
||||
|
||||
# Check IP restrictions
|
||||
if cluster_token.allowed_ip_ranges:
|
||||
client_ip_obj = ip_address(client_ip)
|
||||
allowed = False
|
||||
for ip_range in cluster_token.allowed_ip_ranges:
|
||||
try:
|
||||
network = IPv4Network(ip_range, strict=False) if ':' not in ip_range else IPv6Network(ip_range, strict=False)
|
||||
if client_ip_obj in network:
|
||||
allowed = True
|
||||
break
|
||||
except Exception as e:
|
||||
logger.warning(f"Invalid IP range {ip_range}: {e}")
|
||||
|
||||
if not allowed:
|
||||
logger.warning(f"IP {client_ip} not allowed for token {token[:20]}...")
|
||||
return None
|
||||
|
||||
return cluster_token
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to validate token: {e}")
|
||||
return None
|
||||
|
||||
async def list_tokens(self) -> List[ClusterToken]:
|
||||
"""List all cluster tokens."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
try:
|
||||
results = await conn.fetch("""
|
||||
SELECT id, token, description, created_at, expires_at,
|
||||
is_active, max_registrations, current_registrations, allowed_ip_ranges
|
||||
FROM cluster_tokens
|
||||
ORDER BY created_at DESC
|
||||
""")
|
||||
|
||||
return [ClusterToken(**dict(result)) for result in results]
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to list tokens: {e}")
|
||||
raise
|
||||
|
||||
async def revoke_token(self, token: str) -> bool:
|
||||
"""Revoke a cluster token."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
try:
|
||||
result = await conn.execute("""
|
||||
UPDATE cluster_tokens
|
||||
SET is_active = false
|
||||
WHERE token = $1
|
||||
""", token)
|
||||
|
||||
return result != "UPDATE 0"
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to revoke token: {e}")
|
||||
return False
|
||||
|
||||
# Node Registration
|
||||
async def register_node(self, request: RegistrationRequest, client_ip: str) -> Dict[str, Any]:
|
||||
"""Register a new cluster node."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
# Log registration attempt
|
||||
await self._log_registration_attempt(
|
||||
client_ip, request.token, request.node_id,
|
||||
request.hostname, True, None, request.metadata
|
||||
)
|
||||
|
||||
try:
|
||||
# Validate token
|
||||
token_info = await self.validate_token(request.token, client_ip)
|
||||
if not token_info:
|
||||
await self._log_registration_attempt(
|
||||
client_ip, request.token, request.node_id,
|
||||
request.hostname, False, "Invalid or expired token", request.metadata
|
||||
)
|
||||
raise ValueError("Invalid or expired registration token")
|
||||
|
||||
# Extract system info components
|
||||
system_info = request.system_info or {}
|
||||
cpu_info = system_info.get('cpu', {})
|
||||
memory_info = system_info.get('memory', {})
|
||||
gpu_info = system_info.get('gpu', {})
|
||||
disk_info = system_info.get('disk', {})
|
||||
os_info = system_info.get('os', {})
|
||||
platform_info = system_info.get('platform', {})
|
||||
|
||||
# Register or update node
|
||||
result = await conn.fetchrow("""
|
||||
INSERT INTO cluster_nodes (
|
||||
node_id, hostname, ip_address, registration_token,
|
||||
cpu_info, memory_info, gpu_info, disk_info, os_info, platform_info,
|
||||
services, capabilities, ports, client_version, registration_metadata
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15)
|
||||
ON CONFLICT (node_id) DO UPDATE SET
|
||||
hostname = EXCLUDED.hostname,
|
||||
ip_address = EXCLUDED.ip_address,
|
||||
cpu_info = EXCLUDED.cpu_info,
|
||||
memory_info = EXCLUDED.memory_info,
|
||||
gpu_info = EXCLUDED.gpu_info,
|
||||
disk_info = EXCLUDED.disk_info,
|
||||
os_info = EXCLUDED.os_info,
|
||||
platform_info = EXCLUDED.platform_info,
|
||||
services = EXCLUDED.services,
|
||||
capabilities = EXCLUDED.capabilities,
|
||||
ports = EXCLUDED.ports,
|
||||
client_version = EXCLUDED.client_version,
|
||||
registration_metadata = EXCLUDED.registration_metadata,
|
||||
status = 'online',
|
||||
last_heartbeat = NOW()
|
||||
RETURNING id, node_id, hostname, ip_address, first_registered
|
||||
""",
|
||||
request.node_id, request.hostname, request.ip_address, request.token,
|
||||
json.dumps(cpu_info) if cpu_info else None,
|
||||
json.dumps(memory_info) if memory_info else None,
|
||||
json.dumps(gpu_info) if gpu_info else None,
|
||||
json.dumps(disk_info) if disk_info else None,
|
||||
json.dumps(os_info) if os_info else None,
|
||||
json.dumps(platform_info) if platform_info else None,
|
||||
json.dumps(request.services) if request.services else None,
|
||||
json.dumps(request.capabilities) if request.capabilities else None,
|
||||
json.dumps(request.ports) if request.ports else None,
|
||||
request.client_version,
|
||||
json.dumps(request.metadata) if request.metadata else None
|
||||
)
|
||||
|
||||
logger.info(f"Node {request.node_id} registered successfully from {client_ip}")
|
||||
|
||||
return {
|
||||
"node_id": result["node_id"],
|
||||
"registration_status": "success",
|
||||
"heartbeat_interval": 30, # seconds
|
||||
"registered_at": result["first_registered"].isoformat(),
|
||||
"cluster_info": {
|
||||
"coordinator_version": "1.0.0",
|
||||
"features": ["heartbeat", "dynamic_scaling", "service_discovery"]
|
||||
}
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to register node {request.node_id}: {e}")
|
||||
await self._log_registration_attempt(
|
||||
client_ip, request.token, request.node_id,
|
||||
request.hostname, False, str(e), request.metadata
|
||||
)
|
||||
raise
|
||||
|
||||
async def update_heartbeat(self, request: HeartbeatRequest) -> Dict[str, Any]:
|
||||
"""Update node heartbeat and metrics."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
try:
|
||||
# Update node status and heartbeat
|
||||
result = await conn.fetchrow("""
|
||||
UPDATE cluster_nodes
|
||||
SET status = $2, last_heartbeat = NOW()
|
||||
WHERE node_id = $1
|
||||
RETURNING node_id, status, last_heartbeat
|
||||
""", request.node_id, request.status)
|
||||
|
||||
if not result:
|
||||
raise ValueError(f"Node {request.node_id} not found")
|
||||
|
||||
# Record heartbeat metrics
|
||||
await conn.execute("""
|
||||
INSERT INTO node_heartbeats (
|
||||
node_id, cpu_usage, memory_usage, disk_usage, gpu_usage,
|
||||
services_status, network_metrics, custom_metrics
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
|
||||
""",
|
||||
request.node_id, request.cpu_usage, request.memory_usage,
|
||||
request.disk_usage, request.gpu_usage,
|
||||
json.dumps(request.services_status) if request.services_status else None,
|
||||
json.dumps(request.network_metrics) if request.network_metrics else None,
|
||||
json.dumps(request.custom_metrics) if request.custom_metrics else None
|
||||
)
|
||||
|
||||
return {
|
||||
"node_id": result["node_id"],
|
||||
"status": result["status"],
|
||||
"heartbeat_received": result["last_heartbeat"].isoformat(),
|
||||
"next_heartbeat_in": 30, # seconds
|
||||
"commands": [] # Future: cluster management commands
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to update heartbeat for {request.node_id}: {e}")
|
||||
raise
|
||||
|
||||
async def get_registered_nodes(self, include_offline: bool = True) -> List[ClusterNode]:
|
||||
"""Get all registered cluster nodes."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
try:
|
||||
query = """
|
||||
SELECT id, node_id, hostname, ip_address, registration_token,
|
||||
cpu_info, memory_info, gpu_info, disk_info, os_info, platform_info,
|
||||
status, last_heartbeat, first_registered,
|
||||
services, capabilities, ports, client_version, registration_metadata
|
||||
FROM cluster_nodes
|
||||
"""
|
||||
|
||||
if not include_offline:
|
||||
query += " WHERE status != 'offline'"
|
||||
|
||||
query += " ORDER BY first_registered DESC"
|
||||
|
||||
results = await conn.fetch(query)
|
||||
|
||||
nodes = []
|
||||
for result in results:
|
||||
node_dict = dict(result)
|
||||
# Parse JSON fields
|
||||
for json_field in ['cpu_info', 'memory_info', 'gpu_info', 'disk_info',
|
||||
'os_info', 'platform_info', 'services', 'capabilities',
|
||||
'ports', 'registration_metadata']:
|
||||
if node_dict[json_field]:
|
||||
try:
|
||||
node_dict[json_field] = json.loads(node_dict[json_field])
|
||||
except json.JSONDecodeError:
|
||||
node_dict[json_field] = None
|
||||
|
||||
nodes.append(ClusterNode(**node_dict))
|
||||
|
||||
return nodes
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get registered nodes: {e}")
|
||||
raise
|
||||
|
||||
async def get_node_details(self, node_id: str) -> Optional[ClusterNode]:
|
||||
"""Get detailed information about a specific node."""
|
||||
nodes = await self.get_registered_nodes()
|
||||
return next((node for node in nodes if node.node_id == node_id), None)
|
||||
|
||||
async def remove_node(self, node_id: str) -> bool:
|
||||
"""Remove a node from the cluster."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
try:
|
||||
result = await conn.execute("""
|
||||
DELETE FROM cluster_nodes WHERE node_id = $1
|
||||
""", node_id)
|
||||
|
||||
if result != "DELETE 0":
|
||||
logger.info(f"Node {node_id} removed from cluster")
|
||||
return True
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to remove node {node_id}: {e}")
|
||||
return False
|
||||
|
||||
# Maintenance and Monitoring
|
||||
async def cleanup_offline_nodes(self, offline_threshold_minutes: int = 10) -> int:
|
||||
"""Mark nodes as offline if they haven't sent heartbeats."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
try:
|
||||
result = await conn.execute("""
|
||||
UPDATE cluster_nodes
|
||||
SET status = 'offline'
|
||||
WHERE status = 'online'
|
||||
AND last_heartbeat < NOW() - INTERVAL '%s minutes'
|
||||
""" % offline_threshold_minutes)
|
||||
|
||||
# Extract number from result like "UPDATE 3"
|
||||
count = int(result.split()[-1]) if result.split()[-1].isdigit() else 0
|
||||
if count > 0:
|
||||
logger.info(f"Marked {count} nodes as offline due to missing heartbeats")
|
||||
|
||||
return count
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to cleanup offline nodes: {e}")
|
||||
return 0
|
||||
|
||||
async def cleanup_old_heartbeats(self, retention_days: int = 30) -> int:
|
||||
"""Remove old heartbeat data for storage management."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
try:
|
||||
result = await conn.execute("""
|
||||
DELETE FROM node_heartbeats
|
||||
WHERE heartbeat_time < NOW() - INTERVAL '%s days'
|
||||
""" % retention_days)
|
||||
|
||||
count = int(result.split()[-1]) if result.split()[-1].isdigit() else 0
|
||||
if count > 0:
|
||||
logger.info(f"Cleaned up {count} old heartbeat records")
|
||||
|
||||
return count
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to cleanup old heartbeats: {e}")
|
||||
return 0
|
||||
|
||||
async def _log_registration_attempt(
|
||||
self,
|
||||
ip_address: str,
|
||||
token: str,
|
||||
node_id: str,
|
||||
hostname: str,
|
||||
success: bool,
|
||||
failure_reason: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None
|
||||
):
|
||||
"""Log registration attempts for security monitoring."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
try:
|
||||
await conn.execute("""
|
||||
INSERT INTO node_registration_attempts (
|
||||
ip_address, token_used, node_id, hostname,
|
||||
success, failure_reason, request_metadata
|
||||
) VALUES ($1, $2, $3, $4, $5, $6, $7)
|
||||
""", ip_address, token, node_id, hostname, success, failure_reason,
|
||||
json.dumps(metadata) if metadata else None)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to log registration attempt: {e}")
|
||||
|
||||
async def get_cluster_statistics(self) -> Dict[str, Any]:
|
||||
"""Get cluster statistics and health metrics."""
|
||||
conn = await self.get_connection()
|
||||
|
||||
try:
|
||||
# Node statistics
|
||||
node_stats = await conn.fetchrow("""
|
||||
SELECT
|
||||
COUNT(*) as total_nodes,
|
||||
COUNT(*) FILTER (WHERE status = 'online') as online_nodes,
|
||||
COUNT(*) FILTER (WHERE status = 'offline') as offline_nodes,
|
||||
COUNT(*) FILTER (WHERE status = 'maintenance') as maintenance_nodes
|
||||
FROM cluster_nodes
|
||||
""")
|
||||
|
||||
# Token statistics
|
||||
token_stats = await conn.fetchrow("""
|
||||
SELECT
|
||||
COUNT(*) as total_tokens,
|
||||
COUNT(*) FILTER (WHERE is_active = true) as active_tokens,
|
||||
COUNT(*) FILTER (WHERE expires_at IS NOT NULL AND expires_at < NOW()) as expired_tokens
|
||||
FROM cluster_tokens
|
||||
""")
|
||||
|
||||
return {
|
||||
"cluster_health": {
|
||||
"total_nodes": node_stats["total_nodes"],
|
||||
"online_nodes": node_stats["online_nodes"],
|
||||
"offline_nodes": node_stats["offline_nodes"],
|
||||
"maintenance_nodes": node_stats["maintenance_nodes"],
|
||||
"health_percentage": (node_stats["online_nodes"] / max(node_stats["total_nodes"], 1)) * 100
|
||||
},
|
||||
"token_management": {
|
||||
"total_tokens": token_stats["total_tokens"],
|
||||
"active_tokens": token_stats["active_tokens"],
|
||||
"expired_tokens": token_stats["expired_tokens"]
|
||||
},
|
||||
"last_updated": datetime.now().isoformat()
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to get cluster statistics: {e}")
|
||||
return {
|
||||
"error": str(e),
|
||||
"last_updated": datetime.now().isoformat()
|
||||
}
|
||||
@@ -26,7 +26,7 @@ class ClusterService:
            "ip": "192.168.1.113",
            "hostname": "ironwood",
            "role": "worker",
            "gpu": "NVIDIA RTX 3070",
            "gpu": "NVIDIA RTX 2080S",
            "memory": "128GB",
            "cpu": "AMD Threadripper 2920X",
            "ollama_port": 11434,
@@ -57,6 +57,66 @@ class ClusterService:
        self.n8n_api_base = "https://n8n.home.deepblack.cloud/api/v1"
        self.n8n_api_key = self._get_n8n_api_key()

    def _get_live_hardware_info(self, hostname: str, ip: str) -> Dict[str, str]:
        """Get live hardware information from a remote node via SSH."""
        hardware = {
            "cpu": "Unknown",
            "memory": "Unknown",
            "gpu": "Unknown"
        }

        try:
            # Try to get GPU info via SSH
            print(f"🔍 SSH GPU command for {hostname}: ssh tony@{ip} 'nvidia-smi || lspci | grep -i vga'")
            gpu_result = subprocess.run([
                "ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
                f"tony@{ip}", "nvidia-smi --query-gpu=name --format=csv,noheader,nounits || lspci | grep -i 'vga\\|3d\\|display'"
            ], capture_output=True, text=True, timeout=10)

            print(f"📊 GPU command result for {hostname}: returncode={gpu_result.returncode}, stdout='{gpu_result.stdout.strip()}', stderr='{gpu_result.stderr.strip()}'")

            if gpu_result.returncode == 0 and gpu_result.stdout.strip():
                gpu_info = gpu_result.stdout.strip().split('\n')[0]
                if "NVIDIA" in gpu_info or "RTX" in gpu_info or "GTX" in gpu_info:
                    hardware["gpu"] = gpu_info.strip()
                elif "VGA" in gpu_info or "Display" in gpu_info:
                    # Parse lspci output for GPU info
                    if "NVIDIA" in gpu_info:
                        parts = gpu_info.split("NVIDIA")
                        if len(parts) > 1:
                            gpu_name = "NVIDIA" + parts[1].split('[')[0].strip()
                            hardware["gpu"] = gpu_name
                    elif "AMD" in gpu_info or "Radeon" in gpu_info:
                        parts = gpu_info.split(":")
                        if len(parts) > 2:
                            gpu_name = parts[2].strip()
                            hardware["gpu"] = gpu_name

            # Try to get memory info via SSH
            mem_result = subprocess.run([
                "ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
                f"tony@{ip}", "free -h | grep '^Mem:' | awk '{print $2}'"
            ], capture_output=True, text=True, timeout=10)

            if mem_result.returncode == 0 and mem_result.stdout.strip():
                memory_info = mem_result.stdout.strip()
                hardware["memory"] = memory_info

            # Try to get CPU info via SSH
            cpu_result = subprocess.run([
                "ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
                f"tony@{ip}", "lscpu | grep 'Model name:' | cut -d':' -f2- | xargs"
            ], capture_output=True, text=True, timeout=10)

            if cpu_result.returncode == 0 and cpu_result.stdout.strip():
                cpu_info = cpu_result.stdout.strip()
                hardware["cpu"] = cpu_info

        except Exception as e:
            print(f"Error getting live hardware info for {hostname}: {e}")

        return hardware

    def _get_n8n_api_key(self) -> Optional[str]:
        """Get n8n API key from secrets."""
        try:
@@ -136,17 +196,35 @@ class ClusterService:
        except Exception:
            pass

        # Try to get live hardware info if node is online
        hardware_info = {
            "cpu": node_info["cpu"],
            "memory": node_info["memory"],
            "gpu": node_info["gpu"]
        }

        if status == "online":
            try:
                print(f"🔍 Getting live hardware info for {node_id} ({node_info['ip']})")
                live_hardware = self._get_live_hardware_info(node_info["hostname"], node_info["ip"])
                print(f"📊 Live hardware detected for {node_id}: {live_hardware}")
                # Use live data if available, fallback to hardcoded values
                for key in ["cpu", "memory", "gpu"]:
                    if live_hardware[key] != "Unknown":
                        print(f"✅ Using live {key} for {node_id}: {live_hardware[key]}")
                        hardware_info[key] = live_hardware[key]
                    else:
                        print(f"⚠️ Using fallback {key} for {node_id}: {hardware_info[key]}")
            except Exception as e:
                print(f"❌ Failed to get live hardware info for {node_id}: {e}")

        return {
            "id": node_id,
            "hostname": node_info["hostname"],
            "ip": node_info["ip"],
            "status": status,
            "role": node_info["role"],
            "hardware": {
                "cpu": node_info["cpu"],
                "memory": node_info["memory"],
                "gpu": node_info["gpu"]
            },
            "hardware": hardware_info,
            "model_count": model_count,
            "models": [{"name": m["name"], "size": m.get("size", 0)} for m in models],
            "metrics": {

@@ -689,4 +689,58 @@ class ProjectService:
        # Handle escalation status
        if status == "escalated":
            print(f"Task escalated for human review: {metadata}")
            # TODO: Trigger N8N webhook for human escalation

    def update_project(self, project_id: str, project_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """Update a project configuration."""
        try:
            # For now, projects are read-only from the filesystem
            # This could be extended to update project metadata files
            project = self.get_project_by_id(project_id)
            if not project:
                return None

            # Update project metadata in a local JSON file if needed
            # For now, just return the existing project as projects are filesystem-based
            print(f"Project update request for {project_id}: {project_data}")
            return project
        except Exception as e:
            print(f"Error updating project {project_id}: {e}")
            return None

    def create_project(self, project_data: Dict[str, Any]) -> Dict[str, Any]:
        """Create a new project."""
        try:
            # For now, projects are filesystem-based and read-only
            # This could be extended to create new project directories
            print(f"Project creation request: {project_data}")

            # Return a mock project for now
            project_id = project_data.get("name", "new-project").lower().replace(" ", "-")
            return {
                "id": project_id,
                "name": project_data.get("name", "New Project"),
                "description": project_data.get("description", ""),
                "status": "created",
                "created_at": datetime.now().isoformat(),
                "updated_at": datetime.now().isoformat()
            }
        except Exception as e:
            print(f"Error creating project: {e}")
            raise

    def delete_project(self, project_id: str) -> bool:
        """Delete a project."""
        try:
            # For now, projects are filesystem-based and read-only
            # This could be extended to archive or remove project directories
            project = self.get_project_by_id(project_id)
            if not project:
                return False

            print(f"Project deletion request for {project_id}")
            # Return success for now (projects are read-only)
            return True
        except Exception as e:
            print(f"Error deleting project {project_id}: {e}")
            return False

117
backend/migrations/004_add_context_feedback_tables.sql
Normal file
@@ -0,0 +1,117 @@
-- Migration 004: Add Context Feedback Tables for RL Context Curator Integration
-- Created: 2025-01-30
-- Description: Adds tables for context feedback, agent permissions, and task outcome tracking

-- Add RL Context Curator fields to tasks table
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS completion_time INTEGER;
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS errors_encountered INTEGER DEFAULT 0;
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS follow_up_questions INTEGER DEFAULT 0;
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS success_rate REAL;
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS context_used JSONB;
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS context_relevance_score REAL;
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS feedback_collected BOOLEAN DEFAULT false;
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS task_outcome VARCHAR(50);
ALTER TABLE tasks ADD COLUMN IF NOT EXISTS outcome_confidence REAL;

-- Create context_feedback table
CREATE TABLE IF NOT EXISTS context_feedback (
    id SERIAL PRIMARY KEY,
    context_id VARCHAR(255) NOT NULL,
    agent_id VARCHAR(255) NOT NULL REFERENCES agents(id),
    task_id UUID REFERENCES tasks(id),
    feedback_type VARCHAR(50) NOT NULL CHECK (feedback_type IN ('upvote', 'downvote', 'forgetfulness', 'task_success', 'task_failure')),
    role VARCHAR(100) NOT NULL,
    confidence REAL NOT NULL CHECK (confidence >= 0.0 AND confidence <= 1.0),
    reason TEXT,
    usage_context VARCHAR(255),
    directory_scope VARCHAR(500),
    task_type VARCHAR(100),
    timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Create indexes for context_feedback table
CREATE INDEX IF NOT EXISTS idx_context_feedback_context_id ON context_feedback(context_id);
CREATE INDEX IF NOT EXISTS idx_context_feedback_agent_id ON context_feedback(agent_id);
CREATE INDEX IF NOT EXISTS idx_context_feedback_timestamp ON context_feedback(timestamp);
CREATE INDEX IF NOT EXISTS idx_context_feedback_feedback_type ON context_feedback(feedback_type);
CREATE INDEX IF NOT EXISTS idx_context_feedback_role ON context_feedback(role);

-- Create agent_permissions table
CREATE TABLE IF NOT EXISTS agent_permissions (
    id SERIAL PRIMARY KEY,
    agent_id VARCHAR(255) NOT NULL REFERENCES agents(id),
    role VARCHAR(100) NOT NULL,
    directory_patterns TEXT,
    task_types TEXT,
    context_weight REAL DEFAULT 1.0 CHECK (context_weight >= 0.1 AND context_weight <= 2.0),
    active VARCHAR(10) DEFAULT 'true',
    created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Create indexes for agent_permissions table
CREATE INDEX IF NOT EXISTS idx_agent_permissions_agent_id ON agent_permissions(agent_id);
CREATE INDEX IF NOT EXISTS idx_agent_permissions_role ON agent_permissions(role);
CREATE INDEX IF NOT EXISTS idx_agent_permissions_active ON agent_permissions(active);

-- Create promotion_rule_history table
CREATE TABLE IF NOT EXISTS promotion_rule_history (
    id SERIAL PRIMARY KEY,
    rule_version VARCHAR(50) NOT NULL,
    category VARCHAR(100) NOT NULL,
    role VARCHAR(100) NOT NULL,
    weight_value REAL NOT NULL,
    change_reason TEXT,
    previous_value REAL,
    timestamp TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP
);

-- Create indexes for promotion_rule_history table
CREATE INDEX IF NOT EXISTS idx_promotion_rule_history_timestamp ON promotion_rule_history(timestamp);
CREATE INDEX IF NOT EXISTS idx_promotion_rule_history_category ON promotion_rule_history(category);
CREATE INDEX IF NOT EXISTS idx_promotion_rule_history_role ON promotion_rule_history(role);

-- Create trigger to update updated_at timestamp for agent_permissions
CREATE OR REPLACE FUNCTION update_agent_permissions_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

-- PostgreSQL has no CREATE TRIGGER IF NOT EXISTS; drop and recreate to keep the migration idempotent
DROP TRIGGER IF EXISTS trigger_agent_permissions_updated_at ON agent_permissions;
CREATE TRIGGER trigger_agent_permissions_updated_at
    BEFORE UPDATE ON agent_permissions
    FOR EACH ROW
    EXECUTE FUNCTION update_agent_permissions_updated_at();

-- Insert default agent permissions for existing agents
INSERT INTO agent_permissions (agent_id, role, directory_patterns, task_types, context_weight)
SELECT
    id as agent_id,
    COALESCE(role, 'general') as role,
    '*' as directory_patterns,
    'general_development,coding,documentation' as task_types,
    1.0 as context_weight
FROM agents
WHERE id NOT IN (SELECT agent_id FROM agent_permissions)
ON CONFLICT DO NOTHING;

-- Add comments to tables
COMMENT ON TABLE context_feedback IS 'Stores context feedback from agents for RL Context Curator learning';
COMMENT ON TABLE agent_permissions IS 'Stores agent role-based permissions for context filtering';
COMMENT ON TABLE promotion_rule_history IS 'Tracks changes to promotion rule weights over time';

-- Add comments to key columns
COMMENT ON COLUMN context_feedback.context_id IS 'Reference to HCFS context ID';
COMMENT ON COLUMN context_feedback.feedback_type IS 'Type of feedback: upvote, downvote, forgetfulness, task_success, task_failure';
COMMENT ON COLUMN context_feedback.confidence IS 'Confidence level in the feedback (0.0 to 1.0)';
COMMENT ON COLUMN agent_permissions.directory_patterns IS 'Comma-separated list of directory patterns this agent can access';
COMMENT ON COLUMN agent_permissions.context_weight IS 'Weight for context relevance calculation (0.1 to 2.0)';

-- Grant permissions (adjust as needed for your setup)
-- GRANT SELECT, INSERT, UPDATE ON context_feedback TO hive_user;
-- GRANT SELECT, INSERT, UPDATE ON agent_permissions TO hive_user;
-- GRANT SELECT, INSERT ON promotion_rule_history TO hive_user;

COMMIT;

241
backend/migrations/007_add_cluster_registration.sql
Normal file
@@ -0,0 +1,241 @@
|
||||
-- Cluster Registration Migration
|
||||
-- Implements the registration-based cluster architecture for Hive-Bzzz integration
|
||||
-- Version: 1.0
|
||||
-- Date: 2025-07-31
|
||||
|
||||
-- =============================================================================
|
||||
-- CLUSTER REGISTRATION SYSTEM
|
||||
-- =============================================================================
|
||||
|
||||
-- Cluster registration tokens (similar to Docker Swarm tokens)
|
||||
CREATE TABLE cluster_tokens (
|
||||
id SERIAL PRIMARY KEY,
|
||||
token VARCHAR(64) UNIQUE NOT NULL,
|
||||
description TEXT,
|
||||
created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
||||
expires_at TIMESTAMP WITH TIME ZONE,
|
||||
is_active BOOLEAN DEFAULT true,
|
||||
created_by UUID REFERENCES users(id) ON DELETE SET NULL,
|
||||
|
||||
-- Token metadata
|
||||
max_registrations INTEGER DEFAULT NULL, -- NULL = unlimited
|
||||
current_registrations INTEGER DEFAULT 0,
|
||||
|
||||
-- IP restrictions (optional)
|
||||
allowed_ip_ranges TEXT[], -- CIDR ranges like ['192.168.1.0/24']
|
||||
|
||||
CONSTRAINT valid_token_format CHECK (token ~ '^[a-zA-Z0-9_-]{32,64}$')
|
||||
);
|
||||
|
||||
-- Registered cluster nodes (dynamic discovery)
|
||||
CREATE TABLE cluster_nodes (
|
||||
id SERIAL PRIMARY KEY,
|
||||
node_id VARCHAR(64) UNIQUE NOT NULL,
|
||||
hostname VARCHAR(255) NOT NULL,
|
||||
ip_address INET NOT NULL,
|
||||
registration_token VARCHAR(64) REFERENCES cluster_tokens(token) ON DELETE CASCADE,
|
||||
|
||||
-- Hardware information (reported by client)
|
||||
cpu_info JSONB,
|
||||
memory_info JSONB,
|
||||
gpu_info JSONB,
|
||||
disk_info JSONB,
|
||||
|
||||
-- System information
|
||||
os_info JSONB,
|
||||
platform_info JSONB,
|
||||
|
||||
-- Status tracking
|
||||
status VARCHAR(20) DEFAULT 'online' CHECK (status IN ('online', 'offline', 'maintenance', 'error')),
|
||||
last_heartbeat TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
||||
first_registered TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
||||
|
||||
-- Services and capabilities
|
||||
services JSONB, -- Available services like ollama, docker, etc.
|
||||
capabilities JSONB, -- Available models, tools, etc.
|
||||
|
||||
-- Network information
|
||||
ports JSONB, -- Service ports like {"ollama": 11434, "cockpit": 9090}
|
||||
|
||||
-- Registration metadata
|
||||
client_version VARCHAR(50),
|
||||
registration_metadata JSONB,
|
||||
|
||||
CONSTRAINT valid_node_id_format CHECK (node_id ~ '^[a-zA-Z0-9_-]+$'),
|
||||
CONSTRAINT valid_status CHECK (status IN ('online', 'offline', 'maintenance', 'error'))
|
||||
);
|
||||
|
||||
-- Node heartbeat history (for performance tracking)
|
||||
CREATE TABLE node_heartbeats (
|
||||
id SERIAL PRIMARY KEY,
|
||||
node_id VARCHAR(64) REFERENCES cluster_nodes(node_id) ON DELETE CASCADE,
|
||||
heartbeat_time TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
||||
|
||||
-- Runtime metrics
|
||||
cpu_usage FLOAT,
|
||||
memory_usage FLOAT,
|
||||
disk_usage FLOAT,
|
||||
gpu_usage FLOAT,
|
||||
|
||||
-- Service status
|
||||
services_status JSONB,
|
||||
|
||||
-- Network metrics
|
||||
network_metrics JSONB,
|
||||
|
||||
-- Custom metrics from client
|
||||
custom_metrics JSONB
|
||||
);
|
||||
|
||||
-- Node registration attempts (for security monitoring)
|
||||
CREATE TABLE node_registration_attempts (
|
||||
id SERIAL PRIMARY KEY,
|
||||
attempted_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
|
||||
|
||||
-- Request information
|
||||
ip_address INET NOT NULL,
|
||||
user_agent TEXT,
|
||||
token_used VARCHAR(64),
|
||||
node_id VARCHAR(64),
|
||||
hostname VARCHAR(255),
|
||||
|
||||
-- Result
|
||||
success BOOLEAN NOT NULL,
|
||||
failure_reason TEXT,
|
||||
|
||||
-- Security metadata
|
||||
request_metadata JSONB
|
||||
);
|
||||
|
||||
-- =============================================================================
|
||||
-- INDEXES FOR PERFORMANCE
|
||||
-- =============================================================================
|
||||
|
||||
-- Token lookup and validation
|
||||
CREATE INDEX idx_cluster_tokens_token ON cluster_tokens(token) WHERE is_active = true;
|
||||
CREATE INDEX idx_cluster_tokens_active ON cluster_tokens(is_active, expires_at);
|
||||
|
||||
-- Node lookups and status queries
|
||||
CREATE INDEX idx_cluster_nodes_node_id ON cluster_nodes(node_id);
|
||||
CREATE INDEX idx_cluster_nodes_status ON cluster_nodes(status);
|
||||
CREATE INDEX idx_cluster_nodes_last_heartbeat ON cluster_nodes(last_heartbeat);
|
||||
CREATE INDEX idx_cluster_nodes_token ON cluster_nodes(registration_token);
|
||||
|
||||
-- Heartbeat queries (time-series data)
|
||||
CREATE INDEX idx_node_heartbeats_node_time ON node_heartbeats(node_id, heartbeat_time DESC);
|
||||
CREATE INDEX idx_node_heartbeats_time ON node_heartbeats(heartbeat_time DESC);
|
||||
|
||||
-- Security monitoring
|
||||
CREATE INDEX idx_registration_attempts_ip_time ON node_registration_attempts(ip_address, attempted_at DESC);
|
||||
CREATE INDEX idx_registration_attempts_success ON node_registration_attempts(success, attempted_at DESC);
|
||||
|
||||
-- =============================================================================
|
||||
-- FUNCTIONS AND TRIGGERS
|
||||
-- =============================================================================
|
||||
|
||||
-- Function to update token registration count
|
||||
CREATE OR REPLACE FUNCTION update_token_registration_count()
|
||||
RETURNS TRIGGER AS $$
|
||||
BEGIN
|
||||
IF TG_OP = 'INSERT' THEN
|
||||
UPDATE cluster_tokens
|
||||
SET current_registrations = current_registrations + 1
|
||||
WHERE token = NEW.registration_token;
|
||||
RETURN NEW;
|
||||
ELSIF TG_OP = 'DELETE' THEN
|
||||
UPDATE cluster_tokens
|
||||
SET current_registrations = current_registrations - 1
|
||||
WHERE token = OLD.registration_token;
|
||||
RETURN OLD;
|
||||
END IF;
|
||||
RETURN NULL;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Trigger to maintain registration counts
|
||||
CREATE TRIGGER trigger_update_token_count
|
||||
AFTER INSERT OR DELETE ON cluster_nodes
|
||||
FOR EACH ROW
|
||||
EXECUTE FUNCTION update_token_registration_count();
|
||||
|
||||
-- Function to clean up old heartbeats (data retention)
|
||||
CREATE OR REPLACE FUNCTION cleanup_old_heartbeats()
|
||||
RETURNS INTEGER AS $$
|
||||
DECLARE
|
||||
deleted_count INTEGER;
|
||||
BEGIN
|
||||
DELETE FROM node_heartbeats
|
||||
WHERE heartbeat_time < NOW() - INTERVAL '30 days';
|
||||
|
||||
GET DIAGNOSTICS deleted_count = ROW_COUNT;
|
||||
RETURN deleted_count;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- Function to update node status based on heartbeat
|
||||
CREATE OR REPLACE FUNCTION update_node_status()
|
||||
RETURNS INTEGER AS $$
|
||||
DECLARE
|
||||
updated_count INTEGER;
|
||||
BEGIN
|
||||
-- Mark nodes as offline if no heartbeat in 5 minutes
|
||||
UPDATE cluster_nodes
|
||||
SET status = 'offline'
|
||||
WHERE status = 'online'
|
||||
AND last_heartbeat < NOW() - INTERVAL '5 minutes';
|
||||
|
||||
GET DIAGNOSTICS updated_count = ROW_COUNT;
|
||||
RETURN updated_count;
|
||||
END;
|
||||
$$ LANGUAGE plpgsql;
|
||||
|
||||
-- =============================================================================
|
||||
-- INITIAL DATA (for development/testing)
|
||||
-- =============================================================================
|
||||
|
||||
-- Insert development cluster token
|
||||
INSERT INTO cluster_tokens (token, description, created_by)
|
||||
VALUES (
|
||||
'hive_dev_cluster_token_12345678901234567890123456789012',
|
||||
'Development cluster token for testing',
|
||||
(SELECT id FROM users WHERE username = 'admin' LIMIT 1)
|
||||
) ON CONFLICT (token) DO NOTHING;
|
||||
|
||||
-- Insert production cluster token (should be changed in production)
|
||||
INSERT INTO cluster_tokens (token, description, created_by, expires_at)
|
||||
VALUES (
|
||||
'hive_prod_cluster_token_98765432109876543210987654321098',
|
||||
'Production cluster token - CHANGE THIS IN PRODUCTION',
|
||||
(SELECT id FROM users WHERE username = 'admin' LIMIT 1),
|
||||
NOW() + INTERVAL '1 year'
|
||||
) ON CONFLICT (token) DO NOTHING;
|
||||
|
||||
-- =============================================================================
|
||||
-- COMMENTS AND DOCUMENTATION
|
||||
-- =============================================================================
|
||||
|
||||
COMMENT ON TABLE cluster_tokens IS 'Registration tokens for cluster nodes to join the Hive cluster';
|
||||
COMMENT ON TABLE cluster_nodes IS 'Dynamically registered cluster nodes with hardware and capability information';
|
||||
COMMENT ON TABLE node_heartbeats IS 'Heartbeat history for performance monitoring and status tracking';
|
||||
COMMENT ON TABLE node_registration_attempts IS 'Security log of all node registration attempts';
|
||||
|
||||
COMMENT ON COLUMN cluster_tokens.token IS 'Unique token for node registration, format: hive_[env]_cluster_token_[random]';
|
||||
COMMENT ON COLUMN cluster_tokens.max_registrations IS 'Maximum number of nodes that can use this token (NULL = unlimited)';
|
||||
COMMENT ON COLUMN cluster_tokens.allowed_ip_ranges IS 'CIDR ranges that can use this token (NULL = any IP)';
|
||||
|
||||
COMMENT ON COLUMN cluster_nodes.node_id IS 'Unique identifier for the node (hostname-uuid format recommended)';
|
||||
COMMENT ON COLUMN cluster_nodes.cpu_info IS 'CPU information: {"cores": 8, "model": "AMD Ryzen 7", "architecture": "x86_64"}';
|
||||
COMMENT ON COLUMN cluster_nodes.memory_info IS 'Memory information: {"total_gb": 64, "available_gb": 32, "type": "DDR4"}';
|
||||
COMMENT ON COLUMN cluster_nodes.gpu_info IS 'GPU information: {"model": "NVIDIA RTX 2080S", "memory_gb": 8, "driver": "535.86.05"}';
|
||||
COMMENT ON COLUMN cluster_nodes.services IS 'Available services: {"ollama": {"version": "0.1.7", "port": 11434}, "docker": {"version": "24.0.6"}}';
|
||||
COMMENT ON COLUMN cluster_nodes.capabilities IS 'Node capabilities: {"models": ["llama2", "codellama"], "max_concurrent": 4}';
|
||||
|
||||
-- Migration completion notice
|
||||
DO $$
|
||||
BEGIN
|
||||
RAISE NOTICE 'Cluster registration migration completed successfully!';
|
||||
RAISE NOTICE 'Development token: hive_dev_cluster_token_12345678901234567890123456789012';
|
||||
RAISE NOTICE 'Production token: hive_prod_cluster_token_98765432109876543210987654321098';
|
||||
RAISE NOTICE 'SECURITY WARNING: Change production tokens before deployment!';
|
||||
END
|
||||
$$;

106
backend/scripts/apply_cluster_migration.sh
Executable file
@@ -0,0 +1,106 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Apply cluster registration migration to Hive database
|
||||
# This script applies the 007_add_cluster_registration.sql migration
|
||||
|
||||
set -e
|
||||
|
||||
echo "🚀 Applying Cluster Registration Migration..."
|
||||
|
||||
# Configuration
|
||||
DB_NAME="hive"
|
||||
DB_USER="postgres"
|
||||
DB_PASSWORD="hive123"
|
||||
MIGRATION_FILE="./migrations/007_add_cluster_registration.sql"
|
||||
|
||||
# Check if migration file exists
|
||||
if [[ ! -f "$MIGRATION_FILE" ]]; then
|
||||
echo "❌ Migration file not found: $MIGRATION_FILE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "📁 Migration file: $MIGRATION_FILE"
|
||||
|
||||
# Function to run SQL via Docker
|
||||
run_sql_docker() {
|
||||
local sql_file="$1"
|
||||
echo "🐳 Executing migration via Docker..."
|
||||
|
||||
# Check if PostgreSQL service is running in Docker swarm
|
||||
if docker service ls | grep -q "hive_postgres"; then
|
||||
echo "✅ PostgreSQL service found in Docker swarm"
|
||||
|
||||
# Get a running PostgreSQL container
|
||||
CONTAINER_ID=$(docker ps --filter "label=com.docker.swarm.service.name=hive_postgres" --format "{{.ID}}" | head -n1)
|
||||
|
||||
if [[ -z "$CONTAINER_ID" ]]; then
|
||||
echo "❌ No running PostgreSQL container found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "📦 Using PostgreSQL container: $CONTAINER_ID"
|
||||
|
||||
# Copy migration file to container and execute
|
||||
docker cp "$sql_file" "$CONTAINER_ID:/tmp/migration.sql"
|
||||
docker exec "$CONTAINER_ID" psql -U "$DB_USER" -d "$DB_NAME" -f /tmp/migration.sql
|
||||
docker exec "$CONTAINER_ID" rm -f /tmp/migration.sql
|
||||
|
||||
else
|
||||
echo "❌ PostgreSQL service not found in Docker swarm"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
# Function to run SQL locally
|
||||
run_sql_local() {
|
||||
local sql_file="$1"
|
||||
echo "🏠 Executing migration locally..."
|
||||
|
||||
# Check if psql is available
|
||||
if ! command -v psql &> /dev/null; then
|
||||
echo "❌ psql command not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Try to connect locally
|
||||
PGPASSWORD="$DB_PASSWORD" psql -h localhost -U "$DB_USER" -d "$DB_NAME" -f "$sql_file"
|
||||
}
|
||||
|
||||
# Try Docker first, then local
|
||||
echo "🔍 Attempting migration..."
|
||||
|
||||
if run_sql_docker "$MIGRATION_FILE" 2>/dev/null; then
|
||||
echo "✅ Migration applied successfully via Docker!"
|
||||
elif run_sql_local "$MIGRATION_FILE" 2>/dev/null; then
|
||||
echo "✅ Migration applied successfully locally!"
|
||||
else
|
||||
echo "❌ Migration failed via both Docker and local methods"
|
||||
echo "📝 Manual steps:"
|
||||
echo "1. Ensure PostgreSQL is running"
|
||||
echo "2. Check database credentials"
|
||||
echo "3. Run manually: psql -h localhost -U postgres -d hive -f $MIGRATION_FILE"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo ""
|
||||
echo "🎉 Cluster Registration Migration Complete!"
|
||||
echo ""
|
||||
echo "📋 Summary:"
|
||||
echo " ✅ cluster_tokens table created"
|
||||
echo " ✅ cluster_nodes table created"
|
||||
echo " ✅ node_heartbeats table created"
|
||||
echo " ✅ node_registration_attempts table created"
|
||||
echo " ✅ Indexes and triggers created"
|
||||
echo " ✅ Development tokens inserted"
|
||||
echo ""
|
||||
echo "🔐 Development Tokens:"
|
||||
echo " Dev Token: hive_dev_cluster_token_12345678901234567890123456789012"
|
||||
echo " Prod Token: hive_prod_cluster_token_98765432109876543210987654321098"
|
||||
echo ""
|
||||
echo "⚠️ SECURITY WARNING: Change production tokens before deployment!"
|
||||
echo ""
|
||||
echo "🚀 Next steps:"
|
||||
echo " 1. Implement registration API endpoints (/api/cluster/register)"
|
||||
echo " 2. Add heartbeat API endpoint (/api/cluster/heartbeat)"
|
||||
echo " 3. Update Bzzz clients to use registration system"
|
||||
echo ""
|
||||