WIP: Save current work before CHORUS rebrand
- Agent roles integration progress
- Various backend and frontend updates
- Storybook cache cleanup

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
@@ -26,7 +26,7 @@ class ClusterService:
                 "ip": "192.168.1.113",
                 "hostname": "ironwood",
                 "role": "worker",
-                "gpu": "NVIDIA RTX 3070",
+                "gpu": "NVIDIA RTX 2080S",
                 "memory": "128GB",
                 "cpu": "AMD Threadripper 2920X",
                 "ollama_port": 11434,
@@ -57,6 +57,66 @@ class ClusterService:
         self.n8n_api_base = "https://n8n.home.deepblack.cloud/api/v1"
         self.n8n_api_key = self._get_n8n_api_key()
 
+    def _get_live_hardware_info(self, hostname: str, ip: str) -> Dict[str, str]:
+        """Get live hardware information from a remote node via SSH."""
+        hardware = {
+            "cpu": "Unknown",
+            "memory": "Unknown",
+            "gpu": "Unknown"
+        }
+
+        try:
+            # Try to get GPU info via SSH
+            print(f"🔍 SSH GPU command for {hostname}: ssh tony@{ip} 'nvidia-smi || lspci | grep -i vga'")
+            gpu_result = subprocess.run([
+                "ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
+                f"tony@{ip}", "nvidia-smi --query-gpu=name --format=csv,noheader,nounits || lspci | grep -i 'vga\\|3d\\|display'"
+            ], capture_output=True, text=True, timeout=10)
+
+            print(f"📊 GPU command result for {hostname}: returncode={gpu_result.returncode}, stdout='{gpu_result.stdout.strip()}', stderr='{gpu_result.stderr.strip()}'")
+
+            if gpu_result.returncode == 0 and gpu_result.stdout.strip():
+                gpu_info = gpu_result.stdout.strip().split('\n')[0]
+                if "NVIDIA" in gpu_info or "RTX" in gpu_info or "GTX" in gpu_info:
+                    hardware["gpu"] = gpu_info.strip()
+                elif "VGA" in gpu_info or "Display" in gpu_info:
+                    # Parse lspci output for GPU info
+                    if "NVIDIA" in gpu_info:
+                        parts = gpu_info.split("NVIDIA")
+                        if len(parts) > 1:
+                            gpu_name = "NVIDIA" + parts[1].split('[')[0].strip()
+                            hardware["gpu"] = gpu_name
+                    elif "AMD" in gpu_info or "Radeon" in gpu_info:
+                        parts = gpu_info.split(":")
+                        if len(parts) > 2:
+                            gpu_name = parts[2].strip()
+                            hardware["gpu"] = gpu_name
+
+            # Try to get memory info via SSH
+            mem_result = subprocess.run([
+                "ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
+                f"tony@{ip}", "free -h | grep '^Mem:' | awk '{print $2}'"
+            ], capture_output=True, text=True, timeout=10)
+
+            if mem_result.returncode == 0 and mem_result.stdout.strip():
+                memory_info = mem_result.stdout.strip()
+                hardware["memory"] = memory_info
+
+            # Try to get CPU info via SSH
+            cpu_result = subprocess.run([
+                "ssh", "-o", "StrictHostKeyChecking=no", "-o", "ConnectTimeout=5",
+                f"tony@{ip}", "lscpu | grep 'Model name:' | cut -d':' -f2- | xargs"
+            ], capture_output=True, text=True, timeout=10)
+
+            if cpu_result.returncode == 0 and cpu_result.stdout.strip():
+                cpu_info = cpu_result.stdout.strip()
+                hardware["cpu"] = cpu_info
+
+        except Exception as e:
+            print(f"Error getting live hardware info for {hostname}: {e}")
+
+        return hardware
+
     def _get_n8n_api_key(self) -> Optional[str]:
         """Get n8n API key from secrets."""
         try:
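The shell fallback used in the new GPU probe can be checked on its own before wiring it through SSH. Below is a minimal local sketch, assuming a Linux host with bash and either nvidia-smi or lspci available; the probe string is taken verbatim from the hunk above, everything else in the snippet is illustrative and not part of this commit.

```python
# Sketch: run the same GPU probe locally that _get_live_hardware_info sends over SSH,
# to see the raw text its parser receives. Assumes bash plus nvidia-smi or lspci.
import subprocess

probe = "nvidia-smi --query-gpu=name --format=csv,noheader,nounits || lspci | grep -i 'vga\\|3d\\|display'"
result = subprocess.run(["bash", "-c", probe], capture_output=True, text=True, timeout=10)
first_line = result.stdout.strip().split("\n")[0] if result.stdout.strip() else ""
print(result.returncode, repr(first_line))
```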
@@ -136,17 +196,35 @@ class ClusterService:
         except Exception:
             pass
 
+        # Try to get live hardware info if node is online
+        hardware_info = {
+            "cpu": node_info["cpu"],
+            "memory": node_info["memory"],
+            "gpu": node_info["gpu"]
+        }
+
+        if status == "online":
+            try:
+                print(f"🔍 Getting live hardware info for {node_id} ({node_info['ip']})")
+                live_hardware = self._get_live_hardware_info(node_info["hostname"], node_info["ip"])
+                print(f"📊 Live hardware detected for {node_id}: {live_hardware}")
+                # Use live data if available, fallback to hardcoded values
+                for key in ["cpu", "memory", "gpu"]:
+                    if live_hardware[key] != "Unknown":
+                        print(f"✅ Using live {key} for {node_id}: {live_hardware[key]}")
+                        hardware_info[key] = live_hardware[key]
+                    else:
+                        print(f"⚠️ Using fallback {key} for {node_id}: {hardware_info[key]}")
+            except Exception as e:
+                print(f"❌ Failed to get live hardware info for {node_id}: {e}")
+
         return {
             "id": node_id,
             "hostname": node_info["hostname"],
             "ip": node_info["ip"],
             "status": status,
             "role": node_info["role"],
-            "hardware": {
-                "cpu": node_info["cpu"],
-                "memory": node_info["memory"],
-                "gpu": node_info["gpu"]
-            },
+            "hardware": hardware_info,
             "model_count": model_count,
             "models": [{"name": m["name"], "size": m.get("size", 0)} for m in models],
             "metrics": {
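Taken together, the two hunks make the returned "hardware" block prefer live SSH readings and fall back to the hardcoded node entry on a per-key basis. The sketch below illustrates that merge using the ironwood values from this diff and a stubbed live result; the live values are made up for illustration and this is not code from the commit.

```python
# Stubbed illustration of the per-key fallback added above: live readings
# override the hardcoded entry only when they are not "Unknown".
hardcoded = {"cpu": "AMD Threadripper 2920X", "memory": "128GB", "gpu": "NVIDIA RTX 2080S"}
live = {"cpu": "Unknown", "memory": "125Gi", "gpu": "NVIDIA GeForce RTX 2080 SUPER"}  # hypothetical probe result

hardware_info = dict(hardcoded)
for key in ["cpu", "memory", "gpu"]:
    if live[key] != "Unknown":
        hardware_info[key] = live[key]

print(hardware_info)
# {'cpu': 'AMD Threadripper 2920X', 'memory': '125Gi', 'gpu': 'NVIDIA GeForce RTX 2080 SUPER'}
```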