CHORUS/p2p/node.go
anthonyrawlins ea04378962 fix: Resolve WHOOSH startup failures and restore service functionality
## Problem Analysis
- WHOOSH service was failing to start due to BACKBEAT NATS connectivity issues
- Containers were unable to resolve the "backbeat-nats" hostname via DNS (see the diagnostic sketch after this list)
- Service was stuck in deployment loops with all replicas failing
- Root cause: Missing WHOOSH_BACKBEAT_NATS_URL environment variable configuration
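
The DNS symptom above can be reproduced from inside an affected container by looking up the service name. This is a hypothetical diagnostic sketch, not part of the fix:

```
package main

import (
	"fmt"
	"net"
)

func main() {
	// Attempt to resolve the NATS service name the same way WHOOSH would.
	addrs, err := net.LookupHost("backbeat-nats")
	if err != nil {
		// This is the failure mode observed before the fix.
		fmt.Println("DNS lookup failed:", err)
		return
	}
	fmt.Println("backbeat-nats resolves to:", addrs)
}
```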

## Solution Implementation

### 1. BACKBEAT Configuration Fix
- **Added explicit WHOOSH BACKBEAT environment variables** to docker-compose.yml (see the sketch after this list):
  - `WHOOSH_BACKBEAT_ENABLED: "false"` (temporarily disabled for stability)
  - `WHOOSH_BACKBEAT_CLUSTER_ID: "chorus-production"`
  - `WHOOSH_BACKBEAT_AGENT_ID: "whoosh"`
  - `WHOOSH_BACKBEAT_NATS_URL: "nats://backbeat-nats:4222"`
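
For illustration, a minimal sketch of how a service could read these variables at startup, falling back to the compose values when unset. The `backbeatConfig` struct and `getEnv` helper are hypothetical, not WHOOSH's actual configuration loader:

```
package config

import (
	"os"
	"strconv"
)

// backbeatConfig is a hypothetical holder for the BACKBEAT settings listed above.
type backbeatConfig struct {
	Enabled   bool
	ClusterID string
	AgentID   string
	NATSURL   string
}

// loadBackbeatConfig reads the WHOOSH_BACKBEAT_* variables, defaulting to the
// values set in docker-compose.yml when a variable is missing.
func loadBackbeatConfig() backbeatConfig {
	enabled, err := strconv.ParseBool(getEnv("WHOOSH_BACKBEAT_ENABLED", "false"))
	if err != nil {
		enabled = false
	}
	return backbeatConfig{
		Enabled:   enabled,
		ClusterID: getEnv("WHOOSH_BACKBEAT_CLUSTER_ID", "chorus-production"),
		AgentID:   getEnv("WHOOSH_BACKBEAT_AGENT_ID", "whoosh"),
		NATSURL:   getEnv("WHOOSH_BACKBEAT_NATS_URL", "nats://backbeat-nats:4222"),
	}
}

// getEnv returns the environment variable value, or def when it is unset or empty.
func getEnv(key, def string) string {
	if v := os.Getenv(key); v != "" {
		return v
	}
	return def
}
```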

### 2. Service Deployment Improvements
- **Removed rosewood node constraints** across all services (rosewood is a gaming PC with intermittent availability)
- **Simplified network configuration** by removing unused `whoosh-backend` network
- **Improved health check configuration** for postgres service
- **Streamlined service placement** for better distribution

### 3. Code Quality Improvements
- **Fixed code formatting** inconsistencies in HTTP server
- **Updated service comments** from "Bzzz" to "CHORUS" for clarity
- **Standardized import grouping** and spacing

## Results Achieved

### WHOOSH Service Operational
- **Service successfully running** on walnut node (1/2 replicas healthy)
- **Health checks passing** - API accessible on port 8800
- **Database connectivity restored** - migrations completed successfully
- **Council formation working** - teams being created and tasks assigned

### Core Functionality Verified
- **Agent discovery active** - CHORUS agents being detected and registered
- **Task processing operational** - autonomous team formation working
- **API endpoints responsive** - `/health` returning proper status
- **Service integration** - discovery of multiple CHORUS agent endpoints

## Technical Details

### Service Configuration
- **Environment**: Production Docker Swarm deployment
- **Database**: PostgreSQL with automatic migrations
- **Networking**: Internal chorus_net overlay network
- **Load Balancing**: Traefik routing with SSL certificates
- **Monitoring**: Prometheus metrics collection enabled (see the sketch below)
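
The health and metrics wiring described here typically looks like the sketch below. This is not WHOOSH's actual server code; only the `/health` response shape, the `:8080` listen address, and the externally visible port 8800 come from this report, the rest is assumed:

```
package main

import (
	"encoding/json"
	"net/http"

	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	mux := http.NewServeMux()

	// Health endpoint returning the response shape seen in the log evidence below.
	mux.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
		w.Header().Set("Content-Type", "application/json")
		json.NewEncoder(w).Encode(map[string]string{
			"service": "whoosh",
			"status":  "ok",
			"version": "0.1.0-mvp",
		})
	})

	// Metrics endpoint for Prometheus scraping.
	mux.Handle("/metrics", promhttp.Handler())

	// Listens on :8080 inside the container; presumably published as 8800 by the stack.
	http.ListenAndServe(":8080", mux)
}
```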

### Deployment Status
```
CHORUS_whoosh.2.nej8z6nbae1a@walnut    Running 31 seconds ago
- Health checks: Passing (200 OK responses)
- Database: Connected and migrated
- Agent Discovery: Active (multiple agents detected)
- Council Formation: Functional (teams being created)
```

### Key Log Evidence
```
{"service":"whoosh","status":"ok","version":"0.1.0-mvp"}
🚀 Task successfully assigned to team
🤖 Discovered CHORUS agent with metadata
Database migrations completed
🌐 Starting HTTP server on :8080
```

## Next Steps
- **BACKBEAT Integration**: Re-enable once NATS connectivity fully stabilized
- **Multi-Node Deployment**: Investigate ironwood node DNS resolution issues
- **Performance Monitoring**: Verify scaling behavior under load
- **Integration Testing**: Full project ingestion and council formation workflows

🎯 **Mission Accomplished**: WHOOSH is now operational and ready for autonomous development team orchestration testing.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-24 15:52:05 +10:00


package p2p

import (
	"context"
	"fmt"
	"time"

	"chorus/pkg/dht"

	"github.com/libp2p/go-libp2p"
	"github.com/libp2p/go-libp2p/core/host"
	"github.com/libp2p/go-libp2p/core/peer"
	"github.com/libp2p/go-libp2p/p2p/net/connmgr"
	"github.com/libp2p/go-libp2p/p2p/security/noise"
	"github.com/libp2p/go-libp2p/p2p/transport/tcp"
	kaddht "github.com/libp2p/go-libp2p-kad-dht"
	"github.com/multiformats/go-multiaddr"
)

// Node represents a Bzzz P2P node
type Node struct {
	host   host.Host
	ctx    context.Context
	cancel context.CancelFunc
	config *Config
	dht    *dht.LibP2PDHT // Optional DHT for distributed discovery
}
// NewNode creates a new P2P node with the given configuration
func NewNode(ctx context.Context, opts ...Option) (*Node, error) {
	config := DefaultConfig()
	for _, opt := range opts {
		opt(config)
	}

	nodeCtx, cancel := context.WithCancel(ctx)

	// Build multiaddresses for listening
	var listenAddrs []multiaddr.Multiaddr
	for _, addr := range config.ListenAddresses {
		ma, err := multiaddr.NewMultiaddr(addr)
		if err != nil {
			cancel()
			return nil, fmt.Errorf("invalid listen address %s: %w", addr, err)
		}
		listenAddrs = append(listenAddrs, ma)
	}

	// Create connection manager with scaling-optimized limits
	connManager, err := connmgr.NewConnManager(
		config.LowWatermark,  // Low watermark (32)
		config.HighWatermark, // High watermark (128)
		connmgr.WithGracePeriod(30*time.Second), // Grace period before pruning
	)
	if err != nil {
		cancel()
		return nil, fmt.Errorf("failed to create connection manager: %w", err)
	}

	// Create libp2p host with security, transport, and scaling options
	h, err := libp2p.New(
		libp2p.ListenAddrs(listenAddrs...),
		libp2p.Security(noise.ID, noise.New),
		libp2p.Transport(tcp.NewTCPTransport),
		libp2p.DefaultMuxers,
		libp2p.EnableRelay(),
		libp2p.ConnectionManager(connManager), // Add connection management
		libp2p.EnableAutoRelay(),              // Enable AutoRelay for container environments
	)
	if err != nil {
		cancel()
		return nil, fmt.Errorf("failed to create libp2p host: %w", err)
	}

	node := &Node{
		host:   h,
		ctx:    nodeCtx,
		cancel: cancel,
		config: config,
	}

	// Initialize DHT if enabled
	if config.EnableDHT {
		var dhtMode kaddht.ModeOpt
		switch config.DHTMode {
		case "client":
			dhtMode = kaddht.ModeClient
		case "server":
			dhtMode = kaddht.ModeServer
		default:
			dhtMode = kaddht.ModeAuto
		}

		dhtOpts := []dht.Option{
			dht.WithProtocolPrefix(config.DHTProtocolPrefix),
			dht.WithMode(dhtMode),
			dht.WithBootstrapPeersFromStrings(config.DHTBootstrapPeers),
			dht.WithAutoBootstrap(len(config.DHTBootstrapPeers) > 0),
		}

		var err error
		node.dht, err = dht.NewLibP2PDHT(nodeCtx, h, dhtOpts...)
		if err != nil {
			cancel()
			h.Close()
			return nil, fmt.Errorf("failed to create DHT: %w", err)
		}
	}

	// Start background processes
	go node.startBackgroundTasks()

	return node, nil
}
// Host returns the underlying libp2p host
func (n *Node) Host() host.Host {
	return n.host
}

// ID returns the peer ID of this node
func (n *Node) ID() peer.ID {
	return n.host.ID()
}

// Addresses returns the multiaddresses this node is listening on
func (n *Node) Addresses() []multiaddr.Multiaddr {
	return n.host.Addrs()
}

// Connect connects to a peer at the given multiaddress
func (n *Node) Connect(ctx context.Context, addr string) error {
	ma, err := multiaddr.NewMultiaddr(addr)
	if err != nil {
		return fmt.Errorf("invalid multiaddress %s: %w", addr, err)
	}

	addrInfo, err := peer.AddrInfoFromP2pAddr(ma)
	if err != nil {
		return fmt.Errorf("failed to parse addr info: %w", err)
	}

	return n.host.Connect(ctx, *addrInfo)
}

// Peers returns the list of connected peers
func (n *Node) Peers() []peer.ID {
	return n.host.Network().Peers()
}

// ConnectedPeers returns the number of connected peers
func (n *Node) ConnectedPeers() int {
	return len(n.Peers())
}
// startBackgroundTasks starts background maintenance tasks
func (n *Node) startBackgroundTasks() {
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()

	for {
		select {
		case <-n.ctx.Done():
			return
		case <-ticker.C:
			// Periodic maintenance tasks
			n.logConnectionStatus()
		}
	}
}

// logConnectionStatus logs the current connection status
func (n *Node) logConnectionStatus() {
	peers := n.Peers()
	fmt.Printf("🐝 Bzzz Node Status - ID: %s, Connected Peers: %d\n",
		n.ID().ShortString(), len(peers))

	if len(peers) > 0 {
		fmt.Printf(" Connected to: ")
		for i, p := range peers {
			if i > 0 {
				fmt.Printf(", ")
			}
			fmt.Printf("%s", p.ShortString())
		}
		fmt.Println()
	}
}
// DHT returns the DHT instance (if enabled)
func (n *Node) DHT() *dht.LibP2PDHT {
	return n.dht
}

// IsDHTEnabled returns whether DHT is enabled and active
func (n *Node) IsDHTEnabled() bool {
	return n.dht != nil
}

// Bootstrap bootstraps the DHT (if enabled)
func (n *Node) Bootstrap() error {
	if n.dht != nil {
		return n.dht.Bootstrap()
	}
	return fmt.Errorf("DHT not enabled")
}

// Close shuts down the node
func (n *Node) Close() error {
	if n.dht != nil {
		n.dht.Close()
	}

	n.cancel()
	return n.host.Close()
}
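
For reference, a hedged usage sketch of the node API above. The import path, listen defaults, and the peer multiaddress are assumptions; only the `NewNode`, `ID`, `Addresses`, `Connect`, `ConnectedPeers`, and `Close` calls are taken from this file:

```
package main

import (
	"context"
	"fmt"
	"log"

	"chorus/p2p"
)

func main() {
	ctx := context.Background()

	// Create a node with package defaults (listen addresses, watermarks, and
	// DHT settings all come from DefaultConfig).
	node, err := p2p.NewNode(ctx)
	if err != nil {
		log.Fatalf("failed to start node: %v", err)
	}
	defer node.Close()

	fmt.Println("peer ID:", node.ID())
	for _, addr := range node.Addresses() {
		fmt.Println("listening on:", addr)
	}

	// Hypothetical peer multiaddress; a real deployment would use a known
	// CHORUS peer's address here.
	peerAddr := "/ip4/10.0.0.2/tcp/4001/p2p/12D3KooWExamplePeerID"
	if err := node.Connect(ctx, peerAddr); err != nil {
		log.Printf("connect failed: %v", err)
	}

	fmt.Println("connected peers:", node.ConnectedPeers())
}
```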