chore: align slurp config and scaffolding

This commit is contained in:
anthonyrawlins
2025-09-27 21:03:12 +10:00
parent acc4361463
commit 4a77862289
47 changed files with 5133 additions and 4274 deletions

View File

@@ -9,74 +9,74 @@ import (
"sync"
"time"
"chorus/pkg/dht"
"chorus/pkg/config"
"chorus/pkg/dht"
"github.com/libp2p/go-libp2p/core/peer"
)
// NetworkManagerImpl implements NetworkManager interface for network topology and partition management
type NetworkManagerImpl struct {
mu sync.RWMutex
dht *dht.DHT
config *config.Config
topology *NetworkTopology
partitionInfo *PartitionInfo
connectivity *ConnectivityMatrix
stats *NetworkStatistics
healthChecker *NetworkHealthChecker
partitionDetector *PartitionDetector
recoveryManager *RecoveryManager
mu sync.RWMutex
dht *dht.DHT
config *config.Config
topology *NetworkTopology
partitionInfo *PartitionInfo
connectivity *ConnectivityMatrix
stats *NetworkStatistics
healthChecker *NetworkHealthChecker
partitionDetector *PartitionDetector
recoveryManager *RecoveryManager
// Configuration
healthCheckInterval time.Duration
healthCheckInterval time.Duration
partitionCheckInterval time.Duration
connectivityTimeout time.Duration
maxPartitionDuration time.Duration
connectivityTimeout time.Duration
maxPartitionDuration time.Duration
// State
lastTopologyUpdate time.Time
lastPartitionCheck time.Time
running bool
recoveryInProgress bool
lastTopologyUpdate time.Time
lastPartitionCheck time.Time
running bool
recoveryInProgress bool
}
// ConnectivityMatrix tracks connectivity between all nodes
type ConnectivityMatrix struct {
// Matrix is a two-level map. updateConnectivityMatrix writes entries as
// Matrix[local agent ID][peer ID], each holding the latest ConnectionInfo
// measured for that peer.
Matrix map[string]map[string]*ConnectionInfo `json:"matrix"`
// LastUpdated is set to time.Now() at the end of each matrix refresh.
// NOTE(review): the field is listed twice because this view shows the line
// before and after the commit; presumably only alignment changed - confirm
// against the real file.
LastUpdated time.Time `json:"last_updated"`
LastUpdated time.Time `json:"last_updated"`
// mu guards Matrix and LastUpdated: the refresh path takes Lock and the
// statistics path takes RLock while iterating Matrix. Being unexported, it
// is not part of the JSON encoding.
mu sync.RWMutex
}
// ConnectionInfo represents connectivity information between two nodes
type ConnectionInfo struct {
Connected bool `json:"connected"`
Latency time.Duration `json:"latency"`
PacketLoss float64 `json:"packet_loss"`
Bandwidth int64 `json:"bandwidth"`
LastChecked time.Time `json:"last_checked"`
ErrorCount int `json:"error_count"`
LastError string `json:"last_error,omitempty"`
Connected bool `json:"connected"`
Latency time.Duration `json:"latency"`
PacketLoss float64 `json:"packet_loss"`
Bandwidth int64 `json:"bandwidth"`
LastChecked time.Time `json:"last_checked"`
ErrorCount int `json:"error_count"`
LastError string `json:"last_error,omitempty"`
}
// NetworkHealthChecker performs network health checks
type NetworkHealthChecker struct {
// mu is read-locked by detectPartitionByHeartbeat while it scans nodeHealth.
// NOTE(review): the visible part of performHealthChecks writes nodeHealth and
// healthHistory without taking mu - confirm whether the lock is acquired in
// lines not shown here.
mu sync.RWMutex
// nodeHealth holds the most recent NodeHealth per node, keyed by the peer's
// string ID.
nodeHealth map[string]*NodeHealth
// healthHistory stores per-node check results, keyed by the peer's string ID.
// performHealthChecks appends each new result and drops the oldest entry once
// a node has more than 100.
// NOTE(review): the two lines below are the before/after forms of the same
// field; the commit renames the element type to NetworkHealthCheckResult.
healthHistory map[string][]*HealthCheckResult
healthHistory map[string][]*NetworkHealthCheckResult
// alertThresholds carries the warning/critical limits for latency, packet
// loss and health score; initializeComponents fills in the defaults.
alertThresholds *NetworkAlertThresholds
}
// NodeHealth represents health status of a network node
type NodeHealth struct {
NodeID string `json:"node_id"`
Status NodeStatus `json:"status"`
HealthScore float64 `json:"health_score"`
LastSeen time.Time `json:"last_seen"`
ResponseTime time.Duration `json:"response_time"`
PacketLossRate float64 `json:"packet_loss_rate"`
BandwidthUtil float64 `json:"bandwidth_utilization"`
Uptime time.Duration `json:"uptime"`
ErrorRate float64 `json:"error_rate"`
NodeID string `json:"node_id"`
Status NodeStatus `json:"status"`
HealthScore float64 `json:"health_score"`
LastSeen time.Time `json:"last_seen"`
ResponseTime time.Duration `json:"response_time"`
PacketLossRate float64 `json:"packet_loss_rate"`
BandwidthUtil float64 `json:"bandwidth_utilization"`
Uptime time.Duration `json:"uptime"`
ErrorRate float64 `json:"error_rate"`
}
// NodeStatus represents the status of a network node
@@ -91,23 +91,23 @@ const (
)
// NetworkHealthCheckResult represents the result of a single network health check
type HealthCheckResult struct {
NodeID string `json:"node_id"`
Timestamp time.Time `json:"timestamp"`
Success bool `json:"success"`
ResponseTime time.Duration `json:"response_time"`
ErrorMessage string `json:"error_message,omitempty"`
type NetworkHealthCheckResult struct {
NodeID string `json:"node_id"`
Timestamp time.Time `json:"timestamp"`
Success bool `json:"success"`
ResponseTime time.Duration `json:"response_time"`
ErrorMessage string `json:"error_message,omitempty"`
NetworkMetrics *NetworkMetrics `json:"network_metrics"`
}
// NetworkAlertThresholds defines thresholds for network alerts
type NetworkAlertThresholds struct {
LatencyWarning time.Duration `json:"latency_warning"`
LatencyCritical time.Duration `json:"latency_critical"`
PacketLossWarning float64 `json:"packet_loss_warning"`
PacketLossCritical float64 `json:"packet_loss_critical"`
HealthScoreWarning float64 `json:"health_score_warning"`
HealthScoreCritical float64 `json:"health_score_critical"`
LatencyWarning time.Duration `json:"latency_warning"`
LatencyCritical time.Duration `json:"latency_critical"`
PacketLossWarning float64 `json:"packet_loss_warning"`
PacketLossCritical float64 `json:"packet_loss_critical"`
HealthScoreWarning float64 `json:"health_score_warning"`
HealthScoreCritical float64 `json:"health_score_critical"`
}
// PartitionDetector detects network partitions
@@ -131,14 +131,14 @@ const (
// PartitionEvent represents a partition detection event
type PartitionEvent struct {
EventID string `json:"event_id"`
DetectedAt time.Time `json:"detected_at"`
EventID string `json:"event_id"`
DetectedAt time.Time `json:"detected_at"`
Algorithm PartitionDetectionAlgorithm `json:"algorithm"`
PartitionedNodes []string `json:"partitioned_nodes"`
Confidence float64 `json:"confidence"`
Duration time.Duration `json:"duration"`
Resolved bool `json:"resolved"`
ResolvedAt *time.Time `json:"resolved_at,omitempty"`
PartitionedNodes []string `json:"partitioned_nodes"`
Confidence float64 `json:"confidence"`
Duration time.Duration `json:"duration"`
Resolved bool `json:"resolved"`
ResolvedAt *time.Time `json:"resolved_at,omitempty"`
}
// FalsePositiveFilter helps reduce false partition detections
@@ -159,10 +159,10 @@ type PartitionDetectorConfig struct {
// RecoveryManager manages network partition recovery
type RecoveryManager struct {
mu sync.RWMutex
mu sync.RWMutex
recoveryStrategies map[RecoveryStrategy]*RecoveryStrategyConfig
activeRecoveries map[string]*RecoveryOperation
recoveryHistory []*RecoveryResult
activeRecoveries map[string]*RecoveryOperation
recoveryHistory []*RecoveryResult
}
// RecoveryStrategy represents different recovery strategies
@@ -177,25 +177,25 @@ const (
// RecoveryStrategyConfig configures a recovery strategy
type RecoveryStrategyConfig struct {
Strategy RecoveryStrategy `json:"strategy"`
Timeout time.Duration `json:"timeout"`
RetryAttempts int `json:"retry_attempts"`
RetryInterval time.Duration `json:"retry_interval"`
RequireConsensus bool `json:"require_consensus"`
ForcedThreshold time.Duration `json:"forced_threshold"`
Strategy RecoveryStrategy `json:"strategy"`
Timeout time.Duration `json:"timeout"`
RetryAttempts int `json:"retry_attempts"`
RetryInterval time.Duration `json:"retry_interval"`
RequireConsensus bool `json:"require_consensus"`
ForcedThreshold time.Duration `json:"forced_threshold"`
}
// RecoveryOperation represents an active recovery operation
type RecoveryOperation struct {
OperationID string `json:"operation_id"`
Strategy RecoveryStrategy `json:"strategy"`
StartedAt time.Time `json:"started_at"`
TargetNodes []string `json:"target_nodes"`
Status RecoveryStatus `json:"status"`
Progress float64 `json:"progress"`
CurrentPhase RecoveryPhase `json:"current_phase"`
Errors []string `json:"errors"`
LastUpdate time.Time `json:"last_update"`
OperationID string `json:"operation_id"`
Strategy RecoveryStrategy `json:"strategy"`
StartedAt time.Time `json:"started_at"`
TargetNodes []string `json:"target_nodes"`
Status RecoveryStatus `json:"status"`
Progress float64 `json:"progress"`
CurrentPhase RecoveryPhase `json:"current_phase"`
Errors []string `json:"errors"`
LastUpdate time.Time `json:"last_update"`
}
// RecoveryStatus represents the status of a recovery operation
@@ -213,12 +213,12 @@ const (
type RecoveryPhase string
const (
RecoveryPhaseAssessment RecoveryPhase = "assessment"
RecoveryPhasePreparation RecoveryPhase = "preparation"
RecoveryPhaseReconnection RecoveryPhase = "reconnection"
RecoveryPhaseAssessment RecoveryPhase = "assessment"
RecoveryPhasePreparation RecoveryPhase = "preparation"
RecoveryPhaseReconnection RecoveryPhase = "reconnection"
RecoveryPhaseSynchronization RecoveryPhase = "synchronization"
RecoveryPhaseValidation RecoveryPhase = "validation"
RecoveryPhaseCompletion RecoveryPhase = "completion"
RecoveryPhaseValidation RecoveryPhase = "validation"
RecoveryPhaseCompletion RecoveryPhase = "completion"
)
// NewNetworkManagerImpl creates a new network manager implementation
@@ -231,13 +231,13 @@ func NewNetworkManagerImpl(dht *dht.DHT, config *config.Config) (*NetworkManager
}
nm := &NetworkManagerImpl{
dht: dht,
config: config,
healthCheckInterval: 30 * time.Second,
partitionCheckInterval: 60 * time.Second,
connectivityTimeout: 10 * time.Second,
maxPartitionDuration: 10 * time.Minute,
connectivity: &ConnectivityMatrix{Matrix: make(map[string]map[string]*ConnectionInfo)},
dht: dht,
config: config,
healthCheckInterval: 30 * time.Second,
partitionCheckInterval: 60 * time.Second,
connectivityTimeout: 10 * time.Second,
maxPartitionDuration: 10 * time.Minute,
connectivity: &ConnectivityMatrix{Matrix: make(map[string]map[string]*ConnectionInfo)},
stats: &NetworkStatistics{
LastUpdated: time.Now(),
},
@@ -255,33 +255,33 @@ func NewNetworkManagerImpl(dht *dht.DHT, config *config.Config) (*NetworkManager
func (nm *NetworkManagerImpl) initializeComponents() error {
// Initialize topology
nm.topology = &NetworkTopology{
TotalNodes: 0,
Connections: make(map[string][]string),
Regions: make(map[string][]string),
TotalNodes: 0,
Connections: make(map[string][]string),
Regions: make(map[string][]string),
AvailabilityZones: make(map[string][]string),
UpdatedAt: time.Now(),
UpdatedAt: time.Now(),
}
// Initialize partition info
nm.partitionInfo = &PartitionInfo{
PartitionDetected: false,
PartitionCount: 1,
IsolatedNodes: []string{},
PartitionDetected: false,
PartitionCount: 1,
IsolatedNodes: []string{},
ConnectivityMatrix: make(map[string]map[string]bool),
DetectedAt: time.Now(),
DetectedAt: time.Now(),
}
// Initialize health checker
nm.healthChecker = &NetworkHealthChecker{
nodeHealth: make(map[string]*NodeHealth),
healthHistory: make(map[string][]*HealthCheckResult),
healthHistory: make(map[string][]*NetworkHealthCheckResult),
alertThresholds: &NetworkAlertThresholds{
LatencyWarning: 500 * time.Millisecond,
LatencyCritical: 2 * time.Second,
PacketLossWarning: 0.05, // 5%
PacketLossCritical: 0.15, // 15%
HealthScoreWarning: 0.7,
HealthScoreCritical: 0.4,
LatencyWarning: 500 * time.Millisecond,
LatencyCritical: 2 * time.Second,
PacketLossWarning: 0.05, // 5%
PacketLossCritical: 0.15, // 15%
HealthScoreWarning: 0.7,
HealthScoreCritical: 0.4,
},
}
@@ -307,20 +307,20 @@ func (nm *NetworkManagerImpl) initializeComponents() error {
nm.recoveryManager = &RecoveryManager{
recoveryStrategies: map[RecoveryStrategy]*RecoveryStrategyConfig{
RecoveryStrategyAutomatic: {
Strategy: RecoveryStrategyAutomatic,
Timeout: 5 * time.Minute,
RetryAttempts: 3,
RetryInterval: 30 * time.Second,
Strategy: RecoveryStrategyAutomatic,
Timeout: 5 * time.Minute,
RetryAttempts: 3,
RetryInterval: 30 * time.Second,
RequireConsensus: false,
ForcedThreshold: 10 * time.Minute,
ForcedThreshold: 10 * time.Minute,
},
RecoveryStrategyGraceful: {
Strategy: RecoveryStrategyGraceful,
Timeout: 10 * time.Minute,
RetryAttempts: 5,
RetryInterval: 60 * time.Second,
Strategy: RecoveryStrategyGraceful,
Timeout: 10 * time.Minute,
RetryAttempts: 5,
RetryInterval: 60 * time.Second,
RequireConsensus: true,
ForcedThreshold: 20 * time.Minute,
ForcedThreshold: 20 * time.Minute,
},
},
activeRecoveries: make(map[string]*RecoveryOperation),
@@ -628,10 +628,10 @@ func (nm *NetworkManagerImpl) connectivityChecker(ctx context.Context) {
func (nm *NetworkManagerImpl) updateTopology() {
peers := nm.dht.GetConnectedPeers()
nm.topology.TotalNodes = len(peers) + 1 // +1 for current node
nm.topology.Connections = make(map[string][]string)
// Build connection map
currentNodeID := nm.config.Agent.ID
peerConnections := make([]string, len(peers))
@@ -639,21 +639,21 @@ func (nm *NetworkManagerImpl) updateTopology() {
peerConnections[i] = peer.String()
}
nm.topology.Connections[currentNodeID] = peerConnections
// Calculate network metrics
nm.topology.ClusterDiameter = nm.calculateClusterDiameter()
nm.topology.ClusteringCoefficient = nm.calculateClusteringCoefficient()
nm.topology.UpdatedAt = time.Now()
nm.lastTopologyUpdate = time.Now()
}
func (nm *NetworkManagerImpl) performHealthChecks(ctx context.Context) {
peers := nm.dht.GetConnectedPeers()
for _, peer := range peers {
result := nm.performHealthCheck(ctx, peer.String())
// Update node health
nodeHealth := &NodeHealth{
NodeID: peer.String(),
@@ -664,7 +664,7 @@ func (nm *NetworkManagerImpl) performHealthChecks(ctx context.Context) {
PacketLossRate: 0.0, // Would be measured in real implementation
ErrorRate: 0.0, // Would be calculated from history
}
if result.Success {
nodeHealth.Status = NodeStatusHealthy
nodeHealth.HealthScore = 1.0
@@ -672,21 +672,21 @@ func (nm *NetworkManagerImpl) performHealthChecks(ctx context.Context) {
nodeHealth.Status = NodeStatusUnreachable
nodeHealth.HealthScore = 0.0
}
nm.healthChecker.nodeHealth[peer.String()] = nodeHealth
// Store health check history
if _, exists := nm.healthChecker.healthHistory[peer.String()]; !exists {
nm.healthChecker.healthHistory[peer.String()] = []*HealthCheckResult{}
nm.healthChecker.healthHistory[peer.String()] = []*NetworkHealthCheckResult{}
}
nm.healthChecker.healthHistory[peer.String()] = append(
nm.healthChecker.healthHistory[peer.String()],
nm.healthChecker.healthHistory[peer.String()],
result,
)
// Keep only recent history (last 100 checks)
if len(nm.healthChecker.healthHistory[peer.String()]) > 100 {
nm.healthChecker.healthHistory[peer.String()] =
nm.healthChecker.healthHistory[peer.String()] =
nm.healthChecker.healthHistory[peer.String()][1:]
}
}
@@ -694,31 +694,31 @@ func (nm *NetworkManagerImpl) performHealthChecks(ctx context.Context) {
// updateConnectivityMatrix refreshes the local node's row of the connectivity
// matrix. The connected-peer list is read from the DHT before the matrix lock
// is taken; the lock is then held for the whole refresh, including every
// testConnection call. Each currently connected peer gets a fresh
// ConnectionInfo stored under Matrix[local agent ID][peer ID], and LastUpdated
// is stamped when the pass completes. Entries for peers that are no longer
// connected are left as they were.
func (nm *NetworkManagerImpl) updateConnectivityMatrix(ctx context.Context) {
	connected := nm.dht.GetConnectedPeers()

	cm := nm.connectivity
	cm.mu.Lock()
	defer cm.mu.Unlock()

	// Lazily create the outer map.
	if cm.Matrix == nil {
		cm.Matrix = make(map[string]map[string]*ConnectionInfo)
	}

	// Lazily create the row that belongs to this node.
	selfID := nm.config.Agent.ID
	row := cm.Matrix[selfID]
	if row == nil {
		row = make(map[string]*ConnectionInfo)
		cm.Matrix[selfID] = row
	}

	// Probe every connected peer and record the outcome in our row.
	for _, p := range connected {
		id := p.String()
		row[id] = nm.testConnection(ctx, id)
	}

	cm.LastUpdated = time.Now()
}
@@ -741,7 +741,7 @@ func (nm *NetworkManagerImpl) detectPartitionByConnectivity() (bool, []string, f
// Simplified connectivity-based detection
peers := nm.dht.GetConnectedPeers()
knownPeers := nm.dht.GetKnownPeers()
// If we know more peers than we're connected to, might be partitioned
if len(knownPeers) > len(peers)+2 { // Allow some tolerance
isolatedNodes := []string{}
@@ -759,7 +759,7 @@ func (nm *NetworkManagerImpl) detectPartitionByConnectivity() (bool, []string, f
}
return true, isolatedNodes, 0.8
}
return false, []string{}, 0.0
}
@@ -767,18 +767,18 @@ func (nm *NetworkManagerImpl) detectPartitionByHeartbeat() (bool, []string, floa
// Simplified heartbeat-based detection
nm.healthChecker.mu.RLock()
defer nm.healthChecker.mu.RUnlock()
isolatedNodes := []string{}
for nodeID, health := range nm.healthChecker.nodeHealth {
if health.Status == NodeStatusUnreachable {
isolatedNodes = append(isolatedNodes, nodeID)
}
}
if len(isolatedNodes) > 0 {
return true, isolatedNodes, 0.7
}
return false, []string{}, 0.0
}
@@ -791,7 +791,7 @@ func (nm *NetworkManagerImpl) detectPartitionHybrid() (bool, []string, float64)
// Combine multiple detection methods
partitioned1, nodes1, conf1 := nm.detectPartitionByConnectivity()
partitioned2, nodes2, conf2 := nm.detectPartitionByHeartbeat()
if partitioned1 && partitioned2 {
// Both methods agree
combinedNodes := nm.combineNodeLists(nodes1, nodes2)
@@ -805,7 +805,7 @@ func (nm *NetworkManagerImpl) detectPartitionHybrid() (bool, []string, float64)
return true, nodes2, conf2 * 0.7
}
}
return false, []string{}, 0.0
}
@@ -878,11 +878,11 @@ func (nm *NetworkManagerImpl) completeRecovery(ctx context.Context, operation *R
func (nm *NetworkManagerImpl) testPeerConnectivity(ctx context.Context, peerID string) *ConnectivityResult {
start := time.Now()
// In a real implementation, this would test actual network connectivity
// For now, we'll simulate based on DHT connectivity
peers := nm.dht.GetConnectedPeers()
for _, peer := range peers {
if peer.String() == peerID {
return &ConnectivityResult{
@@ -895,7 +895,7 @@ func (nm *NetworkManagerImpl) testPeerConnectivity(ctx context.Context, peerID s
}
}
}
return &ConnectivityResult{
PeerID: peerID,
Reachable: false,
@@ -907,13 +907,13 @@ func (nm *NetworkManagerImpl) testPeerConnectivity(ctx context.Context, peerID s
}
}
func (nm *NetworkManagerImpl) performHealthCheck(ctx context.Context, nodeID string) *HealthCheckResult {
func (nm *NetworkManagerImpl) performHealthCheck(ctx context.Context, nodeID string) *NetworkHealthCheckResult {
start := time.Now()
// In a real implementation, this would perform actual health checks
// For now, simulate based on connectivity
peers := nm.dht.GetConnectedPeers()
for _, peer := range peers {
if peer.String() == nodeID {
return &HealthCheckResult{
@@ -924,7 +924,7 @@ func (nm *NetworkManagerImpl) performHealthCheck(ctx context.Context, nodeID str
}
}
}
return &HealthCheckResult{
NodeID: nodeID,
Timestamp: time.Now(),
@@ -938,7 +938,7 @@ func (nm *NetworkManagerImpl) testConnection(ctx context.Context, peerID string)
// Test connection to specific peer
connected := false
latency := time.Duration(0)
// Check if peer is in connected peers list
peers := nm.dht.GetConnectedPeers()
for _, peer := range peers {
@@ -948,28 +948,28 @@ func (nm *NetworkManagerImpl) testConnection(ctx context.Context, peerID string)
break
}
}
return &ConnectionInfo{
Connected: connected,
Latency: latency,
PacketLoss: 0.0,
Bandwidth: 1000000, // 1 Mbps placeholder
LastChecked: time.Now(),
ErrorCount: 0,
Connected: connected,
Latency: latency,
PacketLoss: 0.0,
Bandwidth: 1000000, // 1 Mbps placeholder
LastChecked: time.Now(),
ErrorCount: 0,
}
}
func (nm *NetworkManagerImpl) updateNetworkStatistics() {
peers := nm.dht.GetConnectedPeers()
nm.stats.TotalNodes = len(peers) + 1
nm.stats.ConnectedNodes = len(peers)
nm.stats.DisconnectedNodes = nm.stats.TotalNodes - nm.stats.ConnectedNodes
// Calculate average latency from connectivity matrix
totalLatency := time.Duration(0)
connectionCount := 0
nm.connectivity.mu.RLock()
for _, connections := range nm.connectivity.Matrix {
for _, conn := range connections {
@@ -980,11 +980,11 @@ func (nm *NetworkManagerImpl) updateNetworkStatistics() {
}
}
nm.connectivity.mu.RUnlock()
if connectionCount > 0 {
nm.stats.AverageLatency = totalLatency / time.Duration(connectionCount)
}
nm.stats.OverallHealth = nm.calculateOverallNetworkHealth()
nm.stats.LastUpdated = time.Now()
}
@@ -1024,14 +1024,14 @@ func (nm *NetworkManagerImpl) calculateOverallNetworkHealth() float64 {
return float64(nm.stats.ConnectedNodes) / float64(nm.stats.TotalNodes)
}
func (nm *NetworkManagerImpl) determineNodeStatus(result *HealthCheckResult) NodeStatus {
func (nm *NetworkManagerImpl) determineNodeStatus(result *NetworkHealthCheckResult) NodeStatus {
if result.Success {
return NodeStatusHealthy
}
return NodeStatusUnreachable
}
func (nm *NetworkManagerImpl) calculateHealthScore(result *HealthCheckResult) float64 {
func (nm *NetworkManagerImpl) calculateHealthScore(result *NetworkHealthCheckResult) float64 {
if result.Success {
return 1.0
}
@@ -1040,19 +1040,19 @@ func (nm *NetworkManagerImpl) calculateHealthScore(result *HealthCheckResult) fl
// combineNodeLists returns the union of list1 and list2 with duplicates
// removed, sorted in ascending lexical order. The result is always a non-nil
// slice, even when both inputs are empty or nil.
func (nm *NetworkManagerImpl) combineNodeLists(list1, list2 []string) []string {
	seen := make(map[string]struct{})
	for _, list := range [][]string{list1, list2} {
		for _, id := range list {
			seen[id] = struct{}{}
		}
	}

	merged := make([]string, 0, len(seen))
	for id := range seen {
		merged = append(merged, id)
	}

	// Map iteration order is random, so sort for a deterministic result.
	sort.Strings(merged)
	return merged
}
@@ -1073,4 +1073,4 @@ func (nm *NetworkManagerImpl) generateEventID() string {
// generateOperationID builds an identifier for a recovery operation: the
// literal prefix "op-" followed by the current Unix time in nanoseconds.
func (nm *NetworkManagerImpl) generateOperationID() string {
	nanos := time.Now().UnixNano()
	return fmt.Sprintf("op-%d", nanos)
}
}