// Package distribution provides network management for distributed context operations
package distribution

import (
	"context"
	"fmt"
	"sort"
	"sync"
	"time"

	"chorus.services/bzzz/pkg/config"
	"chorus.services/bzzz/pkg/dht"
	"github.com/libp2p/go-libp2p/core/peer"
)
// NetworkManagerImpl implements the NetworkManager interface for network
// topology and partition management.
type NetworkManagerImpl struct {
	mu                sync.RWMutex
	dht               *dht.DHT
	config            *config.Config
	topology          *NetworkTopology
	partitionInfo     *PartitionInfo
	connectivity      *ConnectivityMatrix
	stats             *NetworkStatistics
	healthChecker     *NetworkHealthChecker
	partitionDetector *PartitionDetector
	recoveryManager   *RecoveryManager

	// Configuration
	healthCheckInterval    time.Duration
	partitionCheckInterval time.Duration
	connectivityTimeout    time.Duration
	maxPartitionDuration   time.Duration

	// State
	lastTopologyUpdate time.Time
	lastPartitionCheck time.Time
	running            bool
	recoveryInProgress bool
}

// ConnectivityMatrix tracks connectivity between all nodes.
type ConnectivityMatrix struct {
	Matrix      map[string]map[string]*ConnectionInfo `json:"matrix"`
	LastUpdated time.Time                             `json:"last_updated"`
	mu          sync.RWMutex
}

// ConnectionInfo represents connectivity information between two nodes.
type ConnectionInfo struct {
	Connected   bool          `json:"connected"`
	Latency     time.Duration `json:"latency"`
	PacketLoss  float64       `json:"packet_loss"`
	Bandwidth   int64         `json:"bandwidth"`
	LastChecked time.Time     `json:"last_checked"`
	ErrorCount  int           `json:"error_count"`
	LastError   string        `json:"last_error,omitempty"`
}

// NetworkHealthChecker performs network health checks.
type NetworkHealthChecker struct {
	mu              sync.RWMutex
	nodeHealth      map[string]*NodeHealth
	healthHistory   map[string][]*HealthCheckResult
	alertThresholds *NetworkAlertThresholds
}

// NodeHealth represents the health status of a network node.
type NodeHealth struct {
	NodeID         string        `json:"node_id"`
	Status         NodeStatus    `json:"status"`
	HealthScore    float64       `json:"health_score"`
	LastSeen       time.Time     `json:"last_seen"`
	ResponseTime   time.Duration `json:"response_time"`
	PacketLossRate float64       `json:"packet_loss_rate"`
	BandwidthUtil  float64       `json:"bandwidth_utilization"`
	Uptime         time.Duration `json:"uptime"`
	ErrorRate      float64       `json:"error_rate"`
}

// NodeStatus represents the status of a network node.
type NodeStatus string

const (
	NodeStatusHealthy     NodeStatus = "healthy"
	NodeStatusDegraded    NodeStatus = "degraded"
	NodeStatusUnreachable NodeStatus = "unreachable"
	NodeStatusFailed      NodeStatus = "failed"
	NodeStatusRecovering  NodeStatus = "recovering"
)

// HealthCheckResult represents the result of a single health check.
type HealthCheckResult struct {
	NodeID         string          `json:"node_id"`
	Timestamp      time.Time       `json:"timestamp"`
	Success        bool            `json:"success"`
	ResponseTime   time.Duration   `json:"response_time"`
	ErrorMessage   string          `json:"error_message,omitempty"`
	NetworkMetrics *NetworkMetrics `json:"network_metrics"`
}

// NetworkAlertThresholds defines thresholds for network alerts.
type NetworkAlertThresholds struct {
	LatencyWarning      time.Duration `json:"latency_warning"`
	LatencyCritical     time.Duration `json:"latency_critical"`
	PacketLossWarning   float64       `json:"packet_loss_warning"`
	PacketLossCritical  float64       `json:"packet_loss_critical"`
	HealthScoreWarning  float64       `json:"health_score_warning"`
	HealthScoreCritical float64       `json:"health_score_critical"`
}

// PartitionDetector detects network partitions.
type PartitionDetector struct {
	mu                  sync.RWMutex
	detectionAlgorithm  PartitionDetectionAlgorithm
	partitionHistory    []*PartitionEvent
	falsePositiveFilter *FalsePositiveFilter
	config              *PartitionDetectorConfig
}

// PartitionDetectionAlgorithm identifies a partition detection algorithm.
type PartitionDetectionAlgorithm string

const (
	AlgorithmGossipBased     PartitionDetectionAlgorithm = "gossip_based"
	AlgorithmConnectivityMap PartitionDetectionAlgorithm = "connectivity_map"
	AlgorithmHeartbeat       PartitionDetectionAlgorithm = "heartbeat"
	AlgorithmHybrid          PartitionDetectionAlgorithm = "hybrid"
)

// PartitionEvent represents a partition detection event.
type PartitionEvent struct {
	EventID          string                      `json:"event_id"`
	DetectedAt       time.Time                   `json:"detected_at"`
	Algorithm        PartitionDetectionAlgorithm `json:"algorithm"`
	PartitionedNodes []string                    `json:"partitioned_nodes"`
	Confidence       float64                     `json:"confidence"`
	Duration         time.Duration               `json:"duration"`
	Resolved         bool                        `json:"resolved"`
	ResolvedAt       *time.Time                  `json:"resolved_at,omitempty"`
}

// FalsePositiveFilter helps reduce false partition detections.
type FalsePositiveFilter struct {
	consecutiveChecks int
	confirmationTime  time.Duration
	suspectNodes      map[string]time.Time
}

// PartitionDetectorConfig configures partition detection behavior.
type PartitionDetectorConfig struct {
	CheckInterval        time.Duration `json:"check_interval"`
	ConfidenceThreshold  float64       `json:"confidence_threshold"`
	MinPartitionSize     int           `json:"min_partition_size"`
	MaxPartitionDuration time.Duration `json:"max_partition_duration"`
	FalsePositiveTimeout time.Duration `json:"false_positive_timeout"`
}

// RecoveryManager manages network partition recovery.
type RecoveryManager struct {
	mu                 sync.RWMutex
	recoveryStrategies map[RecoveryStrategy]*RecoveryStrategyConfig
	activeRecoveries   map[string]*RecoveryOperation
	recoveryHistory    []*RecoveryResult
}

// RecoveryStrategy identifies a recovery strategy.
type RecoveryStrategy string

const (
	RecoveryStrategyAutomatic RecoveryStrategy = "automatic"
	RecoveryStrategyManual    RecoveryStrategy = "manual"
	RecoveryStrategyGraceful  RecoveryStrategy = "graceful"
	RecoveryStrategyForced    RecoveryStrategy = "forced"
)

// RecoveryStrategyConfig configures a recovery strategy.
type RecoveryStrategyConfig struct {
	Strategy         RecoveryStrategy `json:"strategy"`
	Timeout          time.Duration    `json:"timeout"`
	RetryAttempts    int              `json:"retry_attempts"`
	RetryInterval    time.Duration    `json:"retry_interval"`
	RequireConsensus bool             `json:"require_consensus"`
	ForcedThreshold  time.Duration    `json:"forced_threshold"`
}

// RecoveryOperation represents an active recovery operation.
type RecoveryOperation struct {
	OperationID  string           `json:"operation_id"`
	Strategy     RecoveryStrategy `json:"strategy"`
	StartedAt    time.Time        `json:"started_at"`
	TargetNodes  []string         `json:"target_nodes"`
	Status       RecoveryStatus   `json:"status"`
	Progress     float64          `json:"progress"`
	CurrentPhase RecoveryPhase    `json:"current_phase"`
	Errors       []string         `json:"errors"`
	LastUpdate   time.Time        `json:"last_update"`
}

// RecoveryStatus represents the status of a recovery operation.
type RecoveryStatus string

const (
	RecoveryStatusInitiated  RecoveryStatus = "initiated"
	RecoveryStatusInProgress RecoveryStatus = "in_progress"
	RecoveryStatusCompleted  RecoveryStatus = "completed"
	RecoveryStatusFailed     RecoveryStatus = "failed"
	RecoveryStatusAborted    RecoveryStatus = "aborted"
)

// RecoveryPhase represents a phase of a recovery operation.
type RecoveryPhase string

const (
	RecoveryPhaseAssessment      RecoveryPhase = "assessment"
	RecoveryPhasePreparation     RecoveryPhase = "preparation"
	RecoveryPhaseReconnection    RecoveryPhase = "reconnection"
	RecoveryPhaseSynchronization RecoveryPhase = "synchronization"
	RecoveryPhaseValidation      RecoveryPhase = "validation"
	RecoveryPhaseCompletion      RecoveryPhase = "completion"
)
// NewNetworkManagerImpl creates a new network manager implementation.
func NewNetworkManagerImpl(d *dht.DHT, cfg *config.Config) (*NetworkManagerImpl, error) {
	if d == nil {
		return nil, fmt.Errorf("DHT instance is required")
	}
	if cfg == nil {
		return nil, fmt.Errorf("config is required")
	}

	nm := &NetworkManagerImpl{
		dht:                    d,
		config:                 cfg,
		healthCheckInterval:    30 * time.Second,
		partitionCheckInterval: 60 * time.Second,
		connectivityTimeout:    10 * time.Second,
		maxPartitionDuration:   10 * time.Minute,
		connectivity:           &ConnectivityMatrix{Matrix: make(map[string]map[string]*ConnectionInfo)},
		stats: &NetworkStatistics{
			LastUpdated: time.Now(),
		},
	}

	// Initialize components
	if err := nm.initializeComponents(); err != nil {
		return nil, fmt.Errorf("failed to initialize network manager components: %w", err)
	}

	return nm, nil
}
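
// Example usage (illustrative sketch, not part of the original code): wiring
// the manager into an application and running its lifecycle. Obtaining the
// *dht.DHT and *config.Config values is application-specific and assumed here.
//
//	d := ...   // *dht.DHT from the application's DHT setup
//	cfg := ... // *config.Config from the application's config loader
//	nm, err := NewNetworkManagerImpl(d, cfg)
//	if err != nil {
//		log.Fatalf("network manager: %v", err)
//	}
//	ctx, cancel := context.WithCancel(context.Background())
//	defer cancel()
//	if err := nm.Start(ctx); err != nil {
//		log.Fatalf("start: %v", err)
//	}
//	defer nm.Stop()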
// initializeComponents initializes all network manager components.
func (nm *NetworkManagerImpl) initializeComponents() error {
	// Initialize topology
	nm.topology = &NetworkTopology{
		TotalNodes:        0,
		Connections:       make(map[string][]string),
		Regions:           make(map[string][]string),
		AvailabilityZones: make(map[string][]string),
		UpdatedAt:         time.Now(),
	}

	// Initialize partition info
	nm.partitionInfo = &PartitionInfo{
		PartitionDetected:  false,
		PartitionCount:     1,
		IsolatedNodes:      []string{},
		ConnectivityMatrix: make(map[string]map[string]bool),
		DetectedAt:         time.Now(),
	}

	// Initialize health checker
	nm.healthChecker = &NetworkHealthChecker{
		nodeHealth:    make(map[string]*NodeHealth),
		healthHistory: make(map[string][]*HealthCheckResult),
		alertThresholds: &NetworkAlertThresholds{
			LatencyWarning:      500 * time.Millisecond,
			LatencyCritical:     2 * time.Second,
			PacketLossWarning:   0.05, // 5%
			PacketLossCritical:  0.15, // 15%
			HealthScoreWarning:  0.7,
			HealthScoreCritical: 0.4,
		},
	}

	// Initialize partition detector
	nm.partitionDetector = &PartitionDetector{
		detectionAlgorithm: AlgorithmHybrid,
		partitionHistory:   []*PartitionEvent{},
		falsePositiveFilter: &FalsePositiveFilter{
			consecutiveChecks: 3,
			confirmationTime:  60 * time.Second,
			suspectNodes:      make(map[string]time.Time),
		},
		config: &PartitionDetectorConfig{
			CheckInterval:        60 * time.Second,
			ConfidenceThreshold:  0.8,
			MinPartitionSize:     1,
			MaxPartitionDuration: 30 * time.Minute,
			FalsePositiveTimeout: 5 * time.Minute,
		},
	}

	// Initialize recovery manager
	nm.recoveryManager = &RecoveryManager{
		recoveryStrategies: map[RecoveryStrategy]*RecoveryStrategyConfig{
			RecoveryStrategyAutomatic: {
				Strategy:         RecoveryStrategyAutomatic,
				Timeout:          5 * time.Minute,
				RetryAttempts:    3,
				RetryInterval:    30 * time.Second,
				RequireConsensus: false,
				ForcedThreshold:  10 * time.Minute,
			},
			RecoveryStrategyGraceful: {
				Strategy:         RecoveryStrategyGraceful,
				Timeout:          10 * time.Minute,
				RetryAttempts:    5,
				RetryInterval:    60 * time.Second,
				RequireConsensus: true,
				ForcedThreshold:  20 * time.Minute,
			},
		},
		activeRecoveries: make(map[string]*RecoveryOperation),
		recoveryHistory:  []*RecoveryResult{},
	}

	return nil
}
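
// Worked example of the default alert thresholds above (illustrative): a peer
// with 7% packet loss exceeds PacketLossWarning (5%) but stays below
// PacketLossCritical (15%); a node with health score 0.5 is past the 0.7
// warning threshold while remaining above the 0.4 critical threshold.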
// Start starts the network manager and its background workers.
func (nm *NetworkManagerImpl) Start(ctx context.Context) error {
	nm.mu.Lock()
	if nm.running {
		nm.mu.Unlock()
		return fmt.Errorf("network manager already running")
	}
	nm.running = true
	nm.mu.Unlock()

	// Start background workers
	go nm.topologyUpdater(ctx)
	go nm.healthMonitor(ctx)
	go nm.partitionMonitor(ctx)
	go nm.connectivityChecker(ctx)

	return nil
}

// Stop stops the network manager. Workers observe the cleared flag on their
// next tick and exit when the Start context is canceled.
func (nm *NetworkManagerImpl) Stop() error {
	nm.mu.Lock()
	defer nm.mu.Unlock()

	nm.running = false
	return nil
}

// isRunning reports whether the manager is running. Background workers use it
// so that reads of the running flag do not race with Start and Stop.
func (nm *NetworkManagerImpl) isRunning() bool {
	nm.mu.RLock()
	defer nm.mu.RUnlock()
	return nm.running
}
// DetectPartition detects network partitions in the cluster.
func (nm *NetworkManagerImpl) DetectPartition(ctx context.Context) (*PartitionInfo, error) {
	// A write lock is required: detection appends to the partition history
	// and mutates the partition info below.
	nm.mu.Lock()
	defer nm.mu.Unlock()

	// Run partition detection
	partitioned, partitionedNodes, confidence := nm.detectPartitionUsing(nm.partitionDetector.detectionAlgorithm)

	if partitioned && confidence >= nm.partitionDetector.config.ConfidenceThreshold {
		// Record partition event
		event := &PartitionEvent{
			EventID:          nm.generateEventID(),
			DetectedAt:       time.Now(),
			Algorithm:        nm.partitionDetector.detectionAlgorithm,
			PartitionedNodes: partitionedNodes,
			Confidence:       confidence,
			Resolved:         false,
		}

		nm.partitionDetector.partitionHistory = append(nm.partitionDetector.partitionHistory, event)

		// Update partition info. Preserve the original detection time so the
		// reported Duration grows across repeated detections instead of being
		// measured against a freshly reset timestamp (which would always be
		// approximately zero).
		if !nm.partitionInfo.PartitionDetected {
			nm.partitionInfo.DetectedAt = time.Now()
		}
		nm.partitionInfo.PartitionDetected = true
		nm.partitionInfo.PartitionCount = nm.calculatePartitionCount(partitionedNodes)
		nm.partitionInfo.LargestPartitionSize = nm.calculateLargestPartitionSize()
		nm.partitionInfo.CurrentPartitionSize = nm.calculateCurrentPartitionSize()
		nm.partitionInfo.IsolatedNodes = partitionedNodes
		nm.partitionInfo.Duration = time.Since(nm.partitionInfo.DetectedAt)
	}

	nm.lastPartitionCheck = time.Now()

	return nm.partitionInfo, nil
}
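
// Example usage (illustrative sketch): polling partition detection and kicking
// off recovery when a partition is reported. The logging and control flow are
// assumptions, not part of the original code.
//
//	info, err := nm.DetectPartition(ctx)
//	if err != nil {
//		return err
//	}
//	if info.PartitionDetected {
//		result, err := nm.RecoverFromPartition(ctx)
//		if err != nil {
//			log.Printf("recovery could not start: %v", err)
//		} else if !result.RecoverySuccessful {
//			log.Printf("recovery did not resolve the partition")
//		}
//	}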
// GetTopology returns the current network topology.
func (nm *NetworkManagerImpl) GetTopology(ctx context.Context) (*NetworkTopology, error) {
	// updateTopology mutates shared state, so a write lock is required here
	// rather than a read lock.
	nm.mu.Lock()
	defer nm.mu.Unlock()

	// Refresh topology data before returning it
	nm.updateTopology()

	return nm.topology, nil
}
// GetPeers returns the list of available peer nodes.
func (nm *NetworkManagerImpl) GetPeers(ctx context.Context) ([]*PeerInfo, error) {
	peers := nm.dht.GetConnectedPeers()
	knownPeers := nm.dht.GetKnownPeers()
	peerInfos := make([]*PeerInfo, 0, len(peers))

	for _, peerID := range peers {
		// Get peer information from the DHT
		peerInfo := knownPeers[peerID]
		if peerInfo != nil {
			peerInfos = append(peerInfos, &PeerInfo{
				NodeID:           peerID.String(),
				Address:          nm.getPeerAddress(peerID),
				Status:           "connected",
				Version:          "1.0.0",   // Placeholder
				Region:           "default", // Placeholder
				AvailabilityZone: "zone-a",  // Placeholder
				Latency:          nm.getPeerLatency(peerID),
				LastSeen:         peerInfo.LastSeen,
				Capabilities:     peerInfo.Capabilities,
			})
		}
	}

	return peerInfos, nil
}
// CheckConnectivity checks connectivity to the given peer nodes.
func (nm *NetworkManagerImpl) CheckConnectivity(ctx context.Context, peers []string) (*ConnectivityReport, error) {
	start := time.Now()

	report := &ConnectivityReport{
		TotalPeers:       len(peers),
		ReachablePeers:   0,
		UnreachablePeers: 0,
		PeerResults:      make(map[string]*ConnectivityResult),
		TestedAt:         start,
	}

	// Test connectivity to each peer, accumulating total latency so the
	// average can be computed correctly once all peers have been tested.
	totalLatency := time.Duration(0)
	for _, peerID := range peers {
		result := nm.testPeerConnectivity(ctx, peerID)
		report.PeerResults[peerID] = result

		if result.Reachable {
			report.ReachablePeers++
			totalLatency += result.Latency
		} else {
			report.UnreachablePeers++
		}
	}

	// Calculate average latency and overall health
	if report.ReachablePeers > 0 {
		report.AverageLatency = totalLatency / time.Duration(report.ReachablePeers)
	}
	if report.TotalPeers > 0 {
		report.OverallHealth = float64(report.ReachablePeers) / float64(report.TotalPeers)
	}

	report.TestDuration = time.Since(start)

	return report, nil
}
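
// Example usage (illustrative sketch): checking connectivity to a set of peers
// and reacting to the result. The peer IDs and the 0.5 health cutoff are
// assumptions for the example.
//
//	report, err := nm.CheckConnectivity(ctx, []string{"peer-a", "peer-b"})
//	if err != nil {
//		return err
//	}
//	if report.OverallHealth < 0.5 {
//		// More than half of the tested peers are unreachable; consider
//		// running partition detection.
//		_, _ = nm.DetectPartition(ctx)
//	}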
// RecoverFromPartition attempts to recover from a network partition.
func (nm *NetworkManagerImpl) RecoverFromPartition(ctx context.Context) (*RecoveryResult, error) {
	nm.mu.Lock()
	if nm.recoveryInProgress {
		nm.mu.Unlock()
		return nil, fmt.Errorf("recovery operation already in progress")
	}
	nm.recoveryInProgress = true
	// Snapshot the isolated nodes and select a strategy while the lock is held.
	targetNodes := nm.partitionInfo.IsolatedNodes
	strategy := nm.selectRecoveryStrategy()
	nm.mu.Unlock()

	defer func() {
		nm.mu.Lock()
		nm.recoveryInProgress = false
		nm.mu.Unlock()
	}()

	start := time.Now()

	result := &RecoveryResult{
		RecoverySuccessful: false,
		RecoveredNodes:     []string{},
		StillIsolatedNodes: []string{},
		RecoveryTime:       0,
		RecoveredAt:        time.Now(),
	}

	// Create recovery operation
	operation := &RecoveryOperation{
		OperationID:  nm.generateOperationID(),
		Strategy:     strategy,
		StartedAt:    start,
		TargetNodes:  targetNodes,
		Status:       RecoveryStatusInitiated,
		Progress:     0.0,
		CurrentPhase: RecoveryPhaseAssessment,
		Errors:       []string{},
		LastUpdate:   time.Now(),
	}

	// Execute recovery phases in order
	phases := []RecoveryPhase{
		RecoveryPhaseAssessment,
		RecoveryPhasePreparation,
		RecoveryPhaseReconnection,
		RecoveryPhaseSynchronization,
		RecoveryPhaseValidation,
		RecoveryPhaseCompletion,
	}

	for i, phase := range phases {
		operation.CurrentPhase = phase
		operation.Progress = float64(i) / float64(len(phases))

		if err := nm.executeRecoveryPhase(ctx, operation, phase); err != nil {
			operation.Errors = append(operation.Errors, err.Error())
			if len(operation.Errors) > 3 { // Too many errors, abort
				operation.Status = RecoveryStatusFailed
				break
			}
		}

		operation.LastUpdate = time.Now()
	}

	// Finalize result
	result.RecoveryTime = time.Since(start)
	result.RecoverySuccessful = operation.Status != RecoveryStatusFailed

	// Update partition info and record history under the lock
	nm.mu.Lock()
	if result.RecoverySuccessful {
		nm.partitionInfo.PartitionDetected = false
		nm.partitionInfo.IsolatedNodes = []string{}
	}
	nm.recoveryManager.recoveryHistory = append(nm.recoveryManager.recoveryHistory, result)
	nm.mu.Unlock()

	return result, nil
}
// GetNetworkStats returns network performance statistics.
func (nm *NetworkManagerImpl) GetNetworkStats() (*NetworkStatistics, error) {
	// updateNetworkStatistics mutates nm.stats, so a write lock is required
	// here rather than a read lock.
	nm.mu.Lock()
	defer nm.mu.Unlock()

	// Refresh real-time statistics
	nm.updateNetworkStatistics()

	return nm.stats, nil
}
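
// Example usage (illustrative sketch): surfacing statistics in a monitoring
// loop. The logging is an assumption; the fields are the ones maintained by
// updateNetworkStatistics.
//
//	stats, err := nm.GetNetworkStats()
//	if err != nil {
//		return err
//	}
//	log.Printf("connected %d/%d nodes, avg latency %s, health %.2f",
//		stats.ConnectedNodes, stats.TotalNodes, stats.AverageLatency, stats.OverallHealth)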
// Background workers

func (nm *NetworkManagerImpl) topologyUpdater(ctx context.Context) {
	ticker := time.NewTicker(5 * time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if nm.isRunning() {
				// updateTopology requires nm.mu to be held.
				nm.mu.Lock()
				nm.updateTopology()
				nm.mu.Unlock()
			}
		}
	}
}

func (nm *NetworkManagerImpl) healthMonitor(ctx context.Context) {
	ticker := time.NewTicker(nm.healthCheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if nm.isRunning() {
				nm.performHealthChecks(ctx)
			}
		}
	}
}

func (nm *NetworkManagerImpl) partitionMonitor(ctx context.Context) {
	ticker := time.NewTicker(nm.partitionCheckInterval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if nm.isRunning() {
				// Detection results are recorded on nm.partitionInfo; the
				// returned snapshot is not needed here.
				_, _ = nm.DetectPartition(ctx)
			}
		}
	}
}

func (nm *NetworkManagerImpl) connectivityChecker(ctx context.Context) {
	ticker := time.NewTicker(2 * time.Minute)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if nm.isRunning() {
				nm.updateConnectivityMatrix(ctx)
			}
		}
	}
}
// Helper methods

// updateTopology refreshes the topology snapshot. Callers must hold nm.mu.
func (nm *NetworkManagerImpl) updateTopology() {
	peers := nm.dht.GetConnectedPeers()

	nm.topology.TotalNodes = len(peers) + 1 // +1 for the current node
	nm.topology.Connections = make(map[string][]string)

	// Build connection map
	currentNodeID := nm.config.Agent.ID
	peerConnections := make([]string, len(peers))
	for i, p := range peers {
		peerConnections[i] = p.String()
	}
	nm.topology.Connections[currentNodeID] = peerConnections

	// Calculate network metrics
	nm.topology.ClusterDiameter = nm.calculateClusterDiameter()
	nm.topology.ClusteringCoefficient = nm.calculateClusteringCoefficient()

	nm.topology.UpdatedAt = time.Now()
	nm.lastTopologyUpdate = time.Now()
}
func (nm *NetworkManagerImpl) performHealthChecks(ctx context.Context) {
	peers := nm.dht.GetConnectedPeers()

	for _, p := range peers {
		nodeID := p.String()
		result := nm.performHealthCheck(ctx, nodeID)

		// Update node health
		nodeHealth := &NodeHealth{
			NodeID:         nodeID,
			Status:         nm.determineNodeStatus(result),
			HealthScore:    nm.calculateHealthScore(result),
			LastSeen:       time.Now(),
			ResponseTime:   result.ResponseTime,
			PacketLossRate: 0.0, // Would be measured in a real implementation
			ErrorRate:      0.0, // Would be calculated from history
		}

		// Guard the health maps against concurrent readers such as
		// detectPartitionByHeartbeat.
		nm.healthChecker.mu.Lock()
		nm.healthChecker.nodeHealth[nodeID] = nodeHealth

		// Store health check history, keeping only the last 100 checks
		history := append(nm.healthChecker.healthHistory[nodeID], result)
		if len(history) > 100 {
			history = history[1:]
		}
		nm.healthChecker.healthHistory[nodeID] = history
		nm.healthChecker.mu.Unlock()
	}
}
func (nm *NetworkManagerImpl) updateConnectivityMatrix(ctx context.Context) {
	peers := nm.dht.GetConnectedPeers()

	nm.connectivity.mu.Lock()
	defer nm.connectivity.mu.Unlock()

	// Initialize matrix if needed
	if nm.connectivity.Matrix == nil {
		nm.connectivity.Matrix = make(map[string]map[string]*ConnectionInfo)
	}

	currentNodeID := nm.config.Agent.ID

	// Ensure the current node exists in the matrix
	if nm.connectivity.Matrix[currentNodeID] == nil {
		nm.connectivity.Matrix[currentNodeID] = make(map[string]*ConnectionInfo)
	}

	// Test connectivity to all peers
	for _, p := range peers {
		peerID := p.String()
		nm.connectivity.Matrix[currentNodeID][peerID] = nm.testConnection(ctx, peerID)
	}

	nm.connectivity.LastUpdated = time.Now()
}
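
// Illustrative shape (with assumed node IDs) of the connectivity matrix this
// method maintains, as it would serialize to JSON. time.Duration fields
// marshal as integer nanoseconds, so 50ms appears as 50000000.
//
//	{
//	  "matrix": {
//	    "node-a": {
//	      "node-b": {"connected": true, "latency": 50000000, ...},
//	      "node-c": {"connected": false, "latency": 0, ...}
//	    }
//	  },
//	  "last_updated": "2025-01-01T00:00:00Z"
//	}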
func (nm *NetworkManagerImpl) detectPartitionUsing(algorithm PartitionDetectionAlgorithm) (bool, []string, float64) {
	switch algorithm {
	case AlgorithmConnectivityMap:
		return nm.detectPartitionByConnectivity()
	case AlgorithmHeartbeat:
		return nm.detectPartitionByHeartbeat()
	case AlgorithmGossipBased:
		return nm.detectPartitionByGossip()
	case AlgorithmHybrid:
		return nm.detectPartitionHybrid()
	default:
		return false, []string{}, 0.0
	}
}

func (nm *NetworkManagerImpl) detectPartitionByConnectivity() (bool, []string, float64) {
	// Simplified connectivity-based detection
	peers := nm.dht.GetConnectedPeers()
	knownPeers := nm.dht.GetKnownPeers()

	// If we know of noticeably more peers than we are connected to, we might
	// be partitioned.
	if len(knownPeers) > len(peers)+2 { // Allow some tolerance
		isolatedNodes := []string{}
		for peerID := range knownPeers {
			connected := false
			for _, connectedPeer := range peers {
				if peerID == connectedPeer {
					connected = true
					break
				}
			}
			if !connected {
				isolatedNodes = append(isolatedNodes, peerID.String())
			}
		}
		return true, isolatedNodes, 0.8
	}

	return false, []string{}, 0.0
}

func (nm *NetworkManagerImpl) detectPartitionByHeartbeat() (bool, []string, float64) {
	// Simplified heartbeat-based detection
	nm.healthChecker.mu.RLock()
	defer nm.healthChecker.mu.RUnlock()

	isolatedNodes := []string{}
	for nodeID, health := range nm.healthChecker.nodeHealth {
		if health.Status == NodeStatusUnreachable {
			isolatedNodes = append(isolatedNodes, nodeID)
		}
	}

	if len(isolatedNodes) > 0 {
		return true, isolatedNodes, 0.7
	}

	return false, []string{}, 0.0
}

func (nm *NetworkManagerImpl) detectPartitionByGossip() (bool, []string, float64) {
	// Placeholder for gossip-based detection
	return false, []string{}, 0.0
}

func (nm *NetworkManagerImpl) detectPartitionHybrid() (bool, []string, float64) {
	// Combine multiple detection methods
	partitioned1, nodes1, conf1 := nm.detectPartitionByConnectivity()
	partitioned2, nodes2, conf2 := nm.detectPartitionByHeartbeat()

	if partitioned1 && partitioned2 {
		// Both methods agree: merge their node lists and average confidence
		combinedNodes := nm.combineNodeLists(nodes1, nodes2)
		avgConfidence := (conf1 + conf2) / 2.0
		return true, combinedNodes, avgConfidence
	} else if partitioned1 || partitioned2 {
		// Only one method detects a partition: take the higher-confidence
		// result with reduced confidence
		if conf1 > conf2 {
			return true, nodes1, conf1 * 0.7
		}
		return true, nodes2, conf2 * 0.7
	}

	return false, []string{}, 0.0
}
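
// Worked example (illustrative): with the constants above, connectivity-based
// detection reports confidence 0.8 and heartbeat-based detection 0.7. If both
// fire, the hybrid confidence is (0.8 + 0.7) / 2 = 0.75; if only the
// connectivity check fires, it is discounted to 0.8 * 0.7 = 0.56. Note that
// both values fall below the default 0.8 ConfidenceThreshold, so with these
// defaults DetectPartition only records hybrid events if the threshold or the
// per-method confidences are tuned.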
// selectRecoveryStrategy chooses a strategy based on how long the partition
// has lasted. Callers must hold nm.mu.
func (nm *NetworkManagerImpl) selectRecoveryStrategy() RecoveryStrategy {
	if nm.partitionInfo.Duration > 10*time.Minute {
		return RecoveryStrategyForced
	} else if nm.partitionInfo.Duration > 5*time.Minute {
		return RecoveryStrategyGraceful
	}
	return RecoveryStrategyAutomatic
}

func (nm *NetworkManagerImpl) executeRecoveryPhase(ctx context.Context, operation *RecoveryOperation, phase RecoveryPhase) error {
	switch phase {
	case RecoveryPhaseAssessment:
		return nm.assessPartitionState(ctx, operation)
	case RecoveryPhasePreparation:
		return nm.prepareRecovery(ctx, operation)
	case RecoveryPhaseReconnection:
		return nm.attemptReconnection(ctx, operation)
	case RecoveryPhaseSynchronization:
		return nm.synchronizeAfterRecovery(ctx, operation)
	case RecoveryPhaseValidation:
		return nm.validateRecovery(ctx, operation)
	case RecoveryPhaseCompletion:
		return nm.completeRecovery(ctx, operation)
	default:
		return fmt.Errorf("unknown recovery phase: %s", phase)
	}
}

// Placeholder implementations for recovery phases

func (nm *NetworkManagerImpl) assessPartitionState(ctx context.Context, operation *RecoveryOperation) error {
	// Assess current partition state
	operation.Status = RecoveryStatusInProgress
	return nil
}

func (nm *NetworkManagerImpl) prepareRecovery(ctx context.Context, operation *RecoveryOperation) error {
	// Prepare for recovery
	return nil
}

func (nm *NetworkManagerImpl) attemptReconnection(ctx context.Context, operation *RecoveryOperation) error {
	// Attempt to reconnect partitioned nodes
	return nil
}

func (nm *NetworkManagerImpl) synchronizeAfterRecovery(ctx context.Context, operation *RecoveryOperation) error {
	// Synchronize state after reconnection
	return nil
}

func (nm *NetworkManagerImpl) validateRecovery(ctx context.Context, operation *RecoveryOperation) error {
	// Validate that recovery was successful
	return nil
}

func (nm *NetworkManagerImpl) completeRecovery(ctx context.Context, operation *RecoveryOperation) error {
	// Complete the recovery operation
	operation.Status = RecoveryStatusCompleted
	operation.Progress = 1.0
	return nil
}

// Utility methods

func (nm *NetworkManagerImpl) testPeerConnectivity(ctx context.Context, peerID string) *ConnectivityResult {
	start := time.Now()

	// In a real implementation, this would test actual network connectivity.
	// For now, simulate based on DHT connectivity.
	peers := nm.dht.GetConnectedPeers()

	for _, p := range peers {
		if p.String() == peerID {
			return &ConnectivityResult{
				PeerID:     peerID,
				Reachable:  true,
				Latency:    time.Since(start),
				PacketLoss: 0.0,
				Bandwidth:  1000000, // 1 Mbps placeholder
				TestedAt:   time.Now(),
			}
		}
	}

	return &ConnectivityResult{
		PeerID:     peerID,
		Reachable:  false,
		Latency:    0,
		PacketLoss: 1.0,
		Bandwidth:  0,
		Error:      "peer not connected",
		TestedAt:   time.Now(),
	}
}

func (nm *NetworkManagerImpl) performHealthCheck(ctx context.Context, nodeID string) *HealthCheckResult {
	start := time.Now()

	// In a real implementation, this would perform actual health checks.
	// For now, simulate based on connectivity.
	peers := nm.dht.GetConnectedPeers()

	for _, p := range peers {
		if p.String() == nodeID {
			return &HealthCheckResult{
				NodeID:       nodeID,
				Timestamp:    time.Now(),
				Success:      true,
				ResponseTime: time.Since(start),
			}
		}
	}

	return &HealthCheckResult{
		NodeID:       nodeID,
		Timestamp:    time.Now(),
		Success:      false,
		ResponseTime: 0,
		ErrorMessage: "node unreachable",
	}
}

func (nm *NetworkManagerImpl) testConnection(ctx context.Context, peerID string) *ConnectionInfo {
	// Test connection to a specific peer
	connected := false
	latency := time.Duration(0)

	// Check whether the peer is in the connected peers list
	peers := nm.dht.GetConnectedPeers()
	for _, p := range peers {
		if p.String() == peerID {
			connected = true
			latency = 50 * time.Millisecond // Placeholder
			break
		}
	}

	return &ConnectionInfo{
		Connected:   connected,
		Latency:     latency,
		PacketLoss:  0.0,
		Bandwidth:   1000000, // 1 Mbps placeholder
		LastChecked: time.Now(),
		ErrorCount:  0,
	}
}
// updateNetworkStatistics refreshes nm.stats. Callers must hold nm.mu.
func (nm *NetworkManagerImpl) updateNetworkStatistics() {
	peers := nm.dht.GetConnectedPeers()

	nm.stats.TotalNodes = len(peers) + 1
	nm.stats.ConnectedNodes = len(peers)
	nm.stats.DisconnectedNodes = nm.stats.TotalNodes - nm.stats.ConnectedNodes

	// Calculate average latency from the connectivity matrix
	totalLatency := time.Duration(0)
	connectionCount := 0

	nm.connectivity.mu.RLock()
	for _, connections := range nm.connectivity.Matrix {
		for _, conn := range connections {
			if conn.Connected {
				totalLatency += conn.Latency
				connectionCount++
			}
		}
	}
	nm.connectivity.mu.RUnlock()

	if connectionCount > 0 {
		nm.stats.AverageLatency = totalLatency / time.Duration(connectionCount)
	}

	nm.stats.OverallHealth = nm.calculateOverallNetworkHealth()
	nm.stats.LastUpdated = time.Now()
}
// Placeholder implementations for calculated fields

func (nm *NetworkManagerImpl) calculateClusterDiameter() int {
	// Simplified calculation
	return nm.topology.TotalNodes - 1
}

func (nm *NetworkManagerImpl) calculateClusteringCoefficient() float64 {
	// Simplified calculation
	if nm.topology.TotalNodes > 1 {
		return 0.8 // Placeholder
	}
	return 0.0
}

func (nm *NetworkManagerImpl) calculatePartitionCount(partitionedNodes []string) int {
	return len(partitionedNodes) + 1 // Current partition + isolated nodes
}

func (nm *NetworkManagerImpl) calculateLargestPartitionSize() int {
	peers := nm.dht.GetConnectedPeers()
	return len(peers) + 1 // Current partition size
}

func (nm *NetworkManagerImpl) calculateCurrentPartitionSize() int {
	return nm.calculateLargestPartitionSize()
}

func (nm *NetworkManagerImpl) calculateOverallNetworkHealth() float64 {
	if nm.stats.TotalNodes == 0 {
		return 1.0
	}
	return float64(nm.stats.ConnectedNodes) / float64(nm.stats.TotalNodes)
}

func (nm *NetworkManagerImpl) determineNodeStatus(result *HealthCheckResult) NodeStatus {
	if result.Success {
		return NodeStatusHealthy
	}
	return NodeStatusUnreachable
}

func (nm *NetworkManagerImpl) calculateHealthScore(result *HealthCheckResult) float64 {
	if result.Success {
		return 1.0
	}
	return 0.0
}

func (nm *NetworkManagerImpl) combineNodeLists(list1, list2 []string) []string {
	nodeSet := make(map[string]bool)

	for _, node := range list1 {
		nodeSet[node] = true
	}
	for _, node := range list2 {
		nodeSet[node] = true
	}

	result := make([]string, 0, len(nodeSet))
	for node := range nodeSet {
		result = append(result, node)
	}

	sort.Strings(result)
	return result
}

func (nm *NetworkManagerImpl) getPeerAddress(peerID peer.ID) string {
	// In a real implementation, this would resolve the actual peer address
	return "unknown"
}

func (nm *NetworkManagerImpl) getPeerLatency(peerID peer.ID) time.Duration {
	// In a real implementation, this would measure actual latency
	return 50 * time.Millisecond
}

func (nm *NetworkManagerImpl) generateEventID() string {
	return fmt.Sprintf("evt-%d", time.Now().UnixNano())
}

func (nm *NetworkManagerImpl) generateOperationID() string {
	return fmt.Sprintf("op-%d", time.Now().UnixNano())
}