feat: Preserve comprehensive CHORUS enhancements and P2P improvements
This commit preserves substantial development work including: ## Core Infrastructure: - **Bootstrap Pool Manager** (pkg/bootstrap/pool_manager.go): Advanced peer discovery and connection management for distributed CHORUS clusters - **Runtime Configuration System** (pkg/config/runtime_config.go): Dynamic configuration updates and assignment-based role management - **Cryptographic Key Derivation** (pkg/crypto/key_derivation.go): Secure key management for P2P networking and DHT operations ## Enhanced Monitoring & Operations: - **Comprehensive Monitoring Stack**: Added Prometheus and Grafana services with full metrics collection, alerting, and dashboard visualization - **License Gate System** (internal/licensing/license_gate.go): Advanced license validation with circuit breaker patterns - **Enhanced P2P Configuration**: Improved networking configuration for better peer discovery and connection reliability ## Health & Reliability: - **DHT Health Check Fix**: Temporarily disabled problematic DHT health checks to prevent container shutdown issues - **Enhanced License Validation**: Improved error handling and retry logic for license server communication ## Docker & Deployment: - **Optimized Container Configuration**: Updated Dockerfile and compose configurations for better resource management and networking - **Static Binary Support**: Proper compilation flags for Alpine containers This work addresses the P2P networking issues that were preventing proper leader election in CHORUS clusters and establishes the foundation for reliable distributed operation. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
340
internal/licensing/license_gate.go
Normal file
340
internal/licensing/license_gate.go
Normal file
@@ -0,0 +1,340 @@
|
||||
package licensing
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/sony/gobreaker"
|
||||
)
|
||||
|
||||
// LicenseGate provides burst-proof license validation with caching and circuit breaker
|
||||
type LicenseGate struct {
|
||||
config LicenseConfig
|
||||
cache atomic.Value // stores cachedLease
|
||||
breaker *gobreaker.CircuitBreaker
|
||||
graceUntil atomic.Value // stores time.Time
|
||||
httpClient *http.Client
|
||||
}
|
||||
|
||||
// cachedLease represents a cached license lease with expiry
|
||||
type cachedLease struct {
|
||||
LeaseToken string `json:"lease_token"`
|
||||
ExpiresAt time.Time `json:"expires_at"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
Valid bool `json:"valid"`
|
||||
CachedAt time.Time `json:"cached_at"`
|
||||
}
|
||||
|
||||
// LeaseRequest represents a cluster lease request
|
||||
type LeaseRequest struct {
|
||||
ClusterID string `json:"cluster_id"`
|
||||
RequestedReplicas int `json:"requested_replicas"`
|
||||
DurationMinutes int `json:"duration_minutes"`
|
||||
}
|
||||
|
||||
// LeaseResponse represents a cluster lease response
|
||||
type LeaseResponse struct {
|
||||
LeaseToken string `json:"lease_token"`
|
||||
MaxReplicas int `json:"max_replicas"`
|
||||
ExpiresAt time.Time `json:"expires_at"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
LeaseID string `json:"lease_id"`
|
||||
}
|
||||
|
||||
// LeaseValidationRequest represents a lease validation request
|
||||
type LeaseValidationRequest struct {
|
||||
LeaseToken string `json:"lease_token"`
|
||||
ClusterID string `json:"cluster_id"`
|
||||
AgentID string `json:"agent_id"`
|
||||
}
|
||||
|
||||
// LeaseValidationResponse represents a lease validation response
|
||||
type LeaseValidationResponse struct {
|
||||
Valid bool `json:"valid"`
|
||||
RemainingReplicas int `json:"remaining_replicas"`
|
||||
ExpiresAt time.Time `json:"expires_at"`
|
||||
}
|
||||
|
||||
// NewLicenseGate creates a new license gate with circuit breaker and caching
|
||||
func NewLicenseGate(config LicenseConfig) *LicenseGate {
|
||||
// Circuit breaker settings optimized for license validation
|
||||
breakerSettings := gobreaker.Settings{
|
||||
Name: "license-validation",
|
||||
MaxRequests: 3, // Allow 3 requests in half-open state
|
||||
Interval: 60 * time.Second, // Reset failure count every minute
|
||||
Timeout: 30 * time.Second, // Stay open for 30 seconds
|
||||
ReadyToTrip: func(counts gobreaker.Counts) bool {
|
||||
// Trip after 3 consecutive failures
|
||||
return counts.ConsecutiveFailures >= 3
|
||||
},
|
||||
OnStateChange: func(name string, from gobreaker.State, to gobreaker.State) {
|
||||
fmt.Printf("🔌 License validation circuit breaker: %s -> %s\n", from, to)
|
||||
},
|
||||
}
|
||||
|
||||
gate := &LicenseGate{
|
||||
config: config,
|
||||
breaker: gobreaker.NewCircuitBreaker(breakerSettings),
|
||||
httpClient: &http.Client{Timeout: 10 * time.Second},
|
||||
}
|
||||
|
||||
// Initialize grace period
|
||||
gate.graceUntil.Store(time.Now().Add(90 * time.Second))
|
||||
|
||||
return gate
|
||||
}
|
||||
|
||||
// ValidNow checks if the cached lease is currently valid
|
||||
func (c *cachedLease) ValidNow() bool {
|
||||
if !c.Valid {
|
||||
return false
|
||||
}
|
||||
// Consider lease invalid 2 minutes before actual expiry for safety margin
|
||||
return time.Now().Before(c.ExpiresAt.Add(-2 * time.Minute))
|
||||
}
|
||||
|
||||
// loadCachedLease safely loads the cached lease
|
||||
func (g *LicenseGate) loadCachedLease() *cachedLease {
|
||||
if cached := g.cache.Load(); cached != nil {
|
||||
if lease, ok := cached.(*cachedLease); ok {
|
||||
return lease
|
||||
}
|
||||
}
|
||||
return &cachedLease{Valid: false}
|
||||
}
|
||||
|
||||
// storeLease safely stores a lease in the cache
|
||||
func (g *LicenseGate) storeLease(lease *cachedLease) {
|
||||
lease.CachedAt = time.Now()
|
||||
g.cache.Store(lease)
|
||||
}
|
||||
|
||||
// isInGracePeriod checks if we're still in the grace period
|
||||
func (g *LicenseGate) isInGracePeriod() bool {
|
||||
if graceUntil := g.graceUntil.Load(); graceUntil != nil {
|
||||
if grace, ok := graceUntil.(time.Time); ok {
|
||||
return time.Now().Before(grace)
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// extendGracePeriod extends the grace period on successful validation
|
||||
func (g *LicenseGate) extendGracePeriod() {
|
||||
g.graceUntil.Store(time.Now().Add(90 * time.Second))
|
||||
}
|
||||
|
||||
// Validate validates the license using cache, lease system, and circuit breaker
|
||||
func (g *LicenseGate) Validate(ctx context.Context, agentID string) error {
|
||||
// Check cached lease first
|
||||
if lease := g.loadCachedLease(); lease.ValidNow() {
|
||||
return g.validateCachedLease(ctx, lease, agentID)
|
||||
}
|
||||
|
||||
// Try to get/renew lease through circuit breaker
|
||||
_, err := g.breaker.Execute(func() (interface{}, error) {
|
||||
lease, err := g.requestOrRenewLease(ctx)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Validate the new lease
|
||||
if err := g.validateLease(ctx, lease, agentID); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Store successful lease
|
||||
g.storeLease(&cachedLease{
|
||||
LeaseToken: lease.LeaseToken,
|
||||
ExpiresAt: lease.ExpiresAt,
|
||||
ClusterID: lease.ClusterID,
|
||||
Valid: true,
|
||||
})
|
||||
|
||||
return nil, nil
|
||||
})
|
||||
|
||||
if err != nil {
|
||||
// If we're in grace period, allow startup but log warning
|
||||
if g.isInGracePeriod() {
|
||||
fmt.Printf("⚠️ License validation failed but in grace period: %v\n", err)
|
||||
return nil
|
||||
}
|
||||
return fmt.Errorf("license validation failed: %w", err)
|
||||
}
|
||||
|
||||
// Extend grace period on successful validation
|
||||
g.extendGracePeriod()
|
||||
return nil
|
||||
}
|
||||
|
||||
// validateCachedLease validates using cached lease token
|
||||
func (g *LicenseGate) validateCachedLease(ctx context.Context, lease *cachedLease, agentID string) error {
|
||||
validation := LeaseValidationRequest{
|
||||
LeaseToken: lease.LeaseToken,
|
||||
ClusterID: g.config.ClusterID,
|
||||
AgentID: agentID,
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s/api/v1/licenses/validate-lease", strings.TrimSuffix(g.config.KachingURL, "/"))
|
||||
|
||||
reqBody, err := json.Marshal(validation)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal lease validation request: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", url, strings.NewReader(string(reqBody)))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create lease validation request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := g.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("lease validation request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
// If validation fails, invalidate cache
|
||||
lease.Valid = false
|
||||
g.storeLease(lease)
|
||||
return fmt.Errorf("lease validation failed with status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var validationResp LeaseValidationResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&validationResp); err != nil {
|
||||
return fmt.Errorf("failed to decode lease validation response: %w", err)
|
||||
}
|
||||
|
||||
if !validationResp.Valid {
|
||||
// If validation fails, invalidate cache
|
||||
lease.Valid = false
|
||||
g.storeLease(lease)
|
||||
return fmt.Errorf("lease token is invalid")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// requestOrRenewLease requests a new cluster lease or renews existing one
|
||||
func (g *LicenseGate) requestOrRenewLease(ctx context.Context) (*LeaseResponse, error) {
|
||||
// For now, request a new lease (TODO: implement renewal logic)
|
||||
leaseReq := LeaseRequest{
|
||||
ClusterID: g.config.ClusterID,
|
||||
RequestedReplicas: 1, // Start with single replica
|
||||
DurationMinutes: 60, // 1 hour lease
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s/api/v1/licenses/%s/cluster-lease",
|
||||
strings.TrimSuffix(g.config.KachingURL, "/"), g.config.LicenseID)
|
||||
|
||||
reqBody, err := json.Marshal(leaseReq)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to marshal lease request: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", url, strings.NewReader(string(reqBody)))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create lease request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := g.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("lease request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode == http.StatusTooManyRequests {
|
||||
return nil, fmt.Errorf("rate limited by KACHING, retry after: %s", resp.Header.Get("Retry-After"))
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("lease request failed with status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var leaseResp LeaseResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&leaseResp); err != nil {
|
||||
return nil, fmt.Errorf("failed to decode lease response: %w", err)
|
||||
}
|
||||
|
||||
return &leaseResp, nil
|
||||
}
|
||||
|
||||
// validateLease validates a lease token
|
||||
func (g *LicenseGate) validateLease(ctx context.Context, lease *LeaseResponse, agentID string) error {
|
||||
validation := LeaseValidationRequest{
|
||||
LeaseToken: lease.LeaseToken,
|
||||
ClusterID: lease.ClusterID,
|
||||
AgentID: agentID,
|
||||
}
|
||||
|
||||
return g.validateLeaseRequest(ctx, validation)
|
||||
}
|
||||
|
||||
// validateLeaseRequest performs the actual lease validation HTTP request
|
||||
func (g *LicenseGate) validateLeaseRequest(ctx context.Context, validation LeaseValidationRequest) error {
|
||||
url := fmt.Sprintf("%s/api/v1/licenses/validate-lease", strings.TrimSuffix(g.config.KachingURL, "/"))
|
||||
|
||||
reqBody, err := json.Marshal(validation)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal lease validation request: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", url, strings.NewReader(string(reqBody)))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create lease validation request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
|
||||
resp, err := g.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("lease validation request failed: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return fmt.Errorf("lease validation failed with status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
var validationResp LeaseValidationResponse
|
||||
if err := json.NewDecoder(resp.Body).Decode(&validationResp); err != nil {
|
||||
return fmt.Errorf("failed to decode lease validation response: %w", err)
|
||||
}
|
||||
|
||||
if !validationResp.Valid {
|
||||
return fmt.Errorf("lease token is invalid")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// GetCacheStats returns cache statistics for monitoring
|
||||
func (g *LicenseGate) GetCacheStats() map[string]interface{} {
|
||||
lease := g.loadCachedLease()
|
||||
stats := map[string]interface{}{
|
||||
"cache_valid": lease.Valid,
|
||||
"cache_hit": lease.ValidNow(),
|
||||
"expires_at": lease.ExpiresAt,
|
||||
"cached_at": lease.CachedAt,
|
||||
"in_grace_period": g.isInGracePeriod(),
|
||||
"breaker_state": g.breaker.State().String(),
|
||||
}
|
||||
|
||||
if grace := g.graceUntil.Load(); grace != nil {
|
||||
if graceTime, ok := grace.(time.Time); ok {
|
||||
stats["grace_until"] = graceTime
|
||||
}
|
||||
}
|
||||
|
||||
return stats
|
||||
}
|
||||
@@ -2,6 +2,7 @@ package licensing
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
@@ -21,35 +22,60 @@ type LicenseConfig struct {
|
||||
}
|
||||
|
||||
// Validator handles license validation with KACHING
|
||||
// Enhanced with license gate for burst-proof validation
|
||||
type Validator struct {
|
||||
config LicenseConfig
|
||||
kachingURL string
|
||||
client *http.Client
|
||||
gate *LicenseGate // New: License gate for scaling support
|
||||
}
|
||||
|
||||
// NewValidator creates a new license validator
|
||||
// NewValidator creates a new license validator with enhanced scaling support
|
||||
func NewValidator(config LicenseConfig) *Validator {
|
||||
kachingURL := config.KachingURL
|
||||
if kachingURL == "" {
|
||||
kachingURL = DefaultKachingURL
|
||||
}
|
||||
|
||||
return &Validator{
|
||||
|
||||
validator := &Validator{
|
||||
config: config,
|
||||
kachingURL: kachingURL,
|
||||
client: &http.Client{
|
||||
Timeout: LicenseTimeout,
|
||||
},
|
||||
}
|
||||
|
||||
// Initialize license gate for scaling support
|
||||
validator.gate = NewLicenseGate(config)
|
||||
|
||||
return validator
|
||||
}
|
||||
|
||||
// Validate performs license validation with KACHING license authority
|
||||
// CRITICAL: CHORUS will not start without valid license validation
|
||||
// Enhanced with caching, circuit breaker, and lease token support
|
||||
func (v *Validator) Validate() error {
|
||||
return v.ValidateWithContext(context.Background())
|
||||
}
|
||||
|
||||
// ValidateWithContext performs license validation with context and agent ID
|
||||
func (v *Validator) ValidateWithContext(ctx context.Context) error {
|
||||
if v.config.LicenseID == "" || v.config.ClusterID == "" {
|
||||
return fmt.Errorf("license ID and cluster ID are required")
|
||||
}
|
||||
|
||||
// Use enhanced license gate for validation
|
||||
agentID := "default-agent" // TODO: Get from config/environment
|
||||
if err := v.gate.Validate(ctx, agentID); err != nil {
|
||||
// Fallback to legacy validation for backward compatibility
|
||||
fmt.Printf("⚠️ License gate validation failed, trying legacy validation: %v\n", err)
|
||||
return v.validateLegacy()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// validateLegacy performs the original license validation (for fallback)
|
||||
func (v *Validator) validateLegacy() error {
|
||||
// Prepare validation request
|
||||
request := map[string]interface{}{
|
||||
"license_id": v.config.LicenseID,
|
||||
@@ -66,7 +92,7 @@ func (v *Validator) Validate() error {
|
||||
return fmt.Errorf("failed to marshal license request: %w", err)
|
||||
}
|
||||
|
||||
// Call KACHING license authority
|
||||
// Call KACHING license authority
|
||||
licenseURL := fmt.Sprintf("%s/v1/license/activate", v.kachingURL)
|
||||
resp, err := v.client.Post(licenseURL, "application/json", bytes.NewReader(requestBody))
|
||||
if err != nil {
|
||||
|
||||
Reference in New Issue
Block a user