// WHOOSH/internal/orchestrator/health_gates.go

package orchestrator

import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"time"

	"github.com/rs/zerolog/log"
	"go.opentelemetry.io/otel/attribute"
	"go.opentelemetry.io/otel/trace"

	"github.com/chorus-services/whoosh/internal/tracing"
)

// HealthGates probes the services that scaling depends on and compares the
// results against configurable thresholds to decide whether scaling may run.
type HealthGates struct {
	// kachingURL is the base URL the KACHING health probe is built from.
	kachingURL string
	// backbeatURL is the base URL the BACKBEAT metrics probe is built from.
	backbeatURL string
	// chorusURL is the base URL the bootstrap peer probe is built from.
	chorusURL string
	// httpClient issues every probe request.
	httpClient *http.Client
	// thresholds holds the limits each gate is compared against.
	thresholds HealthThresholds
}

// HealthThresholds holds the limits that the individual gates are compared
// against before scaling is allowed.
type HealthThresholds struct {
	// KachingMaxLatencyMS is the maximum acceptable KACHING latency, in milliseconds.
	KachingMaxLatencyMS int `json:"kaching_max_latency_ms"`
	// KachingMinRateRemaining is the minimum rate limit that must remain.
	KachingMinRateRemaining int `json:"kaching_min_rate_remaining"`
	// BackbeatMaxLagSeconds is the maximum subject lag, in seconds.
	BackbeatMaxLagSeconds int `json:"backbeat_max_lag_seconds"`
	// BootstrapMinHealthyPeers is the minimum number of healthy bootstrap peers.
	BootstrapMinHealthyPeers int `json:"bootstrap_min_healthy_peers"`
	// JoinSuccessRateThreshold is the minimum join success rate (0.0-1.0).
	JoinSuccessRateThreshold float64 `json:"join_success_rate_threshold"`
}

// HealthStatus is the aggregated result of one pass over all health gates.
type HealthStatus struct {
	// Healthy is true only when every evaluated gate passed.
	Healthy bool `json:"healthy"`
	// Timestamp records when the aggregated check started.
	Timestamp time.Time `json:"timestamp"`
	// Gates maps a gate key (for example "kaching") to its individual result.
	Gates map[string]GateStatus `json:"gates"`
	// OverallReason summarises the failing gates; it is omitted from JSON when empty.
	OverallReason string `json:"overall_reason,omitempty"`
}

// GateStatus is the outcome of a single health gate.
type GateStatus struct {
	// Name identifies the gate.
	Name string `json:"name"`
	// Healthy reports whether the gate passed.
	Healthy bool `json:"healthy"`
	// Reason explains the outcome; it is omitted from JSON when empty.
	Reason string `json:"reason,omitempty"`
	// Metrics carries the raw values the gate looked at; omitted from JSON when empty.
	Metrics map[string]interface{} `json:"metrics,omitempty"`
	// LastChecked is the time the gate was evaluated.
	LastChecked time.Time `json:"last_checked"`
}

// KachingHealth is the JSON payload decoded from the KACHING health endpoint.
type KachingHealth struct {
	// Healthy is KACHING's own overall verdict.
	Healthy bool `json:"healthy"`
	// LatencyP95MS is compared against the configured maximum latency.
	LatencyP95MS float64 `json:"latency_p95_ms"`
	// QueueDepth is reported as a metric only.
	QueueDepth int `json:"queue_depth"`
	// RateLimitRemaining is compared against the configured minimum.
	RateLimitRemaining int `json:"rate_limit_remaining"`
	// ActiveLeases is reported as a metric only.
	ActiveLeases int `json:"active_leases"`
	// ClusterCapacity is reported as a metric only.
	ClusterCapacity int `json:"cluster_capacity"`
}

// BackbeatHealth is the JSON payload decoded from the BACKBEAT metrics endpoint.
type BackbeatHealth struct {
	// Healthy is BACKBEAT's own overall verdict.
	Healthy bool `json:"healthy"`
	// SubjectLags holds a lag value per subject; reported as a metric only.
	SubjectLags map[string]int `json:"subject_lags"`
	// MaxLagSeconds is compared against the configured maximum lag.
	MaxLagSeconds int `json:"max_lag_seconds"`
	// ConsumerHealth holds a flag per consumer; reported as a metric only.
	ConsumerHealth map[string]bool `json:"consumer_health"`
}

// BootstrapHealth is the JSON payload decoded from the bootstrap peer endpoint.
type BootstrapHealth struct {
	// Healthy is the overall verdict carried in the payload.
	Healthy bool `json:"healthy"`
	// TotalPeers is reported as a metric only.
	TotalPeers int `json:"total_peers"`
	// HealthyPeers is compared against the configured minimum.
	HealthyPeers int `json:"healthy_peers"`
	// ReachablePeers is reported as a metric only.
	ReachablePeers int `json:"reachable_peers"`
}

// ScalingMetrics describes the outcome of recent scaling activity.
type ScalingMetrics struct {
	// LastWaveSize is the size of the most recent scaling wave.
	LastWaveSize int `json:"last_wave_size"`
	// LastWaveStarted is when the most recent wave began.
	LastWaveStarted time.Time `json:"last_wave_started"`
	// LastWaveCompleted is when the most recent wave finished.
	LastWaveCompleted time.Time `json:"last_wave_completed"`
	// JoinSuccessRate is compared against the join success rate threshold.
	JoinSuccessRate float64 `json:"join_success_rate"`
	// SuccessfulJoins counts joins that succeeded.
	SuccessfulJoins int `json:"successful_joins"`
	// FailedJoins counts joins that failed.
	FailedJoins int `json:"failed_joins"`
}

// NewHealthGates builds a HealthGates that probes the three given base URLs.
//
// The returned value uses an HTTP client with a 10 second timeout and the
// default thresholds listed below; call SetThresholds to override them.
func NewHealthGates(kachingURL, backbeatURL, chorusURL string) *HealthGates {
	// Default limits applied until SetThresholds is called.
	defaults := HealthThresholds{
		KachingMaxLatencyMS:      500, // 500ms max latency
		KachingMinRateRemaining:  20,  // at least 20 requests remaining
		BackbeatMaxLagSeconds:    30,  // max 30 seconds lag
		BootstrapMinHealthyPeers: 3,   // at least 3 healthy bootstrap peers
		JoinSuccessRateThreshold: 0.8, // 80% join success rate
	}

	// One client is shared by every probe.
	probeClient := &http.Client{Timeout: 10 * time.Second}

	return &HealthGates{
		kachingURL:  kachingURL,
		backbeatURL: backbeatURL,
		chorusURL:   chorusURL,
		httpClient:  probeClient,
		thresholds:  defaults,
	}
}

// SetThresholds replaces the complete set of thresholds that subsequent gate
// checks compare against. The update is a plain field assignment; no locking
// is performed here.
func (hg *HealthGates) SetThresholds(thresholds HealthThresholds) {
	hg.thresholds = thresholds
}

// CheckHealth runs every health gate in turn and aggregates the results.
//
// The KACHING, BACKBEAT and bootstrap probes are executed sequentially. A probe
// that returns an error is recorded as an unhealthy gate (the error is logged
// and embedded in the gate's Reason) rather than aborting the whole check. The
// scaling-metrics gate is evaluated only when recentMetrics is non-nil.
//
// The returned status is never nil and the returned error is always nil; the
// overall verdict is carried in HealthStatus.Healthy, with OverallReason
// listing the reasons of all failing gates. The verdict is also attached to
// the tracing span started for this call.
func (hg *HealthGates) CheckHealth(ctx context.Context, recentMetrics *ScalingMetrics) (*HealthStatus, error) {
	ctx, span := tracing.Tracer.Start(ctx, "health_gates.check_health")
	defer span.End()

	report := &HealthStatus{
		Healthy:   true,
		Timestamp: time.Now(),
		Gates:     make(map[string]GateStatus),
	}
	var reasons []string

	// The remote probes share the same bookkeeping, so drive them from a table.
	probes := []struct {
		key        string // gate key and GateStatus.Name
		logMsg     string // warning logged when the probe itself errors
		downReason string // summary reason used when the probe itself errors
		run        func(context.Context) (*GateStatus, error)
	}{
		{"kaching", "Failed to check KACHING health", "KACHING unreachable", hg.checkKachingHealth},
		{"backbeat", "Failed to check BACKBEAT health", "BACKBEAT unreachable", hg.checkBackbeatHealth},
		{"bootstrap", "Failed to check bootstrap health", "Bootstrap peers unreachable", hg.checkBootstrapHealth},
	}

	for _, probe := range probes {
		gate, err := probe.run(ctx)
		if err != nil {
			// The probe could not produce a verdict: record a synthetic failing gate.
			log.Warn().Err(err).Msg(probe.logMsg)
			report.Gates[probe.key] = GateStatus{
				Name:        probe.key,
				Healthy:     false,
				Reason:      fmt.Sprintf("Health check failed: %v", err),
				LastChecked: time.Now(),
			}
			report.Healthy = false
			reasons = append(reasons, probe.downReason)
			continue
		}

		report.Gates[probe.key] = *gate
		if !gate.Healthy {
			report.Healthy = false
			reasons = append(reasons, gate.Reason)
		}
	}

	// The scaling-metrics gate is optional and needs no remote call.
	if recentMetrics != nil {
		gate := hg.checkScalingMetrics(recentMetrics)
		report.Gates["scaling_metrics"] = *gate
		if !gate.Healthy {
			report.Healthy = false
			reasons = append(reasons, gate.Reason)
		}
	}

	if len(reasons) > 0 && !report.Healthy {
		report.OverallReason = fmt.Sprintf("Health gates failed: %v", reasons)
	}

	// Expose the verdict on the span.
	span.SetAttributes(
		attribute.Bool("health.overall_healthy", report.Healthy),
		attribute.Int("health.gate_count", len(report.Gates)),
	)
	if !report.Healthy {
		span.SetAttributes(attribute.String("health.fail_reason", report.OverallReason))
	}

	return report, nil
}

// checkKachingHealth fetches <kachingURL>/health/burst and evaluates the
// decoded payload.
//
// An error is returned when the request cannot be built or sent, when the
// response status is not 200, or when the body is not valid JSON for
// KachingHealth. Otherwise a GateStatus is returned whose Metrics echo the
// payload. The gate fails on the first of these conditions that holds, in
// this order: p95 latency above KachingMaxLatencyMS, remaining rate limit
// below KachingMinRateRemaining, or KACHING reporting itself unhealthy.
func (hg *HealthGates) checkKachingHealth(ctx context.Context) (*GateStatus, error) {
	endpoint := hg.kachingURL + "/health/burst"

	request, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create KACHING health request: %w", err)
	}

	response, err := hg.httpClient.Do(request)
	if err != nil {
		return nil, fmt.Errorf("KACHING health request failed: %w", err)
	}
	defer response.Body.Close()

	if code := response.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("KACHING health check returned status %d", code)
	}

	var payload KachingHealth
	if err := json.NewDecoder(response.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("failed to decode KACHING health response: %w", err)
	}

	gate := &GateStatus{
		Name:        "kaching",
		LastChecked: time.Now(),
		Metrics: map[string]interface{}{
			"latency_p95_ms":       payload.LatencyP95MS,
			"queue_depth":          payload.QueueDepth,
			"rate_limit_remaining": payload.RateLimitRemaining,
			"active_leases":        payload.ActiveLeases,
			"cluster_capacity":     payload.ClusterCapacity,
		},
	}

	// gate.Healthy starts false; only the default branch flips it to true.
	maxLatency := hg.thresholds.KachingMaxLatencyMS
	minRate := hg.thresholds.KachingMinRateRemaining
	switch {
	case payload.LatencyP95MS > float64(maxLatency):
		gate.Reason = fmt.Sprintf("KACHING latency too high: %.1fms > %dms",
			payload.LatencyP95MS, maxLatency)
	case payload.RateLimitRemaining < minRate:
		gate.Reason = fmt.Sprintf("KACHING rate limit too low: %d < %d remaining",
			payload.RateLimitRemaining, minRate)
	case !payload.Healthy:
		gate.Reason = "KACHING reports unhealthy status"
	default:
		gate.Healthy = true
	}

	return gate, nil
}

// checkBackbeatHealth fetches <backbeatURL>/metrics and evaluates the decoded
// payload.
//
// An error is returned when the request cannot be built or sent, when the
// response status is not 200, or when the body is not valid JSON for
// BackbeatHealth. Otherwise a GateStatus is returned whose Metrics echo the
// payload. The gate fails when max_lag_seconds exceeds BackbeatMaxLagSeconds
// or, failing that, when BACKBEAT reports itself unhealthy.
//
// NOTE(review): the per-subject lags and per-consumer health flags are only
// copied into Metrics; they are not evaluated here. Confirm that the payload's
// max_lag_seconds and healthy fields already account for them.
func (hg *HealthGates) checkBackbeatHealth(ctx context.Context) (*GateStatus, error) {
	endpoint := hg.backbeatURL + "/metrics"

	request, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create BACKBEAT health request: %w", err)
	}

	response, err := hg.httpClient.Do(request)
	if err != nil {
		return nil, fmt.Errorf("BACKBEAT health request failed: %w", err)
	}
	defer response.Body.Close()

	if code := response.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("BACKBEAT health check returned status %d", code)
	}

	var payload BackbeatHealth
	if err := json.NewDecoder(response.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("failed to decode BACKBEAT health response: %w", err)
	}

	gate := &GateStatus{
		Name:        "backbeat",
		LastChecked: time.Now(),
		Metrics: map[string]interface{}{
			"subject_lags":    payload.SubjectLags,
			"max_lag_seconds": payload.MaxLagSeconds,
			"consumer_health": payload.ConsumerHealth,
		},
	}

	// gate.Healthy starts false; only the default branch flips it to true.
	maxLag := hg.thresholds.BackbeatMaxLagSeconds
	switch {
	case payload.MaxLagSeconds > maxLag:
		gate.Reason = fmt.Sprintf("BACKBEAT lag too high: %ds > %ds",
			payload.MaxLagSeconds, maxLag)
	case !payload.Healthy:
		gate.Reason = "BACKBEAT reports unhealthy status"
	default:
		gate.Healthy = true
	}

	return gate, nil
}

// checkBootstrapHealth fetches <chorusURL>/peers and evaluates the decoded
// payload.
//
// An error is returned when the request cannot be built or sent, when the
// response status is not 200, or when the body is not valid JSON for
// BootstrapHealth. Otherwise a GateStatus is returned whose Metrics echo the
// peer counts. The gate passes exactly when healthy_peers is at least
// BootstrapMinHealthyPeers.
//
// NOTE(review): unlike the KACHING and BACKBEAT gates, the payload's own
// healthy flag is decoded but not consulted here; confirm that is intended.
func (hg *HealthGates) checkBootstrapHealth(ctx context.Context) (*GateStatus, error) {
	endpoint := hg.chorusURL + "/peers"

	request, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create bootstrap health request: %w", err)
	}

	response, err := hg.httpClient.Do(request)
	if err != nil {
		return nil, fmt.Errorf("bootstrap health request failed: %w", err)
	}
	defer response.Body.Close()

	if code := response.StatusCode; code != http.StatusOK {
		return nil, fmt.Errorf("bootstrap health check returned status %d", code)
	}

	var payload BootstrapHealth
	if err := json.NewDecoder(response.Body).Decode(&payload); err != nil {
		return nil, fmt.Errorf("failed to decode bootstrap health response: %w", err)
	}

	gate := &GateStatus{
		Name:        "bootstrap",
		LastChecked: time.Now(),
		Metrics: map[string]interface{}{
			"total_peers":     payload.TotalPeers,
			"healthy_peers":   payload.HealthyPeers,
			"reachable_peers": payload.ReachablePeers,
		},
	}

	required := hg.thresholds.BootstrapMinHealthyPeers
	gate.Healthy = payload.HealthyPeers >= required
	if !gate.Healthy {
		gate.Reason = fmt.Sprintf("Not enough healthy bootstrap peers: %d < %d",
			payload.HealthyPeers, required)
	}

	return gate, nil
}

// checkScalingMetrics evaluates the recent join success rate against
// JoinSuccessRateThreshold and returns the resulting gate status (never nil).
//
// A nil record, or one that is entirely empty (no successful or failed joins,
// no wave size, no wave start time and a zero success rate), is treated as
// "no scaling history" and passes the gate with an explanatory Reason. An
// empty record otherwise reads as a 0% success rate, which would fail the gate
// before any wave has had a chance to produce a real rate. Any record that
// carries data is compared against the threshold as usual.
func (hg *HealthGates) checkScalingMetrics(metrics *ScalingMetrics) *GateStatus {
	status := &GateStatus{
		Name:        "scaling_metrics",
		LastChecked: time.Now(),
	}

	// Defensive: CheckHealth already skips nil, but do not panic if called directly.
	if metrics == nil {
		status.Healthy = true
		status.Reason = "No scaling metrics available; join success rate not evaluated"
		return status
	}

	status.Metrics = map[string]interface{}{
		"join_success_rate": metrics.JoinSuccessRate,
		"successful_joins":  metrics.SuccessfulJoins,
		"failed_joins":      metrics.FailedJoins,
		"last_wave_size":    metrics.LastWaveSize,
	}

	// Every field that could indicate a wave has run is still at its zero value.
	noHistory := metrics.SuccessfulJoins == 0 &&
		metrics.FailedJoins == 0 &&
		metrics.LastWaveSize == 0 &&
		metrics.JoinSuccessRate == 0 &&
		metrics.LastWaveStarted.IsZero()
	if noHistory {
		status.Healthy = true
		status.Reason = "No join attempts recorded yet; join success rate not evaluated"
		return status
	}

	// Check join success rate threshold
	if metrics.JoinSuccessRate < hg.thresholds.JoinSuccessRateThreshold {
		status.Healthy = false
		status.Reason = fmt.Sprintf("Join success rate too low: %.1f%% < %.1f%%",
			metrics.JoinSuccessRate*100, hg.thresholds.JoinSuccessRateThreshold*100)
		return status
	}

	status.Healthy = true
	return status
}

// GetThresholds returns a copy of the thresholds currently in effect. Because
// the struct is returned by value, modifying the result does not affect the
// HealthGates; use SetThresholds to apply changes.
func (hg *HealthGates) GetThresholds() HealthThresholds {
	return hg.thresholds
}

// IsHealthy runs the full CheckHealth pass and collapses its result to a
// single boolean: true only when CheckHealth returned no error and every
// evaluated gate passed.
func (hg *HealthGates) IsHealthy(ctx context.Context, recentMetrics *ScalingMetrics) bool {
	report, err := hg.CheckHealth(ctx, recentMetrics)
	// Short-circuit keeps report from being dereferenced when err is set.
	return err == nil && report.Healthy
}