feat: Implement complete CHORUS leader election system
Major milestone: CHORUS leader election is now fully functional! ## Key Features Implemented: ### 🗳️ Leader Election Core - Fixed root cause: nodes now trigger elections when no admin exists - Added randomized election delays to prevent simultaneous elections - Implemented concurrent election prevention (only one election at a time) - Added proper election state management and transitions ### 📡 Admin Discovery System - Enhanced discovery requests with "WHOAMI" debug messages - Fixed discovery responses to properly include current leader ID - Added comprehensive discovery request/response logging - Implemented admin confirmation from multiple sources ### 🔧 Configuration Improvements - Increased discovery timeout from 3s to 15s for better reliability - Added proper Docker Hub image deployment workflow - Updated build process to use correct chorus-agent binary (not deprecated chorus) - Added static compilation flags for Alpine Linux compatibility ### 🐛 Critical Fixes - Fixed build process confusion between chorus vs chorus-agent binaries - Added missing admin_election capability to enable leader elections - Corrected discovery logic to handle zero admin responses - Enhanced debugging with detailed state and timing information ## Current Operational Status: ✅ Admin Election: Working with proper consensus ✅ Heartbeat System: 15-second intervals from elected admin ✅ Discovery Protocol: Nodes can find and confirm current admin ✅ P2P Connectivity: 5+ connected peers with libp2p ✅ SLURP Functionality: Enabled on admin nodes ✅ BACKBEAT Integration: Tempo synchronization working ✅ Container Health: All health checks passing ## Technical Details: - Election uses weighted scoring based on uptime, capabilities, and resources - Randomized delays prevent election storms (30-45s wait periods) - Discovery responses include current leader ID for network-wide consensus - State management prevents multiple concurrent elections - Enhanced logging provides full visibility into election process 🎉 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
380
vendor/github.com/sony/gobreaker/gobreaker.go
generated
vendored
Normal file
380
vendor/github.com/sony/gobreaker/gobreaker.go
generated
vendored
Normal file
@@ -0,0 +1,380 @@
|
||||
// Package gobreaker implements the Circuit Breaker pattern.
|
||||
// See https://msdn.microsoft.com/en-us/library/dn589784.aspx.
|
||||
package gobreaker
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// State is a type that represents a state of CircuitBreaker.
|
||||
type State int
|
||||
|
||||
// These constants are states of CircuitBreaker.
|
||||
const (
|
||||
StateClosed State = iota
|
||||
StateHalfOpen
|
||||
StateOpen
|
||||
)
|
||||
|
||||
var (
|
||||
// ErrTooManyRequests is returned when the CB state is half open and the requests count is over the cb maxRequests
|
||||
ErrTooManyRequests = errors.New("too many requests")
|
||||
// ErrOpenState is returned when the CB state is open
|
||||
ErrOpenState = errors.New("circuit breaker is open")
|
||||
)
|
||||
|
||||
// String implements stringer interface.
|
||||
func (s State) String() string {
|
||||
switch s {
|
||||
case StateClosed:
|
||||
return "closed"
|
||||
case StateHalfOpen:
|
||||
return "half-open"
|
||||
case StateOpen:
|
||||
return "open"
|
||||
default:
|
||||
return fmt.Sprintf("unknown state: %d", s)
|
||||
}
|
||||
}
|
||||
|
||||
// Counts holds the numbers of requests and their successes/failures.
|
||||
// CircuitBreaker clears the internal Counts either
|
||||
// on the change of the state or at the closed-state intervals.
|
||||
// Counts ignores the results of the requests sent before clearing.
|
||||
type Counts struct {
|
||||
Requests uint32
|
||||
TotalSuccesses uint32
|
||||
TotalFailures uint32
|
||||
ConsecutiveSuccesses uint32
|
||||
ConsecutiveFailures uint32
|
||||
}
|
||||
|
||||
func (c *Counts) onRequest() {
|
||||
c.Requests++
|
||||
}
|
||||
|
||||
func (c *Counts) onSuccess() {
|
||||
c.TotalSuccesses++
|
||||
c.ConsecutiveSuccesses++
|
||||
c.ConsecutiveFailures = 0
|
||||
}
|
||||
|
||||
func (c *Counts) onFailure() {
|
||||
c.TotalFailures++
|
||||
c.ConsecutiveFailures++
|
||||
c.ConsecutiveSuccesses = 0
|
||||
}
|
||||
|
||||
func (c *Counts) clear() {
|
||||
c.Requests = 0
|
||||
c.TotalSuccesses = 0
|
||||
c.TotalFailures = 0
|
||||
c.ConsecutiveSuccesses = 0
|
||||
c.ConsecutiveFailures = 0
|
||||
}
|
||||
|
||||
// Settings configures CircuitBreaker:
|
||||
//
|
||||
// Name is the name of the CircuitBreaker.
|
||||
//
|
||||
// MaxRequests is the maximum number of requests allowed to pass through
|
||||
// when the CircuitBreaker is half-open.
|
||||
// If MaxRequests is 0, the CircuitBreaker allows only 1 request.
|
||||
//
|
||||
// Interval is the cyclic period of the closed state
|
||||
// for the CircuitBreaker to clear the internal Counts.
|
||||
// If Interval is less than or equal to 0, the CircuitBreaker doesn't clear internal Counts during the closed state.
|
||||
//
|
||||
// Timeout is the period of the open state,
|
||||
// after which the state of the CircuitBreaker becomes half-open.
|
||||
// If Timeout is less than or equal to 0, the timeout value of the CircuitBreaker is set to 60 seconds.
|
||||
//
|
||||
// ReadyToTrip is called with a copy of Counts whenever a request fails in the closed state.
|
||||
// If ReadyToTrip returns true, the CircuitBreaker will be placed into the open state.
|
||||
// If ReadyToTrip is nil, default ReadyToTrip is used.
|
||||
// Default ReadyToTrip returns true when the number of consecutive failures is more than 5.
|
||||
//
|
||||
// OnStateChange is called whenever the state of the CircuitBreaker changes.
|
||||
//
|
||||
// IsSuccessful is called with the error returned from a request.
|
||||
// If IsSuccessful returns true, the error is counted as a success.
|
||||
// Otherwise the error is counted as a failure.
|
||||
// If IsSuccessful is nil, default IsSuccessful is used, which returns false for all non-nil errors.
|
||||
type Settings struct {
|
||||
Name string
|
||||
MaxRequests uint32
|
||||
Interval time.Duration
|
||||
Timeout time.Duration
|
||||
ReadyToTrip func(counts Counts) bool
|
||||
OnStateChange func(name string, from State, to State)
|
||||
IsSuccessful func(err error) bool
|
||||
}
|
||||
|
||||
// CircuitBreaker is a state machine to prevent sending requests that are likely to fail.
|
||||
type CircuitBreaker struct {
|
||||
name string
|
||||
maxRequests uint32
|
||||
interval time.Duration
|
||||
timeout time.Duration
|
||||
readyToTrip func(counts Counts) bool
|
||||
isSuccessful func(err error) bool
|
||||
onStateChange func(name string, from State, to State)
|
||||
|
||||
mutex sync.Mutex
|
||||
state State
|
||||
generation uint64
|
||||
counts Counts
|
||||
expiry time.Time
|
||||
}
|
||||
|
||||
// TwoStepCircuitBreaker is like CircuitBreaker but instead of surrounding a function
|
||||
// with the breaker functionality, it only checks whether a request can proceed and
|
||||
// expects the caller to report the outcome in a separate step using a callback.
|
||||
type TwoStepCircuitBreaker struct {
|
||||
cb *CircuitBreaker
|
||||
}
|
||||
|
||||
// NewCircuitBreaker returns a new CircuitBreaker configured with the given Settings.
|
||||
func NewCircuitBreaker(st Settings) *CircuitBreaker {
|
||||
cb := new(CircuitBreaker)
|
||||
|
||||
cb.name = st.Name
|
||||
cb.onStateChange = st.OnStateChange
|
||||
|
||||
if st.MaxRequests == 0 {
|
||||
cb.maxRequests = 1
|
||||
} else {
|
||||
cb.maxRequests = st.MaxRequests
|
||||
}
|
||||
|
||||
if st.Interval <= 0 {
|
||||
cb.interval = defaultInterval
|
||||
} else {
|
||||
cb.interval = st.Interval
|
||||
}
|
||||
|
||||
if st.Timeout <= 0 {
|
||||
cb.timeout = defaultTimeout
|
||||
} else {
|
||||
cb.timeout = st.Timeout
|
||||
}
|
||||
|
||||
if st.ReadyToTrip == nil {
|
||||
cb.readyToTrip = defaultReadyToTrip
|
||||
} else {
|
||||
cb.readyToTrip = st.ReadyToTrip
|
||||
}
|
||||
|
||||
if st.IsSuccessful == nil {
|
||||
cb.isSuccessful = defaultIsSuccessful
|
||||
} else {
|
||||
cb.isSuccessful = st.IsSuccessful
|
||||
}
|
||||
|
||||
cb.toNewGeneration(time.Now())
|
||||
|
||||
return cb
|
||||
}
|
||||
|
||||
// NewTwoStepCircuitBreaker returns a new TwoStepCircuitBreaker configured with the given Settings.
|
||||
func NewTwoStepCircuitBreaker(st Settings) *TwoStepCircuitBreaker {
|
||||
return &TwoStepCircuitBreaker{
|
||||
cb: NewCircuitBreaker(st),
|
||||
}
|
||||
}
|
||||
|
||||
const defaultInterval = time.Duration(0) * time.Second
|
||||
const defaultTimeout = time.Duration(60) * time.Second
|
||||
|
||||
func defaultReadyToTrip(counts Counts) bool {
|
||||
return counts.ConsecutiveFailures > 5
|
||||
}
|
||||
|
||||
func defaultIsSuccessful(err error) bool {
|
||||
return err == nil
|
||||
}
|
||||
|
||||
// Name returns the name of the CircuitBreaker.
|
||||
func (cb *CircuitBreaker) Name() string {
|
||||
return cb.name
|
||||
}
|
||||
|
||||
// State returns the current state of the CircuitBreaker.
|
||||
func (cb *CircuitBreaker) State() State {
|
||||
cb.mutex.Lock()
|
||||
defer cb.mutex.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
state, _ := cb.currentState(now)
|
||||
return state
|
||||
}
|
||||
|
||||
// Counts returns internal counters
|
||||
func (cb *CircuitBreaker) Counts() Counts {
|
||||
cb.mutex.Lock()
|
||||
defer cb.mutex.Unlock()
|
||||
|
||||
return cb.counts
|
||||
}
|
||||
|
||||
// Execute runs the given request if the CircuitBreaker accepts it.
|
||||
// Execute returns an error instantly if the CircuitBreaker rejects the request.
|
||||
// Otherwise, Execute returns the result of the request.
|
||||
// If a panic occurs in the request, the CircuitBreaker handles it as an error
|
||||
// and causes the same panic again.
|
||||
func (cb *CircuitBreaker) Execute(req func() (interface{}, error)) (interface{}, error) {
|
||||
generation, err := cb.beforeRequest()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
e := recover()
|
||||
if e != nil {
|
||||
cb.afterRequest(generation, false)
|
||||
panic(e)
|
||||
}
|
||||
}()
|
||||
|
||||
result, err := req()
|
||||
cb.afterRequest(generation, cb.isSuccessful(err))
|
||||
return result, err
|
||||
}
|
||||
|
||||
// Name returns the name of the TwoStepCircuitBreaker.
|
||||
func (tscb *TwoStepCircuitBreaker) Name() string {
|
||||
return tscb.cb.Name()
|
||||
}
|
||||
|
||||
// State returns the current state of the TwoStepCircuitBreaker.
|
||||
func (tscb *TwoStepCircuitBreaker) State() State {
|
||||
return tscb.cb.State()
|
||||
}
|
||||
|
||||
// Counts returns internal counters
|
||||
func (tscb *TwoStepCircuitBreaker) Counts() Counts {
|
||||
return tscb.cb.Counts()
|
||||
}
|
||||
|
||||
// Allow checks if a new request can proceed. It returns a callback that should be used to
|
||||
// register the success or failure in a separate step. If the circuit breaker doesn't allow
|
||||
// requests, it returns an error.
|
||||
func (tscb *TwoStepCircuitBreaker) Allow() (done func(success bool), err error) {
|
||||
generation, err := tscb.cb.beforeRequest()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return func(success bool) {
|
||||
tscb.cb.afterRequest(generation, success)
|
||||
}, nil
|
||||
}
|
||||
|
||||
func (cb *CircuitBreaker) beforeRequest() (uint64, error) {
|
||||
cb.mutex.Lock()
|
||||
defer cb.mutex.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
state, generation := cb.currentState(now)
|
||||
|
||||
if state == StateOpen {
|
||||
return generation, ErrOpenState
|
||||
} else if state == StateHalfOpen && cb.counts.Requests >= cb.maxRequests {
|
||||
return generation, ErrTooManyRequests
|
||||
}
|
||||
|
||||
cb.counts.onRequest()
|
||||
return generation, nil
|
||||
}
|
||||
|
||||
func (cb *CircuitBreaker) afterRequest(before uint64, success bool) {
|
||||
cb.mutex.Lock()
|
||||
defer cb.mutex.Unlock()
|
||||
|
||||
now := time.Now()
|
||||
state, generation := cb.currentState(now)
|
||||
if generation != before {
|
||||
return
|
||||
}
|
||||
|
||||
if success {
|
||||
cb.onSuccess(state, now)
|
||||
} else {
|
||||
cb.onFailure(state, now)
|
||||
}
|
||||
}
|
||||
|
||||
func (cb *CircuitBreaker) onSuccess(state State, now time.Time) {
|
||||
switch state {
|
||||
case StateClosed:
|
||||
cb.counts.onSuccess()
|
||||
case StateHalfOpen:
|
||||
cb.counts.onSuccess()
|
||||
if cb.counts.ConsecutiveSuccesses >= cb.maxRequests {
|
||||
cb.setState(StateClosed, now)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (cb *CircuitBreaker) onFailure(state State, now time.Time) {
|
||||
switch state {
|
||||
case StateClosed:
|
||||
cb.counts.onFailure()
|
||||
if cb.readyToTrip(cb.counts) {
|
||||
cb.setState(StateOpen, now)
|
||||
}
|
||||
case StateHalfOpen:
|
||||
cb.setState(StateOpen, now)
|
||||
}
|
||||
}
|
||||
|
||||
func (cb *CircuitBreaker) currentState(now time.Time) (State, uint64) {
|
||||
switch cb.state {
|
||||
case StateClosed:
|
||||
if !cb.expiry.IsZero() && cb.expiry.Before(now) {
|
||||
cb.toNewGeneration(now)
|
||||
}
|
||||
case StateOpen:
|
||||
if cb.expiry.Before(now) {
|
||||
cb.setState(StateHalfOpen, now)
|
||||
}
|
||||
}
|
||||
return cb.state, cb.generation
|
||||
}
|
||||
|
||||
func (cb *CircuitBreaker) setState(state State, now time.Time) {
|
||||
if cb.state == state {
|
||||
return
|
||||
}
|
||||
|
||||
prev := cb.state
|
||||
cb.state = state
|
||||
|
||||
cb.toNewGeneration(now)
|
||||
|
||||
if cb.onStateChange != nil {
|
||||
cb.onStateChange(cb.name, prev, state)
|
||||
}
|
||||
}
|
||||
|
||||
func (cb *CircuitBreaker) toNewGeneration(now time.Time) {
|
||||
cb.generation++
|
||||
cb.counts.clear()
|
||||
|
||||
var zero time.Time
|
||||
switch cb.state {
|
||||
case StateClosed:
|
||||
if cb.interval == 0 {
|
||||
cb.expiry = zero
|
||||
} else {
|
||||
cb.expiry = now.Add(cb.interval)
|
||||
}
|
||||
case StateOpen:
|
||||
cb.expiry = now.Add(cb.timeout)
|
||||
default: // StateHalfOpen
|
||||
cb.expiry = zero
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user