WIP: raft leadership

2025-10-05 16:07:07 +08:00 · 2023-04-13 21:44:24 +02:00
parent b6a9fa7965
commit 7643959bf8
5 changed files with 442 additions and 76 deletions
--- a/app/api/api.go
+++ b/app/api/api.go
@@ -622,6 +622,7 @@ func (a *api) start() error {

 	a.restream = restream

+	if cfg.Cluster.Enable {
 		if cluster, err := cluster.New(cluster.ClusterConfig{
 			ID:        cfg.ID,
 			Name:      cfg.Name,
@@ -633,6 +634,7 @@ func (a *api) start() error {
 		} else {
 			a.cluster = cluster
 		}
+	}

 	var httpjwt jwt.JWT

@@ -1318,7 +1320,8 @@ func (a *api) stop() {
 	}

 	if a.cluster != nil {
-		a.cluster.Stop()
+		a.cluster.Leave()
+		a.cluster.Shutdown()
 	}

 	// Stop JWT authentication
--- a/cluster/cluster.go
+++ b/cluster/cluster.go
@@ -5,13 +5,13 @@ import (
 	"errors"
 	"fmt"
 	"io"
+	gonet "net"
 	"path/filepath"
 	"sync"
 	"time"

 	"github.com/datarhei/core/v16/log"
 	"github.com/datarhei/core/v16/net"
-
 	hclog "github.com/hashicorp/go-hclog"
 	"github.com/hashicorp/raft"
 	raftboltdb "github.com/hashicorp/raft-boltdb/v2"
@@ -44,7 +44,8 @@ type Cluster interface {
 	RemoveNode(id string) error
 	ListNodes() []NodeReader
 	GetNode(id string) (NodeReader, error)
-	Stop()
+	Leave() error // gracefully leave the cluster
+	Shutdown() error
 	ClusterReader
 }

@@ -75,6 +76,21 @@ type cluster struct {
 	once   sync.Once

 	logger log.Logger
+
+	raft                  *raft.Raft
+	raftTransport         *raft.NetworkTransport
+	raftAddress           string
+	raftNotifyCh          <-chan bool
+	raftStore             *raftboltdb.BoltStore
+	raftRemoveGracePeriod time.Duration
+
+	reassertLeaderCh chan chan error
+
+	leaveCh chan struct{}
+
+	shutdown     bool
+	shutdownCh   chan struct{}
+	shutdownLock sync.Mutex
 }

 func New(config ClusterConfig) (Cluster, error) {
@@ -89,6 +105,10 @@ func New(config ClusterConfig) (Cluster, error) {
 		limiter:  config.IPLimiter,
 		updates:  make(chan NodeState, 64),
 		logger:   config.Logger,
+
+		reassertLeaderCh: make(chan chan error),
+		leaveCh:          make(chan struct{}),
+		shutdownCh:       make(chan struct{}),
 	}

 	if c.limiter == nil {
@@ -104,62 +124,12 @@ func New(config ClusterConfig) (Cluster, error) {
 		return nil, err
 	}

-	snapshotLogger := NewLogger(c.logger.WithComponent("raft"), hclog.Debug).Named("snapshot")
-	snapShotStore, err := raft.NewFileSnapshotStoreWithLogger(filepath.Join(c.path, "snapshots"), 10, snapshotLogger)
-	if err != nil {
-		return nil, err
-	}
+	c.startRaft(fsm, true, false)

-	boltdb, err := raftboltdb.New(raftboltdb.Options{
-		Path: filepath.Join(c.path, "store.db"),
-		BoltOptions: &bbolt.Options{
-			Timeout: 5 * time.Second,
-		},
-	})
-	if err != nil {
-		return nil, err
-	}
-
-	boltdb.Stats()
-
-	raftConfig := raft.DefaultConfig()
-	raftConfig.Logger = NewLogger(c.logger.WithComponent("raft"), hclog.Debug)
-
-	raftTransport, err := raft.NewTCPTransportWithConfig("127.0.0.1:8090", nil, &raft.NetworkTransportConfig{
-		ServerAddressProvider: nil,
-		Logger:                NewLogger(c.logger.WithComponent("raft"), hclog.Debug).Named("transport"),
-		Stream:                &raft.TCPStreamLayer{},
-		MaxPool:               5,
-		Timeout:               5 * time.Second,
-	})
-	if err != nil {
-		boltdb.Close()
-		return nil, err
-	}
-
-	node, err := raft.NewRaft(raftConfig, fsm, boltdb, boltdb, snapShotStore, raftTransport)
-	if err != nil {
-		boltdb.Close()
-		return nil, err
-	}
-
-	node.BootstrapCluster(raft.Configuration{
-		Servers: []raft.Server{
-			{
-				Suffrage: raft.Voter,
-				ID:       raft.ServerID(config.Name),
-				Address:  raftTransport.LocalAddr(),
-			},
-		},
-	})
-
-	ctx, cancel := context.WithCancel(context.Background())
-	c.cancel = cancel
-
-	go func(ctx context.Context) {
+	go func() {
 		for {
 			select {
-			case <-ctx.Done():
+			case <-c.shutdownCh:
 				return
 			case state := <-c.updates:
 				c.logger.Debug().WithFields(log.Fields{
@@ -190,13 +160,23 @@ func New(config ClusterConfig) (Cluster, error) {
 				c.lock.Unlock()
 			}
 		}
-	}(ctx)
+	}()

 	return c, nil
 }

-func (c *cluster) Stop() {
-	c.once.Do(func() {
+func (c *cluster) Shutdown() error {
+	c.logger.Info().Log("shutting down cluster")
+	c.shutdownLock.Lock()
+	defer c.shutdownLock.Unlock()
+
+	if c.shutdown {
+		return nil
+	}
+
+	c.shutdown = true
+	close(c.shutdownCh)
+
 	c.lock.Lock()
 	defer c.lock.Unlock()

@@ -206,8 +186,77 @@ func (c *cluster) Stop() {

 	c.nodes = map[string]*node{}

-		c.cancel()
-	})
+	c.shutdownRaft()
+
+	return nil
+}
+
+// Leave tries to take this node out of the raft cluster gracefully. If we are
+// the leader of a cluster with more than one server, leadership is transferred
+// first or, failing that, we remove ourself from the raft configuration. As a
+// non-leader we wait up to raftRemoveGracePeriod to disappear from the raft
+// configuration.
+//
+// Only a failure to read the initial raft configuration is returned as an
+// error. Running into the grace period is logged as a warning. Leave does
+// nothing if raft is not running.
+// https://github.com/hashicorp/consul/blob/44b39240a86bc94ddc67bc105286ab450bd869a9/agent/consul/server.go#L1369
+func (c *cluster) Leave() error {
+	// Raft may not be available, e.g. because starting it failed. There is
+	// nothing to leave in that case.
+	if c.raft == nil {
+		return nil
+	}
+
+	// We look for ourself by server ID, the same ID that is handed to
+	// RemoveServer below. This avoids depending on the raft transport.
+	localID := raft.ServerID(c.id)
+
+	// Get the latest configuration.
+	future := c.raft.GetConfiguration()
+	if err := future.Error(); err != nil {
+		c.logger.Error().WithError(err).Log("failed to get raft configuration")
+		return err
+	}
+
+	numPeers := len(future.Configuration().Servers)
+
+	// If we are the current leader, and we have any other peers (cluster has multiple
+	// servers), we should do a RemoveServer/RemovePeer to safely reduce the quorum size.
+	// If we are not the leader, then we should issue our leave intention and wait to be
+	// removed for some reasonable period of time.
+	isLeader := c.IsLeader()
+	if isLeader && numPeers > 1 {
+		if err := c.leadershipTransfer(); err == nil {
+			isLeader = false
+		} else {
+			removal := c.raft.RemoveServer(localID, 0, 0)
+			if err := removal.Error(); err != nil {
+				c.logger.Error().WithError(err).Log("failed to remove ourself as raft peer")
+			}
+		}
+	}
+
+	// If we were not leader, wait to be safely removed from the cluster. We
+	// must wait to allow the raft replication to take place, otherwise an
+	// immediate shutdown could cause a loss of quorum.
+	if !isLeader {
+		left := false
+		limit := time.Now().Add(c.raftRemoveGracePeriod)
+		for !left && time.Now().Before(limit) {
+			// Sleep a while before we check.
+			time.Sleep(50 * time.Millisecond)
+
+			// Get the latest configuration.
+			latest := c.raft.GetConfiguration()
+			if err := latest.Error(); err != nil {
+				c.logger.Error().WithError(err).Log("failed to get raft configuration")
+				break
+			}
+
+			// See if we are no longer included.
+			left = true
+			for _, server := range latest.Configuration().Servers {
+				if server.ID == localID {
+					left = false
+					break
+				}
+			}
+		}
+
+		if !left {
+			c.logger.Warn().Log("failed to leave raft configuration gracefully, timeout")
+		}
+	}
+
+	return nil
+}
+
+// IsLeader reports whether this node is currently the raft leader. It returns
+// false if raft is not running, e.g. because starting it failed.
+func (c *cluster) IsLeader() bool {
+	if c.raft == nil {
+		return false
+	}
+
+	return c.raft.State() == raft.Leader
+}

 func (c *cluster) AddNode(address, username, password string) (string, error) {
@@ -371,3 +420,112 @@ func (c *cluster) GetFile(path string) (io.ReadCloser, error) {

 	return data, nil
 }
+
+// startRaft creates the raft transport, the snapshot store, the log and stable
+// store and finally the raft node itself. On success the node, the transport
+// and (if used) the bolt store are stored on the cluster and the leadership
+// monitor is started.
+//
+// fsm is the state machine handed to raft. If bootstrap is true and no raft
+// state exists yet, a cluster consisting only of this node is bootstrapped. If
+// inmem is true, the log and stable store are kept in memory instead of in a
+// bolt database below c.path. Snapshots are always stored below c.path.
+//
+// If an error is returned, the transport and the bolt store that have been
+// opened up to that point are closed again.
+func (c *cluster) startRaft(fsm raft.FSM, bootstrap, inmem bool) error {
+	defer func() {
+		if c.raft != nil {
+			return
+		}
+
+		// Starting raft failed. Release whatever has been opened so far and
+		// forget about it, such that nobody closes it a second time.
+		if c.raftTransport != nil {
+			c.raftTransport.Close()
+			c.raftTransport = nil
+		}
+
+		if c.raftStore != nil {
+			c.raftStore.Close()
+			c.raftStore = nil
+		}
+	}()
+
+	c.raftRemoveGracePeriod = 5 * time.Second
+
+	addr, err := gonet.ResolveTCPAddr("tcp", c.raftAddress)
+	if err != nil {
+		return err
+	}
+
+	transport, err := raft.NewTCPTransportWithLogger(c.raftAddress, addr, 3, 10*time.Second, NewLogger(c.logger.WithComponent("raft"), hclog.Debug).Named("transport"))
+	if err != nil {
+		return err
+	}
+
+	// Keep the transport. Leave and shutdownRaft need it later on, and the
+	// deferred cleanup above closes it if anything below fails.
+	c.raftTransport = transport
+
+	snapshotLogger := NewLogger(c.logger.WithComponent("raft"), hclog.Debug).Named("snapshot")
+	snapshots, err := raft.NewFileSnapshotStoreWithLogger(filepath.Join(c.path, "snapshots"), 10, snapshotLogger)
+	if err != nil {
+		return err
+	}
+
+	var logStore raft.LogStore
+	var stableStore raft.StableStore
+	if inmem {
+		logStore = raft.NewInmemStore()
+		stableStore = raft.NewInmemStore()
+	} else {
+		bolt, err := raftboltdb.New(raftboltdb.Options{
+			Path: filepath.Join(c.path, "raftlog.db"),
+			BoltOptions: &bbolt.Options{
+				Timeout: 5 * time.Second,
+			},
+		})
+		if err != nil {
+			return fmt.Errorf("bolt: %w", err)
+		}
+
+		// Record the store right away, such that it gets closed by the
+		// deferred cleanup if any of the following steps fail.
+		c.raftStore = bolt
+
+		logStore = bolt
+		stableStore = bolt
+
+		// Put a cache in front of the bolt backed log store.
+		cacheStore, err := raft.NewLogCache(512, logStore)
+		if err != nil {
+			return err
+		}
+		logStore = cacheStore
+	}
+
+	cfg := raft.DefaultConfig()
+	cfg.LocalID = raft.ServerID(c.id)
+	cfg.Logger = NewLogger(c.logger.WithComponent("raft"), hclog.Debug)
+
+	if bootstrap {
+		hasState, err := raft.HasExistingState(logStore, stableStore, snapshots)
+		if err != nil {
+			return err
+		}
+
+		// Only bootstrap if there's no state from a previous run.
+		if !hasState {
+			configuration := raft.Configuration{
+				Servers: []raft.Server{
+					{
+						Suffrage: raft.Voter,
+						ID:       raft.ServerID(c.id),
+						Address:  transport.LocalAddr(),
+					},
+				},
+			}
+
+			if err := raft.BootstrapCluster(cfg,
+				logStore, stableStore, snapshots, transport, configuration); err != nil {
+				return err
+			}
+		}
+	}
+
+	// Set up a channel for reliable leader notifications.
+	raftNotifyCh := make(chan bool, 10)
+	cfg.NotifyCh = raftNotifyCh
+	c.raftNotifyCh = raftNotifyCh
+
+	node, err := raft.NewRaft(cfg, fsm, logStore, stableStore, snapshots, transport)
+	if err != nil {
+		return err
+	}
+
+	c.raft = node
+
+	go c.monitorLeadership()
+
+	return nil
+}
+
+// shutdownRaft closes the raft transport, shuts down the raft node and closes
+// the bolt store. It does nothing if raft is not running. An error from
+// shutting down the raft node is only logged.
+func (c *cluster) shutdownRaft() {
+	if c.raft == nil {
+		return
+	}
+
+	// The transport field may be unset even though raft is running, so don't
+	// call Close on a nil transport.
+	if c.raftTransport != nil {
+		c.raftTransport.Close()
+	}
+
+	future := c.raft.Shutdown()
+	if err := future.Error(); err != nil {
+		c.logger.Warn().WithError(err).Log("error shutting down raft")
+	}
+
+	// There's no bolt store if raft has been started with in-memory stores.
+	if c.raftStore != nil {
+		c.raftStore.Close()
+	}
+}
--- a/cluster/leader.go
+++ b/cluster/leader.go
@@ -0,0 +1,195 @@
+package cluster
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/datarhei/core/v16/log"
+)
+
+// monitorLeadership listens to the raft notify channel in order to find
+// out if we got the leadership or lost it. While we are the leader, the
+// leader loop runs in its own goroutine. It returns on cluster shutdown.
+// https://github.com/hashicorp/consul/blob/44b39240a86bc94ddc67bc105286ab450bd869a9/agent/consul/leader.go#L71
+func (c *cluster) monitorLeadership() {
+	// Raft's own leaderCh is only notified best-effort. Instead we use the
+	// notify channel we configured Raft with. Doing this ensures that we
+	// get all notifications in order, which is required for cleanup and to
+	// ensure we never run multiple leader loops.
+	notifyCh := c.raftNotifyCh
+
+	var (
+		// stopLeaderCh is non-nil while a leader loop is running. Closing
+		// it asks the leader loop to stop.
+		stopLeaderCh chan struct{}
+		running      sync.WaitGroup
+	)
+
+	for {
+		select {
+		case <-c.shutdownCh:
+			return
+		case leader := <-notifyCh:
+			if leader {
+				if stopLeaderCh != nil {
+					c.logger.Error().Log("attempted to start the leader loop while running")
+					continue
+				}
+
+				// Hand the goroutine its own copy of the channel, because
+				// stopLeaderCh is reset once leadership is lost.
+				ch := make(chan struct{})
+				stopLeaderCh = ch
+				running.Add(1)
+				go func() {
+					defer running.Done()
+					c.leaderLoop(ch)
+				}()
+				c.logger.Info().Log("cluster leadership acquired")
+
+				continue
+			}
+
+			if stopLeaderCh == nil {
+				c.logger.Error().Log("attempted to stop the leader loop while not running")
+				continue
+			}
+
+			// Ask the leader loop to stop and wait until it is gone.
+			c.logger.Debug().Log("shutting down leader loop")
+			close(stopLeaderCh)
+			running.Wait()
+			stopLeaderCh = nil
+			c.logger.Info().Log("cluster leadership lost")
+		}
+	}
+}
+
+// leadershipTransfer tries to hand the leadership over to another node, e.g. in
+// order to do a graceful shutdown. It makes up to three attempts and returns nil
+// as soon as one of them succeeds. Every attempt is logged. If all attempts
+// fail, an error is returned.
+// https://github.com/hashicorp/consul/blob/44b39240a86bc94ddc67bc105286ab450bd869a9/agent/consul/leader.go#L122
+func (c *cluster) leadershipTransfer() error {
+	const retryCount = 3
+
+	for attempt := 0; attempt < retryCount; attempt++ {
+		fields := log.Fields{
+			"attempt":     attempt,
+			"retry_limit": retryCount,
+		}
+
+		err := c.raft.LeadershipTransfer().Error()
+		if err == nil {
+			c.logger.Info().WithFields(fields).Log("successfully transferred leadership")
+			return nil
+		}
+
+		c.logger.Error().WithError(err).WithFields(fields).Log("failed to transfer leadership attempt, will retry")
+	}
+
+	return fmt.Errorf("failed to transfer leadership in %d attempts", retryCount)
+}
+
+// leaderLoop runs as long as we are the leader to run various maintenance activities.
+// It returns when stopCh is closed, when the cluster is shut down, or after the
+// leadership has been transferred to another node because establishing it failed.
+// https://github.com/hashicorp/consul/blob/44b39240a86bc94ddc67bc105286ab450bd869a9/agent/consul/leader.go#L146
+func (c *cluster) leaderLoop(stopCh chan struct{}) {
+	// Time between two reconciliations. There's no configuration value for
+	// it on the cluster yet.
+	// NOTE(review): 60s is meant to mirror the default of the Consul code
+	// this has been ported from - confirm and make it configurable.
+	const reconcileInterval = 60 * time.Second
+
+	// establishLeadership expects a context. Derive one that gets cancelled
+	// as soon as stopCh is closed. The deferred cancel makes sure that the
+	// helper goroutine ends together with the leader loop.
+	stopCtx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	go func() {
+		select {
+		case <-stopCh:
+			cancel()
+		case <-stopCtx.Done():
+		}
+	}()
+
+	establishedLeader := false
+RECONCILE:
+	// Setup a reconciliation timer
+	interval := time.After(reconcileInterval)
+
+	// Apply a raft barrier to ensure our FSM is caught up
+	barrier := c.raft.Barrier(time.Minute)
+	if err := barrier.Error(); err != nil {
+		c.logger.Error().WithError(err).Log("failed to wait for barrier")
+		goto WAIT
+	}
+
+	// Check if we need to handle initial leadership actions
+	if !establishedLeader {
+		if err := c.establishLeadership(stopCtx); err != nil {
+			c.logger.Error().WithError(err).Log("failed to establish leadership")
+			// Immediately revoke leadership since we didn't successfully
+			// establish leadership.
+			c.revokeLeadership()
+
+			// attempt to transfer leadership. If successful it is
+			// time to leave the leaderLoop since this node is no
+			// longer the leader. If leadershipTransfer() fails, we
+			// will try to acquire it again after
+			// 5 seconds.
+			if err := c.leadershipTransfer(); err != nil {
+				c.logger.Error().WithError(err).Log("failed to transfer leadership")
+				interval = time.After(5 * time.Second)
+				goto WAIT
+			}
+			return
+		}
+		establishedLeader = true
+		defer c.revokeLeadership()
+	}
+
+WAIT:
+	// Poll the stop channel to give it priority so we don't waste time
+	// trying to perform the other operations if we have been asked to shut
+	// down.
+	select {
+	case <-stopCh:
+		return
+	default:
+	}
+
+	// Periodically reconcile as long as we are the leader,
+	// or re-establish the leadership when asked to reassert it
+	for {
+		select {
+		case <-stopCh:
+			return
+		case <-c.shutdownCh:
+			return
+		case <-interval:
+			goto RECONCILE
+		case errCh := <-c.reassertLeaderCh:
+			// we can get into this state when the initial
+			// establishLeadership has failed as well as the follow
+			// up leadershipTransfer. Afterwards we will be waiting
+			// for the interval to trigger a reconciliation and can
+			// potentially end up here. There is no point to
+			// reassert because this agent was never leader in the
+			// first place.
+			if !establishedLeader {
+				errCh <- fmt.Errorf("leadership has not been established")
+				continue
+			}
+
+			// continue to reassert only if we previously were the
+			// leader, which means revokeLeadership followed by an
+			// establishLeadership().
+			c.revokeLeadership()
+			err := c.establishLeadership(stopCtx)
+			errCh <- err
+
+			// in case establishLeadership failed, we will try to
+			// transfer leadership. At this time raft thinks we are
+			// the leader, but we disagree.
+			if err != nil {
+				if err := c.leadershipTransfer(); err != nil {
+					// establishedLeader was true before,
+					// but it no longer is since it revoked
+					// leadership and Leadership transfer
+					// also failed. Which is why it stays
+					// in the leaderLoop, but now
+					// establishedLeader needs to be set to
+					// false.
+					establishedLeader = false
+					interval = time.After(5 * time.Second)
+					goto WAIT
+				}
+
+				// leadershipTransfer was successful and it is
+				// time to leave the leaderLoop.
+				return
+			}
+
+		}
+	}
+}
+
+// establishLeadership is called by the leader loop in order to set up whatever
+// is required for acting as leader. It is currently a stub that ignores ctx and
+// always returns nil.
+func (c *cluster) establishLeadership(ctx context.Context) error { return nil }
+
+// revokeLeadership is called by the leader loop in order to undo what
+// establishLeadership has set up. It is currently a stub that does nothing.
+func (c *cluster) revokeLeadership() {}
--- a/config/config.go
+++ b/config/config.go
@@ -271,6 +271,11 @@ func (d *Config) init() {
 	d.vars.Register(value.NewStringList(&d.Router.BlockedPrefixes, []string{"/api"}, ","), "router.blocked_prefixes", "CORE_ROUTER_BLOCKED_PREFIXES", nil, "List of path prefixes that can't be routed", false, false)
 	d.vars.Register(value.NewStringMapString(&d.Router.Routes, nil), "router.routes", "CORE_ROUTER_ROUTES", nil, "List of route mappings", false, false)
 	d.vars.Register(value.NewDir(&d.Router.UIPath, "", d.fs), "router.ui_path", "CORE_ROUTER_UI_PATH", nil, "Path to a directory holding UI files mounted as /ui", false, false)
+
+	// Cluster
+	d.vars.Register(value.NewBool(&d.Cluster.Enable, false), "cluster.enable", "CORE_CLUSTER_ENABLE", nil, "Enable cluster mode", false, false)
+	d.vars.Register(value.NewBool(&d.Cluster.Bootstrap, false), "cluster.bootstrap", "CORE_CLUSTER_BOOTSTRAP", nil, "Bootstrap a cluster", false, false)
+	d.vars.Register(value.NewBool(&d.Cluster.Debug, false), "cluster.debug", "CORE_CLUSTER_DEBUG", nil, "Switch to debug mode, not for production", false, false)
 }

 // Validate validates the current state of the Config for completeness and sanity. Errors are
--- a/config/data.go
+++ b/config/data.go
@@ -166,6 +166,11 @@ type Data struct {
 		Routes          map[string]string `json:"routes"`
 		UIPath          string            `json:"ui_path"`
 	} `json:"router"`
+	Cluster struct {
+		Enable    bool `json:"enable"`
+		Bootstrap bool `json:"bootstrap"`
+		Debug     bool `json:"debug"`
+	} `json:"cluster"`
 }

 func UpgradeV2ToV3(d *v2.Data, fs fs.Filesystem) (*Data, error) {