Mirror of https://github.com/datarhei/core.git
Synced 2025-10-06 00:17:07 +08:00
Add auto-removal of nodes if unresponsive
@@ -545,6 +545,7 @@ func (a *api) start(ctx context.Context) error {
 		SyncInterval:           time.Duration(cfg.Cluster.SyncInterval) * time.Second,
 		NodeRecoverTimeout:     time.Duration(cfg.Cluster.NodeRecoverTimeout) * time.Second,
 		EmergencyLeaderTimeout: time.Duration(cfg.Cluster.EmergencyLeaderTimeout) * time.Second,
+		RecoverTimeout:         time.Duration(cfg.Cluster.RecoverTimeout) * time.Second,
 		CoreConfig:             cfg.Clone(),
 		CoreSkills:             a.ffmpeg.Skills(),
 		IPLimiter:              a.sessionsLimiter,
@@ -55,6 +55,7 @@ type Cluster interface {
 	Leave(origin, id string) error              // gracefully remove a node from the cluster
 	TransferLeadership(origin, id string) error // transfer leadership to another node
 	Snapshot(origin string) (io.ReadCloser, error)
+	IsRaftLeader() bool
 	HasRaftLeader() bool

 	ProcessAdd(origin string, config *app.Config) error
@@ -108,6 +109,7 @@ type Config struct {
 	SyncInterval           time.Duration // Interval between aligning the process in the cluster DB with the processes on the nodes
 	NodeRecoverTimeout     time.Duration // Timeout for a node to recover before rebalancing the processes
 	EmergencyLeaderTimeout time.Duration // Timeout for establishing the emergency leadership after lost contact to raft leader
+	RecoverTimeout         time.Duration // Timeout for recovering the cluster if there's no raft leader

 	CoreConfig *config.Config
 	CoreSkills skills.Skills
@@ -127,7 +129,7 @@ type cluster struct {

 	logger log.Logger

-	raft                  raft.Raft
+	raft                  raft.RaftRecoverer
 	raftRemoveGracePeriod time.Duration
 	raftAddress           string
 	raftNotifyCh          chan bool
@@ -137,6 +139,7 @@ type cluster struct {
 	store store.Store

 	cancelLeaderShip   context.CancelFunc
+	cancelFollowerShip context.CancelFunc

 	shutdown   bool
 	shutdownCh chan struct{}
@@ -146,6 +149,7 @@ type cluster struct {
 	syncInterval           time.Duration
 	nodeRecoverTimeout     time.Duration
 	emergencyLeaderTimeout time.Duration
+	recoverTimeout         time.Duration

 	forwarder *forwarder.Forwarder
 	api       API
@@ -161,6 +165,8 @@ type cluster struct {
 	isRaftLeader      bool
 	hasRaftLeader     bool
 	isLeader          bool
+	isEmergencyLeader bool
+	lastLeaderChange  time.Time
 	leaderLock        sync.Mutex

 	isTLSRequired bool
@@ -196,6 +202,7 @@ func New(config Config) (Cluster, error) {
 		syncInterval:           config.SyncInterval,
 		nodeRecoverTimeout:     config.NodeRecoverTimeout,
 		emergencyLeaderTimeout: config.EmergencyLeaderTimeout,
+		recoverTimeout:         config.RecoverTimeout,

 		config: config.CoreConfig,
 		skills: config.CoreSkills,
@@ -324,7 +331,7 @@ func New(config Config) (Cluster, error) {
 	c.raftLeaderObservationCh = make(chan string, 16)
 	c.raftEmergencyNotifyCh = make(chan bool, 16)

-	raft, err := raft.New(raft.Config{
+	raft, err := raft.NewRecoverer(raft.Config{
 		ID:      config.NodeID,
 		Path:    config.Path,
 		Address: config.Address,
@@ -357,11 +364,25 @@ func New(config Config) (Cluster, error) {
 	ticker := time.NewTicker(time.Second)
 	defer ticker.Stop()

+	timer := time.NewTimer(c.nodeRecoverTimeout)
+	defer timer.Stop()
+
 	for {
 		select {
 		case <-c.shutdownCh:
 			return
+		case <-timer.C:
+			c.logger.Warn().WithFields(log.Fields{
+				"peer":    peerAddress,
+				"timeout": c.nodeRecoverTimeout,
+			}).Log("Giving up joining cluster via peer")
+			return
 		case <-ticker.C:
+			if c.HasRaftLeader() {
+				c.logger.Warn().WithField("peer", peerAddress).Log("Stop joining cluster via peer, already joined")
+				return
+			}
+
 			err := c.Join("", c.nodeID, c.raftAddress, peerAddress)
 			if err != nil {
 				c.logger.Warn().WithError(err).Log("Join cluster")
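The join loop above used to retry once per second indefinitely; it now also arms a one-shot timer so a node stops trying after nodeRecoverTimeout. A minimal, self-contained sketch of the same ticker-plus-timer pattern (all names, durations, and the tryJoin stub are placeholders, not code from the repository):

package main

import (
	"errors"
	"fmt"
	"time"
)

// tryJoin stands in for the c.Join call in the loop above.
func tryJoin() error { return errors.New("peer not reachable") }

// joinWithDeadline retries tryJoin on every tick and gives up once the
// one-shot timer fires, mirroring the structure of the loop in the diff.
func joinWithDeadline(retry, deadline time.Duration) error {
	ticker := time.NewTicker(retry)
	defer ticker.Stop()

	timer := time.NewTimer(deadline)
	defer timer.Stop()

	for {
		select {
		case <-timer.C:
			return fmt.Errorf("giving up after %s", deadline)
		case <-ticker.C:
			if err := tryJoin(); err == nil {
				return nil
			}
		}
	}
}

func main() {
	if err := joinWithDeadline(time.Second, 5*time.Second); err != nil {
		fmt.Println(err)
	}
}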
@@ -750,11 +771,11 @@ func (c *cluster) Leave(origin, id string) error {
 		return err
 	}

-	numPeers := len(servers)
+	numServers := len(servers)

 	if id == c.nodeID {
 		// We're going to remove ourselves
-		if numPeers <= 1 {
+		if numServers <= 1 {
 			// Don't do so if we're the only server in the cluster
 			c.logger.Debug().Log("We're the leader without any peers, not doing anything")
 			return nil
@@ -1015,6 +1036,7 @@ func (c *cluster) trackLeaderChanges() {
 			}
 			c.forwarder.SetLeader(leaderAddress)
 			c.leaderLock.Lock()
+			c.lastLeaderChange = time.Now()
 			if len(leaderAddress) == 0 {
 				c.hasRaftLeader = false
 			} else {
@@ -1,7 +1,22 @@
 package cluster

+import (
+	"context"
+	"time"
+
+	"github.com/datarhei/core/v16/cluster/raft"
+)
+
 // followerLoop is run by every follower node in the cluster.
 func (c *cluster) followerLoop(stopCh chan struct{}) {
+	establishedFollower := false
+
+	if !establishedFollower {
+		c.establishFollowership(context.TODO())
+		establishedFollower = true
+		defer c.revokeFollowership()
+	}
+
 	for {
 		select {
 		case <-stopCh:
@@ -11,3 +26,83 @@ func (c *cluster) followerLoop(stopCh chan struct{}) {
 		}
 	}
 }
+
+func (c *cluster) establishFollowership(ctx context.Context) {
+	c.logger.Info().Log("Establishing followership")
+
+	ctx, cancel := context.WithCancel(ctx)
+	c.cancelFollowerShip = cancel
+
+	if c.recoverTimeout > 0 {
+		go c.recoverCluster(ctx, c.syncInterval)
+	}
+}
+
+func (c *cluster) revokeFollowership() {
+	c.logger.Info().Log("Revoking followership")
+
+	if c.cancelFollowerShip != nil {
+		c.cancelFollowerShip()
+		c.cancelFollowerShip = nil
+	}
+}
+
+func (c *cluster) recoverCluster(ctx context.Context, interval time.Duration) {
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			c.leaderLock.Lock()
+			hasLeader := c.hasRaftLeader
+			lastLeaderChange := c.lastLeaderChange
+			c.leaderLock.Unlock()
+
+			uptime := c.raft.Stats().Uptime
+			if uptime < 2*c.recoverTimeout {
+				continue
+			}
+
+			if !hasLeader && c.recoverTimeout > 0 && time.Since(lastLeaderChange) > c.recoverTimeout {
+				peers := []raft.Peer{}
+
+				// find living peers and recover
+				servers, err := c.raft.Servers()
+				if err != nil {
+					break
+				}
+
+				nodes := c.manager.NodeList()
+				for _, node := range nodes {
+					if _, err := node.Status(); err != nil {
+						continue
+					}
+
+					id := node.About().ID
+
+					for _, server := range servers {
+						if server.ID == id && server.ID != c.nodeID {
+							peers = append(peers, raft.Peer{
+								ID:      id,
+								Address: server.Address,
+							})
+						}
+					}
+				}
+
+				c.logger.Warn().WithField("peers", peers).Log("Recovering raft")
+
+				// recover raft with new set of peers
+				err = c.raft.Recover(peers, 2*interval)
+				if err != nil {
+					c.logger.Error().WithError(err).Log("Recovering raft failed, shutting down")
+					c.Shutdown()
+					return
+				}
+			}
+		}
+	}
+}
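recoverCluster only intervenes when three things hold at once: recovery is enabled (recoverTimeout > 0; the config default is 0, i.e. disabled), the raft instance has already been up for at least twice the timeout (so a freshly started or freshly recovered instance gets a chance to elect a leader on its own), and no leader has been observed for longer than the timeout. A hypothetical helper restating that condition (shouldRecover does not exist in the repository):

package cluster

import "time"

// shouldRecover mirrors the checks made in recoverCluster on every tick.
func shouldRecover(hasLeader bool, lastLeaderChange time.Time, uptime, recoverTimeout time.Duration) bool {
	if recoverTimeout <= 0 {
		// Recovery is disabled; cluster.recover_timeout_sec defaults to 0.
		return false
	}
	if uptime < 2*recoverTimeout {
		// Too early: let a young raft instance try to elect a leader first.
		return false
	}
	// Recover only after the cluster has been leaderless for longer than the timeout.
	return !hasLeader && time.Since(lastLeaderChange) > recoverTimeout
}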
@@ -111,6 +111,7 @@ func (c *cluster) monitorLeadership() {
 				c.leaderLock.Lock()
 				c.isRaftLeader = false
 				c.isLeader = false
+				c.isEmergencyLeader = false
 				c.leaderLock.Unlock()
 			} else if notification == NOTIFY_LEADER {
 				if weAreLeaderCh != nil {
@@ -144,6 +145,7 @@ func (c *cluster) monitorLeadership() {
 				c.leaderLock.Lock()
 				c.isRaftLeader = true
 				c.isLeader = true
+				c.isEmergencyLeader = false
 				c.leaderLock.Unlock()
 			} else if notification == NOTIFY_EMERGENCY {
 				if weAreEmergencyLeaderCh != nil {
@@ -177,6 +179,7 @@ func (c *cluster) monitorLeadership() {
 				c.leaderLock.Lock()
 				c.isRaftLeader = false
 				c.isLeader = true
+				c.isEmergencyLeader = true
 				c.leaderLock.Unlock()
 			}
 		case <-c.shutdownCh:
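The three monitorLeadership hunks above make every branch also set isEmergencyLeader, so the flags always land in one of three consistent states. An illustrative summary (the leadershipState type is not part of the code):

package cluster

// leadershipState summarizes the flag combinations written in monitorLeadership.
type leadershipState struct {
	isRaftLeader      bool
	isLeader          bool
	isEmergencyLeader bool
}

var (
	follower        = leadershipState{isRaftLeader: false, isLeader: false, isEmergencyLeader: false}
	raftLeader      = leadershipState{isRaftLeader: true, isLeader: true, isEmergencyLeader: false}
	emergencyLeader = leadershipState{isRaftLeader: false, isLeader: true, isEmergencyLeader: true}
)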
@@ -318,11 +321,38 @@ func (c *cluster) establishLeadership(ctx context.Context, emergency bool) error

 	if !emergency {
 		go c.clearLocks(ctx, time.Minute)
+		go c.clearDeadNodes(ctx, c.syncInterval, c.nodeRecoverTimeout)
 	}

 	return nil
 }

+func (c *cluster) clearDeadNodes(ctx context.Context, interval, nodeRecoverTimeout time.Duration) {
+	ticker := time.NewTicker(interval)
+	defer ticker.Stop()
+
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+			nodes := c.manager.NodeList()
+			for _, node := range nodes {
+				about := node.About()
+				if time.Since(about.SpawnedAt) > nodeRecoverTimeout && time.Since(about.LastContact) > nodeRecoverTimeout {
+					c.logger.Warn().WithFields(log.Fields{
+						"id":          about.ID,
+						"after":       nodeRecoverTimeout,
+						"lastContact": about.LastContact,
+						"spawnedAt":   about.SpawnedAt,
+					}).Log("Removing peer from cluster")
+					c.raft.RemoveServer(about.ID)
+				}
+			}
+		}
+	}
+}
+
 func (c *cluster) revokeLeadership() {
 	c.logger.Info().Log("Revoking leadership")

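clearDeadNodes removes a peer only when both halves of its condition hold: the node object has existed for longer than nodeRecoverTimeout (so a freshly added node that has not been contacted yet is not dropped immediately) and its last successful contact is older than that same timeout. With the default cluster.node_recover_timeout_sec of 120, a node that has been silent for more than two minutes is removed from the raft configuration via RemoveServer. A hypothetical restatement of the check (isDead is not part of the repository; SpawnedAt and LastContact come from the node About struct extended in the changes below):

package cluster

import "time"

// isDead restates the removal criterion used in clearDeadNodes.
func isDead(spawnedAt, lastContact time.Time, nodeRecoverTimeout time.Duration) bool {
	oldEnough := time.Since(spawnedAt) > nodeRecoverTimeout     // not a brand-new node
	unreachable := time.Since(lastContact) > nodeRecoverTimeout // silent for too long
	return oldEnough && unreachable
}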
@@ -20,6 +20,7 @@ type Node struct {
 	address   string
 	ips       []string
 	version   string
+	spawnedAt time.Time

 	node      client.APIClient
 	nodeAbout About
@@ -60,6 +61,7 @@ func New(config Config) *Node {
 		id:        config.ID,
 		address:   config.Address,
 		version:   "0.0.0",
+		spawnedAt: time.Now(),
 		node: client.APIClient{
 			Address: config.Address,
 			Client: &http.Client{
@@ -136,6 +138,7 @@ type About struct {
 	Error     error
 	Core      CoreAbout
 	Resources Resources
+	SpawnedAt time.Time
 }

 type ResourcesGPU struct {
@@ -170,6 +173,7 @@ func (n *Node) About() About {
 		ID:        n.id,
 		Version:   n.version,
 		Address:   n.address,
+		SpawnedAt: n.spawnedAt,
 	}

 	a.Name = n.coreAbout.Name
@@ -85,6 +85,7 @@ type Server struct {
 }

 type Stats struct {
+	Uptime      time.Duration
 	Address     string
 	State       string
 	LastContact time.Duration
@@ -210,6 +211,7 @@ func (r *raft) Servers() ([]Server, error) {

 func (r *raft) Stats() Stats {
 	stats := Stats{
+		Uptime:  time.Since(r.raftStart),
 		Address: r.raftAddress,
 	}

cluster/raft/recovery.go (new file, 136 lines)
@@ -0,0 +1,136 @@
+package raft
+
+import (
+	"io"
+	"sync"
+	"time"
+)
+
+type RaftRecoverer interface {
+	Raft
+
+	Recover([]Peer, time.Duration) error
+}
+
+type raftRecoverer struct {
+	raft   Raft
+	config Config
+
+	lock sync.RWMutex
+}
+
+func NewRecoverer(config Config) (RaftRecoverer, error) {
+	r := &raftRecoverer{
+		config: config,
+	}
+
+	raft, err := New(config)
+	if err != nil {
+		return nil, err
+	}
+
+	r.raft = raft
+
+	return r, nil
+}
+
+func (r *raftRecoverer) Recover(peers []Peer, cooldown time.Duration) error {
+	r.lock.Lock()
+	defer r.lock.Unlock()
+
+	r.raft.Shutdown()
+
+	if r.config.Logger != nil {
+		r.config.Logger.Warn().WithField("duration", cooldown).Log("Cooling down")
+	}
+
+	time.Sleep(cooldown)
+
+	r.config.Peers = peers
+
+	raft, err := New(r.config)
+	if err != nil {
+		return err
+	}
+
+	r.raft = raft
+
+	return nil
+}
+
+func (r *raftRecoverer) Shutdown() {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+
+	r.raft.Shutdown()
+}
+
+func (r *raftRecoverer) IsLeader() bool {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+
+	return r.raft.IsLeader()
+}
+
+func (r *raftRecoverer) Leader() (string, string) {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+
+	return r.raft.Leader()
+}
+
+func (r *raftRecoverer) Servers() ([]Server, error) {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+
+	return r.raft.Servers()
+}
+
+func (r *raftRecoverer) Stats() Stats {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+
+	return r.raft.Stats()
+}
+
+func (r *raftRecoverer) Apply(data []byte) error {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+
+	return r.raft.Apply(data)
+}
+
+func (r *raftRecoverer) Barrier(timeout time.Duration) error {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+
+	return r.raft.Barrier(timeout)
+}
+
+func (r *raftRecoverer) AddServer(id, address string) error {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+
+	return r.raft.AddServer(id, address)
+}
+
+func (r *raftRecoverer) RemoveServer(id string) error {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+
+	return r.raft.RemoveServer(id)
+}
+
+func (r *raftRecoverer) LeadershipTransfer(id string) error {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+
+	return r.raft.LeadershipTransfer(id)
+}
+
+func (r *raftRecoverer) Snapshot() (io.ReadCloser, error) {
+	r.lock.RLock()
+	defer r.lock.RUnlock()
+
+	return r.raft.Snapshot()
+}
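A minimal sketch of how the new recoverer is used, based only on the calls visible in this commit; the Config fields shown are the ones that appear in the diff (the real Config has more fields), and the IDs, paths, and addresses are placeholders:

package main

import (
	"time"

	"github.com/datarhei/core/v16/cluster/raft"
)

func example() error {
	// NewRecoverer wraps a regular raft instance; all Raft methods are
	// forwarded under a read lock so they keep working across a Recover call.
	r, err := raft.NewRecoverer(raft.Config{
		ID:      "node1",          // placeholder
		Path:    "/data/raft",     // placeholder
		Address: "127.0.0.1:8090", // placeholder
	})
	if err != nil {
		return err
	}

	// When the follower loop decides the cluster has been leaderless for too
	// long, it calls Recover with the peers that are still reachable. Recover
	// shuts the current instance down, sleeps for the cooldown, and starts a
	// fresh instance with the reduced peer set.
	peers := []raft.Peer{{ID: "node2", Address: "127.0.0.1:8091"}}
	return r.Recover(peers, 10*time.Second)
}

func main() { _ = example() }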
@@ -342,6 +342,7 @@ func (d *Config) init() {
 	d.vars.Register(value.NewInt64(&d.Cluster.SyncInterval, 5), "cluster.sync_interval_sec", "CORE_CLUSTER_SYNC_INTERVAL_SEC", nil, "Interval between aligning the process in the cluster DB with the processes on the nodes", true, false)
 	d.vars.Register(value.NewInt64(&d.Cluster.NodeRecoverTimeout, 120), "cluster.node_recover_timeout_sec", "CORE_CLUSTER_NODE_RECOVER_TIMEOUT_SEC", nil, "Timeout for a node to recover before rebalancing the processes", true, false)
 	d.vars.Register(value.NewInt64(&d.Cluster.EmergencyLeaderTimeout, 10), "cluster.emergency_leader_timeout_sec", "CORE_CLUSTER_EMERGENCY_LEADER_TIMEOUT_SEC", nil, "Timeout for establishing the emergency leadership after lost contact to raft leader", true, false)
+	d.vars.Register(value.NewInt64(&d.Cluster.RecoverTimeout, 0), "cluster.recover_timeout_sec", "CORE_CLUSTER_RECOVER_TIMEOUT_SEC", nil, "Timeout for recovering the cluster if no raft leader can be elected", false, false)
 	d.vars.Register(value.NewBool(&d.Cluster.Debug.DisableFFmpegCheck, false), "cluster.debug.disable_ffmpeg_check", "CORE_CLUSTER_DEBUG_DISABLE_FFMPEG_CHECK", nil, "Disable checking for identical FFmpeg versions on all nodes", false, false)
 }

@@ -199,6 +199,7 @@ type Data struct {
 	SyncInterval           int64 `json:"sync_interval_sec" format:"int64"`            // seconds
 	NodeRecoverTimeout     int64 `json:"node_recover_timeout_sec" format:"int64"`     // seconds
 	EmergencyLeaderTimeout int64 `json:"emergency_leader_timeout_sec" format:"int64"` // seconds
+	RecoverTimeout         int64 `json:"revocer_timeout_sec" format:"int64"`          // seconds
 	Debug                  struct {
 		DisableFFmpegCheck bool `json:"disable_ffmpeg_check"`
 	} `json:"debug"`