// core/cluster/leader.go
package cluster
import (
"bytes"
"context"
"encoding/json"
"errors"
"fmt"
"sort"
"sync"
"time"
"github.com/datarhei/core/v16/cluster/proxy"
"github.com/datarhei/core/v16/cluster/store"
"github.com/datarhei/core/v16/log"
"github.com/datarhei/core/v16/maps"
"github.com/datarhei/core/v16/restream/app"
)
const (
NOTIFY_FOLLOWER = 0
NOTIFY_LEADER = 1
NOTIFY_EMERGENCY = 2
)
func (c *cluster) monitorLeadership() {
// We use the notify channel we configured Raft with, NOT Raft's
// leaderCh, which is only notified best-effort. Doing this ensures
// that we get all notifications in order, which is required for
// cleanup and to ensure we never run multiple leader loops.
notifyCh := make(chan int, 10)
var notifyLoop sync.WaitGroup
notifyLoop.Add(1)
go func() {
raftNotifyCh := c.raftNotifyCh
raftEmergencyNotifyCh := c.raftEmergencyNotifyCh
isRaftLeader := false
notifyCh <- NOTIFY_FOLLOWER
notifyLoop.Done()
for {
select {
case isEmergencyLeader := <-raftEmergencyNotifyCh:
if isEmergencyLeader {
isRaftLeader = false
notifyCh <- NOTIFY_EMERGENCY
} else {
if !isRaftLeader {
notifyCh <- NOTIFY_FOLLOWER
}
}
case isLeader := <-raftNotifyCh:
if isLeader {
isRaftLeader = true
notifyCh <- NOTIFY_LEADER
} else {
isRaftLeader = false
notifyCh <- NOTIFY_FOLLOWER
}
case <-c.shutdownCh:
return
}
}
}()
notifyLoop.Wait()
var weAreLeaderCh chan struct{}
var weAreEmergencyLeaderCh chan struct{}
var weAreFollowerCh chan struct{}
var leaderLoop sync.WaitGroup
var emergencyLeaderLoop sync.WaitGroup
var followerLoop sync.WaitGroup
for {
select {
case notification := <-notifyCh:
if notification == NOTIFY_FOLLOWER {
if weAreFollowerCh != nil {
// we are already follower, don't do anything
continue
}
// shutdown any leader and emergency loop
if weAreLeaderCh != nil {
c.logger.Debug().Log("Shutting down leader loop")
close(weAreLeaderCh)
leaderLoop.Wait()
weAreLeaderCh = nil
}
if weAreEmergencyLeaderCh != nil {
c.logger.Debug().Log("Shutting down emergency leader loop")
close(weAreEmergencyLeaderCh)
emergencyLeaderLoop.Wait()
weAreEmergencyLeaderCh = nil
}
weAreFollowerCh = make(chan struct{})
followerLoop.Add(1)
go func(ch chan struct{}) {
defer followerLoop.Done()
c.followerLoop(ch)
}(weAreFollowerCh)
c.logger.Info().Log("Cluster followship acquired")
c.leaderLock.Lock()
c.isRaftLeader = false
c.isLeader = false
c.leaderLock.Unlock()
} else if notification == NOTIFY_LEADER {
if weAreLeaderCh != nil {
// we are already leader, don't do anything
continue
}
// shutdown any follower and emergency loop
if weAreFollowerCh != nil {
c.logger.Debug().Log("Shutting down follower loop")
close(weAreFollowerCh)
followerLoop.Wait()
weAreFollowerCh = nil
}
if weAreEmergencyLeaderCh != nil {
c.logger.Debug().Log("Shutting down emergency leader loop")
close(weAreEmergencyLeaderCh)
emergencyLeaderLoop.Wait()
weAreEmergencyLeaderCh = nil
}
weAreLeaderCh = make(chan struct{})
leaderLoop.Add(1)
go func(ch chan struct{}) {
defer leaderLoop.Done()
c.leaderLoop(ch, false)
}(weAreLeaderCh)
c.logger.Info().Log("Cluster leadership acquired")
c.leaderLock.Lock()
c.isRaftLeader = true
c.isLeader = true
c.leaderLock.Unlock()
} else if notification == NOTIFY_EMERGENCY {
if weAreEmergencyLeaderCh != nil {
// we are already emergency leader, don't do anything
continue
}
// shutdown any follower and leader loop
if weAreFollowerCh != nil {
c.logger.Debug().Log("Shutting down follower loop")
close(weAreFollowerCh)
followerLoop.Wait()
weAreFollowerCh = nil
}
if weAreLeaderCh != nil {
c.logger.Debug().Log("Shutting down leader loop")
close(weAreLeaderCh)
leaderLoop.Wait()
weAreLeaderCh = nil
}
weAreEmergencyLeaderCh = make(chan struct{})
emergencyLeaderLoop.Add(1)
go func(ch chan struct{}) {
defer emergencyLeaderLoop.Done()
c.leaderLoop(ch, true)
}(weAreEmergencyLeaderCh)
c.logger.Info().Log("Cluster emergency leadership acquired")
c.leaderLock.Lock()
c.isRaftLeader = false
c.isLeader = true
c.leaderLock.Unlock()
}
case <-c.shutdownCh:
return
}
}
}
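
// The notifications map onto the loops as follows (derived from the code above):
//
//	notification      loop started       isRaftLeader  isLeader
//	NOTIFY_FOLLOWER   followerLoop       false         false
//	NOTIFY_LEADER     leaderLoop(false)  true          true
//	NOTIFY_EMERGENCY  leaderLoop(true)   false         true
//
// At most one of these loops runs at any time: the channel of the previous loop is
// closed and its WaitGroup drained before the next loop is started.
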
// leadershipTransfer tries to transfer the leadership to another node, e.g. in order
// to do a graceful shutdown.
func (c *cluster) leadershipTransfer(id string) error {
if id == c.id {
return nil
}
retryCount := 3
for i := 0; i < retryCount; i++ {
err := c.raft.LeadershipTransfer(id)
if err != nil {
c.logger.Error().WithError(err).WithFields(log.Fields{
"attempt": i,
"retry_limit": retryCount,
}).Log("Transfer leadership attempt, will retry")
} else {
c.logger.Info().WithFields(log.Fields{
"attempt": i,
"retry_limit": retryCount,
}).Log("Successfully transferred leadership")
for {
c.logger.Debug().Log("Waiting for losing leadership")
time.Sleep(50 * time.Millisecond)
c.leaderLock.Lock()
isLeader := c.isRaftLeader
c.leaderLock.Unlock()
if !isLeader {
break
}
}
return nil
}
}
return fmt.Errorf("failed to transfer leadership in %d attempts", retryCount)
}
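
// A hypothetical usage sketch: a graceful shutdown would first hand off leadership.
// The gracefulLeave function is illustrative only; just leadershipTransfer and the
// leaderLock/isRaftLeader fields are taken from the code above.
//
//	func (c *cluster) gracefulLeave() error {
//		c.leaderLock.Lock()
//		isLeader := c.isRaftLeader
//		c.leaderLock.Unlock()
//		if isLeader {
//			// An empty id, as also passed in leaderLoop, leaves the choice
//			// of the target node to the raft implementation.
//			if err := c.leadershipTransfer(""); err != nil {
//				return err
//			}
//		}
//		return nil
//	}
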
// leaderLoop runs as long as we are the leader to run various maintenance activities
// https://github.com/hashicorp/consul/blob/44b39240a86bc94ddc67bc105286ab450bd869a9/agent/consul/leader.go#L146
func (c *cluster) leaderLoop(stopCh chan struct{}, emergency bool) {
establishedLeader := false
RECONCILE:
// Set up a reconciliation timer
interval := time.After(10 * time.Second)
// Apply a raft barrier to ensure our FSM is caught up
if !emergency {
err := c.raft.Barrier(time.Minute)
if err != nil {
c.logger.Error().WithError(err).Log("Wait for barrier")
goto WAIT
}
}
// Check if we need to handle initial leadership actions
if !establishedLeader {
if err := c.establishLeadership(context.TODO(), emergency); err != nil {
c.logger.Error().WithError(err).Log("Establish leadership")
// Immediately revoke leadership since we didn't successfully
// establish leadership.
c.revokeLeadership()
// attempt to transfer leadership. If successful it is
// time to leave the leaderLoop since this node is no
// longer the leader. If leadershipTransfer() fails, we
// will try to acquire it again after
// 5 seconds.
if err := c.leadershipTransfer(""); err != nil {
c.logger.Error().WithError(err).Log("Transfer leadership")
interval = time.After(5 * time.Second)
goto WAIT
}
return
}
establishedLeader = true
defer c.revokeLeadership()
}
WAIT:
// Poll the stop channel to give it priority so we don't waste time
// trying to perform the other operations if we have been asked to shut
// down.
select {
case <-stopCh:
return
default:
}
// Periodically reconcile as long as we are the leader
for {
select {
case <-stopCh:
return
case <-c.shutdownCh:
return
case <-interval:
goto RECONCILE
}
}
}
func (c *cluster) establishLeadership(ctx context.Context, emergency bool) error {
c.logger.Info().WithField("emergency", emergency).Log("Establishing leadership")
ctx, cancel := context.WithCancel(ctx)
c.cancelLeaderShip = cancel
go c.synchronizeAndRebalance(ctx, c.syncInterval, emergency)
if !emergency {
go c.clearLocks(ctx, time.Minute)
}
return nil
}
func (c *cluster) revokeLeadership() {
c.logger.Info().Log("Revoking leadership")
if c.cancelLeaderShip != nil {
c.cancelLeaderShip()
c.cancelLeaderShip = nil
}
}
// synchronizeAndRebalance synchronizes and rebalances the processes in a given interval. Synchronizing
// ensures that all processes in the cluster DB are running on some node. It writes the process->node mapping
// to the cluster DB such that when a new leader gets elected it knows which process should be running on
// which node. This is checked against the actual state. If a node is not reachable, the leader still knows
// which processes should be running on that node. For a certain duration (nodeRecoverTimeout) this is
// tolerated in case the node comes back. If not, the processes will be distributed to the remaining nodes.
// The actual observed state is stored back into the cluster DB.
//
// Synchronizing is followed by rebalancing, which takes care that processes are moved away from overloaded
// nodes. In each iteration only one process is taken away from a node. If a node is not reachable, its
// processes will not be part of the rebalancing, and no attempt will be made to move processes from or to
// that node.
//
// All of this happens only if there's a leader. If no leader election is possible, the node goes into
// emergency leadership mode after a certain duration (emergencyLeaderTimeout). The synchronization phase
// then happens based on the last known list of processes from the cluster DB. Until nodeRecoverTimeout is
// reached, processes that would run on unreachable nodes will not be moved to this node. After that, all
// processes will end up on this node, but only if there are enough resources, in order not to bog down
// the node. Rebalancing is disabled in emergency mode.
//
// The goal of synchronizing and rebalancing is to make as few moves as possible and to be tolerant for
// a while if a node is not reachable.
func (c *cluster) synchronizeAndRebalance(ctx context.Context, interval time.Duration, emergency bool) {
ticker := time.NewTicker(interval)
defer ticker.Stop()
term := uint64(0)
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
if !emergency {
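// Don't synchronize or rebalance while the cluster is degraded.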
if ok, _ := c.IsDegraded(); ok {
break
}
c.doSynchronize(emergency, term)
c.doRebalance(emergency, term)
} else {
c.doSynchronize(emergency, term)
}
}
term++
}
}
func (c *cluster) clearLocks(ctx context.Context, interval time.Duration) {
ticker := time.NewTicker(interval)
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
locks := c.ListLocks()
hasExpiredLocks := false
for _, validUntil := range locks {
if time.Now().Before(validUntil) {
continue
}
hasExpiredLocks = true
break
}
if hasExpiredLocks {
c.logger.Debug().Log("Clearing locks")
c.applyCommand(&store.Command{
Operation: store.OpClearLocks,
Data: &store.CommandClearLocks{},
})
}
}
}
}
var errNotEnoughResourcesForDeployment = errors.New("no node with enough resources for deployment is available")
var errNotEnoughResourcesForRebalancing = errors.New("no node with enough resources for rebalancing is available")
var errNoLimitsDefined = errors.New("this process has no limits defined")
type processOpDelete struct {
nodeid string
processid app.ProcessID
}
type processOpMove struct {
fromNodeid string
toNodeid string
config *app.Config
metadata map[string]interface{}
}
type processOpStart struct {
nodeid string
processid app.ProcessID
}
type processOpStop struct {
nodeid string
processid app.ProcessID
}
type processOpAdd struct {
nodeid string
config *app.Config
metadata map[string]interface{}
order string
}
type processOpUpdate struct {
nodeid string
processid app.ProcessID
config *app.Config
metadata map[string]interface{}
}
type processOpReject struct {
processid app.ProcessID
err error
}
type processOpSkip struct {
nodeid string
processid app.ProcessID
err error
}
type processOpError struct {
processid app.ProcessID
err error
}
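
// A hypothetical sketch of how an operation stack is composed and consumed by
// applyOpStack below. The config values and node ids are illustrative only, and
// somePID stands for an app.ProcessID obtained elsewhere.
//
//	cfg := &app.Config{LimitCPU: 10, LimitMemory: 128}
//	stack := []interface{}{
//		processOpAdd{nodeid: "node1", config: cfg, order: "start"},
//		processOpStop{nodeid: "node2", processid: somePID},
//	}
//	// applyOpStack returns one processOpError per executed operation;
//	// err == nil signals success.
//	errs := c.applyOpStack(stack, term)
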
func (c *cluster) applyOpStack(stack []interface{}, term uint64) []processOpError {
errors := []processOpError{}
logger := c.logger.WithField("term", term)
for _, op := range stack {
switch v := op.(type) {
case processOpAdd:
err := c.proxy.AddProcess(v.nodeid, v.config, v.metadata)
if err != nil {
errors = append(errors, processOpError{
processid: v.config.ProcessID(),
err: err,
})
logger.Info().WithError(err).WithFields(log.Fields{
"processid": v.config.ProcessID(),
"nodeid": v.nodeid,
}).Log("Adding process")
break
}
if v.order == "start" {
err = c.proxy.CommandProcess(v.nodeid, v.config.ProcessID(), "start")
if err != nil {
errors = append(errors, processOpError{
processid: v.config.ProcessID(),
err: err,
})
logger.Info().WithError(err).WithFields(log.Fields{
"processid": v.config.ProcessID(),
"nodeid": v.nodeid,
}).Log("Starting process")
break
}
}
errors = append(errors, processOpError{
processid: v.config.ProcessID(),
err: nil,
})
logger.Info().WithFields(log.Fields{
"processid": v.config.ProcessID(),
"nodeid": v.nodeid,
}).Log("Adding process")
case processOpUpdate:
err := c.proxy.UpdateProcess(v.nodeid, v.processid, v.config, v.metadata)
if err != nil {
errors = append(errors, processOpError{
processid: v.processid,
err: err,
})
logger.Info().WithError(err).WithFields(log.Fields{
"processid": v.processid,
"nodeid": v.nodeid,
}).Log("Updating process")
break
}
errors = append(errors, processOpError{
processid: v.processid,
err: nil,
})
logger.Info().WithFields(log.Fields{
"processid": v.config.ProcessID(),
"nodeid": v.nodeid,
}).Log("Updating process")
case processOpDelete:
err := c.proxy.DeleteProcess(v.nodeid, v.processid)
if err != nil {
errors = append(errors, processOpError{
processid: v.processid,
err: err,
})
logger.Info().WithError(err).WithFields(log.Fields{
"processid": v.processid,
"nodeid": v.nodeid,
}).Log("Removing process")
break
}
errors = append(errors, processOpError{
processid: v.processid,
err: nil,
})
logger.Info().WithFields(log.Fields{
"processid": v.processid,
"nodeid": v.nodeid,
}).Log("Removing process")
case processOpMove:
err := c.proxy.AddProcess(v.toNodeid, v.config, v.metadata)
if err != nil {
errors = append(errors, processOpError{
processid: v.config.ProcessID(),
err: err,
})
logger.Info().WithError(err).WithFields(log.Fields{
"processid": v.config.ProcessID(),
"fromnodeid": v.fromNodeid,
"tonodeid": v.toNodeid,
}).Log("Moving process, adding process")
break
}
err = c.proxy.DeleteProcess(v.fromNodeid, v.config.ProcessID())
if err != nil {
errors = append(errors, processOpError{
processid: v.config.ProcessID(),
err: err,
})
logger.Info().WithError(err).WithFields(log.Fields{
"processid": v.config.ProcessID(),
"fromnodeid": v.fromNodeid,
"tonodeid": v.toNodeid,
}).Log("Moving process, removing process")
break
}
err = c.proxy.CommandProcess(v.toNodeid, v.config.ProcessID(), "start")
if err != nil {
errors = append(errors, processOpError{
processid: v.config.ProcessID(),
err: err,
})
logger.Info().WithError(err).WithFields(log.Fields{
"processid": v.config.ProcessID(),
"fromnodeid": v.fromNodeid,
"tonodeid": v.toNodeid,
}).Log("Moving process, starting process")
break
}
errors = append(errors, processOpError{
processid: v.config.ProcessID(),
err: nil,
})
logger.Info().WithFields(log.Fields{
"processid": v.config.ProcessID(),
"fromnodeid": v.fromNodeid,
"tonodeid": v.toNodeid,
}).Log("Moving process")
case processOpStart:
err := c.proxy.CommandProcess(v.nodeid, v.processid, "start")
if err != nil {
errors = append(errors, processOpError{
processid: v.processid,
err: err,
})
logger.Info().WithError(err).WithFields(log.Fields{
"processid": v.processid,
"nodeid": v.nodeid,
}).Log("Starting process")
break
}
errors = append(errors, processOpError{
processid: v.processid,
err: nil,
})
logger.Info().WithFields(log.Fields{
"processid": v.processid,
"nodeid": v.nodeid,
}).Log("Starting process")
case processOpStop:
err := c.proxy.CommandProcess(v.nodeid, v.processid, "stop")
if err != nil {
errors = append(errors, processOpError{
processid: v.processid,
err: err,
})
logger.Info().WithError(err).WithFields(log.Fields{
"processid": v.processid,
"nodeid": v.nodeid,
}).Log("Stopping process")
break
}
errors = append(errors, processOpError{
processid: v.processid,
err: nil,
})
logger.Info().WithFields(log.Fields{
"processid": v.processid,
"nodeid": v.nodeid,
}).Log("Stopping process")
case processOpReject:
errors = append(errors, processOpError(v))
logger.Warn().WithError(v.err).WithField("processid", v.processid).Log("Process rejected")
case processOpSkip:
errors = append(errors, processOpError{
processid: v.processid,
err: v.err,
})
logger.Warn().WithError(v.err).WithFields(log.Fields{
"nodeid": v.nodeid,
"processid": v.processid,
}).Log("Process skipped")
case processOpError:
errors = append(errors, v)
default:
logger.Warn().Log("Unknown operation on stack: %+v", v)
}
}
return errors
}
func (c *cluster) doSynchronize(emergency bool, term uint64) {
wish := c.store.GetProcessNodeMap()
want := c.store.ListProcesses()
have := c.proxy.ListProxyProcesses()
nodes := c.proxy.ListNodes()
logger := c.logger.WithField("term", term)
logger.Debug().WithField("emergency", emergency).Log("Synchronizing")
nodesMap := map[string]proxy.NodeAbout{}
for _, node := range nodes {
about := node.About()
nodesMap[about.ID] = about
}
logger.Debug().WithFields(log.Fields{
"want": want,
"have": have,
"nodes": nodesMap,
}).Log("Synchronize")
opStack, _, reality := synchronize(wish, want, have, nodesMap, c.nodeRecoverTimeout)
if !emergency && !maps.Equal(wish, reality) {
cmd := &store.Command{
Operation: store.OpSetProcessNodeMap,
Data: store.CommandSetProcessNodeMap{
Map: reality,
},
}
c.applyCommand(cmd)
}
errors := c.applyOpStack(opStack, term)
if !emergency {
for _, e := range errors {
// Only apply the command if the error is different.
process, err := c.store.GetProcess(e.processid)
if err != nil {
continue
}
var errmessage string = ""
if e.err != nil {
if process.Error == e.err.Error() {
continue
}
errmessage = e.err.Error()
} else {
if len(process.Error) == 0 {
continue
}
}
cmd := &store.Command{
Operation: store.OpSetProcessError,
Data: store.CommandSetProcessError{
ID: e.processid,
Error: errmessage,
},
}
c.applyCommand(cmd)
}
}
}
func (c *cluster) doRebalance(emergency bool, term uint64) {
if emergency {
// Don't rebalance in emergency mode
return
}
logger := c.logger.WithField("term", term)
logger.Debug().WithField("emergency", emergency).Log("Rebalancing")
have := c.proxy.ListProxyProcesses()
nodes := c.proxy.ListNodes()
nodesMap := map[string]proxy.NodeAbout{}
for _, node := range nodes {
about := node.About()
nodesMap[about.ID] = about
}
logger.Debug().WithFields(log.Fields{
"have": have,
"nodes": nodesMap,
}).Log("Rebalance")
opStack, _ := rebalance(have, nodesMap)
errors := c.applyOpStack(opStack, term)
for _, e := range errors {
// Only apply the command if the error is different.
process, err := c.store.GetProcess(e.processid)
if err != nil {
continue
}
var errmessage string = ""
if e.err != nil {
if process.Error == e.err.Error() {
continue
}
errmessage = e.err.Error()
} else {
if len(process.Error) == 0 {
continue
}
}
cmd := &store.Command{
Operation: store.OpSetProcessError,
Data: store.CommandSetProcessError{
ID: e.processid,
Error: errmessage,
},
}
c.applyCommand(cmd)
}
}
// isMetadataUpdateRequired compares two metadata maps and reports whether they differ. It also
// returns the map to apply as the update: all keys of wantMap with their values, plus any key
// that only exists in haveMap set to nil. The value comparison relies on the documented property
// that json.Marshal sorts map keys prior to encoding.
func isMetadataUpdateRequired(wantMap map[string]interface{}, haveMap map[string]interface{}) (bool, map[string]interface{}) {
hasChanges := false
changeMap := map[string]interface{}{}
haveMapKeys := map[string]struct{}{}
for key := range haveMap {
haveMapKeys[key] = struct{}{}
}
for key, wantMapValue := range wantMap {
haveMapValue, ok := haveMap[key]
if !ok {
// The key exists in wantMap but not in haveMap, we need to update
hasChanges = true
}
// Compare the values
changesData, err := json.Marshal(wantMapValue)
if err != nil {
continue
}
completeData, err := json.Marshal(haveMapValue)
if err != nil {
continue
}
if !bytes.Equal(changesData, completeData) {
// The values are not equal, we need to update
hasChanges = true
}
delete(haveMapKeys, key)
changeMap[key] = wantMapValue
}
for key := range haveMapKeys {
// Keys that remain only exist in haveMap; mark them for removal and update
hasChanges = true
changeMap[key] = nil
}
return hasChanges, changeMap
}
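
// A worked example of the semantics above (values are illustrative):
//
//	want := map[string]interface{}{"a": 1, "b": 2}
//	have := map[string]interface{}{"b": 2, "c": 3}
//	changed, update := isMetadataUpdateRequired(want, have)
//	// changed == true: "a" is missing from have, and "c" has to be removed.
//	// update == map[string]interface{}{"a": 1, "b": 2, "c": nil}
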
// synchronize returns a list of operations in order to adjust the "have" list to the "want" list,
// taking the available resources on each node into account. It also returns the projected resource
// usage per node and the observed process->node mapping ("reality").
func synchronize(wish map[string]string, want []store.Process, have []proxy.Process, nodes map[string]proxy.NodeAbout, nodeRecoverTimeout time.Duration) ([]interface{}, map[string]proxy.NodeResources, map[string]string) {
resources := map[string]proxy.NodeResources{}
for nodeid, about := range nodes {
resources[nodeid] = about.Resources
}
// A map same as wish, but reflecting the actual situation.
reality := map[string]string{}
// A map from the process ID to the process config of the processes
// we want to be running on the nodes.
wantMap := map[string]store.Process{}
for _, wantP := range want {
pid := wantP.Config.ProcessID().String()
wantMap[pid] = wantP
}
opStack := []interface{}{}
// Now we iterate through the processes we actually have running on the nodes
// and remove them from the wantMap. We also make sure that they are in the desired
// order (started or stopped). Any process that cannot be found in the wantMap will
// be deleted from the nodes.
haveAfterRemove := []proxy.Process{}
wantOrderStart := []proxy.Process{}
for _, haveP := range have {
pid := haveP.Config.ProcessID().String()
wantP, ok := wantMap[pid]
if !ok {
// The process is not in the wantMap. Delete it and adjust the resources.
opStack = append(opStack, processOpDelete{
nodeid: haveP.NodeID,
processid: haveP.Config.ProcessID(),
})
r, ok := resources[haveP.NodeID]
if ok {
r.CPU -= haveP.CPU
r.Mem -= haveP.Mem
resources[haveP.NodeID] = r
}
continue
}
// The process is on the wantMap. Update the process if the configuration and/or metadata differ.
hasConfigChanges := !wantP.Config.Equal(haveP.Config)
hasMetadataChanges, metadata := isMetadataUpdateRequired(wantP.Metadata, haveP.Metadata)
if hasConfigChanges || hasMetadataChanges {
// TODO: When the required resources increase, should we move this process to a node
// that has them available? Otherwise, this node might start throttling. However, this
// will result in rebalancing.
opStack = append(opStack, processOpUpdate{
nodeid: haveP.NodeID,
processid: haveP.Config.ProcessID(),
config: wantP.Config,
metadata: metadata,
})
}
delete(wantMap, pid)
reality[pid] = haveP.NodeID
if haveP.Order != wantP.Order {
if wantP.Order == "start" {
// Delay pushing them to the stack in order to have
// all resources released first.
wantOrderStart = append(wantOrderStart, haveP)
} else {
opStack = append(opStack, processOpStop{
nodeid: haveP.NodeID,
processid: haveP.Config.ProcessID(),
})
// Release the resources
r, ok := resources[haveP.NodeID]
if ok {
r.CPU -= haveP.CPU
r.Mem -= haveP.Mem
resources[haveP.NodeID] = r
}
}
}
haveAfterRemove = append(haveAfterRemove, haveP)
}
for _, haveP := range wantOrderStart {
nodeid := haveP.NodeID
r, ok := resources[nodeid]
if ok {
// Consume the resources
r.CPU += haveP.Config.LimitCPU
r.Mem += haveP.Config.LimitMemory
resources[nodeid] = r
// TODO: check if the current node has actually enough resources available,
// otherwise it needs to be moved somewhere else. If the node doesn't
// have enough resources available, the process will be prevented
// from starting.
/*
if hasNodeEnoughResources(r, haveP.Config.LimitCPU, haveP.Config.LimitMemory) {
// Consume the resources
r.CPU += haveP.Config.LimitCPU
r.Mem += haveP.Config.LimitMemory
resources[nodeid] = r
} else {
nodeid = findBestNodeForProcess(resources, haveP.Config.LimitCPU, haveP.Config.LimitMemory)
if len(nodeid) == 0 {
// Start it anyways and let it run into an error
opStack = append(opStack, processOpStart{
nodeid: nodeid,
processid: haveP.Config.ProcessID(),
})
continue
}
if nodeid != haveP.NodeID {
opStack = append(opStack, processOpMove{
fromNodeid: haveP.NodeID,
toNodeid: nodeid,
config: haveP.Config,
metadata: haveP.Metadata,
})
}
// Consume the resources
r, ok := resources[nodeid]
if ok {
r.CPU += haveP.Config.LimitCPU
r.Mem += haveP.Config.LimitMemory
resources[nodeid] = r
}
}
*/
}
opStack = append(opStack, processOpStart{
nodeid: nodeid,
processid: haveP.Config.ProcessID(),
})
}
have = haveAfterRemove
// In case a node didn't respond, some PIDs are still in the wantMap that would run on
// the currently unresponsive nodes. We use the wish map to assign them to their node.
// If a node has been unreachable for longer than nodeRecoverTimeout, its processes stay
// in the wantMap and will be redeployed elsewhere; otherwise they are removed from the
// wantMap in the hope that the node reappears within nodeRecoverTimeout.
for pid := range wantMap {
// Check if this PID is assigned to a node.
if nodeid, ok := wish[pid]; ok {
// Check for how long the node hasn't been contacted, or if it still exists.
if node, ok := nodes[nodeid]; ok {
if node.State != "disconnected" {
continue
}
if time.Since(node.LastContact) <= nodeRecoverTimeout {
reality[pid] = nodeid
delete(wantMap, pid)
}
}
}
}
// The wantMap now contains only those processes that need to be installed on a node.
// We will rebuild the "want" array from the wantMap in the same order as the original
// "want" array to make the resulting opStack deterministic.
wantReduced := []store.Process{}
for _, wantP := range want {
pid := wantP.Config.ProcessID().String()
if _, ok := wantMap[pid]; !ok {
continue
}
wantReduced = append(wantReduced, wantP)
}
// Create a map from the process reference to the node it is running on.
haveReferenceAffinityMap := createReferenceAffinityMap(have)
// Now, all remaining processes in the wantMap must be added to one of the nodes.
for _, wantP := range wantReduced {
pid := wantP.Config.ProcessID().String()
reference := wantP.Config.Reference + "@" + wantP.Config.Domain
// If a process doesn't have any limits defined, reject that process
if wantP.Config.LimitCPU <= 0 || wantP.Config.LimitMemory <= 0 {
opStack = append(opStack, processOpReject{
processid: wantP.Config.ProcessID(),
err: errNoLimitsDefined,
})
continue
}
// Check if there are already processes with the same reference, and if so
// choose this node. Then check the node if it has enough resources left. If
// not, then select a node with the most available resources.
nodeid := ""
// Try to add the process to a node where other processes with the same reference currently reside.
if len(wantP.Config.Reference) != 0 {
for _, count := range haveReferenceAffinityMap[reference] {
if hasNodeEnoughResources(resources[count.nodeid], wantP.Config.LimitCPU, wantP.Config.LimitMemory) {
nodeid = count.nodeid
break
}
}
}
// Find the node with the most resources available
if len(nodeid) == 0 {
nodeid = findBestNodeForProcess(resources, wantP.Config.LimitCPU, wantP.Config.LimitMemory)
}
if len(nodeid) != 0 {
opStack = append(opStack, processOpAdd{
nodeid: nodeid,
config: wantP.Config,
metadata: wantP.Metadata,
order: wantP.Order,
})
// Consume the resources
r, ok := resources[nodeid]
if ok {
r.CPU += wantP.Config.LimitCPU
r.Mem += wantP.Config.LimitMemory
resources[nodeid] = r
}
reality[pid] = nodeid
haveReferenceAffinityMap = updateReferenceAffinityMap(haveReferenceAffinityMap, reference, nodeid)
} else {
opStack = append(opStack, processOpReject{
processid: wantP.Config.ProcessID(),
err: errNotEnoughResourcesForDeployment,
})
}
}
return opStack, resources, reality
}
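
// A hypothetical walk-through: one process is wanted with LimitCPU=10 and
// LimitMemory=128, it is not running anywhere ("have" is empty), and there is no
// wish entry for it. With a single connected node
//
//	nodes := map[string]proxy.NodeAbout{
//		"node1": {ID: "node1", State: "connected", Resources: proxy.NodeResources{
//			CPU: 10, CPULimit: 90, Mem: 512, MemLimit: 4096,
//		}},
//	}
//
// synchronize emits a single processOpAdd for "node1", the returned resources for
// "node1" grow to CPU=20 and Mem=640, and reality maps the process ID to "node1".
// All field values here are illustrative.
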
// hasNodeEnoughResources returns whether a node has enough resources available for the
// requested cpu and memory consumption.
func hasNodeEnoughResources(r proxy.NodeResources, cpu float64, mem uint64) bool {
return r.CPU+cpu < r.CPULimit && r.Mem+mem < r.MemLimit && !r.IsThrottling
}
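
// Illustrative numbers: with CPU=70 of CPULimit=90 and Mem=3000 of MemLimit=4096,
// a request of cpu=10/mem=500 fits (70+10 < 90, 3000+500 < 4096), while cpu=25
// does not (70+25 >= 90). A throttling node never accepts new processes.
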
// findBestNodeForProcess returns the nodeid of the node that can fit the requested cpu and
// memory requirements and has the lowest current usage. If no such node is available, an
// empty string is returned.
func findBestNodeForProcess(resources map[string]proxy.NodeResources, cpu float64, mem uint64) string {
nodeid := ""
for id, r := range resources {
// Consider only nodes that can actually fit the process.
if !hasNodeEnoughResources(r, cpu, mem) {
continue
}
if len(nodeid) == 0 {
nodeid = id
continue
}
// Prefer the node with the lower current usage.
if r.CPU < resources[nodeid].CPU && r.Mem <= resources[nodeid].Mem {
nodeid = id
}
}
return nodeid
}
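
// Illustrative numbers: with node1 at CPU=50/Mem=2000 and node2 at CPU=20/Mem=1000,
// both fitting a request of cpu=10/mem=100 within their limits, node2 wins because
// its usage is lower in both dimensions, independent of map iteration order.
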
type referenceAffinityNodeCount struct {
nodeid string
count uint64
}
// createReferenceAffinityMap returns a map of references (per domain) to an array of nodes this reference
// is found on and their count. The array is sorted by the count, the highest first.
func createReferenceAffinityMap(processes []proxy.Process) map[string][]referenceAffinityNodeCount {
referenceAffinityMap := map[string][]referenceAffinityNodeCount{}
for _, p := range processes {
if len(p.Config.Reference) == 0 {
continue
}
ref := p.Config.Reference + "@" + p.Config.Domain
// Here we count how often a reference is present on a node. When
// moving processes to a different node, the node with the highest
// count of same references will be the first candidate.
found := false
arr := referenceAffinityMap[ref]
for i, count := range arr {
if count.nodeid == p.NodeID {
count.count++
arr[i] = count
found = true
break
}
}
if !found {
arr = append(arr, referenceAffinityNodeCount{
nodeid: p.NodeID,
count: 1,
})
}
referenceAffinityMap[ref] = arr
}
// Sort every reference count in decreasing order for each reference
for ref, count := range referenceAffinityMap {
sort.SliceStable(count, func(a, b int) bool {
return count[a].count > count[b].count
})
referenceAffinityMap[ref] = count
}
return referenceAffinityMap
}
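
// A worked example: three processes with reference "cam" in domain "main", two of
// them on "node1" and one on "node2", yield
//
//	map[string][]referenceAffinityNodeCount{
//		"cam@main": {{nodeid: "node1", count: 2}, {nodeid: "node2", count: 1}},
//	}

// updateReferenceAffinityMap increments the count of the given reference on the given
// node, adding a new entry (or a new reference) if none exists yet. The counts are not
// re-sorted afterwards.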
func updateReferenceAffinityMap(raMap map[string][]referenceAffinityNodeCount, reference, nodeid string) map[string][]referenceAffinityNodeCount {
counts, ok := raMap[reference]
if !ok {
raMap[reference] = []referenceAffinityNodeCount{
{
nodeid: nodeid,
count: 1,
},
}
return raMap
}
found := false
for i, count := range counts {
if count.nodeid == nodeid {
count.count++
counts[i] = count
found = true
break
}
}
if !found {
counts = append(counts, referenceAffinityNodeCount{
nodeid: nodeid,
count: 1,
})
}
raMap[reference] = counts
return raMap
}
// rebalance returns a list of operations that will move running processes away from nodes that are overloaded.
func rebalance(have []proxy.Process, nodes map[string]proxy.NodeAbout) ([]interface{}, map[string]proxy.NodeResources) {
resources := map[string]proxy.NodeResources{}
for nodeid, about := range nodes {
resources[nodeid] = about.Resources
}
// Group all running processes by node and sort them by their runtime in ascending order.
nodeProcessMap := createNodeProcessMap(have)
// A map from the process reference to the nodes it is running on
haveReferenceAffinityMap := createReferenceAffinityMap(have)
opStack := []interface{}{}
// Check if any of the nodes is overloaded
for id, node := range nodes {
r := node.Resources
// Ignore this node if the resource values are not reliable
if r.Error != nil {
continue
}
// Skip nodes that are not overloaded
if r.CPU < r.CPULimit && r.Mem < r.MemLimit && !r.IsThrottling {
continue
}
// Move processes from this node to another node with enough free resources.
// The processes are ordered ascending by their runtime.
processes := nodeProcessMap[id]
if len(processes) == 0 {
// If there are no processes on that node, we can't do anything
continue
}
overloadedNodeid := id
for i, p := range processes {
availableNodeid := ""
// Try to move the process to a node where other processes with the same
// reference currently reside.
if len(p.Config.Reference) != 0 {
for _, count := range haveReferenceAffinityMap[p.Config.Reference+"@"+p.Config.Domain] {
// Do not move the process to the node it is currently on
if count.nodeid == overloadedNodeid {
continue
}
r := resources[count.nodeid]
if hasNodeEnoughResources(r, p.CPU, p.Mem) {
availableNodeid = count.nodeid
break
}
}
}
// Find the best node with enough resources available
if len(availableNodeid) == 0 {
availableNodeid = findBestNodeForProcess(resources, p.CPU, p.Mem)
}
if len(availableNodeid) == 0 {
// There's no other node with enough resources to take over this process
opStack = append(opStack, processOpSkip{
nodeid: overloadedNodeid,
processid: p.Config.ProcessID(),
err: errNotEnoughResourcesForRebalancing,
})
continue
}
opStack = append(opStack, processOpMove{
fromNodeid: overloadedNodeid,
toNodeid: availableNodeid,
config: p.Config,
metadata: p.Metadata,
})
// Adjust the process
p.NodeID = availableNodeid
processes[i] = p
// Adjust the resources
r = resources[availableNodeid]
r.CPU += p.CPU
r.Mem += p.Mem
resources[availableNodeid] = r
r = resources[overloadedNodeid]
r.CPU -= p.CPU
r.Mem -= p.Mem
resources[overloadedNodeid] = r
// Move only one process at a time
break
}
}
return opStack, resources
}
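
// A hypothetical scenario: "node1" is throttling and runs a single process p with
// p.CPU=20 and p.Mem=500, while "node2" has enough free resources. rebalance then
// emits one processOpMove from "node1" to "node2" and returns resources where p's
// usage has been subtracted from "node1" and added to "node2". Only one process per
// overloaded node is moved per call.
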
// createNodeProcessMap takes a list of processes and groups them by the nodeid they
// are running on. Each group contains only running processes and is sorted ascending
// by runtime: the processes with the shortest runtime come first, because they are
// the preferred candidates to be moved somewhere else.
func createNodeProcessMap(processes []proxy.Process) map[string][]proxy.Process {
nodeProcessMap := map[string][]proxy.Process{}
for _, p := range processes {
if p.State != "running" {
continue
}
nodeProcessMap[p.NodeID] = append(nodeProcessMap[p.NodeID], p)
}
// Sort the processes on each node by their runtime, shortest first
for nodeid, processes := range nodeProcessMap {
sort.SliceStable(processes, func(a, b int) bool {
return processes[a].Runtime < processes[b].Runtime
})
nodeProcessMap[nodeid] = processes
}
return nodeProcessMap
}
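
// A worked example: given p1 (node1, running, Runtime=30), p2 (node1, running,
// Runtime=5), and p3 (node2, finished), the result is
//
//	map[string][]proxy.Process{
//		"node1": {p2, p1}, // ascending runtime, shortest first
//	}
//
// p3 is dropped because only running processes are candidates for rebalancing.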