// core/cluster/leader_synchronize.go

package cluster

import (
	"bytes"
	"maps"
	"time"

	"github.com/datarhei/core/v16/cluster/node"
	"github.com/datarhei/core/v16/cluster/store"
	"github.com/datarhei/core/v16/encoding/json"
)
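
// doSynchronize reconciles the desired cluster state with the actual state on
// the nodes. It compares the processes the store wants to run ("wish"/"want")
// with the processes the nodes report ("have") and applies the resulting
// operations. Outside of emergency mode it also writes the updated
// process-to-node map and changed process errors back to the store.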
func (c *cluster) doSynchronize(emergency bool, term uint64) {
logger := c.logger.WithField("term", term)
logger.Debug().WithField("emergency", emergency).Log("Synchronizing")
wish := c.store.ProcessGetNodeMap()
want := c.store.ProcessList()
storeNodes := c.store.NodeList()
nodes := c.manager.NodeList()
have, err := c.manager.ClusterProcessList()
if err != nil {
logger.Warn().WithError(err).Log("Failed to retrieve complete process list")
return
}
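	// Combine the runtime view of each node with the node state kept in the store.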
	nodesMap := map[string]node.About{}

	for _, n := range nodes {
		about := n.About()

		if storeNode, hasStoreNode := storeNodes[about.ID]; hasStoreNode {
			about.State = storeNode.State
		}

		nodesMap[about.ID] = about
	}
opStack, _, reality := synchronize(wish, want, have, nodesMap, c.nodeRecoverTimeout)
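	// Outside of emergency mode, persist the reconciled process-to-node map in
	// the store if it deviates from the stored wish map.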
if !emergency && !maps.Equal(wish, reality) {
cmd := &store.Command{
Operation: store.OpSetProcessNodeMap,
Data: store.CommandSetProcessNodeMap{
Map: reality,
},
}
c.applyCommand(cmd)
}
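	// Apply the collected operations and write any changed process errors back
	// to the store.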
errors := c.applyOpStack(opStack, term, 5)
if !emergency {
for _, e := range errors {
			// Only apply the command if the error differs from what is already stored.
process, err := c.store.ProcessGet(e.processid)
if err != nil {
continue
}
			errmessage := ""
if e.err != nil {
if process.Error == e.err.Error() {
continue
}
errmessage = e.err.Error()
} else {
if len(process.Error) == 0 {
continue
}
}
cmd := &store.Command{
Operation: store.OpSetProcessError,
Data: store.CommandSetProcessError{
ID: e.processid,
Error: errmessage,
},
}
c.applyCommand(cmd)
}
}
}

// isMetadataUpdateRequired compares the wanted metadata with the actual metadata
// of a process. It returns whether an update is required, together with the map
// of changes to apply; keys that exist only in the actual metadata are set to nil.
// It relies on the documented property that json.Marshal sorts map keys prior to
// encoding.
func isMetadataUpdateRequired(wantMap map[string]interface{}, haveMap map[string]interface{}) (bool, map[string]interface{}) {
hasChanges := false
changeMap := map[string]interface{}{}
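	// Remember all keys of haveMap; any key still present after the loop below
	// exists only in haveMap.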
haveMapKeys := map[string]struct{}{}
for key := range haveMap {
haveMapKeys[key] = struct{}{}
}
for key, wantMapValue := range wantMap {
haveMapValue, ok := haveMap[key]
if !ok {
			// A key exists in wantMap that doesn't exist in haveMap, we need to update.
hasChanges = true
}
// Compare the values
changesData, err := json.Marshal(wantMapValue)
if err != nil {
continue
}
completeData, err := json.Marshal(haveMapValue)
if err != nil {
continue
}
if !bytes.Equal(changesData, completeData) {
// The values are not equal, we need to update.
hasChanges = true
}
delete(haveMapKeys, key)
changeMap[key] = wantMapValue
}
for key := range haveMapKeys {
		// Keys that exist in haveMap but not in wantMap also require an update.
hasChanges = true
changeMap[key] = nil
}
return hasChanges, changeMap
}

// synchronize returns a list of operations that adjust the "have" list to the
// "want" list, taking the available resources on each node into account. It also
// returns the planned resource usage per node and the resulting process-to-node
// map ("reality").
func synchronize(wish map[string]string, want []store.Process, have []node.Process, nodes map[string]node.About, nodeRecoverTimeout time.Duration) ([]interface{}, map[string]node.Resources, map[string]string) {
resources := NewResourcePlanner(nodes)
// Mark nodes as throttling where at least one process is still throttling
for _, haveP := range have {
if haveP.Throttling {
resources.Throttling(haveP.NodeID, true)
}
}
	// A map with the same keys as wish, but reflecting the actual placement of the processes.
reality := map[string]string{}
// A map from the process ID to the process config of the processes
// we want to be running on the nodes.
wantMap := map[string]store.Process{}
for _, wantP := range want {
pid := wantP.Config.ProcessID().String()
wantMap[pid] = wantP
}
opStack := []interface{}{}
opStackStart := []interface{}{}
opStackDelete := []interface{}{}
opStackUpdate := []interface{}{}
opStackAdd := []interface{}{}
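	// opBudget limits how many update, start, and add operations are issued in
	// one synchronization pass; each of these operations costs 3 points.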
opBudget := 100
	// Now we iterate through the processes we actually have running on the nodes
	// and remove them from the wantMap. We also make sure that they are in the
	// desired order state (started or stopped). Any process that cannot be found
	// in the wantMap will be deleted from the nodes.
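	// haveAfterRemove collects the processes that stay on their nodes, while
	// wantOrderStart collects the currently stopped processes that should be
	// started once all resources have been released.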
haveAfterRemove := []node.Process{}
wantOrderStart := []node.Process{}
for _, haveP := range have {
pid := haveP.Config.ProcessID().String()
wantP, ok := wantMap[pid]
if !ok {
			// The process is not in the wantMap. Delete it and adjust the resources.
opStackDelete = append(opStackDelete, processOpDelete{
nodeid: haveP.NodeID,
processid: haveP.Config.ProcessID(),
})
resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem)
continue
}
		// The process is in the wantMap. Update it if the configuration and/or metadata differ.
hasConfigChanges := !wantP.Config.Equal(haveP.Config)
hasMetadataChanges, metadata := isMetadataUpdateRequired(wantP.Metadata, haveP.Metadata)
if (hasConfigChanges || hasMetadataChanges) && opBudget > 0 {
// TODO: When the required resources increase, should we move this process to a node
// that has them available? Otherwise, this node might start throttling. However, this
// will result in rebalancing.
opStackUpdate = append(opStackUpdate, processOpUpdate{
nodeid: haveP.NodeID,
processid: haveP.Config.ProcessID(),
config: wantP.Config,
metadata: metadata,
})
opBudget -= 3
}
delete(wantMap, pid)
reality[pid] = haveP.NodeID
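		// Align the actual order (start/stop) of the process with the desired order.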
if haveP.Order != wantP.Order {
if wantP.Order == "start" {
// Delay pushing them to the stack in order to have
// all resources released first.
wantOrderStart = append(wantOrderStart, haveP)
} else {
opStack = append(opStack, processOpStop{
nodeid: haveP.NodeID,
processid: haveP.Config.ProcessID(),
})
// Release the resources.
resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem)
}
}
haveAfterRemove = append(haveAfterRemove, haveP)
}
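	// Start the processes whose desired order is "start". Their resource usage is
	// reserved up front, based on the configured limits.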
for _, haveP := range wantOrderStart {
nodeid := haveP.NodeID
resources.Add(nodeid, haveP.Config.LimitCPU, haveP.Config.LimitMemory)
// TODO: check if the current node has actually enough resources available,
// otherwise it needs to be moved somewhere else. If the node doesn't
// have enough resources available, the process will be prevented
// from starting.
/*
if hasNodeEnoughResources(r, haveP.Config.LimitCPU, haveP.Config.LimitMemory) {
// Consume the resources
r.CPU += haveP.Config.LimitCPU
r.Mem += haveP.Config.LimitMemory
resources[nodeid] = r
} else {
nodeid = findBestNodeForProcess(resources, haveP.Config.LimitCPU, haveP.Config.LimitMemory)
if len(nodeid) == 0 {
// Start it anyways and let it run into an error
opStack = append(opStack, processOpStart{
nodeid: nodeid,
processid: haveP.Config.ProcessID(),
})
continue
}
if nodeid != haveP.NodeID {
opStack = append(opStack, processOpMove{
fromNodeid: haveP.NodeID,
toNodeid: nodeid,
config: haveP.Config,
metadata: haveP.Metadata,
order: haveP.Order,
})
}
// Consume the resources
r, ok := resources[nodeid]
if ok {
r.CPU += haveP.Config.LimitCPU
r.Mem += haveP.Config.LimitMemory
resources[nodeid] = r
}
}
*/
opStackStart = append(opStackStart, processOpStart{
nodeid: nodeid,
processid: haveP.Config.ProcessID(),
})
opBudget -= 3
if opBudget <= 0 {
break
}
}
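	// From here on, only consider the processes that remain on the nodes.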
have = haveAfterRemove
	// In case a node didn't respond, some PIDs that would run on the currently
	// unresponsive nodes are still in the wantMap. We use the wish map to assign
	// them to their nodes. If a node has been unreachable for longer than
	// nodeRecoverTimeout, keep its processes in the wantMap so that they will be
	// redeployed; otherwise remove them from the wantMap and assume that the node
	// reappears within the nodeRecoverTimeout.
for pid := range wantMap {
		// Check if this PID is assigned to a node in the wish map.
if nodeid, ok := wish[pid]; ok {
			// Check whether the node still exists and for how long it hasn't been contacted.
if node, ok := nodes[nodeid]; ok {
if node.State == "online" {
continue
}
if time.Since(node.LastContact) <= nodeRecoverTimeout {
reality[pid] = nodeid
delete(wantMap, pid)
}
}
}
}
// The wantMap now contains only those processes that need to be installed on a node.
// We will rebuild the "want" array from the wantMap in the same order as the original
// "want" array to make the resulting opStack deterministic.
wantReduced := []store.Process{}
for _, wantP := range want {
pid := wantP.Config.ProcessID().String()
if _, ok := wantMap[pid]; !ok {
continue
}
wantReduced = append(wantReduced, wantP)
}
	// Create a mapping from the process reference to the nodes it is running on.
haveReferenceAffinity := NewReferenceAffinity(have)
// Now, all remaining processes in the wantMap must be added to one of the nodes.
for _, wantP := range wantReduced {
pid := wantP.Config.ProcessID().String()
// If a process doesn't have any limits defined, reject that process.
if wantP.Config.LimitCPU <= 0 || wantP.Config.LimitMemory <= 0 {
opStack = append(opStack, processOpReject{
processid: wantP.Config.ProcessID(),
err: errNoLimitsDefined,
})
continue
}
		// Check if there are already processes with the same reference, and if so,
		// choose the first of their nodes that has enough resources left. If there
		// is none, select the node with the most available resources.
nodeid := ""
// Try to add the process to a node where other processes with the same reference currently reside.
raNodes := haveReferenceAffinity.Nodes(wantP.Config.Reference, wantP.Config.Domain)
for _, raNodeid := range raNodes {
if resources.HasNodeEnough(raNodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory) {
nodeid = raNodeid
break
}
}
// Find the node with the most resources available.
if len(nodeid) == 0 {
nodes := resources.FindBestNodes(wantP.Config.LimitCPU, wantP.Config.LimitMemory)
if len(nodes) > 0 {
nodeid = nodes[0]
}
}
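		// Deploy the process if a suitable node was found and the budget allows
		// it; otherwise reject it.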
if len(nodeid) != 0 && opBudget > 0 {
opStackAdd = append(opStackAdd, processOpAdd{
nodeid: nodeid,
config: wantP.Config,
metadata: wantP.Metadata,
order: wantP.Order,
})
opBudget -= 3
// Consume the resources
resources.Add(nodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory)
reality[pid] = nodeid
haveReferenceAffinity.Add(wantP.Config.Reference, wantP.Config.Domain, nodeid)
} else {
opStack = append(opStack, processOpReject{
processid: wantP.Config.ProcessID(),
err: errNotEnoughResourcesForDeployment,
})
}
}
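	// Assemble the final operation stack: deletes come first so that resources
	// are freed, followed by updates, starts, and finally the newly added
	// processes.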
opStack = append(opStack, opStackDelete...)
opStack = append(opStack, opStackUpdate...)
opStack = append(opStack, opStackStart...)
opStack = append(opStack, opStackAdd...)
return opStack, resources.Map(), reality
}