mirror of
https://github.com/datarhei/core.git
synced 2025-10-04 23:53:12 +08:00
package cluster

import (
	"bytes"
	"maps"
	"time"

	"github.com/datarhei/core/v16/cluster/node"
	"github.com/datarhei/core/v16/cluster/store"
	"github.com/datarhei/core/v16/encoding/json"
)

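// doSynchronize reconciles the desired process placement from the store with the
// processes actually running on the nodes and applies the resulting operations.
// In emergency mode, neither the node map nor per-process errors are written back to the store.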
func (c *cluster) doSynchronize(emergency bool, term uint64) {
	logger := c.logger.WithField("term", term)

	logger.Debug().WithField("emergency", emergency).Log("Synchronizing")

	wish := c.store.ProcessGetNodeMap()
	want := c.store.ProcessList()
	storeNodes := c.store.NodeList()
	nodes := c.manager.NodeList()
	have, err := c.manager.ClusterProcessList()
	if err != nil {
		logger.Warn().WithError(err).Log("Failed to retrieve complete process list")
		return
	}

	nodesMap := map[string]node.About{}

	for _, node := range nodes {
		about := node.About()

		if storeNode, hasStoreNode := storeNodes[about.ID]; hasStoreNode {
			about.State = storeNode.State
		}

		nodesMap[about.ID] = about
	}

	opStack, _, reality := synchronize(wish, want, have, nodesMap, c.nodeRecoverTimeout)

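	// Outside of emergency mode, persist the recalculated process-to-node map
	// whenever it deviates from the stored wish.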
	if !emergency && !maps.Equal(wish, reality) {
		cmd := &store.Command{
			Operation: store.OpSetProcessNodeMap,
			Data: store.CommandSetProcessNodeMap{
				Map: reality,
			},
		}

		c.applyCommand(cmd)
	}

	errors := c.applyOpStack(opStack, term, 5)

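	// Write the error of each failed operation back into the store, but only
	// if it differs from the error already recorded for that process.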
	if !emergency {
		for _, e := range errors {
			// Only apply the command if the error is different.
			process, err := c.store.ProcessGet(e.processid)
			if err != nil {
				continue
			}

			var errmessage string = ""

			if e.err != nil {
				if process.Error == e.err.Error() {
					continue
				}

				errmessage = e.err.Error()
			} else {
				if len(process.Error) == 0 {
					continue
				}
			}

			cmd := &store.Command{
				Operation: store.OpSetProcessError,
				Data: store.CommandSetProcessError{
					ID:    e.processid,
					Error: errmessage,
				},
			}

			c.applyCommand(cmd)
		}
	}
}

// isMetadataUpdateRequired compares two metadata maps. It relies on the documented
// property that json.Marshal sorts map keys prior to encoding.
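// For example (illustrative values): comparing want={"a":1,"c":3} with have={"a":1,"b":2}
// reports that an update is required and returns the change map {"a":1,"c":3,"b":nil},
// where keys only present in "have" are cleared with nil and all "want" keys are carried over.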
func isMetadataUpdateRequired(wantMap map[string]interface{}, haveMap map[string]interface{}) (bool, map[string]interface{}) {
	hasChanges := false
	changeMap := map[string]interface{}{}

	haveMapKeys := map[string]struct{}{}

	for key := range haveMap {
		haveMapKeys[key] = struct{}{}
	}

	for key, wantMapValue := range wantMap {
		haveMapValue, ok := haveMap[key]
		if !ok {
			// A key exists in wantMap that doesn't exist in haveMap; we need to update.
			hasChanges = true
		}

		// Compare the values
		changesData, err := json.Marshal(wantMapValue)
		if err != nil {
			continue
		}

		completeData, err := json.Marshal(haveMapValue)
		if err != nil {
			continue
		}

		if !bytes.Equal(changesData, completeData) {
			// The values are not equal, we need to update.
			hasChanges = true
		}

		delete(haveMapKeys, key)

		changeMap[key] = wantMapValue
	}

	for key := range haveMapKeys {
		// If there are keys in haveMap that are not in wantMap, we have to update.
		hasChanges = true
		changeMap[key] = nil
	}

	return hasChanges, changeMap
}

// synchronize returns a list of operations in order to adjust the "have" list to the
// "want" list while taking the available resources on each node into account.
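// It returns the operation stack, the planned resources per node, and a map ("reality")
// of process IDs to the nodes they are, or will be, deployed on.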
func synchronize(wish map[string]string, want []store.Process, have []node.Process, nodes map[string]node.About, nodeRecoverTimeout time.Duration) ([]interface{}, map[string]node.Resources, map[string]string) {
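	// Plan resource consumption against the currently known capacities of the nodes.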
	resources := NewResourcePlanner(nodes)

	// Mark nodes as throttling where at least one process is still throttling.
	for _, haveP := range have {
		if haveP.Throttling {
			resources.Throttling(haveP.NodeID, true)
		}
	}

	// A map like wish, but reflecting the actual situation.
	reality := map[string]string{}

	// A map from the process ID to the process config of the processes
	// we want to be running on the nodes.
	wantMap := map[string]store.Process{}
	for _, wantP := range want {
		pid := wantP.Config.ProcessID().String()
		wantMap[pid] = wantP
	}

	opStack := []interface{}{}
	opStackStart := []interface{}{}
	opStackDelete := []interface{}{}
	opStackUpdate := []interface{}{}
	opStackAdd := []interface{}{}
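	// Per-run operation budget: each update, start, or add costs 3 points, capping
	// how much churn a single synchronization pass may cause.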
	opBudget := 100

	// Now we iterate through the processes we actually have running on the nodes
	// and remove them from the wantMap. We also make sure that they have the correct order.
	// If a process cannot be found in the wantMap, it will be deleted from the nodes.
	haveAfterRemove := []node.Process{}
	wantOrderStart := []node.Process{}

	for _, haveP := range have {
		pid := haveP.Config.ProcessID().String()
		wantP, ok := wantMap[pid]
		if !ok {
			// The process is not in the wantMap. Delete it and adjust the resources.
			opStackDelete = append(opStackDelete, processOpDelete{
				nodeid:    haveP.NodeID,
				processid: haveP.Config.ProcessID(),
			})

			resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem)

			continue
		}

		// The process is in the wantMap. Update the process if the configuration and/or metadata differ.
		hasConfigChanges := !wantP.Config.Equal(haveP.Config)
		hasMetadataChanges, metadata := isMetadataUpdateRequired(wantP.Metadata, haveP.Metadata)
		if (hasConfigChanges || hasMetadataChanges) && opBudget > 0 {
			// TODO: When the required resources increase, should we move this process to a node
			// that has them available? Otherwise, this node might start throttling. However, this
			// will result in rebalancing.
			opStackUpdate = append(opStackUpdate, processOpUpdate{
				nodeid:    haveP.NodeID,
				processid: haveP.Config.ProcessID(),
				config:    wantP.Config,
				metadata:  metadata,
			})

			opBudget -= 3
		}

		delete(wantMap, pid)
		reality[pid] = haveP.NodeID

		if haveP.Order != wantP.Order {
			if wantP.Order == "start" {
				// Delay pushing them to the stack in order to have
				// all resources released first.
				wantOrderStart = append(wantOrderStart, haveP)
			} else {
				opStack = append(opStack, processOpStop{
					nodeid:    haveP.NodeID,
					processid: haveP.Config.ProcessID(),
				})

				// Release the resources.
				resources.Remove(haveP.NodeID, haveP.CPU, haveP.Mem)
			}
		}

		haveAfterRemove = append(haveAfterRemove, haveP)
	}

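	// Start the processes whose desired order changed to "start", now that the
	// deletions and stops above have released their resources.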
	for _, haveP := range wantOrderStart {
		nodeid := haveP.NodeID

		resources.Add(nodeid, haveP.Config.LimitCPU, haveP.Config.LimitMemory)

		// TODO: check if the current node has actually enough resources available,
		// otherwise it needs to be moved somewhere else. If the node doesn't
		// have enough resources available, the process will be prevented
		// from starting.

		/*
			if hasNodeEnoughResources(r, haveP.Config.LimitCPU, haveP.Config.LimitMemory) {
				// Consume the resources
				r.CPU += haveP.Config.LimitCPU
				r.Mem += haveP.Config.LimitMemory
				resources[nodeid] = r
			} else {
				nodeid = findBestNodeForProcess(resources, haveP.Config.LimitCPU, haveP.Config.LimitMemory)
				if len(nodeid) == 0 {
					// Start it anyways and let it run into an error
					opStack = append(opStack, processOpStart{
						nodeid:    nodeid,
						processid: haveP.Config.ProcessID(),
					})

					continue
				}

				if nodeid != haveP.NodeID {
					opStack = append(opStack, processOpMove{
						fromNodeid: haveP.NodeID,
						toNodeid:   nodeid,
						config:     haveP.Config,
						metadata:   haveP.Metadata,
						order:      haveP.Order,
					})
				}

				// Consume the resources
				r, ok := resources[nodeid]
				if ok {
					r.CPU += haveP.Config.LimitCPU
					r.Mem += haveP.Config.LimitMemory
					resources[nodeid] = r
				}
			}
		*/

		opStackStart = append(opStackStart, processOpStart{
			nodeid:    nodeid,
			processid: haveP.Config.ProcessID(),
		})

		opBudget -= 3

		if opBudget <= 0 {
			break
		}
	}

	have = haveAfterRemove

	// In case a node didn't respond, some PIDs that would run on the currently
	// unresponsive nodes are still in the wantMap. We use the wish map to assign
	// them to their node. If the node has been unavailable for too long, keep these
	// processes in the wantMap so they get redeployed; otherwise remove them and
	// hope that they will reappear within the nodeRecoverTimeout.
	for pid := range wantMap {
		// Check if this PID is assigned to a node.
		if nodeid, ok := wish[pid]; ok {
			// Check for how long the node hasn't been contacted, or if it still exists.
			if node, ok := nodes[nodeid]; ok {
				if node.State == "online" {
					continue
				}

				if time.Since(node.LastContact) <= nodeRecoverTimeout {
					reality[pid] = nodeid
					delete(wantMap, pid)
				}
			}
		}
	}

	// The wantMap now contains only those processes that need to be installed on a node.
	// We will rebuild the "want" array from the wantMap in the same order as the original
	// "want" array to make the resulting opStack deterministic.
	wantReduced := []store.Process{}
	for _, wantP := range want {
		pid := wantP.Config.ProcessID().String()
		if _, ok := wantMap[pid]; !ok {
			continue
		}

		wantReduced = append(wantReduced, wantP)
	}

	// Create a map from the process reference to the node it is running on.
	haveReferenceAffinity := NewReferenceAffinity(have)

	// Now, all remaining processes in the wantMap must be added to one of the nodes.
	for _, wantP := range wantReduced {
		pid := wantP.Config.ProcessID().String()

		// If a process doesn't have any limits defined, reject that process.
		if wantP.Config.LimitCPU <= 0 || wantP.Config.LimitMemory <= 0 {
			opStack = append(opStack, processOpReject{
				processid: wantP.Config.ProcessID(),
				err:       errNoLimitsDefined,
			})

			continue
		}

		// Check if there are already processes with the same reference, and if so,
		// prefer their node. Then check whether that node has enough resources left.
		// If not, select the node with the most available resources.
		nodeid := ""

		// Try to add the process to a node where other processes with the same reference currently reside.
		raNodes := haveReferenceAffinity.Nodes(wantP.Config.Reference, wantP.Config.Domain)
		for _, raNodeid := range raNodes {
			if resources.HasNodeEnough(raNodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory) {
				nodeid = raNodeid
				break
			}
		}

		// Find the node with the most resources available.
		if len(nodeid) == 0 {
			nodes := resources.FindBestNodes(wantP.Config.LimitCPU, wantP.Config.LimitMemory)
			if len(nodes) > 0 {
				nodeid = nodes[0]
			}
		}

		if len(nodeid) != 0 && opBudget > 0 {
			opStackAdd = append(opStackAdd, processOpAdd{
				nodeid:   nodeid,
				config:   wantP.Config,
				metadata: wantP.Metadata,
				order:    wantP.Order,
			})

			opBudget -= 3

			// Consume the resources
			resources.Add(nodeid, wantP.Config.LimitCPU, wantP.Config.LimitMemory)

			reality[pid] = nodeid

			haveReferenceAffinity.Add(wantP.Config.Reference, wantP.Config.Domain, nodeid)
		} else {
			opStack = append(opStack, processOpReject{
				processid: wantP.Config.ProcessID(),
				err:       errNotEnoughResourcesForDeployment,
			})
		}
	}

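	// Assemble the final operation stack. Stops and rejects were already appended in place;
	// deletes go first so that resources are released before updates, starts, and adds run.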
	opStack = append(opStack, opStackDelete...)
	opStack = append(opStack, opStackUpdate...)
	opStack = append(opStack, opStackStart...)
	opStack = append(opStack, opStackAdd...)

	return opStack, resources.Map(), reality
}