Add the resource limits to the metrics

This commit is contained in:
Ingo Oppermann
2023-06-06 15:20:59 +02:00
parent 3adf5fd7d4
commit 3ac7ead20d
9 changed files with 85 additions and 32 deletions

View File

@@ -1027,8 +1027,8 @@ func (a *api) start() error {
} }
metrics.Register(monitor.NewUptimeCollector()) metrics.Register(monitor.NewUptimeCollector())
metrics.Register(monitor.NewCPUCollector()) metrics.Register(monitor.NewCPUCollector(cfg.Resources.MaxCPUUsage))
metrics.Register(monitor.NewMemCollector()) metrics.Register(monitor.NewMemCollector(cfg.Resources.MaxMemoryUsage))
metrics.Register(monitor.NewNetCollector()) metrics.Register(monitor.NewNetCollector())
metrics.Register(monitor.NewDiskCollector(a.diskfs.Metadata("base"))) metrics.Register(monitor.NewDiskCollector(a.diskfs.Metadata("base")))
metrics.Register(monitor.NewFilesystemCollector("diskfs", a.diskfs)) metrics.Register(monitor.NewFilesystemCollector("diskfs", a.diskfs))

View File

@@ -91,11 +91,9 @@ func NewAPI(config APIConfig) (API, error) {
})) }))
a.router.Logger.SetOutput(httplog.NewWrapper(a.logger)) a.router.Logger.SetOutput(httplog.NewWrapper(a.logger))
swagHandler := echoSwagger.EchoWrapHandler(echoSwagger.InstanceName("ClusterAPI"))
// Swagger API documentation router group // Swagger API documentation router group
doc := a.router.Group("/v1/swagger/*") doc := a.router.Group("/v1/swagger/*")
doc.GET("", swagHandler) doc.GET("", echoSwagger.EchoWrapHandler(echoSwagger.InstanceName("ClusterAPI")))
a.router.POST("/v1/server", a.AddServer) a.router.POST("/v1/server", a.AddServer)
a.router.DELETE("/v1/server/:id", a.RemoveServer) a.router.DELETE("/v1/server/:id", a.RemoveServer)

View File

@@ -79,6 +79,9 @@ type ClusterConfig struct {
Address string // Listen address for the raft protocol Address string // Listen address for the raft protocol
Peers []Peer // Address of a member of a cluster to join Peers []Peer // Address of a member of a cluster to join
SyncInterval time.Duration // Interval between aligning the process in the cluster DB with the processes on the nodes
NodeRecoverTimeout time.Duration // Timeout for a node to recover before rebalancing the processes
CoreAPIAddress string // Address of the core API CoreAPIAddress string // Address of the core API
CoreAPIUsername string // Username for the core API CoreAPIUsername string // Username for the core API
CoreAPIPassword string // Password for the core API CoreAPIPassword string // Password for the core API

View File

@@ -106,8 +106,9 @@ type node struct {
resources struct { resources struct {
ncpu float64 ncpu float64
cpu float64 cpu float64
cpuLimit float64
mem uint64 mem uint64
memTotal uint64 memLimit uint64
} }
state nodeState state nodeState
@@ -269,8 +270,10 @@ func (n *node) Connect() error {
Metrics: []clientapi.MetricsQueryMetric{ Metrics: []clientapi.MetricsQueryMetric{
{Name: "cpu_ncpu"}, {Name: "cpu_ncpu"},
{Name: "cpu_idle"}, {Name: "cpu_idle"},
{Name: "cpu_limit"},
{Name: "mem_total"}, {Name: "mem_total"},
{Name: "mem_free"}, {Name: "mem_free"},
{Name: "mem_limit"},
}, },
}) })
@@ -278,8 +281,10 @@ func (n *node) Connect() error {
n.stateLock.Lock() n.stateLock.Lock()
n.resources.cpu = 100 n.resources.cpu = 100
n.resources.ncpu = 1 n.resources.ncpu = 1
n.resources.cpuLimit = 0
n.resources.mem = 0 n.resources.mem = 0
n.resources.memTotal = 0 n.resources.memLimit = 0
n.state = stateDisconnected
n.stateLock.Unlock() n.stateLock.Unlock()
continue continue
@@ -287,30 +292,37 @@ func (n *node) Connect() error {
cpu_ncpu := .0 cpu_ncpu := .0
cpu_idle := .0 cpu_idle := .0
cpu_limit := .0
mem_total := uint64(0) mem_total := uint64(0)
mem_free := uint64(0) mem_free := uint64(0)
mem_limit := uint64(0)
for _, x := range metrics.Metrics { for _, x := range metrics.Metrics {
if x.Name == "cpu_idle" { if x.Name == "cpu_idle" {
cpu_idle = x.Values[0].Value cpu_idle = x.Values[0].Value
} else if x.Name == "cpu_ncpu" { } else if x.Name == "cpu_ncpu" {
cpu_ncpu = x.Values[0].Value cpu_ncpu = x.Values[0].Value
} else if x.Name == "cpu_limit" {
cpu_limit = x.Values[0].Value
} else if x.Name == "mem_total" { } else if x.Name == "mem_total" {
mem_total = uint64(x.Values[0].Value) mem_total = uint64(x.Values[0].Value)
} else if x.Name == "mem_free" { } else if x.Name == "mem_free" {
mem_free = uint64(x.Values[0].Value) mem_free = uint64(x.Values[0].Value)
} else if x.Name == "mem_limit" {
mem_limit = uint64(x.Values[0].Value)
} }
} }
n.stateLock.Lock() n.stateLock.Lock()
n.resources.ncpu = cpu_ncpu n.resources.ncpu = cpu_ncpu
n.resources.cpu = (100 - cpu_idle) * cpu_ncpu n.resources.cpu = (100 - cpu_idle) * cpu_ncpu
n.resources.cpuLimit = cpu_limit * cpu_ncpu
if mem_total != 0 { if mem_total != 0 {
n.resources.mem = mem_total - mem_free n.resources.mem = mem_total - mem_free
n.resources.memTotal = mem_total n.resources.memLimit = mem_limit
} else { } else {
n.resources.mem = 0 n.resources.mem = 0
n.resources.memTotal = 0 n.resources.memLimit = 0
} }
n.lastContact = time.Now() n.lastContact = time.Now()
n.stateLock.Unlock() n.stateLock.Unlock()
@@ -423,7 +435,9 @@ func (n *node) StopFiles() {
func (n *node) About() NodeAbout { func (n *node) About() NodeAbout {
about, err := n.AboutPeer() about, err := n.AboutPeer()
if err != nil { if err != nil {
return NodeAbout{} return NodeAbout{
State: stateDisconnected.String(),
}
} }
createdAt, err := time.Parse(time.RFC3339, about.CreatedAt) createdAt, err := time.Parse(time.RFC3339, about.CreatedAt)
@@ -434,11 +448,16 @@ func (n *node) About() NodeAbout {
n.stateLock.RLock() n.stateLock.RLock()
defer n.stateLock.RUnlock() defer n.stateLock.RUnlock()
state := NodeAbout{ state := n.state
if time.Since(n.lastContact) > 3*time.Second {
state = stateDisconnected
}
nodeAbout := NodeAbout{
ID: about.ID, ID: about.ID,
Name: about.Name, Name: about.Name,
Address: n.address, Address: n.address,
State: n.state.String(), State: state.String(),
CreatedAt: createdAt, CreatedAt: createdAt,
Uptime: time.Since(createdAt), Uptime: time.Since(createdAt),
LastContact: n.lastContact, LastContact: n.lastContact,
@@ -446,13 +465,13 @@ func (n *node) About() NodeAbout {
Resources: NodeResources{ Resources: NodeResources{
NCPU: n.resources.ncpu, NCPU: n.resources.ncpu,
CPU: n.resources.cpu, CPU: n.resources.cpu,
CPULimit: 90 * n.resources.ncpu, CPULimit: n.resources.cpuLimit,
Mem: n.resources.mem, Mem: n.resources.mem,
MemLimit: uint64(float64(n.resources.memTotal) * 0.9), MemLimit: n.resources.memLimit,
}, },
} }
return state return nodeAbout
} }
func (n *node) Version() NodeVersion { func (n *node) Version() NodeVersion {

View File

@@ -169,15 +169,15 @@ type Data struct {
UIPath string `json:"ui_path"` UIPath string `json:"ui_path"`
} `json:"router"` } `json:"router"`
Resources struct { Resources struct {
MaxCPUUsage float64 `json:"max_cpu_usage"` MaxCPUUsage float64 `json:"max_cpu_usage"` // percent 0-100
MaxMemoryUsage float64 `json:"max_memory_usage"` MaxMemoryUsage float64 `json:"max_memory_usage"` // percent 0-100
} `json:"resources"` } `json:"resources"`
Cluster struct { Cluster struct {
Enable bool `json:"enable"` Enable bool `json:"enable"`
Bootstrap bool `json:"bootstrap"` Bootstrap bool `json:"bootstrap"`
Recover bool `json:"recover"` Recover bool `json:"recover"`
Debug bool `json:"debug"` Debug bool `json:"debug"`
Address string `json:"address"` Address string `json:"address"` // ip:port
Peers []string `json:"peers"` Peers []string `json:"peers"`
} `json:"cluster"` } `json:"cluster"`
} }

View File

@@ -11,13 +11,20 @@ type cpuCollector struct {
userDescr *metric.Description userDescr *metric.Description
idleDescr *metric.Description idleDescr *metric.Description
otherDescr *metric.Description otherDescr *metric.Description
limitDescr *metric.Description
ncpu float64 ncpu float64
limit float64
} }
func NewCPUCollector() metric.Collector { func NewCPUCollector(limit float64) metric.Collector {
c := &cpuCollector{ c := &cpuCollector{
ncpu: 1, ncpu: 1,
limit: limit,
}
if limit <= 0 || limit > 100 {
c.limit = 100
} }
c.ncpuDescr = metric.NewDesc("cpu_ncpu", "Number of logical CPUs in the system", nil) c.ncpuDescr = metric.NewDesc("cpu_ncpu", "Number of logical CPUs in the system", nil)
@@ -25,6 +32,7 @@ func NewCPUCollector() metric.Collector {
c.userDescr = metric.NewDesc("cpu_user", "Percentage of CPU used for the user", nil) c.userDescr = metric.NewDesc("cpu_user", "Percentage of CPU used for the user", nil)
c.idleDescr = metric.NewDesc("cpu_idle", "Percentage of idle CPU", nil) c.idleDescr = metric.NewDesc("cpu_idle", "Percentage of idle CPU", nil)
c.otherDescr = metric.NewDesc("cpu_other", "Percentage of CPU used for other subsystems", nil) c.otherDescr = metric.NewDesc("cpu_other", "Percentage of CPU used for other subsystems", nil)
c.limitDescr = metric.NewDesc("cpu_limit", "Percentage of CPU to be consumed", nil)
if ncpu, err := psutil.CPUCounts(true); err == nil { if ncpu, err := psutil.CPUCounts(true); err == nil {
c.ncpu = ncpu c.ncpu = ncpu
@@ -46,6 +54,7 @@ func (c *cpuCollector) Describe() []*metric.Description {
c.userDescr, c.userDescr,
c.idleDescr, c.idleDescr,
c.otherDescr, c.otherDescr,
c.limitDescr,
} }
} }
@@ -53,6 +62,7 @@ func (c *cpuCollector) Collect() metric.Metrics {
metrics := metric.NewMetrics() metrics := metric.NewMetrics()
metrics.Add(metric.NewValue(c.ncpuDescr, c.ncpu)) metrics.Add(metric.NewValue(c.ncpuDescr, c.ncpu))
metrics.Add(metric.NewValue(c.limitDescr, c.limit))
stat, err := psutil.CPUPercent() stat, err := psutil.CPUPercent()
if err != nil { if err != nil {

View File

@@ -8,13 +8,23 @@ import (
type memCollector struct { type memCollector struct {
totalDescr *metric.Description totalDescr *metric.Description
freeDescr *metric.Description freeDescr *metric.Description
limitDescr *metric.Description
limit float64
} }
func NewMemCollector() metric.Collector { func NewMemCollector(limit float64) metric.Collector {
c := &memCollector{} c := &memCollector{
limit: limit / 100,
}
if limit <= 0 || limit > 1 {
c.limit = 1
}
c.totalDescr = metric.NewDesc("mem_total", "Total available memory in bytes", nil) c.totalDescr = metric.NewDesc("mem_total", "Total available memory in bytes", nil)
c.freeDescr = metric.NewDesc("mem_free", "Free memory in bytes", nil) c.freeDescr = metric.NewDesc("mem_free", "Free memory in bytes", nil)
c.limitDescr = metric.NewDesc("mem_limit", "Memory limit in bytes", nil)
return c return c
} }
@@ -27,6 +37,7 @@ func (c *memCollector) Describe() []*metric.Description {
return []*metric.Description{ return []*metric.Description{
c.totalDescr, c.totalDescr,
c.freeDescr, c.freeDescr,
c.limitDescr,
} }
} }
@@ -40,6 +51,7 @@ func (c *memCollector) Collect() metric.Metrics {
metrics.Add(metric.NewValue(c.totalDescr, float64(stat.Total))) metrics.Add(metric.NewValue(c.totalDescr, float64(stat.Total)))
metrics.Add(metric.NewValue(c.freeDescr, float64(stat.Available))) metrics.Add(metric.NewValue(c.freeDescr, float64(stat.Available)))
metrics.Add(metric.NewValue(c.limitDescr, float64(stat.Total)*c.limit))
return metrics return metrics
} }

View File

@@ -14,6 +14,7 @@ type cpuCollector struct {
cpuUserTimeDesc *prometheus.Desc cpuUserTimeDesc *prometheus.Desc
cpuIdleTimeDesc *prometheus.Desc cpuIdleTimeDesc *prometheus.Desc
cpuOtherTimeDesc *prometheus.Desc cpuOtherTimeDesc *prometheus.Desc
cpuLimitDesc *prometheus.Desc
} }
func NewCPUCollector(core string, c metric.Reader) prometheus.Collector { func NewCPUCollector(core string, c metric.Reader) prometheus.Collector {
@@ -36,6 +37,10 @@ func NewCPUCollector(core string, c metric.Reader) prometheus.Collector {
"cpu_other_time_percent", "cpu_other_time_percent",
"CPU other time in percent", "CPU other time in percent",
[]string{"core"}, nil), []string{"core"}, nil),
cpuLimitDesc: prometheus.NewDesc(
"cpu_limit_percent",
"Configured CPU limit in percent",
[]string{"core"}, nil),
} }
} }
@@ -44,6 +49,7 @@ func (c *cpuCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.cpuUserTimeDesc ch <- c.cpuUserTimeDesc
ch <- c.cpuIdleTimeDesc ch <- c.cpuIdleTimeDesc
ch <- c.cpuOtherTimeDesc ch <- c.cpuOtherTimeDesc
ch <- c.cpuLimitDesc
} }
func (c *cpuCollector) Collect(ch chan<- prometheus.Metric) { func (c *cpuCollector) Collect(ch chan<- prometheus.Metric) {
@@ -52,10 +58,12 @@ func (c *cpuCollector) Collect(ch chan<- prometheus.Metric) {
metric.NewPattern("cpu_user"), metric.NewPattern("cpu_user"),
metric.NewPattern("cpu_idle"), metric.NewPattern("cpu_idle"),
metric.NewPattern("cpu_other"), metric.NewPattern("cpu_other"),
metric.NewPattern("cpu_limit"),
}) })
ch <- prometheus.MustNewConstMetric(c.cpuSystemTimeDesc, prometheus.GaugeValue, metrics.Value("cpu_system").Val(), c.core) ch <- prometheus.MustNewConstMetric(c.cpuSystemTimeDesc, prometheus.GaugeValue, metrics.Value("cpu_system").Val(), c.core)
ch <- prometheus.MustNewConstMetric(c.cpuUserTimeDesc, prometheus.GaugeValue, metrics.Value("cpu_user").Val(), c.core) ch <- prometheus.MustNewConstMetric(c.cpuUserTimeDesc, prometheus.GaugeValue, metrics.Value("cpu_user").Val(), c.core)
ch <- prometheus.MustNewConstMetric(c.cpuIdleTimeDesc, prometheus.GaugeValue, metrics.Value("cpu_idle").Val(), c.core) ch <- prometheus.MustNewConstMetric(c.cpuIdleTimeDesc, prometheus.GaugeValue, metrics.Value("cpu_idle").Val(), c.core)
ch <- prometheus.MustNewConstMetric(c.cpuOtherTimeDesc, prometheus.GaugeValue, metrics.Value("cpu_other").Val(), c.core) ch <- prometheus.MustNewConstMetric(c.cpuOtherTimeDesc, prometheus.GaugeValue, metrics.Value("cpu_other").Val(), c.core)
ch <- prometheus.MustNewConstMetric(c.cpuLimitDesc, prometheus.GaugeValue, metrics.Value("cpu_limit").Val(), c.core)
} }

View File

@@ -10,15 +10,16 @@ type memCollector struct {
core string core string
collector metric.Reader collector metric.Reader
memLimitDesc *prometheus.Desc memTotalDesc *prometheus.Desc
memFreeDesc *prometheus.Desc memFreeDesc *prometheus.Desc
memLimitDesc *prometheus.Desc
} }
func NewMemCollector(core string, c metric.Reader) prometheus.Collector { func NewMemCollector(core string, c metric.Reader) prometheus.Collector {
return &memCollector{ return &memCollector{
core: core, core: core,
collector: c, collector: c,
memLimitDesc: prometheus.NewDesc( memTotalDesc: prometheus.NewDesc(
"mem_total_bytes", "mem_total_bytes",
"Total available memory in bytes", "Total available memory in bytes",
[]string{"core"}, nil), []string{"core"}, nil),
@@ -26,25 +27,27 @@ func NewMemCollector(core string, c metric.Reader) prometheus.Collector {
"mem_free_bytes", "mem_free_bytes",
"Free memory in bytes", "Free memory in bytes",
[]string{"core"}, nil), []string{"core"}, nil),
memLimitDesc: prometheus.NewDesc(
"mem_limit_bytes",
"Configured memory limit in bytes",
[]string{"core"}, nil),
} }
} }
func (c *memCollector) Describe(ch chan<- *prometheus.Desc) { func (c *memCollector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.memLimitDesc ch <- c.memLimitDesc
ch <- c.memFreeDesc ch <- c.memFreeDesc
ch <- c.memLimitDesc
} }
func (c *memCollector) Collect(ch chan<- prometheus.Metric) { func (c *memCollector) Collect(ch chan<- prometheus.Metric) {
metrics := c.collector.Collect([]metric.Pattern{ metrics := c.collector.Collect([]metric.Pattern{
metric.NewPattern("mem_total"), metric.NewPattern("mem_total"),
metric.NewPattern("mem_free"), metric.NewPattern("mem_free"),
metric.NewPattern("mem_limit"),
}) })
for _, m := range metrics.Values("mem_total") { ch <- prometheus.MustNewConstMetric(c.memTotalDesc, prometheus.GaugeValue, metrics.Value("mem_total").Val(), c.core)
ch <- prometheus.MustNewConstMetric(c.memLimitDesc, prometheus.GaugeValue, m.Val(), c.core) ch <- prometheus.MustNewConstMetric(c.memFreeDesc, prometheus.GaugeValue, metrics.Value("mem_free").Val(), c.core)
} ch <- prometheus.MustNewConstMetric(c.memLimitDesc, prometheus.GaugeValue, metrics.Value("mem_limit").Val(), c.core)
for _, m := range metrics.Values("mem_free") {
ch <- prometheus.MustNewConstMetric(c.memFreeDesc, prometheus.GaugeValue, m.Val(), c.core)
}
} }