// Mirror of https://github.com/datarhei/core.git
// Synced 2025-10-18 14:00:41 +08:00
// 650 lines, 15 KiB, Go
package process
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/datarhei/core/v16/log"
|
|
"github.com/datarhei/core/v16/resources/psutil"
|
|
)
|
|
|
|
// Usage is a snapshot of the resources consumed by an observed process
// together with the limits that apply to it.
type Usage struct {
	// CPU describes the CPU consumption.
	CPU struct {
		// NCPU is the number of logical processors.
		NCPU float64
		// Current, Average, Max and Limit are given in percent 0-100*ncpu.
		Current, Average, Max, Limit float64
		// IsThrottling tells whether the CPU is currently being throttled.
		IsThrottling bool
	}

	// Memory describes the memory consumption.
	Memory struct {
		// Current is the current memory consumption in bytes.
		Current uint64
		// Average is the average memory consumption in bytes.
		Average float64
		// Max and Limit are given in bytes.
		Max, Limit uint64
	}

	// GPU describes the consumption of GPU resources.
	GPU struct {
		// Index is the number of the GPU.
		Index int

		// Memory describes the GPU memory consumption.
		Memory struct {
			// Current is the current GPU memory consumption in bytes.
			Current uint64
			// Average is the average GPU memory consumption in bytes.
			Average float64
			// Max and Limit are given in bytes.
			Max, Limit uint64
		}

		// Usage describes the general GPU load in percent 0-100.
		Usage struct {
			Current, Average, Max, Limit float64
		}

		// Encoder describes the GPU encoder load in percent 0-100.
		Encoder struct {
			Current, Average, Max, Limit float64
		}

		// Decoder describes the GPU decoder load in percent 0-100.
		Decoder struct {
			Current, Average, Max, Limit float64
		}
	}
}
|
|
|
|
// LimitFunc is the signature of the callback that is invoked when a limit has
// been exceeded. It is handed values for the CPU, the memory, the GPU usage,
// the GPU encoder, the GPU decoder, and the GPU memory, in that order.
type LimitFunc func(
	cpu float64,
	memory uint64,
	gpuusage float64,
	gpuencoder float64,
	gpudecoder float64,
	gpumemory uint64,
)
|
|
|
|
// LimitMode selects how a limiter reacts to resources being above their limit.
type LimitMode int

const (
	// LimitModeHard kills the process if either resource is above the limit
	// for a certain time.
	LimitModeHard LimitMode = 0

	// LimitModeSoft throttles the CPU if activated, otherwise the process is
	// killed if resources are above the limit.
	LimitModeSoft LimitMode = 1
)

// String returns "hard" or "soft" for the respective mode, and "undefined"
// for every other value.
func (m LimitMode) String() string {
	switch m {
	case LimitModeHard:
		return "hard"
	case LimitModeSoft:
		return "soft"
	default:
		return "undefined"
	}
}
|
|
|
|
type LimiterConfig struct {
|
|
CPU float64 // Max. CPU usage in percent 0-100 in hard mode, 0-100*ncpu in soft mode.
|
|
Memory uint64 // Max. memory usage in bytes.
|
|
GPUUsage float64 // Max. GPU general usage in percent 0-100.
|
|
GPUEncoder float64 // Max. GPU encoder usage in percent 0-100.
|
|
GPUDecoder float64 // Max. GPU decoder usage in percent 0-100.
|
|
GPUMemory uint64 // Max. GPU memory usage in bytes.
|
|
WaitFor time.Duration // Duration for one of the limits has to be above the limit until OnLimit gets triggered.
|
|
OnLimit LimitFunc // Function to be triggered if limits are exceeded.
|
|
Mode LimitMode // How to limit CPU usage.
|
|
PSUtil psutil.Util
|
|
Logger log.Logger
|
|
}
|
|
|
|
type Limiter interface {
|
|
// Start starts the limiter with a psutil.Process.
|
|
Start(process psutil.Process) error
|
|
|
|
// Stop stops the limiter. The limiter can be reused by calling Start() again
|
|
Stop()
|
|
|
|
// Usage returns the current state of the limiter, such as current, average, max, and
|
|
// limit values for CPU and memory.
|
|
Usage() Usage
|
|
|
|
// Limit enables or disables the throttling of the CPU or killing because of to much
|
|
// memory or GPU consumption.
|
|
Limit(cpu, memory, gpu bool) error
|
|
|
|
// Mode returns in which mode the limiter is running in.
|
|
Mode() LimitMode
|
|
}
|
|
|
|
type numbers interface {
|
|
~uint64 | ~float64
|
|
}
|
|
|
|
type metric[T numbers] struct {
|
|
limit T // Limit
|
|
current T // Current load value
|
|
last T // Last load value
|
|
max T // Max. load value
|
|
top T // Decaying max. load value
|
|
avg float64 // Average load value
|
|
avgCounter uint64 // Counter for average calculation
|
|
limitSince time.Time // Time when the limit has been reached (hard limiter mode)
|
|
limitEnable bool
|
|
}
|
|
|
|
func (x *metric[T]) Reset() {
|
|
var zero T
|
|
|
|
x.current = zero
|
|
x.last = zero
|
|
x.max = zero
|
|
x.top = zero
|
|
x.avg = 0
|
|
x.avgCounter = 0
|
|
x.limitEnable = false
|
|
}
|
|
|
|
func (x *metric[T]) Current() T {
|
|
return x.current
|
|
}
|
|
|
|
func (x *metric[T]) Top() T {
|
|
return x.top
|
|
}
|
|
|
|
func (x *metric[T]) Max() T {
|
|
return x.max
|
|
}
|
|
|
|
func (x *metric[T]) Avg() float64 {
|
|
return x.avg
|
|
}
|
|
|
|
func (x *metric[T]) SetLimit(limit T) {
|
|
x.limit = limit
|
|
}
|
|
|
|
func (x *metric[T]) Limit() T {
|
|
return x.limit
|
|
}
|
|
|
|
func (x *metric[T]) DoLimit(limit bool) (enabled, changed bool) {
|
|
if x.limitEnable != limit {
|
|
x.limitEnable = limit
|
|
changed = true
|
|
}
|
|
|
|
enabled = x.limitEnable
|
|
|
|
return
|
|
}
|
|
|
|
func (x *metric[T]) IsLimitEnabled() bool {
|
|
return x.limitEnable
|
|
}
|
|
|
|
func (x *metric[T]) Update(value T) {
|
|
x.last, x.current = x.current, value
|
|
|
|
if x.current > x.max {
|
|
x.max = x.current
|
|
}
|
|
|
|
if x.current > x.top {
|
|
x.top = x.current
|
|
} else {
|
|
x.top = T(float64(x.top) * 0.95)
|
|
}
|
|
|
|
x.avgCounter++
|
|
|
|
x.avg = ((x.avg * float64(x.avgCounter-1)) + float64(x.current)) / float64(x.avgCounter)
|
|
}
|
|
|
|
func (x *metric[T]) IsExceeded(waitFor time.Duration, mode LimitMode) bool {
|
|
if x.limit <= 0 {
|
|
return false
|
|
}
|
|
|
|
if mode == LimitModeSoft {
|
|
// Check if we actually should limit.
|
|
if !x.limitEnable {
|
|
return false
|
|
}
|
|
|
|
// If we are currently above the limit, the limit is exceeded.
|
|
if x.current > x.limit {
|
|
return true
|
|
}
|
|
} else {
|
|
if x.current > x.limit {
|
|
// Current value is higher than the limit.
|
|
if x.last <= x.limit {
|
|
// If the previous value is below the limit, then we reached the limit as of now.
|
|
x.limitSince = time.Now()
|
|
}
|
|
|
|
if time.Since(x.limitSince) >= waitFor {
|
|
return true
|
|
}
|
|
}
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
type limiter struct {
|
|
psutil psutil.Util
|
|
|
|
ncpu float64
|
|
ncpuFactor float64
|
|
proc psutil.Process
|
|
lock sync.RWMutex
|
|
cancel context.CancelFunc
|
|
onLimit LimitFunc
|
|
|
|
lastUsage Usage
|
|
lastUsageLock sync.RWMutex
|
|
|
|
cpu metric[float64] // CPU limit
|
|
cpuThrottling bool // Whether CPU throttling is currently active (soft limiter mode)
|
|
|
|
memory metric[uint64] // Memory limit (bytes)
|
|
|
|
gpu struct {
|
|
memory metric[uint64] // GPU memory limit (0-100 percent)
|
|
usage metric[float64] // GPU load limit (0-100 percent)
|
|
encoder metric[float64] // GPU encoder limit (0-100 percent)
|
|
decoder metric[float64] // GPU decoder limit (0-100 percent)
|
|
}
|
|
|
|
waitFor time.Duration
|
|
mode LimitMode
|
|
|
|
logger log.Logger
|
|
}
|
|
|
|
// NewLimiter returns a new Limiter
|
|
func NewLimiter(config LimiterConfig) (Limiter, error) {
|
|
l := &limiter{
|
|
waitFor: config.WaitFor,
|
|
onLimit: config.OnLimit,
|
|
mode: config.Mode,
|
|
psutil: config.PSUtil,
|
|
logger: config.Logger,
|
|
}
|
|
|
|
l.cpu.SetLimit(config.CPU / 100)
|
|
l.memory.SetLimit(config.Memory)
|
|
l.gpu.memory.SetLimit(config.GPUMemory)
|
|
l.gpu.usage.SetLimit(config.GPUUsage / 100)
|
|
l.gpu.encoder.SetLimit(config.GPUEncoder / 100)
|
|
l.gpu.decoder.SetLimit(config.GPUDecoder / 100)
|
|
|
|
if l.logger == nil {
|
|
l.logger = log.New("")
|
|
}
|
|
|
|
if l.psutil == nil {
|
|
return nil, fmt.Errorf("no psutil provided")
|
|
}
|
|
|
|
if ncpu, err := l.psutil.CPUCounts(); err != nil {
|
|
l.ncpu = 1
|
|
} else {
|
|
l.ncpu = ncpu
|
|
}
|
|
|
|
l.lastUsage.CPU.NCPU = l.ncpu
|
|
l.lastUsage.CPU.Limit = l.cpu.Limit() * 100 * l.ncpu
|
|
l.lastUsage.Memory.Limit = l.memory.Limit()
|
|
l.lastUsage.GPU.Memory.Limit = l.gpu.memory.Limit()
|
|
l.lastUsage.GPU.Usage.Limit = l.gpu.usage.Limit() * 100
|
|
l.lastUsage.GPU.Encoder.Limit = l.gpu.encoder.Limit() * 100
|
|
l.lastUsage.GPU.Decoder.Limit = l.gpu.decoder.Limit() * 100
|
|
|
|
l.ncpuFactor = 1
|
|
|
|
mode := "hard"
|
|
if l.mode == LimitModeSoft {
|
|
mode = "soft"
|
|
l.cpu.SetLimit(l.cpu.Limit() / l.ncpu)
|
|
l.ncpuFactor = l.ncpu
|
|
}
|
|
|
|
if l.onLimit == nil {
|
|
l.onLimit = func(float64, uint64, float64, float64, float64, uint64) {}
|
|
}
|
|
|
|
l.logger = l.logger.WithFields(log.Fields{
|
|
"cpu": l.cpu.Limit() * l.ncpuFactor,
|
|
"memory": l.memory.Limit(),
|
|
"gpumemory": l.gpu.memory.Limit(),
|
|
"gpuusage": l.gpu.usage.Limit(),
|
|
"gpuencoder": l.gpu.encoder.Limit(),
|
|
"gpudecoder": l.gpu.decoder.Limit(),
|
|
"mode": mode,
|
|
})
|
|
|
|
return l, nil
|
|
}
|
|
|
|
func (l *limiter) reset() {
|
|
l.cpu.Reset()
|
|
l.cpuThrottling = false
|
|
|
|
l.memory.Reset()
|
|
|
|
l.gpu.memory.Reset()
|
|
l.gpu.usage.Reset()
|
|
l.gpu.encoder.Reset()
|
|
l.gpu.decoder.Reset()
|
|
}
|
|
|
|
func (l *limiter) Start(process psutil.Process) error {
|
|
l.lock.Lock()
|
|
defer l.lock.Unlock()
|
|
|
|
if l.proc != nil {
|
|
return fmt.Errorf("limiter is already running")
|
|
}
|
|
|
|
l.reset()
|
|
|
|
l.proc = process
|
|
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
l.cancel = cancel
|
|
|
|
go l.ticker(ctx, time.Second)
|
|
|
|
if l.mode == LimitModeSoft {
|
|
go l.limitCPU(ctx, l.cpu.Limit(), time.Second)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (l *limiter) Stop() {
|
|
l.lock.Lock()
|
|
defer l.lock.Unlock()
|
|
|
|
if l.proc == nil {
|
|
return
|
|
}
|
|
|
|
l.cancel()
|
|
|
|
l.proc.Stop()
|
|
l.proc = nil
|
|
|
|
l.reset()
|
|
}
|
|
|
|
func (l *limiter) ticker(ctx context.Context, interval time.Duration) {
|
|
ticker := time.NewTicker(interval)
|
|
defer ticker.Stop()
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-ticker.C:
|
|
l.collect()
|
|
}
|
|
}
|
|
}
|
|
|
|
func (l *limiter) collect() {
|
|
l.lock.Lock()
|
|
proc := l.proc
|
|
l.lock.Unlock()
|
|
|
|
if proc == nil {
|
|
return
|
|
}
|
|
|
|
mstat, merr := proc.Memory()
|
|
cpustat, cerr := proc.CPU()
|
|
gstat, gerr := proc.GPU()
|
|
gindex := -1
|
|
|
|
l.lock.Lock()
|
|
defer l.lock.Unlock()
|
|
|
|
if merr == nil {
|
|
l.memory.Update(mstat)
|
|
}
|
|
|
|
if cerr == nil {
|
|
l.cpu.Update((cpustat.System + cpustat.User + cpustat.Other) / 100)
|
|
}
|
|
|
|
if gerr == nil {
|
|
l.gpu.memory.Update(gstat.MemoryUsed)
|
|
l.gpu.usage.Update(gstat.Usage / 100)
|
|
l.gpu.encoder.Update(gstat.Encoder / 100)
|
|
l.gpu.decoder.Update(gstat.Decoder / 100)
|
|
gindex = gstat.Index
|
|
}
|
|
|
|
isLimitExceeded := false
|
|
|
|
if l.mode == LimitModeHard {
|
|
if l.cpu.IsExceeded(l.waitFor, l.mode) {
|
|
l.logger.Warn().Log("CPU limit exceeded")
|
|
isLimitExceeded = true
|
|
}
|
|
}
|
|
|
|
if l.memory.IsExceeded(l.waitFor, l.mode) {
|
|
l.logger.Warn().Log("Memory limit exceeded")
|
|
isLimitExceeded = true
|
|
}
|
|
|
|
if l.gpu.memory.IsExceeded(l.waitFor, l.mode) {
|
|
l.logger.Warn().Log("GPU memory limit exceeded")
|
|
isLimitExceeded = true
|
|
}
|
|
|
|
if l.gpu.usage.IsExceeded(l.waitFor, l.mode) {
|
|
l.logger.Warn().Log("GPU usage limit exceeded")
|
|
isLimitExceeded = true
|
|
}
|
|
|
|
if l.gpu.encoder.IsExceeded(l.waitFor, l.mode) {
|
|
l.logger.Warn().Log("GPU encoder limit exceeded")
|
|
isLimitExceeded = true
|
|
}
|
|
|
|
if l.gpu.decoder.IsExceeded(l.waitFor, l.mode) {
|
|
l.logger.Warn().Log("GPU decoder limit exceeded")
|
|
isLimitExceeded = true
|
|
}
|
|
|
|
l.logger.Debug().WithFields(log.Fields{
|
|
"cur_cpu": l.cpu.Current() * l.ncpuFactor,
|
|
"top_cpu": l.cpu.Top() * l.ncpuFactor,
|
|
"cur_mem": l.memory.Current(),
|
|
"top_mem": l.memory.Top(),
|
|
"cur_gpu_mem": l.gpu.memory.Current(),
|
|
"top_gpu_mem": l.gpu.memory.Top(),
|
|
"exceeded": isLimitExceeded,
|
|
}).Log("Observation")
|
|
|
|
if isLimitExceeded {
|
|
go l.onLimit(l.cpu.Current()*l.ncpuFactor*100, l.memory.Current(), l.gpu.usage.Current(), l.gpu.encoder.Current(), l.gpu.decoder.Current(), l.gpu.memory.Current())
|
|
}
|
|
|
|
l.lastUsageLock.Lock()
|
|
l.lastUsage.CPU.Current = l.cpu.Current() * l.ncpu * 100
|
|
l.lastUsage.CPU.Average = l.cpu.Avg() * l.ncpu * 100
|
|
l.lastUsage.CPU.Max = l.cpu.Max() * l.ncpu * 100
|
|
l.lastUsage.CPU.IsThrottling = l.cpuThrottling
|
|
|
|
l.lastUsage.Memory.Current = l.memory.Current()
|
|
l.lastUsage.Memory.Average = l.memory.Avg()
|
|
l.lastUsage.Memory.Max = l.memory.Max()
|
|
|
|
l.lastUsage.GPU.Index = gindex
|
|
l.lastUsage.GPU.Memory.Current = l.gpu.memory.Current() * 100
|
|
l.lastUsage.GPU.Memory.Average = l.gpu.memory.Avg() * 100
|
|
l.lastUsage.GPU.Memory.Max = l.gpu.memory.Max() * 100
|
|
|
|
l.lastUsage.GPU.Usage.Current = l.gpu.usage.Current() * 100
|
|
l.lastUsage.GPU.Usage.Average = l.gpu.usage.Avg() * 100
|
|
l.lastUsage.GPU.Usage.Max = l.gpu.usage.Max() * 100
|
|
|
|
l.lastUsage.GPU.Encoder.Current = l.gpu.encoder.Current() * 100
|
|
l.lastUsage.GPU.Encoder.Average = l.gpu.encoder.Avg() * 100
|
|
l.lastUsage.GPU.Encoder.Max = l.gpu.encoder.Max() * 100
|
|
|
|
l.lastUsage.GPU.Decoder.Current = l.gpu.decoder.Current() * 100
|
|
l.lastUsage.GPU.Decoder.Average = l.gpu.decoder.Avg() * 100
|
|
l.lastUsage.GPU.Decoder.Max = l.gpu.decoder.Max() * 100
|
|
l.lastUsageLock.Unlock()
|
|
}
|
|
|
|
func (l *limiter) Limit(cpu, memory, gpu bool) error {
|
|
l.lock.Lock()
|
|
defer l.lock.Unlock()
|
|
|
|
if l.mode == LimitModeHard {
|
|
return nil
|
|
}
|
|
|
|
enabled, changed := l.cpu.DoLimit(cpu)
|
|
if enabled && changed {
|
|
l.logger.Debug().Log("CPU limiter enabled")
|
|
} else if !enabled && changed {
|
|
l.logger.Debug().Log("CPU limiter disabled")
|
|
}
|
|
|
|
enabled, changed = l.memory.DoLimit(memory)
|
|
if enabled && changed {
|
|
l.logger.Debug().Log("Memory limiter enabled")
|
|
} else if !enabled && changed {
|
|
l.logger.Debug().Log("Memory limiter disabled")
|
|
}
|
|
|
|
enabled, changed = l.gpu.memory.DoLimit(gpu)
|
|
if enabled && changed {
|
|
l.logger.Debug().Log("GPU limiter enabled")
|
|
} else if !enabled && changed {
|
|
l.logger.Debug().Log("GPU limiter disabled")
|
|
}
|
|
|
|
l.gpu.usage.DoLimit(gpu)
|
|
l.gpu.encoder.DoLimit(gpu)
|
|
l.gpu.decoder.DoLimit(gpu)
|
|
|
|
return nil
|
|
}
|
|
|
|
// limitCPU will limit the CPU usage of this process. The limit is the max. CPU usage
|
|
// normed to 0-1. The interval defines how long a time slot is that will be splitted
|
|
// into sleeping and working.
|
|
// Inspired by https://github.com/opsengine/cpulimit
|
|
func (l *limiter) limitCPU(ctx context.Context, limit float64, interval time.Duration) {
|
|
defer func() {
|
|
l.lock.Lock()
|
|
if l.proc != nil {
|
|
l.proc.Resume()
|
|
}
|
|
l.cpuThrottling = false
|
|
l.lock.Unlock()
|
|
|
|
l.logger.Debug().Log("CPU throttler disabled")
|
|
}()
|
|
|
|
var workingrate float64 = -1
|
|
var factorTopLimit float64 = 0
|
|
var topLimit float64 = 0
|
|
|
|
l.logger.Debug().WithField("limit", limit*l.ncpu).Log("CPU throttler enabled")
|
|
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
default:
|
|
}
|
|
|
|
l.lock.Lock()
|
|
|
|
if !l.cpu.IsLimitEnabled() {
|
|
if factorTopLimit > 0 {
|
|
factorTopLimit -= 10
|
|
} else {
|
|
if l.cpuThrottling {
|
|
if l.proc != nil {
|
|
l.proc.Resume()
|
|
}
|
|
l.cpuThrottling = false
|
|
}
|
|
l.lock.Unlock()
|
|
time.Sleep(100 * time.Millisecond)
|
|
continue
|
|
}
|
|
} else {
|
|
factorTopLimit = 100
|
|
topLimit = l.cpu.Top() - limit
|
|
l.cpuThrottling = true
|
|
}
|
|
|
|
lim := limit
|
|
|
|
if topLimit > 0 {
|
|
// After releasing the limiter, the process will not get the full CPU capacity back.
|
|
// Instead the limit will be gradually lifted by increments until it reaches the
|
|
// CPU top value. The CPU top value has to be larger than the actual limit.
|
|
lim += (100 - factorTopLimit) / 100 * topLimit
|
|
}
|
|
|
|
pcpu := l.cpu.Current()
|
|
|
|
l.lock.Unlock()
|
|
|
|
if workingrate < 0 {
|
|
workingrate = limit
|
|
}
|
|
// else {
|
|
// workingrate = math.Min(workingrate/pcpu*limit, 1)
|
|
//}
|
|
|
|
workingrate = lim
|
|
|
|
worktime := float64(interval.Nanoseconds()) * workingrate
|
|
sleeptime := float64(interval.Nanoseconds()) - worktime
|
|
|
|
l.logger.Debug().WithFields(log.Fields{
|
|
"limit": lim * l.ncpu,
|
|
"pcpu": pcpu,
|
|
"factor": factorTopLimit,
|
|
"worktime": (time.Duration(worktime) * time.Nanosecond).String(),
|
|
"sleeptime": (time.Duration(sleeptime) * time.Nanosecond).String(),
|
|
}).Log("Throttler")
|
|
|
|
l.lock.Lock()
|
|
if l.proc != nil {
|
|
l.proc.Resume()
|
|
}
|
|
l.lock.Unlock()
|
|
|
|
time.Sleep(time.Duration(worktime) * time.Nanosecond)
|
|
|
|
if sleeptime > 0 {
|
|
l.lock.Lock()
|
|
if l.proc != nil {
|
|
l.proc.Suspend()
|
|
}
|
|
l.lock.Unlock()
|
|
|
|
time.Sleep(time.Duration(sleeptime) * time.Nanosecond)
|
|
}
|
|
}
|
|
}
|
|
|
|
func (l *limiter) Usage() Usage {
|
|
l.lastUsageLock.RLock()
|
|
defer l.lastUsageLock.RUnlock()
|
|
|
|
return l.lastUsage
|
|
}
|
|
|
|
func (l *limiter) Mode() LimitMode {
|
|
return l.mode
|
|
}
|