Files
core/resources/resources.go
2025-06-25 16:53:27 +02:00

741 lines
18 KiB
Go

package resources
import (
"context"
"fmt"
"os"
"sort"
"sync"
"time"
"github.com/datarhei/core/v16/log"
"github.com/datarhei/core/v16/resources/psutil"
"github.com/datarhei/core/v16/slices"
)
// Info is a snapshot of the current memory, CPU, and GPU usage
// as reported by psutil.
type Info struct {
	Mem MemoryInfo
	CPU CPUInfo
	GPU GPUInfo
}
// DiskInfo describes usage of the filesystem that contains a path.
type DiskInfo struct {
	Path        string // path the info was queried for
	Fstype      string // filesystem type
	Total       uint64 // bytes
	Used        uint64 // bytes
	InodesTotal uint64
	InodesUsed  uint64
}
// NetworkInfo holds cumulative traffic counters for one network interface.
type NetworkInfo struct {
	Name      string // interface name
	BytesSent uint64 // number of bytes sent
	BytesRecv uint64 // number of bytes received
}
// MemoryInfo describes system memory usage, the configured limit,
// and whether memory is currently being limited.
type MemoryInfo struct {
	Total      uint64 // bytes
	Available  uint64 // bytes
	Used       uint64 // bytes
	Limit      uint64 // bytes
	Core       uint64 // bytes, memory used by this process itself
	Throttling bool   // whether the memory limiter is currently engaged
	Error      error  // error from querying the memory stats, if any
}
// CPUInfo describes system CPU usage, the configured limit,
// and whether the CPU is currently being limited.
type CPUInfo struct {
	NCPU       float64 // number of cpus
	System     float64 // percent 0-100
	User       float64 // percent 0-100
	Idle       float64 // percent 0-100
	Other      float64 // percent 0-100
	Limit      float64 // percent 0-100
	Core       float64 // percent 0-100, CPU used by this process itself
	Throttling bool    // whether the CPU limiter is currently engaged
	Error      error   // error from querying the CPU stats, if any
}
// GPUInfo describes the usage of all detected GPUs.
type GPUInfo struct {
	NGPU  float64 // number of gpus
	GPU   []GPUInfoStat
	Error error // error from querying the GPU stats, if any
}
// GPUInfoStat describes usage and limits of a single GPU.
type GPUInfoStat struct {
	Index int    // index of the GPU, used as hwdevice
	ID    string
	Name  string

	// Memory
	MemoryTotal     uint64 // bytes
	MemoryUsed      uint64 // bytes
	MemoryAvailable uint64 // bytes
	MemoryLimit     uint64 // bytes

	// GPU
	Usage      float64 // percent 0-100
	Encoder    float64 // percent 0-100
	Decoder    float64 // percent 0-100
	UsageLimit float64 // percent 0-100
	Throttling bool    // whether this GPU's limiter is currently engaged
}
// Request describes the resources a caller wants to acquire.
// GPU fields are optional; a zero value means no GPU resources
// are requested for that dimension.
type Request struct {
	CPU        float64 // percent 0-100*ncpu
	Memory     uint64  // bytes
	GPUUsage   float64 // percent 0-100
	GPUEncoder float64 // percent 0-100
	GPUDecoder float64 // percent 0-100
	GPUMemory  uint64  // bytes
}
// Response is the result of a resource request. GPU is -1 if no
// GPU was requested or selected.
type Response struct {
	GPU int // GPU number, hwdevice
}
// resources implements the Resources interface. It samples system
// usage via psutil; an observer goroutine updates the per-resource
// limiting flags, which are guarded by lock.
type resources struct {
	psutil psutil.Util

	ncpu      float64 // number of logical CPUs
	maxCPU    float64 // percent 0-100*ncpu
	maxMemory uint64  // bytes

	ngpu         int     // number of GPUs detected at startup
	maxGPU       float64 // general usage, percent 0-100
	maxGPUMemory float64 // memory usage, percent 0-100

	isUnlimited bool // true if no limits at all were configured

	isCPULimiting    bool
	isMemoryLimiting bool
	isGPULimiting    []bool // one entry per GPU

	self           psutil.Process // process observer for this process itself
	cancelObserver context.CancelFunc

	lock     sync.RWMutex // guards the is...Limiting fields
	stopOnce sync.Once    // ensures Cancel tears down only once

	logger log.Logger
}
// Resources provides access to system resource usage and enforces
// configured CPU, memory, and GPU limits.
type Resources interface {
	// Cancel stops the resource observer. Safe to call multiple times.
	Cancel()

	// HasLimits returns whether any limits have been set.
	HasLimits() bool

	// Limits returns the CPU (percent 0-100), memory (bytes) limits, and GPU limits (usage and memory each in percent 0-100).
	Limits() (float64, uint64, float64, float64)

	// ShouldLimit returns whether cpu, memory, and/or GPU is currently limited.
	ShouldLimit() (bool, bool, []bool)

	// Request checks whether the requested resources are available.
	Request(req Request) (Response, error)

	// Info returns the current resource usage.
	Info() Info

	// Disk returns usage info for the filesystem containing path.
	Disk(path string) (*DiskInfo, error)

	// Network returns traffic counters for all network interfaces.
	Network() ([]NetworkInfo, error)

	// Process returns an observer for the process with the given PID.
	Process(pid int32) (Process, error)
}
// Config holds the limits for a new Resources instance. A value of 0
// for any Max... field means that resource is not limited.
type Config struct {
	MaxCPU       float64 // percent 0-100
	MaxMemory    float64 // percent 0-100
	MaxGPU       float64 // general,encoder,decoder usage, percent 0-100
	MaxGPUMemory float64 // memory usage, percent 0-100
	PSUtil       psutil.Util // optional; a default is created if nil
	Logger       log.Logger  // optional; a default is created if nil
}
// New creates a Resources instance according to config. If no psutil
// implementation is provided, a default one is created. The observer
// goroutine is started immediately and samples usage once per second;
// call Cancel() on the returned value to stop it.
//
// It returns an error if psutil cannot be initialized or if any
// Max... value is outside the range 0-100.
func New(config Config) (Resources, error) {
	if config.PSUtil == nil {
		psutil, err := psutil.New("", nil)
		if err != nil {
			return nil, fmt.Errorf("unable to initialize psutils: %w", err)
		}

		config.PSUtil = psutil
	}

	gpu, err := config.PSUtil.GPU()
	if err != nil {
		return nil, fmt.Errorf("unable to determine number of GPUs: %w", err)
	}

	// Without any GPU present, GPU limits are meaningless.
	if len(gpu) == 0 {
		config.MaxGPU = 0
		config.MaxGPUMemory = 0
	}

	// If no limit at all is configured, the observer becomes a no-op.
	isUnlimited := config.MaxCPU <= 0 && config.MaxMemory <= 0 && config.MaxGPU <= 0 && config.MaxGPUMemory <= 0

	// Unset limits default to 100%, i.e. effectively unlimited.
	if config.MaxCPU <= 0 {
		config.MaxCPU = 100
	}

	if config.MaxMemory <= 0 {
		config.MaxMemory = 100
	}

	if config.MaxGPU <= 0 {
		config.MaxGPU = 100
	}

	if config.MaxGPUMemory <= 0 {
		config.MaxGPUMemory = 100
	}

	if config.MaxCPU > 100 || config.MaxMemory > 100 || config.MaxGPU > 100 || config.MaxGPUMemory > 100 {
		return nil, fmt.Errorf("all Max... values must have a range of 0-100")
	}

	r := &resources{
		maxCPU:        config.MaxCPU,
		maxGPU:        config.MaxGPU,
		maxGPUMemory:  config.MaxGPUMemory,
		psutil:        config.PSUtil,
		isUnlimited:   isUnlimited,
		ngpu:          len(gpu),
		isGPULimiting: make([]bool, len(gpu)),
		logger:        config.Logger,
	}

	if r.logger == nil {
		r.logger = log.New("")
	}

	vmstat, err := r.psutil.Memory()
	if err != nil {
		return nil, fmt.Errorf("unable to determine available memory: %w", err)
	}

	ncpu, err := r.psutil.CPUCounts()
	if err != nil {
		return nil, fmt.Errorf("unable to determine number of logical CPUs: %w", err)
	}

	r.ncpu = ncpu

	// Internally the CPU limit is kept on the 0-100*ncpu scale and the
	// memory limit in absolute bytes, matching what psutil reports.
	r.maxCPU *= r.ncpu
	r.maxMemory = uint64(float64(vmstat.Total) * config.MaxMemory / 100)

	r.logger = r.logger.WithFields(log.Fields{
		"ncpu":           r.ncpu,
		"max_cpu":        r.maxCPU,
		"max_memory":     r.maxMemory,
		"ngpu":           len(gpu),
		"max_gpu":        r.maxGPU,
		"max_gpu_memory": r.maxGPUMemory,
	})

	r.self, err = r.psutil.Process(int32(os.Getpid()), false)
	if err != nil {
		return nil, fmt.Errorf("unable to create process observer for self: %w", err)
	}

	r.logger.Debug().Log("Created")

	ctx, cancel := context.WithCancel(context.Background())
	r.cancelObserver = cancel

	go r.observe(ctx, time.Second)

	// Note: r.stopOnce needs no explicit initialization; the zero value
	// of sync.Once is ready to use.

	r.logger.Info().Log("Started")

	return r, nil
}
// Cancel stops the observer goroutine and cancels the psutil
// collectors. It is safe to call multiple times; only the first call
// has an effect.
func (r *resources) Cancel() {
	r.stopOnce.Do(func() {
		// Stop the observer before tearing down the collectors it uses.
		r.cancelObserver()
		r.psutil.Cancel()
		r.self.Cancel()

		r.logger.Info().Log("Stopped")
	})
}
// observe periodically samples system CPU, memory, and GPU usage and
// updates the limiting flags with hysteresis: a limit engages when
// usage reaches the configured maximum and releases only once usage
// drops below it again. It runs until ctx is canceled.
func (r *resources) observe(ctx context.Context, interval time.Duration) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	r.logger.Debug().Log("Observer started")

	for {
		select {
		case <-ctx.Done():
			return
		case <-ticker.C:
			if r.isUnlimited {
				// If there aren't any limits imposed, don't do anything
				continue
			}

			cpustat, err := r.psutil.CPU()
			if err != nil {
				r.logger.Warn().WithError(err).Log("Failed to determine system CPU usage")
				continue
			}

			// Scale the combined load (percent 0-100) to the
			// 0-100*ncpu scale that maxCPU is kept on.
			cpuload := (cpustat.User + cpustat.System + cpustat.Other) * r.ncpu

			vmstat, err := r.psutil.Memory()
			if err != nil {
				r.logger.Warn().WithError(err).Log("Failed to determine system memory usage")
				continue
			}

			gpustat, err := r.psutil.GPU()
			if err != nil {
				r.logger.Warn().WithError(err).Log("Failed to determine GPU usage")
				continue
			}

			r.logger.Debug().WithFields(log.Fields{
				"cur_cpu":    cpuload,
				"cur_memory": vmstat.Used,
			}).Log("Observation")

			doCPULimit := false

			if !r.isCPULimiting {
				if cpuload >= r.maxCPU {
					r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit reached")
					doCPULimit = true
				}
			} else {
				doCPULimit = true
				if cpuload < r.maxCPU {
					r.logger.Debug().WithField("cpu", cpuload).Log("CPU limit released")
					doCPULimit = false
				}
			}

			doMemoryLimit := false

			if !r.isMemoryLimiting {
				if vmstat.Used >= r.maxMemory {
					r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit reached")
					doMemoryLimit = true
				}
			} else {
				doMemoryLimit = true
				if vmstat.Used < r.maxMemory {
					r.logger.Debug().WithField("memory", vmstat.Used).Log("Memory limit released")
					doMemoryLimit = false
				}
			}

			doGPULimit := make([]bool, r.ngpu)

			for i, limiting := range r.isGPULimiting {
				if i >= len(gpustat) {
					// Defensive: psutil reported fewer GPUs than were
					// present at startup; don't index out of range.
					break
				}

				maxMemory := uint64(r.maxGPUMemory * float64(gpustat[i].MemoryTotal) / 100)

				if !limiting {
					if gpustat[i].MemoryUsed >= maxMemory || (gpustat[i].Usage >= r.maxGPU && gpustat[i].Encoder >= r.maxGPU && gpustat[i].Decoder >= r.maxGPU) {
						doGPULimit[i] = true
					}
				} else {
					doGPULimit[i] = true
					if gpustat[i].MemoryUsed < maxMemory && (gpustat[i].Usage < r.maxGPU || gpustat[i].Encoder < r.maxGPU || gpustat[i].Decoder < r.maxGPU) {
						doGPULimit[i] = false
					}
				}
			}

			r.lock.Lock()

			if r.isCPULimiting != doCPULimit {
				r.logger.Warn().WithFields(log.Fields{
					"enabled": doCPULimit,
				}).Log("Limiting CPU")
			}
			r.isCPULimiting = doCPULimit

			if r.isMemoryLimiting != doMemoryLimit {
				r.logger.Warn().WithFields(log.Fields{
					"enabled": doMemoryLimit,
				}).Log("Limiting memory")
			}
			r.isMemoryLimiting = doMemoryLimit

			for i, limiting := range r.isGPULimiting {
				if limiting != doGPULimit[i] {
					r.logger.Warn().WithFields(log.Fields{
						// Log this GPU's flag, not the whole slice.
						"enabled": doGPULimit[i],
						"index":   i,
					}).Log("Limiting GPU")
				}
			}
			r.isGPULimiting = doGPULimit

			r.lock.Unlock()
		}
	}
}
// HasLimits returns whether any limits have been set.
func (r *resources) HasLimits() bool {
	if r.isUnlimited {
		return false
	}

	return true
}
// Limits returns the configured limits: CPU in percent (0-100),
// memory in bytes, and GPU usage and GPU memory each in percent
// (0-100). The internal CPU limit is kept on the 0-100*ncpu scale,
// so it is converted back to 0-100 here.
func (r *resources) Limits() (float64, uint64, float64, float64) {
	cpu := r.maxCPU / r.ncpu

	return cpu, r.maxMemory, r.maxGPU, r.maxGPUMemory
}
// ShouldLimit returns whether CPU, memory, and/or each GPU is
// currently being limited. The GPU slice is a copy so callers can't
// mutate internal state.
func (r *resources) ShouldLimit() (bool, bool, []bool) {
	r.lock.RLock()
	cpu := r.isCPULimiting
	memory := r.isMemoryLimiting
	gpu := slices.Copy(r.isGPULimiting)
	r.lock.RUnlock()

	return cpu, memory, gpu
}
// Request checks whether the requested resources are available and, if
// GPU resources are requested, selects the least loaded GPU that can
// accommodate them. It returns an error if resources are currently
// being limited, if the request is invalid, or if granting the request
// would exceed a limit. On success the returned Response carries the
// index of the selected GPU, or -1 if no GPU was requested.
func (r *resources) Request(req Request) (Response, error) {
	res := Response{
		GPU: -1,
	}

	r.lock.RLock()
	defer r.lock.RUnlock()

	logger := r.logger.WithFields(log.Fields{
		"req_cpu":         req.CPU,
		"req_memory":      req.Memory,
		"req_gpu":         req.GPUUsage,
		"req_gpu_encoder": req.GPUEncoder,
		"req_gpu_decoder": req.GPUDecoder,
		"req_gpu_memory":  req.GPUMemory,
	})

	logger.Debug().Log("Request for acquiring resources")

	// Check if anything is currently limiting.
	if r.isCPULimiting || r.isMemoryLimiting {
		logger.Debug().Log("Rejected, currently limiting")
		// Fixed typo in the error message ("currenlty").
		return res, fmt.Errorf("resources are currently actively limited")
	}

	// Check if the requested resources are valid.
	if req.CPU <= 0 || req.Memory == 0 {
		logger.Debug().Log("Rejected, invalid values")
		return res, fmt.Errorf("the cpu and/or memory values are invalid. values > 0 are required: cpu=%f, memory=%d", req.CPU, req.Memory)
	}

	// Get current CPU and memory values.
	cpustat, err := r.psutil.CPU()
	if err != nil {
		r.logger.Warn().WithError(err).Log("Failed to determine system CPU usage")
		return res, fmt.Errorf("the system CPU usage couldn't be determined")
	}

	cpuload := (cpustat.User + cpustat.System + cpustat.Other) * r.ncpu

	vmstat, err := r.psutil.Memory()
	if err != nil {
		r.logger.Warn().WithError(err).Log("Failed to determine system memory usage")
		return res, fmt.Errorf("the system memory usage couldn't be determined")
	}

	// Check if enough resources are available
	if cpuload+req.CPU > r.maxCPU {
		logger.Debug().WithField("cur_cpu", cpuload).Log("Rejected, CPU limit exceeded")
		return res, fmt.Errorf("the CPU limit would be exceeded: %f + %f > %f", cpuload, req.CPU, r.maxCPU)
	}

	if vmstat.Used+req.Memory > r.maxMemory {
		logger.Debug().WithField("cur_memory", vmstat.Used).Log("Rejected, memory limit exceeded")
		return res, fmt.Errorf("the memory limit would be exceeded: %d + %d > %d", vmstat.Used, req.Memory, r.maxMemory)
	}

	// Check if any GPU resources are requested
	if req.GPUUsage > 0 || req.GPUEncoder > 0 || req.GPUDecoder > 0 || req.GPUMemory > 0 {
		if req.GPUUsage < 0 || req.GPUEncoder < 0 || req.GPUDecoder < 0 || req.GPUMemory == 0 {
			logger.Debug().Log("Rejected, invalid values")
			return res, fmt.Errorf("the gpu usage and memory values are invalid: usage=%f, encoder=%f, decoder=%f, memory=%d", req.GPUUsage, req.GPUEncoder, req.GPUDecoder, req.GPUMemory)
		}

		// Get current GPU values
		gpustat, err := r.psutil.GPU()
		if err != nil {
			r.logger.Warn().WithError(err).Log("Failed to determine GPU usage")
			return res, fmt.Errorf("the GPU usage couldn't be determined")
		}

		if len(gpustat) == 0 {
			// err is nil here, so there's no error to attach.
			r.logger.Debug().Log("GPU resources requested but no GPU available")
			return res, fmt.Errorf("some GPU resources requested but no GPU available")
		}

		// Collect all GPUs that can accommodate the request.
		fittingGPU := []psutil.GPUInfo{}

		for _, g := range gpustat {
			if req.GPUUsage > 0 && g.Usage+req.GPUUsage > r.maxGPU {
				logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu": g.Usage}).Log("Rejected, GPU usage limit exceeded")
				continue
			}

			// Fixed: log the encoder load, not the general usage.
			if req.GPUEncoder > 0 && g.Encoder+req.GPUEncoder > r.maxGPU {
				logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_encoder": g.Encoder}).Log("Rejected, GPU encoder usage limit exceeded")
				continue
			}

			// Fixed: log the decoder load, not the general usage.
			if req.GPUDecoder > 0 && g.Decoder+req.GPUDecoder > r.maxGPU {
				logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_decoder": g.Decoder}).Log("Rejected, GPU decoder usage limit exceeded")
				continue
			}

			// Memory is compared in percent of the GPU's total memory.
			gpuMemoryUsage := float64(g.MemoryUsed) / float64(g.MemoryTotal) * 100
			requestedGPUMemoryUsage := float64(req.GPUMemory) / float64(g.MemoryTotal) * 100

			if gpuMemoryUsage+requestedGPUMemoryUsage > r.maxGPUMemory {
				logger.Debug().WithFields(log.Fields{"id": g.Index, "cur_gpu_memory": gpuMemoryUsage}).Log("Rejected, GPU memory usage limit exceeded")
				continue
			}

			fittingGPU = append(fittingGPU, g)
		}

		if len(fittingGPU) == 0 {
			return res, fmt.Errorf("all GPU usage limits are exceeded")
		}

		// Pick the GPU with the lowest combined load.
		sort.SliceStable(fittingGPU, func(a, b int) bool {
			loadA := fittingGPU[a].Usage + fittingGPU[a].Encoder + fittingGPU[a].Decoder
			loadB := fittingGPU[b].Usage + fittingGPU[b].Encoder + fittingGPU[b].Decoder

			return loadA < loadB
		})

		foundGPU := fittingGPU[0]

		// Fixed: attach fields to the logger (was logger.Debug().WithFields,
		// which pinned the logger to an event level prematurely).
		logger = logger.WithFields(log.Fields{
			"cur_gpu":         foundGPU.Index,
			"cur_gpu_general": foundGPU.Usage,
			"cur_gpu_encoder": foundGPU.Encoder,
			"cur_gpu_decoder": foundGPU.Decoder,
			"cur_gpu_memory":  float64(foundGPU.MemoryUsed) / float64(foundGPU.MemoryTotal) * 100,
		})

		res.GPU = foundGPU.Index
	}

	logger.Debug().WithFields(log.Fields{
		"cur_cpu":    cpuload,
		"cur_memory": vmstat.Used,
	}).Log("Acquiring approved")

	return res, nil
}
// Info returns the current resource usage of the system and of this
// process, together with the configured limits and the current
// limiting state.
func (r *resources) Info() Info {
	cpulimit, memlimit, gpulimit, gpumemlimit := r.Limits()
	cputhrottling, memthrottling, gputhrottling := r.ShouldLimit()

	cpustat, cpuerr := r.psutil.CPU()
	memstat, memerr := r.psutil.Memory()
	gpustat, gpuerr := r.psutil.GPU()

	// Errors from self-observation are ignored; the zero values are used.
	selfcpu, _ := r.self.CPU()
	selfmem, _ := r.self.Memory()

	var gpus []GPUInfoStat

	for i, g := range gpustat {
		stat := GPUInfoStat{
			Index:           g.Index,
			ID:              g.ID,
			Name:            g.Name,
			MemoryTotal:     g.MemoryTotal,
			MemoryUsed:      g.MemoryUsed,
			MemoryAvailable: g.MemoryTotal - g.MemoryUsed,
			MemoryLimit:     uint64(float64(g.MemoryTotal) * gpumemlimit / 100),
			Usage:           g.Usage,
			Encoder:         g.Encoder,
			Decoder:         g.Decoder,
			UsageLimit:      gpulimit,
		}

		// The throttling slice is sized at startup and may be shorter
		// than the currently reported GPU list.
		if i < len(gputhrottling) {
			stat.Throttling = gputhrottling[i]
		}

		gpus = append(gpus, stat)
	}

	return Info{
		CPU: CPUInfo{
			NCPU:       r.ncpu,
			System:     cpustat.System,
			User:       cpustat.User,
			Idle:       cpustat.Idle,
			Other:      cpustat.Other,
			Limit:      cpulimit,
			Core:       selfcpu.System + selfcpu.User + selfcpu.Other,
			Throttling: cputhrottling,
			Error:      cpuerr,
		},
		Mem: MemoryInfo{
			Total:      memstat.Total,
			Available:  memstat.Available,
			Used:       memstat.Used,
			Limit:      memlimit,
			Core:       selfmem,
			Throttling: memthrottling,
			Error:      memerr,
		},
		GPU: GPUInfo{
			NGPU:  float64(len(gpustat)),
			GPU:   gpus,
			Error: gpuerr,
		},
	}
}
// Disk returns usage information for the filesystem that contains path.
func (r *resources) Disk(path string) (*DiskInfo, error) {
	stat, err := r.psutil.Disk(path)
	if err != nil {
		return nil, err
	}

	return &DiskInfo{
		Path:        stat.Path,
		Fstype:      stat.Fstype,
		Total:       stat.Total,
		Used:        stat.Used,
		InodesTotal: stat.InodesTotal,
		InodesUsed:  stat.InodesUsed,
	}, nil
}
// Network returns cumulative traffic counters for all network
// interfaces. On success the result is never nil, but may be empty.
func (r *resources) Network() ([]NetworkInfo, error) {
	stats, err := r.psutil.Network()
	if err != nil {
		return nil, err
	}

	info := make([]NetworkInfo, 0, len(stats))

	for _, stat := range stats {
		info = append(info, NetworkInfo{
			Name:      stat.Name,
			BytesSent: stat.BytesSent,
			BytesRecv: stat.BytesRecv,
		})
	}

	return info, nil
}
// Process returns an observer for the process with the given PID,
// including its GPU usage.
func (r *resources) Process(pid int32) (Process, error) {
	observed, err := r.psutil.Process(pid, true)
	if err != nil {
		return nil, err
	}

	return &process{proc: observed}, nil
}
// Process provides access to the resource usage of a single process
// and allows suspending and resuming it.
type Process interface {
	// Info returns the current CPU, memory, and GPU usage of the process.
	Info() (ProcessInfo, error)

	// Cancel will stop collecting CPU and memory data for this process.
	Cancel()

	// Suspend will send SIGSTOP to the process.
	Suspend() error

	// Resume will send SIGCONT to the process.
	Resume() error
}
// process implements the Process interface by delegating to a
// psutil process observer.
type process struct {
	proc psutil.Process
}
// ProcessInfoCPU describes the CPU usage of a single process.
type ProcessInfoCPU struct {
	System float64 // percent 0-100
	User   float64 // percent 0-100
	Idle   float64 // percent 0-100
	Other  float64 // percent 0-100
}
// ProcessInfoGPU describes the GPU usage of a single process.
type ProcessInfoGPU struct {
	Index      int    // Index of the GPU
	MemoryUsed uint64 // bytes
	Usage      float64 // percent 0-100
	Encoder    float64 // percent 0-100
	Decoder    float64 // percent 0-100
}
// ProcessInfo is a snapshot of the resource usage of a single process.
type ProcessInfo struct {
	CPU    ProcessInfoCPU
	Memory uint64 // bytes
	GPU    ProcessInfoGPU
}
// Info returns the current CPU, memory, and GPU usage of the observed
// process. It returns an error (and a zero ProcessInfo) if any of the
// underlying psutil queries fails.
func (p *process) Info() (ProcessInfo, error) {
	cpustat, err := p.proc.CPU()
	if err != nil {
		return ProcessInfo{}, err
	}

	memstat, err := p.proc.Memory()
	if err != nil {
		return ProcessInfo{}, err
	}

	gpustat, err := p.proc.GPU()
	if err != nil {
		return ProcessInfo{}, err
	}

	return ProcessInfo{
		CPU: ProcessInfoCPU{
			System: cpustat.System,
			User:   cpustat.User,
			Idle:   cpustat.Idle,
			Other:  cpustat.Other,
		},
		Memory: memstat,
		GPU: ProcessInfoGPU{
			Index:      gpustat.Index,
			MemoryUsed: gpustat.MemoryUsed,
			Usage:      gpustat.Usage,
			Encoder:    gpustat.Encoder,
			Decoder:    gpustat.Decoder,
		},
	}, nil
}
// Cancel stops collecting CPU and memory data for this process.
func (p *process) Cancel() {
	p.proc.Cancel()
}
// Suspend sends SIGSTOP to the process.
func (p *process) Suspend() error {
	return p.proc.Suspend()
}
// Resume sends SIGCONT to the process.
func (p *process) Resume() error {
	return p.proc.Resume()
}