core/resources/psutil/gpu/nvidia/nvidia.go

package nvidia

import (
"bytes"
"context"
"encoding/xml"
"fmt"
"os/exec"
"regexp"
"slices"
"strconv"
"sync"
"time"

"github.com/datarhei/core/v16/resources/psutil/gpu"
)
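
// Megabytes is a memory size reported by nvidia-smi as "<n> MiB". It is
// stored in bytes after unmarshalling.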
type Megabytes uint64
func (m *Megabytes) UnmarshalText(text []byte) error {
value := uint64(0)
_, err := fmt.Sscanf(string(text), "%d MiB", &value)
if err != nil {
return err
}
*m = Megabytes(value * 1024 * 1024)
return nil
}
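
// Utilization is a percentage (0-100) reported by nvidia-smi as "<n> %".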
type Utilization float64
func (u *Utilization) UnmarshalText(text []byte) error {
value := float64(0)
_, err := fmt.Sscanf(string(text), "%f %%", &value)
if err != nil {
return err
}
*u = Utilization(value)
return nil
}
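
// Process holds one per-process GPU usage sample as reported by nvidia-smi pmon.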
type Process struct {
Index int
PID int32
Memory uint64 // bytes
Usage float64 // percent 0-100
Encoder float64 // percent 0-100
Decoder float64 // percent 0-100
lastSeen time.Time
}
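
// GPUStats maps one <gpu> element of the XML report produced by "nvidia-smi -q -x".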
type GPUStats struct {
ID string `xml:"id,attr"`
Name string `xml:"product_name"`
Architecture string `xml:"product_architecture"`
MemoryTotal Megabytes `xml:"fb_memory_usage>total"`
MemoryUsed Megabytes `xml:"fb_memory_usage>used"`
Usage Utilization `xml:"utilization>gpu_util"`
UsageEncoder Utilization `xml:"utilization>encoder_util"`
UsageDecoder Utilization `xml:"utilization>decoder_util"`
}
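
// Stats is the XML report produced by "nvidia-smi -q -x", one entry per GPU.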
type Stats struct {
GPU []GPUStats `xml:"gpu"`
}
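
// nvidia implements the gpu.GPU interface by keeping nvidia-smi running in
// query and pmon mode and caching the most recently parsed results.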
type nvidia struct {
wrQuery *writerQuery
wrProcess *writerProcess
lock sync.RWMutex
cancel context.CancelFunc
stats Stats
process map[int32]Process
err error
}
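
// dummy is a no-op gpu.GPU implementation that is used when the nvidia-smi
// binary is not available. It reports no GPUs and no processes.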
type dummy struct{}
func (d *dummy) Count() (int, error) { return 0, nil }
func (d *dummy) Stats() ([]gpu.Stats, error) { return nil, nil }
func (d *dummy) Process(pid int32) (gpu.Process, error) { return gpu.Process{}, gpu.ErrProcessNotFound }
func (d *dummy) Close() {}
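
// writerQuery buffers the stdout of "nvidia-smi -q -x" and emits a Stats
// value on ch for every complete XML report found in the stream.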
type writerQuery struct {
buf bytes.Buffer
ch chan Stats
terminator []byte
}
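
// Write implements io.Writer. It appends data to the internal buffer and
// forwards every complete, parseable XML report on ch.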
func (w *writerQuery) Write(data []byte) (int, error) {
n, err := w.buf.Write(data)
if err != nil {
return n, err
}
for {
idx := bytes.Index(w.buf.Bytes(), w.terminator)
if idx == -1 {
break
}
content := make([]byte, idx+len(w.terminator))
m, err := w.buf.Read(content)
if err != nil || m != len(content) {
break
}
s, err := w.parse(content)
if err != nil {
continue
}
w.ch <- s
}
return n, nil
}
func (w *writerQuery) parse(data []byte) (Stats, error) {
nv := Stats{}
err := xml.Unmarshal(data, &nv)
if err != nil {
return nv, fmt.Errorf("parsing report: %w", err)
}
return nv, nil
}
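
// writerProcess buffers the stdout of "nvidia-smi pmon" and emits a Process
// value on ch for every successfully parsed output line.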
type writerProcess struct {
buf bytes.Buffer
ch chan Process
re *regexp.Regexp
terminator []byte
}
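
// Write implements io.Writer. It appends data to the internal buffer and
// parses it line by line, forwarding each process sample on ch.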
func (w *writerProcess) Write(data []byte) (int, error) {
n, err := w.buf.Write(data)
if err != nil {
return n, err
}
for {
idx := bytes.Index(w.buf.Bytes(), w.terminator)
if idx == -1 {
break
}
content := make([]byte, idx+len(w.terminator))
m, err := w.buf.Read(content)
if err != nil || m != len(content) {
break
}
s, err := w.parse(content)
if err != nil {
continue
}
w.ch <- s
}
return n, nil
}
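
// parse extracts one process sample from a single pmon output line. Header
// and comment lines (starting with '#') are rejected, and fields reported
// as '-' are left at their zero value.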
func (w *writerProcess) parse(data []byte) (Process, error) {
p := Process{}
if len(data) == 0 {
return p, fmt.Errorf("empty line")
}
if data[0] == '#' {
return p, fmt.Errorf("comment")
}
matches := w.re.FindStringSubmatch(string(data))
if matches == nil {
return p, fmt.Errorf("no matches found")
}
if len(matches) != 7 {
return p, fmt.Errorf("unexpected number of fields in pmon line")
}
if d, err := strconv.ParseInt(matches[1], 10, 0); err == nil {
p.Index = int(d)
}
if d, err := strconv.ParseInt(matches[2], 10, 32); err == nil {
p.PID = int32(d)
}
if matches[3][0] != '-' {
if d, err := strconv.ParseFloat(matches[3], 64); err == nil {
p.Usage = d
}
}
if matches[4][0] != '-' {
if d, err := strconv.ParseFloat(matches[4], 64); err == nil {
p.Encoder = d
}
}
if matches[5][0] != '-' {
if d, err := strconv.ParseFloat(matches[5], 64); err == nil {
p.Decoder = d
}
}
if d, err := strconv.ParseUint(matches[6], 10, 64); err == nil {
p.Memory = d * 1024 * 1024
}
return p, nil
}
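
// New returns a gpu.GPU that collects its data from the nvidia-smi binary at
// path. If path is empty, "nvidia-smi" is looked up in PATH. If the binary
// cannot be found, a dummy implementation reporting no GPUs is returned.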
func New(path string) gpu.GPU {
if len(path) == 0 {
path = "nvidia-smi"
}
path, err := exec.LookPath(path)
if err != nil {
return &dummy{}
}
n := &nvidia{
wrQuery: &writerQuery{
ch: make(chan Stats, 1),
terminator: []byte("</nvidia_smi_log>\n"),
},
wrProcess: &writerProcess{
ch: make(chan Process, 32),
// # gpu      pid  type    sm   mem   enc   dec     fb  command
// # Idx        #   C/G     %     %     %     %     MB  name
//     0     7372     C     2     0     2     -    136  ffmpeg
//     0    12176     C     5     2     3     7    782  ffmpeg
//     0    20035     C     8     2     4     1   1145  ffmpeg
//     0    20141     C     2     1     1     3    429  ffmpeg
//     0    29591     C     2     1     -     2    435  ffmpeg
re: regexp.MustCompile(`^\s*([0-9]+)\s+([0-9]+)\s+[A-Z]\s+([0-9-]+)\s+[0-9-]+\s+([0-9-]+)\s+([0-9-]+)\s+([0-9]+).*`),
terminator: []byte("\n"),
},
process: map[int32]Process{},
}
ctx, cancel := context.WithCancel(context.Background())
n.cancel = cancel
go n.runnerQuery(ctx, path)
go n.runnerProcess(ctx, path)
go n.reader(ctx)
return n
}
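
// reader consumes the parsed results from both writers: it keeps the latest
// GPU stats and the latest sample per process.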
func (n *nvidia) reader(ctx context.Context) {
for {
select {
case <-ctx.Done():
return
case stats := <-n.wrQuery.ch:
n.lock.Lock()
n.stats = stats
n.lock.Unlock()
case process := <-n.wrProcess.ch:
process.lastSeen = time.Now()
n.lock.Lock()
n.process[process.PID] = process
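// Drop processes that haven't been reported for more than two pmon
// sampling intervals (pmon runs with "-d 5", i.e. every 5 seconds).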
for pid, p := range n.process {
if time.Since(p.lastSeen) > 11*time.Second {
delete(n.process, pid)
}
}
n.lock.Unlock()
}
}
}
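
// runnerQuery keeps "nvidia-smi -q -x -l 1" running, restarting it when it
// exits, until the context is cancelled. Its output is streamed into wrQuery.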
func (n *nvidia) runnerQuery(ctx context.Context, path string) {
for {
cmd := exec.CommandContext(ctx, path, "-q", "-x", "-l", "1")
cmd.Stdout = n.wrQuery
err := cmd.Start()
if err != nil {
n.lock.Lock()
n.err = err
n.lock.Unlock()
// Retry after a delay, but stop if the context has been
// cancelled in the meantime (e.g. by Close).
select {
case <-ctx.Done():
return
case <-time.After(3 * time.Second):
}
continue
}
err = cmd.Wait()
n.lock.Lock()
n.err = err
n.lock.Unlock()
select {
case <-ctx.Done():
return
default:
}
}
}
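
// runnerProcess keeps "nvidia-smi pmon -s um -d 5" running, restarting it when
// it exits, until the context is cancelled. Its output is streamed into wrProcess.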
func (n *nvidia) runnerProcess(ctx context.Context, path string) {
for {
cmd := exec.CommandContext(ctx, path, "pmon", "-s", "um", "-d", "5")
cmd.Stdout = n.wrProcess
err := cmd.Start()
if err != nil {
n.lock.Lock()
n.err = err
n.lock.Unlock()
// Retry after a delay, but stop if the context has been
// cancelled in the meantime (e.g. by Close).
select {
case <-ctx.Done():
return
case <-time.After(3 * time.Second):
}
continue
}
err = cmd.Wait()
n.lock.Lock()
n.err = err
n.lock.Unlock()
select {
case <-ctx.Done():
return
default:
}
}
}
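
// Count returns the number of GPUs found in the most recent nvidia-smi report.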
func (n *nvidia) Count() (int, error) {
n.lock.RLock()
defer n.lock.RUnlock()
if n.err != nil {
return 0, n.err
}
return len(n.stats.GPU), nil
}
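
// Stats returns the most recent stats of all GPUs, including the processes
// currently running on each of them, sorted by PID.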
func (n *nvidia) Stats() ([]gpu.Stats, error) {
stats := []gpu.Stats{}
n.lock.RLock()
defer n.lock.RUnlock()
if n.err != nil {
return stats, n.err
}
for _, nv := range n.stats.GPU {
s := gpu.Stats{
ID: nv.ID,
Name: nv.Name,
Architecture: nv.Architecture,
MemoryTotal: uint64(nv.MemoryTotal),
MemoryUsed: uint64(nv.MemoryUsed),
Usage: float64(nv.Usage),
Encoder: float64(nv.UsageEncoder),
Decoder: float64(nv.UsageDecoder),
Process: []gpu.Process{},
}
stats = append(stats, s)
}
for _, p := range n.process {
if p.Index >= len(stats) {
continue
}
stats[p.Index].Process = append(stats[p.Index].Process, gpu.Process{
PID: p.PID,
Index: p.Index,
Memory: p.Memory,
Usage: p.Usage,
Encoder: p.Encoder,
Decoder: p.Decoder,
})
}
for i := range stats {
p := stats[i].Process
slices.SortFunc(p, func(a, b gpu.Process) int {
return int(a.PID - b.PID)
})
stats[i].Process = p
}
return stats, nil
}
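
// Process returns the GPU usage of the process with the given PID, or
// gpu.ErrProcessNotFound if the process is not currently using a GPU.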
func (n *nvidia) Process(pid int32) (gpu.Process, error) {
n.lock.RLock()
defer n.lock.RUnlock()
p, hasProcess := n.process[pid]
if hasProcess {
return gpu.Process{
PID: p.PID,
Index: p.Index,
Memory: p.Memory,
Usage: p.Usage,
Encoder: p.Encoder,
Decoder: p.Decoder,
}, nil
}
return gpu.Process{Index: -1}, gpu.ErrProcessNotFound
}
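
// Close stops the background nvidia-smi commands and the reader goroutine.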
func (n *nvidia) Close() {
n.lock.Lock()
defer n.lock.Unlock()
if n.cancel == nil {
return
}
n.cancel()
n.cancel = nil
}