Files
core/psutil/gpu/nvidia/nvidia.go
2024-07-16 08:14:19 +02:00

285 lines
4.8 KiB
Go

package nvidia
import (
"bytes"
"context"
"encoding/xml"
"fmt"
"os/exec"
"sync"
"time"
"github.com/datarhei/core/v16/psutil/gpu"
)
// Default is the package-level GPU instance. It is assigned in init by
// calling New with an empty path, which makes New fall back to "nvidia-smi".
var Default gpu.GPU
// init populates Default when the package is loaded.
func init() {
Default = New("")
}
// Megabytes is a memory size decoded from text of the form "<n> MiB".
// Despite its name, the stored value is the size in bytes (n * 1024 * 1024).
type Megabytes uint64

// UnmarshalText parses text such as "1024 MiB" and stores the equivalent
// number of bytes in m. If the text does not match that pattern the scan
// error is returned and m is left untouched.
func (m *Megabytes) UnmarshalText(text []byte) error {
	const bytesPerMiB = 1024 * 1024

	var mib uint64
	if _, err := fmt.Sscanf(string(text), "%d MiB", &mib); err != nil {
		return err
	}

	*m = Megabytes(mib * bytesPerMiB)
	return nil
}
// Utilization is a percentage decoded from text of the form "<n> %".
type Utilization float64

// UnmarshalText parses text such as "45 %" and stores the numeric part in u.
// If the text does not match that pattern the scan error is returned and u
// is left untouched.
func (u *Utilization) UnmarshalText(text []byte) error {
	var percent float64
	if _, err := fmt.Sscanf(string(text), "%f %%", &percent); err != nil {
		return err
	}

	*u = Utilization(percent)
	return nil
}
// Process is one entry of a GPU's process list as decoded from the XML report.
type Process struct {
// PID is decoded from the <pid> element.
PID int32 `xml:"pid"`
// Memory is decoded from the <used_memory> element; Megabytes.UnmarshalText
// stores it as a byte count.
Memory Megabytes `xml:"used_memory"`
}
// GPUStats holds the values decoded from one <gpu> element of the XML report.
// Each field is filled from the element path given in its xml tag.
type GPUStats struct {
Name string `xml:"product_name"`
Architecture string `xml:"product_architecture"`
// Memory figures are decoded by Megabytes.UnmarshalText, i.e. stored as bytes.
MemoryTotal Megabytes `xml:"fb_memory_usage>total"`
MemoryUsed Megabytes `xml:"fb_memory_usage>used"`
// Utilization figures are decoded by Utilization.UnmarshalText from "<n> %" text.
Usage Utilization `xml:"utilization>gpu_util"`
MemoryUsage Utilization `xml:"utilization>memory_util"`
EncoderUsage Utilization `xml:"utilization>encoder_util"`
DecoderUsage Utilization `xml:"utilization>decoder_util"`
// Process lists every <process_info> entry found under <processes>.
Process []Process `xml:"processes>process_info"`
}
// Stats is the decoded form of one XML report: one GPUStats per <gpu> element.
type Stats struct {
GPU []GPUStats `xml:"gpu"`
}
// parse decodes one XML report into a Stats value. On failure the decode
// error is wrapped with the prefix "parsing report" and returned together
// with whatever had been decoded so far.
func parse(data []byte) (Stats, error) {
	var report Stats
	if err := xml.Unmarshal(data, &report); err != nil {
		return report, fmt.Errorf("parsing report: %w", err)
	}
	return report, nil
}
// nvidia collects GPU statistics from a long-running command whose output is
// fed through wr and stored by the reader goroutine.
type nvidia struct {
// cmd is the command most recently created by runner.
cmd *exec.Cmd
// wr is installed as the command's stdout and delivers parsed reports on wr.ch.
wr *writer
// lock is held around accesses to stats, process and err, and for the
// duration of Close.
lock sync.RWMutex
// cancel cancels the context handed to runner and reader; Close sets it to
// nil after calling it.
cancel context.CancelFunc
// stats is the latest report received by reader.
stats Stats
// process indexes the processes of the latest report by PID.
process map[int32]Process
// err is the last error recorded by runner from starting or waiting on cmd.
err error
}
// dummy is the implementation New returns when the nvidia-smi binary cannot
// be located. It reports no GPUs and knows no processes.
type dummy struct{}

// Count always reports zero GPUs and no error.
func (d *dummy) Count() (int, error) {
	return 0, nil
}

// Stats always returns a nil slice and no error.
func (d *dummy) Stats() ([]gpu.Stats, error) {
	return nil, nil
}

// Process always fails with gpu.ErrProcessNotFound.
func (d *dummy) Process(pid int32) (gpu.Process, error) {
	var none gpu.Process
	return none, gpu.ErrProcessNotFound
}
// writer is an io.Writer that accumulates written bytes, cuts them into
// complete reports and publishes each successfully parsed report on ch.
type writer struct {
// buf holds bytes that have been written but not yet consumed as a report.
buf bytes.Buffer
// ch receives one Stats value per successfully parsed report.
ch chan Stats
}
// terminator is the closing tag (including the trailing newline) that
// writer.Write searches for to find the end of one report in the buffer.
var terminator = []byte("</nvidia_smi_log>\n")
// Write appends data to the internal buffer and then extracts every complete
// report (everything up to and including terminator) that the buffer holds.
// Each report that parses successfully is sent on w.ch; reports that fail to
// parse are dropped. The send blocks until the channel has room.
//
// The returned count is the number of bytes of data accepted by the buffer.
func (w *writer) Write(data []byte) (int, error) {
	written, err := w.buf.Write(data)
	if err != nil {
		return written, err
	}

	// nextEnd reports where the next terminator starts, or -1 if the buffer
	// does not yet contain a complete report.
	nextEnd := func() int {
		return bytes.Index(w.buf.Bytes(), terminator)
	}

	for end := nextEnd(); end != -1; end = nextEnd() {
		report := make([]byte, end+len(terminator))

		// Reading consumes the report from the buffer, so a report that
		// fails to parse is not seen again on the next iteration.
		if got, readErr := w.buf.Read(report); readErr != nil || got != len(report) {
			break
		}

		if stats, parseErr := parse(report); parseErr == nil {
			w.ch <- stats
		}
	}

	return written, nil
}
// New returns a GPU implementation backed by the nvidia-smi binary at path.
// An empty path means "nvidia-smi", resolved via exec.LookPath. If the binary
// cannot be found, a dummy implementation that reports no GPUs is returned.
//
// Otherwise two goroutines are started: runner, which keeps the command
// running, and reader, which stores the reports it produces.
func New(path string) gpu.GPU {
	if path == "" {
		path = "nvidia-smi"
	}

	if _, err := exec.LookPath(path); err != nil {
		return &dummy{}
	}

	ctx, cancel := context.WithCancel(context.Background())

	n := &nvidia{
		wr:      &writer{ch: make(chan Stats, 1)},
		process: map[int32]Process{},
		cancel:  cancel,
	}

	go n.runner(ctx, path)
	go n.reader(ctx)

	return n
}
// reader receives parsed reports from n.wr.ch until ctx is done. For every
// report it replaces n.stats and rebuilds the PID index n.process, both
// under the write lock.
func (n *nvidia) reader(ctx context.Context) {
	for {
		var report Stats

		select {
		case <-ctx.Done():
			return
		case report = <-n.wr.ch:
		}

		// Build the index before taking the lock; it is only published below.
		byPID := map[int32]Process{}
		for _, g := range report.GPU {
			for _, proc := range g.Process {
				byPID[proc.PID] = proc
			}
		}

		n.lock.Lock()
		n.stats = report
		n.process = byPID
		n.lock.Unlock()
	}
}
// runner keeps the command at path running with the arguments
// "-q -x -l 1" and its stdout connected to n.wr, restarting it whenever it
// fails to start or exits, until ctx is cancelled.
//
// The command is bound to ctx, so cancelling ctx also terminates a running
// process. The last start or wait error is stored in n.err; it is cleared
// again as soon as a start succeeds so that a past failure does not mask a
// healthy process. After a failure the next attempt is delayed, and the
// delay is abandoned immediately when ctx is cancelled.
func (n *nvidia) runner(ctx context.Context, path string) {
	const retryDelay = 3 * time.Second

	for ctx.Err() == nil {
		cmd := exec.CommandContext(ctx, path, "-q", "-x", "-l", "1")
		cmd.Stdout = n.wr

		err := cmd.Start()
		if err == nil {
			// Publish the command only once it has a process, and under the
			// lock, because Close reads n.cmd concurrently.
			n.lock.Lock()
			n.cmd = cmd
			n.err = nil
			n.lock.Unlock()

			err = cmd.Wait()
		}

		n.lock.Lock()
		n.err = err
		n.lock.Unlock()

		if err == nil {
			// Clean exit: restart right away.
			continue
		}

		// Back off before retrying so a command that cannot start or dies
		// immediately is not respawned in a tight loop.
		select {
		case <-ctx.Done():
			return
		case <-time.After(retryDelay):
		}
	}
}
// Count returns the number of GPUs in the most recent report. If an error
// has been recorded for the underlying command, it returns 0 and that error.
func (n *nvidia) Count() (int, error) {
	n.lock.RLock()
	lastErr, gpus := n.err, len(n.stats.GPU)
	n.lock.RUnlock()

	if lastErr != nil {
		return 0, lastErr
	}
	return gpus, nil
}
// Stats converts the most recent report into one gpu.Stats per GPU. If an
// error has been recorded for the underlying command, it returns an empty
// (non-nil) slice and that error. Both the result and each Process list are
// always non-nil, even when empty.
func (n *nvidia) Stats() ([]gpu.Stats, error) {
	n.lock.RLock()
	defer n.lock.RUnlock()

	if n.err != nil {
		return []gpu.Stats{}, n.err
	}

	result := make([]gpu.Stats, 0, len(n.stats.GPU))

	for i := range n.stats.GPU {
		src := &n.stats.GPU[i]

		procs := make([]gpu.Process, 0, len(src.Process))
		for _, proc := range src.Process {
			procs = append(procs, gpu.Process{
				PID:    proc.PID,
				Memory: uint64(proc.Memory),
			})
		}

		result = append(result, gpu.Stats{
			Name:         src.Name,
			Architecture: src.Architecture,
			MemoryTotal:  uint64(src.MemoryTotal),
			MemoryUsed:   uint64(src.MemoryUsed),
			Usage:        float64(src.Usage),
			MemoryUsage:  float64(src.MemoryUsage),
			EncoderUsage: float64(src.EncoderUsage),
			DecoderUsage: float64(src.DecoderUsage),
			Process:      procs,
		})
	}

	return result, nil
}
// Process looks up pid in the index built from the most recent report and
// returns its PID and memory use. It returns gpu.ErrProcessNotFound when the
// report lists no such process.
func (n *nvidia) Process(pid int32) (gpu.Process, error) {
	n.lock.RLock()
	entry, found := n.process[pid]
	n.lock.RUnlock()

	if found {
		return gpu.Process{
			PID:    entry.PID,
			Memory: uint64(entry.Memory),
		}, nil
	}

	return gpu.Process{}, gpu.ErrProcessNotFound
}
// Close cancels the context shared by the runner and reader goroutines and
// kills the current command's process, if there is one. Calling Close more
// than once is a no-op after the first call.
func (n *nvidia) Close() {
	n.lock.Lock()
	defer n.lock.Unlock()

	if n.cancel == nil {
		return
	}

	n.cancel()
	n.cancel = nil

	// n.cmd is nil until runner has created a command, and its Process is nil
	// when the command failed to start; either would otherwise panic here.
	if n.cmd == nil || n.cmd.Process == nil {
		return
	}

	// Best effort: the process may already have exited, in which case Kill
	// reports an error that is of no use to the caller.
	_ = n.cmd.Process.Kill()
}