【修改说明 Modification】修改算力切分日志过长问题

This commit is contained in:
yuncliu
2023-08-30 08:36:30 +00:00
committed by song-xiangbing
3 changed files with 10 additions and 130 deletions

View File

@@ -1 +1 @@
locate the output files
locate the output files

View File

@@ -61,7 +61,8 @@ func CreateVDevice(w WorkerInterface, spec *specs.Spec) (VDeviceInfo, error) {
vdeviceID, err := w.CreateVDevice(targetCardID, targetDeviceID, splitDevice)
if err != nil || vdeviceID < 0 {
return invalidVDevice, fmt.Errorf("cannot create vd or vdevice is wrong: %v %v", vdeviceID, err)
hwlog.RunLog.Errorf("cannot create vd or vdevice is wrong: %v %v", vdeviceID, err)
return invalidVDevice, err
}
return VDeviceInfo{CardID: targetCardID, DeviceID: targetDeviceID, VdeviceID: int32(vdeviceID)}, nil
}
@@ -70,11 +71,11 @@ func extractVpuParam(spec *specs.Spec) (int32, string, error) {
splitDevice, needSplit, visibleDeviceLine := "", false, ""
allowSplit := map[string]string{
"vir01": "vir01", "vir02": "vir02", "vir04": "vir04", "vir08": "vir08", "vir16": "vir16",
"vir02_1c": "vir02_1c", "vir03_1c_8g": "vir03_1c_8g", "vir04_3c": "vir04_3c",
"vir02_1c": "vir02_1c", "vir03_1c_8g": "vir03_1c_8g", "vir04_3c": "vir04_3c",
"vir04_4c_dvpp": "vir04_4c_dvpp", "vir04_3c_ndvpp": "vir04_3c_ndvpp",
"vir05_1c_8g": "vir05_1c_8g", "vir05_1c_16g": "vir05_1c_16g",
"vir06_1c_16g": "vir06_1c_16g", "vir10_3c_16g": "vir10_3c_16g",
"vir10_3c_16g_nm": "vir10_3c_16g_nm", "vir10_3c_32g": "vir10_3c_32g",
"vir06_1c_16g": "vir06_1c_16g", "vir10_3c_16g": "vir10_3c_16g",
"vir10_3c_16g_nm": "vir10_3c_16g_nm", "vir10_3c_32g": "vir10_3c_32g",
"vir10_4c_16g_m": "vir10_4c_16g_m", "vir12_3c_32g": "vir12_3c_32g",
}
@@ -101,8 +102,8 @@ func extractVpuParam(spec *specs.Spec) (int32, string, error) {
}
visibleDevice, err := strconv.Atoi(visibleDeviceLine)
if err != nil || visibleDevice < 0 || visibleDevice >= hiAIMaxCardNum*hiAIMaxDeviceNum {
return -1, "", fmt.Errorf("cannot parse param : %v %s", err, visibleDeviceLine)
hwlog.RunLog.Errorf("cannot parse param : %v %s", err, visibleDeviceLine)
return -1, "", err
}
return int32(visibleDevice), splitDevice, nil

View File

@@ -17,13 +17,10 @@ package main
import (
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"io/ioutil"
"log"
"net/http"
"os"
"os/exec"
"path"
@@ -37,7 +34,6 @@ import (
"github.com/containerd/containerd/oci"
"github.com/opencontainers/runtime-spec/specs-go"
"huawei.com/npu-exporter/v5/common-utils/hwlog"
"k8s.io/api/core/v1"
"main/dcmi"
"mindxcheckutils"
@@ -69,8 +65,6 @@ var (
hookDefaultFile = hookDefaultFilePath
dockerRuncName = dockerRuncFile
runcName = runcFile
notMatchError = errors.New("container not match pod or pod not has huawei.com/Ascend910 annotation")
)
const (
@@ -298,26 +292,6 @@ func removeDuplication(devices []int) []int {
return list
}
func parseAnnotationDevices(annotationDevices string) ([]int, error) {
devices := make([]int, 0)
for _, d := range strings.Split(annotationDevices, ",") {
borders := strings.Split(d, Ascend910+"-")
if len(borders) != borderNum || borders[0] != "" {
return nil, fmt.Errorf("invalid device range: %s", d)
}
deviceID, err := strconv.Atoi(borders[1])
if err != nil {
return nil, fmt.Errorf("invalid device ID: %s", d)
}
devices = append(devices, deviceID)
}
sort.Slice(devices, func(i, j int) bool { return i < j })
return removeDuplication(devices), nil
}
func parseDevices(visibleDevices string) ([]int, error) {
devices := make([]int, 0)
const maxDevice = 128
@@ -498,44 +472,6 @@ func addManagerDevice(spec *specs.Spec) error {
}
func addDevice(spec *specs.Spec) error {
// 获取对应pod annotation中的设备信息
annotationDevices, err := getDeviceFromPod(spec)
if err != nil && err != notMatchError {
// 报错不可直接返回,记录日志即可
hwlog.RunLog.Errorf("getDeviceFromPod failed: %#v", err)
}
// 如果没有匹配到pod或annotation则通过环境变量挂载设备
if annotationDevices == "" {
hwlog.RunLog.Info("add devices from env variable")
if err = addDeviceFromEnv(spec); err != nil {
return fmt.Errorf("failed to add device to env: %#v", err)
}
return nil
}
// 如果对应pod annotation中的设备信息存在则用这个信息挂载设备
devices, err := parseAnnotationDevices(annotationDevices)
if err != nil {
return fmt.Errorf("failed to parse device: %#v", err)
}
hwlog.RunLog.Infof("annotation devices is: %#v", devices)
deviceName := davinciName
for _, deviceId := range devices {
dPath := devicePath + deviceName + strconv.Itoa(deviceId)
if err = addDeviceToSpec(spec, dPath, deviceName); err != nil {
return fmt.Errorf("failed to add davinci device to spec: %#v", err)
}
}
if err = addManagerDevice(spec); err != nil {
return fmt.Errorf("failed to add Manager device to spec: %#v", err)
}
return nil
}
func addDeviceFromEnv(spec *specs.Spec) error {
visibleDevices := getValueByKey(spec.Process.Env, ascendVisibleDevices)
if visibleDevices == "" {
return nil
@@ -634,7 +570,8 @@ func modifySpecFile(path string) error {
}
if err = addHook(&spec); err != nil {
return fmt.Errorf("failed to inject hook: %v", err)
hwlog.RunLog.Errorf("failed to inject hook, err: %v", err)
return fmt.Errorf("failed to inject hook")
}
if err = addDevice(&spec); err != nil {
@@ -714,61 +651,3 @@ func main() {
log.Fatal(err)
}
}
func getDeviceFromPod(spec *specs.Spec) (string, error) {
// 只有apiserver会访问kubelet的https api接口所以使用apiserver的客户端证书证书需要从master节点拷贝到worker节点
certFile := "/etc/kubernetes/pki/apiserver-kubelet-client.crt"
keyFile := "/etc/kubernetes/pki/apiserver-kubelet-client.key"
kubeletUrl := "https://127.0.0.1:10250/"
podsUrlPath := "pods"
npu910CardName := "huawei.com/Ascend910"
cert, err := tls.LoadX509KeyPair(certFile, keyFile)
if err != nil {
hwlog.RunLog.Errorf("LoadX509KeyPair failed: %#v", err)
return "", err
}
// 构造带客户端证书的http客户端
client := &http.Client{
Transport: &http.Transport{
Proxy: nil, // 禁用代理
TLSClientConfig: &tls.Config{
Certificates: []tls.Certificate{cert},
InsecureSkipVerify: true, // kubelet 是自签名ca证书apiserver也未校验kubelet服务端证书所以这里不校验
},
},
}
// 向kubelet服务端请求获取pod list
resp, err := client.Get(kubeletUrl + podsUrlPath)
if err != nil {
hwlog.RunLog.Errorf("http get failed: %#v", err)
return "", err
}
defer resp.Body.Close()
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
hwlog.RunLog.Errorf("ReadAll resp.Body failed: %#v", err)
return "", err
}
hwlog.RunLog.Infof("get pod list success, resp.Status: %#v", resp.Status)
// 遍历pod list找到此容器
var podList v1.PodList
if err := json.Unmarshal(body, &podList); err != nil {
hwlog.RunLog.Errorf("unmarshal body failed: %#v", err)
return "", err
}
for _, pod := range podList.Items {
if pod.ObjectMeta.Name == spec.Hostname {
if value, ok := pod.ObjectMeta.Annotations[npu910CardName]; ok {
return value, nil
}
break
}
}
return "", notMatchError
}7