Match-id-0715a80f21cb04d4d0f05436333a490c2ddc2164

This commit is contained in:
BianTanggui
2023-03-29 16:43:29 +08:00
parent a79a99f98f
commit d541d430e2
15 changed files with 182 additions and 102 deletions

View File

@@ -4,9 +4,7 @@
/etc/ld.so.conf.d/mind_so.conf
/etc/slog.conf
/etc/hdcBasic.cfg
/etc/ascend_install.info
/var/dmp_daemon
/var/davinci/driver/version.info
/var/slogd
/usr/lib64/libsemanage.so.2
/usr/lib64/libmmpa.so
@@ -14,19 +12,6 @@
/usr/lib64/libdrvdsmi.so
/usr/lib64/libdcmi.so
/usr/lib64/libstackcore.so
/usr/lib64/libascend_drvdmp.so
/usr/lib64/libdsmiproduct.so
/usr/lib64/libiam.so.1
/usr/lib64/libascend_drv_baselib.so
/usr/lib64/libascend_hal.so
/usr/lib64/libslog.so
/usr/lib64/libc_sec.so
/usr/lib64/libascend_drvupgrade.so
/usr/lib64/libeasy_comm.so.1
/usr/lib64/libfault_event.so.1
/usr/lib64/libheartbeat.so
/usr/lib64/libxshmem.so.1
/usr/lib64/libdevmmap.so
/usr/lib64/libmpi_dvpp_adapter.so
/usr/lib64/libaicpu_scheduler.so
/usr/lib64/libaicpu_processer.so
@@ -34,5 +19,5 @@
/usr/lib64/libaicpu_sharder.so
/usr/lib64/libadump.so
/usr/lib64/libtsd_eventclient.so
/usr/lib64/libmsprof.so
/root/hdc_ppc
/usr/lib64/aicpu_kernels
/usr/lib64/libyaml-0.so.2

View File

@@ -188,18 +188,13 @@ static bool CheckWhiteList(const char* fileName)
{"/usr/local/Ascend/driver/include"}, {"/usr/local/dcmi"}, {"/usr/local/bin/npu-smi"},
{"/home/data/miniD/driver/lib64"}, {"/usr/local/sbin/npu-smi"},
{"/usr/local/Ascend/driver/tools"}, {"/etc/hdcBasic.cfg"}, {"/etc/sys_version.conf"},
{"/etc/ld.so.conf.d/mind_so.conf"}, {"/etc/slog.conf"}, {"/etc/ascend_install.info"},
{"/var/dmp_daemon"}, {"/var/davinci/driver/version.info"}, {"/var/slogd"},
{"/etc/ld.so.conf.d/mind_so.conf"}, {"/etc/slog.conf"}, {"/var/dmp_daemon"}, {"/var/slogd"},
{"/usr/lib64/libsemanage.so.2"}, {"/usr/lib64/libmmpa.so"}, {"/usr/lib64/libcrypto.so.1.1"},
{"/usr/lib64/libdrvdsmi.so"}, {"/usr/lib64/libdcmi.so"}, {"/usr/lib64/libstackcore.so"},
{"/usr/lib64/libascend_drvdmp.so"}, {"/usr/lib64/libdsmiproduct.so"}, {"/usr/lib64/libiam.so.1"},
{"/usr/lib64/libascend_drv_baselib.so"}, {"/usr/lib64/libascend_hal.so"}, {"/usr/lib64/libslog.so"},
{"/usr/lib64/libc_sec.so"}, {"/usr/lib64/libascend_drvupgrade.so"}, {"/usr/lib64/libeasy_comm.so.1"},
{"/usr/lib64/libfault_event.so.1"}, {"/usr/lib64/libheartbeat.so"}, {"/usr/lib64/libxshmem.so.1"},
{"/usr/lib64/libdevmmap.so"}, {"/usr/lib64/libmpi_dvpp_adapter.so"}, {"/usr/lib64/libaicpu_scheduler.so"},
{"/usr/lib64/libmpi_dvpp_adapter.so"}, {"/usr/lib64/libaicpu_scheduler.so"},
{"/usr/lib64/libaicpu_processer.so"}, {"/usr/lib64/libaicpu_prof.so"}, {"/usr/lib64/libaicpu_sharder.so"},
{"/usr/lib64/libadump.so"}, {"/usr/lib64/libtsd_eventclient.so"}, {"/usr/lib64/libmsprof.so"},
{"/usr/lib64/aicpu_kernels"}, {"/root/hdc_ppc"}
{"/usr/lib64/libadump.so"}, {"/usr/lib64/libtsd_eventclient.so"},
{"/usr/lib64/aicpu_kernels"}, {"/usr/lib64/libyaml-0.so.2"}
};
for (size_t iLoop = 0; iLoop < WHITE_LIST_NUM; iLoop++) {

View File

@@ -357,7 +357,7 @@ bool GetFileSubsetAndCheck(const char *basePath, const size_t basePathLen)
return ShowExceptionInfo("Strcat failed!");
}
if (ptr->d_type == DT_REG) { // 文件
const size_t maxFileSzieMb = 10; // max 10 MB
const size_t maxFileSzieMb = 150; // max 150 MB
if (!CheckFileSubset(base, strlen(base), maxFileSzieMb)) {
return false;
}

View File

@@ -5,7 +5,7 @@ go 1.17
require (
github.com/opencontainers/runtime-spec v1.0.3-0.20220718201635-a8106e99982b
github.com/prashantv/gostub v1.1.0
huawei.com/npu-exporter/v3 v3.0.0
huawei.com/npu-exporter/v5 v5.0.0-rc1.1
mindxcheckutils v1.0.0
)
@@ -18,6 +18,6 @@ require (
)
replace (
huawei.com/npu-exporter/v3 => gitee.com/ascend/ascend-npu-exporter/v3 v3.0.0
huawei.com/npu-exporter/v5 => gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-rc1.1
mindxcheckutils => ../mindxcheckutils
)

View File

@@ -1,5 +1,5 @@
gitee.com/ascend/ascend-npu-exporter/v3 v3.0.0 h1:JfB5Kmce3mWEzbtAhybJozGp6+yJH+jU7D6WytEcOzs=
gitee.com/ascend/ascend-npu-exporter/v3 v3.0.0/go.mod h1:78lAYBVM18u8mobeoKqhJP7POvbayTYBk32hnm9IkfQ=
gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-rc1.1 h1:XXNpZemFi7/kmW7v+eXLwYumeDCDgLZtIT6AJHY8w9Q=
gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-rc1.1/go.mod h1:fMiyWfHf3p+Wo4hu8Uy+cnD8ulQV3nzwwVzupRM8XEk=
github.com/agiledragon/gomonkey/v2 v2.8.0 h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000=
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/fsnotify/fsnotify v1.6.0 h1:n+5WquG0fcWoWp6xPWfHdbskMCQaFnG6PfBrh1Ky4HY=

View File

@@ -28,10 +28,9 @@ import (
"syscall"
"github.com/opencontainers/runtime-spec/specs-go"
"huawei.com/npu-exporter/v5/common-utils/hwlog"
"mindxcheckutils"
"huawei.com/npu-exporter/v3/common-utils/hwlog"
)
const (

View File

@@ -3,7 +3,7 @@ module main
go 1.17
require (
huawei.com/npu-exporter/v3 v3.0.0
huawei.com/npu-exporter/v5 v5.0.0-rc1.1
mindxcheckutils v1.0.0
)
@@ -16,6 +16,6 @@ require (
)
replace (
huawei.com/npu-exporter/v3 => gitee.com/ascend/ascend-npu-exporter/v3 v3.0.0
huawei.com/npu-exporter/v5 => gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-rc1.1
mindxcheckutils => ../../../mindxcheckutils
)

View File

@@ -37,8 +37,8 @@ cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohl
cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs=
cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0=
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
gitee.com/ascend/ascend-npu-exporter/v3 v3.0.0 h1:JfB5Kmce3mWEzbtAhybJozGp6+yJH+jU7D6WytEcOzs=
gitee.com/ascend/ascend-npu-exporter/v3 v3.0.0/go.mod h1:78lAYBVM18u8mobeoKqhJP7POvbayTYBk32hnm9IkfQ=
gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-rc1.1 h1:XXNpZemFi7/kmW7v+eXLwYumeDCDgLZtIT6AJHY8w9Q=
gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-rc1.1/go.mod h1:fMiyWfHf3p+Wo4hu8Uy+cnD8ulQV3nzwwVzupRM8XEk=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo=
github.com/agiledragon/gomonkey/v2 v2.8.0 h1:u2K2nNGyk0ippzklz1CWalllEB9ptD+DtSXeCX5O000=

View File

@@ -26,7 +26,7 @@ import (
"path/filepath"
"strings"
"huawei.com/npu-exporter/v3/common-utils/hwlog"
"huawei.com/npu-exporter/v5/common-utils/hwlog"
"mindxcheckutils"
)

View File

@@ -33,6 +33,8 @@ const (
hiAIMaxCardNum = 64
// hiAIMaxDeviceNum is the max number of devices in a card
hiAIMaxDeviceNum = 4
productTypeLen = 64
notSupportCode = -8255
coreNumLen = 32
vfgID = 4294967295 // vfg_id表示指定虚拟设备所属的虚拟分组ID默认自动分配默认值为0xFFFFFFFF转换成10进制为4294967295。
@@ -195,3 +197,18 @@ func (w *NpuWorker) FindDevice(visibleDevice int32) (int32, int32, error) {
}
return targetDeviceID, targetCardID, nil
}
// GetProductType get type of product by dcmi interface.
// It returns the string that dcmi_get_product_type writes for the given card and device.
// A return code equal to notSupportCode yields "not support" with a nil error;
// any other non-zero return code is reported as an error carrying that code.
func (w *NpuWorker) GetProductType(cardID, deviceID int32) (string, error) {
	// zero-filled buffer owned by C; released when this method returns
	typeBuf := C.CString(string(make([]byte, productTypeLen)))
	defer C.free(unsafe.Pointer(typeBuf))

	retCode := C.dcmi_get_product_type(C.int(cardID), C.int(deviceID), typeBuf, productTypeLen)
	switch {
	case retCode == 0:
		return C.GoString(typeBuf), nil
	case retCode == notSupportCode:
		// device which does not support querying product, such as Ascend 910A/B
		return "not support", nil
	default:
		return "", fmt.Errorf("get product type failed, errCode: %d", retCode)
	}
}

View File

@@ -21,6 +21,8 @@ import (
"strings"
"github.com/opencontainers/runtime-spec/specs-go"
"huawei.com/npu-exporter/v5/common-utils/hwlog"
)
// VDeviceInfo vdevice created info
@@ -37,6 +39,7 @@ type WorkerInterface interface {
FindDevice(visibleDevice int32) (int32, int32, error)
CreateVDevice(cardID, deviceID int32, coreNum string) (int32, error)
DestroyVDevice(cardID, deviceID int32, vDevID int32) error
GetProductType(cardID, deviceID int32) (string, error)
}
// CreateVDevice will create virtual device
@@ -99,3 +102,39 @@ func extractVpuParam(spec *specs.Spec) (int32, string, error) {
return int32(visibleDevice), splitDevice, nil
}
// GetProductType get type of product.
// It initializes the worker, walks every card returned by GetCardList and every device
// on each card, and returns the first product type the worker reports without error.
// Cards or devices that fail to answer are logged at debug level and skipped.
// Returns:
//   - the product type and nil on the first successful query;
//   - "" and a non-nil error when the worker cannot be initialized or the card list cannot be read;
//   - "" and nil when the card list is empty or no device reports a product type.
func GetProductType(w WorkerInterface) (string, error) {
	invalidType := ""
	if err := w.Initialize(); err != nil {
		return invalidType, fmt.Errorf("cannot init dcmi : %v", err)
	}
	defer w.ShutDown()

	cardNum, cardList, err := GetCardList()
	if err != nil {
		hwlog.RunLog.Errorf("failed to get card list, err: %#v", err)
		return invalidType, err
	}
	if cardNum == 0 {
		// same result as before (empty type, nil error), but no longer logged as "err: <nil>"
		hwlog.RunLog.Errorf("failed to get card list, card num is 0")
		return invalidType, nil
	}

	for _, cardID := range cardList {
		devNum, devErr := GetDeviceNumInCard(cardID)
		if devErr != nil {
			hwlog.RunLog.Debugf("get device num by cardID(%d) failed, error: %#v", cardID, devErr)
			continue
		}
		if devNum == 0 {
			hwlog.RunLog.Debugf("not found device on card %d", cardID)
			continue
		}
		for devID := int32(0); devID < devNum; devID++ {
			productType, typeErr := w.GetProductType(cardID, devID)
			if typeErr != nil {
				hwlog.RunLog.Debugf("get product type by card %d deviceID %d failed, err: %#v", cardID, devID, typeErr)
				continue
			}
			// the first device that answers decides the product type
			return productType, nil
		}
	}
	return invalidType, nil
}

View File

@@ -98,6 +98,12 @@ int dcmi_get_device_logicid_from_phyid(unsigned int phyid, unsigned int *logicid
CALL_FUNC(dcmi_get_device_logicid_from_phyid, phyid, logicid);
}
// Function pointer for the "dcmi_get_product_type" symbol; it is assigned with dlsym in dcmiInit_dl.
int (*dcmi_get_product_type_func)(int card_id, int device_id, char *product_type_str, int buf_size);
// Wrapper exposing dcmi_get_product_type with the same signature as the function pointer above.
// The arguments are handed unchanged to the CALL_FUNC macro, which is defined outside this view.
// NOTE(review): presumably CALL_FUNC invokes dcmi_get_product_type_func and returns its result,
// and product_type_str is an output buffer of buf_size bytes — confirm against the macro and dcmi docs.
int dcmi_get_product_type(int card_id, int device_id, char *product_type_str, int buf_size)
{
CALL_FUNC(dcmi_get_product_type, card_id, device_id, product_type_str, buf_size);
}
// load .so files and functions
int dcmiInit_dl(char *dl_path)
{
@@ -132,6 +138,8 @@ int dcmiInit_dl(char *dl_path)
dcmi_get_device_logicid_from_phyid_func = dlsym(dcmiHandle, "dcmi_get_device_logicid_from_phyid");
dcmi_get_product_type_func = dlsym(dcmiHandle, "dcmi_get_product_type");
return SUCCESS;
}

View File

@@ -6,7 +6,7 @@ require (
github.com/containerd/containerd v1.6.19
github.com/opencontainers/runtime-spec v1.0.3-0.20220718201635-a8106e99982b
github.com/prashantv/gostub v1.1.0
huawei.com/npu-exporter/v3 v3.0.0
huawei.com/npu-exporter/v5 v5.0.0-rc1.1
mindxcheckutils v1.0.0
)
@@ -40,6 +40,6 @@ require (
replace (
github.com/prashantv/gostub => github.com/prashantv/gostub v1.0.1-0.20191007164320-bbe3712b9c4a
huawei.com/npu-exporter/v3 => gitee.com/ascend/ascend-npu-exporter/v3 v3.0.0
huawei.com/npu-exporter/v5 => gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-rc1.1
mindxcheckutils => ../mindxcheckutils
)

View File

@@ -39,8 +39,8 @@ cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohl
cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs=
cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0=
dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU=
gitee.com/ascend/ascend-npu-exporter/v3 v3.0.0 h1:JfB5Kmce3mWEzbtAhybJozGp6+yJH+jU7D6WytEcOzs=
gitee.com/ascend/ascend-npu-exporter/v3 v3.0.0/go.mod h1:78lAYBVM18u8mobeoKqhJP7POvbayTYBk32hnm9IkfQ=
gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-rc1.1 h1:XXNpZemFi7/kmW7v+eXLwYumeDCDgLZtIT6AJHY8w9Q=
gitee.com/ascend/ascend-npu-exporter/v5 v5.0.0-rc1.1/go.mod h1:fMiyWfHf3p+Wo4hu8Uy+cnD8ulQV3nzwwVzupRM8XEk=
github.com/AdaLogics/go-fuzz-headers v0.0.0-20210715213245-6c3934b029d8/go.mod h1:CzsSbkDixRphAF5hS6wbMKq0eI6ccJRb7/A0M6JBnwg=
github.com/Azure/azure-sdk-for-go v16.2.1+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc=
github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78/go.mod h1:LmzpDX56iTiv29bbRTIsUNlaFfuhWRQBWjQdVyAevI8=

View File

@@ -25,6 +25,7 @@ import (
"os/exec"
"path"
"path/filepath"
"regexp"
"sort"
"strconv"
"strings"
@@ -32,7 +33,7 @@ import (
"github.com/containerd/containerd/oci"
"github.com/opencontainers/runtime-spec/specs-go"
"huawei.com/npu-exporter/v3/common-utils/hwlog"
"huawei.com/npu-exporter/v5/common-utils/hwlog"
"main/dcmi"
"mindxcheckutils"
@@ -42,22 +43,15 @@ const (
runLogPath = "/var/log/ascend-docker-runtime/runtime-run.log"
operateLogPath = "/var/log/ascend-docker-runtime/runtime-operate.log"
hookDefaultFilePath = "/usr/local/bin/ascend-docker-hook"
devicePath = "/dev/"
davinciName = "davinci"
davinciManager = "davinci_manager"
devmmSvm = "devmm_svm"
hisiHdc = "hisi_hdc"
svm0 = "svm0"
tsAisle = "ts_aisle"
maxCommandLength = 65535
hookCli = "ascend-docker-hook"
destroyHookCli = "ascend-docker-destroy"
dockerRuncFile = "docker-runc"
runcFile = "runc"
envLength = 2
kvPairSize = 2
borderNum = 2
vdeviceIdlen = 3
maxCommandLength = 65535
hookCli = "ascend-docker-hook"
destroyHookCli = "ascend-docker-destroy"
dockerRuncFile = "docker-runc"
runcFile = "runc"
envLength = 2
kvPairSize = 2
borderNum = 2
// ENV for device-plugin to identify ascend-docker-runtime
useAscendDocker = "ASCEND_DOCKER_RUNTIME=True"
@@ -73,6 +67,31 @@ var (
runcName = runcFile
)
// Product type names compared against the result of dcmi.GetProductType in addManagerDevice,
// followed by the /dev directory prefix and the device names that are appended to it.
const (
// Atlas200ISoc Product name
Atlas200ISoc = "Atlas 200I SoC A1"
// Atlas200 Product name
Atlas200 = "Atlas 200 Model 3000"
// Atlas500A2 Product name
Atlas500A2 = "Atlas 500 A2"
// devicePath is the directory prefix prepended to a device name to build its full path
devicePath = "/dev/"
// davinciName is the name prefix used when addDeviceToSpec rebuilds a virtual device path
davinciName = "davinci"
// davinciManager is the device that addManagerDevice adds first
davinciManager = "davinci_manager"
// devmmSvm and hisiHdc are the devices added by addCommonManagerDevice
devmmSvm = "devmm_svm"
hisiHdc = "hisi_hdc"
// svm0 through logDrv are the devices added by addA500ManagerDevice
svm0 = "svm0"
tsAisle = "ts_aisle"
upgrade = "upgrade"
sys = "sys"
vdec = "vdec"
vpc = "vpc"
pngd = "pngd"
venc = "venc"
dvppCmdList = "dvpp_cmdlist"
logDrv = "log_drv"
)
type args struct {
bundleDirPath string
cmd string
@@ -307,12 +326,16 @@ func getValueByKey(data []string, name string) string {
func addDeviceToSpec(spec *specs.Spec, dPath string, vdevice bool) error {
device, err := oci.DeviceFromPath(dPath)
if err != nil {
return fmt.Errorf("failed to get device info : %#v", err)
return fmt.Errorf("failed to get %s info : %#v", dPath, err)
}
lenPath := len(dPath)
if vdevice {
vPath := devicePath + davinciName + dPath[lenPath-vdeviceIdlen:]
re := regexp.MustCompile("[0-9]+")
vDeviceNumber := re.FindAllString(dPath, -1)
if len(vDeviceNumber) != 1 {
return fmt.Errorf("error vdevice : %s", dPath)
}
vPath := devicePath + davinciName + vDeviceNumber[0]
device.Path = vPath
}
@@ -328,54 +351,68 @@ func addDeviceToSpec(spec *specs.Spec, dPath string, vdevice bool) error {
return nil
}
// addA500ManagerDevice adds the management devices listed for the A500 A2 to the spec.
// Each name is prefixed with devicePath and passed to addDeviceToSpec as a non-virtual device.
// The first failure aborts the loop and is returned wrapped together with the device path.
func addA500ManagerDevice(spec *specs.Spec) error {
	a500Devices := []string{
		svm0, tsAisle, upgrade, sys, vdec,
		vpc, pngd, venc, dvppCmdList, logDrv,
	}
	for idx := range a500Devices {
		nodePath := devicePath + a500Devices[idx]
		err := addDeviceToSpec(spec, nodePath, false)
		if err != nil {
			return fmt.Errorf("failed to add %s of A500 A2 to spec : %#v", nodePath, err)
		}
	}
	return nil
}
// addCommonManagerDevice adds the devmm_svm and hisi_hdc devices to the spec.
// Each name is prefixed with devicePath and passed to addDeviceToSpec as a non-virtual device.
// The first failure aborts the loop and is returned wrapped in a new error.
func addCommonManagerDevice(spec *specs.Spec) error {
	for _, name := range []string{devmmSvm, hisiHdc} {
		nodePath := devicePath + name
		addErr := addDeviceToSpec(spec, nodePath, false)
		if addErr == nil {
			continue
		}
		return fmt.Errorf("failed to add common manage device to spec : %#v", addErr)
	}
	return nil
}
func addManagerDevice(spec *specs.Spec) error {
managerPath := devicePath + davinciManager
if err := addDeviceToSpec(spec, managerPath, false); err != nil {
return fmt.Errorf("failed to add manager device to spec : %#v", err)
return fmt.Errorf("add davinci_manager to spec error: %#v", err)
}
svmPath := devicePath + devmmSvm
if _, err := os.Stat(svmPath); err == nil {
if err = addDeviceToSpec(spec, svmPath, false); err != nil {
return fmt.Errorf("failed to add devmm_svm to spec : %#v", err)
}
} else {
// do nothing when device is not exist.
if !os.IsNotExist(err) {
return fmt.Errorf("stat devmm_svm device err: %#v", err)
}
productType, err := dcmi.GetProductType(&dcmi.NpuWorker{})
if err != nil {
return fmt.Errorf("parse product type error: %#v", err)
}
hwlog.RunLog.Infof("product type is %s", productType)
hdcPath := devicePath + hisiHdc
if _, err := os.Stat(hdcPath); err == nil {
if err = addDeviceToSpec(spec, hdcPath, false); err != nil {
return fmt.Errorf("failed to add hisi_hdc device to spec : %#v", err)
switch productType {
case Atlas500A2:
if err = addA500ManagerDevice(spec); err != nil {
return fmt.Errorf("add A500 manage device error: %#v", err)
}
} else {
if !os.IsNotExist(err) {
return fmt.Errorf("stat hisi_hdc device err: %#v", err)
}
}
svm0Path := devicePath + svm0
if _, err := os.Stat(svm0Path); err == nil {
if err = addDeviceToSpec(spec, svm0Path, false); err != nil {
return fmt.Errorf("failed to add svm0 device to spec : %#v", err)
}
} else {
if !os.IsNotExist(err) {
return fmt.Errorf("stat svm0 device err: %#v", err)
}
}
tsAislePath := devicePath + tsAisle
if _, err := os.Stat(tsAislePath); err == nil {
if err = addDeviceToSpec(spec, tsAislePath, false); err != nil {
return fmt.Errorf("failed to add tsAisle device to spec : %#v", err)
}
} else {
if !os.IsNotExist(err) {
return fmt.Errorf("stat tsAisle device err: %#v", err)
// do nothing
case Atlas200ISoc, Atlas200:
default:
if err = addCommonManagerDevice(spec); err != nil {
return fmt.Errorf("add common manage device error: %#v", err)
}
}