Files
ascend-docker-runtime/runtime/dcmi/dcmi.go
2022-07-12 16:37:47 +08:00

179 lines
5.6 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Package dcmi is used to work with Ascend devices
/*
* Copyright(C) Huawei Technologies Co.,Ltd. 2020-2021. All rights reserved.
*/
package dcmi
// #cgo LDFLAGS: -ldl
// #include "dcmi_interface_api.h"
import "C"
import (
"fmt"
"math"
)
const (
// retError is the value returned in place of a result when a function fails
retError = -1
// dcmiMaxVdevNum is max number of vdevice, value is from driver specification
dcmiMaxVdevNum = 16
// hiAIMaxCardNum is the max number of cards; GetCardList uses it to size the
// card ID buffer handed to dcmi and to validate the reported card count
hiAIMaxCardNum = 16
// coreNumLen is the length of the template name buffer filled in CreateVDevice
coreNumLen = 32
// deviceNum is 0xFFFFFFFF written in decimal (4294967295). CreateVDevice passes
// it as both vdev_id and vfg_id of the create request. vfg_id is the ID of the
// virtual group that the virtual device belongs to; it is assigned
// automatically by default, and 0xFFFFFFFF is that default value.
deviceNum = 4294967295
)
// NpuWorker is the receiver type for the dcmi operations of this package.
// It has no fields, so its zero value is ready to use.
type NpuWorker struct{}
// Initialize loads the dcmi library and then initializes dcmi.
//
// It returns an error carrying the dcmi return code as soon as either step
// reports something other than C.SUCCESS, and nil once both have succeeded.
func (w *NpuWorker) Initialize() error {
	if ret := C.dcmiInit_dl(); ret != C.SUCCESS {
		return fmt.Errorf("dcmi lib load failed, error code: %d", int32(ret))
	}
	if ret := C.dcmi_init(); ret != C.SUCCESS {
		return fmt.Errorf("dcmi init failed, error code: %d", int32(ret))
	}
	return nil
}
// ShutDown shuts down the dcmi library.
//
// The method has no return value, so a failure is only reported by printing a
// message that carries the dcmi return code through the builtin println.
func (w *NpuWorker) ShutDown() {
	if ret := C.dcmiShutDown(); ret != C.SUCCESS {
		// Hand println a string: given an error (an interface value) the
		// builtin prints the interface's pointer words, not the message text.
		println(fmt.Sprintf("dcmi shut down failed, error code: %d", int32(ret)))
	}
}
// GetCardList list all cards on system
func GetCardList() (int32, []int32, error) {
var ids [hiAIMaxCardNum]C.int
var cNum C.int
if err := C.dcmi_get_card_num_list(&cNum, &ids[0], hiAIMaxCardNum); err != 0 {
errInfo := fmt.Errorf("get card list failed, error code: %d", int32(err))
return retError, nil, errInfo
}
// checking card's quantity
if cNum <= 0 || cNum > hiAIMaxCardNum {
errInfo := fmt.Errorf("get error card quantity: %d", int32(cNum))
return retError, nil, errInfo
}
var cardNum = int32(cNum)
var cardIDList []int32
for i := int32(0); i < cardNum && i < hiAIMaxCardNum; i++ {
cardID := int32(ids[i])
if cardID < 0 {
continue
}
cardIDList = append(cardIDList, cardID)
}
return cardNum, cardIDList, nil
}
// GetDeviceNumInCard asks dcmi how many devices the card cardID carries.
//
// It returns retError and an error when dcmi reports a non-zero code or when
// the reported number is not positive.
func GetDeviceNumInCard(cardID int32) (int32, error) {
	var count C.int
	ret := C.dcmi_get_device_num_in_card(C.int(cardID), &count)
	if ret != 0 {
		return retError, fmt.Errorf("get device count on the card failed, error code: %d", int32(ret))
	}
	if count < 1 {
		return retError, fmt.Errorf("the number of chips obtained is invalid, the number is: %d", int32(count))
	}
	return int32(count), nil
}
// GetDeviceLogicID looks up the logic ID of device deviceID on card cardID.
//
// It returns retError and an error when dcmi reports a non-zero code or when
// the reported logic ID lies outside 0..math.MaxInt8.
func GetDeviceLogicID(cardID, deviceID int32) (int32, error) {
	var logicID C.int
	ret := C.dcmi_get_device_logic_id(&logicID, C.int(cardID), C.int(deviceID))
	if ret != 0 {
		return retError, fmt.Errorf("get logicID failed, error code: %d", int32(ret))
	}
	// check whether logicID is within the accepted range
	inRange := logicID >= 0 && uint32(logicID) <= uint32(math.MaxInt8)
	if !inRange {
		return retError, fmt.Errorf("the logicID value is invalid, logicID is: %d", logicID)
	}
	return int32(logicID), nil
}
// CreateVDevice asks dcmi to create a virtual device on device deviceID of
// card cardID, using coreNum as the template name of the request.
//
// It returns the ID of the new virtual device. On failure it returns
// math.MaxInt32 together with an error: when coreNum does not fit the
// template name buffer, when dcmi reports a non-zero code, or when the
// returned ID exceeds math.MaxInt32.
func (w *NpuWorker) CreateVDevice(cardID, deviceID int32, coreNum string) (int32, error) {
	// Validate before building the request. The last byte of the buffer is
	// kept as a zero terminator, so a name of coreNumLen bytes or more is
	// rejected (presumably dcmi reads template_name as a C string - confirm
	// against dcmi_interface_api.h).
	if len(coreNum) >= coreNumLen {
		return math.MaxInt32, fmt.Errorf("wrong template")
	}
	var templateName [coreNumLen]C.char
	for i := 0; i < len(coreNum); i++ {
		templateName[i] = C.char(coreNum[i])
	}

	request := C.struct_dcmi_create_vdev_res_stru{
		vdev_id: C.uint(deviceNum),
		vfg_id:  C.uint(deviceNum),
	}
	request.template_name = templateName

	// The output ID starts at math.MaxUint32, a value the range check below
	// rejects unless dcmi overwrites it.
	var createInfo C.struct_dcmi_create_vdev_out
	createInfo.vdev_id = C.uint(math.MaxUint32)

	if ret := C.dcmi_create_vdevice(C.int(cardID), C.int(deviceID), &request, &createInfo); ret != 0 {
		return math.MaxInt32, fmt.Errorf("create virtual device failed, error code: %d", int32(ret))
	}
	if createInfo.vdev_id > math.MaxInt32 {
		return math.MaxInt32, fmt.Errorf("create virtual device failed, vdeviceId too large")
	}
	return int32(createInfo.vdev_id), nil
}
// DestroyVDevice asks dcmi to destroy virtual device vDevID on device
// deviceID of card cardID.
//
// A negative vDevID is rejected before dcmi is called; a non-zero dcmi return
// code is reported as an error. It returns nil on success.
func (w *NpuWorker) DestroyVDevice(cardID, deviceID int32, vDevID int32) error {
	if vDevID < 0 {
		return fmt.Errorf("param error on vDevID")
	}
	ret := C.dcmi_set_destroy_vdevice(C.int(cardID), C.int(deviceID), C.uint(vDevID))
	if ret == 0 {
		return nil
	}
	return fmt.Errorf("destroy virtual device failed, error code: %d", int32(ret))
}
// FindDevice finds the device that corresponds to physical ID visibleDevice.
//
// It converts the physical ID to a logic ID through dcmi, then walks the
// devices of every card returned by GetCardList until one reports that logic
// ID.
//
// The results are, in order, the device ID, the card ID and an error. When no
// device reports the logic ID, both IDs are math.MaxInt32 and the error is
// nil. On error both IDs are 0.
func (w *NpuWorker) FindDevice(visibleDevice int32) (int32, int32, error) {
	// A negative ID would wrap to a large value in the C.uint conversion below.
	if visibleDevice < 0 {
		return 0, 0, fmt.Errorf("param error on visibleDevice")
	}
	var dcmiLogicID C.uint
	if err := C.dcmi_get_device_logicid_from_phyid(C.uint(visibleDevice), &dcmiLogicID); err != 0 {
		return 0, 0, fmt.Errorf("phy id can not be converted to logic id : %v", err)
	}
	if uint(dcmiLogicID) > math.MaxInt32 {
		return 0, 0, fmt.Errorf("logic id too large")
	}
	targetLogicID := int32(dcmiLogicID)

	_, cardList, err := GetCardList()
	if err != nil {
		return 0, 0, fmt.Errorf("get card list err : %v", err)
	}
	for _, cardID := range cardList {
		deviceCount, err := GetDeviceNumInCard(cardID)
		if err != nil {
			return 0, 0, fmt.Errorf("cannot get device num in card : %v", err)
		}
		for deviceID := int32(0); deviceID < deviceCount; deviceID++ {
			logicID, err := GetDeviceLogicID(cardID, deviceID)
			if err != nil {
				return 0, 0, fmt.Errorf("cannot get logic id : %v", err)
			}
			if logicID == targetLogicID {
				// Stop at the first match so that a failure on a device
				// that is not the target cannot discard the result
				// (assumes logic IDs are unique across cards - confirm).
				return deviceID, cardID, nil
			}
		}
	}
	// No device matched: hand back the not-found sentinel pair.
	return math.MaxInt32, math.MaxInt32, nil
}