[XPU] Support XPU via Paddle Inference backend (#1987)

* [backend] Support XPU via Paddle Inference backend

* [XPU] support XPU benchmark via paddle inference

* [benchmark] add xpu paddle h2d config files
Author: DefTruth
Date: 2023-05-25 14:13:40 +08:00
Committed by: GitHub
Parent: 24f32d10a7
Commit: 49c033a828
16 changed files with 262 additions and 57 deletions
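For context, a minimal end-to-end sketch of the new usage path. The method names (`UseKunlunXin()`, `UsePaddleInferBackend()`) come from the notes in this diff, while the header path and exact signatures are assumptions about the FastDeploy API of this version, so treat this as a sketch rather than a verbatim recipe:

```cpp
// Sketch: run a Paddle model on KUNLUNXIN XPU through the Paddle Inference
// backend. Header path and method signatures are assumptions (see lead-in).
#include <iostream>
#include "fastdeploy/runtime.h"  // assumed header for fastdeploy::Runtime

int main() {
  fastdeploy::RuntimeOption option;
  option.SetModelPath("model.pdmodel", "model.pdiparams");  // hypothetical paths
  option.UseKunlunXin(0);          // select KUNLUNXIN XPU device 0
  option.UsePaddleInferBackend();  // use Paddle Inference as the backend

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) {
    std::cerr << "Failed to init runtime." << std::endl;
    return -1;
  }
  // ... feed FDTensor inputs and call runtime.Infer(...) as usual.
  return 0;
}
```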


@@ -45,6 +45,33 @@ struct IpuOption {
bool ipu_enable_half_partial;
};
/*! @brief Option object to configure KUNLUNXIN XPU
*/
struct XpuOption {
/// KUNLUNXIN device id
int kunlunxin_device_id = 0;
/// The following options are forwarded to paddle_infer::Config::EnableXpu()
/// Size of the L3 cache workspace, in bytes
int kunlunxin_l3_workspace_size = 0xfffc00;
/// Whether to lock the allocated L3 cache
bool kunlunxin_locked = false;
/// Whether to enable autotune
bool kunlunxin_autotune = true;
/// Path of the autotune file
std::string kunlunxin_autotune_file = "";
/// Compute precision, e.g. "int16"
std::string kunlunxin_precision = "int16";
/// Whether to enable adaptive sequence length
bool kunlunxin_adaptive_seqlen = false;
/// Whether to enable multi-stream execution
bool kunlunxin_enable_multi_stream = false;
/// The following options are forwarded to paddle_infer::Config::SetXpuConfig()
/// Bit width for post-training dynamic weight quantization, -1 keeps the default
int kunlunxin_quant_post_dynamic_weight_bits = -1;
/// Op types to apply post-training dynamic weight quantization to
std::vector<std::string> kunlunxin_quant_post_dynamic_op_types = {};
};
/*! @brief Option object to configure Paddle Inference backend
*/
struct PaddleBackendOption {
@@ -63,6 +90,10 @@ struct PaddleBackendOption {
* @brief IPU option, this will configure the IPU hardware when running inference on IPU
*/
IpuOption ipu_option;
/**
* @brief XPU option, this will configure the KUNLUNXIN XPU hardware when running inference on XPU
*/
XpuOption xpu_option;
/// Collect shape for model while enable_trt is true
bool collect_trt_shape = false;
@@ -84,8 +115,8 @@ struct PaddleBackendOption {
}
void SetIpuConfig(bool enable_fp16, int replica_num,
float available_memory_proportion,
bool enable_half_partial) {
ipu_option.ipu_enable_fp16 = enable_fp16;
ipu_option.ipu_replica_num = replica_num;
ipu_option.ipu_available_memory_proportion =
@@ -93,12 +124,22 @@ struct PaddleBackendOption {
ipu_option.ipu_enable_half_partial = enable_half_partial;
}
void SetXpuConfig(
int quant_post_dynamic_weight_bits = -1,
const std::vector<std::string>& quant_post_dynamic_op_types = {}) {
xpu_option.kunlunxin_quant_post_dynamic_weight_bits =
quant_post_dynamic_weight_bits;
xpu_option.kunlunxin_quant_post_dynamic_op_types =
quant_post_dynamic_op_types;
}
// The following parameters may be removed, please do not
// read or write them directly
TrtBackendOption trt_option;
bool enable_pinned_memory = false;
void* external_stream_ = nullptr;
Device device = Device::CPU;
/// device id for CPU/GPU
int device_id = 0;
std::vector<std::string> trt_disabled_ops_{};
int cpu_thread_num = 8;
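The new SetXpuConfig() helper above only forwards the two quantization fields into xpu_option; the rest of XpuOption is expected to be filled by RuntimeOption::UseKunlunXin() (per the note in paddle_backend.cc). A short sketch of how a caller might use it; the op type names are illustrative placeholders, and the paddle_infer_option member follows the usage shown later in this diff:

```cpp
#include "fastdeploy/runtime.h"  // assumed header, as in the sketch above

// Sketch: enable the new quantization knobs that SetXpuConfig() forwards
// into XpuOption. The op type names are illustrative placeholders only.
fastdeploy::RuntimeOption MakeXpuOption() {
  fastdeploy::RuntimeOption option;
  // UseKunlunXin() is expected to fill the EnableXpu-related XpuOption fields
  // (l3 workspace size, precision, ...); see the note in paddle_backend.cc.
  option.UseKunlunXin(0);
  option.UsePaddleInferBackend();
  // Post-training dynamic weight quantization: 8-bit weights for the listed ops.
  option.paddle_infer_option.SetXpuConfig(8, {"matmul_v2", "fc"});
  return option;
}
```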


@@ -78,9 +78,28 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
option.ipu_option.ipu_available_memory_proportion,
option.ipu_option.ipu_enable_half_partial);
#else
FDWARNING << "The FastDeploy is not compiled with IPU backend, so will "
FDWARNING << "The FastDeploy is not compiled with IPU device, so will "
"fallback to CPU with Paddle Inference Backend."
<< std::endl;
#endif
} else if (option.device == Device::KUNLUNXIN) {
#ifdef WITH_KUNLUNXIN
config_.EnableXpu(option.xpu_option.kunlunxin_l3_workspace_size,
option.xpu_option.kunlunxin_locked,
option.xpu_option.kunlunxin_autotune,
option.xpu_option.kunlunxin_autotune_file,
option.xpu_option.kunlunxin_precision,
option.xpu_option.kunlunxin_adaptive_seqlen,
option.xpu_option.kunlunxin_enable_multi_stream);
config_.SetXpuConfig(
option.xpu_option.kunlunxin_quant_post_dynamic_weight_bits,
option.xpu_option.kunlunxin_quant_post_dynamic_op_types);
config_.SetXpuDeviceId(option.xpu_option.kunlunxin_device_id);
#else
FDWARNING
<< "The FastDeploy is not compiled with KUNLUNXIN device, so will "
"fallback to CPU with Paddle Inference Backend."
<< std::endl;
#endif
} else {
config_.DisableGpu();
@@ -89,6 +108,7 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
config_.SetMkldnnCacheCapacity(option.mkldnn_cache_size);
}
}
if (!option.enable_log_info) {
config_.DisableGlogInfo();
}
@@ -106,6 +126,7 @@ bool PaddleBackend::Init(const RuntimeOption& runtime_option) {
}
auto option = runtime_option;
// Collect basic paddle inference option and trt option.
option.paddle_infer_option.model_file = runtime_option.model_file;
option.paddle_infer_option.params_file = runtime_option.params_file;
option.paddle_infer_option.model_from_memory_ =
@@ -117,6 +138,10 @@ bool PaddleBackend::Init(const RuntimeOption& runtime_option) {
option.paddle_infer_option.external_stream_ = runtime_option.external_stream_;
option.paddle_infer_option.trt_option = runtime_option.trt_option;
option.paddle_infer_option.trt_option.gpu_id = runtime_option.device_id;
// Note(qiuyanjun): For the IPU and XPU options, please check the
// details of RuntimeOption::UseIpu() and RuntimeOption::UseKunlunXin().
// Furthermore, please check paddle_infer_option.SetIpuConfig() and
// paddle_infer_option.SetXpuConfig() for more details of the extra configs.
return InitFromPaddle(option.model_file, option.params_file,
option.model_from_memory_, option.paddle_infer_option);
}
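For reference, the Device::KUNLUNXIN branch of BuildOption() above reduces to the following raw Paddle Inference configuration. The argument values are the XpuOption defaults from option.h, and the paddle_infer::Config overloads are assumed to match the calls in this diff:

```cpp
#include "paddle/include/paddle_inference_api.h"  // header path may differ per install

// Sketch of what BuildOption() does for Device::KUNLUNXIN, using the
// XpuOption defaults declared in option.h.
void ConfigureXpu(paddle_infer::Config* config) {
  config->EnableXpu(/*l3_workspace_size=*/0xfffc00,
                    /*locked=*/false,
                    /*autotune=*/true,
                    /*autotune_file=*/"",
                    /*precision=*/"int16",
                    /*adaptive_seqlen=*/false,
                    /*enable_multi_stream=*/false);
  config->SetXpuConfig(/*quant_post_dynamic_weight_bits=*/-1,
                       /*quant_post_dynamic_op_types=*/{});
  config->SetXpuDeviceId(0);
}
```

If FastDeploy is not compiled with KUNLUNXIN support (the WITH_KUNLUNXIN macro above), BuildOption() falls back to CPU with a warning, as shown in the hunk.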


@@ -19,6 +19,8 @@ namespace fastdeploy {
paddle_infer::PlaceType ConvertFDDeviceToPlace(Device device) {
if (device == Device::GPU) {
return paddle_infer::PlaceType::kGPU;
} else if (device == Device::KUNLUNXIN) {
return paddle_infer::PlaceType::kXPU;
}
return paddle_infer::PlaceType::kCPU;
}
@@ -52,9 +54,21 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
tensor->CopyFromCpu(static_cast<const int64_t*>(fd_tensor.Data()));
}
return;
} else if (fd_tensor.dtype == FDDataType::INT8) {
if (place == paddle_infer::PlaceType::kGPU) {
tensor->ShareExternalData(static_cast<const int8_t*>(fd_tensor.Data()),
shape, place);
} else {
tensor->CopyFromCpu(static_cast<const int8_t*>(fd_tensor.Data()));
}
return;
} else if (fd_tensor.dtype == FDDataType::UINT8) {
if (place == paddle_infer::PlaceType::kGPU) {
tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
shape, place);
} else {
tensor->CopyFromCpu(static_cast<const uint8_t*>(fd_tensor.Data()));
}
return;
}
FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
@@ -89,9 +103,21 @@ void ShareOutTensorFromFDTensor(paddle_infer::Tensor* tensor,
tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor.MutableData()));
}
return;
} else if (fd_tensor.dtype == FDDataType::INT8) {
if (place == paddle_infer::PlaceType::kGPU) {
tensor->ShareExternalData(static_cast<const int8_t*>(fd_tensor.Data()),
shape, place);
} else {
tensor->CopyToCpu(static_cast<int8_t*>(fd_tensor.MutableData()));
}
return;
} else if (fd_tensor.dtype == FDDataType::UINT8) {
if (place == paddle_infer::PlaceType::kGPU) {
tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
shape, place);
} else {
tensor->CopyToCpu(static_cast<uint8_t*>(fd_tensor.MutableData()));
}
return;
}
FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
@@ -149,6 +175,11 @@ void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
Device device = Device::CPU;
if (place == paddle_infer::PlaceType::kGPU) {
device = Device::GPU;
} else if (place == paddle_infer::PlaceType::kXPU) {
device = Device::KUNLUNXIN;
FDASSERT(false,
"Currently, when copy_to_fd is false, FDTensor::SetExternalData "
"is not supported for Device::KUNLUNXIN!");
}
fd_tensor->name = tensor->name();
fd_tensor->SetExternalData(shape, fd_dtype, out_data, device);
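The INT8/UINT8 branches added above all follow the same share-on-GPU / copy-on-host pattern. A generic sketch of that input-binding dispatch (a hypothetical helper, with types and calls taken from this diff; the FDTensor header path is an assumption):

```cpp
#include <vector>
#include "paddle/include/paddle_inference_api.h"  // header path may differ
#include "fastdeploy/core/fd_tensor.h"            // assumed location of FDTensor

// Hypothetical helper illustrating the dispatch used in
// ShareTensorFromFDTensor for each fixed-width integer dtype.
template <typename T>
void BindInput(paddle_infer::Tensor* tensor,
               const fastdeploy::FDTensor& fd_tensor,
               const std::vector<int>& shape, paddle_infer::PlaceType place) {
  if (place == paddle_infer::PlaceType::kGPU) {
    // Zero-copy: the Paddle tensor references the FDTensor's device buffer.
    tensor->ShareExternalData(static_cast<const T*>(fd_tensor.Data()), shape, place);
  } else {
    // Host path: copy the data into the Paddle tensor via the CPU.
    tensor->CopyFromCpu(static_cast<const T*>(fd_tensor.Data()));
  }
}
```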