Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[XPU] Support XPU via Paddle Inference backend (#1987)
* [backend] Support XPU via Paddle Inference backend
* [XPU] support XPU benchmark via paddle inference
* [benchmark] add xpu paddle h2d config files
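For context, a minimal usage sketch of driving the new KUNLUNXIN XPU path from the FastDeploy C++ API. This is not part of the commit's diff: it assumes the RuntimeOption::UseKunlunXin(), UsePaddleInferBackend(), and SetModelPath() entry points referenced elsewhere in this patch, and the model paths are placeholders.

#include "fastdeploy/runtime.h"

int main() {
  fastdeploy::RuntimeOption option;
  // Placeholder model files; replace with a real Paddle inference model.
  option.SetModelPath("model.pdmodel", "model.pdiparams");
  // Select KUNLUNXIN XPU device 0 and route inference through the
  // Paddle Inference backend extended by this patch.
  option.UseKunlunXin(0);
  option.UsePaddleInferBackend();

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) {
    return -1;  // backend initialization failed
  }
  // ... bind inputs and call runtime.Infer(...) as usual.
  return 0;
}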
@@ -45,6 +45,33 @@ struct IpuOption {
  bool ipu_enable_half_partial;
};

/*! @brief Option object to configure KUNLUNXIN XPU
 */
struct XpuOption {
  /// kunlunxin device id
  int kunlunxin_device_id = 0;
  /// EnableXpu
  /// kunlunxin_l3_workspace_size
  int kunlunxin_l3_workspace_size = 0xfffc00;
  /// kunlunxin_locked
  bool kunlunxin_locked = false;
  /// kunlunxin_autotune
  bool kunlunxin_autotune = true;
  /// kunlunxin_autotune_file
  std::string kunlunxin_autotune_file = "";
  /// kunlunxin_precision
  std::string kunlunxin_precision = "int16";
  /// kunlunxin_adaptive_seqlen
  bool kunlunxin_adaptive_seqlen = false;
  /// kunlunxin_enable_multi_stream
  bool kunlunxin_enable_multi_stream = false;
  /// SetXpuConfig
  /// quant post dynamic weight bits
  int kunlunxin_quant_post_dynamic_weight_bits = -1;
  /// quant post dynamic op types
  std::vector<std::string> kunlunxin_quant_post_dynamic_op_types = {};
};

/*! @brief Option object to configure Paddle Inference backend
 */
struct PaddleBackendOption {
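The fields above can also be tuned directly. A hedged sketch, assuming the struct is exposed as RuntimeOption::paddle_infer_option.xpu_option; the ConfigureXpu helper and the autotune file path are illustrative, only the field names come from the struct itself.

#include "fastdeploy/runtime.h"

// Hypothetical helper: override individual KUNLUNXIN knobs on a RuntimeOption.
void ConfigureXpu(fastdeploy::RuntimeOption* option) {
  option->UseKunlunXin(0);        // KUNLUNXIN device id 0
  option->UsePaddleInferBackend();
  auto& xpu = option->paddle_infer_option.xpu_option;
  xpu.kunlunxin_l3_workspace_size = 0xfffc00;           // default L3 workspace
  xpu.kunlunxin_precision = "int16";
  xpu.kunlunxin_autotune = true;
  xpu.kunlunxin_autotune_file = "xpu_autotune.cache";   // placeholder path
}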
@@ -63,6 +90,10 @@ struct PaddleBackendOption {
   * @brief IPU option, this will configure the IPU hardware, if inference model in IPU
   */
  IpuOption ipu_option;
  /*
   * @brief XPU option, this will configure the KUNLUNXIN XPU hardware, if inference model in XPU
   */
  XpuOption xpu_option;

  /// Collect shape for model while enable_trt is true
  bool collect_trt_shape = false;
@@ -84,8 +115,8 @@ struct PaddleBackendOption {
  }

  void SetIpuConfig(bool enable_fp16, int replica_num,
                    float available_memory_proportion,
                    bool enable_half_partial) {
    ipu_option.ipu_enable_fp16 = enable_fp16;
    ipu_option.ipu_replica_num = replica_num;
    ipu_option.ipu_available_memory_proportion =
@@ -93,12 +124,22 @@ struct PaddleBackendOption {
    ipu_option.ipu_enable_half_partial = enable_half_partial;
  }

  void SetXpuConfig(
      int quant_post_dynamic_weight_bits = -1,
      const std::vector<std::string>& quant_post_dynamic_op_types = {}) {
    xpu_option.kunlunxin_quant_post_dynamic_weight_bits =
        quant_post_dynamic_weight_bits;
    xpu_option.kunlunxin_quant_post_dynamic_op_types =
        quant_post_dynamic_op_types;
  }

  // The following parameters may be removed; please do not
  // read or write them directly.
  TrtBackendOption trt_option;
  bool enable_pinned_memory = false;
  void* external_stream_ = nullptr;
  Device device = Device::CPU;
  /// device id for CPU/GPU
  int device_id = 0;
  std::vector<std::string> trt_disabled_ops_{};
  int cpu_thread_num = 8;
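A hedged sketch of calling the new SetXpuConfig() helper through RuntimeOption::paddle_infer_option; the SetXpuQuantHints wrapper and the quantization values are illustrative, not defaults taken from this patch.

#include "fastdeploy/runtime.h"

// Hypothetical helper: forward post-training dynamic quantization hints
// to Paddle Inference via the SetXpuConfig() method added above.
void SetXpuQuantHints(fastdeploy::RuntimeOption* option) {
  // 8-bit weights for matmul ops is only an illustrative choice.
  option->paddle_infer_option.SetXpuConfig(
      /* quant_post_dynamic_weight_bits = */ 8,
      /* quant_post_dynamic_op_types = */ {"matmul", "matmul_v2"});
}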
@@ -78,9 +78,28 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
                        option.ipu_option.ipu_available_memory_proportion,
                        option.ipu_option.ipu_enable_half_partial);
#else
-    FDWARNING << "The FastDeploy is not compiled with IPU backend, so will "
+    FDWARNING << "The FastDeploy is not compiled with IPU device, so will "
                 "fallback to CPU with Paddle Inference Backend."
              << std::endl;
#endif
  } else if (option.device == Device::KUNLUNXIN) {
#ifdef WITH_KUNLUNXIN
    config_.EnableXpu(option.xpu_option.kunlunxin_l3_workspace_size,
                      option.xpu_option.kunlunxin_locked,
                      option.xpu_option.kunlunxin_autotune,
                      option.xpu_option.kunlunxin_autotune_file,
                      option.xpu_option.kunlunxin_precision,
                      option.xpu_option.kunlunxin_adaptive_seqlen,
                      option.xpu_option.kunlunxin_enable_multi_stream);
    config_.SetXpuConfig(
        option.xpu_option.kunlunxin_quant_post_dynamic_weight_bits,
        option.xpu_option.kunlunxin_quant_post_dynamic_op_types);
    config_.SetXpuDeviceId(option.xpu_option.kunlunxin_device_id);
#else
    FDWARNING
        << "The FastDeploy is not compiled with KUNLUNXIN device, so will "
           "fallback to CPU with Paddle Inference Backend."
        << std::endl;
#endif
  } else {
    config_.DisableGpu();
@@ -89,6 +108,7 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
      config_.SetMkldnnCacheCapacity(option.mkldnn_cache_size);
    }
  }

  if (!option.enable_log_info) {
    config_.DisableGlogInfo();
  }
@@ -106,6 +126,7 @@ bool PaddleBackend::Init(const RuntimeOption& runtime_option) {
  }

  auto option = runtime_option;
  // Collect basic paddle inference option and trt option.
  option.paddle_infer_option.model_file = runtime_option.model_file;
  option.paddle_infer_option.params_file = runtime_option.params_file;
  option.paddle_infer_option.model_from_memory_ =
@@ -117,6 +138,10 @@ bool PaddleBackend::Init(const RuntimeOption& runtime_option) {
  option.paddle_infer_option.external_stream_ = runtime_option.external_stream_;
  option.paddle_infer_option.trt_option = runtime_option.trt_option;
  option.paddle_infer_option.trt_option.gpu_id = runtime_option.device_id;
  // Note(qiuyanjun): For the IPU and XPU options, please check the details of
  // RuntimeOption::UseIpu() and RuntimeOption::UseKunlunXin(). Furthermore,
  // please check paddle_infer_option.SetIpuConfig() and
  // paddle_infer_option.SetXpuConfig() for more details of the extra configs.
  return InitFromPaddle(option.model_file, option.params_file,
                        option.model_from_memory_, option.paddle_infer_option);
}
@@ -19,6 +19,8 @@ namespace fastdeploy {
paddle_infer::PlaceType ConvertFDDeviceToPlace(Device device) {
  if (device == Device::GPU) {
    return paddle_infer::PlaceType::kGPU;
  } else if (device == Device::KUNLUNXIN) {
    return paddle_infer::PlaceType::kXPU;
  }
  return paddle_infer::PlaceType::kCPU;
}
@@ -52,9 +54,21 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
      tensor->CopyFromCpu(static_cast<const int64_t*>(fd_tensor.Data()));
    }
    return;
  } else if (fd_tensor.dtype == FDDataType::INT8) {
    if (place == paddle_infer::PlaceType::kGPU) {
      tensor->ShareExternalData(static_cast<const int8_t*>(fd_tensor.Data()),
                                shape, place);
    } else {
      tensor->CopyFromCpu(static_cast<const int8_t*>(fd_tensor.Data()));
    }
    return;
  } else if (fd_tensor.dtype == FDDataType::UINT8) {
-    tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
-                              shape, paddle_infer::PlaceType::kCPU);
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const uint8_t*>(fd_tensor.Data()));
+    }
    return;
  }
  FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
@@ -89,9 +103,21 @@ void ShareOutTensorFromFDTensor(paddle_infer::Tensor* tensor,
      tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor.MutableData()));
    }
    return;
  } else if (fd_tensor.dtype == FDDataType::INT8) {
    if (place == paddle_infer::PlaceType::kGPU) {
      tensor->ShareExternalData(static_cast<const int8_t*>(fd_tensor.Data()),
                                shape, place);
    } else {
      tensor->CopyFromCpu(static_cast<const int8_t*>(fd_tensor.Data()));
    }
    return;
  } else if (fd_tensor.dtype == FDDataType::UINT8) {
-    tensor->ShareExternalData(static_cast<uint8_t*>(fd_tensor.MutableData()),
-                              shape, paddle_infer::PlaceType::kCPU);
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const uint8_t*>(fd_tensor.Data()));
+    }
    return;
  }
  FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
@@ -149,6 +175,11 @@ void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
    Device device = Device::CPU;
    if (place == paddle_infer::PlaceType::kGPU) {
      device = Device::GPU;
    } else if (place == paddle_infer::PlaceType::kXPU) {
      device = Device::KUNLUNXIN;
      FDASSERT(false,
               "Currently, copy_to_fd=false, FDTensor SetExternalData "
               "is not support for Device::KUNLUNXIN now!")
    }
    fd_tensor->name = tensor->name();
    fd_tensor->SetExternalData(shape, fd_dtype, out_data, device);