Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[XPU] Support XPU via Paddle Inference backend (#1987)
* [backend] Support XPU via Paddle Inference backend
* [XPU] support XPU benchmark via paddle inference
* [benchmark] add xpu paddle h2d config files
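For context, a minimal usage sketch of driving the new KUNLUNXIN XPU path from the FastDeploy C++ API. This is not part of the commit's diff: it assumes the RuntimeOption::UseKunlunXin(), UsePaddleInferBackend(), and SetModelPath() entry points referenced elsewhere in this patch, and the model paths are placeholders.

#include "fastdeploy/runtime.h"

int main() {
  fastdeploy::RuntimeOption option;
  // Placeholder model files; replace with a real Paddle inference model.
  option.SetModelPath("model.pdmodel", "model.pdiparams");
  // Select KUNLUNXIN XPU device 0 and route inference through the
  // Paddle Inference backend extended by this patch.
  option.UseKunlunXin(0);
  option.UsePaddleInferBackend();

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) {
    return -1;  // backend initialization failed
  }
  // ... bind inputs and call runtime.Infer(...) as usual.
  return 0;
}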
@@ -45,6 +45,33 @@ struct IpuOption {
  bool ipu_enable_half_partial;
};

/*! @brief Option object to configure KUNLUNXIN XPU
 */
struct XpuOption {
  /// kunlunxin device id
  int kunlunxin_device_id = 0;
  /// EnableXpu
  /// kunlunxin_l3_workspace_size
  int kunlunxin_l3_workspace_size = 0xfffc00;
  /// kunlunxin_locked
  bool kunlunxin_locked = false;
  /// kunlunxin_autotune
  bool kunlunxin_autotune = true;
  /// kunlunxin_autotune_file
  std::string kunlunxin_autotune_file = "";
  /// kunlunxin_precision
  std::string kunlunxin_precision = "int16";
  /// kunlunxin_adaptive_seqlen
  bool kunlunxin_adaptive_seqlen = false;
  /// kunlunxin_enable_multi_stream
  bool kunlunxin_enable_multi_stream = false;
  /// SetXpuConfig
  /// quant post dynamic weight bits
  int kunlunxin_quant_post_dynamic_weight_bits = -1;
  /// quant post dynamic op types
  std::vector<std::string> kunlunxin_quant_post_dynamic_op_types = {};
};

/*! @brief Option object to configure Paddle Inference backend
 */
struct PaddleBackendOption {
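The fields above can also be tuned directly. A hedged sketch, assuming the struct is exposed as RuntimeOption::paddle_infer_option.xpu_option; the ConfigureXpu helper and the autotune file path are illustrative, only the field names come from the struct itself.

#include "fastdeploy/runtime.h"

// Hypothetical helper: override individual KUNLUNXIN knobs on a RuntimeOption.
void ConfigureXpu(fastdeploy::RuntimeOption* option) {
  option->UseKunlunXin(0);        // KUNLUNXIN device id 0
  option->UsePaddleInferBackend();
  auto& xpu = option->paddle_infer_option.xpu_option;
  xpu.kunlunxin_l3_workspace_size = 0xfffc00;           // default L3 workspace
  xpu.kunlunxin_precision = "int16";
  xpu.kunlunxin_autotune = true;
  xpu.kunlunxin_autotune_file = "xpu_autotune.cache";   // placeholder path
}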
@@ -63,6 +90,10 @@ struct PaddleBackendOption {
   * @brief IPU option, this will configure the IPU hardware, if inference model in IPU
   */
  IpuOption ipu_option;
  /*
   * @brief XPU option, this will configure the KUNLUNXIN XPU hardware, if inference model in XPU
   */
  XpuOption xpu_option;

  /// Collect shape for model while enable_trt is true
  bool collect_trt_shape = false;
@@ -84,8 +115,8 @@ struct PaddleBackendOption {
  }

  void SetIpuConfig(bool enable_fp16, int replica_num,
                    float available_memory_proportion,
                    bool enable_half_partial) {
    ipu_option.ipu_enable_fp16 = enable_fp16;
    ipu_option.ipu_replica_num = replica_num;
    ipu_option.ipu_available_memory_proportion =
@@ -93,12 +124,22 @@ struct PaddleBackendOption {
    ipu_option.ipu_enable_half_partial = enable_half_partial;
  }

  void SetXpuConfig(
      int quant_post_dynamic_weight_bits = -1,
      const std::vector<std::string>& quant_post_dynamic_op_types = {}) {
    xpu_option.kunlunxin_quant_post_dynamic_weight_bits =
        quant_post_dynamic_weight_bits;
    xpu_option.kunlunxin_quant_post_dynamic_op_types =
        quant_post_dynamic_op_types;
  }

  // The following parameters may be removed; please do not
  // read or write them directly.
  TrtBackendOption trt_option;
  bool enable_pinned_memory = false;
  void* external_stream_ = nullptr;
  Device device = Device::CPU;
  /// device id for CPU/GPU
  int device_id = 0;
  std::vector<std::string> trt_disabled_ops_{};
  int cpu_thread_num = 8;
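A hedged sketch of calling the new SetXpuConfig() helper through RuntimeOption::paddle_infer_option; the SetXpuQuantHints wrapper and the quantization values are illustrative, not defaults taken from this patch.

#include "fastdeploy/runtime.h"

// Hypothetical helper: forward post-training dynamic quantization hints
// to Paddle Inference via the SetXpuConfig() method added above.
void SetXpuQuantHints(fastdeploy::RuntimeOption* option) {
  // 8-bit weights for matmul ops is only an illustrative choice.
  option->paddle_infer_option.SetXpuConfig(
      /* quant_post_dynamic_weight_bits = */ 8,
      /* quant_post_dynamic_op_types = */ {"matmul", "matmul_v2"});
}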
@@ -78,9 +78,28 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
                        option.ipu_option.ipu_available_memory_proportion,
                        option.ipu_option.ipu_enable_half_partial);
#else
-    FDWARNING << "The FastDeploy is not compiled with IPU backend, so will "
+    FDWARNING << "The FastDeploy is not compiled with IPU device, so will "
                 "fallback to CPU with Paddle Inference Backend."
              << std::endl;
#endif
  } else if (option.device == Device::KUNLUNXIN) {
#ifdef WITH_KUNLUNXIN
    config_.EnableXpu(option.xpu_option.kunlunxin_l3_workspace_size,
                      option.xpu_option.kunlunxin_locked,
                      option.xpu_option.kunlunxin_autotune,
                      option.xpu_option.kunlunxin_autotune_file,
                      option.xpu_option.kunlunxin_precision,
                      option.xpu_option.kunlunxin_adaptive_seqlen,
                      option.xpu_option.kunlunxin_enable_multi_stream);
    config_.SetXpuConfig(
        option.xpu_option.kunlunxin_quant_post_dynamic_weight_bits,
        option.xpu_option.kunlunxin_quant_post_dynamic_op_types);
    config_.SetXpuDeviceId(option.xpu_option.kunlunxin_device_id);
#else
    FDWARNING
        << "The FastDeploy is not compiled with KUNLUNXIN device, so will "
           "fallback to CPU with Paddle Inference Backend."
        << std::endl;
#endif
  } else {
    config_.DisableGpu();
@@ -89,6 +108,7 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
      config_.SetMkldnnCacheCapacity(option.mkldnn_cache_size);
    }
  }

  if (!option.enable_log_info) {
    config_.DisableGlogInfo();
  }
@@ -106,6 +126,7 @@ bool PaddleBackend::Init(const RuntimeOption& runtime_option) {
  }

  auto option = runtime_option;
  // Collect basic paddle inference option and trt option.
  option.paddle_infer_option.model_file = runtime_option.model_file;
  option.paddle_infer_option.params_file = runtime_option.params_file;
  option.paddle_infer_option.model_from_memory_ =
@@ -117,6 +138,10 @@ bool PaddleBackend::Init(const RuntimeOption& runtime_option) {
  option.paddle_infer_option.external_stream_ = runtime_option.external_stream_;
  option.paddle_infer_option.trt_option = runtime_option.trt_option;
  option.paddle_infer_option.trt_option.gpu_id = runtime_option.device_id;
  // Note(qiuyanjun): For the IPU and XPU options, please check the details of
  // RuntimeOption::UseIpu() and RuntimeOption::UseKunlunXin(). Furthermore,
  // please check paddle_infer_option.SetIpuConfig() and
  // paddle_infer_option.SetXpuConfig() for more details of the extra configs.
  return InitFromPaddle(option.model_file, option.params_file,
                        option.model_from_memory_, option.paddle_infer_option);
}
@@ -19,6 +19,8 @@ namespace fastdeploy {
paddle_infer::PlaceType ConvertFDDeviceToPlace(Device device) {
  if (device == Device::GPU) {
    return paddle_infer::PlaceType::kGPU;
  } else if (device == Device::KUNLUNXIN) {
    return paddle_infer::PlaceType::kXPU;
  }
  return paddle_infer::PlaceType::kCPU;
}
@@ -52,9 +54,21 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
      tensor->CopyFromCpu(static_cast<const int64_t*>(fd_tensor.Data()));
    }
    return;
  } else if (fd_tensor.dtype == FDDataType::INT8) {
    if (place == paddle_infer::PlaceType::kGPU) {
      tensor->ShareExternalData(static_cast<const int8_t*>(fd_tensor.Data()),
                                shape, place);
    } else {
      tensor->CopyFromCpu(static_cast<const int8_t*>(fd_tensor.Data()));
    }
    return;
  } else if (fd_tensor.dtype == FDDataType::UINT8) {
-    tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
-                              shape, paddle_infer::PlaceType::kCPU);
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const uint8_t*>(fd_tensor.Data()));
+    }
    return;
  }
  FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
@@ -89,9 +103,21 @@ void ShareOutTensorFromFDTensor(paddle_infer::Tensor* tensor,
      tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor.MutableData()));
    }
    return;
  } else if (fd_tensor.dtype == FDDataType::INT8) {
    if (place == paddle_infer::PlaceType::kGPU) {
      tensor->ShareExternalData(static_cast<const int8_t*>(fd_tensor.Data()),
                                shape, place);
    } else {
      tensor->CopyFromCpu(static_cast<const int8_t*>(fd_tensor.Data()));
    }
    return;
  } else if (fd_tensor.dtype == FDDataType::UINT8) {
-    tensor->ShareExternalData(static_cast<uint8_t*>(fd_tensor.MutableData()),
-                              shape, paddle_infer::PlaceType::kCPU);
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const uint8_t*>(fd_tensor.Data()));
+    }
    return;
  }
  FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
@@ -149,6 +175,11 @@ void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
    Device device = Device::CPU;
    if (place == paddle_infer::PlaceType::kGPU) {
      device = Device::GPU;
    } else if (place == paddle_infer::PlaceType::kXPU) {
      device = Device::KUNLUNXIN;
      FDASSERT(false,
               "Currently, copy_to_fd=false, FDTensor SetExternalData "
               "is not support for Device::KUNLUNXIN now!")
    }
    fd_tensor->name = tensor->name();
    fd_tensor->SetExternalData(shape, fd_dtype, out_data, device);