From 49c033a8283f5553a293d9cfcd97bca1eb32919f Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Thu, 25 May 2023 14:13:40 +0800
Subject: [PATCH] [XPU] Support XPU via Paddle Inference backend (#1987)

* [backend] Support XPU via Paddle Inference backend

* [backend] Support XPU via Paddle Inference backend

* [backend] Support XPU via Paddle Inference backend

* [XPU] support XPU benchmark via paddle inference

* [XPU] support XPU benchmark via paddle inference

* [benchmark] add xpu paddle h2d config files
---
 benchmark/cpp/CMakeLists.txt                  |  6 +-
 .../config/config.xpu.paddle.fp32.h2d.l3.txt  | 14 +++++
 .../cpp/config/config.xpu.paddle.fp32.h2d.txt | 14 +++++
 .../cpp/config/config.xpu.paddle.fp32.l3.txt  | 14 +++++
 .../cpp/config/config.xpu.paddle.fp32.txt     | 14 +++++
 benchmark/cpp/flags.h                         |  4 +-
 benchmark/cpp/option.h                        | 24 +++++---
 cmake/kunlunxin.cmake                         | 36 +++++++-----
 cmake/paddle_inference.cmake                  | 11 +++-
 fastdeploy/core/config.h.in                   |  4 ++
 fastdeploy/runtime/backends/paddle/option.h   | 45 ++++++++++++++-
 .../runtime/backends/paddle/paddle_backend.cc | 27 ++++++++-
 fastdeploy/runtime/backends/paddle/util.cc    | 39 +++++++++++--
 fastdeploy/runtime/enum_variables.h           |  2 +-
 fastdeploy/runtime/runtime_option.cc          | 57 +++++++++++++------
 ...ild_linux_x86_64_cpp_xpu_with_benchmark.sh |  8 +--
 16 files changed, 262 insertions(+), 57 deletions(-)
 create mode 100755 benchmark/cpp/config/config.xpu.paddle.fp32.h2d.l3.txt
 create mode 100755 benchmark/cpp/config/config.xpu.paddle.fp32.h2d.txt
 create mode 100755 benchmark/cpp/config/config.xpu.paddle.fp32.l3.txt
 create mode 100755 benchmark/cpp/config/config.xpu.paddle.fp32.txt

diff --git a/benchmark/cpp/CMakeLists.txt b/benchmark/cpp/CMakeLists.txt
index 627d07714..ee531a57f 100755
--- a/benchmark/cpp/CMakeLists.txt
+++ b/benchmark/cpp/CMakeLists.txt
@@ -7,6 +7,7 @@
 include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
 include_directories(${FASTDEPLOY_INCS})
 
+add_executable(benchmark ${PROJECT_SOURCE_DIR}/benchmark.cc)
 add_executable(benchmark_yolov5 ${PROJECT_SOURCE_DIR}/benchmark_yolov5.cc)
 add_executable(benchmark_ppyolov5 ${PROJECT_SOURCE_DIR}/benchmark_ppyolov5.cc)
 add_executable(benchmark_ppyolov6 ${PROJECT_SOURCE_DIR}/benchmark_ppyolov6.cc)
@@ -37,13 +38,13 @@ add_executable(benchmark_gfl ${PROJECT_SOURCE_DIR}/benchmark_gfl.cc)
 add_executable(benchmark_retinanet ${PROJECT_SOURCE_DIR}/benchmark_retinanet.cc)
 add_executable(benchmark_tood ${PROJECT_SOURCE_DIR}/benchmark_tood.cc)
 add_executable(benchmark_ttfnet ${PROJECT_SOURCE_DIR}/benchmark_ttfnet.cc)
-add_executable(benchmark ${PROJECT_SOURCE_DIR}/benchmark.cc)
 add_executable(benchmark_ppdet ${PROJECT_SOURCE_DIR}/benchmark_ppdet.cc)
 add_executable(benchmark_dino ${PROJECT_SOURCE_DIR}/benchmark_dino.cc)
 add_executable(benchmark_ppshituv2_rec ${PROJECT_SOURCE_DIR}/benchmark_ppshituv2_rec.cc)
 add_executable(benchmark_ppshituv2_det ${PROJECT_SOURCE_DIR}/benchmark_ppshituv2_det.cc)
 
 if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
+  target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ppyolov5 ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ppyolov6 ${FASTDEPLOY_LIBS} gflags pthread)
@@ -74,12 +75,12 @@ if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
   target_link_libraries(benchmark_retinanet ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_tood ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ttfnet ${FASTDEPLOY_LIBS} gflags pthread)
-  target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ppdet ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_dino ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ppshituv2_rec ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ppshituv2_det ${FASTDEPLOY_LIBS} gflags pthread)
 else()
+  target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ppyolov5 ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ppyolov6 ${FASTDEPLOY_LIBS} gflags)
@@ -110,7 +111,6 @@ else()
   target_link_libraries(benchmark_retinanet ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_tood ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ttfnet ${FASTDEPLOY_LIBS} gflags)
-  target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ppdet ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_dino ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ppshituv2_rec ${FASTDEPLOY_LIBS} gflags)
diff --git a/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.l3.txt b/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.l3.txt
new file mode 100755
index 000000000..c89b4b6df
--- /dev/null
+++ b/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.l3.txt
@@ -0,0 +1,14 @@
+device: xpu
+device_id: 0
+cpu_thread_nums: 1
+warmup: 200
+repeat: 1000
+backend: paddle
+profile_mode: runtime
+include_h2d_d2h: true
+use_fp16: false
+collect_memory_info: false
+sampling_interval: 1
+precision_compare: false
+xpu_l3_cache: 62914560
+result_path: benchmark_xpu_paddle_fp32_l3.txt
diff --git a/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.txt b/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.txt
new file mode 100755
index 000000000..890fb7276
--- /dev/null
+++ b/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.txt
@@ -0,0 +1,14 @@
+device: xpu
+device_id: 0
+cpu_thread_nums: 1
+warmup: 200
+repeat: 1000
+backend: paddle
+profile_mode: runtime
+include_h2d_d2h: true
+use_fp16: false
+collect_memory_info: false
+sampling_interval: 1
+precision_compare: false
+xpu_l3_cache: 0
+result_path: benchmark_xpu_paddle_fp32.txt
diff --git a/benchmark/cpp/config/config.xpu.paddle.fp32.l3.txt b/benchmark/cpp/config/config.xpu.paddle.fp32.l3.txt
new file mode 100755
index 000000000..59103958b
--- /dev/null
+++ b/benchmark/cpp/config/config.xpu.paddle.fp32.l3.txt
@@ -0,0 +1,14 @@
+device: xpu
+device_id: 0
+cpu_thread_nums: 1
+warmup: 200
+repeat: 1000
+backend: paddle
+profile_mode: runtime
+include_h2d_d2h: false
+use_fp16: false
+collect_memory_info: false
+sampling_interval: 1
+precision_compare: false
+xpu_l3_cache: 62914560
+result_path: benchmark_xpu_paddle_fp32_l3.txt
diff --git a/benchmark/cpp/config/config.xpu.paddle.fp32.txt b/benchmark/cpp/config/config.xpu.paddle.fp32.txt
new file mode 100755
index 000000000..3f65b9d0a
--- /dev/null
+++ b/benchmark/cpp/config/config.xpu.paddle.fp32.txt
@@ -0,0 +1,14 @@
+device: xpu
+device_id: 0
+cpu_thread_nums: 1
+warmup: 200
+repeat: 1000
+backend: paddle
+profile_mode: runtime
+include_h2d_d2h: false
+use_fp16: false
+collect_memory_info: false
+sampling_interval: 1
+precision_compare: false
+xpu_l3_cache: 0
+result_path: benchmark_xpu_paddle_fp32.txt
diff --git a/benchmark/cpp/flags.h b/benchmark/cpp/flags.h
index a36591a2c..049c066ac 100755
--- a/benchmark/cpp/flags.h
+++ b/benchmark/cpp/flags.h
@@ -60,7 +60,9 @@
 DEFINE_int32(device_id, -1, "Optional, set specific device id for GPU/XPU, default -1."
             "will force to override the value in config file "
             "eg, 0/1/2/...");
-
+DEFINE_bool(enable_log_info, false,
+            "Optional, whether to enable log info for paddle backend, "
+            "default false.");
 
 static void PrintUsage() {
   std::cout << "Usage: infer_demo --model model_path --image img_path "
diff --git a/benchmark/cpp/option.h b/benchmark/cpp/option.h
index 0dad4824c..3bc6a94ca 100755
--- a/benchmark/cpp/option.h
+++ b/benchmark/cpp/option.h
@@ -18,6 +18,7 @@
 static void UpdateBaseCustomFlags(
     std::unordered_map<std::string, std::string>& config_info) {
+  // see benchmark/cpp/flags.h
   if (FLAGS_warmup > -1) {
     config_info["warmup"] = std::to_string(FLAGS_warmup);
   }
@@ -30,6 +31,14 @@ static void UpdateBaseCustomFlags(
   if (FLAGS_use_fp16) {
     config_info["use_fp16"] = "true";
   }
+  if (FLAGS_xpu_l3_cache >= 0) {
+    config_info["xpu_l3_cache"] = std::to_string(FLAGS_xpu_l3_cache);
+  }
+  if (FLAGS_enable_log_info) {
+    config_info["enable_log_info"] = "true";
+  } else {
+    config_info["enable_log_info"] = "false";
+  }
 }
 
 static bool CreateRuntimeOption(fastdeploy::RuntimeOption* option,
@@ -47,6 +56,9 @@ static bool CreateRuntimeOption(fastdeploy::RuntimeOption* option,
     option->EnableProfiling(config_info["include_h2d_d2h"] == "true",
                             repeat, warmup);
   }
+  if (config_info["enable_log_info"] == "true") {
+    option->paddle_infer_option.enable_log_info = true;
+  }
   if (config_info["device"] == "gpu") {
     option->UseGpu(std::stoi(config_info["device_id"]));
     if (config_info["backend"] == "ort") {
@@ -104,16 +116,14 @@ static bool CreateRuntimeOption(fastdeploy::RuntimeOption* option,
       return false;
     }
   } else if (config_info["device"] == "xpu") {
-    if (FLAGS_xpu_l3_cache >= 0) {
-      option->UseKunlunXin(std::stoi(config_info["device_id"]),
-                           FLAGS_xpu_l3_cache);
-    } else {
-      option->UseKunlunXin(std::stoi(config_info["device_id"]),
-                           std::stoi(config_info["xpu_l3_cache"]));
-    }
+    option->UseKunlunXin(std::stoi(config_info["device_id"]),
+                         std::stoi(config_info["xpu_l3_cache"]));
    if (config_info["backend"] == "ort") {
       option->UseOrtBackend();
     } else if (config_info["backend"] == "paddle") {
+      // Note: for Paddle Inference + XPU, fp16 needs no extra switch;
+      // as long as the model itself is fp16, it will automatically
+      // run with fp16 precision.
       option->UsePaddleInferBackend();
     } else if (config_info["backend"] == "lite") {
       option->UsePaddleLiteBackend();
diff --git a/cmake/kunlunxin.cmake b/cmake/kunlunxin.cmake
index 5935d826f..1afd691f5 100755
--- a/cmake/kunlunxin.cmake
+++ b/cmake/kunlunxin.cmake
@@ -1,20 +1,26 @@
-if(NOT ENABLE_LITE_BACKEND)
-  message("Will force to set ENABLE_LITE_BACKEND when build with KunlunXin.")
-  set(ENABLE_LITE_BACKEND ON)
+if(NOT ENABLE_PADDLE_BACKEND)
+  if(NOT ENABLE_LITE_BACKEND)
+    message(WARNING "Will force to set ENABLE_LITE_BACKEND=ON if ENABLE_PADDLE_BACKEND=OFF when building with KunlunXin.")
+    set(ENABLE_LITE_BACKEND ON)
+  endif()
+else()
+  if(ENABLE_LITE_BACKEND)
+    message(WARNING "Will force to set ENABLE_LITE_BACKEND=OFF if ENABLE_PADDLE_BACKEND=ON when building with KunlunXin.")
+    set(ENABLE_LITE_BACKEND OFF)
+  endif()
 endif()
 
 option(WITH_LITE_XPU_LOG "" ON)
-
-if(NOT PADDLELITE_URL)
-  if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
-    set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-aarch64-xpu-v213.tgz")
-  else ()
-    if (WITH_LITE_XPU_LOG)
-      # set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-20221215.tgz")
-      # set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-20230303.tgz")
-      set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-20230410.tgz")
-    else()
-      set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-without-log-20230303.tgz")
+if(NOT ENABLE_PADDLE_BACKEND)
+  if(NOT PADDLELITE_URL)
+    if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
+      set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-aarch64-xpu-v213.tgz")
+    else ()
+      if (WITH_LITE_XPU_LOG)
+        set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-20230410.tgz")
+      else()
+        set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-without-log-20230303.tgz")
+      endif()
     endif()
   endif()
-endif()
+endif()
\ No newline at end of file
diff --git a/cmake/paddle_inference.cmake b/cmake/paddle_inference.cmake
index 6e3d4d689..6766289b2 100755
--- a/cmake/paddle_inference.cmake
+++ b/cmake/paddle_inference.cmake
@@ -114,10 +114,11 @@ else()
   endif()
   set(PADDLEINFERENCE_VERSION "0.0.0.660f781b77")
 else()
-  # Linux with x86 CPU/Arm CPU/GPU/IPU ...
+  # Linux with x86/aarch64 CPU/Arm CPU/GPU/IPU ...
   if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
     message(FATAL_ERROR "Paddle Backend doesn't support linux aarch64 now.")
   else()
+    # x86_64
     if(WITH_GPU)
       set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-gpu-trt8.5.2.2-mkl-avx-0.0.0.660f781b77.tgz")
       set(PADDLEINFERENCE_VERSION "0.0.0.660f781b77")
@@ -125,14 +126,18 @@ else()
       set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-mkl-avx-0.0.0.660f781b77.tgz")
       set(PADDLEINFERENCE_VERSION "0.0.0.660f781b77")
     endif()
-    if (WITH_IPU)
+    if(WITH_IPU)
       set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-ipu-2.4-dev1.tgz")
       # TODO(qiuyanjun): Should use the commit id to tag the version
       set(PADDLEINFERENCE_VERSION "2.4-dev1")
     endif()
+    if(WITH_KUNLUNXIN)
+      set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-xpu-openblas-0.0.0.021fd73536.tgz")
+      set(PADDLEINFERENCE_VERSION "0.0.0.021fd73536")
+    endif()
 
     if(NEED_ABI0)
-      if(WITH_GPU OR WITH_PU)
+      if(WITH_GPU OR WITH_IPU OR WITH_KUNLUNXIN)
         message(WARNING "While NEED_ABI0=ON, only support CPU now, will fallback to CPU.")
       endif()
       set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-2.4.0-abi0.tgz")
diff --git a/fastdeploy/core/config.h.in b/fastdeploy/core/config.h.in
index 10d39cabb..f5a8d41b7 100755
--- a/fastdeploy/core/config.h.in
+++ b/fastdeploy/core/config.h.in
@@ -41,6 +41,10 @@
 #cmakedefine WITH_GPU
 #endif
 
+#ifndef WITH_KUNLUNXIN
+#cmakedefine WITH_KUNLUNXIN
+#endif
+
 #ifndef WITH_DIRECTML
 #cmakedefine WITH_DIRECTML
 #endif
diff --git a/fastdeploy/runtime/backends/paddle/option.h b/fastdeploy/runtime/backends/paddle/option.h
index 134fc7ef0..c6f48ff0c 100755
--- a/fastdeploy/runtime/backends/paddle/option.h
+++ b/fastdeploy/runtime/backends/paddle/option.h
@@ -45,6 +45,33 @@ struct IpuOption {
   bool ipu_enable_half_partial;
 };
 
+/*! @brief Option object to configure KunlunXin XPU
+ */
+struct XpuOption {
+  /// KunlunXin device id
+  int kunlunxin_device_id = 0;
+  /// Fields below are passed to paddle_infer::Config::EnableXpu()
+  /// L3 cache workspace size in bytes, default 0xfffc00 (about 16 MB)
+  int kunlunxin_l3_workspace_size = 0xfffc00;
+  /// Whether to lock the allocated L3 cache instead of sharing it
+  bool kunlunxin_locked = false;
+  /// Whether to autotune conv operators on their first execution
+  bool kunlunxin_autotune = true;
+  /// File used to read/store the autotune results
+  std::string kunlunxin_autotune_file = "";
+  /// Computation precision of multi_encoder, e.g. "int16"
+  std::string kunlunxin_precision = "int16";
+  /// Whether the input of multi_encoder is variable length
+  bool kunlunxin_adaptive_seqlen = false;
+  /// Whether to enable multi-stream execution
+  bool kunlunxin_enable_multi_stream = false;
+  /// Fields below are passed to paddle_infer::Config::SetXpuConfig()
+  /// quant post dynamic weight bits
+  int kunlunxin_quant_post_dynamic_weight_bits = -1;
+  /// quant post dynamic op types
+  std::vector<std::string> kunlunxin_quant_post_dynamic_op_types = {};
+};
+
 /*! @brief Option object to configure Paddle Inference backend
  */
 struct PaddleBackendOption {
@@ -63,6 +90,10 @@ struct PaddleBackendOption {
    * @brief IPU option, this will configure the IPU hardware, if inference model in IPU
    */
   IpuOption ipu_option;
+  /*
+   * @brief XPU option, this will configure the KunlunXin XPU hardware, if the inference model runs on XPU
+   */
+  XpuOption xpu_option;
 
   /// Collect shape for model while enable_trt is true
   bool collect_trt_shape = false;
@@ -84,8 +115,8 @@ struct PaddleBackendOption {
   }
 
   void SetIpuConfig(bool enable_fp16, int replica_num,
-                   float available_memory_proportion,
-                   bool enable_half_partial) {
+                    float available_memory_proportion,
+                    bool enable_half_partial) {
     ipu_option.ipu_enable_fp16 = enable_fp16;
     ipu_option.ipu_replica_num = replica_num;
     ipu_option.ipu_available_memory_proportion =
@@ -93,12 +124,22 @@ struct PaddleBackendOption {
     ipu_option.ipu_enable_half_partial = enable_half_partial;
   }
 
+  void SetXpuConfig(
+      int quant_post_dynamic_weight_bits = -1,
+      const std::vector<std::string>& quant_post_dynamic_op_types = {}) {
+    xpu_option.kunlunxin_quant_post_dynamic_weight_bits =
+        quant_post_dynamic_weight_bits;
+    xpu_option.kunlunxin_quant_post_dynamic_op_types =
+        quant_post_dynamic_op_types;
+  }
+
   // The belowing parameters may be removed, please do not
   // read or write them directly
   TrtBackendOption trt_option;
   bool enable_pinned_memory = false;
   void* external_stream_ = nullptr;
   Device device = Device::CPU;
+  /// device id for CPU/GPU
   int device_id = 0;
   std::vector<std::string> trt_disabled_ops_{};
   int cpu_thread_num = 8;
diff --git a/fastdeploy/runtime/backends/paddle/paddle_backend.cc b/fastdeploy/runtime/backends/paddle/paddle_backend.cc
index 3ea81cb6b..3db73a00a 100644
--- a/fastdeploy/runtime/backends/paddle/paddle_backend.cc
+++ b/fastdeploy/runtime/backends/paddle/paddle_backend.cc
@@ -78,9 +78,28 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
                          option.ipu_option.ipu_available_memory_proportion,
                          option.ipu_option.ipu_enable_half_partial);
 #else
-    FDWARNING << "The FastDeploy is not compiled with IPU backend, so will "
+    FDWARNING << "FastDeploy is not compiled with the IPU device, so will "
                  "fallback to CPU with Paddle Inference Backend."
               << std::endl;
+#endif
+  } else if (option.device == Device::KUNLUNXIN) {
+#ifdef WITH_KUNLUNXIN
+    config_.EnableXpu(option.xpu_option.kunlunxin_l3_workspace_size,
+                      option.xpu_option.kunlunxin_locked,
+                      option.xpu_option.kunlunxin_autotune,
+                      option.xpu_option.kunlunxin_autotune_file,
+                      option.xpu_option.kunlunxin_precision,
+                      option.xpu_option.kunlunxin_adaptive_seqlen,
+                      option.xpu_option.kunlunxin_enable_multi_stream);
+    config_.SetXpuConfig(
+        option.xpu_option.kunlunxin_quant_post_dynamic_weight_bits,
+        option.xpu_option.kunlunxin_quant_post_dynamic_op_types);
+    config_.SetXpuDeviceId(option.xpu_option.kunlunxin_device_id);
+#else
+    FDWARNING
+        << "FastDeploy is not compiled with the KUNLUNXIN device, so will "
+           "fallback to CPU with Paddle Inference Backend."
+        << std::endl;
 #endif
   } else {
     config_.DisableGpu();
@@ -89,6 +108,7 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
       config_.SetMkldnnCacheCapacity(option.mkldnn_cache_size);
     }
   }
+
   if (!option.enable_log_info) {
     config_.DisableGlogInfo();
   }
@@ -106,6 +126,7 @@ bool PaddleBackend::Init(const RuntimeOption& runtime_option) {
   }
 
   auto option = runtime_option;
+  // Collect the basic Paddle Inference options and TRT options.
   option.paddle_infer_option.model_file = runtime_option.model_file;
   option.paddle_infer_option.params_file = runtime_option.params_file;
   option.paddle_infer_option.model_from_memory_ = runtime_option.model_from_memory_;
@@ -117,6 +138,10 @@ bool PaddleBackend::Init(const RuntimeOption& runtime_option) {
   option.paddle_infer_option.external_stream_ = runtime_option.external_stream_;
   option.paddle_infer_option.trt_option = runtime_option.trt_option;
   option.paddle_infer_option.trt_option.gpu_id = runtime_option.device_id;
+  // Note(qiuyanjun): For the IPU option and the XPU option, please check the
+  // details of RuntimeOption::UseIpu() and RuntimeOption::UseKunlunXin().
+  // Furthermore, please check paddle_infer_option.SetIpuConfig() and
+  // paddle_infer_option.SetXpuConfig() for the extra configs.
   return InitFromPaddle(option.model_file, option.params_file,
                         option.model_from_memory_, option.paddle_infer_option);
 }
diff --git a/fastdeploy/runtime/backends/paddle/util.cc b/fastdeploy/runtime/backends/paddle/util.cc
index a4e4ed29b..58fa96552 100644
--- a/fastdeploy/runtime/backends/paddle/util.cc
+++ b/fastdeploy/runtime/backends/paddle/util.cc
@@ -19,6 +19,8 @@ namespace fastdeploy {
 paddle_infer::PlaceType ConvertFDDeviceToPlace(Device device) {
   if (device == Device::GPU) {
     return paddle_infer::PlaceType::kGPU;
+  } else if (device == Device::KUNLUNXIN) {
+    return paddle_infer::PlaceType::kXPU;
   }
   return paddle_infer::PlaceType::kCPU;
 }
@@ -52,9 +54,21 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
       tensor->CopyFromCpu(static_cast<const int64_t*>(fd_tensor.Data()));
     }
     return;
+  } else if (fd_tensor.dtype == FDDataType::INT8) {
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const int8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const int8_t*>(fd_tensor.Data()));
+    }
+    return;
   } else if (fd_tensor.dtype == FDDataType::UINT8) {
-    tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
-                              shape, paddle_infer::PlaceType::kCPU);
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const uint8_t*>(fd_tensor.Data()));
+    }
     return;
   }
   FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
@@ -89,9 +103,21 @@ void ShareOutTensorFromFDTensor(paddle_infer::Tensor* tensor,
       tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor.MutableData()));
     }
     return;
+  } else if (fd_tensor.dtype == FDDataType::INT8) {
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const int8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const int8_t*>(fd_tensor.Data()));
+    }
+    return;
   } else if (fd_tensor.dtype == FDDataType::UINT8) {
-    tensor->ShareExternalData(static_cast<uint8_t*>(fd_tensor.MutableData()),
-                              shape, paddle_infer::PlaceType::kCPU);
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const uint8_t*>(fd_tensor.Data()));
+    }
     return;
   }
   FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
@@ -149,6 +175,11 @@ void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
     Device device = Device::CPU;
     if (place == paddle_infer::PlaceType::kGPU) {
       device = Device::GPU;
+    } else if (place == paddle_infer::PlaceType::kXPU) {
+      device = Device::KUNLUNXIN;
+      FDASSERT(false,
+               "Currently, copy_to_fd=false, FDTensor SetExternalData "
+               "is not supported for Device::KUNLUNXIN yet!")
     }
     fd_tensor->name = tensor->name();
     fd_tensor->SetExternalData(shape, fd_dtype, out_data, device);
diff --git a/fastdeploy/runtime/enum_variables.h b/fastdeploy/runtime/enum_variables.h
index b8427303e..0e23f21ee 100644
--- a/fastdeploy/runtime/enum_variables.h
+++ b/fastdeploy/runtime/enum_variables.h
@@ -99,7 +99,7 @@ static std::map<Device, std::vector<Backend>>
     {Device::SUNRISENPU, {Backend::HORIZONNPU}},
     {Device::IPU, {Backend::PDINFER}},
     {Device::TIMVX, {Backend::LITE}},
-    {Device::KUNLUNXIN, {Backend::LITE}},
+    {Device::KUNLUNXIN, {Backend::LITE, Backend::PDINFER}},
     {Device::ASCEND, {Backend::LITE}},
     {Device::SOPHGOTPUD, {Backend::SOPHGOTPU}},
     {Device::DIRECTML, {Backend::ORT}}
diff --git a/fastdeploy/runtime/runtime_option.cc b/fastdeploy/runtime/runtime_option.cc
index 563339237..2af84d482 100644
--- a/fastdeploy/runtime/runtime_option.cc
+++ b/fastdeploy/runtime/runtime_option.cc
@@ -79,14 +79,18 @@ void RuntimeOption::UseTimVX() {
   paddle_lite_option.device = device;
 }
 
-void RuntimeOption::UseKunlunXin(int kunlunxin_id, int l3_workspace_size,
+void RuntimeOption::UseKunlunXin(int kunlunxin_id,
+                                 int l3_workspace_size,
                                  bool locked, bool autotune,
                                  const std::string& autotune_file,
                                  const std::string& precision,
                                  bool adaptive_seqlen,
                                  bool enable_multi_stream,
                                  int64_t gm_default_size) {
+#ifdef WITH_KUNLUNXIN
   device = Device::KUNLUNXIN;
+
+#ifdef ENABLE_LITE_BACKEND
   paddle_lite_option.device = device;
   paddle_lite_option.device_id = kunlunxin_id;
   paddle_lite_option.kunlunxin_l3_workspace_size = l3_workspace_size;
@@ -97,6 +101,42 @@ void RuntimeOption::UseKunlunXin(int kunlunxin_id, int l3_workspace_size,
   paddle_lite_option.kunlunxin_adaptive_seqlen = adaptive_seqlen;
   paddle_lite_option.kunlunxin_enable_multi_stream = enable_multi_stream;
   paddle_lite_option.kunlunxin_gm_default_size = gm_default_size;
+#endif
+#ifdef ENABLE_PADDLE_BACKEND
+  paddle_infer_option.device = device;
+  paddle_infer_option.xpu_option.kunlunxin_device_id = kunlunxin_id;
+  paddle_infer_option.xpu_option.kunlunxin_l3_workspace_size = l3_workspace_size;
+  paddle_infer_option.xpu_option.kunlunxin_locked = locked;
+  paddle_infer_option.xpu_option.kunlunxin_autotune = autotune;
+  paddle_infer_option.xpu_option.kunlunxin_autotune_file = autotune_file;
+  paddle_infer_option.xpu_option.kunlunxin_precision = precision;
+  paddle_infer_option.xpu_option.kunlunxin_adaptive_seqlen = adaptive_seqlen;
+  paddle_infer_option.xpu_option.kunlunxin_enable_multi_stream = enable_multi_stream;
+  // paddle_infer_option.xpu_option.kunlunxin_gm_default_size = gm_default_size;
+  // use paddle_infer_option.SetXpuConfig() for more options.
+#endif
+
+#else
+  FDWARNING << "FastDeploy was not compiled with KUNLUNXIN, will force to use CPU."
+            << std::endl;
+  device = Device::CPU;
+#endif
+}
+
+void RuntimeOption::UseIpu(int device_num, int micro_batch_size,
+                           bool enable_pipelining, int batches_per_step) {
+#ifdef WITH_IPU
+  device = Device::IPU;
+  paddle_infer_option.ipu_option.ipu_device_num = device_num;
+  paddle_infer_option.ipu_option.ipu_micro_batch_size = micro_batch_size;
+  paddle_infer_option.ipu_option.ipu_enable_pipelining = enable_pipelining;
+  paddle_infer_option.ipu_option.ipu_batches_per_step = batches_per_step;
+  // use paddle_infer_option.SetIpuConfig() for more options.
+#else
+  FDWARNING << "FastDeploy was not compiled with IPU, will force to use CPU."
+            << std::endl;
+  device = Device::CPU;
+#endif
 }
 
 void RuntimeOption::UseAscend() {
@@ -484,19 +524,4 @@ void RuntimeOption::DisablePaddleTrtOPs(const std::vector<std::string>& ops) {
   paddle_infer_option.DisableTrtOps(ops);
 }
 
-void RuntimeOption::UseIpu(int device_num, int micro_batch_size,
-                           bool enable_pipelining, int batches_per_step) {
-#ifdef WITH_IPU
-  device = Device::IPU;
-  ipu_device_num = device_num;
-  ipu_micro_batch_size = micro_batch_size;
-  ipu_enable_pipelining = enable_pipelining;
-  ipu_batches_per_step = batches_per_step;
-#else
-  FDWARNING << "The FastDeploy didn't compile with IPU, will force to use CPU."
-            << std::endl;
-  device = Device::CPU;
-#endif
-}
-
 }  // namespace fastdeploy
diff --git a/scripts/linux/build_linux_x86_64_cpp_xpu_with_benchmark.sh b/scripts/linux/build_linux_x86_64_cpp_xpu_with_benchmark.sh
index 414fa33da..5d058093d 100755
--- a/scripts/linux/build_linux_x86_64_cpp_xpu_with_benchmark.sh
+++ b/scripts/linux/build_linux_x86_64_cpp_xpu_with_benchmark.sh
@@ -52,11 +52,11 @@ __build_fastdeploy_linux_x86_64_xpu_shared() {
   local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install"
   cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}"
 
-  cmake -DWITH_KUNLUNXIN=ON \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DWITH_GPU=OFF \
+  cmake -DCMAKE_BUILD_TYPE=Release \
+        -DWITH_KUNLUNXIN=ON \
         -DENABLE_ORT_BACKEND=OFF \
-        -DENABLE_PADDLE_BACKEND=OFF \
+        -DENABLE_PADDLE_BACKEND=ON \
+        -DENABLE_LITE_BACKEND=OFF \
         -DENABLE_VISION=ON \
         -DENABLE_BENCHMARK=ON \
         -DBUILD_EXAMPLES=OFF \
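
Usage note: with this patch, a KunlunXin XPU can be driven through Paddle Inference (Backend::PDINFER) by pairing RuntimeOption::UseKunlunXin() with RuntimeOption::UsePaddleInferBackend(), provided FastDeploy was built with -DWITH_KUNLUNXIN=ON -DENABLE_PADDLE_BACKEND=ON -DENABLE_LITE_BACKEND=OFF as in the updated build script. The snippet below is a minimal sketch against the public FastDeploy C++ runtime API; the model paths are hypothetical, and the L3 cache size simply reuses the 62914560-byte (60 MB) value from the new *.l3.txt benchmark configs.

#include "fastdeploy/runtime.h"

int main() {
  fastdeploy::RuntimeOption option;
  // Select KunlunXin XPU device 0 with a 60 MB L3 cache workspace
  // (62914560 bytes, the value used by the *.l3.txt benchmark configs).
  option.UseKunlunXin(0, 62914560);
  // Route execution through Paddle Inference instead of Paddle Lite.
  option.UsePaddleInferBackend();
  // Optional knobs introduced by this patch:
  option.paddle_infer_option.enable_log_info = true;  // keep glog output
  option.paddle_infer_option.SetXpuConfig(-1, {});    // quant-post defaults
  // Hypothetical model paths; any exported Paddle inference model works.
  option.SetModelPath("model.pdmodel", "model.pdiparams");

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) {
    return -1;
  }
  // ... fill std::vector<fastdeploy::FDTensor> inputs and call
  // runtime.Infer(inputs, &outputs) as usual ...
  return 0;
}

The same path is exercised by the new benchmark configs (config.xpu.paddle.fp32*.txt), which set device: xpu and backend: paddle and are consumed by CreateRuntimeOption() in benchmark/cpp/option.h.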