[XPU] Support XPU via Paddle Inference backend (#1987)

* [backend] Support XPU via Paddle Inference backend

* [XPU] Support XPU benchmark via Paddle Inference

* [benchmark] Add XPU Paddle h2d config files
Author: DefTruth
Date: 2023-05-25 14:13:40 +08:00 (committed by GitHub)
Parent: 24f32d10a7 · Commit: 49c033a828
16 changed files with 262 additions and 57 deletions
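For context, here is a minimal sketch of what this commit enables from the user side, built only from the `RuntimeOption` calls that appear in the diffs below (the include path and the surrounding `main` are assumptions, not part of the commit):

```cpp
#include "fastdeploy/runtime.h"

int main() {
  fastdeploy::RuntimeOption option;
  // Bind inference to XPU device 0 with a 62914560-byte (60 MB)
  // L3 cache, matching the new benchmark configs below.
  option.UseKunlunXin(0, 62914560);
  // Route execution through the Paddle Inference backend.
  option.UsePaddleInferBackend();
  // Surface Paddle Inference log output, as toggled by the new
  // --enable_log_info flag introduced in this commit.
  option.paddle_infer_option.enable_log_info = true;
  return 0;
}
```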

View File

@@ -7,6 +7,7 @@ include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
include_directories(${FASTDEPLOY_INCS})
add_executable(benchmark ${PROJECT_SOURCE_DIR}/benchmark.cc)
add_executable(benchmark_yolov5 ${PROJECT_SOURCE_DIR}/benchmark_yolov5.cc)
add_executable(benchmark_ppyolov5 ${PROJECT_SOURCE_DIR}/benchmark_ppyolov5.cc)
add_executable(benchmark_ppyolov6 ${PROJECT_SOURCE_DIR}/benchmark_ppyolov6.cc)
@@ -37,13 +38,13 @@ add_executable(benchmark_gfl ${PROJECT_SOURCE_DIR}/benchmark_gfl.cc)
add_executable(benchmark_retinanet ${PROJECT_SOURCE_DIR}/benchmark_retinanet.cc)
add_executable(benchmark_tood ${PROJECT_SOURCE_DIR}/benchmark_tood.cc)
add_executable(benchmark_ttfnet ${PROJECT_SOURCE_DIR}/benchmark_ttfnet.cc)
add_executable(benchmark ${PROJECT_SOURCE_DIR}/benchmark.cc)
add_executable(benchmark_ppdet ${PROJECT_SOURCE_DIR}/benchmark_ppdet.cc)
add_executable(benchmark_dino ${PROJECT_SOURCE_DIR}/benchmark_dino.cc)
add_executable(benchmark_ppshituv2_rec ${PROJECT_SOURCE_DIR}/benchmark_ppshituv2_rec.cc)
add_executable(benchmark_ppshituv2_det ${PROJECT_SOURCE_DIR}/benchmark_ppshituv2_det.cc)
if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_ppyolov5 ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_ppyolov6 ${FASTDEPLOY_LIBS} gflags pthread)
@@ -74,12 +75,12 @@ if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
target_link_libraries(benchmark_retinanet ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_tood ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_ttfnet ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_ppdet ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_dino ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_ppshituv2_rec ${FASTDEPLOY_LIBS} gflags pthread)
target_link_libraries(benchmark_ppshituv2_det ${FASTDEPLOY_LIBS} gflags pthread)
else()
target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_ppyolov5 ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_ppyolov6 ${FASTDEPLOY_LIBS} gflags)
@@ -110,7 +111,6 @@ else()
target_link_libraries(benchmark_retinanet ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_tood ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_ttfnet ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_ppdet ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_dino ${FASTDEPLOY_LIBS} gflags)
target_link_libraries(benchmark_ppshituv2_rec ${FASTDEPLOY_LIBS} gflags)
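Taken together, the four CMakeLists hunks appear to move the plain `benchmark` target (built from `benchmark.cc`, the driver for the new XPU configs) to the top of the executable list, and relocate its `target_link_libraries` line accordingly in both the pthread (Linux) branch and the non-pthread branch.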

View File

@@ -0,0 +1,14 @@
device: xpu
device_id: 0
cpu_thread_nums: 1
warmup: 200
repeat: 1000
backend: paddle
profile_mode: runtime
include_h2d_d2h: true
use_fp16: false
collect_memory_info: false
sampling_interval: 1
precision_compare: false
xpu_l3_cache: 62914560
result_path: benchmark_xpu_paddle_fp32_l3.txt
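A note on the two XPU-specific keys, based on how the option-handling code later in this commit consumes them: `xpu_l3_cache` is the L3 workspace size in bytes (62914560 bytes = 60 MB) and is passed straight through to `UseKunlunXin`; `include_h2d_d2h: true` makes the runtime profiler include host-to-device/device-to-host transfer time, which is what the "h2d" in the commit message refers to.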

View File

@@ -0,0 +1,14 @@
device: xpu
device_id: 0
cpu_thread_nums: 1
warmup: 200
repeat: 1000
backend: paddle
profile_mode: runtime
include_h2d_d2h: true
use_fp16: false
collect_memory_info: false
sampling_interval: 1
precision_compare: false
xpu_l3_cache: 0
result_path: benchmark_xpu_paddle_fp32.txt

View File

@@ -0,0 +1,14 @@
device: xpu
device_id: 0
cpu_thread_nums: 1
warmup: 200
repeat: 1000
backend: paddle
profile_mode: runtime
include_h2d_d2h: false
use_fp16: false
collect_memory_info: false
sampling_interval: 1
precision_compare: false
xpu_l3_cache: 62914560
result_path: benchmark_xpu_paddle_fp32_l3.txt

View File

@@ -0,0 +1,14 @@
device: xpu
device_id: 0
cpu_thread_nums: 1
warmup: 200
repeat: 1000
backend: paddle
profile_mode: runtime
include_h2d_d2h: false
use_fp16: false
collect_memory_info: false
sampling_interval: 1
precision_compare: false
xpu_l3_cache: 0
result_path: benchmark_xpu_paddle_fp32.txt
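The four new configs thus cover the 2×2 matrix of {h2d+d2h timing included, excluded} × {60 MB L3 cache, L3 cache disabled (0)}, writing results to `benchmark_xpu_paddle_fp32_l3.txt` or `benchmark_xpu_paddle_fp32.txt` accordingly.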

View File

@@ -60,7 +60,9 @@ DEFINE_int32(device_id, -1,
"Optional, set specific device id for GPU/XPU, default -1."
"will force to override the value in config file "
"eg, 0/1/2/...");
DEFINE_bool(enable_log_info, false,
"Optional, whether to enable log info for paddle backend,"
"default false.");
static void PrintUsage() {
std::cout << "Usage: infer_demo --model model_path --image img_path "

View File

@@ -18,6 +18,7 @@
static void UpdateBaseCustomFlags(
std::unordered_map<std::string, std::string>& config_info) {
// see benchmark/cpp/flags.h
if (FLAGS_warmup > -1) {
config_info["warmup"] = std::to_string(FLAGS_warmup);
}
@@ -30,6 +31,14 @@ static void UpdateBaseCustomFlags(
if (FLAGS_use_fp16) {
config_info["use_fp16"] = "true";
}
if (FLAGS_xpu_l3_cache >= 0) {
config_info["xpu_l3_cache"] = std::to_string(FLAGS_xpu_l3_cache);
}
if (FLAGS_enable_log_info) {
config_info["enable_log_info"] = "true";
} else {
config_info["enable_log_info"] = "false";
}
}
static bool CreateRuntimeOption(fastdeploy::RuntimeOption* option,
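Design note: `UpdateBaseCustomFlags` normalizes every command-line flag into the `config_info` map, so flags, when set, override the config-file values and the rest of the benchmark only consults the map. Folding `FLAGS_xpu_l3_cache` and `FLAGS_enable_log_info` in at this point is what lets the XPU branch below drop its flag-specific special case.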
@@ -47,6 +56,9 @@ static bool CreateRuntimeOption(fastdeploy::RuntimeOption* option,
option->EnableProfiling(config_info["include_h2d_d2h"] == "true",
repeat, warmup);
}
if (config_info["enable_log_info"] == "true") {
option->paddle_infer_option.enable_log_info = true;
}
if (config_info["device"] == "gpu") {
option->UseGpu(std::stoi(config_info["device_id"]));
if (config_info["backend"] == "ort") {
@@ -104,16 +116,14 @@ static bool CreateRuntimeOption(fastdeploy::RuntimeOption* option,
return false;
}
} else if (config_info["device"] == "xpu") {
if (FLAGS_xpu_l3_cache >= 0) {
option->UseKunlunXin(std::stoi(config_info["device_id"]),
FLAGS_xpu_l3_cache);
} else {
option->UseKunlunXin(std::stoi(config_info["device_id"]),
std::stoi(config_info["xpu_l3_cache"]));
}
option->UseKunlunXin(std::stoi(config_info["device_id"]),
std::stoi(config_info["xpu_l3_cache"]));
if (config_info["backend"] == "ort") {
option->UseOrtBackend();
} else if (config_info["backend"] == "paddle") {
// Note: for Paddle Inference + XPU, as long as
// the model itself is fp16, it will automatically
// run at fp16 precision.
option->UsePaddleInferBackend();
} else if (config_info["backend"] == "lite") {
option->UsePaddleLiteBackend();
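With the override handled centrally in `UpdateBaseCustomFlags`, the previous if/else on `FLAGS_xpu_l3_cache` collapses into a single `UseKunlunXin(device_id, xpu_l3_cache)` call, and backend selection for XPU then proceeds exactly as for the other devices: `ort`, `paddle` (Paddle Inference, the subject of this PR), or `lite`.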