From 49c033a8283f5553a293d9cfcd97bca1eb32919f Mon Sep 17 00:00:00 2001
From: DefTruth <31974251+DefTruth@users.noreply.github.com>
Date: Thu, 25 May 2023 14:13:40 +0800
Subject: [PATCH] [XPU] Support XPU via Paddle Inference backend (#1987)

* [backend] Support XPU via Paddle Inference backend

* [backend] Support XPU via Paddle Inference backend

* [backend] Support XPU via Paddle Inference backend

* [XPU] support XPU benchmark via paddle inference

* [XPU] support XPU benchmark via paddle inference

* [benchmark] add xpu paddle h2d config files
---
 benchmark/cpp/CMakeLists.txt                  |  6 +-
 .../config/config.xpu.paddle.fp32.h2d.l3.txt  | 14 +++++
 .../cpp/config/config.xpu.paddle.fp32.h2d.txt | 14 +++++
 .../cpp/config/config.xpu.paddle.fp32.l3.txt  | 14 +++++
 .../cpp/config/config.xpu.paddle.fp32.txt     | 14 +++++
 benchmark/cpp/flags.h                         |  4 +-
 benchmark/cpp/option.h                        | 24 +++++---
 cmake/kunlunxin.cmake                         | 36 +++++++-----
 cmake/paddle_inference.cmake                  | 11 +++-
 fastdeploy/core/config.h.in                   |  4 ++
 fastdeploy/runtime/backends/paddle/option.h   | 45 ++++++++++++++-
 .../runtime/backends/paddle/paddle_backend.cc | 27 ++++++++-
 fastdeploy/runtime/backends/paddle/util.cc    | 39 +++++++++++--
 fastdeploy/runtime/enum_variables.h           |  2 +-
 fastdeploy/runtime/runtime_option.cc          | 57 +++++++++++++------
 ...ild_linux_x86_64_cpp_xpu_with_benchmark.sh |  8 +--
 16 files changed, 262 insertions(+), 57 deletions(-)
 create mode 100755 benchmark/cpp/config/config.xpu.paddle.fp32.h2d.l3.txt
 create mode 100755 benchmark/cpp/config/config.xpu.paddle.fp32.h2d.txt
 create mode 100755 benchmark/cpp/config/config.xpu.paddle.fp32.l3.txt
 create mode 100755 benchmark/cpp/config/config.xpu.paddle.fp32.txt

diff --git a/benchmark/cpp/CMakeLists.txt b/benchmark/cpp/CMakeLists.txt
index 627d07714..ee531a57f 100755
--- a/benchmark/cpp/CMakeLists.txt
+++ b/benchmark/cpp/CMakeLists.txt
@@ -7,6 +7,7 @@
 include(${FASTDEPLOY_INSTALL_DIR}/FastDeploy.cmake)
 include_directories(${FASTDEPLOY_INCS})
 
+add_executable(benchmark ${PROJECT_SOURCE_DIR}/benchmark.cc)
 add_executable(benchmark_yolov5 ${PROJECT_SOURCE_DIR}/benchmark_yolov5.cc)
 add_executable(benchmark_ppyolov5 ${PROJECT_SOURCE_DIR}/benchmark_ppyolov5.cc)
 add_executable(benchmark_ppyolov6 ${PROJECT_SOURCE_DIR}/benchmark_ppyolov6.cc)
@@ -37,13 +38,13 @@ add_executable(benchmark_gfl ${PROJECT_SOURCE_DIR}/benchmark_gfl.cc)
 add_executable(benchmark_retinanet ${PROJECT_SOURCE_DIR}/benchmark_retinanet.cc)
 add_executable(benchmark_tood ${PROJECT_SOURCE_DIR}/benchmark_tood.cc)
 add_executable(benchmark_ttfnet ${PROJECT_SOURCE_DIR}/benchmark_ttfnet.cc)
-add_executable(benchmark ${PROJECT_SOURCE_DIR}/benchmark.cc)
 add_executable(benchmark_ppdet ${PROJECT_SOURCE_DIR}/benchmark_ppdet.cc)
 add_executable(benchmark_dino ${PROJECT_SOURCE_DIR}/benchmark_dino.cc)
 add_executable(benchmark_ppshituv2_rec ${PROJECT_SOURCE_DIR}/benchmark_ppshituv2_rec.cc)
 add_executable(benchmark_ppshituv2_det ${PROJECT_SOURCE_DIR}/benchmark_ppshituv2_det.cc)
 
 if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
+  target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ppyolov5 ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ppyolov6 ${FASTDEPLOY_LIBS} gflags pthread)
@@ -74,12 +75,12 @@ if(UNIX AND (NOT APPLE) AND (NOT ANDROID))
   target_link_libraries(benchmark_retinanet ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_tood ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ttfnet ${FASTDEPLOY_LIBS} gflags pthread)
-  target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ppdet ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_dino ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ppshituv2_rec ${FASTDEPLOY_LIBS} gflags pthread)
   target_link_libraries(benchmark_ppshituv2_det ${FASTDEPLOY_LIBS} gflags pthread)
 else()
+  target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_yolov5 ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ppyolov5 ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ppyolov6 ${FASTDEPLOY_LIBS} gflags)
@@ -110,7 +111,6 @@ else()
   target_link_libraries(benchmark_retinanet ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_tood ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ttfnet ${FASTDEPLOY_LIBS} gflags)
-  target_link_libraries(benchmark ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ppdet ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_dino ${FASTDEPLOY_LIBS} gflags)
   target_link_libraries(benchmark_ppshituv2_rec ${FASTDEPLOY_LIBS} gflags)
diff --git a/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.l3.txt b/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.l3.txt
new file mode 100755
index 000000000..c89b4b6df
--- /dev/null
+++ b/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.l3.txt
@@ -0,0 +1,14 @@
+device: xpu
+device_id: 0
+cpu_thread_nums: 1
+warmup: 200
+repeat: 1000
+backend: paddle
+profile_mode: runtime
+include_h2d_d2h: true
+use_fp16: false
+collect_memory_info: false
+sampling_interval: 1
+precision_compare: false
+xpu_l3_cache: 62914560
+result_path: benchmark_xpu_paddle_fp32_l3.txt
diff --git a/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.txt b/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.txt
new file mode 100755
index 000000000..890fb7276
--- /dev/null
+++ b/benchmark/cpp/config/config.xpu.paddle.fp32.h2d.txt
@@ -0,0 +1,14 @@
+device: xpu
+device_id: 0
+cpu_thread_nums: 1
+warmup: 200
+repeat: 1000
+backend: paddle
+profile_mode: runtime
+include_h2d_d2h: true
+use_fp16: false
+collect_memory_info: false
+sampling_interval: 1
+precision_compare: false
+xpu_l3_cache: 0
+result_path: benchmark_xpu_paddle_fp32.txt
diff --git a/benchmark/cpp/config/config.xpu.paddle.fp32.l3.txt b/benchmark/cpp/config/config.xpu.paddle.fp32.l3.txt
new file mode 100755
index 000000000..59103958b
--- /dev/null
+++ b/benchmark/cpp/config/config.xpu.paddle.fp32.l3.txt
@@ -0,0 +1,14 @@
+device: xpu
+device_id: 0
+cpu_thread_nums: 1
+warmup: 200
+repeat: 1000
+backend: paddle
+profile_mode: runtime
+include_h2d_d2h: false
+use_fp16: false
+collect_memory_info: false
+sampling_interval: 1
+precision_compare: false
+xpu_l3_cache: 62914560
+result_path: benchmark_xpu_paddle_fp32_l3.txt
diff --git a/benchmark/cpp/config/config.xpu.paddle.fp32.txt b/benchmark/cpp/config/config.xpu.paddle.fp32.txt
new file mode 100755
index 000000000..3f65b9d0a
--- /dev/null
+++ b/benchmark/cpp/config/config.xpu.paddle.fp32.txt
@@ -0,0 +1,14 @@
+device: xpu
+device_id: 0
+cpu_thread_nums: 1
+warmup: 200
+repeat: 1000
+backend: paddle
+profile_mode: runtime
+include_h2d_d2h: false
+use_fp16: false
+collect_memory_info: false
+sampling_interval: 1
+precision_compare: false
+xpu_l3_cache: 0
+result_path: benchmark_xpu_paddle_fp32.txt
diff --git a/benchmark/cpp/flags.h b/benchmark/cpp/flags.h
index a36591a2c..049c066ac 100755
--- a/benchmark/cpp/flags.h
+++ b/benchmark/cpp/flags.h
@@ -60,7 +60,9 @@
 DEFINE_int32(device_id, -1, "Optional, set specific device id for GPU/XPU, default -1."
             "will force to override the value in config file "
             "eg, 0/1/2/...");
-
+DEFINE_bool(enable_log_info, false,
+            "Optional, whether to enable log info for paddle backend, "
+            "default false.");
 
 static void PrintUsage() {
   std::cout << "Usage: infer_demo --model model_path --image img_path "
diff --git a/benchmark/cpp/option.h b/benchmark/cpp/option.h
index 0dad4824c..3bc6a94ca 100755
--- a/benchmark/cpp/option.h
+++ b/benchmark/cpp/option.h
@@ -18,6 +18,7 @@
 static void UpdateBaseCustomFlags(
     std::unordered_map<std::string, std::string>& config_info) {
+  // see benchmark/cpp/flags.h
   if (FLAGS_warmup > -1) {
     config_info["warmup"] = std::to_string(FLAGS_warmup);
   }
@@ -30,6 +31,14 @@ static void UpdateBaseCustomFlags(
   if (FLAGS_use_fp16) {
     config_info["use_fp16"] = "true";
   }
+  if (FLAGS_xpu_l3_cache >= 0) {
+    config_info["xpu_l3_cache"] = std::to_string(FLAGS_xpu_l3_cache);
+  }
+  if (FLAGS_enable_log_info) {
+    config_info["enable_log_info"] = "true";
+  } else {
+    config_info["enable_log_info"] = "false";
+  }
 }
 
 static bool CreateRuntimeOption(fastdeploy::RuntimeOption* option,
@@ -47,6 +56,9 @@ static bool CreateRuntimeOption(fastdeploy::RuntimeOption* option,
     option->EnableProfiling(config_info["include_h2d_d2h"] == "true",
                             repeat, warmup);
   }
+  if (config_info["enable_log_info"] == "true") {
+    option->paddle_infer_option.enable_log_info = true;
+  }
   if (config_info["device"] == "gpu") {
     option->UseGpu(std::stoi(config_info["device_id"]));
     if (config_info["backend"] == "ort") {
@@ -104,16 +116,14 @@ static bool CreateRuntimeOption(fastdeploy::RuntimeOption* option,
       return false;
     }
   } else if (config_info["device"] == "xpu") {
-    if (FLAGS_xpu_l3_cache >= 0) {
-      option->UseKunlunXin(std::stoi(config_info["device_id"]),
-                           FLAGS_xpu_l3_cache);
-    } else {
-      option->UseKunlunXin(std::stoi(config_info["device_id"]),
-                           std::stoi(config_info["xpu_l3_cache"]));
-    }
+    option->UseKunlunXin(std::stoi(config_info["device_id"]),
+                         std::stoi(config_info["xpu_l3_cache"]));
    if (config_info["backend"] == "ort") {
       option->UseOrtBackend();
     } else if (config_info["backend"] == "paddle") {
+      // Note: for Paddle Inference + XPU, fp16 needs no extra switch;
+      // as long as the model itself is fp16, it will automatically
+      // run with fp16 precision.
       option->UsePaddleInferBackend();
     } else if (config_info["backend"] == "lite") {
       option->UsePaddleLiteBackend();
diff --git a/cmake/kunlunxin.cmake b/cmake/kunlunxin.cmake
index 5935d826f..1afd691f5 100755
--- a/cmake/kunlunxin.cmake
+++ b/cmake/kunlunxin.cmake
@@ -1,20 +1,26 @@
-if(NOT ENABLE_LITE_BACKEND)
-  message("Will force to set ENABLE_LITE_BACKEND when build with KunlunXin.")
-  set(ENABLE_LITE_BACKEND ON)
+if(NOT ENABLE_PADDLE_BACKEND)
+  if(NOT ENABLE_LITE_BACKEND)
+    message(WARNING "Will force to set ENABLE_LITE_BACKEND=ON if ENABLE_PADDLE_BACKEND=OFF when building with KunlunXin.")
+    set(ENABLE_LITE_BACKEND ON)
+  endif()
+else()
+  if(ENABLE_LITE_BACKEND)
+    message(WARNING "Will force to set ENABLE_LITE_BACKEND=OFF if ENABLE_PADDLE_BACKEND=ON when building with KunlunXin.")
+    set(ENABLE_LITE_BACKEND OFF)
+  endif()
 endif()
 
 option(WITH_LITE_XPU_LOG "" ON)
-
-if(NOT PADDLELITE_URL)
-  if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
-    set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-aarch64-xpu-v213.tgz")
-  else ()
-    if (WITH_LITE_XPU_LOG)
-      # set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-20221215.tgz")
-      # set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-20230303.tgz")
-      set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-20230410.tgz")
-    else()
-      set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-without-log-20230303.tgz")
+if(NOT ENABLE_PADDLE_BACKEND)
+  if(NOT PADDLELITE_URL)
+    if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
+      set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-aarch64-xpu-v213.tgz")
+    else ()
+      if (WITH_LITE_XPU_LOG)
+        set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-20230410.tgz")
+      else()
+        set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-without-log-20230303.tgz")
+      endif()
     endif()
   endif()
-endif()
+endif()
\ No newline at end of file
diff --git a/cmake/paddle_inference.cmake b/cmake/paddle_inference.cmake
index 6e3d4d689..6766289b2 100755
--- a/cmake/paddle_inference.cmake
+++ b/cmake/paddle_inference.cmake
@@ -114,10 +114,11 @@ else()
   endif()
   set(PADDLEINFERENCE_VERSION "0.0.0.660f781b77")
 else()
-  # Linux with x86 CPU/Arm CPU/GPU/IPU ...
+  # Linux with x86/aarch64 CPU/Arm CPU/GPU/IPU ...
   if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
     message(FATAL_ERROR "Paddle Backend doesn't support linux aarch64 now.")
   else()
+    # x86_64
     if(WITH_GPU)
       set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-gpu-trt8.5.2.2-mkl-avx-0.0.0.660f781b77.tgz")
       set(PADDLEINFERENCE_VERSION "0.0.0.660f781b77")
@@ -125,14 +126,18 @@ else()
       set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-mkl-avx-0.0.0.660f781b77.tgz")
       set(PADDLEINFERENCE_VERSION "0.0.0.660f781b77")
     endif()
-    if (WITH_IPU)
+    if(WITH_IPU)
       set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-ipu-2.4-dev1.tgz")
       # TODO(qiuyanjun): Should use the commit id to tag the version
       set(PADDLEINFERENCE_VERSION "2.4-dev1")
     endif()
+    if(WITH_KUNLUNXIN)
+      set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-xpu-openblas-0.0.0.021fd73536.tgz")
+      set(PADDLEINFERENCE_VERSION "0.0.0.021fd73536")
+    endif()
 
     if(NEED_ABI0)
-      if(WITH_GPU OR WITH_PU)
+      if(WITH_GPU OR WITH_IPU OR WITH_KUNLUNXIN)
         message(WARNING "While NEED_ABI0=ON, only support CPU now, will fallback to CPU.")
       endif()
       set(PADDLEINFERENCE_FILE "paddle_inference-linux-x64-2.4.0-abi0.tgz")
diff --git a/fastdeploy/core/config.h.in b/fastdeploy/core/config.h.in
index 10d39cabb..f5a8d41b7 100755
--- a/fastdeploy/core/config.h.in
+++ b/fastdeploy/core/config.h.in
@@ -41,6 +41,10 @@
 #cmakedefine WITH_GPU
 #endif
 
+#ifndef WITH_KUNLUNXIN
+#cmakedefine WITH_KUNLUNXIN
+#endif
+
 #ifndef WITH_DIRECTML
 #cmakedefine WITH_DIRECTML
 #endif
diff --git a/fastdeploy/runtime/backends/paddle/option.h b/fastdeploy/runtime/backends/paddle/option.h
index 134fc7ef0..c6f48ff0c 100755
--- a/fastdeploy/runtime/backends/paddle/option.h
+++ b/fastdeploy/runtime/backends/paddle/option.h
@@ -45,6 +45,33 @@ struct IpuOption {
   bool ipu_enable_half_partial;
 };
 
+/*! @brief Option object to configure KunlunXin XPU
+ */
+struct XpuOption {
+  /// KunlunXin device id
+  int kunlunxin_device_id = 0;
+  /// Fields below are passed to paddle_infer::Config::EnableXpu()
+  /// L3 cache workspace size in bytes, default 0xfffc00 (about 16 MB)
+  int kunlunxin_l3_workspace_size = 0xfffc00;
+  /// Whether to lock the allocated L3 cache instead of sharing it
+  bool kunlunxin_locked = false;
+  /// Whether to autotune conv operators on their first execution
+  bool kunlunxin_autotune = true;
+  /// File used to read/store the autotune results
+  std::string kunlunxin_autotune_file = "";
+  /// Computation precision of multi_encoder, e.g. "int16"
+  std::string kunlunxin_precision = "int16";
+  /// Whether the input of multi_encoder is variable length
+  bool kunlunxin_adaptive_seqlen = false;
+  /// Whether to enable multi-stream execution
+  bool kunlunxin_enable_multi_stream = false;
+  /// Fields below are passed to paddle_infer::Config::SetXpuConfig()
+  /// quant post dynamic weight bits
+  int kunlunxin_quant_post_dynamic_weight_bits = -1;
+  /// quant post dynamic op types
+  std::vector<std::string> kunlunxin_quant_post_dynamic_op_types = {};
+};
+
 /*! @brief Option object to configure Paddle Inference backend
  */
 struct PaddleBackendOption {
@@ -63,6 +90,10 @@ struct PaddleBackendOption {
    * @brief IPU option, this will configure the IPU hardware, if inference model in IPU
    */
   IpuOption ipu_option;
+  /*
+   * @brief XPU option, this will configure the KunlunXin XPU hardware, if the inference model runs on XPU
+   */
+  XpuOption xpu_option;
 
   /// Collect shape for model while enable_trt is true
   bool collect_trt_shape = false;
@@ -84,8 +115,8 @@ struct PaddleBackendOption {
   }
 
   void SetIpuConfig(bool enable_fp16, int replica_num,
-                   float available_memory_proportion,
-                   bool enable_half_partial) {
+                    float available_memory_proportion,
+                    bool enable_half_partial) {
     ipu_option.ipu_enable_fp16 = enable_fp16;
     ipu_option.ipu_replica_num = replica_num;
     ipu_option.ipu_available_memory_proportion =
@@ -93,12 +124,22 @@ struct PaddleBackendOption {
     ipu_option.ipu_enable_half_partial = enable_half_partial;
   }
 
+  void SetXpuConfig(
+      int quant_post_dynamic_weight_bits = -1,
+      const std::vector<std::string>& quant_post_dynamic_op_types = {}) {
+    xpu_option.kunlunxin_quant_post_dynamic_weight_bits =
+        quant_post_dynamic_weight_bits;
+    xpu_option.kunlunxin_quant_post_dynamic_op_types =
+        quant_post_dynamic_op_types;
+  }
+
   // The belowing parameters may be removed, please do not
   // read or write them directly
   TrtBackendOption trt_option;
   bool enable_pinned_memory = false;
   void* external_stream_ = nullptr;
   Device device = Device::CPU;
+  /// device id for CPU/GPU
   int device_id = 0;
   std::vector<std::string> trt_disabled_ops_{};
   int cpu_thread_num = 8;
diff --git a/fastdeploy/runtime/backends/paddle/paddle_backend.cc b/fastdeploy/runtime/backends/paddle/paddle_backend.cc
index 3ea81cb6b..3db73a00a 100644
--- a/fastdeploy/runtime/backends/paddle/paddle_backend.cc
+++ b/fastdeploy/runtime/backends/paddle/paddle_backend.cc
@@ -78,9 +78,28 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
                          option.ipu_option.ipu_available_memory_proportion,
                          option.ipu_option.ipu_enable_half_partial);
 #else
-    FDWARNING << "The FastDeploy is not compiled with IPU backend, so will "
+    FDWARNING << "FastDeploy is not compiled with the IPU device, so will "
                  "fallback to CPU with Paddle Inference Backend."
               << std::endl;
+#endif
+  } else if (option.device == Device::KUNLUNXIN) {
+#ifdef WITH_KUNLUNXIN
+    config_.EnableXpu(option.xpu_option.kunlunxin_l3_workspace_size,
+                      option.xpu_option.kunlunxin_locked,
+                      option.xpu_option.kunlunxin_autotune,
+                      option.xpu_option.kunlunxin_autotune_file,
+                      option.xpu_option.kunlunxin_precision,
+                      option.xpu_option.kunlunxin_adaptive_seqlen,
+                      option.xpu_option.kunlunxin_enable_multi_stream);
+    config_.SetXpuConfig(
+        option.xpu_option.kunlunxin_quant_post_dynamic_weight_bits,
+        option.xpu_option.kunlunxin_quant_post_dynamic_op_types);
+    config_.SetXpuDeviceId(option.xpu_option.kunlunxin_device_id);
+#else
+    FDWARNING
+        << "FastDeploy is not compiled with the KUNLUNXIN device, so will "
+           "fallback to CPU with Paddle Inference Backend."
+        << std::endl;
 #endif
   } else {
     config_.DisableGpu();
@@ -89,6 +108,7 @@ void PaddleBackend::BuildOption(const PaddleBackendOption& option) {
       config_.SetMkldnnCacheCapacity(option.mkldnn_cache_size);
     }
   }
+
   if (!option.enable_log_info) {
     config_.DisableGlogInfo();
   }
@@ -106,6 +126,7 @@ bool PaddleBackend::Init(const RuntimeOption& runtime_option) {
   }
 
   auto option = runtime_option;
+  // Collect the basic Paddle Inference options and TRT options.
   option.paddle_infer_option.model_file = runtime_option.model_file;
   option.paddle_infer_option.params_file = runtime_option.params_file;
   option.paddle_infer_option.model_from_memory_ = runtime_option.model_from_memory_;
@@ -117,6 +138,10 @@ bool PaddleBackend::Init(const RuntimeOption& runtime_option) {
   option.paddle_infer_option.external_stream_ = runtime_option.external_stream_;
   option.paddle_infer_option.trt_option = runtime_option.trt_option;
   option.paddle_infer_option.trt_option.gpu_id = runtime_option.device_id;
+  // Note(qiuyanjun): For the IPU option and the XPU option, please check the
+  // details of RuntimeOption::UseIpu() and RuntimeOption::UseKunlunXin().
+  // Furthermore, please check paddle_infer_option.SetIpuConfig() and
+  // paddle_infer_option.SetXpuConfig() for the extra configs.
   return InitFromPaddle(option.model_file, option.params_file,
                         option.model_from_memory_, option.paddle_infer_option);
 }
diff --git a/fastdeploy/runtime/backends/paddle/util.cc b/fastdeploy/runtime/backends/paddle/util.cc
index a4e4ed29b..58fa96552 100644
--- a/fastdeploy/runtime/backends/paddle/util.cc
+++ b/fastdeploy/runtime/backends/paddle/util.cc
@@ -19,6 +19,8 @@ namespace fastdeploy {
 paddle_infer::PlaceType ConvertFDDeviceToPlace(Device device) {
   if (device == Device::GPU) {
     return paddle_infer::PlaceType::kGPU;
+  } else if (device == Device::KUNLUNXIN) {
+    return paddle_infer::PlaceType::kXPU;
   }
   return paddle_infer::PlaceType::kCPU;
 }
@@ -52,9 +54,21 @@ void ShareTensorFromFDTensor(paddle_infer::Tensor* tensor,
       tensor->CopyFromCpu(static_cast<const int64_t*>(fd_tensor.Data()));
     }
     return;
+  } else if (fd_tensor.dtype == FDDataType::INT8) {
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const int8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const int8_t*>(fd_tensor.Data()));
+    }
+    return;
   } else if (fd_tensor.dtype == FDDataType::UINT8) {
-    tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
-                              shape, paddle_infer::PlaceType::kCPU);
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const uint8_t*>(fd_tensor.Data()));
+    }
     return;
   }
   FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
@@ -89,9 +103,21 @@ void ShareOutTensorFromFDTensor(paddle_infer::Tensor* tensor,
       tensor->CopyToCpu(static_cast<int64_t*>(fd_tensor.MutableData()));
     }
     return;
+  } else if (fd_tensor.dtype == FDDataType::INT8) {
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const int8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const int8_t*>(fd_tensor.Data()));
+    }
+    return;
   } else if (fd_tensor.dtype == FDDataType::UINT8) {
-    tensor->ShareExternalData(static_cast<uint8_t*>(fd_tensor.MutableData()),
-                              shape, paddle_infer::PlaceType::kCPU);
+    if (place == paddle_infer::PlaceType::kGPU) {
+      tensor->ShareExternalData(static_cast<const uint8_t*>(fd_tensor.Data()),
+                                shape, place);
+    } else {
+      tensor->CopyFromCpu(static_cast<const uint8_t*>(fd_tensor.Data()));
+    }
     return;
   }
   FDASSERT(false, "Unexpected data type(%s) while infer with PaddleBackend.",
@@ -149,6 +175,11 @@ void PaddleTensorToFDTensor(std::unique_ptr<paddle_infer::Tensor>& tensor,
     Device device = Device::CPU;
     if (place == paddle_infer::PlaceType::kGPU) {
       device = Device::GPU;
+    } else if (place == paddle_infer::PlaceType::kXPU) {
+      device = Device::KUNLUNXIN;
+      FDASSERT(false,
+               "Currently, copy_to_fd=false, FDTensor SetExternalData "
+               "is not supported for Device::KUNLUNXIN yet!")
     }
     fd_tensor->name = tensor->name();
     fd_tensor->SetExternalData(shape, fd_dtype, out_data, device);
diff --git a/fastdeploy/runtime/enum_variables.h b/fastdeploy/runtime/enum_variables.h
index b8427303e..0e23f21ee 100644
--- a/fastdeploy/runtime/enum_variables.h
+++ b/fastdeploy/runtime/enum_variables.h
@@ -99,7 +99,7 @@ static std::map<Device, std::vector<Backend>>
     {Device::SUNRISENPU, {Backend::HORIZONNPU}},
     {Device::IPU, {Backend::PDINFER}},
     {Device::TIMVX, {Backend::LITE}},
-    {Device::KUNLUNXIN, {Backend::LITE}},
+    {Device::KUNLUNXIN, {Backend::LITE, Backend::PDINFER}},
     {Device::ASCEND, {Backend::LITE}},
     {Device::SOPHGOTPUD, {Backend::SOPHGOTPU}},
     {Device::DIRECTML, {Backend::ORT}}
diff --git a/fastdeploy/runtime/runtime_option.cc b/fastdeploy/runtime/runtime_option.cc
index 563339237..2af84d482 100644
--- a/fastdeploy/runtime/runtime_option.cc
+++ b/fastdeploy/runtime/runtime_option.cc
@@ -79,14 +79,18 @@ void RuntimeOption::UseTimVX() {
   paddle_lite_option.device = device;
 }
 
-void RuntimeOption::UseKunlunXin(int kunlunxin_id, int l3_workspace_size,
+void RuntimeOption::UseKunlunXin(int kunlunxin_id,
+                                 int l3_workspace_size,
                                  bool locked, bool autotune,
                                  const std::string& autotune_file,
                                  const std::string& precision,
                                  bool adaptive_seqlen,
                                  bool enable_multi_stream,
                                  int64_t gm_default_size) {
+#ifdef WITH_KUNLUNXIN
   device = Device::KUNLUNXIN;
+
+#ifdef ENABLE_LITE_BACKEND
   paddle_lite_option.device = device;
   paddle_lite_option.device_id = kunlunxin_id;
   paddle_lite_option.kunlunxin_l3_workspace_size = l3_workspace_size;
@@ -97,6 +101,42 @@ void RuntimeOption::UseKunlunXin(int kunlunxin_id, int l3_workspace_size,
   paddle_lite_option.kunlunxin_adaptive_seqlen = adaptive_seqlen;
   paddle_lite_option.kunlunxin_enable_multi_stream = enable_multi_stream;
   paddle_lite_option.kunlunxin_gm_default_size = gm_default_size;
+#endif
+#ifdef ENABLE_PADDLE_BACKEND
+  paddle_infer_option.device = device;
+  paddle_infer_option.xpu_option.kunlunxin_device_id = kunlunxin_id;
+  paddle_infer_option.xpu_option.kunlunxin_l3_workspace_size = l3_workspace_size;
+  paddle_infer_option.xpu_option.kunlunxin_locked = locked;
+  paddle_infer_option.xpu_option.kunlunxin_autotune = autotune;
+  paddle_infer_option.xpu_option.kunlunxin_autotune_file = autotune_file;
+  paddle_infer_option.xpu_option.kunlunxin_precision = precision;
+  paddle_infer_option.xpu_option.kunlunxin_adaptive_seqlen = adaptive_seqlen;
+  paddle_infer_option.xpu_option.kunlunxin_enable_multi_stream = enable_multi_stream;
+  // paddle_infer_option.xpu_option.kunlunxin_gm_default_size = gm_default_size;
+  // use paddle_infer_option.SetXpuConfig() for more options.
+#endif
+
+#else
+  FDWARNING << "FastDeploy was not compiled with KUNLUNXIN, will force to use CPU."
+            << std::endl;
+  device = Device::CPU;
+#endif
+}
+
+void RuntimeOption::UseIpu(int device_num, int micro_batch_size,
+                           bool enable_pipelining, int batches_per_step) {
+#ifdef WITH_IPU
+  device = Device::IPU;
+  paddle_infer_option.ipu_option.ipu_device_num = device_num;
+  paddle_infer_option.ipu_option.ipu_micro_batch_size = micro_batch_size;
+  paddle_infer_option.ipu_option.ipu_enable_pipelining = enable_pipelining;
+  paddle_infer_option.ipu_option.ipu_batches_per_step = batches_per_step;
+  // use paddle_infer_option.SetIpuConfig() for more options.
+#else
+  FDWARNING << "FastDeploy was not compiled with IPU, will force to use CPU."
+            << std::endl;
+  device = Device::CPU;
+#endif
 }
 
 void RuntimeOption::UseAscend() {
@@ -484,19 +524,4 @@ void RuntimeOption::DisablePaddleTrtOPs(const std::vector<std::string>& ops) {
   paddle_infer_option.DisableTrtOps(ops);
 }
 
-void RuntimeOption::UseIpu(int device_num, int micro_batch_size,
-                           bool enable_pipelining, int batches_per_step) {
-#ifdef WITH_IPU
-  device = Device::IPU;
-  ipu_device_num = device_num;
-  ipu_micro_batch_size = micro_batch_size;
-  ipu_enable_pipelining = enable_pipelining;
-  ipu_batches_per_step = batches_per_step;
-#else
-  FDWARNING << "The FastDeploy didn't compile with IPU, will force to use CPU."
-            << std::endl;
-  device = Device::CPU;
-#endif
-}
-
 }  // namespace fastdeploy
diff --git a/scripts/linux/build_linux_x86_64_cpp_xpu_with_benchmark.sh b/scripts/linux/build_linux_x86_64_cpp_xpu_with_benchmark.sh
index 414fa33da..5d058093d 100755
--- a/scripts/linux/build_linux_x86_64_cpp_xpu_with_benchmark.sh
+++ b/scripts/linux/build_linux_x86_64_cpp_xpu_with_benchmark.sh
@@ -52,11 +52,11 @@ __build_fastdeploy_linux_x86_64_xpu_shared() {
   local FASDEPLOY_INSTALL_DIR="${ROOT_PATH}/${BUILD_DIR}/install"
   cd "${BUILD_DIR}" && echo "-- [INFO] Working Dir: ${PWD}"
 
-  cmake -DWITH_KUNLUNXIN=ON \
-        -DCMAKE_BUILD_TYPE=Release \
-        -DWITH_GPU=OFF \
+  cmake -DCMAKE_BUILD_TYPE=Release \
+        -DWITH_KUNLUNXIN=ON \
         -DENABLE_ORT_BACKEND=OFF \
-        -DENABLE_PADDLE_BACKEND=OFF \
+        -DENABLE_PADDLE_BACKEND=ON \
+        -DENABLE_LITE_BACKEND=OFF \
         -DENABLE_VISION=ON \
         -DENABLE_BENCHMARK=ON \
         -DBUILD_EXAMPLES=OFF \
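
Usage note: with this patch, a KunlunXin XPU can be driven through Paddle Inference (Backend::PDINFER) by pairing RuntimeOption::UseKunlunXin() with RuntimeOption::UsePaddleInferBackend(), provided FastDeploy was built with -DWITH_KUNLUNXIN=ON -DENABLE_PADDLE_BACKEND=ON -DENABLE_LITE_BACKEND=OFF as in the updated build script. The snippet below is a minimal sketch against the public FastDeploy C++ runtime API; the model paths are hypothetical, and the L3 cache size simply reuses the 62914560-byte (60 MB) value from the new *.l3.txt benchmark configs.

#include "fastdeploy/runtime.h"

int main() {
  fastdeploy::RuntimeOption option;
  // Select KunlunXin XPU device 0 with a 60 MB L3 cache workspace
  // (62914560 bytes, the value used by the *.l3.txt benchmark configs).
  option.UseKunlunXin(0, 62914560);
  // Route execution through Paddle Inference instead of Paddle Lite.
  option.UsePaddleInferBackend();
  // Optional knobs introduced by this patch:
  option.paddle_infer_option.enable_log_info = true;  // keep glog output
  option.paddle_infer_option.SetXpuConfig(-1, {});    // quant-post defaults
  // Hypothetical model paths; any exported Paddle inference model works.
  option.SetModelPath("model.pdmodel", "model.pdiparams");

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) {
    return -1;
  }
  // ... fill std::vector<fastdeploy::FDTensor> inputs and call
  // runtime.Infer(inputs, &outputs) as usual ...
  return 0;
}

The same path is exercised by the new benchmark configs (config.xpu.paddle.fp32*.txt), which set device: xpu and backend: paddle and are consumed by CreateRuntimeOption() in benchmark/cpp/option.h.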