From c38b7d43771606f39c66e3793f36406d7e4cb099 Mon Sep 17 00:00:00 2001 From: yunyaoXYY <109218879+yunyaoXYY@users.noreply.github.com> Date: Fri, 17 Feb 2023 10:53:51 +0800 Subject: [PATCH] [Backend] Support onnxruntime DirectML inference. (#1304) * Fix links in readme * Fix links in readme * Update PPOCRv2/v3 examples * Update auto compression configs * Add neww quantization support for paddleclas model * Update quantized Yolov6s model download link * Improve PPOCR comments * Add English doc for quantization * Fix PPOCR rec model bug * Add new paddleseg quantization support * Add new paddleseg quantization support * Add new paddleseg quantization support * Add new paddleseg quantization support * Add Ascend model list * Add ascend model list * Add ascend model list * Add ascend model list * Add ascend model list * Add ascend model list * Add ascend model list * Support DirectML in onnxruntime * Support onnxruntime DirectML * Support onnxruntime DirectML * Support onnxruntime DirectML * Support OnnxRuntime DirectML * Support OnnxRuntime DirectML * Support OnnxRuntime DirectML * Support OnnxRuntime DirectML * Support OnnxRuntime DirectML * Support OnnxRuntime DirectML * Support OnnxRuntime DirectML * Support OnnxRuntime DirectML * Remove DirectML vision model example * Imporve OnnxRuntime DirectML * Imporve OnnxRuntime DirectML * fix opencv cmake in Windows * recheck codestyle --- CMakeLists.txt | 1 + cmake/check.cmake | 3 - cmake/onnxruntime.cmake | 10 ++- cmake/opencv.cmake | 12 ++- cmake/summary.cmake | 1 + docs/cn/build_and_install/directml.md | 59 ++++++++++++++ docs/en/build_and_install/directml.md | 57 ++++++++++++++ examples/runtime/cpp/infer_paddle_dml.cc | 77 +++++++++++++++++++ .../classification/paddleclas/cpp/README.md | 16 ++-- .../classification/paddleclas/cpp/infer.cc | 9 ++- .../paddleseg/cpu-gpu/cpp/infer.cc | 0 fastdeploy/core/config.h.in | 6 +- fastdeploy/fastdeploy_model.cc | 60 +++++++++++---- fastdeploy/fastdeploy_model.h | 6 +- .../runtime/backends/ort/ort_backend.cc | 65 ++++++++++++++-- fastdeploy/runtime/backends/ort/ort_backend.h | 8 +- fastdeploy/runtime/enum_variables.cc | 3 + fastdeploy/runtime/enum_variables.h | 25 +++--- fastdeploy/runtime/runtime_option.cc | 2 + fastdeploy/runtime/runtime_option.h | 4 + .../vision/classification/ppcls/model.cc | 2 + fastdeploy/vision/segmentation/ppseg/model.cc | 27 ++++--- 22 files changed, 393 insertions(+), 60 deletions(-) create mode 100644 docs/cn/build_and_install/directml.md create mode 100644 docs/en/build_and_install/directml.md create mode 100644 examples/runtime/cpp/infer_paddle_dml.cc mode change 100644 => 100755 examples/vision/segmentation/paddleseg/cpu-gpu/cpp/infer.cc mode change 100755 => 100644 fastdeploy/runtime/runtime_option.h mode change 100644 => 100755 fastdeploy/vision/classification/ppcls/model.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 6cee4ef72..c234a0f06 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,6 +70,7 @@ option(ENABLE_CVCUDA "Whether to enable NVIDIA CV-CUDA to boost image preprocess option(ENABLE_ENCRYPTION "Whether to enable ENCRYPTION." OFF) option(ENABLE_BENCHMARK "Whether to enable Benchmark mode." OFF) option(WITH_ASCEND "Whether to compile for Huawei Ascend deploy." OFF) +option(WITH_DIRECTML "Whether to compile for onnxruntime DirectML deploy." OFF) option(WITH_TIMVX "Whether to compile for TIMVX deploy." OFF) option(WITH_KUNLUNXIN "Whether to compile for KunlunXin XPU deploy." OFF) option(WITH_TESTING "Whether to compile with unittest." 
OFF) diff --git a/cmake/check.cmake b/cmake/check.cmake index 690149a9e..5e0ce1794 100644 --- a/cmake/check.cmake +++ b/cmake/check.cmake @@ -12,9 +12,6 @@ if(WIN32) if(ENABLE_POROS_BACKEND) message(FATAL_ERROR "-DENABLE_POROS_BACKEND=ON doesn't support on non 64-bit system now.") endif() - if(ENABLE_VISION) - message(FATAL_ERROR "-DENABLE_VISION=ON doesn't support on non 64-bit system now.") - endif() endif() endif() diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 48b4675f1..b823f734b 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -44,14 +44,20 @@ set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${ONNXRUNTIME_LIB_DIR}") set(ONNXRUNTIME_VERSION "1.12.0") set(ONNXRUNTIME_URL_PREFIX "https://bj.bcebos.com/paddle2onnx/libs/") -if(WIN32) +if(WIN32) if(WITH_GPU) set(ONNXRUNTIME_FILENAME "onnxruntime-win-x64-gpu-${ONNXRUNTIME_VERSION}.zip") + elseif(WITH_DIRECTML) + set(ONNXRUNTIME_FILENAME "onnxruntime-directml-win-x64.zip") else() set(ONNXRUNTIME_FILENAME "onnxruntime-win-x64-${ONNXRUNTIME_VERSION}.zip") endif() if(NOT CMAKE_CL_64) - set(ONNXRUNTIME_FILENAME "onnxruntime-win-x86-${ONNXRUNTIME_VERSION}.zip") + if(WITH_DIRECTML) + set(ONNXRUNTIME_FILENAME "onnxruntime-directml-win-x86.zip") + else() + set(ONNXRUNTIME_FILENAME "onnxruntime-win-x86-${ONNXRUNTIME_VERSION}.zip") + endif() endif() elseif(APPLE) if(CURRENT_OSX_ARCH MATCHES "arm64") diff --git a/cmake/opencv.cmake b/cmake/opencv.cmake index 9968c129c..e4a63f42b 100755 --- a/cmake/opencv.cmake +++ b/cmake/opencv.cmake @@ -15,7 +15,11 @@ set(COMPRESSED_SUFFIX ".tgz") if(WIN32) - set(OPENCV_FILENAME "opencv-win-x64-3.4.16") + if(NOT CMAKE_CL_64) + set(OPENCV_FILENAME "opencv-win-x86-3.4.16") + else() + set(OPENCV_FILENAME "opencv-win-x64-3.4.16") + endif() set(COMPRESSED_SUFFIX ".zip") elseif(APPLE) if(CURRENT_OSX_ARCH MATCHES "arm64") @@ -51,6 +55,12 @@ endif() set(OPENCV_INSTALL_DIR ${THIRD_PARTY_PATH}/install/) if(ANDROID) set(OPENCV_URL_PREFIX "https://bj.bcebos.com/fastdeploy/third_libs") +elseif(WIN32) + if(NOT CMAKE_CL_64) + set(OPENCV_URL_PREFIX "https://bj.bcebos.com/fastdeploy/third_libs") + else() + set(OPENCV_URL_PREFIX "https://bj.bcebos.com/paddle2onnx/libs") + endif() else() # TODO: use fastdeploy/third_libs instead. set(OPENCV_URL_PREFIX "https://bj.bcebos.com/paddle2onnx/libs") endif() diff --git a/cmake/summary.cmake b/cmake/summary.cmake index 6bda8fdcb..45cc837a8 100755 --- a/cmake/summary.cmake +++ b/cmake/summary.cmake @@ -43,6 +43,7 @@ function(fastdeploy_summary) message(STATUS " WITH_GPU : ${WITH_GPU}") message(STATUS " WITH_TESTING : ${WITH_TESTING}") message(STATUS " WITH_ASCEND : ${WITH_ASCEND}") + message(STATUS " WITH_DIRECTML : ${WITH_DIRECTML}") message(STATUS " WITH_TIMVX : ${WITH_TIMVX}") message(STATUS " WITH_KUNLUNXIN : ${WITH_KUNLUNXIN}") message(STATUS " WITH_CAPI : ${WITH_CAPI}") diff --git a/docs/cn/build_and_install/directml.md b/docs/cn/build_and_install/directml.md new file mode 100644 index 000000000..c1d701a84 --- /dev/null +++ b/docs/cn/build_and_install/directml.md @@ -0,0 +1,59 @@ +[English](../../en/build_and_install/directml.md) | 简体中文 + +# DirectML部署库编译 +Direct Machine Learning (DirectML) 是Windows系统上用于机器学习的一款高性能, 提供硬件加速的 DirectX 12 库. +目前, Fastdeploy的ONNX Runtime后端已集成DirectML,让用户可以在支持DirectX 12的 AMD/Intel/Nvidia/Qualcomm的GPU上部署模型. + +更多详细介绍可见: +- [ONNX Runtime DirectML Execution Provider](https://onnxruntime.ai/docs/execution-providers/DirectML-ExecutionProvider.html) + +# DirectML使用需求 +- 编译需求: Visuald Studio 2017 及其以上工具链. 
+- 操作系统: Windows10, 1903 版本, 及其更新版本. (DirectML为操作系统的组成部分, 无需单独安装)
+- 硬件需求: 支持DirectX 12的显卡, 例如, AMD GCN 第一代及以上版本/ Intel Haswell HD集成显卡及以上版本/ Nvidia Kepler架构及以上版本/ Qualcomm Adreno 600及以上版本.
+
+# 编译DirectML部署库
+DirectML是基于ONNX Runtime后端集成, 所以要使用DirectML, 用户需要打开编译ONNX Runtime的选项. 同时, FastDeploy的DirectML支持x64/x86(Win32)架构的程序构建.
+
+
+x64示例, 在Windows菜单中,找到`x64 Native Tools Command Prompt for VS 2019`打开,执行如下命令
+```bat
+git clone https://github.com/PaddlePaddle/FastDeploy.git
+cd FastDeploy
+mkdir build && cd build
+
+cmake .. -G "Visual Studio 16 2019" -A x64 ^
+         -DWITH_DIRECTML=ON ^
+         -DENABLE_ORT_BACKEND=ON ^
+         -DENABLE_VISION=ON ^
+         -DCMAKE_INSTALL_PREFIX="D:\Paddle\compiled_fastdeploy"
+
+msbuild fastdeploy.sln /m /p:Configuration=Release /p:Platform=x64
+msbuild INSTALL.vcxproj /m /p:Configuration=Release /p:Platform=x64
+```
+编译完成后,即在`CMAKE_INSTALL_PREFIX`指定的目录下生成C++推理库.
+如您使用CMake GUI可参考文档[Windows使用CMakeGUI + Visual Studio 2019 IDE编译](../faq/build_on_win_with_gui.md)
+
+
+x86(Win32)示例, 在Windows菜单中,找到`x86 Native Tools Command Prompt for VS 2019`打开,执行如下命令
+```bat
+git clone https://github.com/PaddlePaddle/FastDeploy.git
+cd FastDeploy
+mkdir build && cd build
+
+cmake .. -G "Visual Studio 16 2019" -A Win32 ^
+         -DWITH_DIRECTML=ON ^
+         -DENABLE_ORT_BACKEND=ON ^
+         -DENABLE_VISION=ON ^
+         -DCMAKE_INSTALL_PREFIX="D:\Paddle\compiled_fastdeploy"
+
+msbuild fastdeploy.sln /m /p:Configuration=Release /p:Platform=Win32
+msbuild INSTALL.vcxproj /m /p:Configuration=Release /p:Platform=Win32
+```
+编译完成后,即在`CMAKE_INSTALL_PREFIX`指定的目录下生成C++推理库.
+如您使用CMake GUI可参考文档[Windows使用CMakeGUI + Visual Studio 2019 IDE编译](../faq/build_on_win_with_gui.md)
+
+# 使用DirectML库
+DirectML编译库的使用方式, 和其他硬件在Windows上使用的方式一样, 参考以下链接.
+- [FastDeploy C++库在Windows上的多种使用方式](../faq/use_sdk_on_windows_build.md)
+- [在 Windows 使用 FastDeploy C++ SDK](../faq/use_sdk_on_windows.md)
diff --git a/docs/en/build_and_install/directml.md b/docs/en/build_and_install/directml.md
new file mode 100644
index 000000000..bc68ac702
--- /dev/null
+++ b/docs/en/build_and_install/directml.md
@@ -0,0 +1,57 @@
+English | [中文](../../cn/build_and_install/directml.md)
+
+# How to Build DirectML Deployment Environment
+Direct Machine Learning (DirectML) is a high-performance, hardware-accelerated DirectX 12 library for machine learning on Windows systems.
+Currently, FastDeploy's ONNX Runtime backend integrates DirectML, allowing users to deploy models on AMD/Intel/Nvidia/Qualcomm GPUs that support DirectX 12.
+
+More details:
+- [ONNX Runtime DirectML Execution Provider](https://onnxruntime.ai/docs/execution-providers/DirectML-ExecutionProvider.html)
+
+# DirectML Requirements
+- Compiler: Visual Studio 2017 toolchain or newer.
+- Operating system: Windows 10, version 1903 or newer. (DirectML ships with the operating system and does not need to be installed separately.)
+- Hardware: a GPU that supports DirectX 12, e.g. AMD GCN 1st generation or later, Intel Haswell HD integrated graphics or later, Nvidia Kepler architecture or later, Qualcomm Adreno 600 or later.
+
+# How to Build and Install DirectML C++ SDK
+DirectML support is integrated into the ONNX Runtime backend, so the ONNX Runtime backend must be enabled when compiling FastDeploy. FastDeploy's DirectML support covers both x64 and x86 (Win32) builds; the build commands for both are given below.
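+
+Once the SDK is built and linked into your application, you can quickly confirm that the bundled onnxruntime actually ships the DirectML execution provider. The following is a minimal, illustrative check (the file name is made up and the snippet is not part of the FastDeploy examples); FastDeploy performs an equivalent check internally and falls back to the CPU execution provider with a warning if `DmlExecutionProvider` is missing.
+```c++
+// check_dml_provider.cc: illustrative sketch, assumes the SDK built below
+// (and its bundled onnxruntime) is on your include and link paths.
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "onnxruntime_cxx_api.h"
+
+int main() {
+  // Ort::GetAvailableProviders() lists every execution provider compiled
+  // into the onnxruntime binary shipped with the SDK.
+  std::vector<std::string> providers = Ort::GetAvailableProviders();
+  bool has_dml = false;
+  for (const std::string& name : providers) {
+    std::cout << name << std::endl;
+    if (name == "DmlExecutionProvider") {
+      has_dml = true;
+    }
+  }
+  std::cout << (has_dml ? "DirectML is available."
+                        : "DirectML is NOT available.")
+            << std::endl;
+  return has_dml ? 0 : 1;
+}
+```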
+
+For an x64 build, open `x64 Native Tools Command Prompt for VS 2019` from the Windows menu and run the following commands
+```bat
+git clone https://github.com/PaddlePaddle/FastDeploy.git
+cd FastDeploy
+mkdir build && cd build
+
+cmake .. -G "Visual Studio 16 2019" -A x64 ^
+         -DWITH_DIRECTML=ON ^
+         -DENABLE_ORT_BACKEND=ON ^
+         -DENABLE_VISION=ON ^
+         -DCMAKE_INSTALL_PREFIX="D:\Paddle\compiled_fastdeploy"
+
+msbuild fastdeploy.sln /m /p:Configuration=Release /p:Platform=x64
+msbuild INSTALL.vcxproj /m /p:Configuration=Release /p:Platform=x64
+```
+Once compiled, the C++ inference library is generated in the directory specified by `CMAKE_INSTALL_PREFIX`.
+If you use the CMake GUI, please refer to [How to Compile with CMakeGUI + Visual Studio 2019 IDE on Windows](../faq/build_on_win_with_gui.md)
+
+
+For an x86 (Win32) build, open `x86 Native Tools Command Prompt for VS 2019` from the Windows menu and run the following commands
+```bat
+git clone https://github.com/PaddlePaddle/FastDeploy.git
+cd FastDeploy
+mkdir build && cd build
+
+cmake .. -G "Visual Studio 16 2019" -A Win32 ^
+         -DWITH_DIRECTML=ON ^
+         -DENABLE_ORT_BACKEND=ON ^
+         -DENABLE_VISION=ON ^
+         -DCMAKE_INSTALL_PREFIX="D:\Paddle\compiled_fastdeploy"
+
+msbuild fastdeploy.sln /m /p:Configuration=Release /p:Platform=Win32
+msbuild INSTALL.vcxproj /m /p:Configuration=Release /p:Platform=Win32
+```
+Once compiled, the C++ inference library is generated in the directory specified by `CMAKE_INSTALL_PREFIX`.
+If you use the CMake GUI, please refer to [How to Compile with CMakeGUI + Visual Studio 2019 IDE on Windows](../faq/build_on_win_with_gui.md)
+
+# How to Use the Compiled DirectML SDK
+The compiled DirectML library is used in the same way as the FastDeploy libraries for other hardware on Windows; see the following link.
+- [Using the FastDeploy C++ SDK on Windows Platform](../faq/use_sdk_on_windows.md)
diff --git a/examples/runtime/cpp/infer_paddle_dml.cc b/examples/runtime/cpp/infer_paddle_dml.cc
new file mode 100644
index 000000000..dfa71a661
--- /dev/null
+++ b/examples/runtime/cpp/infer_paddle_dml.cc
@@ -0,0 +1,77 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "fastdeploy/runtime.h"
+
+namespace fd = fastdeploy;
+
+int main(int argc, char* argv[]) {
+  // create option
+  fd::RuntimeOption runtime_option;
+
+  // model and param files
+  std::string model_file = "mobilenetv2/inference.pdmodel";
+  std::string params_file = "mobilenetv2/inference.pdiparams";
+
+  // read model from disk.
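+  // (Kept for reference: uncomment the two lines below, and drop the
+  //  SetModelBuffer call further down, to load the model directly from
+  //  file paths instead of from in-memory buffers.)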
+ // runtime_option.SetModelPath(model_file, params_file, + // fd::ModelFormat::PADDLE); + + // read model from buffer + std::string model_buffer, params_buffer; + fd::ReadBinaryFromFile(model_file, &model_buffer); + fd::ReadBinaryFromFile(params_file, ¶ms_buffer); + runtime_option.SetModelBuffer(model_buffer, params_buffer, + fd::ModelFormat::PADDLE); + + // setup other option + runtime_option.SetCpuThreadNum(12); + // use ONNX Runtime DirectML + runtime_option.UseOrtBackend(); + runtime_option.UseDirectML(); + + // init runtime + std::unique_ptr runtime = + std::unique_ptr(new fd::Runtime()); + if (!runtime->Init(runtime_option)) { + std::cerr << "--- Init FastDeploy Runitme Failed! " + << "\n--- Model: " << model_file << std::endl; + return -1; + } else { + std::cout << "--- Init FastDeploy Runitme Done! " + << "\n--- Model: " << model_file << std::endl; + } + // init input tensor shape + fd::TensorInfo info = runtime->GetInputInfo(0); + info.shape = {1, 3, 224, 224}; + + std::vector input_tensors(1); + std::vector output_tensors(1); + + std::vector inputs_data; + inputs_data.resize(1 * 3 * 224 * 224); + for (size_t i = 0; i < inputs_data.size(); ++i) { + inputs_data[i] = std::rand() % 1000 / 1000.0f; + } + input_tensors[0].SetExternalData({1, 3, 224, 224}, fd::FDDataType::FP32, + inputs_data.data()); + + // get input name + input_tensors[0].name = info.name; + + runtime->Infer(input_tensors, &output_tensors); + + output_tensors[0].PrintInfo(); + return 0; +} \ No newline at end of file diff --git a/examples/vision/classification/paddleclas/cpp/README.md b/examples/vision/classification/paddleclas/cpp/README.md index 8d85c24fc..f0e6fc7ac 100755 --- a/examples/vision/classification/paddleclas/cpp/README.md +++ b/examples/vision/classification/paddleclas/cpp/README.md @@ -1,7 +1,7 @@ English | [简体中文](README_CN.md) # PaddleClas C++ Deployment Example -This directory provides examples that `infer.cc` fast finishes the deployment of PaddleClas models on CPU/GPU and GPU accelerated by TensorRT. +This directory provides examples that `infer.cc` fast finishes the deployment of PaddleClas models on CPU/GPU and GPU accelerated by TensorRT. Before deployment, two steps require confirmation. @@ -13,13 +13,13 @@ Taking ResNet50_vd inference on Linux as an example, the compilation test can be ```bash mkdir build cd build -# Download FastDeploy precompiled library. Users can choose your appropriate version in the`FastDeploy Precompiled Library` mentioned above +# Download FastDeploy precompiled library. Users can choose your appropriate version in the`FastDeploy Precompiled Library` mentioned above wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz tar xvf fastdeploy-linux-x64-x.x.x.tgz cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x make -j -# Download ResNet50_vd model file and test images +# Download ResNet50_vd model file and test images wget https://bj.bcebos.com/paddlehub/fastdeploy/ResNet50_vd_infer.tgz tar -xvf ResNet50_vd_infer.tgz wget https://gitee.com/paddlepaddle/PaddleClas/raw/release/2.4/deploy/images/ImageNet/ILSVRC2012_val_00000010.jpeg @@ -35,12 +35,14 @@ wget https://gitee.com/paddlepaddle/PaddleClas/raw/release/2.4/deploy/images/Ima ./infer_demo ResNet50_vd_infer ILSVRC2012_val_00000010.jpeg 3 # KunlunXin XPU inference ./infer_demo ResNet50_vd_infer ILSVRC2012_val_00000010.jpeg 4 +# Ascend inference +./infer_demo ResNet50_vd_infer ILSVRC2012_val_00000010.jpeg 5 ``` -The above command works for Linux or MacOS. 
Refer to +The above command works for Linux or MacOS. Refer to - [How to use FastDeploy C++ SDK in Windows](../../../../../docs/cn/faq/use_sdk_on_windows.md) for SDK use-pattern in Windows -## PaddleClas C++ Interface +## PaddleClas C++ Interface ### PaddleClas Class @@ -57,8 +59,8 @@ PaddleClas model loading and initialization, where model_file and params_file ar **Parameter** -> * **model_file**(str): Model file path -> * **params_file**(str): Parameter file path +> * **model_file**(str): Model file path +> * **params_file**(str): Parameter file path > * **config_file**(str): Inference deployment configuration file > * **runtime_option**(RuntimeOption): Backend inference configuration. None by default. (use the default configuration) > * **model_format**(ModelFormat): Model format. Paddle format by default diff --git a/examples/vision/classification/paddleclas/cpp/infer.cc b/examples/vision/classification/paddleclas/cpp/infer.cc index 90c5557c2..b0065fdb9 100755 --- a/examples/vision/classification/paddleclas/cpp/infer.cc +++ b/examples/vision/classification/paddleclas/cpp/infer.cc @@ -96,7 +96,8 @@ void IpuInfer(const std::string& model_dir, const std::string& image_file) { std::cout << res.Str() << std::endl; } -void KunlunXinInfer(const std::string& model_dir, const std::string& image_file) { +void KunlunXinInfer(const std::string& model_dir, + const std::string& image_file) { auto model_file = model_dir + sep + "inference.pdmodel"; auto params_file = model_dir + sep + "inference.pdiparams"; auto config_file = model_dir + sep + "inference_cls.yaml"; @@ -152,7 +153,7 @@ void AscendInfer(const std::string& model_dir, const std::string& image_file) { auto model_file = model_dir + sep + "inference.pdmodel"; auto params_file = model_dir + sep + "inference.pdiparams"; auto config_file = model_dir + sep + "inference_cls.yaml"; - + auto option = fastdeploy::RuntimeOption(); option.UseAscend(); @@ -172,14 +173,14 @@ void AscendInfer(const std::string& model_dir, const std::string& image_file) { std::cout << res.Str() << std::endl; } - int main(int argc, char* argv[]) { if (argc < 4) { std::cout << "Usage: infer_demo path/to/model path/to/image run_option, " "e.g ./infer_demo ./ResNet50_vd ./test.jpeg 0" << std::endl; std::cout << "The data type of run_option is int, 0: run with cpu; 1: run " - "with gpu; 2: run with gpu and use tensorrt backend; 3: run with ipu; 4: run with kunlunxin." + "with gpu; 2: run with gpu and use tensorrt backend; 3: run " + "with ipu; 4: run with kunlunxin." 
<< std::endl; return -1; } diff --git a/examples/vision/segmentation/paddleseg/cpu-gpu/cpp/infer.cc b/examples/vision/segmentation/paddleseg/cpu-gpu/cpp/infer.cc old mode 100644 new mode 100755 diff --git a/fastdeploy/core/config.h.in b/fastdeploy/core/config.h.in index 5593f9fd8..4da8594b8 100755 --- a/fastdeploy/core/config.h.in +++ b/fastdeploy/core/config.h.in @@ -41,6 +41,10 @@ #cmakedefine WITH_GPU #endif +#ifndef WITH_DIRECTML +#cmakedefine WITH_DIRECTML +#endif + #ifndef ENABLE_TRT_BACKEND #cmakedefine ENABLE_TRT_BACKEND #endif @@ -59,4 +63,4 @@ #ifndef ENABLE_BENCHMARK #cmakedefine ENABLE_BENCHMARK -#endif \ No newline at end of file +#endif diff --git a/fastdeploy/fastdeploy_model.cc b/fastdeploy/fastdeploy_model.cc index d909a6138..2eb25e383 100644 --- a/fastdeploy/fastdeploy_model.cc +++ b/fastdeploy/fastdeploy_model.cc @@ -41,20 +41,19 @@ bool CheckBackendSupported(const std::vector& backends, return false; } -bool FastDeployModel::IsSupported(const std::vector& backends, +bool FastDeployModel::IsSupported(const std::vector& backends, Backend backend) { #ifdef ENABLE_BENCHMARK if (runtime_option.benchmark_option.enable_profile) { - FDWARNING << "In benchmark mode, we don't check to see if " - << "the backend [" << backend - << "] is supported for current model!" - << std::endl; + FDWARNING << "In benchmark mode, we don't check to see if " + << "the backend [" << backend + << "] is supported for current model!" << std::endl; return true; } - return CheckBackendSupported(backends, backend); -#else return CheckBackendSupported(backends, backend); -#endif +#else + return CheckBackendSupported(backends, backend); +#endif } bool FastDeployModel::InitRuntimeWithSpecifiedBackend() { @@ -70,6 +69,7 @@ bool FastDeployModel::InitRuntimeWithSpecifiedBackend() { bool use_sophgotpu = (runtime_option.device == Device::SOPHGOTPUD); bool use_timvx = (runtime_option.device == Device::TIMVX); bool use_ascend = (runtime_option.device == Device::ASCEND); + bool use_directml = (runtime_option.device == Device::DIRECTML); bool use_kunlunxin = (runtime_option.device == Device::KUNLUNXIN); if (use_gpu) { @@ -107,6 +107,13 @@ bool FastDeployModel::InitRuntimeWithSpecifiedBackend() { << " is not supported." << std::endl; return false; } + } else if (use_directml) { + if (!IsSupported(valid_directml_backends, runtime_option.backend)) { + FDERROR << "The valid directml backends of model " << ModelName() + << " are " << Str(valid_directml_backends) << ", " + << runtime_option.backend << " is not supported." << std::endl; + return false; + } } else if (use_kunlunxin) { if (!IsSupported(valid_kunlunxin_backends, runtime_option.backend)) { FDERROR << "The valid kunlunxin backends of model " << ModelName() @@ -155,6 +162,8 @@ bool FastDeployModel::InitRuntimeWithSpecifiedDevice() { return CreateTimVXBackend(); } else if (runtime_option.device == Device::ASCEND) { return CreateASCENDBackend(); + } else if (runtime_option.device == Device::DIRECTML) { + return CreateDirectMLBackend(); } else if (runtime_option.device == Device::KUNLUNXIN) { return CreateKunlunXinBackend(); } else if (runtime_option.device == Device::SOPHGOTPUD) { @@ -168,8 +177,9 @@ bool FastDeployModel::InitRuntimeWithSpecifiedDevice() { return false; #endif } - FDERROR << "Only support CPU/GPU/IPU/RKNPU/TIMVX/KunlunXin/ASCEND now." - << std::endl; + FDERROR + << "Only support CPU/GPU/IPU/RKNPU/TIMVX/KunlunXin/ASCEND/DirectML now." 
+ << std::endl; return false; } @@ -350,6 +360,30 @@ bool FastDeployModel::CreateASCENDBackend() { return false; } +bool FastDeployModel::CreateDirectMLBackend() { + if (valid_directml_backends.size() == 0) { + FDERROR << "There's no valid directml backends for model: " << ModelName() + << std::endl; + return false; + } + + for (size_t i = 0; i < valid_directml_backends.size(); ++i) { + if (!IsBackendAvailable(valid_directml_backends[i])) { + continue; + } + runtime_option.backend = valid_directml_backends[i]; + runtime_ = std::unique_ptr(new Runtime()); + if (!runtime_->Init(runtime_option)) { + return false; + } + runtime_initialized_ = true; + return true; + } + FDERROR << "Found no valid directml backend for model: " << ModelName() + << std::endl; + return false; +} + bool FastDeployModel::CreateIpuBackend() { if (valid_ipu_backends.size() == 0) { FDERROR << "There's no valid ipu backends for model: " << ModelName() @@ -384,13 +418,13 @@ bool FastDeployModel::Infer(std::vector& input_tensors, tc.End(); if (time_of_runtime_.size() > 50000) { FDWARNING << "There are already 50000 records of runtime, will force to " - "disable record time of runtime now." + "disable record time of runtime now." << std::endl; enable_record_time_of_runtime_ = false; } time_of_runtime_.push_back(tc.Duration()); } - + return ret; } @@ -434,7 +468,7 @@ std::map FastDeployModel::PrintStatisInfoOfRuntime() { statis_info_of_runtime_dict["warmup_iter"] = warmup_iter; statis_info_of_runtime_dict["avg_time"] = avg_time; statis_info_of_runtime_dict["iterations"] = time_of_runtime_.size(); - + return statis_info_of_runtime_dict; } } // namespace fastdeploy diff --git a/fastdeploy/fastdeploy_model.h b/fastdeploy/fastdeploy_model.h index 037bb2192..bd7320912 100755 --- a/fastdeploy/fastdeploy_model.h +++ b/fastdeploy/fastdeploy_model.h @@ -45,6 +45,9 @@ class FASTDEPLOY_DECL FastDeployModel { /** Model's valid timvx backends. This member defined all the timvx backends have successfully tested for the model */ std::vector valid_timvx_backends = {}; + /** Model's valid directml backends. This member defined all the onnxruntime directml backends have successfully tested for the model + */ + std::vector valid_directml_backends = {}; /** Model's valid ascend backends. This member defined all the cann backends have successfully tested for the model */ std::vector valid_ascend_backends = {}; @@ -117,7 +120,7 @@ class FASTDEPLOY_DECL FastDeployModel { */ virtual double GetProfileTime() { return runtime_->GetProfileTime(); - } + } /** \brief Release reused input/output buffers */ @@ -158,6 +161,7 @@ class FASTDEPLOY_DECL FastDeployModel { bool CreateTimVXBackend(); bool CreateKunlunXinBackend(); bool CreateASCENDBackend(); + bool CreateDirectMLBackend(); bool IsSupported(const std::vector& backends, Backend backend); diff --git a/fastdeploy/runtime/backends/ort/ort_backend.cc b/fastdeploy/runtime/backends/ort/ort_backend.cc index 58c449cc6..ae5e8f3ed 100644 --- a/fastdeploy/runtime/backends/ort/ort_backend.cc +++ b/fastdeploy/runtime/backends/ort/ort_backend.cc @@ -1,3 +1,4 @@ + // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); @@ -13,6 +14,7 @@ // limitations under the License. 
#include "fastdeploy/runtime/backends/ort/ort_backend.h" + #include "fastdeploy/core/float16.h" #include "fastdeploy/runtime/backends/ort/ops/adaptive_pool2d.h" #include "fastdeploy/runtime/backends/ort/ops/multiclass_nms.h" @@ -24,13 +26,12 @@ #include - namespace fastdeploy { std::vector OrtBackend::custom_operators_ = std::vector(); -void OrtBackend::BuildOption(const OrtBackendOption& option) { +bool OrtBackend::BuildOption(const OrtBackendOption& option) { option_ = option; if (option.graph_optimization_level >= 0) { session_options_.SetGraphOptimizationLevel( @@ -45,6 +46,53 @@ void OrtBackend::BuildOption(const OrtBackendOption& option) { if (option.execution_mode >= 0) { session_options_.SetExecutionMode(ExecutionMode(option.execution_mode)); } + +#ifdef WITH_DIRECTML + // If use DirectML + if (option.device == Device::DIRECTML) { + auto all_providers = Ort::GetAvailableProviders(); + bool support_dml = false; + std::string providers_msg = ""; + for (size_t i = 0; i < all_providers.size(); ++i) { + providers_msg = providers_msg + all_providers[i] + ", "; + if (all_providers[i] == "DmlExecutionProvider") { + support_dml = true; + } + } + + if (!support_dml) { + FDWARNING << "Compiled fastdeploy with onnxruntime doesn't " + "support DirectML, the available providers are " + << providers_msg << "will fallback to CPUExecutionProvider." + << "Please check if DirectML is installed successfully." + << std::endl; + option_.device = Device::CPU; + } else { + // Must set as below when use dml. + session_options_.DisableMemPattern(); + session_options_.SetExecutionMode(ExecutionMode(0)); + + // DML session_option + OrtApi const& ortApi = Ort::GetApi(); + const OrtDmlApi* ortDmlApi; + ortApi.GetExecutionProviderApi( + "DML", ORT_API_VERSION, reinterpret_cast(&ortDmlApi)); + OrtStatus* onnx_dml_status = + ortDmlApi->SessionOptionsAppendExecutionProvider_DML(session_options_, + 0); + if (onnx_dml_status != nullptr) { + FDERROR + << "DirectML is not support in your machine, the program will exit." + << std::endl; + ortApi.ReleaseStatus(onnx_dml_status); + return false; + } + } + return true; + } +#endif + + // CUDA if (option.device == Device::GPU) { auto all_providers = Ort::GetAvailableProviders(); bool support_cuda = false; @@ -70,11 +118,14 @@ void OrtBackend::BuildOption(const OrtBackendOption& option) { } session_options_.AppendExecutionProvider_CUDA(cuda_options); } + return true; } + return true; } bool OrtBackend::Init(const RuntimeOption& option) { - if (option.device != Device::CPU && option.device != Device::GPU) { + if (option.device != Device::CPU && option.device != Device::GPU && + option.device != Device::DIRECTML) { FDERROR << "Backend::ORT only supports Device::CPU/Device::GPU, but now its " << option.device << "." << std::endl; @@ -169,7 +220,11 @@ bool OrtBackend::InitFromOnnx(const std::string& model_file, return false; } - BuildOption(option); + if (!BuildOption(option)) { + FDERROR << "Create Ort option fail." 
<< std::endl; + return false; + } + InitCustomOperators(); session_ = {env_, model_file.data(), model_file.size(), session_options_}; binding_ = std::make_shared(session_); @@ -355,4 +410,4 @@ void OrtBackend::InitCustomOperators() { #endif } -} // namespace fastdeploy +} // namespace fastdeploy \ No newline at end of file diff --git a/fastdeploy/runtime/backends/ort/ort_backend.h b/fastdeploy/runtime/backends/ort/ort_backend.h index e0caf48a3..543b125e9 100644 --- a/fastdeploy/runtime/backends/ort/ort_backend.h +++ b/fastdeploy/runtime/backends/ort/ort_backend.h @@ -24,6 +24,10 @@ #include "fastdeploy/runtime/backends/ort/option.h" #include "onnxruntime_cxx_api.h" // NOLINT +#ifdef WITH_DIRECTML +#include "dml_provider_factory.h" // NOLINT +#endif + namespace fastdeploy { struct OrtValueInfo { @@ -37,7 +41,7 @@ class OrtBackend : public BaseBackend { OrtBackend() {} virtual ~OrtBackend() = default; - void BuildOption(const OrtBackendOption& option); + bool BuildOption(const OrtBackendOption& option); bool Init(const RuntimeOption& option); @@ -54,7 +58,7 @@ class OrtBackend : public BaseBackend { std::vector GetOutputInfos() override; static std::vector custom_operators_; void InitCustomOperators(); - + private: bool InitFromPaddle(const std::string& model_buffer, const std::string& params_buffer, diff --git a/fastdeploy/runtime/enum_variables.cc b/fastdeploy/runtime/enum_variables.cc index 22afeb9cd..c57636057 100644 --- a/fastdeploy/runtime/enum_variables.cc +++ b/fastdeploy/runtime/enum_variables.cc @@ -61,6 +61,9 @@ std::ostream& operator<<(std::ostream& out, const Device& d) { case Device::ASCEND: out << "Device::ASCEND"; break; + case Device::DIRECTML: + out << "Device::DIRECTML"; + break; default: out << "Device::UNKOWN"; } diff --git a/fastdeploy/runtime/enum_variables.h b/fastdeploy/runtime/enum_variables.h index 7a96e60b4..c608504c5 100644 --- a/fastdeploy/runtime/enum_variables.h +++ b/fastdeploy/runtime/enum_variables.h @@ -29,7 +29,8 @@ namespace fastdeploy { /*! Inference backend supported in FastDeploy */ enum Backend { UNKNOWN, ///< Unknown inference backend - ORT, ///< ONNX Runtime, support Paddle/ONNX format model, CPU / Nvidia GPU + ORT, //< ONNX Runtime, support Paddle/ONNX format model, + //< CPU/ Nvidia GPU DirectML TRT, ///< TensorRT, support Paddle/ONNX format model, Nvidia GPU only PDINFER, ///< Paddle Inference, support Paddle format model, CPU / Nvidia GPU POROS, ///< Poros, support TorchScript format model, CPU / Nvidia GPU @@ -58,7 +59,8 @@ enum FASTDEPLOY_DECL Device { TIMVX, KUNLUNXIN, ASCEND, - SOPHGOTPUD + SOPHGOTPUD, + DIRECTML }; /*! Deep learning model format */ @@ -93,13 +95,15 @@ static std::map> {Device::TIMVX, {Backend::LITE}}, {Device::KUNLUNXIN, {Backend::LITE}}, {Device::ASCEND, {Backend::LITE}}, - {Device::SOPHGOTPUD, {Backend::SOPHGOTPU}} + {Device::SOPHGOTPUD, {Backend::SOPHGOTPU}}, + {Device::DIRECTML, {Backend::ORT}} }; inline bool Supported(ModelFormat format, Backend backend) { auto iter = s_default_backends_by_format.find(format); if (iter == s_default_backends_by_format.end()) { - FDERROR << "Didn't find format is registered in s_default_backends_by_format." << std::endl; + FDERROR << "Didn't find format is registered in " << + "s_default_backends_by_format." 
<< std::endl; return false; } for (size_t i = 0; i < iter->second.size(); ++i) { @@ -107,15 +111,17 @@ inline bool Supported(ModelFormat format, Backend backend) { return true; } } - std::string msg = Str(iter->second); - FDERROR << backend << " only supports " << msg << ", but now it's " << format << "." << std::endl; + std::string msg = Str(iter->second); + FDERROR << backend << " only supports " << msg << ", but now it's " + << format << "." << std::endl; return false; } inline bool Supported(Device device, Backend backend) { auto iter = s_default_backends_by_device.find(device); if (iter == s_default_backends_by_device.end()) { - FDERROR << "Didn't find device is registered in s_default_backends_by_device." << std::endl; + FDERROR << "Didn't find device is registered in " << + "s_default_backends_by_device." << std::endl; return false; } for (size_t i = 0; i < iter->second.size(); ++i) { @@ -123,8 +129,9 @@ inline bool Supported(Device device, Backend backend) { return true; } } - std::string msg = Str(iter->second); - FDERROR << backend << " only supports " << msg << ", but now it's " << device << "." << std::endl; + std::string msg = Str(iter->second); + FDERROR << backend << " only supports " << msg << ", but now it's " + << device << "." << std::endl; return false; } diff --git a/fastdeploy/runtime/runtime_option.cc b/fastdeploy/runtime/runtime_option.cc index ff687e4f6..8d18637a8 100644 --- a/fastdeploy/runtime/runtime_option.cc +++ b/fastdeploy/runtime/runtime_option.cc @@ -93,6 +93,8 @@ void RuntimeOption::UseAscend() { paddle_lite_option.device = device; } +void RuntimeOption::UseDirectML() { device = Device::DIRECTML; } + void RuntimeOption::UseSophgo() { device = Device::SOPHGOTPUD; UseSophgoBackend(); diff --git a/fastdeploy/runtime/runtime_option.h b/fastdeploy/runtime/runtime_option.h old mode 100755 new mode 100644 index 9bb3b4539..4b7af8a99 --- a/fastdeploy/runtime/runtime_option.h +++ b/fastdeploy/runtime/runtime_option.h @@ -78,6 +78,10 @@ struct FASTDEPLOY_DECL RuntimeOption { void UseTimVX(); /// Use Huawei Ascend to inference void UseAscend(); + + /// Use onnxruntime DirectML to inference + void UseDirectML(); + /// Use Sophgo to inference void UseSophgo(); /// \brief Turn on KunlunXin XPU. diff --git a/fastdeploy/vision/classification/ppcls/model.cc b/fastdeploy/vision/classification/ppcls/model.cc old mode 100644 new mode 100755 index d52eeace9..df4d89eb7 --- a/fastdeploy/vision/classification/ppcls/model.cc +++ b/fastdeploy/vision/classification/ppcls/model.cc @@ -34,12 +34,14 @@ PaddleClasModel::PaddleClasModel(const std::string& model_file, valid_ascend_backends = {Backend::LITE}; valid_kunlunxin_backends = {Backend::LITE}; valid_ipu_backends = {Backend::PDINFER}; + valid_directml_backends = {Backend::ORT}; } else if (model_format == ModelFormat::SOPHGO) { valid_sophgonpu_backends = {Backend::SOPHGOTPU}; } else { valid_cpu_backends = {Backend::ORT, Backend::OPENVINO}; valid_gpu_backends = {Backend::ORT, Backend::TRT}; valid_rknpu_backends = {Backend::RKNPU2}; + valid_directml_backends = {Backend::ORT}; } runtime_option = custom_option; diff --git a/fastdeploy/vision/segmentation/ppseg/model.cc b/fastdeploy/vision/segmentation/ppseg/model.cc index 54f978828..2f5d45c5f 100755 --- a/fastdeploy/vision/segmentation/ppseg/model.cc +++ b/fastdeploy/vision/segmentation/ppseg/model.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "fastdeploy/vision/segmentation/ppseg/model.h" + #include "fastdeploy/utils/unique_ptr.h" namespace fastdeploy { @@ -20,22 +21,23 @@ namespace vision { namespace segmentation { PaddleSegModel::PaddleSegModel(const std::string& model_file, - const std::string& params_file, - const std::string& config_file, - const RuntimeOption& custom_option, - const ModelFormat& model_format) : preprocessor_(config_file), - postprocessor_(config_file) { - if(model_format == ModelFormat::SOPHGO) { + const std::string& params_file, + const std::string& config_file, + const RuntimeOption& custom_option, + const ModelFormat& model_format) + : preprocessor_(config_file), postprocessor_(config_file) { + if (model_format == ModelFormat::SOPHGO) { valid_sophgonpu_backends = {Backend::SOPHGOTPU}; - } - else{ - valid_cpu_backends = {Backend::OPENVINO, Backend::PDINFER, Backend::ORT, Backend::LITE}; + } else { + valid_cpu_backends = {Backend::OPENVINO, Backend::PDINFER, Backend::ORT, + Backend::LITE}; valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT}; } valid_rknpu_backends = {Backend::RKNPU2}; valid_timvx_backends = {Backend::LITE}; valid_kunlunxin_backends = {Backend::LITE}; valid_ascend_backends = {Backend::LITE}; + valid_directml_backends = {Backend::ORT}; runtime_option = custom_option; runtime_option.model_format = model_format; @@ -44,8 +46,9 @@ PaddleSegModel::PaddleSegModel(const std::string& model_file, initialized = Initialize(); } -std::unique_ptr PaddleSegModel::Clone() const { - std::unique_ptr clone_model = fastdeploy::utils::make_unique(PaddleSegModel(*this)); +std::unique_ptr PaddleSegModel::Clone() const { + std::unique_ptr clone_model = + fastdeploy::utils::make_unique(PaddleSegModel(*this)); clone_model->SetRuntime(clone_model->CloneRuntime()); return clone_model; } @@ -59,7 +62,7 @@ bool PaddleSegModel::Initialize() { } bool PaddleSegModel::Predict(cv::Mat* im, SegmentationResult* result) { - return Predict(*im, result); + return Predict(*im, result); } bool PaddleSegModel::Predict(const cv::Mat& im, SegmentationResult* result) {