Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 16:48:03 +08:00)
[Backend] Add KunlunXin XPU deploy support (#747)
* add xpu support
* fix docs
* update code
* update doc
* update code
* update yolov5
* update cmake
* add int64_t data support
* fix
* update download links
* add en doc
* update code
* update xpu options
* update doc
* update doc
* update doc
* update lib links
* update doc
* update code
* update lite xpu link
* update xpu lib
* update doc
* update en doc
@@ -40,7 +40,7 @@ if(NOT MSVC)
|
||||
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=1)
|
||||
endif(NOT MSVC)
|
||||
|
||||
if(UNIX AND (NOT APPLE) AND (NOT ANDROID) AND (NOT ENABLE_TIMVX))
|
||||
if(UNIX AND (NOT APPLE) AND (NOT ANDROID) AND (NOT WITH_TIMVX))
|
||||
include(${PROJECT_SOURCE_DIR}/cmake/patchelf.cmake)
|
||||
endif()
|
||||
|
||||
@@ -64,7 +64,8 @@ option(ENABLE_LITE_BACKEND "Whether to enable paddle lite backend." OFF)
|
||||
option(ENABLE_VISION "Whether to enable vision models usage." OFF)
|
||||
option(ENABLE_TEXT "Whether to enable text models usage." OFF)
|
||||
option(ENABLE_FLYCV "Whether to enable flycv to boost image preprocess." OFF)
|
||||
option(ENABLE_TIMVX "Whether to compile for TIMVX deploy." OFF)
|
||||
option(WITH_TIMVX "Whether to compile for TIMVX deploy." OFF)
|
||||
option(WITH_XPU "Whether to compile for KunlunXin XPU deploy." OFF)
|
||||
option(WITH_TESTING "Whether to compile with unittest." OFF)
|
||||
############################# Options for Android cross compiling #########################
|
||||
option(WITH_OPENCV_STATIC "Use OpenCV static lib for Android." OFF)
|
||||
@@ -138,10 +139,23 @@ set(HEAD_DIR "${PROJECT_SOURCE_DIR}/${CSRCS_DIR_NAME}")
|
||||
include_directories(${HEAD_DIR})
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
if (ENABLE_TIMVX)
|
||||
if (WITH_TIMVX)
|
||||
include(${PROJECT_SOURCE_DIR}/cmake/timvx.cmake)
|
||||
endif()
|
||||
|
||||
if (WITH_XPU)
|
||||
if(NOT ENABLE_LITE_BACKEND)
|
||||
message(WARNING "While compiling with -DWITH_XPU=ON, will force to set -DENABLE_LITE_BACKEND=ON")
|
||||
set(ENABLE_LITE_BACKEND ON)
|
||||
endif()
|
||||
if(NOT CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
|
||||
message(FATAL_ERROR "XPU is only supported on Linux x64 platform")
|
||||
endif()
|
||||
if(NOT PADDLELITE_URL)
|
||||
set(PADDLELITE_URL "https://bj.bcebos.com/fastdeploy/third_libs/lite-linux-x64-xpu-20221215.tgz")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
if(ANDROID OR IOS)
|
||||
if(ENABLE_ORT_BACKEND)
|
||||
|
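In practice, the new `WITH_XPU` block above means a minimal XPU configuration needs only that one switch; a sketch (build directory and any extra backends are up to the user):

```bash
cmake -DWITH_XPU=ON ..   # ENABLE_LITE_BACKEND is forced ON; a Paddle-Lite XPU package is fetched via PADDLELITE_URL unless one is supplied
```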
@@ -27,6 +27,7 @@ set(OPENCV_DIRECTORY "@OPENCV_DIRECTORY@")
|
||||
set(ORT_DIRECTORY "@ORT_DIRECTORY@")
|
||||
set(OPENVINO_DIRECTORY "@OPENVINO_DIRECTORY@")
|
||||
set(RKNN2_TARGET_SOC "@RKNN2_TARGET_SOC@")
|
||||
set(WITH_XPU @WITH_XPU@)
|
||||
|
||||
set(FASTDEPLOY_LIBS "")
|
||||
set(FASTDEPLOY_INCS "")
|
||||
@@ -237,6 +238,10 @@ if(ENABLE_PADDLE_FRONTEND)
|
||||
list(APPEND FASTDEPLOY_LIBS ${PADDLE2ONNX_LIB})
|
||||
endif()
|
||||
|
||||
if(WITH_XPU)
|
||||
list(APPEND FASTDEPLOY_LIBS -lpthread -lrt -ldl)
|
||||
endif()
|
||||
|
||||
remove_duplicate_libraries(FASTDEPLOY_LIBS)
|
||||
|
||||
# Print compiler information
|
||||
|
@@ -37,6 +37,8 @@ function(fastdeploy_summary)
|
||||
message(STATUS " ENABLE_POROS_BACKEND : ${ENABLE_POROS_BACKEND}")
|
||||
message(STATUS " ENABLE_TRT_BACKEND : ${ENABLE_TRT_BACKEND}")
|
||||
message(STATUS " ENABLE_OPENVINO_BACKEND : ${ENABLE_OPENVINO_BACKEND}")
|
||||
message(STATUS " WITH_TIMVX : ${WITH_TIMVX}")
|
||||
message(STATUS " WITH_XPU : ${WITH_XPU}")
|
||||
if(ENABLE_ORT_BACKEND)
|
||||
message(STATUS " ONNXRuntime version : ${ONNXRUNTIME_VERSION}")
|
||||
endif()
|
||||
|
1
docs/README_CN.md
Normal file → Executable file
@@ -8,6 +8,7 @@
|
||||
- [GPU部署环境编译安装](cn/build_and_install/gpu.md)
|
||||
- [CPU部署环境编译安装](cn/build_and_install/cpu.md)
|
||||
- [IPU部署环境编译安装](cn/build_and_install/ipu.md)
|
||||
- [昆仑芯XPU部署环境编译安装](cn/build_and_install/xpu.md)
|
||||
- [Jetson部署环境编译安装](cn/build_and_install/jetson.md)
|
||||
- [Android平台部署环境编译安装](cn/build_and_install/android.md)
|
||||
- [服务化部署镜像编译安装](../serving/docs/zh_CN/compile.md)
|
||||
|
5
docs/README_EN.md
Normal file → Executable file
@@ -8,6 +8,7 @@
|
||||
- [Build and Install FastDeploy Library on GPU Platform](en/build_and_install/gpu.md)
|
||||
- [Build and Install FastDeploy Library on CPU Platform](en/build_and_install/cpu.md)
|
||||
- [Build and Install FastDeploy Library on IPU Platform](en/build_and_install/ipu.md)
|
||||
- [Build and Install FastDeploy Library on KunlunXin XPU Platform](en/build_and_install/xpu.md)
|
||||
- [Build and Install FastDeploy Library on Nvidia Jetson Platform](en/build_and_install/jetson.md)
|
||||
- [Build and Install FastDeploy Library on Android Platform](en/build_and_install/android.md)
|
||||
- [Build and Install FastDeploy Serving Deployment Image](../serving/docs/EN/compile-en.md)
|
||||
@@ -19,10 +20,10 @@
|
||||
- [A Quick Start on Runtime Python](en/quick_start/runtime/python.md)
|
||||
- [A Quick Start on Runtime C++](en/quick_start/runtime/cpp.md)
|
||||
|
||||
## API
|
||||
## API
|
||||
|
||||
- [Python API](https://baidu-paddle.github.io/fastdeploy-api/python/html/)
|
||||
- [C++ API](https://baidu-paddle.github.io/fastdeploy-api/cpp/html/)
|
||||
- [C++ API](https://baidu-paddle.github.io/fastdeploy-api/cpp/html/)
|
||||
- [Android Java API](../java/android)
|
||||
|
||||
## Performance Optimization
|
||||
|
@@ -13,6 +13,7 @@
|
||||
- [Android平台部署环境](android.md)
|
||||
- [瑞芯微RV1126部署环境](rv1126.md)
|
||||
- [晶晨A311D部署环境](a311d.md)
|
||||
- [昆仑芯XPU部署环境](xpu.md)
|
||||
|
||||
|
||||
## FastDeploy编译选项说明
|
||||
@@ -20,10 +21,11 @@
|
||||
| 选项 | 说明 |
|
||||
|:------------------------|:--------------------------------------------------------------------------|
|
||||
| ENABLE_ORT_BACKEND | 默认OFF, 是否编译集成ONNX Runtime后端(CPU/GPU上推荐打开) |
|
||||
| ENABLE_PADDLE_BACKEND | 默认OFF,是否编译集成Paddle Inference后端(CPU/GPU上推荐打开) |
|
||||
| ENABLE_LITE_BACKEND | 默认OFF,是否编译集成Paddle Lite后端(编译Android库时需要设置为ON) |
|
||||
| ENABLE_PADDLE_BACKEND | 默认OFF,是否编译集成Paddle Inference后端(CPU/GPU上推荐打开) |
|
||||
| ENABLE_LITE_BACKEND | 默认OFF,是否编译集成Paddle Lite后端(编译Android库时需要设置为ON) |
|
||||
| ENABLE_RKNPU2_BACKEND | 默认OFF,是否编译集成RKNPU2后端(RK3588/RK3568/RK3566上推荐打开) |
|
||||
| WITH_TIMVX | 默认OFF,需要在RV1126/RV1109/A311D上部署时,需设置为ON |
|
||||
| WITH_XPU | 默认OFF,当在昆仑芯XPU上部署时,需设置为ON |
|
||||
| WITH_TIMVX | 默认OFF,需要在RV1126/RV1109/A311D上部署时,需设置为ON |
|
||||
| ENABLE_TRT_BACKEND | 默认OFF,是否编译集成TensorRT后端(GPU上推荐打开) |
|
||||
| ENABLE_OPENVINO_BACKEND | 默认OFF,是否编译集成OpenVINO后端(CPU上推荐打开) |
|
||||
| ENABLE_VISION | 默认OFF,是否编译集成视觉模型的部署模块 |
|
||||
|
@@ -9,7 +9,8 @@ FastDeploy 基于 Paddle-Lite 后端支持在晶晨 NPU 上进行部署推理。
|
||||
|编译选项|默认值|说明|备注|
|
||||
|:---|:---|:---|:---|
|
||||
|ENABLE_LITE_BACKEND|OFF|编译A311D部署库时需要设置为ON| - |
|
||||
|WITH_TIMVX|OFF|编译A311D部署库时需要设置为ON| - |
|
||||
|WITH_TIMVX|OFF|编译A311D部署库时需要设置为ON| - |
|
||||
|TARGET_ABI|NONE|编译RK库时需要设置为arm64| - |
|
||||
|
||||
更多编译选项请参考[FastDeploy编译选项说明](./README.md)
|
||||
|
||||
|
@@ -8,8 +8,9 @@ FastDeploy基于 Paddle-Lite 后端支持在瑞芯微(Rockchip)Soc 上进行
|
||||
相关编译选项说明如下:
|
||||
|编译选项|默认值|说明|备注|
|
||||
|:---|:---|:---|:---|
|
||||
|ENABLE_LITE_BACKEND|OFF|编译RK库时需要设置为ON| - |
|
||||
|WITH_TIMVX|OFF|编译RK库时需要设置为ON| - |
|
||||
|ENABLE_LITE_BACKEND|OFF|编译RK库时需要设置为ON| - |
|
||||
|WITH_TIMVX|OFF|编译RK库时需要设置为ON| - |
|
||||
|TARGET_ABI|NONE|编译RK库时需要设置为armhf| - |
|
||||
|
||||
更多编译选项请参考[FastDeploy编译选项说明](./README.md)
|
||||
|
||||
@@ -86,7 +87,7 @@ dmesg | grep Galcore
|
||||
wget https://paddlelite-demo.bj.bcebos.com/devices/generic/PaddleLite-generic-demo.tar.gz
|
||||
tar -xf PaddleLite-generic-demo.tar.gz
|
||||
```
|
||||
2. 使用 `uname -a` 查看 `Linux Kernel` 版本,确定为 `Linux` 系统 4.19.111 版本,
|
||||
2. 使用 `uname -a` 查看 `Linux Kernel` 版本,确定为 `Linux` 系统 4.19.111 版本
|
||||
3. 将 `PaddleLite-generic-demo/libs/PaddleLite/linux/armhf/lib/verisilicon_timvx/viv_sdk_6_4_6_5/lib/1126/4.19.111/` 路径下的 `galcore.ko` 上传至开发板。
|
||||
|
||||
4. 登录开发板,命令行输入 `sudo rmmod galcore` 来卸载原始驱动,输入 `sudo insmod galcore.ko` 来加载传上设备的驱动。(是否需要 sudo 根据开发板实际情况,部分 adb 链接的设备请提前 adb root)。此步骤如果操作失败,请跳转至方法 2。
|
||||
|
75
docs/cn/build_and_install/xpu.md
Executable file
@@ -0,0 +1,75 @@
|
||||
# 昆仑芯 XPU 部署环境编译安装
|
||||
|
||||
FastDeploy 基于 Paddle-Lite 后端支持在昆仑芯 XPU 上进行部署推理。
|
||||
更多详细的信息请参考:[PaddleLite部署示例](https://www.paddlepaddle.org.cn/lite/develop/demo_guides/kunlunxin_xpu.html#xpu)。
|
||||
|
||||
本文档介绍如何编译基于 PaddleLite 的 C++ FastDeploy 编译库。
|
||||
|
||||
相关编译选项说明如下:
|
||||
|编译选项|默认值|说明|备注|
|
||||
|:---|:---|:---|:---|
|
||||
| WITH_XPU| OFF | 需要在XPU上部署时需要设置为ON | - |
|
||||
| ENABLE_ORT_BACKEND | OFF | 是否编译集成ONNX Runtime后端 | - |
|
||||
| ENABLE_PADDLE_BACKEND | OFF | 是否编译集成Paddle Inference后端 | - |
|
||||
| ENABLE_OPENVINO_BACKEND | OFF | 是否编译集成OpenVINO后端 | - |
|
||||
| ENABLE_VISION | OFF | 是否编译集成视觉模型的部署模块 | - |
|
||||
| ENABLE_TEXT | OFF | 是否编译集成文本NLP模型的部署模块 | - |
|
||||
|
||||
第三方库依赖指定(不设定如下参数,会自动下载预编译库)
|
||||
| 选项 | 说明 |
|
||||
| :---------------------- | :--------------------------------------------------------------------------------------------- |
|
||||
| ORT_DIRECTORY | 当开启ONNX Runtime后端时,用于指定用户本地的ONNX Runtime库路径;如果不指定,编译过程会自动下载ONNX Runtime库 |
|
||||
| OPENCV_DIRECTORY | 当ENABLE_VISION=ON时,用于指定用户本地的OpenCV库路径;如果不指定,编译过程会自动下载OpenCV库 |
|
||||
| OPENVINO_DIRECTORY | 当开启OpenVINO后端时, 用于指定用户本地的OpenVINO库路径;如果不指定,编译过程会自动下载OpenVINO库 |
|
||||
更多编译选项请参考[FastDeploy编译选项说明](./README.md)
|
||||
|
||||
## 基于 PaddleLite 的 C++ FastDeploy 库编译
|
||||
- OS: Linux
|
||||
- gcc/g++: version >= 8.2
|
||||
- cmake: version >= 3.15
|
||||
推荐开发者自行安装 OpenCV,并在编译时通过`-DOPENCV_DIRECTORY`指定环境中的 OpenCV 路径(如若不指定 -DOPENCV_DIRECTORY,会自动下载 FastDeploy 提供的预编译 OpenCV,但在**Linux平台**无法支持 Video 的读取,以及 imshow 等可视化界面功能)
|
||||
```
|
||||
sudo apt-get install libopencv-dev
|
||||
```
|
||||
编译命令如下:
|
||||
```bash
|
||||
# Download the latest source code
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
mkdir build && cd build
|
||||
|
||||
# CMake configuration with KunlunXin xpu toolchain
|
||||
cmake -DWITH_XPU=ON \
|
||||
-DWITH_GPU=OFF \ # 不编译 GPU
|
||||
-DENABLE_ORT_BACKEND=ON \ # 可选择开启 ORT 后端
|
||||
-DENABLE_PADDLE_BACKEND=ON \ # 可选择开启 Paddle 后端
|
||||
-DCMAKE_INSTALL_PREFIX=fastdeploy-xpu \
|
||||
-DENABLE_VISION=ON \ # 是否编译集成视觉模型的部署模块,可选择开启
|
||||
-DOPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4 \
|
||||
..
|
||||
|
||||
# Build FastDeploy KunlunXin XPU C++ SDK
|
||||
make -j8
|
||||
make install
|
||||
```
|
||||
编译完成之后,会生成 fastdeploy-xpu 目录,表示基于 PaddleLite 的 FastDeploy 库编译完成。
|
||||
|
||||
## Python 编译
|
||||
编译命令如下:
|
||||
```bash
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy/python
|
||||
export WITH_XPU=ON
|
||||
export WITH_GPU=OFF
|
||||
export ENABLE_ORT_BACKEND=ON
|
||||
export ENABLE_PADDLE_BACKEND=ON
|
||||
export ENABLE_VISION=ON
|
||||
# OPENCV_DIRECTORY可选,不指定会自动下载FastDeploy提供的预编译OpenCV库
|
||||
export OPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4
|
||||
|
||||
python setup.py build
|
||||
python setup.py bdist_wheel
|
||||
```
|
||||
编译完成即会在 `FastDeploy/python/dist` 目录下生成编译后的 `wheel` 包,直接 pip install 即可
|
||||
|
||||
编译过程中,如若修改编译参数,为避免带来缓存影响,可删除 `FastDeploy/python` 目录下的 `build` 和 `.setuptools-cmake-build` 两个子目录后再重新编译
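A minimal install sketch for the wheel produced above (the exact file name encodes the FastDeploy and Python versions):

```bash
cd FastDeploy/python
pip install dist/*.whl
```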
|
5
docs/en/build_and_install/README.md
Normal file → Executable file
@@ -12,6 +12,9 @@ English | [中文](../../cn/build_and_install/README.md)
|
||||
- [Build and Install on IPU Platform](ipu.md)
|
||||
- [Build and Install on Nvidia Jetson Platform](jetson.md)
|
||||
- [Build and Install on Android Platform](android.md)
|
||||
- [Build and Install on RV1126 Platform](rv1126.md)
|
||||
- [Build and Install on A311D Platform](a311d.md)
|
||||
- [Build and Install on KunlunXin XPU Platform](xpu.md)
|
||||
|
||||
|
||||
## Build options
|
||||
@@ -25,6 +28,8 @@ English | [中文](../../cn/build_and_install/README.md)
|
||||
| ENABLE_VISION | Default OFF,whether to enable vision models deployment module |
|
||||
| ENABLE_TEXT | Default OFF,whether to enable text models deployment module |
|
||||
| WITH_GPU | Default OFF, if build on GPU, this need to be ON |
|
||||
| WITH_XPU | Default OFF, this needs to be ON when deploying on KunlunXin XPU |
|
||||
| WITH_TIMVX | Default OFF, this needs to be ON when deploying on RV1126/RV1109/A311D |
|
||||
| CUDA_DIRECTORY | Default /usr/local/cuda, if build on GPU, this defines the path of CUDA(>=11.2) |
|
||||
| TRT_DIRECTORY | If build with ENABLE_TRT_BACKEND=ON, this defines the path of TensorRT(>=8.4) |
|
||||
| ORT_DIRECTORY | [Optional] If build with ENABLE_ORT_BACKEND=ON, this flag defines the path of ONNX Runtime, but if this flag is not set, it will download ONNX Runtime library automatically |
|
||||
|
105
docs/en/build_and_install/a311d.md
Executable file
@@ -0,0 +1,105 @@
|
||||
# How to Build A311D Deployment Environment
|
||||
|
||||
FastDeploy supports AI deployment on the Amlogic A311D SoC based on the Paddle-Lite backend. For more detailed information, please refer to: [PaddleLite Deployment Example](https://www.paddlepaddle.org.cn/lite/develop/demo_guides/verisilicon_timvx.html).
|
||||
|
||||
This document describes how to compile the PaddleLite-based C++ FastDeploy cross-compilation library.
|
||||
|
||||
The relevant compilation options are described as follows:
|
||||
|Compile Options|Default Values|Description|Remarks|
|
||||
|:---|:---|:---|:---|
|
||||
|ENABLE_LITE_BACKEND|OFF|It needs to be set to ON when compiling the A311D library| - |
|
||||
|WITH_TIMVX|OFF|It needs to be set to ON when compiling the A311D library| - |
|
||||
|TARGET_ABI|NONE|It needs to be set to arm64 when compiling the A311D library| - |
|
||||
|
||||
For more compilation options, please refer to [Description of FastDeploy compilation options](./README.md)
|
||||
|
||||
## Cross-compilation environment construction
|
||||
|
||||
### Host Environment Requirements
|
||||
- OS: Ubuntu == 16.04
|
||||
- cmake: version >= 3.10.0
|
||||
|
||||
### Building the compilation environment
|
||||
You can enter the FastDeploy/tools/timvx directory and use the following command to install:
|
||||
```bash
|
||||
cd FastDeploy/tools/timvx
|
||||
bash install.sh
|
||||
```
|
||||
You can also install it with the following commands:
|
||||
```bash
|
||||
# 1. Install basic software
|
||||
apt update
|
||||
apt-get install -y --no-install-recommends \
|
||||
gcc g++ git make wget python unzip
|
||||
|
||||
# 2. Install arm gcc toolchains
|
||||
apt-get install -y --no-install-recommends \
|
||||
g++-arm-linux-gnueabi gcc-arm-linux-gnueabi \
|
||||
g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf \
|
||||
gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
|
||||
|
||||
# 3. Install cmake 3.10 or above
|
||||
wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
|
||||
tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \
|
||||
mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \
|
||||
ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \
|
||||
ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake
|
||||
```
|
||||
|
||||
## FastDeploy cross-compilation library compilation based on PaddleLite
|
||||
After setting up the cross-compilation environment, the compilation command is as follows:
|
||||
```bash
|
||||
# Download the latest source code
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
mkdir build && cd build
|
||||
|
||||
# CMake configuration with A311D toolchain
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=./../cmake/toolchain.cmake \
|
||||
-DWITH_TIMVX=ON \
|
||||
-DTARGET_ABI=arm64 \
|
||||
-DCMAKE_INSTALL_PREFIX=fastdeploy-tmivx \
|
||||
-DENABLE_VISION=ON \ # Optional: build the vision model deployment module
|
||||
-Wno-dev ..
|
||||
|
||||
# Build FastDeploy A311D C++ SDK
|
||||
make -j8
|
||||
make install
|
||||
```
|
||||
After the compilation is complete, the fastdeploy-tmivx directory will be generated, indicating that the Paddle-Lite TIM-VX based FastDeploy library has been compiled.
|
||||
|
||||
## Prepare the Soc environment
|
||||
Before deployment, make sure the galcore.so driver of the Verisilicon Linux kernel NPU meets the version requirement. Log in to the development board and run the following command to query the NPU driver version; the recommended driver version for the A311D is 6.4.4.3:
|
||||
```bash
|
||||
dmesg | grep Galcore
|
||||
```
|
||||
If the current version does not comply with the above, please read the following content carefully to ensure that the underlying NPU driver environment is correct.
|
||||
|
||||
There are two ways to modify the current NPU driver version:
|
||||
1. Manually replace the NPU driver version. (recommended)
|
||||
2. Re-flash the board with firmware that provides the required NPU driver version.
|
||||
|
||||
### Manually replace the NPU driver version
|
||||
1. Use the following command to download and decompress the PaddleLite demo, which provides ready-made driver files
|
||||
```bash
|
||||
wget https://paddlelite-demo.bj.bcebos.com/devices/generic/PaddleLite-generic-demo.tar.gz
|
||||
tar -xf PaddleLite-generic-demo.tar.gz
|
||||
```
|
||||
2. Use `uname -a` to check the Linux kernel version and confirm that it matches the kernel directory used in the next step (4.9.113 in the path below).
|
||||
3. Upload `galcore.ko` under `PaddleLite-generic-demo/libs/PaddleLite/linux/arm64/lib/verisilicon_timvx/viv_sdk_6_4_4_3/lib/a311d/4.9.113` path to the development board.
|
||||
4. Log in to the development board, enter `sudo rmmod galcore` on the command line to uninstall the original driver, and enter `sudo insmod galcore.ko` to load the uploaded device driver. (Whether sudo is needed depends on the actual situation of the development board. For some adb-linked devices, please adb root in advance). If this step fails, go to method 2.
|
||||
5. Enter `dmesg | grep Galcore` on the development board again to confirm that the NPU driver version is now 6.4.4.3.
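For convenience, the shell side of steps 3 to 5 can be summarised as follows (a sketch only; the transfer method and whether `sudo` is required depend on your board, and the path comes from the demo package above):

```bash
# Copy the matching driver to the board (adb push is one possible transfer method)
adb push PaddleLite-generic-demo/libs/PaddleLite/linux/arm64/lib/verisilicon_timvx/viv_sdk_6_4_4_3/lib/a311d/4.9.113/galcore.ko /tmp/
# On the board: swap the NPU driver and verify the version
sudo rmmod galcore
sudo insmod /tmp/galcore.ko
dmesg | grep Galcore
```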
|
||||
|
||||
### flash
|
||||
According to the specific development board model, ask the development board seller or the official website customer service for the firmware and flashing method corresponding to the 6.4.4.3 version of the NPU driver.
|
||||
|
||||
For more details, please refer to: [PaddleLite prepares the device environment](https://www.paddlepaddle.org.cn/lite/develop/demo_guides/verisilicon_timvx.html#zhunbeishebeihuanjing)
|
||||
|
||||
## Deployment example based on FastDeploy on A311D
|
||||
1. For deploying the PaddleClas classification model on A311D, please refer to: [C++ deployment example of PaddleClas classification model on A311D](../../../examples/vision/classification/paddleclas/a311d/README.md)
|
||||
|
||||
2. For deploying PPYOLOE detection model on A311D, please refer to: [C++ deployment example of PPYOLOE detection model on A311D](../../../examples/vision/detection/paddledetection/a311d/README.md)
|
||||
|
||||
3. For deploying YOLOv5 detection model on A311D, please refer to: [C++ Deployment Example of YOLOv5 Detection Model on A311D](../../../examples/vision/detection/yolov5/a311d/README.md)
|
||||
|
||||
4. For deploying PP-LiteSeg segmentation model on A311D, please refer to: [C++ Deployment Example of PP-LiteSeg Segmentation Model on A311D](../../../examples/vision/segmentation/paddleseg/a311d/README.md)
|
105
docs/en/build_and_install/rv1126.md
Executable file
@@ -0,0 +1,105 @@
|
||||
# How to Build RV1126 Deployment Environment
|
||||
|
||||
FastDeploy supports AI deployment on Rockchip Soc based on Paddle-Lite backend. For more detailed information, please refer to: [PaddleLite Deployment Example](https://www.paddlepaddle.org.cn/lite/develop/demo_guides/verisilicon_timvx.html).
|
||||
|
||||
This document describes how to compile the PaddleLite-based C++ FastDeploy cross-compilation library.
|
||||
|
||||
The relevant compilation options are described as follows:
|
||||
|Compile Options|Default Values|Description|Remarks|
|
||||
|:---|:---|:---|:---|
|
||||
|ENABLE_LITE_BACKEND|OFF|It needs to be set to ON when compiling the RK library| - |
|
||||
|WITH_TIMVX|OFF|It needs to be set to ON when compiling the RK library| - |
|
||||
|TARGET_ABI|NONE|It needs to be set to armhf when compiling the RK library| - |
|
||||
|
||||
For more compilation options, please refer to [Description of FastDeploy compilation options](./README.md)
|
||||
|
||||
## Cross-compilation environment construction
|
||||
|
||||
### Host Environment Requirements
|
||||
- OS: Ubuntu == 16.04
|
||||
- cmake: version >= 3.10.0
|
||||
|
||||
### Building the compilation environment
|
||||
You can enter the FastDeploy/tools/timvx directory and use the following command to install:
|
||||
```bash
|
||||
cd FastDeploy/tools/timvx
|
||||
bash install.sh
|
||||
```
|
||||
You can also install it with the following commands:
|
||||
```bash
|
||||
# 1. Install basic software
|
||||
apt update
|
||||
apt-get install -y --no-install-recommends \
|
||||
gcc g++ git make wget python unzip
|
||||
|
||||
# 2. Install arm gcc toolchains
|
||||
apt-get install -y --no-install-recommends \
|
||||
g++-arm-linux-gnueabi gcc-arm-linux-gnueabi \
|
||||
g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf \
|
||||
gcc-aarch64-linux-gnu g++-aarch64-linux-gnu
|
||||
|
||||
# 3. Install cmake 3.10 or above
|
||||
wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz && \
|
||||
tar xzf cmake-3.10.3-Linux-x86_64.tar.gz && \
|
||||
mv cmake-3.10.3-Linux-x86_64 /opt/cmake-3.10 && \
|
||||
ln -s /opt/cmake-3.10/bin/cmake /usr/bin/cmake && \
|
||||
ln -s /opt/cmake-3.10/bin/ccmake /usr/bin/ccmake
|
||||
```
|
||||
|
||||
## FastDeploy cross-compilation library compilation based on PaddleLite
|
||||
After setting up the cross-compilation environment, the compilation command is as follows:
|
||||
```bash
|
||||
# Download the latest source code
|
||||
git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd FastDeploy
|
||||
mkdir build && cd build
|
||||
|
||||
# CMake configuration with RK toolchain
|
||||
cmake -DCMAKE_TOOLCHAIN_FILE=./../cmake/toolchain.cmake \
|
||||
-DWITH_TIMVX=ON \
|
||||
-DTARGET_ABI=armhf \
|
||||
-DCMAKE_INSTALL_PREFIX=fastdeploy-tmivx \
|
||||
-DENABLE_VISION=ON \ # Optional: build the vision model deployment module
|
||||
-Wno-dev ..
|
||||
|
||||
# Build FastDeploy RV1126 C++ SDK
|
||||
make -j8
|
||||
make install
|
||||
```
|
||||
After the compilation is complete, the fastdeploy-tmivx directory will be generated, indicating that the Paddle-Lite TIM-VX based FastDeploy library has been compiled.
|
||||
|
||||
## Prepare the Soc environment
|
||||
Before deployment, make sure the galcore.so driver of the Verisilicon Linux kernel NPU meets the version requirement. Log in to the development board and run the following command to query the NPU driver version; the recommended Rockchip driver version is 6.4.6.5:
|
||||
```bash
|
||||
dmesg | grep Galcore
|
||||
```
|
||||
If the current version does not comply with the above, please read the following content carefully to ensure that the underlying NPU driver environment is correct.
|
||||
|
||||
There are two ways to modify the current NPU driver version:
|
||||
1. Manually replace the NPU driver version. (recommended)
|
||||
2. Re-flash the board with firmware that provides the required NPU driver version.
|
||||
|
||||
### Manually replace the NPU driver version
|
||||
1. Use the following command to download and decompress the PaddleLite demo, which provides ready-made driver files
|
||||
```bash
|
||||
wget https://paddlelite-demo.bj.bcebos.com/devices/generic/PaddleLite-generic-demo.tar.gz
|
||||
tar -xf PaddleLite-generic-demo.tar.gz
|
||||
```
|
||||
2. Use `uname -a` to check the Linux kernel version and confirm that it is 4.19.111.
|
||||
3. Upload `galcore.ko` under `PaddleLite-generic-demo/libs/PaddleLite/linux/armhf/lib/verisilicon_timvx/viv_sdk_6_4_6_5/lib/1126/4.19.111/` path to the development board.
|
||||
4. Log in to the development board, enter `sudo rmmod galcore` on the command line to uninstall the original driver, and enter `sudo insmod galcore.ko` to load the uploaded device driver. (Whether sudo is needed depends on the actual situation of the development board. For some adb-linked devices, please adb root in advance). If this step fails, go to method 2.
|
||||
5. Enter `dmesg | grep Galcore` on the development board again to confirm that the NPU driver version is now 6.4.6.5.
|
||||
|
||||
### flash
|
||||
According to the specific development board model, ask the development board seller or the official website customer service for the firmware and flashing method corresponding to the 6.4.6.5 version of the NPU driver.
|
||||
|
||||
For more details, please refer to: [PaddleLite prepares the device environment](https://www.paddlepaddle.org.cn/lite/develop/demo_guides/verisilicon_timvx.html#zhunbeishebeihuanjing)
|
||||
|
||||
## Deployment example based on FastDeploy on RV1126
|
||||
1. For deploying the PaddleClas classification model on RV1126, please refer to: [C++ deployment example of PaddleClas classification model on RV1126](../../../examples/vision/classification/paddleclas/rv1126/README.md)
|
||||
|
||||
2. For deploying PPYOLOE detection model on RV1126, please refer to: [C++ deployment example of PPYOLOE detection model on RV1126](../../../examples/vision/detection/paddledetection/rv1126/README.md)
|
||||
|
||||
3. For deploying YOLOv5 detection model on RV1126, please refer to: [C++ Deployment Example of YOLOv5 Detection Model on RV1126](../../../examples/vision/detection/yolov5/rv1126/README.md)
|
||||
|
||||
4. For deploying PP-LiteSeg segmentation model on RV1126, please refer to: [C++ Deployment Example of PP-LiteSeg Segmentation Model on RV1126](../../../examples/vision/segmentation/paddleseg/rv1126/README.md)
|
78
docs/en/build_and_install/xpu.md
Executable file
@@ -0,0 +1,78 @@
# How to Build KunlunXin XPU Deployment Environment

FastDeploy supports AI deployment on KunlunXin XPU based on the Paddle-Lite backend. For more detailed information, please refer to: [PaddleLite Deployment Example](https://www.paddlepaddle.org.cn/lite/develop/demo_guides/kunlunxin_xpu.html#xpu).

This document describes how to compile the C++ FastDeploy library based on PaddleLite.

The relevant compilation options are described as follows:

|Compile Options|Default Values|Description|Remarks|
|:---|:---|:---|:---|
| ENABLE_LITE_BACKEND | OFF | It needs to be set to ON when compiling the KunlunXin XPU library (it is forced ON automatically when WITH_XPU=ON)| - |
| WITH_XPU | OFF | It needs to be set to ON when compiling the KunlunXin XPU library| - |
| ENABLE_ORT_BACKEND | OFF | Whether to integrate the ONNX Runtime backend | - |
| ENABLE_PADDLE_BACKEND | OFF | Whether to integrate the Paddle Inference backend | - |
| ENABLE_OPENVINO_BACKEND | OFF | Whether to integrate the OpenVINO backend | - |
| ENABLE_VISION | OFF | Whether to integrate vision models | - |
| ENABLE_TEXT | OFF | Whether to integrate text models | - |

Configuration of third-party libraries (optional; if the following options are not defined, prebuilt third-party libraries are downloaded automatically while building FastDeploy):

| Option | Description |
| :---------------------- | :--------------------------------------------------------------------------------------------- |
| ORT_DIRECTORY | When ENABLE_ORT_BACKEND=ON, use ORT_DIRECTORY to specify your own ONNX Runtime library path. |
| OPENCV_DIRECTORY | When ENABLE_VISION=ON, use OPENCV_DIRECTORY to specify your own OpenCV library path. |
| OPENVINO_DIRECTORY | When ENABLE_OPENVINO_BACKEND=ON, use OPENVINO_DIRECTORY to specify your own OpenVINO library path. |

For more compilation options, please refer to [Description of FastDeploy compilation options](./README.md)

## C++ FastDeploy library compilation based on PaddleLite

- OS: Linux
- gcc/g++: version >= 8.2
- cmake: version >= 3.15

It is recommended to install the OpenCV library manually and pass `-DOPENCV_DIRECTORY` to point to it (if the flag is not defined, a prebuilt OpenCV library is downloaded automatically while building FastDeploy, but that prebuilt OpenCV cannot read video files or provide GUI functions such as `imshow`):
```
sudo apt-get install libopencv-dev
```

The compilation command is as follows:
```bash
# Download the latest source code
git clone https://github.com/PaddlePaddle/FastDeploy.git
cd FastDeploy
mkdir build && cd build

# CMake configuration with KunlunXin xpu toolchain
cmake -DWITH_XPU=ON \
      -DWITH_GPU=OFF \
      -DENABLE_ORT_BACKEND=ON \
      -DENABLE_PADDLE_BACKEND=ON \
      -DCMAKE_INSTALL_PREFIX=fastdeploy-xpu \
      -DENABLE_VISION=ON \
      -DOPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4 \
      ..

# Build FastDeploy KunlunXin XPU C++ SDK
make -j8
make install
```
After the compilation is complete, the fastdeploy-xpu directory will be generated, indicating that the Paddle-Lite-based FastDeploy library has been compiled.

## Python compile
The compilation command is as follows:
```bash
git clone https://github.com/PaddlePaddle/FastDeploy.git
cd FastDeploy/python
export WITH_XPU=ON
export WITH_GPU=OFF
export ENABLE_ORT_BACKEND=ON
export ENABLE_PADDLE_BACKEND=ON
export ENABLE_VISION=ON
# OPENCV_DIRECTORY is optional; if it is not exported, a prebuilt OpenCV library will be downloaded
export OPENCV_DIRECTORY=/usr/lib/x86_64-linux-gnu/cmake/opencv4

python setup.py build
python setup.py bdist_wheel
```
After the compilation is completed, the compiled `wheel` package will be generated in the `FastDeploy/python/dist` directory; install it directly with `pip install`.

If you modify the compilation parameters, delete the `build` and `.setuptools-cmake-build` subdirectories under `FastDeploy/python` before recompiling to avoid stale-cache effects.
|
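As a quick sanity check after installing the wheel, the new runtime option can be exercised from Python (a minimal sketch; it assumes the wheel built above is installed in the current environment):

```python
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_xpu()  # selects the KunlunXin XPU device; the Paddle-Lite backend is picked automatically
```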
4
examples/vision/classification/paddleclas/cpp/README.md
Normal file → Executable file
@@ -30,6 +30,10 @@ wget https://gitee.com/paddlepaddle/PaddleClas/raw/release/2.4/deploy/images/Ima
|
||||
./infer_demo ResNet50_vd_infer ILSVRC2012_val_00000010.jpeg 1
|
||||
# GPU上TensorRT推理
|
||||
./infer_demo ResNet50_vd_infer ILSVRC2012_val_00000010.jpeg 2
|
||||
# IPU推理
|
||||
./infer_demo ResNet50_vd_infer ILSVRC2012_val_00000010.jpeg 3
|
||||
# KunlunXin XPU推理
|
||||
./infer_demo ResNet50_vd_infer ILSVRC2012_val_00000010.jpeg 4
|
||||
```
|
||||
|
||||
以上命令只适用于Linux或MacOS, Windows下SDK的使用方式请参考:
|
||||
|
30
examples/vision/classification/paddleclas/cpp/infer.cc
Normal file → Executable file
@@ -96,6 +96,32 @@ void IpuInfer(const std::string& model_dir, const std::string& image_file) {
|
||||
std::cout << res.Str() << std::endl;
|
||||
}
|
||||
|
||||
void XpuInfer(const std::string& model_dir, const std::string& image_file) {
|
||||
auto model_file = model_dir + sep + "inference.pdmodel";
|
||||
auto params_file = model_dir + sep + "inference.pdiparams";
|
||||
auto config_file = model_dir + sep + "inference_cls.yaml";
|
||||
|
||||
auto option = fastdeploy::RuntimeOption();
|
||||
option.UseXpu();
|
||||
auto model = fastdeploy::vision::classification::PaddleClasModel(
|
||||
model_file, params_file, config_file, option);
|
||||
if (!model.Initialized()) {
|
||||
std::cerr << "Failed to initialize." << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
auto im = cv::imread(image_file);
|
||||
|
||||
fastdeploy::vision::ClassifyResult res;
|
||||
if (!model.Predict(im, &res)) {
|
||||
std::cerr << "Failed to predict." << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
// print res
|
||||
std::cout << res.Str() << std::endl;
|
||||
}
|
||||
|
||||
void TrtInfer(const std::string& model_dir, const std::string& image_file) {
|
||||
auto model_file = model_dir + sep + "inference.pdmodel";
|
||||
auto params_file = model_dir + sep + "inference.pdiparams";
|
||||
@@ -128,7 +154,7 @@ int main(int argc, char* argv[]) {
|
||||
"e.g ./infer_demo ./ResNet50_vd ./test.jpeg 0"
|
||||
<< std::endl;
|
||||
std::cout << "The data type of run_option is int, 0: run with cpu; 1: run "
|
||||
"with gpu; 2: run with gpu and use tensorrt backend."
|
||||
"with gpu; 2: run with gpu and use tensorrt backend; 3: run with ipu; 4: run with xpu."
|
||||
<< std::endl;
|
||||
return -1;
|
||||
}
|
||||
@@ -141,6 +167,8 @@ int main(int argc, char* argv[]) {
|
||||
TrtInfer(argv[1], argv[2]);
|
||||
} else if (std::atoi(argv[3]) == 3) {
|
||||
IpuInfer(argv[1], argv[2]);
|
||||
} else if (std::atoi(argv[3]) == 4) {
|
||||
XpuInfer(argv[1], argv[2]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
2
examples/vision/classification/paddleclas/python/README.md
Normal file → Executable file
@@ -25,6 +25,8 @@ python infer.py --model ResNet50_vd_infer --image ILSVRC2012_val_00000010.jpeg -
|
||||
python infer.py --model ResNet50_vd_infer --image ILSVRC2012_val_00000010.jpeg --device gpu --use_trt True --topk 1
|
||||
# IPU推理(注意:IPU推理首次运行会有序列化模型的操作,有一定耗时,需要耐心等待)
|
||||
python infer.py --model ResNet50_vd_infer --image ILSVRC2012_val_00000010.jpeg --device ipu --topk 1
|
||||
# XPU推理
|
||||
python infer.py --model ResNet50_vd_infer --image ILSVRC2012_val_00000010.jpeg --device xpu --topk 1
|
||||
```
|
||||
|
||||
运行完成后返回结果如下所示
|
||||
|
3
examples/vision/classification/paddleclas/python/infer.py
Normal file → Executable file
@@ -35,6 +35,9 @@ def build_option(args):
|
||||
if args.device.lower() == "ipu":
|
||||
option.use_ipu()
|
||||
|
||||
if args.device.lower() == "xpu":
|
||||
option.use_xpu()
|
||||
|
||||
if args.use_trt:
|
||||
option.use_trt_backend()
|
||||
return option
|
||||
|
4
examples/vision/detection/yolov5/cpp/CMakeLists.txt
Normal file → Executable file
@@ -12,3 +12,7 @@ include_directories(${FASTDEPLOY_INCS})
|
||||
add_executable(infer_demo ${PROJECT_SOURCE_DIR}/infer.cc)
|
||||
# 添加FastDeploy库依赖
|
||||
target_link_libraries(infer_demo ${FASTDEPLOY_LIBS})
|
||||
|
||||
add_executable(infer_paddle_demo ${PROJECT_SOURCE_DIR}/infer_paddle_model.cc)
|
||||
# 添加FastDeploy库依赖
|
||||
target_link_libraries(infer_paddle_demo ${FASTDEPLOY_LIBS})
|
||||
|
24
examples/vision/detection/yolov5/cpp/README.md
Normal file → Executable file
@@ -12,16 +12,33 @@
|
||||
```bash
|
||||
mkdir build
|
||||
cd build
|
||||
# 下载FastDeploy预编译库,用户可在上文提到的`FastDeploy预编译库`中自行选择合适的版本使用
|
||||
# 下载 FastDeploy 预编译库,用户可在上文提到的`FastDeploy预编译库`中自行选择合适的版本使用
|
||||
wget https://bj.bcebos.com/fastdeploy/release/cpp/fastdeploy-linux-x64-x.x.x.tgz
|
||||
tar xvf fastdeploy-linux-x64-x.x.x.tgz
|
||||
cmake .. -DFASTDEPLOY_INSTALL_DIR=${PWD}/fastdeploy-linux-x64-x.x.x
|
||||
make -j
|
||||
#下载官方转换好的yolov5模型文件和测试图片
|
||||
wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s.onnx
|
||||
#下载官方转换好的 yolov5 Paddle 模型文件和测试图片
|
||||
wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_infer.tar
|
||||
tar -xvf yolov5s_infer.tar
|
||||
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
|
||||
|
||||
|
||||
# CPU推理
|
||||
./infer_paddle_demo yolov5s_infer 000000014439.jpg 0
|
||||
# GPU推理
|
||||
./infer_paddle_demo yolov5s_infer 000000014439.jpg 1
|
||||
# GPU上TensorRT推理
|
||||
./infer_paddle_demo yolov5s_infer 000000014439.jpg 2
|
||||
# XPU推理
|
||||
./infer_paddle_demo yolov5s_infer 000000014439.jpg 3
|
||||
```
|
||||
|
||||
上述的模型为 Paddle 模型的推理,如果想要做 ONNX 模型的推理,可以按照如下步骤:
|
||||
```bash
|
||||
# 1. 下载官方转换好的 yolov5 ONNX 模型文件和测试图片
|
||||
wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s.onnx
|
||||
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
|
||||
|
||||
# CPU推理
|
||||
./infer_demo yolov5s.onnx 000000014439.jpg 0
|
||||
# GPU推理
|
||||
@@ -29,7 +46,6 @@ wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/0000000
|
||||
# GPU上TensorRT推理
|
||||
./infer_demo yolov5s.onnx 000000014439.jpg 2
|
||||
```
|
||||
|
||||
运行完成可视化结果如下图所示
|
||||
|
||||
<img width="640" src="https://user-images.githubusercontent.com/67993288/184309358-d803347a-8981-44b6-b589-4608021ad0f4.jpg">
|
||||
|
2
examples/vision/detection/yolov5/cpp/infer.cc
Normal file → Executable file
@@ -102,4 +102,4 @@ int main(int argc, char* argv[]) {
|
||||
TrtInfer(argv[1], argv[2]);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
154
examples/vision/detection/yolov5/cpp/infer_paddle_model.cc
Executable file
@@ -0,0 +1,154 @@
|
||||
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include "fastdeploy/vision.h"
|
||||
#ifdef WIN32
|
||||
const char sep = '\\';
|
||||
#else
|
||||
const char sep = '/';
|
||||
#endif
|
||||
|
||||
void CpuInfer(const std::string& model_dir, const std::string& image_file) {
|
||||
auto model_file = model_dir + sep + "model.pdmodel";
|
||||
auto params_file = model_dir + sep + "model.pdiparams";
|
||||
fastdeploy::RuntimeOption option;
|
||||
option.UseCpu();
|
||||
auto model = fastdeploy::vision::detection::YOLOv5(
|
||||
model_file, params_file, option, fastdeploy::ModelFormat::PADDLE);
|
||||
if (!model.Initialized()) {
|
||||
std::cerr << "Failed to initialize." << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
auto im = cv::imread(image_file);
|
||||
|
||||
fastdeploy::vision::DetectionResult res;
|
||||
if (!model.Predict(im, &res)) {
|
||||
std::cerr << "Failed to predict." << std::endl;
|
||||
return;
|
||||
}
|
||||
std::cout << res.Str() << std::endl;
|
||||
|
||||
auto vis_im = fastdeploy::vision::VisDetection(im, res);
|
||||
|
||||
cv::imwrite("vis_result.jpg", vis_im);
|
||||
std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
|
||||
}
|
||||
|
||||
void GpuInfer(const std::string& model_dir, const std::string& image_file) {
|
||||
auto model_file = model_dir + sep + "model.pdmodel";
|
||||
auto params_file = model_dir + sep + "model.pdiparams";
|
||||
auto option = fastdeploy::RuntimeOption();
|
||||
option.UseGpu();
|
||||
auto model = fastdeploy::vision::detection::YOLOv5(
|
||||
model_file, params_file, option, fastdeploy::ModelFormat::PADDLE);
|
||||
if (!model.Initialized()) {
|
||||
std::cerr << "Failed to initialize." << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
auto im = cv::imread(image_file);
|
||||
|
||||
fastdeploy::vision::DetectionResult res;
|
||||
if (!model.Predict(im, &res)) {
|
||||
std::cerr << "Failed to predict." << std::endl;
|
||||
return;
|
||||
}
|
||||
std::cout << res.Str() << std::endl;
|
||||
|
||||
auto vis_im = fastdeploy::vision::VisDetection(im, res);
|
||||
|
||||
cv::imwrite("vis_result.jpg", vis_im);
|
||||
std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
|
||||
}
|
||||
|
||||
void TrtInfer(const std::string& model_dir, const std::string& image_file) {
|
||||
auto model_file = model_dir + sep + "model.pdmodel";
|
||||
auto params_file = model_dir + sep + "model.pdiparams";
|
||||
auto option = fastdeploy::RuntimeOption();
|
||||
option.UseGpu();
|
||||
option.UseTrtBackend();
|
||||
option.SetTrtInputShape("images", {1, 3, 640, 640});
|
||||
auto model = fastdeploy::vision::detection::YOLOv5(
|
||||
model_file, params_file, option, fastdeploy::ModelFormat::PADDLE);
|
||||
|
||||
if (!model.Initialized()) {
|
||||
std::cerr << "Failed to initialize." << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
auto im = cv::imread(image_file);
|
||||
|
||||
fastdeploy::vision::DetectionResult res;
|
||||
if (!model.Predict(im, &res)) {
|
||||
std::cerr << "Failed to predict." << std::endl;
|
||||
return;
|
||||
}
|
||||
std::cout << res.Str() << std::endl;
|
||||
|
||||
auto vis_im = fastdeploy::vision::Visualize::VisDetection(im, res);
|
||||
cv::imwrite("vis_result.jpg", vis_im);
|
||||
std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
|
||||
}
|
||||
|
||||
void XpuInfer(const std::string& model_dir, const std::string& image_file) {
|
||||
auto model_file = model_dir + sep + "model.pdmodel";
|
||||
auto params_file = model_dir + sep + "model.pdiparams";
|
||||
fastdeploy::RuntimeOption option;
|
||||
option.UseXpu();
|
||||
auto model = fastdeploy::vision::detection::YOLOv5(
|
||||
model_file, params_file, option, fastdeploy::ModelFormat::PADDLE);
|
||||
|
||||
if (!model.Initialized()) {
|
||||
std::cerr << "Failed to initialize." << std::endl;
|
||||
return;
|
||||
}
|
||||
|
||||
auto im = cv::imread(image_file);
|
||||
|
||||
fastdeploy::vision::DetectionResult res;
|
||||
if (!model.Predict(im, &res)) {
|
||||
std::cerr << "Failed to predict." << std::endl;
|
||||
return;
|
||||
}
|
||||
std::cout << res.Str() << std::endl;
|
||||
|
||||
auto vis_im = fastdeploy::vision::VisDetection(im, res);
|
||||
|
||||
cv::imwrite("vis_result.jpg", vis_im);
|
||||
std::cout << "Visualized result saved in ./vis_result.jpg" << std::endl;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
if (argc < 4) {
|
||||
std::cout << "Usage: infer_demo path/to/model path/to/image run_option, "
|
||||
"e.g ./infer_model ./yolov5s_infer ./test.jpeg 0"
|
||||
<< std::endl;
|
||||
std::cout << "The data type of run_option is int, 0: run with cpu; 1: run "
|
||||
"with gpu; 2: run with gpu and use tensorrt backend; 3: run with KunlunXin XPU."
|
||||
<< std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (std::atoi(argv[3]) == 0) {
|
||||
CpuInfer(argv[1], argv[2]);
|
||||
} else if (std::atoi(argv[3]) == 1) {
|
||||
GpuInfer(argv[1], argv[2]);
|
||||
} else if (std::atoi(argv[3]) == 2) {
|
||||
TrtInfer(argv[1], argv[2]);
|
||||
} else if (std::atoi(argv[3]) == 3) {
|
||||
XpuInfer(argv[1], argv[2]);
|
||||
}
|
||||
return 0;
|
||||
}
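A sketch of how this new demo is built and run, following the README earlier in this commit (the SDK path is a placeholder for whichever FastDeploy SDK you use):

```bash
cd examples/vision/detection/yolov5/cpp
mkdir build && cd build
cmake .. -DFASTDEPLOY_INSTALL_DIR=/path/to/fastdeploy-sdk
make -j
./infer_paddle_demo yolov5s_infer 000000014439.jpg 3   # run_option 3 = KunlunXin XPU
```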
|
11
examples/vision/detection/yolov5/python/README.md
Normal file → Executable file
@@ -13,15 +13,18 @@ git clone https://github.com/PaddlePaddle/FastDeploy.git
|
||||
cd examples/vision/detection/yolov5/python/
|
||||
|
||||
#下载yolov5模型文件和测试图片
|
||||
wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s.onnx
|
||||
wget https://bj.bcebos.com/paddlehub/fastdeploy/yolov5s_infer.tar
|
||||
tar -xf yolov5s_infer.tar
|
||||
wget https://gitee.com/paddlepaddle/PaddleDetection/raw/release/2.4/demo/000000014439.jpg
|
||||
|
||||
# CPU推理
|
||||
python infer.py --model yolov5s.onnx --image 000000014439.jpg --device cpu
|
||||
python infer.py --model yolov5s_infer --image 000000014439.jpg --device cpu
|
||||
# GPU推理
|
||||
python infer.py --model yolov5s.onnx --image 000000014439.jpg --device gpu
|
||||
python infer.py --model yolov5s_infer --image 000000014439.jpg --device gpu
|
||||
# GPU上使用TensorRT推理
|
||||
python infer.py --model yolov5s.onnx --image 000000014439.jpg --device gpu --use_trt True
|
||||
python infer.py --model yolov5s_infer --image 000000014439.jpg --device gpu --use_trt True
|
||||
# XPU推理
|
||||
python infer.py --model yolov5s_infer --image 000000014439.jpg --device xpu
|
||||
```
|
||||
|
||||
运行完成可视化结果如下图所示
|
||||
|
21
examples/vision/detection/yolov5/python/infer.py
Normal file → Executable file
@@ -1,20 +1,20 @@
|
||||
import fastdeploy as fd
|
||||
import cv2
|
||||
import os
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
import argparse
|
||||
import ast
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--model", default=None, help="Path of yolov5 onnx model.")
|
||||
parser.add_argument("--model", default=None, help="Path of yolov5 model.")
|
||||
parser.add_argument(
|
||||
"--image", default=None, help="Path of test image file.")
|
||||
parser.add_argument(
|
||||
"--device",
|
||||
type=str,
|
||||
default='cpu',
|
||||
help="Type of inference device, support 'cpu' or 'gpu'.")
|
||||
help="Type of inference device, support 'cpu' or 'gpu' or 'xpu'.")
|
||||
parser.add_argument(
|
||||
"--use_trt",
|
||||
type=ast.literal_eval,
|
||||
@@ -25,6 +25,8 @@ def parse_arguments():
|
||||
|
||||
def build_option(args):
|
||||
option = fd.RuntimeOption()
|
||||
if args.device.lower() == "xpu":
|
||||
option.use_xpu()
|
||||
|
||||
if args.device.lower() == "gpu":
|
||||
option.use_gpu()
|
||||
@@ -37,14 +39,15 @@ def build_option(args):
|
||||
|
||||
args = parse_arguments()
|
||||
|
||||
if args.model is None:
|
||||
model = fd.download_model(name='YOLOv5s')
|
||||
else:
|
||||
model = args.model
|
||||
|
||||
# 配置runtime,加载模型
|
||||
runtime_option = build_option(args)
|
||||
model = fd.vision.detection.YOLOv5(model, runtime_option=runtime_option)
|
||||
model_file = os.path.join(args.model, "model.pdmodel")
|
||||
params_file = os.path.join(args.model, "model.pdiparams")
|
||||
model = fd.vision.detection.YOLOv5(
|
||||
model_file,
|
||||
params_file,
|
||||
runtime_option=runtime_option,
|
||||
model_format=fd.ModelFormat.PADDLE)
|
||||
|
||||
# 预测图片检测结果
|
||||
if args.image is None:
|
||||
|
@@ -43,24 +43,33 @@ void LiteBackend::BuildOption(const LiteBackendOption& option) {
|
||||
option_ = option;
|
||||
std::vector<paddle::lite_api::Place> valid_places;
|
||||
if (option_.enable_int8) {
|
||||
valid_places.push_back(
|
||||
if(option_.enable_xpu) {
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kXPU), PRECISION(kInt8)});
|
||||
} else {
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt8)});
|
||||
}
|
||||
FDINFO << "Lite::Backend enable_int8 option is ON ! Lite::Backend will "
|
||||
<< "inference with int8 precision!" << std::endl;
|
||||
}
|
||||
if (option_.enable_fp16) {
|
||||
paddle::lite_api::MobileConfig check_fp16_config;
|
||||
// Determine whether the device supports the FP16
|
||||
// instruction set (or whether it is an arm device
|
||||
// of the armv8.2 architecture)
|
||||
supported_fp16_ = check_fp16_config.check_fp16_valid();
|
||||
if (supported_fp16_) {
|
||||
if(option_.enable_xpu){
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kARM), PRECISION(kFP16)});
|
||||
FDINFO << "Your device is supported fp16 ! Lite::Backend will "
|
||||
<< "inference with fp16 precision!" << std::endl;
|
||||
paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFP16)});
|
||||
} else {
|
||||
FDWARNING << "This device is not supported fp16, will skip fp16 option.";
|
||||
paddle::lite_api::MobileConfig check_fp16_config;
|
||||
// Determine whether the device supports the FP16
|
||||
// instruction set (or whether it is an arm device
|
||||
// of the armv8.2 architecture)
|
||||
supported_fp16_ = check_fp16_config.check_fp16_valid();
|
||||
if (supported_fp16_) {
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kARM), PRECISION(kFP16)});
|
||||
FDINFO << "The device supports FP16, Lite::Backend will inference with FP16 precision." << std::endl;
|
||||
} else {
|
||||
FDWARNING << "The device doesn't support FP16, will fallback to FP32.";
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!option_.nnadapter_subgraph_partition_config_path.empty()) {
|
||||
@@ -81,8 +90,24 @@ void LiteBackend::BuildOption(const LiteBackendOption& option) {
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt8)});
|
||||
}
|
||||
valid_places.push_back(
|
||||
|
||||
if(option_.enable_xpu){
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)});
|
||||
config_.set_xpu_dev_per_thread(option_.device_id);
|
||||
config_.set_xpu_workspace_l3_size_per_thread(option_.xpu_l3_workspace_size);
|
||||
config_.set_xpu_l3_cache_method(option_.xpu_l3_workspace_size, option_.xpu_locked);
|
||||
config_.set_xpu_conv_autotune(option_.xpu_autotune, option_.xpu_autotune_file);
|
||||
config_.set_xpu_multi_encoder_method(option_.xpu_precision, option_.xpu_adaptive_seqlen);
|
||||
if (option_.xpu_enable_multi_stream) {
|
||||
config_.enable_xpu_multi_stream();
|
||||
}
|
||||
} else {
|
||||
valid_places.push_back(
|
||||
paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)});
|
||||
}
|
||||
config_.set_valid_places(valid_places);
|
||||
if (option_.threads > 0) {
|
||||
config_.set_threads(option_.threads);
|
||||
@@ -160,7 +185,9 @@ bool LiteBackend::InitFromPaddle(const std::string& model_file,
|
||||
auto shape = tensor->shape();
|
||||
info.shape.assign(shape.begin(), shape.end());
|
||||
info.name = output_names[i];
|
||||
info.dtype = LiteDataTypeToFD(tensor->precision());
|
||||
if(!option_.enable_xpu){
|
||||
info.dtype = LiteDataTypeToFD(tensor->precision());
|
||||
}
|
||||
outputs_desc_.emplace_back(info);
|
||||
}
|
||||
|
||||
@@ -239,6 +266,9 @@ bool LiteBackend::Infer(std::vector<FDTensor>& inputs,
|
||||
outputs->resize(outputs_desc_.size());
|
||||
for (size_t i = 0; i < outputs_desc_.size(); ++i) {
|
||||
auto tensor = predictor_->GetOutput(i);
|
||||
if(outputs_desc_[i].dtype != LiteDataTypeToFD(tensor->precision())){
|
||||
outputs_desc_[i].dtype = LiteDataTypeToFD(tensor->precision());
|
||||
}
|
||||
(*outputs)[i].Resize(tensor->shape(), outputs_desc_[i].dtype,
|
||||
outputs_desc_[i].name);
|
||||
memcpy((*outputs)[i].MutableData(), tensor->data<void>(),
|
||||
|
@@ -45,6 +45,15 @@ struct LiteBackendOption {
|
||||
// Such as fp16, different device target (kARM/kXPU/kNPU/...)
|
||||
std::string nnadapter_subgraph_partition_config_path = "";
|
||||
bool enable_timvx = false;
|
||||
bool enable_xpu = false;
|
||||
int device_id = 0;
|
||||
int xpu_l3_workspace_size = 0xfffc00;
|
||||
bool xpu_locked = false;
|
||||
bool xpu_autotune = true;
|
||||
std::string xpu_autotune_file = "";
|
||||
std::string xpu_precision = "int16";
|
||||
bool xpu_adaptive_seqlen = false;
|
||||
bool xpu_enable_multi_stream = false;
|
||||
};
|
||||
|
||||
// Convert data type from paddle lite to fastdeploy
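For reference, a minimal C++ sketch of how these fields are reached from user code (not part of this diff; it mirrors the XpuInfer helpers in the example sources above):

```cpp
#include "fastdeploy/vision.h"

int main() {
  fastdeploy::RuntimeOption option;
  // UseXpu() fills the xpu_* members of RuntimeOption, which CreateLiteBackend()
  // copies into this LiteBackendOption struct; called without arguments it
  // applies the library defaults.
  option.UseXpu();
  auto model = fastdeploy::vision::detection::YOLOv5(
      "model.pdmodel", "model.pdiparams", option,
      fastdeploy::ModelFormat::PADDLE);
  return model.Initialized() ? 0 : 1;
}
```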
|
||||
|
@@ -62,6 +62,9 @@ std::string Str(const Device& d) {
|
||||
case Device::TIMVX:
|
||||
out = "Device::TIMVX";
|
||||
break;
|
||||
case Device::XPU:
|
||||
out = "Device::XPU";
|
||||
break;
|
||||
default:
|
||||
out = "Device::UNKOWN";
|
||||
}
|
||||
@@ -82,6 +85,9 @@ std::ostream& operator<<(std::ostream& out,const Device& d){
|
||||
case Device::TIMVX:
|
||||
out << "Device::TIMVX";
|
||||
break;
|
||||
case Device::XPU:
|
||||
out << "Device::XPU";
|
||||
break;
|
||||
default:
|
||||
out << "Device::UNKOWN";
|
||||
}
|
||||
|
@@ -22,7 +22,7 @@
|
||||
|
||||
namespace fastdeploy {
|
||||
|
||||
enum FASTDEPLOY_DECL Device { CPU, GPU, RKNPU, IPU, TIMVX};
|
||||
enum FASTDEPLOY_DECL Device { CPU, GPU, RKNPU, IPU, TIMVX, XPU};
|
||||
|
||||
FASTDEPLOY_DECL std::string Str(const Device& d);
|
||||
|
||||
|
@@ -51,6 +51,7 @@ bool FastDeployModel::InitRuntimeWithSpecifiedBackend() {
|
||||
bool use_ipu = (runtime_option.device == Device::IPU);
|
||||
bool use_rknpu = (runtime_option.device == Device::RKNPU);
|
||||
bool use_timvx = (runtime_option.device == Device::TIMVX);
|
||||
bool use_xpu = (runtime_option.device == Device::XPU);
|
||||
|
||||
if (use_gpu) {
|
||||
if (!IsSupported(valid_gpu_backends, runtime_option.backend)) {
|
||||
@@ -67,6 +68,11 @@ bool FastDeployModel::InitRuntimeWithSpecifiedBackend() {
|
||||
FDERROR << "The valid timvx backends of model " << ModelName() << " are " << Str(valid_timvx_backends) << ", " << runtime_option.backend << " is not supported." << std::endl;
|
||||
return false;
|
||||
}
|
||||
} else if (use_xpu) {
|
||||
if (!IsSupported(valid_xpu_backends, runtime_option.backend)) {
|
||||
FDERROR << "The valid xpu backends of model " << ModelName() << " are " << Str(valid_xpu_backends) << ", " << runtime_option.backend << " is not supported." << std::endl;
|
||||
return false;
|
||||
}
|
||||
} else if(use_ipu) {
|
||||
if (!IsSupported(valid_ipu_backends, runtime_option.backend)) {
|
||||
FDERROR << "The valid ipu backends of model " << ModelName() << " are " << Str(valid_ipu_backends) << ", " << runtime_option.backend << " is not supported." << std::endl;
|
||||
@@ -102,6 +108,8 @@ bool FastDeployModel::InitRuntimeWithSpecifiedDevice() {
|
||||
return CreateRKNPUBackend();
|
||||
} else if (runtime_option.device == Device::TIMVX) {
|
||||
return CreateTimVXBackend();
|
||||
} else if (runtime_option.device == Device::XPU) {
|
||||
return CreateXPUBackend();
|
||||
} else if (runtime_option.device == Device::IPU) {
|
||||
#ifdef WITH_IPU
|
||||
return CreateIpuBackend();
|
||||
@@ -111,7 +119,7 @@ bool FastDeployModel::InitRuntimeWithSpecifiedDevice() {
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
FDERROR << "Only support CPU/GPU/IPU/RKNPU/TIMVX now." << std::endl;
|
||||
FDERROR << "Only support CPU/GPU/IPU/RKNPU/TIMVX/XPU now." << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -225,6 +233,29 @@ bool FastDeployModel::CreateTimVXBackend() {
|
||||
return false;
|
||||
}
|
||||
|
||||
bool FastDeployModel::CreateXPUBackend() {
|
||||
if (valid_xpu_backends.size() == 0) {
|
||||
FDERROR << "There's no valid xpu backends for model: " << ModelName()
|
||||
<< std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < valid_xpu_backends.size(); ++i) {
|
||||
if (!IsBackendAvailable(valid_xpu_backends[i])) {
|
||||
continue;
|
||||
}
|
||||
runtime_option.backend = valid_xpu_backends[i];
|
||||
runtime_ = std::unique_ptr<Runtime>(new Runtime());
|
||||
if (!runtime_->Init(runtime_option)) {
|
||||
return false;
|
||||
}
|
||||
runtime_initialized_ = true;
|
||||
return true;
|
||||
}
|
||||
FDERROR << "Found no valid backend for model: " << ModelName() << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
bool FastDeployModel::CreateIpuBackend() {
|
||||
if (valid_ipu_backends.size() == 0) {
|
||||
FDERROR << "There's no valid ipu backends for model: " << ModelName()
|
||||
|
@@ -45,6 +45,9 @@ class FASTDEPLOY_DECL FastDeployModel {
|
||||
/** Model's valid timvx backends. This member defined all the timvx backends have successfully tested for the model
|
||||
*/
|
||||
std::vector<Backend> valid_timvx_backends = {};
|
||||
/** Model's valid KunlunXin xpu backends. This member defined all the KunlunXin xpu backends have successfully tested for the model
|
||||
*/
|
||||
std::vector<Backend> valid_xpu_backends = {};
|
||||
/** Model's valid hardware backends. This member defined all the gpu backends have successfully tested for the model
|
||||
*/
|
||||
std::vector<Backend> valid_rknpu_backends = {};
|
||||
@@ -143,6 +146,7 @@ class FASTDEPLOY_DECL FastDeployModel {
|
||||
bool CreateIpuBackend();
|
||||
bool CreateRKNPUBackend();
|
||||
bool CreateTimVXBackend();
|
||||
bool CreateXPUBackend();
|
||||
|
||||
std::shared_ptr<Runtime> runtime_;
|
||||
bool runtime_initialized_ = false;
|
||||
|
17
fastdeploy/pybind/runtime.cc
Normal file → Executable file
@@ -23,6 +23,7 @@ void BindRuntime(pybind11::module& m) {
      .def("use_gpu", &RuntimeOption::UseGpu)
      .def("use_cpu", &RuntimeOption::UseCpu)
      .def("use_rknpu2", &RuntimeOption::UseRKNPU2)
      .def("use_xpu", &RuntimeOption::UseXpu)
      .def("set_external_stream", &RuntimeOption::SetExternalStream)
      .def("set_cpu_thread_num", &RuntimeOption::SetCpuThreadNum)
      .def("use_paddle_backend", &RuntimeOption::UsePaddleBackend)
@@ -100,7 +101,21 @@ void BindRuntime(pybind11::module& m) {
      .def_readwrite("ipu_available_memory_proportion",
                     &RuntimeOption::ipu_available_memory_proportion)
      .def_readwrite("ipu_enable_half_partial",
                     &RuntimeOption::ipu_enable_half_partial);
                     &RuntimeOption::ipu_enable_half_partial)
      .def_readwrite("xpu_l3_workspace_size",
                     &RuntimeOption::xpu_l3_workspace_size)
      .def_readwrite("xpu_locked",
                     &RuntimeOption::xpu_locked)
      .def_readwrite("xpu_autotune",
                     &RuntimeOption::xpu_autotune)
      .def_readwrite("xpu_autotune_file",
                     &RuntimeOption::xpu_autotune_file)
      .def_readwrite("xpu_precision",
                     &RuntimeOption::xpu_precision)
      .def_readwrite("xpu_adaptive_seqlen",
                     &RuntimeOption::xpu_adaptive_seqlen)
      .def_readwrite("xpu_enable_multi_stream",
                     &RuntimeOption::xpu_enable_multi_stream);

  pybind11::class_<TensorInfo>(m, "TensorInfo")
      .def_readwrite("name", &TensorInfo::name)
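These bindings expose the new XPU knobs directly on the C++ RuntimeOption object, so they can be inspected or adjusted from Python after use_xpu() has been called. A small sketch, assuming access through the underlying pybind object (the Python wrapper stores it as self._option, as the runtime.py hunk further down shows); the cache path is a placeholder:

```python
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_xpu(0)                    # bound to RuntimeOption::UseXpu

raw = option._option                 # underlying pybind11 RuntimeOption
raw.xpu_autotune_file = "./conv_autotune.cache"  # hypothetical cache file
raw.xpu_precision = "int16"
print(raw.xpu_l3_workspace_size, raw.xpu_autotune, raw.xpu_locked)
```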
35
fastdeploy/runtime.cc
Normal file → Executable file
@@ -236,7 +236,26 @@ void RuntimeOption::UseRKNPU2(fastdeploy::rknpu2::CpuName rknpu2_name,
void RuntimeOption::UseTimVX() {
  enable_timvx = true;
  device = Device::TIMVX;
  UseLiteBackend();
}

void RuntimeOption::UseXpu(int xpu_id,
                           int l3_workspace_size,
                           bool locked,
                           bool autotune,
                           const std::string& autotune_file,
                           const std::string& precision,
                           bool adaptive_seqlen,
                           bool enable_multi_stream) {
  enable_xpu = true;
  device_id = xpu_id;
  xpu_l3_workspace_size = l3_workspace_size;
  xpu_locked = locked;
  xpu_autotune = autotune;
  xpu_autotune_file = autotune_file;
  xpu_precision = precision;
  xpu_adaptive_seqlen = adaptive_seqlen;
  xpu_enable_multi_stream = enable_multi_stream;
  device = Device::XPU;
}

void RuntimeOption::SetExternalStream(void* external_stream) {
@@ -532,8 +551,8 @@ bool Runtime::Init(const RuntimeOption& _option) {
    FDINFO << "Runtime initialized with Backend::OPENVINO in "
           << Str(option.device) << "." << std::endl;
  } else if (option.backend == Backend::LITE) {
    FDASSERT(option.device == Device::CPU || option.device == Device::TIMVX,
             "Backend::LITE only supports Device::CPU/Device::TIMVX.");
    FDASSERT(option.device == Device::CPU || option.device == Device::TIMVX || option.device == Device::XPU,
             "Backend::LITE only supports Device::CPU/Device::TIMVX/Device::XPU.");
    CreateLiteBackend();
    FDINFO << "Runtime initialized with Backend::LITE in " << Str(option.device)
           << "." << std::endl;
@@ -784,6 +803,16 @@ void Runtime::CreateLiteBackend() {
  lite_option.nnadapter_subgraph_partition_config_path =
      option.lite_nnadapter_subgraph_partition_config_path;
  lite_option.enable_timvx = option.enable_timvx;
  lite_option.enable_xpu = option.enable_xpu;
  lite_option.device_id = option.device_id;
  lite_option.xpu_l3_workspace_size = option.xpu_l3_workspace_size;
  lite_option.xpu_locked = option.xpu_locked;
  lite_option.xpu_autotune = option.xpu_autotune;
  lite_option.xpu_autotune_file = option.xpu_autotune_file;
  lite_option.xpu_precision = option.xpu_precision;
  lite_option.xpu_adaptive_seqlen = option.xpu_adaptive_seqlen;
  lite_option.xpu_enable_multi_stream = option.xpu_enable_multi_stream;

  FDASSERT(option.model_format == ModelFormat::PADDLE,
           "LiteBackend only support model format of ModelFormat::PADDLE");
  backend_ = utils::make_unique<LiteBackend>();
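With the relaxed FDASSERT, explicitly pairing the Paddle Lite backend with the XPU device is now a legal combination, and CreateLiteBackend() forwards every xpu_* field from RuntimeOption to the Lite backend options. A minimal sketch from the Python side, under the assumption that the wrapper exposes use_lite_backend() like it does for the other backends (if the backend is left unset, CreateXPUBackend() picks Backend::LITE automatically):

```python
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_xpu()            # Device::XPU
option.use_lite_backend()   # Backend::LITE; before this change only CPU/TIMVX passed the assert
```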
@@ -102,6 +102,37 @@ struct FASTDEPLOY_DECL RuntimeOption {
  /// Use TimVX for inference
  void UseTimVX();

  ///
  /// \brief Turn on XPU.
  ///
  /// \param xpu_id the XPU card to use (default is 0).
  /// \param l3_workspace_size The size of the video memory allocated by the l3
  ///         cache, the maximum is 16M.
  /// \param locked Whether the allocated L3 cache can be locked. If false,
  ///         it means that the L3 cache is not locked, and the allocated L3
  ///         cache can be shared by multiple models, and multiple models
  ///         sharing the L3 cache will be executed sequentially on the card.
  /// \param autotune Whether to autotune the conv operator in the model. If
  ///         true, when the conv operator of a certain dimension is executed
  ///         for the first time, it will automatically search for a better
  ///         algorithm to improve the performance of subsequent conv operators
  ///         of the same dimension.
  /// \param autotune_file Specify the path of the autotune file. If
  ///         autotune_file is specified, the algorithm specified in the
  ///         file will be used and autotune will not be performed again.
  /// \param precision Calculation accuracy of multi_encoder
  /// \param adaptive_seqlen Whether the input of multi_encoder is variable length
  /// \param enable_multi_stream Whether to enable the multi stream of XPU.
  ///
  void UseXpu(int xpu_id = 0,
              int l3_workspace_size = 0xfffc00,
              bool locked = false,
              bool autotune = true,
              const std::string& autotune_file = "",
              const std::string& precision = "int16",
              bool adaptive_seqlen = false,
              bool enable_multi_stream = false);

  void SetExternalStream(void* external_stream);

  /*
@@ -354,6 +385,7 @@ struct FASTDEPLOY_DECL RuntimeOption {
  std::string lite_optimized_model_dir = "";
  std::string lite_nnadapter_subgraph_partition_config_path = "";
  bool enable_timvx = false;
  bool enable_xpu = false;

  // ======Only for Trt Backend=======
  std::map<std::string, std::vector<int32_t>> trt_max_shape;
@@ -386,6 +418,15 @@ struct FASTDEPLOY_DECL RuntimeOption {
  fastdeploy::rknpu2::CoreMask rknpu2_core_mask_ =
      fastdeploy::rknpu2::CoreMask::RKNN_NPU_CORE_AUTO;

  // ======Only for XPU Backend=======
  int xpu_l3_workspace_size = 0xfffc00;
  bool xpu_locked = false;
  bool xpu_autotune = true;
  std::string xpu_autotune_file = "";
  std::string xpu_precision = "int16";
  bool xpu_adaptive_seqlen = false;
  bool xpu_enable_multi_stream = false;

  std::string model_file = "";   // Path of model file
  std::string params_file = "";  // Path of parameters file, can be empty
  // format of input model
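One detail worth noting from these defaults: the C++ default of 0xfffc00 for xpu_l3_workspace_size sits just under the documented 16M ceiling, while the Python wrapper further down defaults to exactly 16 MB. A quick check:

```python
# 0xfffc00 is the C++ default for xpu_l3_workspace_size; the Python wrapper
# defaults to 16 * 1024 * 1024. Both respect the documented 16M maximum.
print(0xfffc00)                        # 16776192
print(16 * 1024 * 1024 - 0xfffc00)     # 1024 bytes short of 16 MB
```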
@@ -29,6 +29,7 @@ PaddleClasModel::PaddleClasModel(const std::string& model_file,
                          Backend::LITE};
    valid_gpu_backends = {Backend::ORT, Backend::PDINFER, Backend::TRT};
    valid_timvx_backends = {Backend::LITE};
    valid_xpu_backends = {Backend::LITE};
    valid_ipu_backends = {Backend::PDINFER};
  } else if (model_format == ModelFormat::ONNX) {
    valid_cpu_backends = {Backend::ORT, Backend::OPENVINO};
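Because PaddleClasModel now lists Backend::LITE in valid_xpu_backends, deploying a PaddleClas model on a KunlunXin XPU only requires switching the device on the runtime option; the backend is picked automatically. A minimal sketch (model, config, and image paths are placeholders):

```python
import cv2
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_xpu()  # CreateXPUBackend() will select Backend::LITE for this model

model = fd.vision.classification.PaddleClasModel(
    "ResNet50_vd_infer/inference.pdmodel",
    "ResNet50_vd_infer/inference.pdiparams",
    "ResNet50_vd_infer/inference_cls.yaml",
    runtime_option=option)

result = model.predict(cv2.imread("test.jpg"))
print(result)
```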
@@ -72,10 +72,11 @@ class FASTDEPLOY_DECL RKYOLOPostprocessor {
  }

  // Set Anchor
  void SetAnchor(std::vector<int> anchors,int anchor_per_branch){
  void SetAnchor(std::vector<int> anchors, int anchor_per_branch) {
    anchors_ = anchors;
    anchor_per_branch_ = anchor_per_branch;
  };
  }

 private:
  std::vector<int> anchors_ = {10, 13, 16, 30, 33, 23, 30, 61, 62,
                               45, 59, 119, 116, 90, 156, 198, 373, 326};
@@ -31,4 +31,4 @@ int NMS(int valid_count, std::vector<float>& output_locations,

} // namespace detection
} // namespace vision
} // namespace fastdeploy
} // namespace fastdeploy
@@ -27,6 +27,7 @@ YOLOv5::YOLOv5(const std::string& model_file, const std::string& params_file,
  } else {
    valid_cpu_backends = {Backend::PDINFER, Backend::ORT, Backend::LITE};
    valid_gpu_backends = {Backend::PDINFER, Backend::ORT, Backend::TRT};
    valid_xpu_backends = {Backend::LITE};
    valid_timvx_backends = {Backend::LITE};
  }
  runtime_option = custom_option;
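The same pattern applies to the vision YOLOv5 wrapper: for a Paddle-format model the Lite backend is registered for both TIMVX and XPU, so detection on an XPU is again just a device switch. A hedged sketch (file names are placeholders; model_format must be PADDLE to hit this branch):

```python
import cv2
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_xpu()

model = fd.vision.detection.YOLOv5(
    "yolov5s_infer/model.pdmodel",
    "yolov5s_infer/model.pdiparams",
    runtime_option=option,
    model_format=fd.ModelFormat.PADDLE)

im = cv2.imread("test.jpg")
print(model.predict(im))
```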
@@ -185,7 +185,7 @@ class FASTDEPLOY_DECL PaddleYOLOv5 : public PPDetBase {
                       const ModelFormat& model_format = ModelFormat::PADDLE)
      : PPDetBase(model_file, params_file, config_file, custom_option,
                  model_format) {
    valid_cpu_backends = {Backend::ORT,Backend::PDINFER};
    valid_cpu_backends = {Backend::ORT, Backend::PDINFER};
    valid_gpu_backends = {Backend::ORT, Backend::PDINFER, Backend::TRT};
    initialized = Initialize();
  }
@@ -201,7 +201,7 @@ class FASTDEPLOY_DECL PaddleYOLOv6 : public PPDetBase {
                       const ModelFormat& model_format = ModelFormat::PADDLE)
      : PPDetBase(model_file, params_file, config_file, custom_option,
                  model_format) {
    valid_cpu_backends = {Backend::OPENVINO, Backend::ORT,Backend::PDINFER};
    valid_cpu_backends = {Backend::OPENVINO, Backend::ORT, Backend::PDINFER};
    valid_gpu_backends = {Backend::ORT, Backend::PDINFER, Backend::TRT};
    initialized = Initialize();
  }
@@ -217,7 +217,7 @@ class FASTDEPLOY_DECL PaddleYOLOv7 : public PPDetBase {
                       const ModelFormat& model_format = ModelFormat::PADDLE)
      : PPDetBase(model_file, params_file, config_file, custom_option,
                  model_format) {
    valid_cpu_backends = {Backend::ORT,Backend::PDINFER};
    valid_cpu_backends = {Backend::ORT, Backend::PDINFER};
    valid_gpu_backends = {Backend::ORT, Backend::PDINFER, Backend::TRT};
    initialized = Initialize();
  }
@@ -245,6 +245,34 @@ class RuntimeOption:
            return
        return self._option.use_gpu(device_id)

    def use_xpu(self,
                device_id=0,
                l3_workspace_size=16 * 1024 * 1024,
                locked=False,
                autotune=True,
                autotune_file="",
                precision="int16",
                adaptive_seqlen=False,
                enable_multi_stream=False):
        """Inference with XPU

        :param device_id: (int)The index of the XPU that will be used for inference, default 0
        :param l3_workspace_size: (int)The size of the video memory allocated by the l3 cache, the maximum is 16M, default 16M
        :param locked: (bool)Whether the allocated L3 cache can be locked. If false, it means that the L3 cache is not locked,
                       and the allocated L3 cache can be shared by multiple models, and multiple models sharing the L3 cache
                       will be executed sequentially on the card
        :param autotune: (bool)Whether to autotune the conv operator in the model.
                         If true, when the conv operator of a certain dimension is executed for the first time,
                         it will automatically search for a better algorithm to improve the performance of subsequent conv operators of the same dimension.
        :param autotune_file: (str)Specify the path of the autotune file. If autotune_file is specified,
                              the algorithm specified in the file will be used and autotune will not be performed again.
        :param precision: (str)Calculation accuracy of multi_encoder
        :param adaptive_seqlen: (bool)Whether the input of multi_encoder is variable length
        :param enable_multi_stream: (bool)Whether to enable the multi stream of XPU.
        """
        return self._option.use_xpu(device_id, l3_workspace_size, locked,
                                    autotune, autotune_file, precision,
                                    adaptive_seqlen, enable_multi_stream)

    def use_cpu(self):
        """Inference with CPU
        """
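When the defaults are not enough, every argument of use_xpu() can be set explicitly; the values land on the RuntimeOption and are copied into the Lite backend options at runtime. A hedged example (the autotune cache path is a placeholder):

```python
import fastdeploy as fd

option = fd.RuntimeOption()
option.use_xpu(device_id=0,
               l3_workspace_size=16 * 1024 * 1024,     # documented 16M maximum
               locked=False,
               autotune=True,
               autotune_file="./conv_autotune.cache",  # reuse a previous autotune result
               precision="int16",
               adaptive_seqlen=False,
               enable_multi_stream=False)
```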
@@ -65,11 +65,13 @@ setup_configs["ENABLE_POROS_BACKEND"] = os.getenv("ENABLE_POROS_BACKEND",
                                                  "OFF")
setup_configs["ENABLE_TRT_BACKEND"] = os.getenv("ENABLE_TRT_BACKEND", "OFF")
setup_configs["ENABLE_LITE_BACKEND"] = os.getenv("ENABLE_LITE_BACKEND", "OFF")
setup_configs["PADDLELITE_URL"] = os.getenv("PADDLELITE_URL", "OFF")
setup_configs["ENABLE_VISION"] = os.getenv("ENABLE_VISION", "OFF")
setup_configs["ENABLE_FLYCV"] = os.getenv("ENABLE_FLYCV", "OFF")
setup_configs["ENABLE_TEXT"] = os.getenv("ENABLE_TEXT", "OFF")
setup_configs["WITH_GPU"] = os.getenv("WITH_GPU", "OFF")
setup_configs["WITH_IPU"] = os.getenv("WITH_IPU", "OFF")
setup_configs["WITH_XPU"] = os.getenv("WITH_XPU", "OFF")
setup_configs["BUILD_ON_JETSON"] = os.getenv("BUILD_ON_JETSON", "OFF")
setup_configs["TRT_DIRECTORY"] = os.getenv("TRT_DIRECTORY", "UNDEFINED")
setup_configs["CUDA_DIRECTORY"] = os.getenv("CUDA_DIRECTORY",
@@ -78,10 +80,12 @@ setup_configs["LIBRARY_NAME"] = PACKAGE_NAME
setup_configs["PY_LIBRARY_NAME"] = PACKAGE_NAME + "_main"
setup_configs["OPENCV_DIRECTORY"] = os.getenv("OPENCV_DIRECTORY", "")
setup_configs["ORT_DIRECTORY"] = os.getenv("ORT_DIRECTORY", "")
setup_configs["PADDLEINFERENCE_DIRECTORY"] = os.getenv("PADDLEINFERENCE_DIRECTORY", "")
setup_configs["PADDLEINFERENCE_DIRECTORY"] = os.getenv(
    "PADDLEINFERENCE_DIRECTORY", "")

setup_configs["RKNN2_TARGET_SOC"] = os.getenv("RKNN2_TARGET_SOC", "")
if setup_configs["RKNN2_TARGET_SOC"] != "" or setup_configs["BUILD_ON_JETSON"] != "OFF":
if setup_configs["RKNN2_TARGET_SOC"] != "" or setup_configs[
        "BUILD_ON_JETSON"] != "OFF":
    REQUIRED_PACKAGES = REQUIRED_PACKAGES.replace("opencv-python", "")

if setup_configs["WITH_GPU"] == "ON" or setup_configs[
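The new WITH_XPU switch is read from the environment when the Python package is built, alongside ENABLE_LITE_BACKEND and the other flags above. A hedged sketch of driving such a build from Python, assuming it is run from the directory that contains this setup.py:

```python
import os
import subprocess

# Hedged sketch: build the FastDeploy wheel with KunlunXin XPU support enabled.
env = dict(os.environ,
           ENABLE_LITE_BACKEND="ON",   # XPU inference runs through Paddle Lite
           ENABLE_VISION="ON",
           WITH_XPU="ON")
subprocess.run(["python", "setup.py", "bdist_wheel"], check=True, env=env)
```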