diff --git a/.gitignore b/.gitignore
index f94e8f7cc..b7c91af77 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,3 +162,5 @@ custom_ops/tmp*
 build
 .ccls-cache
+
+third_party
diff --git a/custom_ops/xpu_ops/src/download_dependencies.sh b/custom_ops/xpu_ops/src/download_dependencies.sh
new file mode 100644
index 000000000..74cae9f3c
--- /dev/null
+++ b/custom_ops/xpu_ops/src/download_dependencies.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+if [ $# -ne 1 ] || { [ "$1" != "stable" ] && [ "$1" != "develop" ]; }; then
+    echo "Usage: $0 <stable|develop>"
+    exit 1
+fi
+
+SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+THIRDPARTY_DIR="$SCRIPT_DIR/third_party"
+
+rm -rf "$THIRDPARTY_DIR"
+mkdir -p "$THIRDPARTY_DIR" || exit 1
+
+if [ "$1" == "stable" ]; then
+    version_xvllm="20250710"
+    version_xtdk="3.2.40.1"
+else
+    version_xvllm="latest"
+    version_xtdk="latest"
+fi
+
+(
+    cd "$THIRDPARTY_DIR" || exit 1
+
+    # Clean previous installation
+    rm -rf output* xvllm* xtdk-llvm* output.tar.gz xtdk-llvm*tar.gz
+
+    # Download and install xvllm
+    if ! wget "https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/${version_xvllm}/output.tar.gz"; then
+        echo "Error downloading xvllm"
+        exit 2
+    fi
+    tar -zxf output.tar.gz && mv output xvllm && rm output.tar.gz
+
+    # Download and install xtdk
+    if ! wget "https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/${version_xtdk}/xtdk-llvm15-ubuntu2004_x86_64.tar.gz"; then
+        echo "Error downloading xtdk"
+        exit 3
+    fi
+    tar -zxf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && \
+        mv xtdk-llvm15-ubuntu2004_x86_64 xtdk && \
+        rm xtdk-llvm15-ubuntu2004_x86_64.tar.gz
+)
+
+if [ $? -ne 0 ]; then
+    echo "Installation failed"
+    exit 4
+fi
+
+echo "Installation completed in: $THIRDPARTY_DIR"
+echo "You can set environment variables as follows to use XVLLM and XTDK:"
+echo "    export CLANG_PATH=$THIRDPARTY_DIR/xtdk"
+echo "    export XVLLM_PATH=$THIRDPARTY_DIR/xvllm"
+echo ""
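+
+# Example usage (the toolchains are always unpacked into <this script's dir>/third_party,
+# so the script can be invoked from any working directory):
+#   bash custom_ops/xpu_ops/src/download_dependencies.sh stable    # pinned versions
+#   bash custom_ops/xpu_ops/src/download_dependencies.sh develop   # latest daily builds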
diff --git a/dockerfiles/Dockerfile.xpu b/dockerfiles/Dockerfile.xpu
index 9bc4bf816..66971690f 100644
--- a/dockerfiles/Dockerfile.xpu
+++ b/dockerfiles/Dockerfile.xpu
@@ -17,23 +17,18 @@ RUN python -m pip uninstall paddlepaddle-gpu paddlepaddle-xpu -y
 
 # install paddlepaddle
 RUN python -m pip install --no-cache-dir --progress-bar off paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
+COPY . /workspace/FastDeploy
+
 # get xtdk and xvllm and xre
-RUN mkdir -p /workspace/deps && cd /workspace/deps && wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250710/output.tar.gz && \
-    tar -zxf output.tar.gz && mv output xvllm && \
-    wget https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/3.2.40.1/xtdk-llvm15-ubuntu2004_x86_64.tar.gz && \
-    tar -zxf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && mv xtdk-llvm15-ubuntu2004_x86_64 xtdk && \
+RUN mkdir -p /workspace/deps && cd /workspace/deps && \
     wget https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.21.21/xre-Linux-x86_64-5.0.21.21.tar.gz && \
-    tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre
+    tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre && \
+    cd /workspace/FastDeploy && bash custom_ops/xpu_ops/src/download_dependencies.sh stable
 
 ENV PATH=/workspace/deps/xre/bin:$PATH
-ENV CLANG_PATH=/workspace/deps/xtdk
-ENV XVLLM_PATH=/workspace/deps/xvllm
+ENV CLANG_PATH=/workspace/FastDeploy/custom_ops/xpu_ops/src/third_party/xtdk
+ENV XVLLM_PATH=/workspace/FastDeploy/custom_ops/xpu_ops/src/third_party/xvllm
 
-ENV OPENBLAS_NUM_THREADS=1
-ENV OMP_NUM_THREADS=1
-ENV MKL_NUM_THREADS=1
-USER root
-COPY . /workspace/FastDeploy
 
 # build and install FastDeploy
 RUN cd /workspace/FastDeploy && bash build.sh && python -m pip install --no-cache-dir dist/* && rm -rf /workspace/FastDeploy
diff --git a/docs/get_started/installation/kunlunxin_xpu.md b/docs/get_started/installation/kunlunxin_xpu.md
index 7e6c98984..fba7a123d 100644
--- a/docs/get_started/installation/kunlunxin_xpu.md
+++ b/docs/get_started/installation/kunlunxin_xpu.md
@@ -72,32 +72,36 @@ Alternatively, you can install the latest version of PaddlePaddle (Not recommend
 python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
 ```
 
-### Download Kunlunxin Toolkit (XTDK) and XVLLM library, then set their paths.
+### Download FastDeploy source code, check out the stable branch/TAG
 
 ```bash
-# XTDK
-wget https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/3.2.40.1/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-tar -xvf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && mv xtdk-llvm15-ubuntu2004_x86_64 xtdk
-export CLANG_PATH=$(pwd)/xtdk
+git clone https://github.com/PaddlePaddle/FastDeploy
+cd FastDeploy
+git checkout <stable branch/TAG>
+```
 
-# XVLLM
-wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250624/output.tar.gz
-tar -xvf output.tar.gz && mv output xvllm
-export XVLLM_PATH=$(pwd)/xvllm
+### Download Kunlunxin Compilation Dependencies
+
+```bash
+bash custom_ops/xpu_ops/src/download_dependencies.sh stable
 ```
 
 Alternatively, you can download the latest versions of XTDK and XVLLM (Not recommended)
 
 ```bash
-XTDK: https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/latest/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-XVLLM: https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/latest/output.tar.gz
+bash custom_ops/xpu_ops/src/download_dependencies.sh develop
 ```
 
-### Download FastDeploy source code, checkout the stable branch/TAG, then compile and install.
+Set the environment variables:
+
+```bash
+export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xtdk
+export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xvllm
+```
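+
+Optionally, sanity-check that both paths exist before compiling (a quick check, assuming the default `third_party` layout created by `download_dependencies.sh`):
+
+```bash
+# Both directories must exist, otherwise the build will not find the toolchain
+ls "$CLANG_PATH" "$XVLLM_PATH" > /dev/null && echo "XTDK and XVLLM paths look good"
+```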
+
+### Compile and Install
 
 ```bash
-git clone https://github.com/PaddlePaddle/FastDeploy
-cd FastDeploy
 bash build.sh
 ```
@@ -114,106 +118,5 @@ python -c "from fastdeploy.model_executor.ops.xpu import block_attn"
 
 If all the above steps execute successfully, FastDeploy is installed correctly.
 
-## Quick start
-
-The P800 supports the deployment of the ```ERNIE-4.5-300B-A47B-Paddle``` model using the following configurations (Note: Different configurations may result in variations in performance).
-- 32K WINT4 with 8 XPUs (Recommended)
-- 128K WINT4 with 8 XPUs
-- 32K WINT4 with 4 XPUs
-
-### Online serving (OpenAI API-Compatible server)
-
-Deploy an OpenAI API-compatible server using FastDeploy with the following commands:
-
-#### Start service
-
-**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 32K context length on 8 XPUs(Recommended)**
-
-```bash
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 8 \
-    --max-model-len 32768 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
-```
-
-**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 128K context length on 8 XPUs**
-
-```bash
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 8 \
-    --max-model-len 131072 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
-```
-
-**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 32K context length on 4 XPUs**
-
-```bash
-export XPU_VISIBLE_DEVICES="0,1,2,3" # Specify which cards to be used
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 4 \
-    --max-model-len 32768 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
-```
-
-**Note:** When deploying on 4 XPUs, only two configurations are supported which constrained by hardware limitations such as interconnect capabilities.
-`export XPU_VISIBLE_DEVICES="0,1,2,3"`
-or
-`export XPU_VISIBLE_DEVICES="4,5,6,7"`
-
-Refer to [Parameters](../../parameters.md) for more options.
-
-#### Send requests
-
-Send requests using either curl or Python
-
-```bash
-curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
--H "Content-Type: application/json" \
--d '{
-    "messages": [
-        {"role": "user", "content": "Where is the capital of China?"}
-    ]
-}'
-```
-
-```python
-import openai
-host = "0.0.0.0"
-port = "8188"
-client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
-
-response = client.completions.create(
-    model="null",
-    prompt="Where is the capital of China?",
-    stream=True,
-)
-for chunk in response:
-    print(chunk.choices[0].text, end='')
-print('\n')
-
-response = client.chat.completions.create(
-    model="null",
-    messages=[
-        {"role": "user", "content": "Where is the capital of China?"},
-    ],
-    stream=True,
-)
-for chunk in response:
-    if chunk.choices[0].delta:
-        print(chunk.choices[0].delta.content, end='')
-print('\n')
-```
-
-For detailed OpenAI protocol specifications, see [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../../online_serving/README.md).
+## How to deploy services on Kunlunxin XPU
+Refer to [**Supported Models and Service Deployment**](../../usage/kunlunxin_xpu_deployment.md) for details about the supported models and how to deploy services on Kunlunxin XPU.
diff --git a/docs/usage/kunlunxin_xpu_deployment.md b/docs/usage/kunlunxin_xpu_deployment.md
new file mode 100644
index 000000000..fdebe4663
--- /dev/null
+++ b/docs/usage/kunlunxin_xpu_deployment.md
@@ -0,0 +1,92 @@
+## Supported Models
+|Model Name|Context Length|Quantization|XPUs Required|Deployment Commands|
+|-|-|-|-|-|
+|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|32K|WINT4|4 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 4 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|128K|WINT4|8 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 131072 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|
+
+## Quick start
+
+### Online serving (OpenAI API-Compatible server)
+
+Deploy an OpenAI API-compatible server using FastDeploy with the following commands:
+
+#### Start service
+
+**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 32K context length on 4 XPUs**
+
+```bash
+export XPU_VISIBLE_DEVICES="0,1,2,3" # Specify which cards to be used
+python -m fastdeploy.entrypoints.openai.api_server \
+    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
+    --port 8188 \
+    --tensor-parallel-size 4 \
+    --max-model-len 32768 \
+    --max-num-seqs 64 \
+    --quantization "wint4" \
+    --gpu-memory-utilization 0.9
+```
+
+**Note:** When deploying on 4 XPUs, only the following two configurations are supported, constrained by hardware limitations such as interconnect capabilities:
+`export XPU_VISIBLE_DEVICES="0,1,2,3"`
+or
+`export XPU_VISIBLE_DEVICES="4,5,6,7"`
+
+Refer to [Parameters](../parameters.md) for more options.
+
+All supported models can be found in the *Supported Models* section above.
+
+#### Send requests
+
+Send requests using either curl or Python:
+
+```bash
+curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-d '{
+    "messages": [
+        {"role": "user", "content": "Where is the capital of China?"}
+    ]
+}'
+```
+
+```python
+import openai
+host = "0.0.0.0"
+port = "8188"
+client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
+
+response = client.completions.create(
+    model="null",
+    prompt="Where is the capital of China?",
+    stream=True,
+)
+for chunk in response:
+    print(chunk.choices[0].text, end='')
+print('\n')
+
+response = client.chat.completions.create(
+    model="null",
+    messages=[
+        {"role": "user", "content": "Where is the capital of China?"},
+    ],
+    stream=True,
+)
+for chunk in response:
+    if chunk.choices[0].delta:
+        print(chunk.choices[0].delta.content, end='')
+print('\n')
+```
+
+For detailed OpenAI protocol specifications, see the [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../online_serving/README.md).
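+
+If you prefer a single, non-streamed reply, the same chat endpoint can be called without streaming. A minimal sketch, reusing the placeholder host, port, and dummy model/key values from the examples above:
+
+```python
+import openai
+
+client = openai.Client(base_url="http://0.0.0.0:8188/v1", api_key="null")
+
+# With stream=False the client returns one complete ChatCompletion object
+response = client.chat.completions.create(
+    model="null",
+    messages=[
+        {"role": "user", "content": "Where is the capital of China?"},
+    ],
+    stream=False,
+)
+print(response.choices[0].message.content)
+```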
diff --git a/docs/zh/get_started/installation/kunlunxin_xpu.md b/docs/zh/get_started/installation/kunlunxin_xpu.md
index 40721276e..9c6ae22bc 100644
--- a/docs/zh/get_started/installation/kunlunxin_xpu.md
+++ b/docs/zh/get_started/installation/kunlunxin_xpu.md
@@ -72,33 +72,37 @@ python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn
 python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
 ```
 
-### 下载昆仑编译套件 XTDK 和 XVLLM 预编译算子库并设置路径
-
-```bash
-# XTDK
-wget https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/3.2.40.1/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-tar -xvf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && mv xtdk-llvm15-ubuntu2004_x86_64 xtdk
-export CLANG_PATH=$(pwd)/xtdk
-
-# XVLLM
-wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250624/output.tar.gz
-tar -xvf output.tar.gz && mv output xvllm
-export XVLLM_PATH=$(pwd)/xvllm
-```
-
-或者你也可以下载最新版 XTDK 和 XVLLM(不推荐)
-
-```bash
-XTDK: https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/latest/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-XVLLM: https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/latest/output.tar.gz
-```
-
-### 下载 FastDelpoy 源码,切换到稳定分支或 TAG,开始编译并安装:
+### 下载 FastDeploy 源码,切换到稳定分支或 TAG
 
 ```bash
 git clone https://github.com/PaddlePaddle/FastDeploy
 git checkout <稳定分支或 TAG>
 cd FastDeploy
+```
+
+### 下载昆仑编译依赖
+
+```bash
+bash custom_ops/xpu_ops/src/download_dependencies.sh stable
+```
+
+或者你也可以下载最新版编译依赖(不推荐)
+
+```bash
+bash custom_ops/xpu_ops/src/download_dependencies.sh develop
+```
+
+设置环境变量:
+
+```bash
+export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xtdk
+export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xvllm
+```
+
+### 开始编译并安装:
 
+```bash
 bash build.sh
 ```
@@ -115,106 +119,5 @@ python -c "from fastdeploy.model_executor.ops.xpu import block_attn"
 
 如果上述步骤均执行成功,代表 FastDeploy 已安装成功。
 
-## 快速开始
-
-P800 支持 ```ERNIE-4.5-300B-A47B-Paddle``` 模型采用以下配置部署(注意:不同配置在效果、性能上可能存在差异)。
-- 32K WINT4 8 卡(推荐)
-- 128K WINT4 8 卡
-- 32K WINT4 4 卡
-
-### OpenAI 兼容服务器
-
-您还可以通过如下命令,基于 FastDeploy 实现 OpenAI API 协议兼容的服务器部署。
-
-#### 启动服务
-
-**基于 WINT4 精度和 32K 上下文部署 ERNIE-4.5-300B-A47B-Paddle 模型到 8 卡 P800 服务器(推荐)**
-
-```bash
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 8 \
-    --max-model-len 32768 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
-```
-
-**基于 WINT4 精度和 128K 上下文部署 ERNIE-4.5-300B-A47B-Paddle 模型到 8 卡 P800 服务器**
-
-```bash
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 8 \
-    --max-model-len 131072 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
-```
-
-**基于 WINT4 精度和 32K 上下文部署 ERNIE-4.5-300B-A47B-Paddle 模型到 4 卡 P800 服务器**
-
-```bash
-export XPU_VISIBLE_DEVICES="0,1,2,3" # 设置使用的 XPU 卡
-python -m fastdeploy.entrypoints.openai.api_server \
-    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
-    --port 8188 \
-    --tensor-parallel-size 4 \
-    --max-model-len 32768 \
-    --max-num-seqs 64 \
-    --quantization "wint4" \
-    --gpu-memory-utilization 0.9
-```
-
-**注意:** 使用 P800 在 4 块 XPU 上进行部署时,由于受到卡间互联拓扑等硬件限制,仅支持以下两种配置方式:
-`export XPU_VISIBLE_DEVICES="0,1,2,3"`
-or
-`export XPU_VISIBLE_DEVICES="4,5,6,7"`
-
-更多参数可以参考 [参数说明](../../parameters.md)。
-
-#### 请求服务
-
-您可以基于 OpenAI 协议,通过 curl 和 python 两种方式请求服务。
-
-```bash
-curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
--H "Content-Type: application/json" \
--d '{
-    "messages": [
-        {"role": "user", "content": "Where is the capital of China?"}
-    ]
-}'
-```
-
-```python
-import openai
-host = "0.0.0.0"
-port = "8188"
-client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
-
-response = client.completions.create(
-    model="null",
-    prompt="Where is the capital of China?",
-    stream=True,
-)
-for chunk in response:
-    print(chunk.choices[0].text, end='')
-print('\n')
-
-response = client.chat.completions.create(
-    model="null",
-    messages=[
-        {"role": "user", "content": "Where is the capital of China?"},
-    ],
-    stream=True,
-)
-for chunk in response:
-    if chunk.choices[0].delta:
-        print(chunk.choices[0].delta.content, end='')
-print('\n')
-```
-
-OpenAI 协议的更多说明可参考文档 [OpenAI Chat Compeltion API](https://platform.openai.com/docs/api-reference/chat/create),以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../../online_serving/README.md)。
+## 如何在昆仑芯 XPU 上部署服务
+请参考 [**支持的模型与服务部署**](../../usage/kunlunxin_xpu_deployment.md) 以了解昆仑芯 XPU 支持的模型与服务部署方法。
diff --git a/docs/zh/usage/kunlunxin_xpu_deployment.md b/docs/zh/usage/kunlunxin_xpu_deployment.md
new file mode 100644
index 000000000..0dfe8680f
--- /dev/null
+++ b/docs/zh/usage/kunlunxin_xpu_deployment.md
@@ -0,0 +1,92 @@
+## 支持的模型
+|模型名|上下文长度|量化|所需卡数|部署命令|
+|-|-|-|-|-|
+|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|32K|WINT4|4(推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3" 或 "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 4 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 32768 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-300B-A47B|128K|WINT4|8(推荐)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 8 \<br>--max-model-len 131072 \<br>--max-num-seqs 64 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint4" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 32768 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--gpu-memory-utilization 0.9|
+|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # 指定任意一张卡<br>python -m fastdeploy.entrypoints.openai.api_server \<br>--model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \<br>--port 8188 \<br>--tensor-parallel-size 1 \<br>--max-model-len 131072 \<br>--max-num-seqs 128 \<br>--quantization "wint8" \<br>--gpu-memory-utilization 0.9|
+
+## 快速开始
+
+### OpenAI 兼容服务器
+
+您还可以通过如下命令,基于 FastDeploy 实现 OpenAI API 协议兼容的服务器部署。
+
+#### 启动服务
+
+**基于 WINT4 精度和 32K 上下文部署 ERNIE-4.5-300B-A47B-Paddle 模型到 4 卡 P800 服务器**
+
+```bash
+export XPU_VISIBLE_DEVICES="0,1,2,3" # 设置使用的 XPU 卡
+python -m fastdeploy.entrypoints.openai.api_server \
+    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
+    --port 8188 \
+    --tensor-parallel-size 4 \
+    --max-model-len 32768 \
+    --max-num-seqs 64 \
+    --quantization "wint4" \
+    --gpu-memory-utilization 0.9
+```
+
+**注意:** 使用 P800 在 4 块 XPU 上进行部署时,由于受到卡间互联拓扑等硬件限制,仅支持以下两种配置方式:
+`export XPU_VISIBLE_DEVICES="0,1,2,3"`
+或
+`export XPU_VISIBLE_DEVICES="4,5,6,7"`
+
+更多参数可以参考 [参数说明](../parameters.md)。
+
+全部支持的模型可以在上方的 *支持的模型* 章节找到。
+
+#### 请求服务
+
+您可以基于 OpenAI 协议,通过 curl 和 python 两种方式请求服务。
+
+```bash
+curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-d '{
+    "messages": [
+        {"role": "user", "content": "Where is the capital of China?"}
+    ]
+}'
+```
+
+```python
+import openai
+host = "0.0.0.0"
+port = "8188"
+client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")
+
+response = client.completions.create(
+    model="null",
+    prompt="Where is the capital of China?",
+    stream=True,
+)
+for chunk in response:
+    print(chunk.choices[0].text, end='')
+print('\n')
+
+response = client.chat.completions.create(
+    model="null",
+    messages=[
+        {"role": "user", "content": "Where is the capital of China?"},
+    ],
+    stream=True,
+)
+for chunk in response:
+    if chunk.choices[0].delta:
+        print(chunk.choices[0].delta.content, end='')
+print('\n')
+```
+
+OpenAI 协议的更多说明可参考文档 [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create),以及与 OpenAI 协议的区别可以参考 [兼容 OpenAI 协议的服务化部署](../online_serving/README.md)。
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index 968a959a7..0d3329c1d 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -774,7 +774,6 @@ class XPUModelRunner(ModelRunnerBase):
             del self.share_inputs["caches"]
         if self.forward_meta is not None:
             del self.forward_meta.caches
-        del self.share_inputs["block_tables"]
         paddle.device.xpu.empty_cache()
 
     def cal_theortical_kvcache(self):
@@ -817,11 +816,6 @@ class XPUModelRunner(ModelRunnerBase):
         # Reset block table and kv cache with global block num
         self.initialize_kv_cache()
 
-        self.share_inputs["block_tables"] = paddle.full(
-            [self.parallel_config.max_num_seqs, self.num_gpu_blocks],
-            -1,
-            dtype="int32")
-
         # Reset free list
         free_list = list(
             range(