[XPU] Update doc and add scripts for downloading dependencies (#2845)

* [XPU] update xvllm download
* update supported models
* fix xpu model runner in huge memory with small model
* update doc
.gitignore (+2 lines)

@@ -162,3 +162,5 @@ custom_ops/tmp*
 build
 .ccls-cache
+third_party
custom_ops/xpu_ops/src/download_dependencies.sh (new file, +54 lines)

@@ -0,0 +1,54 @@
#!/bin/bash

if [ $# -ne 1 ] || { [ "$1" != "stable" ] && [ "$1" != "develop" ]; }; then
    echo "Usage: $0 <stable|develop>"
    exit 1
fi

SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
THIRDPARTY_DIR="$SCRIPT_DIR/third_party"

rm -rf "$THIRDPARTY_DIR"
mkdir -p "$THIRDPARTY_DIR" || exit 1

if [ "$1" == "stable" ]; then
    version_xvllm="20250710"
    version_xtdk="3.2.40.1"
else
    version_xvllm="latest"
    version_xtdk="latest"
fi

(
    cd "$THIRDPARTY_DIR" || exit 1

    # Clean previous installation
    rm -rf output* xvllm* xtdk-llvm* output.tar.gz xtdk-llvm*tar.gz

    # Download and install xvllm
    if ! wget "https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/${version_xvllm}/output.tar.gz"; then
        echo "Error downloading xvllm"
        exit 2
    fi
    tar -zxf output.tar.gz && mv output xvllm && rm output.tar.gz

    # Download and install xtdk
    if ! wget "https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/${version_xtdk}/xtdk-llvm15-ubuntu2004_x86_64.tar.gz"; then
        echo "Error downloading xtdk"
        exit 3
    fi
    tar -zxf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && \
        mv xtdk-llvm15-ubuntu2004_x86_64 xtdk && \
        rm xtdk-llvm15-ubuntu2004_x86_64.tar.gz
)

if [ $? -ne 0 ]; then
    echo "Installation failed"
    exit 4
fi

echo "Installation completed in: $THIRDPARTY_DIR"
echo "You can set environment variables as follows to use XVLLM and XTDK:"
echo "  export CLANG_PATH=$THIRDPARTY_DIR/xtdk"
echo "  export XVLLM_PATH=$THIRDPARTY_DIR/xvllm"
echo ""
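A quick usage sketch (run from the repository root; the paths shown are the ones the script computes for itself, with `<repo>` standing in for the checkout location):

```bash
$ bash custom_ops/xpu_ops/src/download_dependencies.sh stable
...
Installation completed in: <repo>/custom_ops/xpu_ops/src/third_party
You can set environment variables as follows to use XVLLM and XTDK:
  export CLANG_PATH=<repo>/custom_ops/xpu_ops/src/third_party/xtdk
  export XVLLM_PATH=<repo>/custom_ops/xpu_ops/src/third_party/xvllm
```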
Dockerfile (XPU image):

@@ -17,23 +17,18 @@ RUN python -m pip uninstall paddlepaddle-gpu paddlepaddle-xpu -y
 # install paddlepaddle
 RUN python -m pip install --no-cache-dir --progress-bar off paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn/packages/stable/xpu-p800/
 
+COPY . /workspace/FastDeploy
+
 # get xtdk and xvllm and xre
-RUN mkdir -p /workspace/deps && cd /workspace/deps && wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250710/output.tar.gz && \
-    tar -zxf output.tar.gz && mv output xvllm && \
-    wget https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/3.2.40.1/xtdk-llvm15-ubuntu2004_x86_64.tar.gz && \
-    tar -zxf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && mv xtdk-llvm15-ubuntu2004_x86_64 xtdk && \
+RUN mkdir -p /workspace/deps && cd /workspace/deps && \
     wget https://klx-sdk-release-public.su.bcebos.com/xre/kl3-release/5.0.21.21/xre-Linux-x86_64-5.0.21.21.tar.gz && \
-    tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre
+    tar -zxf xre-Linux-x86_64-5.0.21.21.tar.gz && mv xre-Linux-x86_64-5.0.21.21 xre && \
+    cd /workspace/FastDeploy && bash custom_ops/xpu_ops/src/download_dependencies.sh stable
 
 ENV PATH=/workspace/deps/xre/bin:$PATH
-ENV CLANG_PATH=/workspace/deps/xtdk
-ENV XVLLM_PATH=/workspace/deps/xvllm
-
-ENV OPENBLAS_NUM_THREADS=1
-ENV OMP_NUM_THREADS=1
-ENV MKL_NUM_THREADS=1
-
-USER root
-
-COPY . /workspace/FastDeploy
+ENV CLANG_PATH=/workspace/FastDeploy/custom_ops/xpu_ops/src/third_party/xtdk
+ENV XVLLM_PATH=/workspace/FastDeploy/custom_ops/xpu_ops/src/third_party/xvllm
 
 # build and install FastDeploy
 RUN cd /workspace/FastDeploy && bash build.sh && python -m pip install --no-cache-dir dist/* && rm -rf /workspace/FastDeploy
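Not shown in the diff, but useful for checking the new image flow end to end; the Dockerfile path and image tag below are assumptions, not taken from this commit:

```bash
# Assumed file location and tag -- adjust to your checkout.
docker build -f dockerfiles/Dockerfile.xpu -t fastdeploy:xpu .
```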
Kunlunxin XPU installation guide (English):

@@ -72,32 +72,36 @@ Alternatively, you can install the latest version of PaddlePaddle (Not recommend
 python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
 ```
 
-### Download Kunlunxin Toolkit (XTDK) and XVLLM library, then set their paths.
+### Download FastDeploy source code, checkout the stable branch/TAG
 
 ```bash
-# XTDK
-wget https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/3.2.40.1/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-tar -xvf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && mv xtdk-llvm15-ubuntu2004_x86_64 xtdk
-export CLANG_PATH=$(pwd)/xtdk
-
-# XVLLM
-wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250624/output.tar.gz
-tar -xvf output.tar.gz && mv output xvllm
-export XVLLM_PATH=$(pwd)/xvllm
+git clone https://github.com/PaddlePaddle/FastDeploy
+git checkout <tag or branch>
+cd FastDeploy
 ```
 
+### Download Kunlunxin Compilation Dependency
+
+```bash
+bash custom_ops/xpu_ops/src/download_dependencies.sh stable
+```
+
 Alternatively, you can download the latest versions of XTDK and XVLLM (Not recommended)
 
 ```bash
-XTDK: https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/latest/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-XVLLM: https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/latest/output.tar.gz
+bash custom_ops/xpu_ops/src/download_dependencies.sh develop
 ```
 
-### Download FastDeploy source code, checkout the stable branch/TAG, then compile and install.
+Set environment variables:
+
+```bash
+export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xtdk
+export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xvllm
+```
+
+### Compile and Install.
 
 ```bash
-git clone https://github.com/PaddlePaddle/FastDeploy
-cd FastDeploy
 bash build.sh
 ```
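Read together, the added doc text reduces the whole setup to a single flow (a restatement of the steps above, not an extra doc section):

```bash
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy                     # optionally: git checkout <tag or branch>
bash custom_ops/xpu_ops/src/download_dependencies.sh stable
export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xtdk
export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xvllm
bash build.sh
```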
@@ -114,106 +118,5 @@ python -c "from fastdeploy.model_executor.ops.xpu import block_attn"
 
 If all the above steps execute successfully, FastDeploy is installed correctly.
 
-## Quick start
+## How to deploy services on Kunlunxin XPU
+
+Refer to [**Supported Models and Service Deployment**](../../usage/kunlunxin_xpu_deployment.md) for the details about the supported models and the way to deploy services on Kunlunxin XPU.

The roughly 100 removed lines were the former "Quick start" section: the three supported ERNIE-4.5-300B-A47B configurations on P800 (32K WINT4 on 8 XPUs, 128K WINT4 on 8 XPUs, 32K WINT4 on 4 XPUs), the corresponding api_server launch commands, the note that 4-XPU deployments must use XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7", and the curl/Python request examples. The same material now lives in the new docs/usage/kunlunxin_xpu_deployment.md below.
docs/usage/kunlunxin_xpu_deployment.md (new file, +92 lines)

@@ -0,0 +1,92 @@
## Supported Models

|Model Name|Context Length|Quantization|XPUs Required|Deployment Commands|
|-|-|-|-|-|
|ERNIE-4.5-300B-A47B|32K|WINT8|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-300B-A47B|32K|WINT4|4 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 4 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-300B-A47B|32K|WINT4|8|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-300B-A47B|128K|WINT4|8 (recommended)|export XPU_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-300B-A47B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 8 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 64 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-21B-A3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-21B-A3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-21B-A3B|32K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-21B-A3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-21B-A3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-21B-A3B|128K|WINT4|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint4" \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-0.3B|32K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-0.3B|32K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 32768 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-0.3B|128K|BF16|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --gpu-memory-utilization 0.9|
|ERNIE-4.5-0.3B|128K|WINT8|1|export XPU_VISIBLE_DEVICES="0" # Specify any card<br>python -m fastdeploy.entrypoints.openai.api_server \ <br> --model PaddlePaddle/ERNIE-4.5-0.3B-Paddle \ <br> --port 8188 \ <br> --tensor-parallel-size 1 \ <br> --max-model-len 131072 \ <br> --max-num-seqs 128 \ <br> --quantization "wint8" \ <br> --gpu-memory-utilization 0.9|
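Each command in the table is flattened with `<br>`; expanded, the single-card ERNIE-4.5-21B-A3B WINT4 row, for example, reads:

```bash
export XPU_VISIBLE_DEVICES="0" # Specify any card
python -m fastdeploy.entrypoints.openai.api_server \
    --model PaddlePaddle/ERNIE-4.5-21B-A3B-Paddle \
    --port 8188 \
    --tensor-parallel-size 1 \
    --max-model-len 32768 \
    --max-num-seqs 128 \
    --quantization "wint4" \
    --gpu-memory-utilization 0.9
```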
## Quick start

### Online serving (OpenAI API-Compatible server)

Deploy an OpenAI API-compatible server using FastDeploy with the following commands:

#### Start service

**Deploy the ERNIE-4.5-300B-A47B-Paddle model with WINT4 precision and 32K context length on 4 XPUs**

```bash
export XPU_VISIBLE_DEVICES="0,1,2,3" # Specify which cards to be used
python -m fastdeploy.entrypoints.openai.api_server \
    --model baidu/ERNIE-4.5-300B-A47B-Paddle \
    --port 8188 \
    --tensor-parallel-size 4 \
    --max-model-len 32768 \
    --max-num-seqs 64 \
    --quantization "wint4" \
    --gpu-memory-utilization 0.9
```

**Note:** When deploying on 4 XPUs, only two device groupings are supported, a constraint imposed by hardware limitations such as interconnect capabilities:

`export XPU_VISIBLE_DEVICES="0,1,2,3"`

or

`export XPU_VISIBLE_DEVICES="4,5,6,7"`

Refer to [Parameters](../../parameters.md) for more options.

All supported models can be found in the *Supported Models* section above.

#### Send requests

Send requests using either curl or Python:

```bash
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
  "messages": [
    {"role": "user", "content": "Where is the capital of China?"}
  ]
}'
```

```python
import openai

host = "0.0.0.0"
port = "8188"
client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null")

response = client.completions.create(
    model="null",
    prompt="Where is the capital of China?",
    stream=True,
)
for chunk in response:
    print(chunk.choices[0].text, end='')
print('\n')

response = client.chat.completions.create(
    model="null",
    messages=[
        {"role": "user", "content": "Where is the capital of China?"},
    ],
    stream=True,
)
for chunk in response:
    if chunk.choices[0].delta:
        print(chunk.choices[0].delta.content, end='')
print('\n')
```

For detailed OpenAI protocol specifications, see [OpenAI Chat Completion API](https://platform.openai.com/docs/api-reference/chat/create). Differences from the standard OpenAI protocol are documented in [OpenAI Protocol-Compatible API Server](../../online_serving/README.md).
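Both examples above stream tokens; for comparison, a non-streaming chat call looks like the following (standard OpenAI-client usage, not part of the doc):

```python
# Non-streaming variant: the full reply arrives in one response object.
import openai

client = openai.Client(base_url="http://0.0.0.0:8188/v1", api_key="null")
resp = client.chat.completions.create(
    model="null",
    messages=[{"role": "user", "content": "Where is the capital of China?"}],
    stream=False,
)
print(resp.choices[0].message.content)
```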
Kunlunxin XPU installation guide (Chinese version, docs/zh):

@@ -72,33 +72,37 @@ python -m pip install paddlepaddle-xpu==3.1.0 -i https://www.paddlepaddle.org.cn
 python -m pip install --pre paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packages/nightly/xpu-p800/
 ```
 
-### Download the Kunlunxin compilation toolkit (XTDK) and the XVLLM precompiled operator library, then set their paths
+### Download the FastDeploy source code and check out a stable branch or TAG
 
-```bash
-# XTDK
-wget https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/3.2.40.1/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-tar -xvf xtdk-llvm15-ubuntu2004_x86_64.tar.gz && mv xtdk-llvm15-ubuntu2004_x86_64 xtdk
-export CLANG_PATH=$(pwd)/xtdk
-
-# XVLLM
-wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250624/output.tar.gz
-tar -xvf output.tar.gz && mv output xvllm
-export XVLLM_PATH=$(pwd)/xvllm
-```
-
-Alternatively, you can download the latest versions of XTDK and XVLLM (not recommended)
-
-```bash
-XTDK: https://klx-sdk-release-public.su.bcebos.com/xtdk_15fusion/dev/latest/xtdk-llvm15-ubuntu2004_x86_64.tar.gz
-XVLLM: https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/latest/output.tar.gz
-```
-
-### Download the FastDeploy source code, check out a stable branch or TAG, then compile and install:
-
 ```bash
 git clone https://github.com/PaddlePaddle/FastDeploy
 git checkout <tag or branch>
 cd FastDeploy
+```
+
+### Download the Kunlunxin compilation dependencies
+
+```bash
+bash custom_ops/xpu_ops/src/download_dependencies.sh stable
+```
+
+Alternatively, you can download the latest compilation dependencies
+
+```bash
+bash custom_ops/xpu_ops/src/download_dependencies.sh develop
+```
+
+Set environment variables:
+
+```bash
+export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xtdk
+export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/src/third_party/xvllm
+```
+
+### Compile and install:
+
+```bash
 bash build.sh
 ```
@@ -115,106 +119,5 @@ python -c "from fastdeploy.model_executor.ops.xpu import block_attn"
 
 If all the above steps execute successfully, FastDeploy has been installed successfully.
 
-## Quick start
+## How to deploy services on Kunlunxin XPU
+
+Refer to [**Supported Models and Service Deployment**](../../usage/kunlunxin_xpu_deployment.md) for the models supported on Kunlunxin XPU and how to deploy services on it.

The removed lines mirror the English change: the former Chinese "Quick start" section (the three supported ERNIE-4.5-300B-A47B configurations on P800, the api_server launch commands, the note that 4-XPU deployments must use XPU_VISIBLE_DEVICES="0,1,2,3" or "4,5,6,7" due to interconnect-topology constraints, and the curl/Python request examples) moves into the new docs/zh/usage/kunlunxin_xpu_deployment.md below.
docs/zh/usage/kunlunxin_xpu_deployment.md (new file, +92 lines)

The Chinese counterpart of docs/usage/kunlunxin_xpu_deployment.md above, with identical structure and commands: the same 14-row "Supported Models" table, the same Quick start walkthrough (4-XPU WINT4 launch of ERNIE-4.5-300B-A47B, the XPU_VISIBLE_DEVICES="0,1,2,3" / "4,5,6,7" constraint note, the pointer to [Parameters](../../parameters.md)), and the same curl/Python request examples; only the surrounding prose is in Chinese.
XPUModelRunner changes:

@@ -774,7 +774,6 @@ class XPUModelRunner(ModelRunnerBase):
         del self.share_inputs["caches"]
         if self.forward_meta is not None:
             del self.forward_meta.caches
-        del self.share_inputs["block_tables"]
         paddle.device.xpu.empty_cache()
 
     def cal_theortical_kvcache(self):

@@ -817,11 +816,6 @@ class XPUModelRunner(ModelRunnerBase):
         # Reset block table and kv cache with global block num
         self.initialize_kv_cache()
 
-        self.share_inputs["block_tables"] = paddle.full(
-            [self.parallel_config.max_num_seqs, self.num_gpu_blocks],
-            -1,
-            dtype="int32")
-
         # Reset free list
         free_list = list(
             range(
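One plausible reading of the commit message's "fix xpu model runner in huge memory with small model", illustrated with assumed figures (the sketch below is mine, not part of the diff): `block_tables` was being dropped and rebuilt at shape `[max_num_seqs, num_gpu_blocks]`, and on a machine with lots of free XPU memory running a small model the profiled global block count is very large, so the rebuilt table itself becomes costly; the change keeps the existing table instead.

```python
# Hypothetical sizing sketch -- the figures are assumptions for illustration.
max_num_seqs = 64           # typical --max-num-seqs from the docs above
num_gpu_blocks = 500_000    # hypothetical: huge free memory, small model
int32_bytes = 4

table_mib = max_num_seqs * num_gpu_blocks * int32_bytes / 2**20
print(f"rebuilt block_tables would occupy ~{table_mib:.0f} MiB")  # ~122 MiB
```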