mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Iluvatar] add vl into ci and support v1 loader (#4774)
This commit is contained in:
@@ -552,6 +552,8 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
|
||||
"gpu_ops/text_image_index_out.cu",
|
||||
"gpu_ops/text_image_gather_scatter.cu",
|
||||
"gpu_ops/set_data_ipc.cu",
|
||||
"gpu_ops/limit_thinking_content_length_v1.cu",
|
||||
"gpu_ops/limit_thinking_content_length_v2.cu",
|
||||
"iluvatar_ops/moe_dispatch.cu",
|
||||
"iluvatar_ops/moe_reduce.cu",
|
||||
"iluvatar_ops/paged_attn.cu",
|
||||
|
||||
@@ -22,7 +22,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
### Start Container
|
||||
|
||||
```bash
|
||||
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
docker exec -it paddle_infer bash
|
||||
```
|
||||
|
||||
@@ -432,7 +432,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
### Start Container
|
||||
|
||||
```bash
|
||||
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
docker exec -it paddle_infer bash
|
||||
```
|
||||
|
||||
@@ -441,8 +441,8 @@ docker exec -it paddle_infer bash
|
||||
### Install paddle
|
||||
|
||||
```bash
|
||||
pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
|
||||
pip3 install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
|
||||
pip3 install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
|
||||
```
|
||||
For the latest Paddle version for Iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/).
|
||||
|
||||
@@ -556,3 +556,80 @@ generated_text=
|
||||
|
||||
这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。
|
||||
```
|
||||
|
||||
## Testing thinking model
|
||||
|
||||
### ERNIE-4.5-21B-A3B-Thinking
|
||||
Refer to the [GPU doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md); the commands are shown below:
|
||||
|
||||
server:
|
||||
```bash
|
||||
#!/bin/bash
|
||||
export PADDLE_XCCL_BACKEND=iluvatar_gpu
|
||||
export INFERENCE_MSG_QUEUE_ID=232132
|
||||
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
|
||||
export FD_SAMPLING_CLASS=rejection
|
||||
export FD_DEBUG=1
|
||||
python3 -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-21B-A3B-Thinking \
|
||||
--port 8180 \
|
||||
--load-choices "default_v1" \
|
||||
--tensor-parallel-size 2 \
|
||||
--max-model-len 32768 \
|
||||
--quantization wint8 \
|
||||
--block-size 16 \
|
||||
--reasoning-parser ernie_x1 \
|
||||
--tool-call-parser ernie_x1 \
|
||||
--max-num-seqs 8
|
||||
```
|
||||
|
||||
client:
|
||||
|
||||
```bash
|
||||
curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"messages": [
|
||||
{"role": "user", "content": "Write me a poem about large language model."}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
### ERNIE-4.5-VL-28B-A3B
|
||||
Refer to the [GPU doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl.md) and set `"chat_template_kwargs":{"enable_thinking": true}` in the request; the commands are shown below:
|
||||
|
||||
server:
|
||||
```bash
|
||||
#!/bin/bash
|
||||
export PADDLE_XCCL_BACKEND=iluvatar_gpu
|
||||
export INFERENCE_MSG_QUEUE_ID=232132
|
||||
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
|
||||
export FD_SAMPLING_CLASS=rejection
|
||||
export FD_DEBUG=1
|
||||
python3 -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
|
||||
--port 8180 \
|
||||
--tensor-parallel-size 2 \
|
||||
--max-model-len 32768 \
|
||||
--quantization wint8 \
|
||||
--block-size 16 \
|
||||
--limit-mm-per-prompt '{"image": 100, "video": 100}' \
|
||||
--reasoning-parser ernie-45-vl \
|
||||
--max-num-seqs 8
|
||||
```
|
||||
|
||||
client:
|
||||
|
||||
```bash
|
||||
curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"messages": [
|
||||
{"role": "user", "content": [
|
||||
{"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
|
||||
{"type": "text", "text": "From which era does the artifact in the image originate?"}
|
||||
]}
|
||||
],
|
||||
"chat_template_kwargs":{"enable_thinking": true}
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -22,7 +22,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
### 启动容器
|
||||
|
||||
```bash
|
||||
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
docker exec -it paddle_infer bash
|
||||
```
|
||||
|
||||
@@ -432,7 +432,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
### 启动容器
|
||||
|
||||
```bash
|
||||
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
|
||||
docker exec -it paddle_infer bash
|
||||
```
|
||||
|
||||
@@ -441,8 +441,8 @@ docker exec -it paddle_infer bash
|
||||
### Install paddle
|
||||
|
||||
```bash
|
||||
pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
|
||||
pip3 install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
|
||||
pip3 install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
|
||||
```
|
||||
获取Paddle的最新安装版本: [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
|
||||
|
||||
@@ -556,3 +556,80 @@ generated_text=
|
||||
|
||||
这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。
|
||||
```
|
||||
|
||||
## 测试thinking模型
|
||||
|
||||
### ERNIE-4.5-21B-A3B-Thinking
|
||||
参考 [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md), 命令如下所示:
|
||||
|
||||
server:
|
||||
```bash
|
||||
#!/bin/bash
|
||||
export PADDLE_XCCL_BACKEND=iluvatar_gpu
|
||||
export INFERENCE_MSG_QUEUE_ID=232132
|
||||
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
|
||||
export FD_SAMPLING_CLASS=rejection
|
||||
export FD_DEBUG=1
|
||||
python3 -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-21B-A3B-Thinking \
|
||||
--port 8180 \
|
||||
--load-choices "default_v1" \
|
||||
--tensor-parallel-size 2 \
|
||||
--max-model-len 32768 \
|
||||
--quantization wint8 \
|
||||
--block-size 16 \
|
||||
--reasoning-parser ernie_x1 \
|
||||
--tool-call-parser ernie_x1 \
|
||||
--max-num-seqs 8
|
||||
```
|
||||
|
||||
client:
|
||||
|
||||
```bash
|
||||
curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"messages": [
|
||||
{"role": "user", "content": "Write me a poem about large language model."}
|
||||
]
|
||||
}'
|
||||
```
|
||||
|
||||
### ERNIE-4.5-VL-28B-A3B
|
||||
参考 [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl.md), 设置 `"chat_template_kwargs":{"enable_thinking": true}`,命令如下所示:
|
||||
|
||||
server:
|
||||
```bash
|
||||
#!/bin/bash
|
||||
export PADDLE_XCCL_BACKEND=iluvatar_gpu
|
||||
export INFERENCE_MSG_QUEUE_ID=232132
|
||||
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
|
||||
export FD_SAMPLING_CLASS=rejection
|
||||
export FD_DEBUG=1
|
||||
python3 -m fastdeploy.entrypoints.openai.api_server \
|
||||
--model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
|
||||
--port 8180 \
|
||||
--tensor-parallel-size 2 \
|
||||
--max-model-len 32768 \
|
||||
--quantization wint8 \
|
||||
--block-size 16 \
|
||||
--limit-mm-per-prompt '{"image": 100, "video": 100}' \
|
||||
--reasoning-parser ernie-45-vl \
|
||||
--max-num-seqs 8
|
||||
```
|
||||
|
||||
client:
|
||||
|
||||
```bash
|
||||
curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"messages": [
|
||||
{"role": "user", "content": [
|
||||
{"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
|
||||
{"type": "text", "text": "From which era does the artifact in the image originate?"}
|
||||
]}
|
||||
],
|
||||
"chat_template_kwargs":{"enable_thinking": true}
|
||||
}'
|
||||
```
|
||||
|
||||
@@ -56,7 +56,10 @@ class GPUMemoryChecker:
|
||||
|
||||
def __del__(self):
|
||||
""" """
|
||||
pynvml.nvmlShutdown()
|
||||
if self.gpu_memory_handle is None:
|
||||
pass
|
||||
else:
|
||||
pynvml.nvmlShutdown()
|
||||
|
||||
def _print_memory_info(
|
||||
self,
|
||||
|
||||
@@ -223,7 +223,7 @@ class FusedMoE(nn.Layer):
|
||||
if expert_id - self.expert_id_offset >= 0 and expert_id - self.expert_id_offset < self.num_local_experts:
|
||||
if hasattr(param, "SHARD_ID_TO_SHARDED_DIM"):
|
||||
SHARD_ID_TO_SHARDED_DIM = param.SHARD_ID_TO_SHARDED_DIM
|
||||
elif current_platform.is_cuda():
|
||||
elif current_platform.is_cuda() or current_platform.is_iluvatar():
|
||||
SHARD_ID_TO_SHARDED_DIM = {"gate": 1, "down": 0, "up": 1}
|
||||
else:
|
||||
SHARD_ID_TO_SHARDED_DIM = {"gate": 0, "down": 1, "up": 0}
|
||||
|
||||
@@ -27,6 +27,8 @@ from fastdeploy.platforms import current_platform
|
||||
if current_platform.is_iluvatar():
|
||||
from fastdeploy.model_executor.ops.iluvatar import (
|
||||
get_padding_offset,
|
||||
limit_thinking_content_length_v1,
|
||||
limit_thinking_content_length_v2,
|
||||
save_output,
|
||||
set_stop_value_multi_ends,
|
||||
step_paddle,
|
||||
|
||||
@@ -263,8 +263,8 @@ def v1_loader_support(fd_config):
|
||||
def _err_msg(msg: str) -> str:
|
||||
logger.info(msg + "; fallback to the v0 loader for model loading.")
|
||||
|
||||
if not (current_platform.is_cuda() or current_platform.is_xpu()):
|
||||
_err_msg("v1loader currently only support backends gpu and xpu")
|
||||
if not (current_platform.is_cuda() or current_platform.is_xpu() or current_platform.is_iluvatar()):
|
||||
_err_msg("v1loader currently only support backends gpu, xpu and iluvatar")
|
||||
return False
|
||||
|
||||
if is_pre_sliced_weight(fd_config.model_config.model):
|
||||
|
||||
@@ -6,39 +6,44 @@ echo "$DIR"
|
||||
ps -efww | grep -E 'run_ernie300B_4layer' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
|
||||
ixsmi -r
|
||||
|
||||
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
|
||||
ln -sf /usr/local/bin/python3 /usr/local/bin/python
|
||||
echo "pip requirements"
|
||||
python -m pip install -r requirements_iluvatar.txt
|
||||
echo "uninstall org"
|
||||
python -m pip uninstall paddlepaddle -y
|
||||
python -m pip uninstall paddle-iluvatar-gpu -y
|
||||
# python -m pip install --pre paddlepaddle==3.0.0.dev20250708 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
|
||||
# python -m pip install --pre paddle-iluvatar-gpu==3.0.0.dev20250806 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
|
||||
python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
|
||||
python -m pip install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
|
||||
# Patch, remove if image updated
|
||||
cp /data1/fastdeploy/packages/cusolver.h /usr/local/lib/python3.10/site-packages/paddle/include/paddle/phi/backends/dynload/cusolver.h
|
||||
echo "build whl"
|
||||
bash build.sh || exit 1
|
||||
|
||||
unset http_proxy
|
||||
unset https_proxy
|
||||
unset no_proxy
|
||||
|
||||
rm -rf log/*
|
||||
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
|
||||
ln -sf /usr/local/bin/python3 /usr/local/bin/python
|
||||
echo "pip requirements"
|
||||
python -m pip install -r requirements_iluvatar.txt
|
||||
echo "install paddle cpu and custom device"
|
||||
python -m pip install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
|
||||
python -m pip install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
|
||||
echo "build whl"
|
||||
bash build.sh || exit 1
|
||||
|
||||
CI_PATH=tests/ci_use/iluvatar_UT
|
||||
export INFERENCE_MSG_QUEUE_ID=232132
|
||||
export FD_DEBUG=1
|
||||
export PADDLE_XCCL_BACKEND=iluvatar_gpu
|
||||
export FD_SAMPLING_CLASS=rejection
|
||||
python tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py
|
||||
exit_code=$?
|
||||
echo exit_code is ${exit_code}
|
||||
|
||||
ps -efww | grep -E 'run_ernie300B_4layer' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
|
||||
ci_list=(
|
||||
${CI_PATH}/run_ernie300B_4layer.py
|
||||
${CI_PATH}/run_ernie_vl_28B.py
|
||||
)
|
||||
echo "test ci files: ${ci_list[@]}"
|
||||
for cur_test_file in ${ci_list[@]}
|
||||
do
|
||||
echo "============ start to test ${cur_test_file} ==========="
|
||||
rm -rf log/*
|
||||
python ${cur_test_file}
|
||||
exit_code=$?
|
||||
echo exit_code is ${exit_code}
|
||||
|
||||
if [ ${exit_code} -ne 0 ]; then
|
||||
echo "log/workerlog.0"
|
||||
cat log/workerlog.0
|
||||
exit 1
|
||||
fi
|
||||
# Kill any process left over from the test that just ran. The variable must be
# double-quoted so the shell expands it; -F matches the path literally instead
# of interpreting '.' and other characters as regex metacharacters.
ps -efww | grep -F -- "${cur_test_file}" | grep -v grep | awk '{print $2}' | xargs kill -9 || true
|
||||
|
||||
if [ ${exit_code} -ne 0 ]; then
|
||||
echo "log/workerlog.0"
|
||||
cat log/workerlog.0
|
||||
exit 1
|
||||
fi
|
||||
done
|
||||
|
||||
@@ -37,27 +37,21 @@ def timeout(seconds):
|
||||
return decorator
|
||||
|
||||
|
||||
@timeout(60)
|
||||
@timeout(80)
|
||||
def offline_infer_check():
|
||||
set_random_seed(123)
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
|
||||
# 采样参数
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.00001, max_tokens=16)
|
||||
|
||||
# 加载模型
|
||||
llm = LLM(
|
||||
model="/data1/fastdeploy/ERNIE_300B_4L",
|
||||
tensor_parallel_size=8,
|
||||
tensor_parallel_size=2,
|
||||
max_model_len=8192,
|
||||
quantization="wint8",
|
||||
block_size=16,
|
||||
)
|
||||
|
||||
# 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理)
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
assert outputs[0].outputs.token_ids == [
|
||||
@@ -86,6 +80,10 @@ if __name__ == "__main__":
|
||||
result = offline_infer_check()
|
||||
sys.exit(0)
|
||||
except TimeoutError:
|
||||
print(
|
||||
"The timeout exit may be due to multiple processes sharing the "
|
||||
"same gpu card. You can check this using ixsmi on the device."
|
||||
)
|
||||
sys.exit(124)
|
||||
except Exception:
|
||||
sys.exit(1)
|
||||
|
||||
130
tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py
Normal file
130
tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py
Normal file
@@ -0,0 +1,130 @@
|
||||
import functools
|
||||
import io
|
||||
import sys
|
||||
import threading
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
from fastdeploy import LLM, SamplingParams
|
||||
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
|
||||
from fastdeploy.utils import set_random_seed
|
||||
|
||||
|
||||
def timeout(seconds):
    """Build a decorator that bounds how long the caller waits for a function.

    The decorated function runs in a daemon thread while the calling thread
    waits for at most ``seconds``.

    Args:
        seconds: Maximum time to wait, passed straight to ``Thread.join``.

    Returns:
        A decorator. Calling the decorated function returns the wrapped
        function's result, re-raises whatever the wrapped function raised, or
        raises ``TimeoutError`` if it is still running after ``seconds``.

    Note:
        On timeout the worker thread is not stopped; it is only abandoned.
        Because it is a daemon thread it does not keep the interpreter alive.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # Filled in by the worker thread: "result" on success, "error" on failure.
            outcome = {}

            def target():
                try:
                    outcome["result"] = func(*args, **kwargs)
                except BaseException as exc:
                    # Catch BaseException so that e.g. SystemExit raised inside
                    # the worker is propagated to the caller instead of making
                    # the wrapper silently return None.
                    outcome["error"] = exc

            thread = threading.Thread(target=target, daemon=True)
            thread.start()
            thread.join(seconds)

            if thread.is_alive():
                raise TimeoutError(f"Function timed out after {seconds} seconds")

            error = outcome.get("error")
            # Identity check: an exception instance may be falsy (e.g. defines
            # __len__), and must still be re-raised.
            if error is not None:
                raise error

            return outcome.get("result")

        return wrapper

    return decorator
|
||||
|
||||
|
||||
def _download(url, timeout_s=30):
    """Fetch ``url`` and return the response body as bytes.

    Raises a ``requests`` exception on connection problems, when no response
    arrives within ``timeout_s`` seconds, or on an HTTP error status, so a
    failed download is reported as such instead of being decoded as media.
    """
    response = requests.get(url, timeout=timeout_s)
    response.raise_for_status()
    return response.content


@timeout(150)
def offline_infer_check():
    """Run one image+text prompt through ERNIE-4.5-VL and check the output.

    Loads the tokenizer and model from a fixed local path, downloads the image
    referenced by the chat message, generates up to 16 tokens and asserts that
    the generated token ids equal a fixed reference sequence.

    Raises:
        AssertionError: If the generated token ids differ from the reference;
            the message contains the ids that were actually produced.
    """
    set_random_seed(123)

    model_path = "/data1/fastdeploy/ERNIE-4.5-VL-28B-A3B-Paddle"
    tokenizer = Ernie4_5Tokenizer.from_pretrained(model_path)

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"
                    },
                },
                {"type": "text", "text": "图中的文物属于哪个年代"},
            ],
        }
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)

    # Collect the media referenced by the messages; plain-string contents carry none.
    images, videos = [], []
    for message in messages:
        content = message["content"]
        if not isinstance(content, list):
            continue
        for part in content:
            if part["type"] == "image_url":
                image_bytes = _download(part["image_url"]["url"])
                images.append(Image.open(io.BytesIO(image_bytes)))
            elif part["type"] == "video_url":
                video_bytes = _download(part["video_url"]["url"])
                videos.append({"video": video_bytes, "max_frames": 30})

    sampling_params = SamplingParams(temperature=0.1, max_tokens=16)
    llm = LLM(
        model=model_path,
        tensor_parallel_size=2,
        max_model_len=32768,
        block_size=16,
        quantization="wint8",
        limit_mm_per_prompt={"image": 100},
        reasoning_parser="ernie-45-vl",
    )
    outputs = llm.generate(
        prompts={"prompt": prompt, "multimodal_data": {"image": images, "video": videos}},
        sampling_params=sampling_params,
    )

    # Reference output for this prompt (presumably recorded from a known-good
    # run on this hardware - confirm before changing).
    expected_token_ids = [
        23,
        3843,
        94206,
        2075,
        52352,
        94133,
        13553,
        10878,
        93977,
        5119,
        93956,
        68725,
        14449,
        4356,
        38225,
        2,
    ]
    assert outputs[0].outputs.token_ids == expected_token_ids, f"{outputs[0].outputs.token_ids}"
    print("PASSED")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point. Exit codes: 0 = check passed, 124 = timed out,
    # 1 = any other failure.
    import traceback

    try:
        offline_infer_check()
    except TimeoutError:
        print(
            "The timeout exit may be due to multiple processes sharing the "
            "same gpu card. You can check this using ixsmi on the device."
        )
        sys.exit(124)
    except Exception:
        # Print the traceback before exiting so the log shows why the check
        # failed (e.g. the assertion message with the generated token ids).
        traceback.print_exc()
        sys.exit(1)
    sys.exit(0)
|
||||
Reference in New Issue
Block a user