diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index c215f13f4..3a34e5662 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -552,6 +552,8 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
                 "gpu_ops/text_image_index_out.cu",
                 "gpu_ops/text_image_gather_scatter.cu",
                 "gpu_ops/set_data_ipc.cu",
+                "gpu_ops/limit_thinking_content_length_v1.cu",
+                "gpu_ops/limit_thinking_content_length_v2.cu",
                 "iluvatar_ops/moe_dispatch.cu",
                 "iluvatar_ops/moe_reduce.cu",
                 "iluvatar_ops/paged_attn.cu",
diff --git a/docs/get_started/installation/iluvatar_gpu.md b/docs/get_started/installation/iluvatar_gpu.md
index ebade0b83..f78a23b85 100644
--- a/docs/get_started/installation/iluvatar_gpu.md
+++ b/docs/get_started/installation/iluvatar_gpu.md
@@ -22,7 +22,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
 
 ### Start Container
 ```bash
-docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
 docker exec -it paddle_infer bash
 ```
 
@@ -432,7 +432,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
 
 ### Start Container
 ```bash
-docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
 docker exec -it paddle_infer bash
 ```
 
@@ -441,7 +441,7 @@ docker exec -it paddle_infer bash
 
 ### Install paddle
 ```bash
-pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
-pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+pip3 install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
+pip3 install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
 ```
 For the latest Paddle version on Iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
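Note: before building the custom ops, it can help to confirm that the freshly installed nightly wheels actually register the Iluvatar backend. A minimal sanity check, assuming the two `pip3 install` commands above succeeded (`is_compiled_with_custom_device` is the same predicate `setup_ops.py` branches on):

```python
# Sanity check that Paddle sees the Iluvatar custom device.
import paddle

# True only when the paddle-iluvatar-gpu plugin registered itself.
print(paddle.is_compiled_with_custom_device("iluvatar_gpu"))

# Runs a small end-to-end computation to exercise the runtime.
paddle.utils.run_check()
```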
@@ -556,3 +556,80 @@ generated_text=
 这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。
 ```
+
+## Testing thinking models
+
+### ERNIE-4.5-21B-A3B-Thinking
+Refer to the [GPU doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md); the commands are below:
+
+server:
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_SAMPLING_CLASS=rejection
+export FD_DEBUG=1
+python3 -m fastdeploy.entrypoints.openai.api_server \
+    --model baidu/ERNIE-4.5-21B-A3B-Thinking \
+    --port 8180 \
+    --load-choices "default_v1" \
+    --tensor-parallel-size 2 \
+    --max-model-len 32768 \
+    --quantization wint8 \
+    --block-size 16 \
+    --reasoning-parser ernie_x1 \
+    --tool-call-parser ernie_x1 \
+    --max-num-seqs 8
+```
+
+client:
+
+```bash
+curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-d '{
+    "messages": [
+        {"role": "user", "content": "Write me a poem about large language model."}
+    ]
+}'
+```
+
+### ERNIE-4.5-VL-28B-A3B
+Refer to the [GPU doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl.md), set `"chat_template_kwargs":{"enable_thinking": true}`, and the commands are below:
+
+server:
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_SAMPLING_CLASS=rejection
+export FD_DEBUG=1
+python3 -m fastdeploy.entrypoints.openai.api_server \
+    --model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
+    --port 8180 \
+    --tensor-parallel-size 2 \
+    --max-model-len 32768 \
+    --quantization wint8 \
+    --block-size 16 \
+    --limit-mm-per-prompt '{"image": 100, "video": 100}' \
+    --reasoning-parser ernie-45-vl \
+    --max-num-seqs 8
+```
+
+client:
+
+```bash
+curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-d '{
+    "messages": [
+        {"role": "user", "content": [
+            {"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
+            {"type": "text", "text": "From which era does the artifact in the image originate?"}
+        ]}
+    ],
+    "chat_template_kwargs":{"enable_thinking": true}
+}'
+```
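The curl clients above can also be driven from Python. A minimal sketch, assuming the `openai` package (>=1.0) is installed and the thinking server above is running; the `api_key` value is a dummy and the `model` string is a placeholder, since the endpoint serves the single model it was launched with:

```python
# Python equivalent of the curl client above, via the OpenAI-compatible API.
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:8180/v1", api_key="null")

response = client.chat.completions.create(
    model="baidu/ERNIE-4.5-21B-A3B-Thinking",  # placeholder; not used for routing
    messages=[{"role": "user", "content": "Write me a poem about large language model."}],
)
print(response.choices[0].message.content)
```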
diff --git a/docs/zh/get_started/installation/iluvatar_gpu.md b/docs/zh/get_started/installation/iluvatar_gpu.md
index 1712b4fbc..95dc41873 100644
--- a/docs/zh/get_started/installation/iluvatar_gpu.md
+++ b/docs/zh/get_started/installation/iluvatar_gpu.md
@@ -22,7 +22,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
 
 ### 启动容器
 ```bash
-docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
 docker exec -it paddle_infer bash
 ```
 
@@ -432,7 +432,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
 
 ### 启动容器
 ```bash
-docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
+docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
 docker exec -it paddle_infer bash
 ```
 
@@ -441,8 +441,8 @@ docker exec -it paddle_infer bash
 
 ### Install paddle
 ```bash
-pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
-pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+pip3 install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
+pip3 install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
 ```
 获取Paddle的最新安装版本: [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/)
 
@@ -556,3 +556,80 @@ generated_text=
 这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。
 ```
+
+## 测试thinking模型
+
+### ERNIE-4.5-21B-A3B-Thinking
+参考 [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md),命令如下所示:
+
+server:
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_SAMPLING_CLASS=rejection
+export FD_DEBUG=1
+python3 -m fastdeploy.entrypoints.openai.api_server \
+    --model baidu/ERNIE-4.5-21B-A3B-Thinking \
+    --port 8180 \
+    --load-choices "default_v1" \
+    --tensor-parallel-size 2 \
+    --max-model-len 32768 \
+    --quantization wint8 \
+    --block-size 16 \
+    --reasoning-parser ernie_x1 \
+    --tool-call-parser ernie_x1 \
+    --max-num-seqs 8
+```
+
+client:
+
+```bash
+curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-d '{
+    "messages": [
+        {"role": "user", "content": "Write me a poem about large language model."}
+    ]
+}'
+```
+
+### ERNIE-4.5-VL-28B-A3B
+参考 [gpu doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl.md),设置 `"chat_template_kwargs":{"enable_thinking": true}`,命令如下所示:
+
+server:
+```bash
+#!/bin/bash
+export PADDLE_XCCL_BACKEND=iluvatar_gpu
+export INFERENCE_MSG_QUEUE_ID=232132
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+export FD_SAMPLING_CLASS=rejection
+export FD_DEBUG=1
+python3 -m fastdeploy.entrypoints.openai.api_server \
+    --model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
+    --port 8180 \
+    --tensor-parallel-size 2 \
+    --max-model-len 32768 \
+    --quantization wint8 \
+    --block-size 16 \
+    --limit-mm-per-prompt '{"image": 100, "video": 100}' \
+    --reasoning-parser ernie-45-vl \
+    --max-num-seqs 8
+```
+
+client:
+
+```bash
+curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
+-H "Content-Type: application/json" \
+-d '{
+    "messages": [
+        {"role": "user", "content": [
+            {"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
+            {"type": "text", "text": "From which era does the artifact in the image originate?"}
+        ]}
+    ],
+    "chat_template_kwargs":{"enable_thinking": true}
+}'
+```
diff --git a/fastdeploy/model_executor/graph_optimization/utils.py b/fastdeploy/model_executor/graph_optimization/utils.py
index ee157041e..2b5241e5a 100644
--- a/fastdeploy/model_executor/graph_optimization/utils.py
+++ b/fastdeploy/model_executor/graph_optimization/utils.py
@@ -56,7 +56,8 @@ class GPUMemoryChecker:
 
     def __del__(self):
        """ """
-        pynvml.nvmlShutdown()
+        if self.gpu_memory_handle is not None:
+            pynvml.nvmlShutdown()
 
     def _print_memory_info(
         self,
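For context on the `__del__` guard above: `pynvml.nvmlShutdown()` raises when NVML was never initialized, which is exactly the state on a custom device where obtaining the handle failed. A self-contained sketch of the guarded-lifecycle pattern (hypothetical `MemoryProbe` class, not the FastDeploy checker):

```python
import pynvml


class MemoryProbe:
    """Only shut down NVML if it was actually initialized."""

    def __init__(self, device_index: int = 0):
        self.handle = None
        try:
            pynvml.nvmlInit()
            self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        except pynvml.NVMLError:
            # No NVML on this platform (e.g. a custom device); stay inert.
            pass

    def used_bytes(self) -> int:
        if self.handle is None:
            return 0
        return pynvml.nvmlDeviceGetMemoryInfo(self.handle).used

    def __del__(self):
        # Mirrors the patch above: never call nvmlShutdown() after a failed init.
        if self.handle is not None:
            pynvml.nvmlShutdown()
```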
diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index bc3e00f1c..a27015c1a 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -223,7 +223,7 @@ class FusedMoE(nn.Layer):
         if expert_id - self.expert_id_offset >= 0 and expert_id - self.expert_id_offset < self.num_local_experts:
             if hasattr(param, "SHARD_ID_TO_SHARDED_DIM"):
                 SHARD_ID_TO_SHARDED_DIM = param.SHARD_ID_TO_SHARDED_DIM
-            elif current_platform.is_cuda():
+            elif current_platform.is_cuda() or current_platform.is_iluvatar():
                 SHARD_ID_TO_SHARDED_DIM = {"gate": 1, "down": 0, "up": 1}
             else:
                 SHARD_ID_TO_SHARDED_DIM = {"gate": 0, "down": 1, "up": 0}
diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
index 43370d8e4..c9c8b27b4 100644
--- a/fastdeploy/model_executor/pre_and_post_process.py
+++ b/fastdeploy/model_executor/pre_and_post_process.py
@@ -27,6 +27,8 @@ from fastdeploy.platforms import current_platform
 if current_platform.is_iluvatar():
     from fastdeploy.model_executor.ops.iluvatar import (
         get_padding_offset,
+        limit_thinking_content_length_v1,
+        limit_thinking_content_length_v2,
         save_output,
         set_stop_value_multi_ends,
         step_paddle,
diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py
index 69c6f9a4c..40af040e9 100644
--- a/fastdeploy/model_executor/utils.py
+++ b/fastdeploy/model_executor/utils.py
@@ -263,8 +263,8 @@ def v1_loader_support(fd_config):
     def _err_msg(msg: str) -> str:
         logger.info(msg + "; fallback to the v0 loader for model loading.")
 
-    if not (current_platform.is_cuda() or current_platform.is_xpu()):
-        _err_msg("v1loader currently only support backends gpu and xpu")
+    if not (current_platform.is_cuda() or current_platform.is_xpu() or current_platform.is_iluvatar()):
+        _err_msg("the v1 loader currently only supports the gpu, xpu and iluvatar backends")
         return False
 
     if is_pre_sliced_weight(fd_config.model_config.model):
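The `moe.py` change above routes Iluvatar through the same shard-dimension table as CUDA: gate/up projections are split along dim 1 and down projections along dim 0. A self-contained sketch of what that table controls when expert weights are sliced across tensor-parallel ranks (illustrative shapes and helper, not FastDeploy code):

```python
import numpy as np

# On CUDA and Iluvatar: gate/up are column-sharded, down is row-sharded.
SHARD_ID_TO_SHARDED_DIM = {"gate": 1, "down": 0, "up": 1}


def shard_weight(weight: np.ndarray, shard_id: str, tp_rank: int, tp_size: int) -> np.ndarray:
    """Slice one expert weight along the dim chosen for this shard id."""
    dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
    size = weight.shape[dim] // tp_size
    index = [slice(None)] * weight.ndim
    index[dim] = slice(tp_rank * size, (tp_rank + 1) * size)
    return weight[tuple(index)]


# Example: a [hidden, intermediate] "up" weight split across 2 ranks by columns.
w_up = np.zeros((1024, 4096))
print(shard_weight(w_up, "up", tp_rank=0, tp_size=2).shape)  # (1024, 2048)
```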
diff --git a/scripts/run_ci_iluvatar.sh b/scripts/run_ci_iluvatar.sh
index acc1898d4..c47fe2c4c 100644
--- a/scripts/run_ci_iluvatar.sh
+++ b/scripts/run_ci_iluvatar.sh
@@ -6,39 +6,44 @@ echo "$DIR"
 ps -efww | grep -E 'run_ernie300B_4layer' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
 ixsmi -r
-export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
-ln -sf /usr/local/bin/python3 /usr/local/bin/python
-echo "pip requirements"
-python -m pip install -r requirements_iluvatar.txt
-echo "uninstall org"
-python -m pip uninstall paddlepaddle -y
-python -m pip uninstall paddle-iluvatar-gpu -y
-# python -m pip install --pre paddlepaddle==3.0.0.dev20250708 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
-# python -m pip install --pre paddle-iluvatar-gpu==3.0.0.dev20250806 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
-python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
-python -m pip install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
-# Patch, remove if image updated
-cp /data1/fastdeploy/packages/cusolver.h /usr/local/lib/python3.10/site-packages/paddle/include/paddle/phi/backends/dynload/cusolver.h
-echo "build whl"
-bash build.sh || exit 1
-
 unset http_proxy
 unset https_proxy
 unset no_proxy
-rm -rf log/*
+export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
+ln -sf /usr/local/bin/python3 /usr/local/bin/python
+echo "pip requirements"
+python -m pip install -r requirements_iluvatar.txt
+echo "install paddle cpu and custom device"
+python -m pip install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
+python -m pip install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
+echo "build whl"
+bash build.sh || exit 1
+
+CI_PATH=tests/ci_use/iluvatar_UT
 export INFERENCE_MSG_QUEUE_ID=232132
 export FD_DEBUG=1
 export PADDLE_XCCL_BACKEND=iluvatar_gpu
 export FD_SAMPLING_CLASS=rejection
-python tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py
-exit_code=$?
-echo exit_code is ${exit_code}
-ps -efww | grep -E 'run_ernie300B_4layer' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
+ci_list=(
+    ${CI_PATH}/run_ernie300B_4layer.py
+    ${CI_PATH}/run_ernie_vl_28B.py
+)
+echo "test ci files: ${ci_list[@]}"
+for cur_test_file in "${ci_list[@]}"
+do
+    echo "============ start to test ${cur_test_file} ==========="
+    rm -rf log/*
+    python ${cur_test_file}
+    exit_code=$?
+    echo exit_code is ${exit_code}
 
-if [ ${exit_code} -ne 0 ]; then
-    echo "log/workerlog.0"
-    cat log/workerlog.0
-    exit 1
-fi
+    ps -efww | grep -E "${cur_test_file}" | grep -v grep | awk '{print $2}' | xargs kill -9 || true
+
+    if [ ${exit_code} -ne 0 ]; then
+        echo "log/workerlog.0"
+        cat log/workerlog.0
+        exit 1
+    fi
+done
diff --git a/tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py b/tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py
index e619eaf0e..2fb33d74b 100644
--- a/tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py
+++ b/tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py
@@ -37,27 +37,21 @@ def timeout(seconds):
     return decorator
 
 
-@timeout(60)
+@timeout(80)
 def offline_infer_check():
     set_random_seed(123)
     prompts = [
         "Hello, my name is",
     ]
-
-    # 采样参数
     sampling_params = SamplingParams(temperature=0.8, top_p=0.00001, max_tokens=16)
-
-    # 加载模型
     llm = LLM(
         model="/data1/fastdeploy/ERNIE_300B_4L",
-        tensor_parallel_size=8,
+        tensor_parallel_size=2,
         max_model_len=8192,
         quantization="wint8",
         block_size=16,
     )
-
-    # 批量进行推理(llm内部基于资源情况进行请求排队、动态插入处理)
     outputs = llm.generate(prompts, sampling_params)
 
     assert outputs[0].outputs.token_ids == [
@@ -86,6 +80,10 @@ if __name__ == "__main__":
         result = offline_infer_check()
         sys.exit(0)
     except TimeoutError:
+        print(
+            "The timeout exit may be due to multiple processes sharing the "
+            "same GPU card. You can check this using ixsmi on the device."
+        )
         sys.exit(124)
     except Exception:
         sys.exit(1)
diff --git a/tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py b/tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py
new file mode 100644
index 000000000..0b6a4ac74
--- /dev/null
+++ b/tests/ci_use/iluvatar_UT/run_ernie_vl_28B.py
@@ -0,0 +1,130 @@
+import functools
+import io
+import sys
+import threading
+
+import requests
+from PIL import Image
+
+from fastdeploy import LLM, SamplingParams
+from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
+from fastdeploy.utils import set_random_seed
+
+
+def timeout(seconds):
+    def decorator(func):
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            result = [None]
+            exception = [None]
+
+            def target():
+                try:
+                    result[0] = func(*args, **kwargs)
+                except Exception as e:
+                    exception[0] = e
+
+            thread = threading.Thread(target=target)
+            thread.daemon = True
+            thread.start()
+            thread.join(seconds)
+
+            if thread.is_alive():
+                raise TimeoutError(f"Function timed out after {seconds} seconds")
+
+            if exception[0]:
+                raise exception[0]
+
+            return result[0]
+
+        return wrapper
+
+    return decorator
+
+
+@timeout(150)
+def offline_infer_check():
+    set_random_seed(123)
+
+    PATH = "/data1/fastdeploy/ERNIE-4.5-VL-28B-A3B-Paddle"
+    tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
+
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"
+                    },
+                },
+                {"type": "text", "text": "图中的文物属于哪个年代"},
+            ],
+        }
+    ]
+    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
+    images, videos = [], []
+    for message in messages:
+        content = message["content"]
+        if not isinstance(content, list):
+            continue
+        for part in content:
+            if part["type"] == "image_url":
+                url = part["image_url"]["url"]
+                image_bytes = requests.get(url).content
+                img = Image.open(io.BytesIO(image_bytes))
+                images.append(img)
+            elif part["type"] == "video_url":
+                url = part["video_url"]["url"]
+                video_bytes = requests.get(url).content
+                videos.append({"video": video_bytes, "max_frames": 30})
+
+    sampling_params = SamplingParams(temperature=0.1, max_tokens=16)
+    llm = LLM(
+        model=PATH,
+        tensor_parallel_size=2,
+        max_model_len=32768,
+        block_size=16,
+        quantization="wint8",
+        limit_mm_per_prompt={"image": 100},
+        reasoning_parser="ernie-45-vl",
+    )
+    outputs = llm.generate(
+        prompts={"prompt": prompt, "multimodal_data": {"image": images, "video": videos}},
+        sampling_params=sampling_params,
+    )
+
+    assert outputs[0].outputs.token_ids == [
+        23,
+        3843,
+        94206,
+        2075,
+        52352,
+        94133,
+        13553,
+        10878,
+        93977,
+        5119,
+        93956,
+        68725,
+        14449,
+        4356,
+        38225,
+        2,
+    ], f"{outputs[0].outputs.token_ids}"
+    print("PASSED")
+
+
+if __name__ == "__main__":
+    try:
+        result = offline_infer_check()
+        sys.exit(0)
+    except TimeoutError:
+        print(
+            "The timeout exit may be due to multiple processes sharing the "
+            "same GPU card. You can check this using ixsmi on the device."
+        )
+        sys.exit(124)
+    except Exception:
+        sys.exit(1)