[Iluvatar] add vl into ci and support v1 loader (#4774)

yzwu
2025-11-11 10:50:17 +08:00
committed by GitHub
parent 07a82afcae
commit 3707af7a4f
10 changed files with 340 additions and 46 deletions

View File

@@ -552,6 +552,8 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
"gpu_ops/text_image_index_out.cu",
"gpu_ops/text_image_gather_scatter.cu",
"gpu_ops/set_data_ipc.cu",
"gpu_ops/limit_thinking_content_length_v1.cu",
"gpu_ops/limit_thinking_content_length_v2.cu",
"iluvatar_ops/moe_dispatch.cu",
"iluvatar_ops/moe_reduce.cu",
"iluvatar_ops/paged_attn.cu",

View File

@@ -22,7 +22,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
### Start Container
```bash
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker exec -it paddle_infer bash
```
@@ -432,7 +432,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
### Start Container
```bash
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker exec -it paddle_infer bash
```
@@ -441,8 +441,8 @@ docker exec -it paddle_infer bash
### Install paddle
```bash
pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
pip3 install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
pip3 install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
```
For the latest Paddle release for Iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/).
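After both wheels are installed, a quick check (a sketch; assumes the container from the steps above) confirms Paddle registered the Iluvatar backend as the `iluvatar_gpu` custom device referenced by `PADDLE_XCCL_BACKEND` later in this doc:

```python
# Sketch: verify the Iluvatar custom device is registered in Paddle.
import paddle

# Expect 'iluvatar_gpu' to appear among the registered custom device types.
print(paddle.device.get_all_custom_device_type())
```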
@@ -556,3 +556,80 @@ generated_text=
这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。
```
## Testing thinking models
### ERNIE-4.5-21B-A3B-Thinking
Refer to the [GPU doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md); the commands are below:
server:
```bash
#!/bin/bash
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
export FD_DEBUG=1
python3 -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-21B-A3B-Thinking \
--port 8180 \
--load-choices "default_v1" \
--tensor-parallel-size 2 \
--max-model-len 32768 \
--quantization wint8 \
--block-size 16 \
--reasoning-parser ernie_x1 \
--tool-call-parser ernie_x1 \
--max-num-seqs 8
```
client:
```bash
curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "Write me a poem about large language model."}
]
}'
```
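Since the server exposes an OpenAI-compatible API, the same request can be sent with the `openai` Python client (a sketch; the base URL and model name mirror the server command above):

```python
# Sketch: OpenAI-compatible request to the FastDeploy server above.
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:8180/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="baidu/ERNIE-4.5-21B-A3B-Thinking",
    messages=[{"role": "user", "content": "Write me a poem about large language model."}],
)
print(response.choices[0].message.content)
```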
### ERNIE-4.5-VL-28B-A3B
Refer to the [GPU doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl.md), set `"chat_template_kwargs":{"enable_thinking": true}`, and use the commands below:
server:
```bash
#!/bin/bash
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
export FD_DEBUG=1
python3 -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
--port 8180 \
--tensor-parallel-size 2 \
--max-model-len 32768 \
--quantization wint8 \
--block-size 16 \
--limit-mm-per-prompt '{"image": 100, "video": 100}' \
--reasoning-parser ernie-45-vl \
--max-num-seqs 8
```
client:
```bash
curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
{"type": "text", "text": "From which era does the artifact in the image originate?"}
]}
],
"chat_template_kwargs":{"enable_thinking": true}
}'
```
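With the `openai` client, the non-standard `chat_template_kwargs` field is passed through `extra_body` (a sketch under the same server assumptions as above):

```python
# Sketch: multimodal request with thinking enabled via extra_body.
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:8180/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="baidu/ERNIE-4.5-VL-28B-A3B-Paddle",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"
                    },
                },
                {"type": "text", "text": "From which era does the artifact in the image originate?"},
            ],
        }
    ],
    extra_body={"chat_template_kwargs": {"enable_thinking": True}},
)
print(response.choices[0].message.content)
```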

View File

@@ -22,7 +22,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
### Start Container
```bash
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker exec -it paddle_infer bash
```
@@ -432,7 +432,7 @@ docker pull ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
### Start Container
```bash
docker run -itd --name paddle_infer -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker run -itd --name paddle_infer --network host -v /usr/src:/usr/src -v /lib/modules:/lib/modules -v /dev:/dev -v /home/paddle:/home/paddle --privileged --cap-add=ALL --pid=host ccr-2vdh3abv-pub.cnc.bj.baidubce.com/device/paddle-ixuca:latest
docker exec -it paddle_infer bash
```
@@ -441,8 +441,8 @@ docker exec -it paddle_infer bash
### Install paddle
```bash
pip3 install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
pip3 install paddle-iluvatar-gpu==3.0.0.dev20250926 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
pip3 install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
pip3 install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
```
For the latest Paddle release for Iluvatar, refer to [PaddlePaddle Installation](https://www.paddlepaddle.org.cn/).
@@ -556,3 +556,80 @@ generated_text=
这件佛像具有典型的北齐风格,佛像结跏趺坐于莲花座上,身披通肩袈裟,面部圆润,神态安详,体现了北齐佛教艺术的独特魅力。
```
## Testing thinking models
### ERNIE-4.5-21B-A3B-Thinking
Refer to the [GPU doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/best_practices/ERNIE-4.5-21B-A3B-Thinking.md); the commands are below:
server:
```bash
#!/bin/bash
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
export FD_DEBUG=1
python3 -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-21B-A3B-Thinking \
--port 8180 \
--load-choices "default_v1" \
--tensor-parallel-size 2 \
--max-model-len 32768 \
--quantization wint8 \
--block-size 16 \
--reasoning-parser ernie_x1 \
--tool-call-parser ernie_x1 \
--max-num-seqs 8
```
client:
```bash
curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": "Write me a poem about large language model."}
]
}'
```
### ERNIE-4.5-VL-28B-A3B
Refer to the [GPU doc](https://github.com/PaddlePaddle/FastDeploy/blob/develop/docs/get_started/ernie-4.5-vl.md), set `"chat_template_kwargs":{"enable_thinking": true}`, and use the commands below:
server:
```bash
#!/bin/bash
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export INFERENCE_MSG_QUEUE_ID=232132
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
export FD_SAMPLING_CLASS=rejection
export FD_DEBUG=1
python3 -m fastdeploy.entrypoints.openai.api_server \
--model baidu/ERNIE-4.5-VL-28B-A3B-Paddle \
--port 8180 \
--tensor-parallel-size 2 \
--max-model-len 32768 \
--quantization wint8 \
--block-size 16 \
--limit-mm-per-prompt '{"image": 100, "video": 100}' \
--reasoning-parser ernie-45-vl \
--max-num-seqs 8
```
client:
```bash
curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \
-H "Content-Type: application/json" \
-d '{
"messages": [
{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"}},
{"type": "text", "text": "From which era does the artifact in the image originate?"}
]}
],
"chat_template_kwargs":{"enable_thinking": true}
}'
```

View File

@@ -56,7 +56,10 @@ class GPUMemoryChecker:
    def __del__(self):
        """ """
        pynvml.nvmlShutdown()
        if self.gpu_memory_handle is None:
            pass
        else:
            pynvml.nvmlShutdown()

    def _print_memory_info(
        self,

View File

@@ -223,7 +223,7 @@ class FusedMoE(nn.Layer):
        if expert_id - self.expert_id_offset >= 0 and expert_id - self.expert_id_offset < self.num_local_experts:
            if hasattr(param, "SHARD_ID_TO_SHARDED_DIM"):
                SHARD_ID_TO_SHARDED_DIM = param.SHARD_ID_TO_SHARDED_DIM
            elif current_platform.is_cuda():
            elif current_platform.is_cuda() or current_platform.is_iluvatar():
                SHARD_ID_TO_SHARDED_DIM = {"gate": 1, "down": 0, "up": 1}
            else:
                SHARD_ID_TO_SHARDED_DIM = {"gate": 0, "down": 1, "up": 0}
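On CUDA and Iluvatar, the fused MoE layout shards the `gate` and `up` projections along dim 1 and `down` along dim 0; the mapping names the axis each tensor-parallel rank slices when loading an expert weight. A toy illustration with hypothetical shapes (NumPy standing in for the real parameters):

```python
# Illustration with hypothetical shapes: SHARD_ID_TO_SHARDED_DIM picks the
# axis to slice when each rank loads its shard of an expert weight.
import numpy as np

SHARD_ID_TO_SHARDED_DIM = {"gate": 1, "down": 0, "up": 1}
tp_rank, tp_size = 0, 2
weights = {
    "gate": np.zeros((1024, 4096)),  # [hidden, intermediate]
    "down": np.zeros((4096, 1024)),  # [intermediate, hidden]
}
for shard_id, w in weights.items():
    dim = SHARD_ID_TO_SHARDED_DIM[shard_id]
    size = w.shape[dim] // tp_size
    local = np.take(w, range(tp_rank * size, (tp_rank + 1) * size), axis=dim)
    print(shard_id, "slices axis", dim, "->", local.shape)
```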

View File

@@ -27,6 +27,8 @@ from fastdeploy.platforms import current_platform
if current_platform.is_iluvatar():
    from fastdeploy.model_executor.ops.iluvatar import (
        get_padding_offset,
        limit_thinking_content_length_v1,
        limit_thinking_content_length_v2,
        save_output,
        set_stop_value_multi_ends,
        step_paddle,

View File

@@ -263,8 +263,8 @@ def v1_loader_support(fd_config):
    def _err_msg(msg: str) -> str:
        logger.info(msg + "; fallback to the v0 loader for model loading.")

    if not (current_platform.is_cuda() or current_platform.is_xpu()):
        _err_msg("v1loader currently only support backends gpu and xpu")
    if not (current_platform.is_cuda() or current_platform.is_xpu() or current_platform.is_iluvatar()):
        _err_msg("v1loader currently only support backends gpu, xpu and iluvatar")
        return False

    if is_pre_sliced_weight(fd_config.model_config.model):

View File

@@ -6,39 +6,44 @@ echo "$DIR"
ps -efww | grep -E 'run_ernie300B_4layer' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
ixsmi -r
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
ln -sf /usr/local/bin/python3 /usr/local/bin/python
echo "pip requirements"
python -m pip install -r requirements_iluvatar.txt
echo "uninstall org"
python -m pip uninstall paddlepaddle -y
python -m pip uninstall paddle-iluvatar-gpu -y
# python -m pip install --pre paddlepaddle==3.0.0.dev20250708 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
# python -m pip install --pre paddle-iluvatar-gpu==3.0.0.dev20250806 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
python -m pip install paddle-iluvatar-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/ixuca/
# Patch, remove if image updated
cp /data1/fastdeploy/packages/cusolver.h /usr/local/lib/python3.10/site-packages/paddle/include/paddle/phi/backends/dynload/cusolver.h
echo "build whl"
bash build.sh || exit 1
unset http_proxy
unset https_proxy
unset no_proxy
rm -rf log/*
export LD_PRELOAD=/usr/local/corex/lib64/libcuda.so.1
ln -sf /usr/local/bin/python3 /usr/local/bin/python
echo "pip requirements"
python -m pip install -r requirements_iluvatar.txt
echo "install paddle cpu and custom device"
python -m pip install paddlepaddle==3.3.0.dev20251028 -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/
python -m pip install paddle-iluvatar-gpu==3.0.0.dev20251029 -i https://www.paddlepaddle.org.cn/packages/nightly/ixuca/
echo "build whl"
bash build.sh || exit 1
CI_PATH=tests/ci_use/iluvatar_UT
export INFERENCE_MSG_QUEUE_ID=232132
export FD_DEBUG=1
export PADDLE_XCCL_BACKEND=iluvatar_gpu
export FD_SAMPLING_CLASS=rejection
python tests/ci_use/iluvatar_UT/run_ernie300B_4layer.py
exit_code=$?
echo exit_code is ${exit_code}
ps -efww | grep -E 'run_ernie300B_4layer' | grep -v grep | awk '{print $2}' | xargs kill -9 || true
ci_list=(
    ${CI_PATH}/run_ernie300B_4layer.py
    ${CI_PATH}/run_ernie_vl_28B.py
)
echo "test ci files: ${ci_list[@]}"
for cur_test_file in ${ci_list[@]}
do
    echo "============ start to test ${cur_test_file} ==========="
    rm -rf log/*
    python ${cur_test_file}
    exit_code=$?
    echo exit_code is ${exit_code}
    if [ ${exit_code} -ne 0 ]; then
        echo "log/workerlog.0"
        cat log/workerlog.0
        exit 1
    fi
    ps -efww | grep -E "${cur_test_file}" | grep -v grep | awk '{print $2}' | xargs kill -9 || true
    if [ ${exit_code} -ne 0 ]; then
        echo "log/workerlog.0"
        cat log/workerlog.0
        exit 1
    fi
done

View File

@@ -37,27 +37,21 @@ def timeout(seconds):
    return decorator

@timeout(60)
@timeout(80)
def offline_infer_check():
    set_random_seed(123)
    prompts = [
        "Hello, my name is",
    ]
    # Sampling parameters
    sampling_params = SamplingParams(temperature=0.8, top_p=0.00001, max_tokens=16)
    # Load the model
    llm = LLM(
        model="/data1/fastdeploy/ERNIE_300B_4L",
        tensor_parallel_size=8,
        tensor_parallel_size=2,
        max_model_len=8192,
        quantization="wint8",
        block_size=16,
    )
    # Batch inference (the LLM queues requests internally and schedules them dynamically based on available resources)
    outputs = llm.generate(prompts, sampling_params)
    assert outputs[0].outputs.token_ids == [
@@ -86,6 +80,10 @@ if __name__ == "__main__":
        result = offline_infer_check()
        sys.exit(0)
    except TimeoutError:
        print(
            "The timeout exit may be due to multiple processes sharing the "
            "same gpu card. You can check this using ixsmi on the device."
        )
        sys.exit(124)
    except Exception:
        sys.exit(1)

View File

@@ -0,0 +1,130 @@
import functools
import io
import sys
import threading

import requests
from PIL import Image

from fastdeploy import LLM, SamplingParams
from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer
from fastdeploy.utils import set_random_seed


def timeout(seconds):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            result = [None]
            exception = [None]

            def target():
                try:
                    result[0] = func(*args, **kwargs)
                except Exception as e:
                    exception[0] = e

            thread = threading.Thread(target=target)
            thread.daemon = True
            thread.start()
            thread.join(seconds)
            if thread.is_alive():
                raise TimeoutError(f"Function timed out after {seconds} seconds")
            if exception[0]:
                raise exception[0]
            return result[0]

        return wrapper

    return decorator


@timeout(150)
def offline_infer_check():
    set_random_seed(123)
    PATH = "/data1/fastdeploy/ERNIE-4.5-VL-28B-A3B-Paddle"
    tokenizer = Ernie4_5Tokenizer.from_pretrained(PATH)
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://paddlenlp.bj.bcebos.com/datasets/paddlemix/demo_images/example2.jpg"
                    },
                },
                # zh: "From which era does the artifact in the image originate?"
                {"type": "text", "text": "图中的文物属于哪个年代"},
            ],
        }
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    images, videos = [], []
    for message in messages:
        content = message["content"]
        if not isinstance(content, list):
            continue
        for part in content:
            if part["type"] == "image_url":
                url = part["image_url"]["url"]
                image_bytes = requests.get(url).content
                img = Image.open(io.BytesIO(image_bytes))
                images.append(img)
            elif part["type"] == "video_url":
                url = part["video_url"]["url"]
                video_bytes = requests.get(url).content
                videos.append({"video": video_bytes, "max_frames": 30})

    sampling_params = SamplingParams(temperature=0.1, max_tokens=16)
    llm = LLM(
        model=PATH,
        tensor_parallel_size=2,
        max_model_len=32768,
        block_size=16,
        quantization="wint8",
        limit_mm_per_prompt={"image": 100},
        reasoning_parser="ernie-45-vl",
    )
    outputs = llm.generate(
        prompts={"prompt": prompt, "multimodal_data": {"image": images, "video": videos}},
        sampling_params=sampling_params,
    )
    assert outputs[0].outputs.token_ids == [
        23,
        3843,
        94206,
        2075,
        52352,
        94133,
        13553,
        10878,
        93977,
        5119,
        93956,
        68725,
        14449,
        4356,
        38225,
        2,
    ], f"{outputs[0].outputs.token_ids}"
    print("PASSED")


if __name__ == "__main__":
    try:
        result = offline_infer_check()
        sys.exit(0)
    except TimeoutError:
        print(
            "The timeout exit may be due to multiple processes sharing the "
            "same gpu card. You can check this using ixsmi on the device."
        )
        sys.exit(124)
    except Exception:
        sys.exit(1)