From 6e2e2fcd296d78517f86f59a77018a37df289c19 Mon Sep 17 00:00:00 2001
From: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
Date: Wed, 12 Nov 2025 15:12:59 +0800
Subject: [PATCH] xpu (#4969)

---
 fastdeploy/model_executor/utils.py | 9 ++++++++-
 requirements_dcu.txt               | 1 +
 requirements_metaxgpu.txt          | 1 +
 scripts/run_ci_xpu.sh              | 9 +++------
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py
index 5e7c734f7..de7d87cbc 100644
--- a/fastdeploy/model_executor/utils.py
+++ b/fastdeploy/model_executor/utils.py
@@ -353,6 +353,13 @@ def h2d_copy(dst, src, blocking=True):
 def v1_loader_support(fd_config):
     _v1_no_support_archs = ["Qwen2VLForConditionalGeneration"]
 
+    def _get_unsupported_quant():
+        if current_platform.is_cuda():
+            return {"w4a8", "w4afp8", "wint2"}
+        elif current_platform.is_xpu():
+            return {"w4a8", "w8a8"}
+        return set()
+
     def _err_msg(msg: str) -> str:
         logger.info(msg + "; fallback to the v0 loader for model loading.")
 
@@ -375,7 +382,7 @@ def v1_loader_support(fd_config):
         else:
             moe_quant_type = fd_config.quant_config.name()
             dense_quant_type = fd_config.quant_config.name()
-        unsupported_quant = {"w4a8", "w4afp8", "wint2"}
+        unsupported_quant = _get_unsupported_quant()
         if unsupported_quant & {moe_quant_type, dense_quant_type}:
             _err_msg("v1 loader currently does not support w4a8/w4afp8/wint2 quantization")
 
diff --git a/requirements_dcu.txt b/requirements_dcu.txt
index a622320a9..308fb0611 100644
--- a/requirements_dcu.txt
+++ b/requirements_dcu.txt
@@ -27,6 +27,7 @@ moviepy
 use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
+safetensors==0.7.0rc0
 msgpack
 gunicorn
 opentelemetry-api>=1.24.0
diff --git a/requirements_metaxgpu.txt b/requirements_metaxgpu.txt
index c17f3b354..9d653febc 100644
--- a/requirements_metaxgpu.txt
+++ b/requirements_metaxgpu.txt
@@ -29,6 +29,7 @@ triton
 use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
+safetensors==0.7.0rc0
 msgpack
 gunicorn
 modelscope
diff --git a/scripts/run_ci_xpu.sh b/scripts/run_ci_xpu.sh
index 268f92bd2..1fcc6e114 100644
--- a/scripts/run_ci_xpu.sh
+++ b/scripts/run_ci_xpu.sh
@@ -81,8 +81,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --num-gpu-blocks-override 16384 \
     --max-model-len 32768 \
     --max-num-seqs 128 \
-    --quantization wint4 \
-    --load-choices default > server.log 2>&1 &
+    --quantization wint4 > server.log 2>&1 &
 
 sleep 60
 # Health check
@@ -157,8 +156,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --num-gpu-blocks-override 16384 \
     --max-model-len 32768 \
     --max-num-seqs 64 \
-    --quantization "W4A8" \
-    --load-choices default > server.log 2>&1 &
+    --quantization "W4A8" > server.log 2>&1 &
 
 sleep 60
 # Health check
@@ -236,8 +234,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
     --enable-mm \
     --mm-processor-kwargs '{"video_max_frames": 30}' \
     --limit-mm-per-prompt '{"image": 10, "video": 3}' \
-    --reasoning-parser ernie-45-vl \
-    --load-choices default > server.log 2>&1 &
+    --reasoning-parser ernie-45-vl > server.log 2>&1 &
 
 sleep 60
 # Health check
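
Note on the utils.py hunk: the unsupported-quant set is now platform-dependent, and `v1_loader_support` falls back to the v0 loader whenever the configured MoE or dense quant type intersects that set. Below is a minimal, self-contained sketch of that decision logic, assuming only what the hunk shows; `_Platform` is a hypothetical stand-in for FastDeploy's `current_platform` object.

```python
# Hypothetical stand-in for fastdeploy's current_platform; only the
# is_cuda()/is_xpu() predicates used by the patched helper are modeled.
class _Platform:
    def __init__(self, name: str):
        self.name = name

    def is_cuda(self) -> bool:
        return self.name == "cuda"

    def is_xpu(self) -> bool:
        return self.name == "xpu"


def get_unsupported_quant(platform: _Platform) -> set:
    # Mirrors the patched _get_unsupported_quant(): a per-platform deny list.
    if platform.is_cuda():
        return {"w4a8", "w4afp8", "wint2"}
    elif platform.is_xpu():
        return {"w4a8", "w8a8"}
    return set()


def v1_loader_ok(platform: _Platform, moe_quant: str, dense_quant: str) -> bool:
    # Same set-intersection test as v1_loader_support: any overlap between
    # the configured quant types and the deny list forces the v0 fallback.
    return not (get_unsupported_quant(platform) & {moe_quant, dense_quant})


assert not v1_loader_ok(_Platform("cuda"), "w4a8", "wint4")  # w4a8 blocked on CUDA
assert not v1_loader_ok(_Platform("xpu"), "w8a8", "w8a8")    # w8a8 blocked on XPU
assert v1_loader_ok(_Platform("xpu"), "wint2", "wint2")      # wint2 blocked only on CUDA
```

The practical effect for XPU, read straight off the two sets: w4afp8 and wint2 no longer trigger the v0 fallback there, while w8a8 now does.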
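
Both requirements files gain a `safetensors==0.7.0rc0` pin next to the existing `fastsafetensors==0.1.14`. For illustration, a sketch of the kind of checkpoint read that depends on this library; `safe_open` is safetensors' real API, but the file name and the `framework="np"` choice are placeholders, not necessarily what FastDeploy does.

```python
from safetensors import safe_open

# "model.safetensors" is a placeholder path, not a file from this repo.
with safe_open("model.safetensors", framework="np") as f:
    for name in f.keys():            # tensor names stored in the file header
        tensor = f.get_tensor(name)  # tensors are loaded lazily, one at a time
        print(name, tensor.shape, tensor.dtype)
```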
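
The CI script backgrounds each api_server launch and then probes it (the "Health check" steps after each `sleep 60`). A hedged Python version of such a probe loop, assuming an HTTP health endpoint; the `/health` path and port 8188 are assumptions standing in for whatever the script actually queries.

```python
import time
import urllib.request


def wait_for_server(port: int = 8188, timeout_s: int = 300) -> bool:
    # Poll until the server answers 200, rather than trusting a fixed sleep.
    deadline = time.time() + timeout_s
    url = f"http://127.0.0.1:{port}/health"  # assumed endpoint
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as resp:
                if resp.status == 200:
                    return True
        except OSError:  # URLError/ConnectionError while the server boots
            time.sleep(5)
    return False
```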