From d6e59447f52dd4abf9e8ed54a1a80a9d424b61b5 Mon Sep 17 00:00:00 2001
From: yyssys
Date: Wed, 24 Sep 2025 10:29:48 +0800
Subject: [PATCH] [XPU] Enable XPU V1 mode based on environment variable
 (#4213)

* Enable XPU V1 mode based on environment variable

* add default param to xft_moe_fc_block_eb for latest xvllm compatibility; update run_ci_xpu to use latest xvllm

---
 custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc | 2 ++
 fastdeploy/engine/args_utils.py              | 2 +-
 fastdeploy/worker/worker_process.py          | 2 +-
 scripts/run_ci_xpu.sh                        | 5 +----
 4 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc b/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc
index 7916f38a9..98e0e6648 100644
--- a/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc
+++ b/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc
@@ -72,6 +72,7 @@ void MoeExpertFFNImpl(xftblock::Tensor* ffn_in,
         is_padding_input ? token_num_info : nullptr,
         expert_num,
         1, // moe_topk
+        0, // group_size
         ffn1_out_shape.size() == 2 ? xftblock::MoeFCInputMode::DENSE
                                    : xftblock::MoeFCInputMode::SPARSE);
     PD_CHECK(ret == 0);
@@ -134,6 +135,7 @@ void MoeExpertFFNImpl(xftblock::Tensor* ffn_in,
         is_padding_input ? token_num_info : nullptr,
         expert_num,
         1, // moe_topk
+        0, // group_size
         ffn1_out_shape.size() == 2 ? xftblock::MoeFCInputMode::DENSE
                                    : xftblock::MoeFCInputMode::SPARSE);
     // bias_mode
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index e77e4388a..f1367b1ae 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -424,7 +424,7 @@ class EngineArgs:
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
         if self.splitwise_role != "mixed" and self.cache_transfer_protocol != "rdma":
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
-        if not current_platform.is_cuda():
+        if not current_platform.is_cuda() and not current_platform.is_xpu():
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
         if self.guided_decoding_backend != "off":
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 44d3e0150..193970d0f 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -778,7 +778,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
     if args.splitwise_role != "mixed" and args.cache_transfer_protocol != "rdma":
         envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
-    if not current_platform.is_cuda():
+    if not current_platform.is_cuda() and not current_platform.is_xpu():
         logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.")
         envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
     if parallel_config.guided_decoding_backend != "off":
diff --git a/scripts/run_ci_xpu.sh b/scripts/run_ci_xpu.sh
index 597fa3480..870b463d9 100644
--- a/scripts/run_ci_xpu.sh
+++ b/scripts/run_ci_xpu.sh
@@ -24,10 +24,7 @@ python -m pip install paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packag
 echo "build whl"
 bash custom_ops/xpu_ops/download_dependencies.sh develop
 export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xtdk
-# export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
-# 由于xvllm更新导致编译报错暂时锁定xvllm版本
-wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250921/output.tar.gz --no-proxy && tar xf output.tar.gz && mv output xvllm
-export XVLLM_PATH=${PWD}/xvllm
+export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
 bash build.sh || exit 1
 
 echo "pip others"
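
Reviewer note (not part of the patch): the behavioral core of this change is the scheduler gate touched in args_utils.py and worker_process.py above. The sketch below only mirrors that gate for illustration; the import paths for `envs` and `current_platform` are assumptions (the diff shows the names but not where they are imported from), so treat it as a hedged example rather than project code.

    # Minimal sketch of the V1 KV-cache scheduler gate after this patch.
    # Import paths are assumed for illustration; the diff only shows the names.
    from fastdeploy import envs
    from fastdeploy.platforms import current_platform

    envs.ENABLE_V1_KVCACHE_SCHEDULER = 1
    # Before this patch, XPU also fell into this branch and was forced back to
    # the V0 path; now only platforms that are neither CUDA nor XPU are downgraded.
    if not current_platform.is_cuda() and not current_platform.is_xpu():
        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0

    print("V1 KV-cache scheduler enabled:", bool(envs.ENABLE_V1_KVCACHE_SCHEDULER))

On an XPU machine this should now report the scheduler as enabled, unless one of the other gates in args_utils.py (splitwise role, cache transfer protocol, guided decoding backend) disables it first.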