[XPU] Enable XPU V1 mode based on environment variable (#4213)

* Enable XPU V1 mode based on environment variable
* Pass the default group_size argument (0) to xft_moe_fc_block_eb for compatibility with the latest xvllm; update run_ci_xpu to use the latest xvllm
2025-09-26 12:31:27 +08:00 · 2025-09-24 10:29:48 +08:00
parent ec99474e71
commit d6e59447f5
4 changed files with 5 additions and 6 deletions
--- a/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc
+++ b/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc
@@ -72,6 +72,7 @@ void MoeExpertFFNImpl(xftblock::Tensor* ffn_in,
      is_padding_input ? token_num_info : nullptr,
      expert_num,
      1,  // moe_topk
+      0, // group_size
      ffn1_out_shape.size() == 2 ? xftblock::MoeFCInputMode::DENSE
                                 : xftblock::MoeFCInputMode::SPARSE);
  PD_CHECK(ret == 0);
@@ -134,6 +135,7 @@ void MoeExpertFFNImpl(xftblock::Tensor* ffn_in,
      is_padding_input ? token_num_info : nullptr,
      expert_num,
      1,  // moe_topk
+      0,  // group_size
      ffn1_out_shape.size() == 2
          ? xftblock::MoeFCInputMode::DENSE
          : xftblock::MoeFCInputMode::SPARSE);  // bias_mode
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -424,7 +424,7 @@ class EngineArgs:
            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
        if self.splitwise_role != "mixed" and self.cache_transfer_protocol != "rdma":
            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
-        if not current_platform.is_cuda():
+        if not current_platform.is_cuda() and not current_platform.is_xpu():
            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
        if self.guided_decoding_backend != "off":
            envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -778,7 +778,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
    if args.splitwise_role != "mixed" and args.cache_transfer_protocol != "rdma":
        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
-    if not current_platform.is_cuda():
+    if not current_platform.is_cuda() and not current_platform.is_xpu():
        logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.")
        envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
    if parallel_config.guided_decoding_backend != "off":
--- a/scripts/run_ci_xpu.sh
+++ b/scripts/run_ci_xpu.sh
@@ -24,10 +24,7 @@ python -m pip install paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packag
 echo "build whl"
 bash custom_ops/xpu_ops/download_dependencies.sh develop
 export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xtdk
-# export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
-# 由于xvllm更新导致编译报错暂时锁定xvllm版本
-wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250921/output.tar.gz --no-proxy && tar xf output.tar.gz && mv output xvllm
-export XVLLM_PATH=${PWD}/xvllm
+export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
 bash build.sh || exit 1

 echo "pip others"