diff --git a/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc b/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc
index 7916f38a9..98e0e6648 100644
--- a/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc
+++ b/custom_ops/xpu_ops/src/ops/moe_expert_ffn.cc
@@ -72,6 +72,7 @@ void MoeExpertFFNImpl(xftblock::Tensor* ffn_in,
         is_padding_input ? token_num_info : nullptr,
         expert_num,
         1, // moe_topk
+        0, // group_size
         ffn1_out_shape.size() == 2 ? xftblock::MoeFCInputMode::DENSE
                                    : xftblock::MoeFCInputMode::SPARSE);
     PD_CHECK(ret == 0);
@@ -134,6 +135,7 @@ void MoeExpertFFNImpl(xftblock::Tensor* ffn_in,
         is_padding_input ? token_num_info : nullptr,
         expert_num,
         1, // moe_topk
+        0, // group_size
         ffn1_out_shape.size() == 2 ? xftblock::MoeFCInputMode::DENSE
                                    : xftblock::MoeFCInputMode::SPARSE);
     // bias_mode
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index e77e4388a..f1367b1ae 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -424,7 +424,7 @@ class EngineArgs:
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
         if self.splitwise_role != "mixed" and self.cache_transfer_protocol != "rdma":
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
-        if not current_platform.is_cuda():
+        if not current_platform.is_cuda() and not current_platform.is_xpu():
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
         if self.guided_decoding_backend != "off":
             envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 44d3e0150..193970d0f 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -778,7 +778,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
     if args.splitwise_role != "mixed" and args.cache_transfer_protocol != "rdma":
         envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
-    if not current_platform.is_cuda():
+    if not current_platform.is_cuda() and not current_platform.is_xpu():
         logger.info("Set ENABLE_V1_KVCACHE_SCHEDULER to 0 due to not supported.")
         envs.ENABLE_V1_KVCACHE_SCHEDULER = 0
     if parallel_config.guided_decoding_backend != "off":
diff --git a/scripts/run_ci_xpu.sh b/scripts/run_ci_xpu.sh
index 597fa3480..870b463d9 100644
--- a/scripts/run_ci_xpu.sh
+++ b/scripts/run_ci_xpu.sh
@@ -24,10 +24,7 @@
 python -m pip install paddlepaddle-xpu -i https://www.paddlepaddle.org.cn/packag
 echo "build whl"
 bash custom_ops/xpu_ops/download_dependencies.sh develop
 export CLANG_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xtdk
-# export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
-# Temporarily pin the xvllm version: an xvllm update broke the build
-wget https://klx-sdk-release-public.su.bcebos.com/xinfer/daily/eb/20250921/output.tar.gz --no-proxy && tar xf output.tar.gz && mv output xvllm
-export XVLLM_PATH=${PWD}/xvllm
+export XVLLM_PATH=$(pwd)/custom_ops/xpu_ops/third_party/xvllm
 bash build.sh || exit 1
 echo "pip others"
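
The two Python hunks apply the same one-line fix in both places that gate the V1 KV cache scheduler: XPU is no longer lumped in with unsupported platforms, while the other guards visible in these hunks (splitwise role plus transfer protocol, and guided decoding) still force the flag off. Below is a minimal self-contained sketch of that gating, with plain booleans standing in for current_platform.is_cuda()/is_xpu() and the return value standing in for envs.ENABLE_V1_KVCACHE_SCHEDULER; the function and parameter names are illustrative, not FastDeploy's actual API.

    # Sketch of the V1 KV-cache scheduler gating after this patch. Booleans
    # stand in for current_platform.is_cuda()/is_xpu(); the return value
    # stands in for envs.ENABLE_V1_KVCACHE_SCHEDULER.
    def v1_kvcache_scheduler_enabled(
        is_cuda: bool,
        is_xpu: bool,
        splitwise_role: str = "mixed",
        cache_transfer_protocol: str = "rdma",
        guided_decoding_backend: str = "off",
    ) -> bool:
        if splitwise_role != "mixed" and cache_transfer_protocol != "rdma":
            return False
        # Before this patch the check was `if not is_cuda`, which also
        # disabled the scheduler on XPU; now XPU passes this guard.
        if not is_cuda and not is_xpu:
            return False
        if guided_decoding_backend != "off":
            return False
        return True

    # XPU now keeps the scheduler; other non-CUDA platforms still do not.
    assert v1_kvcache_scheduler_enabled(is_cuda=False, is_xpu=True)
    assert not v1_kvcache_scheduler_enabled(is_cuda=False, is_xpu=False)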