[XPU] refactor moe ffn (#5501)

- remove BKCL_DISPATCH_ALL_GATHER
- support sparse mode
- support moe quant_method
This commit is contained in:
zhupengyang
2025-12-18 14:14:05 +08:00
committed by GitHub
parent d0a7834a17
commit 8735cb5045
12 changed files with 397 additions and 127 deletions

View File

@@ -632,6 +632,7 @@ export XSHMEM_MODE=1
# Communication-related settings for the service launched below.
XSHMEM_QP_NUM_PER_RANK=32
BKCL_RDMA_VERBS=1
# MoE FFN settings; the quant-type map value is passed through verbatim.
MOE_FFN_USE_DENSE_INPUT=1
FD_XPU_MOE_FFN_QUANT_TYPE_MAP="w_channelwise_int8_a_tokenwise_int8:8->53"
# Base port: 8188 plus 100 per XPU_ID (XPU_ID is set elsewhere in the script).
port_num=$((8188 + XPU_ID * 100))
# Export everything in one place so child processes inherit the settings.
export XSHMEM_QP_NUM_PER_RANK BKCL_RDMA_VERBS MOE_FFN_USE_DENSE_INPUT \
  FD_XPU_MOE_FFN_QUANT_TYPE_MAP port_num
# Start the service
@@ -643,7 +644,7 @@ python -m fastdeploy.entrypoints.openai.api_server \
--data-parallel-size 1 \
--max-model-len 32768 \
--max-num-seqs 64 \
--quantization "wint4" \
--quantization "wint8" \
--engine-worker-queue-port $((port_num + 10)) \
--metrics-port $((port_num + 2)) \
--cache-queue-port $((port_num + 47873)) \
@@ -692,6 +693,7 @@ unset XSHMEM_MODE
# Clear the variables exported before the service launch so they do not
# leak into later steps of the script.
unset XSHMEM_QP_NUM_PER_RANK
unset BKCL_RDMA_VERBS
unset MOE_FFN_USE_DENSE_INPUT
# The name must match the export above (FD_ prefix); unsetting
# XPU_MOE_FFN_QUANT_TYPE_MAP would leave the exported variable in place.
unset FD_XPU_MOE_FFN_QUANT_TYPE_MAP
stop_processes >kill.log 2>&1
if [ ${ep_online_exit_code} -ne 0 ]; then