diff --git a/fastdeploy/model_executor/layers/backends/intel_hpu/moe/fused_moe_hpu_backend.py b/fastdeploy/model_executor/layers/backends/intel_hpu/moe/fused_moe_hpu_backend.py
index 4b68fb4e2..d47bfc86b 100644
--- a/fastdeploy/model_executor/layers/backends/intel_hpu/moe/fused_moe_hpu_backend.py
+++ b/fastdeploy/model_executor/layers/backends/intel_hpu/moe/fused_moe_hpu_backend.py
@@ -18,7 +18,6 @@ import paddle
 from paddle import nn
 
 from fastdeploy import envs
-from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce_custom
 from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import (
     UnquantizedFusedMoEMethod,
 )
@@ -96,8 +95,6 @@ class HpuMoEMethod(UnquantizedFusedMoEMethod):
             experts_max=layer.expert_id_offset + layer.num_local_experts - 1,
             chunk_size=chunk_size,
         )
-        if layer.reduce_results and layer.tp_size > 1:
-            tensor_model_parallel_all_reduce_custom(fused_moe_out)
         return fused_moe_out
 
 
@@ -198,7 +195,4 @@ class HpuTensorWiseFP8MoEMethod(HpuMoEMethod):
             chunk_size=chunk_size,
         )
 
-        if layer.reduce_results and layer.tp_size > 1:
-            tensor_model_parallel_all_reduce_custom(fused_moe_out)
-
         return fused_moe_out
diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index ad0498fa7..678b99f27 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -21,7 +21,10 @@ from paddle import nn
 from paddleformers.utils.log import logger
 
 from fastdeploy import envs
-from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce
+from fastdeploy.distributed.communication import (
+    tensor_model_parallel_all_reduce,
+    tensor_model_parallel_all_reduce_custom,
+)
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.model_executor.utils import h2d_copy, slice_fn
 from fastdeploy.platforms import current_platform
@@ -643,7 +646,10 @@ class FusedMoE(nn.Layer):
             out = self.forward_normal(x, gate)
 
         if self.reduce_results and self.tp_size > 1:
-            out = tensor_model_parallel_all_reduce(out, self.tp_group)
+            if current_platform.is_intel_hpu():
+                tensor_model_parallel_all_reduce_custom(out)
+            else:
+                out = tensor_model_parallel_all_reduce(out, self.tp_group)
         return out
 
     def forward_chunked_moe(self, x: paddle.Tensor, gate: nn.Layer):
diff --git a/fastdeploy/worker/hpu_model_runner.py b/fastdeploy/worker/hpu_model_runner.py
index 1146e4db8..fda1dcead 100644
--- a/fastdeploy/worker/hpu_model_runner.py
+++ b/fastdeploy/worker/hpu_model_runner.py
@@ -1120,10 +1120,11 @@ class HPUModelRunner(ModelRunnerBase):
 
         max_prefill_length = self.cache_config.block_size + warmup_max_model_len
         prefill_context_block_step = int(os.environ.get("CONTEXT_BLOCK_STEP_PREFILL", 1))
+        prefill_batchs.reverse()
+        prefill_length_with_contexts = list(range(self.cache_config.block_size, max_prefill_length, prefill_seq_step))
+        prefill_length_with_contexts.reverse()
         for prefill_batch in prefill_batchs:
-            for prefill_length_with_context in range(
-                self.cache_config.block_size, max_prefill_length, prefill_seq_step
-            ):
+            for prefill_length_with_context in prefill_length_with_contexts:
                 if prefill_length_with_context * prefill_batch > self.scheduler_config.max_num_batched_tokens:
                     continue
                 for context_len in range(
@@ -1171,6 +1172,8 @@ class HPUModelRunner(ModelRunnerBase):
             current_decode_block_num += decode_block_num_step
 
         logger.info(f"warmup decode_batchs: {decode_batchs}, decode_block_nums: {decode_block_nums} start")
+        decode_batchs.reverse()
+        decode_block_nums.reverse()
         for decode_batch in decode_batchs:
             for decode_block_num in decode_block_nums:
                 if decode_block_num < decode_batch:
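
For reference, below is a minimal standalone sketch of the dispatch pattern the moe.py hunk introduces. The imports, current_platform.is_intel_hpu(), and the two all-reduce calls are taken from the diff above; the helper name reduce_moe_output is hypothetical, and the assumption that tensor_model_parallel_all_reduce_custom reduces the tensor in place (and is therefore not reassigned) is inferred from how the diff uses it.

    import paddle

    from fastdeploy.distributed.communication import (
        tensor_model_parallel_all_reduce,
        tensor_model_parallel_all_reduce_custom,
    )
    from fastdeploy.platforms import current_platform


    def reduce_moe_output(out: paddle.Tensor, tp_group) -> paddle.Tensor:
        """Hypothetical helper mirroring the new all-reduce dispatch in FusedMoE.forward."""
        if current_platform.is_intel_hpu():
            # HPU path: assumed to all-reduce `out` in place, so no reassignment.
            tensor_model_parallel_all_reduce_custom(out)
        else:
            # Default path: the collective returns the reduced tensor.
            out = tensor_model_parallel_all_reduce(out, tp_group)
        return out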