Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-08 10:00:29 +08:00
[Fix] fix expert_parallel bug in decoder stage (#2848)
@@ -49,7 +49,7 @@ class MoEMethodBase(QuantMethodBase):
                 from .ep import EPDecoderRunner
                 self.ep_decoder_runner = EPDecoderRunner(
                     layer.top_k, layer.hidden_size, layer.num_experts,
-                    layer.model_config.num_max_dispatch_tokens_per_rank,
+                    layer.fd_config.model_config.num_max_dispatch_tokens_per_rank,
                     layer.ep_size, layer.ep_rank)
             else:
                 from .ep import EPPrefillRunner
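The fix is an attribute-path change: the decoder-stage expert-parallel runner previously read num_max_dispatch_tokens_per_rank from layer.model_config, and the commit switches the lookup to layer.fd_config.model_config. Below is a minimal, self-contained sketch of the corrected lookup; the dataclass stand-ins and the concrete numbers are assumptions for illustration, not FastDeploy code.

```python
# Sketch only (not FastDeploy source): simplified stand-ins showing the config
# path the commit switches to. Names mirror the diff; values are assumed examples.
from dataclasses import dataclass


@dataclass
class ModelConfig:
    num_max_dispatch_tokens_per_rank: int = 128  # assumed example value


@dataclass
class FDConfig:
    model_config: ModelConfig


@dataclass
class MoELayer:
    fd_config: FDConfig
    top_k: int = 8          # assumed example values below
    hidden_size: int = 4096
    num_experts: int = 64
    ep_size: int = 4
    ep_rank: int = 0


layer = MoELayer(fd_config=FDConfig(model_config=ModelConfig()))

# Before the fix the decoder path read layer.model_config.num_max_dispatch_tokens_per_rank;
# after the fix the value is resolved through fd_config:
max_tokens = layer.fd_config.model_config.num_max_dispatch_tokens_per_rank
print(max_tokens)  # 128
```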
@@ -241,7 +241,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
             [
                 layer.num_local_experts,
                 layer.ep_size *
-                layer.model_config.num_max_dispatch_tokens_per_rank,
+                layer.fd_config.model_config.num_max_dispatch_tokens_per_rank,
                 layer.moe_intermediate_size * 2,
             ],
             dtype=paddle.bfloat16,
@@ -251,7 +251,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
             [
                 layer.num_local_experts,
                 layer.ep_size *
-                layer.model_config.num_max_dispatch_tokens_per_rank,
+                layer.fd_config.model_config.num_max_dispatch_tokens_per_rank,
                 layer.hidden_size,
             ],
             dtype=paddle.bfloat16,
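The two DeepGemmFusedMoeMethod hunks size per-expert buffers as [num_local_experts, ep_size * num_max_dispatch_tokens_per_rank, width], with width being moe_intermediate_size * 2 in the first allocation and hidden_size in the second. The sketch below only reproduces those shapes with assumed example sizes; it is not the FastDeploy allocation code, and the real values come from the model config (num_max_dispatch_tokens_per_rank is now read via fd_config.model_config).

```python
# Shape sketch with assumed example sizes, mirroring the allocations in the diff.
import paddle

num_local_experts = 8
ep_size = 4
num_max_dispatch_tokens_per_rank = 128
moe_intermediate_size = 1024
hidden_size = 4096

buf_a = paddle.empty(
    [num_local_experts,
     ep_size * num_max_dispatch_tokens_per_rank,
     moe_intermediate_size * 2],
    dtype=paddle.bfloat16,
)
buf_b = paddle.empty(
    [num_local_experts,
     ep_size * num_max_dispatch_tokens_per_rank,
     hidden_size],
    dtype=paddle.bfloat16,
)
print(buf_a.shape, buf_b.shape)  # [8, 512, 2048] [8, 512, 4096]
```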