Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-08 18:11:00 +08:00
[Fix] fix expert_parallel bug in decoder stage (#2848)
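The bug is a wrong attribute path: in the decoder-stage expert-parallel setup, num_max_dispatch_tokens_per_rank was read from layer.model_config, but the model config is nested under the layer's fd_config. All three hunks below apply the same one-line correction. A minimal sketch of the access pattern, using hypothetical stand-in config classes rather than FastDeploy's actual FDConfig/ModelConfig definitions:

# Minimal sketch; the config classes are hypothetical stand-ins, not
# FastDeploy's real FDConfig/ModelConfig definitions.
from dataclasses import dataclass, field

@dataclass
class ModelConfig:
    num_max_dispatch_tokens_per_rank: int = 128

@dataclass
class FDConfig:
    model_config: ModelConfig = field(default_factory=ModelConfig)

class MoELayer:
    def __init__(self, fd_config: FDConfig):
        # The layer keeps the whole FDConfig; the model config lives inside it.
        self.fd_config = fd_config

layer = MoELayer(FDConfig())

# Before the fix: raises AttributeError, the layer has no model_config attribute.
# layer.model_config.num_max_dispatch_tokens_per_rank

# After the fix: reach the model config through fd_config.
print(layer.fd_config.model_config.num_max_dispatch_tokens_per_rank)  # 128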
@@ -49,7 +49,7 @@ class MoEMethodBase(QuantMethodBase):
                 from .ep import EPDecoderRunner
                 self.ep_decoder_runner = EPDecoderRunner(
                     layer.top_k, layer.hidden_size, layer.num_experts,
-                    layer.model_config.num_max_dispatch_tokens_per_rank,
+                    layer.fd_config.model_config.num_max_dispatch_tokens_per_rank,
                     layer.ep_size, layer.ep_rank)
             else:
                 from .ep import EPPrefillRunner

@@ -241,7 +241,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
             [
                 layer.num_local_experts,
                 layer.ep_size *
-                layer.model_config.num_max_dispatch_tokens_per_rank,
+                layer.fd_config.model_config.num_max_dispatch_tokens_per_rank,
                 layer.moe_intermediate_size * 2,
             ],
             dtype=paddle.bfloat16,

@@ -251,7 +251,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
             [
                 layer.num_local_experts,
                 layer.ep_size *
-                layer.model_config.num_max_dispatch_tokens_per_rank,
+                layer.fd_config.model_config.num_max_dispatch_tokens_per_rank,
                 layer.hidden_size,
             ],
             dtype=paddle.bfloat16,

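The two DeepGemmFusedMoeMethod hunks fix the same path where the decoder-stage dispatch buffers are sized. A rough, self-contained sketch of those buffer shapes with placeholder values (buffer names and numbers are assumptions, not FastDeploy's actual allocation code):

import paddle

# Placeholder values; in FastDeploy these come from the layer and from
# fd_config.model_config.
num_local_experts = 8
ep_size = 4
num_max_dispatch_tokens_per_rank = 128
moe_intermediate_size = 1024
hidden_size = 2048

max_dispatch_tokens = ep_size * num_max_dispatch_tokens_per_rank

# Buffer sized to twice the intermediate size, matching the shape in the
# first DeepGemm hunk (commonly the fused gate/up projection width).
ffn_in_buffer = paddle.zeros(
    [num_local_experts, max_dispatch_tokens, moe_intermediate_size * 2],
    dtype=paddle.bfloat16,
)

# Buffer in the model's hidden size, matching the second DeepGemm hunk.
ffn_out_buffer = paddle.zeros(
    [num_local_experts, max_dispatch_tokens, hidden_size],
    dtype=paddle.bfloat16,
)

print(ffn_in_buffer.shape, ffn_out_buffer.shape)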