mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-22 00:02:10 +08:00
[Iluvatar GPU] Optimize attention performance and fix moe load ckpt error (#3651)
This commit is contained in:
@@ -53,6 +53,7 @@ from fastdeploy.model_executor.models.model_base import (
from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm
from fastdeploy.model_executor.models.utils import LayerIdPlaceholder as layerid
from fastdeploy.model_executor.models.utils import WeightMeta
from fastdeploy.platforms import current_platform
from fastdeploy.worker.experts_manager import RedundantExpertManger
@@ -464,6 +465,9 @@ class Ernie4_5_Model(nn.Layer):
    ):
        hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding)

        if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
            hidden_states = forward_meta.attn_backend.transpose(hidden_states)

        residual = None
        for i in range(self.num_layers):
            hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual)
@@ -472,6 +476,9 @@ class Ernie4_5_Model(nn.Layer):

        out = self.norm(hidden_states)

        if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
            out = forward_meta.attn_backend.reverse_transpose(out)

        return out
Reference in New Issue
Block a user