[Iluvatar GPU] Optimize attention performance and fix moe load ckpt error (#3651)

commit 504461b6b5
parent 5532e8a323
Author: yzwu
Date: 2025-09-22 21:13:59 +08:00
Committed by: GitHub

17 changed files with 1344 additions and 363 deletions


@@ -53,6 +53,7 @@ from fastdeploy.model_executor.models.model_base import (
 from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm
 from fastdeploy.model_executor.models.utils import LayerIdPlaceholder as layerid
 from fastdeploy.model_executor.models.utils import WeightMeta
+from fastdeploy.platforms import current_platform
 from fastdeploy.worker.experts_manager import RedundantExpertManger
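The only addition in this hunk is the `current_platform` import; it enables the Iluvatar-only gate used in the two hunks below. As a hedged illustration of that dispatch pattern, here is a standalone helper mirroring the gate the diff inlines (the helper name `_maybe_to_mixed_layout` is hypothetical, not part of FastDeploy):

import paddle  # FastDeploy models are Paddle-based
from fastdeploy.platforms import current_platform  # real import added by this commit

def _maybe_to_mixed_layout(hidden_states, attn_backend):
    """Hypothetical helper sketching the platform gate from this diff.

    Only Iluvatar GPUs with a mixed attention backend need the layout
    change; every other platform returns the tensor unchanged.
    """
    if current_platform.is_iluvatar() and getattr(attn_backend, "mixed", False):
        return attn_backend.transpose(hidden_states)
    return hidden_states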
@@ -464,6 +465,9 @@ class Ernie4_5_Model(nn.Layer):
     ):
         hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding)
+        if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
+            hidden_states = forward_meta.attn_backend.transpose(hidden_states)
         residual = None
         for i in range(self.num_layers):
             hidden_states, residual = self.layers[i](forward_meta, hidden_states, residual)
@@ -472,6 +476,9 @@ class Ernie4_5_Model(nn.Layer):
         out = self.norm(hidden_states)
+        if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
+            out = forward_meta.attn_backend.reverse_transpose(out)
         return out
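For intuition, here is a minimal, self-contained sketch of the round-trip the model now performs on Iluvatar GPUs: transpose the hidden states once before the decoder layers, run the layers in the transposed layout, then reverse-transpose after the final norm. `SketchMixedBackend` and the NumPy shapes below are illustrative stand-ins, not FastDeploy's actual Iluvatar attention backend:

import numpy as np

class SketchMixedBackend:
    """Toy stand-in for forward_meta.attn_backend on Iluvatar (hypothetical)."""

    mixed = True  # the real flag's exact semantics are internal to the backend

    def transpose(self, hidden_states):
        # Assumed layout change: [num_tokens, hidden_size] -> [hidden_size, num_tokens].
        return np.ascontiguousarray(hidden_states.T)

    def reverse_transpose(self, hidden_states):
        # Undo transpose() so downstream consumers see the original layout.
        return np.ascontiguousarray(hidden_states.T)

backend = SketchMixedBackend()
x = np.random.rand(8, 16).astype("float32")  # toy hidden states
y = backend.reverse_transpose(backend.transpose(x))
assert np.array_equal(x, y)  # the round-trip must be lossless

The property the diff relies on is that `reverse_transpose` exactly inverts `transpose`, so the attention-side layout optimization stays invisible to the rest of the model.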