mirror of https://github.com/PaddlePaddle/FastDeploy.git
【Inference Optimize】DeepSeek-v3 model inference performance optimization (#3455)
* DSK_OPT_01
* update FA3
@@ -316,30 +316,23 @@ class DeepseekV3MLAAttention(nn.Layer):
         mask_encoder_batch: paddle.Tensor,
     ):
         """ """
         layernorm_out = hidden_states
-        fmha_out = paddle.zeros(
-            shape=[
-                layernorm_out.shape[0],
-                self.num_attention_heads_tp * self.v_head_dim,
-            ],
-            dtype=layernorm_out.dtype,
-        )
+
+        # NOTE: (changwenbin) Bring out the public calculation in PD MIX to avoid repeated calculation.
+        fmha_out = None
+        query = self.q_a_proj(hidden_states)
+        query = self.q_a_layernorm(query)
+        query = self.q_b_proj(query)
+        query = query.reshape([-1, self.num_attention_heads_tp, self.qk_head_dim])
+        query_nope, query_pe = query.split([self.qk_nope_head_dim, self.qk_rope_head_dim], axis=-1)
+
+        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
+        compressed_kv, key_pe = compressed_kv.split([self.kv_lora_rank, self.qk_rope_head_dim], axis=-1)
+        key_pe = key_pe.reshape([-1, 1, self.qk_rope_head_dim])
+        compressed_kv = self.kv_a_layernorm(compressed_kv)
+
+        query_pe, key_pe = self.rotary_emb(position_ids, query_pe, key_pe)
 
         if forward_meta.max_len_tensor_cpu[1]:  # max_enc_len_this_time
-            query = self.q_a_proj(layernorm_out)
-            query = self.q_a_layernorm(query)
-            query = self.q_b_proj(query)
-
-            query = query.reshape([-1, self.num_attention_heads_tp, self.qk_head_dim])
-            query_nope, query_pe = query.split([self.qk_nope_head_dim, self.qk_rope_head_dim], axis=-1)
-
-            compressed_kv = self.kv_a_proj_with_mqa(layernorm_out)
-            compressed_kv, key_pe = compressed_kv.split([self.kv_lora_rank, self.qk_rope_head_dim], axis=-1)
-            key_pe = key_pe.reshape([-1, 1, self.qk_rope_head_dim])
-            compressed_kv = self.kv_a_layernorm(compressed_kv)
-
-            query_pe, key_pe = self.rotary_emb(position_ids, query_pe, key_pe)
-
             key_value = self.kv_b_proj(compressed_kv)
             key_value = key_value.reshape(
                 [
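This hunk is the core of the change: in a mixed prefill+decode (PD MIX) batch, the q/kv down-projections, layernorms, and rotary embedding previously ran once per branch; they now run once per forward pass, and the zero-filled accumulator is replaced by a None sentinel. Below is a minimal sketch of the restructuring, not the FastDeploy code; project, attend_prefill, and attend_decode are hypothetical stand-ins for the projection and attention steps.

import paddle

def project(x):
    # Stand-in for the shared work: q_a_proj / q_a_layernorm / q_b_proj,
    # kv_a_proj_with_mqa / kv_a_layernorm, and rotary_emb.
    return x * 2.0

def attend_prefill(q):
    return q + 1.0  # stand-in for the prefill attention path

def attend_decode(q):
    return q - 1.0  # stand-in for the decode attention path

def forward_before(x, has_prefill, has_decode):
    # Old layout: zero-filled accumulator, projections recomputed per branch.
    out = paddle.zeros_like(x)
    if has_prefill:
        out = out + attend_prefill(project(x))  # projections run here ...
    if has_decode:
        out = out + attend_decode(project(x))   # ... and again here in a PD MIX batch
    return out

def forward_after(x, has_prefill, has_decode):
    # New layout: shared projections hoisted above both branches.
    q = project(x)  # runs exactly once per forward pass
    out = None
    if has_prefill:
        out = attend_prefill(q)
    if has_decode:
        dec = attend_decode(q)
        out = dec if out is None else out + dec
    return out

x = paddle.ones([4, 8])
assert paddle.allclose(forward_before(x, True, True), forward_after(x, True, True))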
@@ -371,23 +364,9 @@ class DeepseekV3MLAAttention(nn.Layer):
             fmha_out_prefill = fmha_out_prefill.reshape([-1, self.num_attention_heads_tp * self.v_head_dim])
             fmha_out_prefill = fmha_out_prefill * mask_encoder_batch.cast(fmha_out_prefill.dtype)
 
-            fmha_out = fmha_out + fmha_out_prefill
+            fmha_out = fmha_out_prefill
 
         if forward_meta.max_len_tensor_cpu[2]:  # max_dec_len_this_time
-            query = self.q_a_proj(layernorm_out)
-            query = self.q_a_layernorm(query)
-            ln_out_or_q_c = query
-
-            compressed_kv = self.kv_a_proj_with_mqa(layernorm_out)
-            compressed_kv, key_pe = compressed_kv.split([self.kv_lora_rank, self.qk_rope_head_dim], axis=-1)
-            key_pe = key_pe.reshape([-1, 1, self.qk_rope_head_dim])
-            compressed_kv = self.kv_a_layernorm(compressed_kv)
-
-            query = self.q_b_proj(ln_out_or_q_c)
-            query = query.reshape([-1, self.num_attention_heads_tp, self.qk_head_dim])
-
-            query_nope, query_pe = query.split([self.qk_nope_head_dim, self.qk_rope_head_dim], axis=-1)
-            query_pe, key_pe = self.rotary_emb(position_ids, query_pe, key_pe)
-
             q_nope_out = self.kv_b_proj_bmm(query_nope.transpose([1, 0, 2]), proj_type="k").transpose([1, 0, 2])
 
             q_input = paddle.concat([q_nope_out, query_pe], axis=-1)
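With the duplicated projections gone, the decode branch consumes the hoisted query_nope and query_pe directly: query_nope is mapped per head into the compressed (latent) KV space via kv_b_proj_bmm and concatenated with the rotary part. A rough shape-flow sketch follows, assuming hypothetical sizes and treating the projection as a plain per-head batched matmul, with w_k standing in for the kv_b_proj K-weight.

import paddle

# Hypothetical sizes: tokens, heads, nope/rope head dims, kv_lora_rank.
T, NH, D_NOPE, D_PE, R = 3, 4, 16, 8, 32

query_nope = paddle.randn([T, NH, D_NOPE])
query_pe = paddle.randn([T, NH, D_PE])
w_k = paddle.randn([NH, D_NOPE, R])  # stand-in for the absorbed kv_b_proj K-weight

# Roughly what kv_b_proj_bmm(query_nope.transpose([1, 0, 2]), proj_type="k")
# followed by .transpose([1, 0, 2]) computes: a per-head batched matmul that
# maps the non-rotary part of the query into the latent KV space.
q_nope_out = paddle.bmm(query_nope.transpose([1, 0, 2]), w_k).transpose([1, 0, 2])

q_input = paddle.concat([q_nope_out, query_pe], axis=-1)
print(q_input.shape)  # [3, 4, 40] == [T, NH, R + D_PE]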
@@ -416,7 +395,10 @@ class DeepseekV3MLAAttention(nn.Layer):
                 .transpose([1, 0, 2])
                 .reshape([-1, self.num_attention_heads_tp * self.v_head_dim])
             )
-            fmha_out = fmha_out + fmha_out_decode
+            if fmha_out is None:
+                fmha_out = fmha_out_decode
+            else:
+                fmha_out = fmha_out + fmha_out_decode
 
         output = self.o_proj(fmha_out)
         return output
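The final hunk completes the accumulator change: because fmha_out now starts as None rather than paddle.zeros(...), the decode result is taken as-is when no prefill output exists and added otherwise. A small sketch of the two patterns, with combine_with_zeros and combine_lazily as hypothetical helpers:

import paddle

def combine_with_zeros(parts, shape, dtype="float32"):
    # Old pattern: pay for a full zero-fill plus one extra elementwise add
    # even when only a single branch produced output.
    out = paddle.zeros(shape, dtype=dtype)
    for p in parts:
        out = out + p
    return out

def combine_lazily(parts):
    # New pattern: the first branch's result is taken as-is; the add only
    # happens in a genuinely mixed prefill+decode batch.
    out = None
    for p in parts:
        out = p if out is None else out + p
    return out

part = paddle.ones([4, 8])
assert paddle.allclose(combine_with_zeros([part], [4, 8]), combine_lazily([part]))

This skips a full-tensor zero-fill and one elementwise add on the pure-prefill and pure-decode paths, the common case outside mixed batches.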