[Feature][MTP]Support MTP for rl-model (#4009)

* qk norm for speculate decode C16

* support mtp in v1_scheduler mode

* support mtp rope_3d

* support mtp features

* add unit test && del some log

---------

Co-authored-by: yuanxiaolan <yuanxiaolan01@baidu.com>
Co-authored-by: xiaoxiaohehe001 <hiteezsf@163.com>
This commit is contained in:
freeliuzc
2025-09-10 13:34:37 +08:00
committed by GitHub
parent cce2410fad
commit 2f473ba966
21 changed files with 1465 additions and 531 deletions

View File

@@ -100,6 +100,8 @@ class AppendAttentionBackend(AttentionBackend):
self.rope_3d: bool = getattr(fd_config.model_config, "rope_3d", False) or getattr(
fd_config.model_config, "use_3d_rope", False
)
if fd_config.speculative_config.model_type != "main":
self.rope_3d = False
self.causal: bool = getattr(fd_config.model_config, "causal", True)
self.speculative_method: str = fd_config.speculative_config.method
self.use_speculate: bool = self.speculative_method is not None
@@ -356,7 +358,7 @@ class AppendAttentionBackend(AttentionBackend):
getattr(layer, "cache_v_zp", None),
layer.linear_shift,
layer.linear_smooth,
forward_meta.attn_mask_offsets,
None if self.use_speculate else forward_meta.attn_mask_offsets,
metadata.kv_signal_data_list[layer.layer_id],
getattr(layer, "q_norm_weight", None),
getattr(layer, "k_norm_weight", None),
@@ -374,7 +376,7 @@ class AppendAttentionBackend(AttentionBackend):
metadata.max_partition_size,
metadata.encoder_max_partition_size,
self.speculate_max_draft_token_num + 1,
self.causal,
self.causal or self.use_speculate,
self.speculative_method is not None,
)
return res