[V1 Loader] Support DeepSeekV3(bf16) (#3294)

* Support new loader for DeepSeekV3(bf16)

* update paddle version

* remove useless attr
Zero Rains
2025-08-11 13:39:28 +08:00
committed by GitHub
parent e0aeac58e1
commit 42af0b4b64
5 changed files with 141 additions and 5 deletions


@@ -720,6 +720,7 @@ class KVBatchLinear(LinearBase):
        self.v_head_dim = v_head_dim
        # Split num_attention_heads when using TP inference.
        self.num_heads_per_partition = divide(num_attention_heads, self.nranks)
        self.local_rank = fd_config.parallel_config.tensor_parallel_rank
        # Initialize parent with combined dimensions
        super().__init__(
@@ -738,6 +739,63 @@ class KVBatchLinear(LinearBase):
        self.k_weight_key = f"{prefix.replace('kv_b_proj', 'k_b_proj')}.weight"
        self.v_weight_key = f"{prefix.replace('kv_b_proj', 'v_b_proj')}.weight"
        self.k_b_proj_weight = self.create_parameter(
            shape=[self.num_heads_per_partition, self.qk_nope_head_dim, self.kv_lora_rank],
            dtype=self.weight_dtype,
            is_bias=False,
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        self.v_b_proj_weight = self.create_parameter(
            shape=[self.num_heads_per_partition, self.kv_lora_rank, self.v_head_dim],
            dtype=self.weight_dtype,
            is_bias=False,
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        set_weight_attrs(
            self.k_b_proj_weight,
            {"weight_loader": self.weight_loader},
        )
        if self.nranks > 0:
            _set_var_distributed(self.k_b_proj_weight, split_axis=1)
            set_weight_attrs(self.k_b_proj_weight, {"output_dim": True})

    def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = None):
        output_dim = getattr(param, "output_dim", None)
        # Tensor parallelism splits the weight along the output_dim
        if output_dim is not None:
            dim = -1
            size = loaded_weight.get_shape()[dim]
            block_size = size // self.nranks
            shard_offset = self.local_rank * block_size
            shard_size = (self.local_rank + 1) * block_size
            loaded_weight = loaded_weight[..., shard_offset:shard_size]
        w = (
            get_tensor(loaded_weight)
            .reshape(
                [
                    self.kv_lora_rank,
                    self.num_heads_per_partition,
                    -1,
                ]
            )
            .transpose(perm=[1, 2, 0])
        )
        if param.dtype != w.dtype:
            w = w.cast(param.dtype)
        # Split into K and V weights
        # wk_b: [num_heads, qk_nope_head_dim, kv_lora_rank]
        wk_b = w[:, : self.qk_nope_head_dim, :]
        if self.v_head_dim is None:
            raise ValueError("self.v_head_dim should not be None")
        # wv_b: [num_heads, kv_lora_rank, v_head_dim]
        wv_b = w[:, -self.v_head_dim :, :].transpose(perm=[0, 2, 1])
        self.k_b_proj_weight.set_value(wk_b)
        self.v_b_proj_weight.set_value(wv_b)

    def load_state_dict(self, state_dict: dict):
        """
        Load the combined KV weight and split it into K and V projections

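The weight_loader above is the core of the bf16 V1 loading path for MLA: the checkpoint stores kv_b_proj.weight as one matrix of shape [kv_lora_rank, num_heads * (qk_nope_head_dim + v_head_dim)], each tensor-parallel rank slices its contiguous block of the output dim, and the slice is reshaped and split into the separate k_b_proj and v_b_proj weights. A minimal standalone sketch of that transformation, using made-up sizes (none of these values come from the real DeepSeekV3 config):

import paddle

# Illustrative sizes only; real values come from the model config.
nranks = 2                     # tensor-parallel world size
local_rank = 0                 # this rank's index
num_heads_per_partition = 2    # heads owned by this rank
qk_nope_head_dim = 4
v_head_dim = 3
kv_lora_rank = 5

# Assumed checkpoint layout: [kv_lora_rank, total_heads * (qk_nope_head_dim + v_head_dim)]
total_heads = num_heads_per_partition * nranks
loaded_weight = paddle.randn([kv_lora_rank, total_heads * (qk_nope_head_dim + v_head_dim)])

# Tensor parallelism: take this rank's contiguous block of the output dim.
block_size = loaded_weight.shape[-1] // nranks
start = local_rank * block_size
end = (local_rank + 1) * block_size
shard = loaded_weight[..., start:end]

# Bring heads to the front: [heads, qk_nope_head_dim + v_head_dim, kv_lora_rank]
w = shard.reshape([kv_lora_rank, num_heads_per_partition, -1]).transpose(perm=[1, 2, 0])

# Split into the two projections that weight_loader writes with set_value.
wk_b = w[:, :qk_nope_head_dim, :]                        # [heads, qk_nope_head_dim, kv_lora_rank]
wv_b = w[:, -v_head_dim:, :].transpose(perm=[0, 2, 1])   # [heads, kv_lora_rank, v_head_dim]

print(wk_b.shape, wv_b.shape)  # [2, 4, 5] [2, 5, 3]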

@@ -52,7 +52,7 @@ def get_moe_scores(
    compute moe scores using e_score_correction_bias.
    """
    scores = paddle.nn.functional.sigmoid(gating_output)
    scores_with_bias = scores + e_score_correction_bias.unsqueeze(0)
    scores_with_bias = scores + e_score_correction_bias
    scores, topk_values, topk_idx = noaux_tc(
        scores,
        scores_with_bias,

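The one-line change above drops the .unsqueeze(0) on e_score_correction_bias: with the new loader the bias carries an explicit leading dim (see the FusedMoE reshape in the next file), so plain addition already broadcasts over the token dimension. A quick sketch of the broadcasting this relies on, with illustrative shapes only:

import paddle

# Illustrative shapes: 4 routed tokens, 8 experts.
gating_output = paddle.randn([4, 8])
scores = paddle.nn.functional.sigmoid(gating_output)

# Bias kept as [1, num_experts]; no unsqueeze needed before adding.
e_score_correction_bias = paddle.zeros([1, 8])
scores_with_bias = scores + e_score_correction_bias
print(scores_with_bias.shape)  # [4, 8]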

@@ -508,10 +508,11 @@ class FusedMoE(nn.Layer):
                gate_correction_bias_tensor = self.extract_gate_correction_bias(
                    self.gate_correction_bias_key, state_dict
                )
                if self.gate_correction_bias.shape != gate_correction_bias_tensor.shape:
                    gate_correction_bias_tensor = gate_correction_bias_tensor.reshape(self.gate_correction_bias.shape)
                self.gate_correction_bias.set_value(gate_correction_bias_tensor)
            else:
                self.gate_correction_bias = None
        else:
            self.gate_correction_bias = None
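
The added shape check makes gate correction bias loading tolerant of layout differences between the parameter and the checkpoint tensor, e.g. [1, num_experts] versus a flat [num_experts]. A hedged sketch of the guard in isolation (the parameter creation and shapes here are assumptions for illustration, not the FusedMoE code):

import paddle

num_experts = 8

# Suppose the layer allocates the bias as [1, num_experts] ...
gate_correction_bias = paddle.create_parameter(
    shape=[1, num_experts],
    dtype="float32",
    default_initializer=paddle.nn.initializer.Constant(0),
)
# ... while the checkpoint stores e_score_correction_bias as a flat [num_experts].
loaded = paddle.randn([num_experts])

# The check added in this commit folds the tensor into the parameter's layout first.
if gate_correction_bias.shape != loaded.shape:
    loaded = loaded.reshape(gate_correction_bias.shape)
gate_correction_bias.set_value(loaded)
print(gate_correction_bias.shape)  # [1, 8]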