mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 08:37:06 +08:00
[fix] w4a8 model loading and hadamard config (#3013)
This commit is contained in:
@@ -1000,7 +1000,10 @@ class LLMEngine:
|
|||||||
"FLAGS_use_append_attn": 1,
|
"FLAGS_use_append_attn": 1,
|
||||||
"NCCL_ALGO": "Ring",
|
"NCCL_ALGO": "Ring",
|
||||||
"FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 32768)),
|
"FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 32768)),
|
||||||
"FLAGS_hardamard_moe_block_size": 128,
|
"FLAGS_hardamard_moe_block_size": int(os.getenv("FLAGS_hardamard_moe_block_size", 128)),
|
||||||
|
"FLAGS_hardamard_use_diagonal_block_matrix": int(
|
||||||
|
os.getenv("FLAGS_hardamard_use_diagonal_block_matrix", 0)
|
||||||
|
),
|
||||||
}
|
}
|
||||||
# environment variables needed by Dy2St
|
# environment variables needed by Dy2St
|
||||||
variables.update(
|
variables.update(
|
||||||
|
@@ -213,8 +213,11 @@ class CutlassMoEMethod(MoEMethodBase):
|
|||||||
"""
|
"""
|
||||||
# 1. Select topk experts and weights
|
# 1. Select topk experts and weights
|
||||||
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out)
|
topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out)
|
||||||
|
expertwise_scale = getattr(layer, "up_gate_proj_in_scale_all_experts")
|
||||||
# 2. EP Dispatch
|
# 2. EP Dispatch
|
||||||
permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(x, topk_idx, topk_weights)
|
permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
|
||||||
|
x, topk_idx, topk_weights, expertwise_scale=expertwise_scale
|
||||||
|
)
|
||||||
# 3. Compute ffn
|
# 3. Compute ffn
|
||||||
if self.moe_quant_type == "w4a8":
|
if self.moe_quant_type == "w4a8":
|
||||||
num_local_experts, max_num, _ = permute_input.shape
|
num_local_experts, max_num, _ = permute_input.shape
|
||||||
@@ -376,6 +379,7 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod):
|
|||||||
# 1. Init scale containers and maps
|
# 1. Init scale containers and maps
|
||||||
up_gate_proj_weight_scales = []
|
up_gate_proj_weight_scales = []
|
||||||
down_proj_weight_scales = []
|
down_proj_weight_scales = []
|
||||||
|
up_gate_proj_in_scales_all_experts = []
|
||||||
up_gate_proj_in_scales = []
|
up_gate_proj_in_scales = []
|
||||||
down_proj_in_scales = []
|
down_proj_in_scales = []
|
||||||
|
|
||||||
@@ -396,9 +400,16 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod):
|
|||||||
raise ValueError(f"scale {name} should not be none in w4a8 mode.")
|
raise ValueError(f"scale {name} should not be none in w4a8 mode.")
|
||||||
|
|
||||||
# 2. Extract scale tensor from state dict
|
# 2. Extract scale tensor from state dict
|
||||||
|
if layer.ep_size > 1:
|
||||||
|
for expert_idx in range(layer.num_experts):
|
||||||
|
scale_tensor = get_tensor(state_dict[scale_key_map["up_gate_proj_in_scale"].format(expert_idx)])
|
||||||
|
up_gate_proj_in_scales_all_experts.append(1 / scale_tensor)
|
||||||
|
create_and_set_parameter(
|
||||||
|
layer, "up_gate_proj_in_scale_all_experts", paddle.concat(up_gate_proj_in_scales_all_experts)
|
||||||
|
)
|
||||||
|
|
||||||
for local_expert_idx in range(layer.num_local_experts):
|
for local_expert_idx in range(layer.num_local_experts):
|
||||||
expert_idx = local_expert_idx + layer.expert_id_offset * layer.num_local_experts
|
expert_idx = local_expert_idx + layer.expert_id_offset
|
||||||
for name, scale_key_template in scale_key_map.items():
|
for name, scale_key_template in scale_key_map.items():
|
||||||
scale_tensor = _extract_scale_tensor(state_dict, scale_key_template, expert_idx)
|
scale_tensor = _extract_scale_tensor(state_dict, scale_key_template, expert_idx)
|
||||||
scale_weight_map[name].append(scale_tensor)
|
scale_weight_map[name].append(scale_tensor)
|
||||||
|
@@ -19,6 +19,9 @@ from paddle import nn
|
|||||||
from paddleformers.utils.log import logger
|
from paddleformers.utils.log import logger
|
||||||
|
|
||||||
from fastdeploy import envs
|
from fastdeploy import envs
|
||||||
|
from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import (
|
||||||
|
CutlassW4A8MoEMethod,
|
||||||
|
)
|
||||||
from fastdeploy.model_executor.layers.utils import get_tensor
|
from fastdeploy.model_executor.layers.utils import get_tensor
|
||||||
from fastdeploy.worker.experts_manager import RedundantExpertManger
|
from fastdeploy.worker.experts_manager import RedundantExpertManger
|
||||||
|
|
||||||
@@ -385,6 +388,9 @@ class FusedMoE(nn.Layer):
|
|||||||
self.gate_weight.set_value(gate_weight_tensor.astype("float32"))
|
self.gate_weight.set_value(gate_weight_tensor.astype("float32"))
|
||||||
|
|
||||||
if self.fd_config.model_config.is_quantized:
|
if self.fd_config.model_config.is_quantized:
|
||||||
|
if isinstance(self.quant_method, CutlassW4A8MoEMethod):
|
||||||
|
self.quant_method.create_weights(self, state_dict)
|
||||||
|
else:
|
||||||
self.quant_method.process_prequanted_weights(self, state_dict)
|
self.quant_method.process_prequanted_weights(self, state_dict)
|
||||||
else:
|
else:
|
||||||
self.quant_method.create_weights(self, state_dict)
|
self.quant_method.create_weights(self, state_dict)
|
||||||
|
@@ -101,12 +101,20 @@ def load_ep_checkpoint(model_path: str, fd_config: FDConfig, return_numpy: bool
|
|||||||
|
|
||||||
up_gate_proj_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.up_gate_proj.weight_scale"
|
up_gate_proj_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.up_gate_proj.weight_scale"
|
||||||
down_proj_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.down_proj.weight_scale"
|
down_proj_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.down_proj.weight_scale"
|
||||||
|
|
||||||
|
down_proj_in_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.down_proj.activation_scale"
|
||||||
num_local_ffn_keys.append(up_gate_proj_key)
|
num_local_ffn_keys.append(up_gate_proj_key)
|
||||||
num_local_ffn_keys.append(down_proj_key)
|
num_local_ffn_keys.append(down_proj_key)
|
||||||
num_local_ffn_keys.append(up_gate_proj_quant_key)
|
num_local_ffn_keys.append(up_gate_proj_quant_key)
|
||||||
num_local_ffn_keys.append(down_proj_quant_key)
|
num_local_ffn_keys.append(down_proj_quant_key)
|
||||||
num_local_ffn_keys.append(up_gate_proj_scale_key)
|
num_local_ffn_keys.append(up_gate_proj_scale_key)
|
||||||
num_local_ffn_keys.append(down_proj_scale_key)
|
num_local_ffn_keys.append(down_proj_scale_key)
|
||||||
|
num_local_ffn_keys.append(down_proj_in_scale_key)
|
||||||
|
|
||||||
|
# for EP w4a8, we need all expert's activation_scale for up_gate_proj
|
||||||
|
for j in range(fd_config.model_config.moe_num_experts):
|
||||||
|
up_gate_proj_in_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.up_gate_proj.activation_scale"
|
||||||
|
num_local_ffn_keys.append(up_gate_proj_in_scale_key)
|
||||||
|
|
||||||
for k in num_local_ffn_keys:
|
for k in num_local_ffn_keys:
|
||||||
if k in weight_list:
|
if k in weight_list:
|
||||||
|
Reference in New Issue
Block a user