diff --git a/fastdeploy/model_executor/layers/moe/ep.py b/fastdeploy/model_executor/layers/moe/ep.py index be40b3d04..bc851cce6 100644 --- a/fastdeploy/model_executor/layers/moe/ep.py +++ b/fastdeploy/model_executor/layers/moe/ep.py @@ -307,6 +307,7 @@ class DeepEPEngine: topk_weights: paddle.Tensor, expertwise_scale, use_fp8: bool = False, + quant_group_size: int = 128, ): if self.deepep_engine is None: raise RuntimeError("DeepEP buffer not initialized!") @@ -327,6 +328,7 @@ class DeepEPEngine: use_fp8=use_fp8, async_finish=False, return_recv_hook=True, + num_per_channel=quant_group_size, ) return packed_recv_x, packed_recv_count, handle, dispatch_hook @@ -363,6 +365,7 @@ class DeepEPEngine: topk_idx: paddle.Tensor, topk_weights: paddle.Tensor, dispatch_use_fp8: bool, + quant_group_size: int, handle, ): if self.deepep_engine is None: @@ -376,6 +379,7 @@ class DeepEPEngine: async_finish=False, dispatch_use_fp8=dispatch_use_fp8, return_recv_hook=True, + num_per_channel=quant_group_size, ) return combined_hidden_states, combine_hook @@ -644,21 +648,29 @@ class EPDecoderRunner(EPRunner): # just supports dispatch_use_fp8 = True now! 
assert use_fp8 is True recv_hidden_states, recv_expert_count, handle, dispatch_hook = ( - self.ep_engine.low_latency_dispatch_two_stage(x, topk_idx, topk_weights, expertwise_scale, use_fp8) + self.ep_engine.low_latency_dispatch_two_stage( + x, topk_idx, topk_weights, expertwise_scale, use_fp8, quant_group_size + ) ) if dispatch_hook is not None: dispatch_hook() return recv_hidden_states, recv_expert_count, handle - def combine(self, ffn_out, topk_idx, topk_weights, handle): + def combine(self, ffn_out, topk_idx, topk_weights, handle, **kwargs): + quant_group_size = kwargs.get("quant_group_size", 128) if not self.use_internode_ll_two_stage: combined_hidden_states, combine_hook = self.ep_engine.low_latency_combine( ffn_out, topk_idx, topk_weights, handle ) else: combined_hidden_states, combine_hook = self.ep_engine.low_latency_combine_two_stage( - ffn_out, topk_idx, topk_weights, True, handle # just supports dispatch_use_fp8 = True now! + ffn_out, + topk_idx, + topk_weights, + True, # just supports dispatch_use_fp8 = True now! + quant_group_size, + handle, ) if combine_hook is not None: combine_hook() diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py index 65a069658..a2a335ed7 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py @@ -263,7 +263,9 @@ class CutlassMoEMethod(UnquantizedFusedMoEMethod): ) # 4.
EP combine - return self.ep_decoder_runner.combine(ffn_out, topk_idx, topk_weights, handle) + return self.ep_decoder_runner.combine( + ffn_out, topk_idx, topk_weights, handle, quant_group_size=quant_group_size + ) def apply_tp( self, @@ -759,8 +761,9 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod): down_proj_expert_weight_key = layer.weight_key_map.get("down_proj_expert_weight_key", None) up_gate_proj_expert_weight_scale_key = layer.weight_key_map.get("up_gate_proj_expert_weight_scale_key", None) down_proj_expert_weight_scale_key = layer.weight_key_map.get("down_proj_expert_weight_scale_key", None) - up_gate_proj_expert_in_scale_key = layer.weight_key_map.get("up_gate_proj_expert_in_scale_key", None) - down_proj_expert_in_scale_key = layer.weight_key_map.get("down_proj_expert_in_scale_key", None) + if not layer.moe_quant_config.moe_dynamic_quant: + up_gate_proj_expert_in_scale_key = layer.weight_key_map.get("up_gate_proj_expert_in_scale_key", None) + down_proj_expert_in_scale_key = layer.weight_key_map.get("down_proj_expert_in_scale_key", None) up_gate_proj_weights, down_proj_weights, logical_expert_ids, ep_rank_to_expert_id_list = ( layer.load_experts_weight( @@ -780,7 +783,7 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod): if isinstance(state_dict, list): state_dict = dict(state_dict) - if layer.ep_size > 1: + if layer.ep_size > 1 and not layer.moe_quant_config.moe_dynamic_quant: for expert_idx in ep_rank_to_expert_id_list: scale_tensor = get_tensor( ( @@ -813,44 +816,54 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod): layer.fd_config.model_config.model, ) ) - up_gate_proj_in_scale.append( - get_tensor( - ( - state_dict.pop(up_gate_proj_expert_in_scale_key.format(expert_idx)) - if up_gate_proj_expert_in_scale_key.format(expert_idx) in state_dict - else up_gate_proj_expert_in_scale_key.format(expert_idx) - ), - layer.fd_config.model_config.model, + if not layer.moe_quant_config.moe_dynamic_quant: + up_gate_proj_in_scale.append( + get_tensor( + ( + 
state_dict.pop(up_gate_proj_expert_in_scale_key.format(expert_idx)) + if up_gate_proj_expert_in_scale_key.format(expert_idx) in state_dict + else up_gate_proj_expert_in_scale_key.format(expert_idx) + ), + layer.fd_config.model_config.model, + ) ) - ) - down_proj_in_scale.append( - get_tensor( - ( - state_dict.pop(down_proj_expert_in_scale_key.format(expert_idx)) - if down_proj_expert_in_scale_key.format(expert_idx) in state_dict - else down_proj_expert_in_scale_key.format(expert_idx) - ), - layer.fd_config.model_config.model, + down_proj_in_scale.append( + get_tensor( + ( + state_dict.pop(down_proj_expert_in_scale_key.format(expert_idx)) + if down_proj_expert_in_scale_key.format(expert_idx) in state_dict + else down_proj_expert_in_scale_key.format(expert_idx) + ), + layer.fd_config.model_config.model, + ) ) - ) up_gate_proj_weight = paddle.stack(up_gate_proj_weights, axis=0) down_proj_weight = paddle.stack(down_proj_weights, axis=0) up_gate_proj_weight_scale = paddle.stack(up_gate_proj_weight_scale, axis=0) down_proj_weight_scale = paddle.stack(down_proj_weight_scale, axis=0) - up_gate_proj_in_scale_all_experts = paddle.stack(up_gate_proj_in_scale_all_experts, axis=0).squeeze() - up_gate_proj_in_scale = paddle.stack(up_gate_proj_in_scale, axis=0).squeeze() - down_proj_in_scale = paddle.stack(down_proj_in_scale, axis=0).squeeze() + if not layer.moe_quant_config.moe_dynamic_quant: + up_gate_proj_in_scale_all_experts = paddle.stack(up_gate_proj_in_scale_all_experts, axis=0).squeeze() + up_gate_proj_in_scale = paddle.stack(up_gate_proj_in_scale, axis=0).squeeze() + down_proj_in_scale = paddle.stack(down_proj_in_scale, axis=0).squeeze() - name_tensor_map = { - "up_gate_proj_weight": up_gate_proj_weight, - "down_proj_weight": down_proj_weight, - "up_gate_proj_weight_scale": up_gate_proj_weight_scale, - "down_proj_weight_scale": down_proj_weight_scale, - "up_gate_proj_in_scale_all_experts": up_gate_proj_in_scale_all_experts, - "up_gate_proj_in_scale": 
up_gate_proj_in_scale, - "down_proj_in_scale": down_proj_in_scale, - } + if not layer.moe_quant_config.moe_dynamic_quant: + name_tensor_map = { + "up_gate_proj_weight": up_gate_proj_weight, + "down_proj_weight": down_proj_weight, + "up_gate_proj_weight_scale": up_gate_proj_weight_scale, + "down_proj_weight_scale": down_proj_weight_scale, + "up_gate_proj_in_scale_all_experts": up_gate_proj_in_scale_all_experts, + "up_gate_proj_in_scale": up_gate_proj_in_scale, + "down_proj_in_scale": down_proj_in_scale, + } + else: + name_tensor_map = { + "up_gate_proj_weight": up_gate_proj_weight, + "down_proj_weight": down_proj_weight, + "up_gate_proj_weight_scale": up_gate_proj_weight_scale, + "down_proj_weight_scale": down_proj_weight_scale, + } for name, tensor in name_tensor_map.items(): getattr(layer, name).set_value(tensor) @@ -1007,11 +1020,27 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod): # weight_scales if layer.is_quantized: + if not layer.moe_quant_config.moe_dynamic_quant: + up_gate_proj_weight_scale_shape = [layer.num_local_experts, layer.moe_intermediate_size * 2] + down_proj_weight_scale_shape = [layer.num_local_experts, layer.hidden_size] + else: + up_gate_proj_weight_scale_shape = [ + layer.num_local_experts, + layer.moe_intermediate_size * 2 // 128, + layer.hidden_size // 128, + 128, + ] + down_proj_weight_scale_shape = [ + layer.num_local_experts, + layer.hidden_size // 128, + layer.moe_intermediate_size // 128, + 128, + ] setattr( layer, "up_gate_proj_weight_scale", layer.create_parameter( - shape=[layer.num_local_experts, layer.moe_intermediate_size * 2], + shape=up_gate_proj_weight_scale_shape, dtype="float32", default_initializer=paddle.nn.initializer.Constant(0), ), @@ -1020,7 +1049,7 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod): layer, "down_proj_weight_scale", layer.create_parameter( - shape=[layer.num_local_experts, layer.hidden_size], + shape=down_proj_weight_scale_shape, dtype="float32", default_initializer=paddle.nn.initializer.Constant(0), 
), diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index 754f0f974..4ad070aab 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -204,8 +204,10 @@ class FusedMoE(nn.Layer): self._dtype = self._helper.get_default_dtype() self.weight_dtype = self._dtype - self.is_quantized = fd_config.model_config.is_quantized and not ( - fd_config.quant_config.name() == "mix_quant" and fd_config.quant_config.moe_quant_type is None + self.is_moe_quantized = getattr(self.fd_config.model_config, "is_moe_quantized", False) + self.is_quantized = self.is_moe_quantized or ( + fd_config.model_config.is_quantized + and not (fd_config.quant_config.name() == "mix_quant" and fd_config.quant_config.moe_quant_type is None) ) moe_quant_config = fd_config.quant_config self.moe_quant_config = moe_quant_config diff --git a/fastdeploy/model_executor/layers/quantization/mix_quant.py b/fastdeploy/model_executor/layers/quantization/mix_quant.py index 95baecb78..2956d5063 100644 --- a/fastdeploy/model_executor/layers/quantization/mix_quant.py +++ b/fastdeploy/model_executor/layers/quantization/mix_quant.py @@ -40,6 +40,7 @@ class MixQuantConfig(QuantConfigBase): is_quantized: bool = False, hadamard_block_size: int = 128, moe_dynamic_quant: bool = False, + is_moe_quantized: bool = False, ) -> None: super().__init__() self.dense_quant_type = dense_quant_type @@ -59,6 +60,7 @@ class MixQuantConfig(QuantConfigBase): self.is_quantized = is_quantized self.hadamard_block_size = hadamard_block_size self.moe_dynamic_quant = moe_dynamic_quant + self.is_moe_quantized = is_moe_quantized def name(self) -> str: return "mix_quant" @@ -76,6 +78,7 @@ class MixQuantConfig(QuantConfigBase): config.get("is_quantized", False), config.get("hadamard_block_size", 128), config.get("moe_dynamic_quant", False), + config.get("is_moe_quantized", False), ) def get_quant_method(self, layer) -> Optional[QuantMethodBase]: @@ 
-102,7 +105,7 @@ class MixQuantConfig(QuantConfigBase): .from_config( { "is_permuted": self.is_permuted, - "is_quantized": not self.is_checkpoint_bf16, + "is_quantized": not self.is_checkpoint_bf16 or self.is_moe_quantized, "hadamard_block_size": self.hadamard_block_size, } ) diff --git a/tests/model_executor/test_ep.py b/tests/model_executor/test_ep.py index ed3e81647..38d1b8e11 100644 --- a/tests/model_executor/test_ep.py +++ b/tests/model_executor/test_ep.py @@ -151,6 +151,7 @@ class _RecordingBuffer: use_fp8, async_finish, return_recv_hook, + num_per_channel, ): call = { "hidden_states": hidden_states, @@ -161,6 +162,7 @@ class _RecordingBuffer: "use_fp8": use_fp8, "async_finish": async_finish, "return_recv_hook": return_recv_hook, + "num_per_channel": num_per_channel, "hook_called": False, } self.low_latency_dispatch_two_stage_calls.append(call) @@ -204,6 +206,7 @@ class _RecordingBuffer: async_finish, dispatch_use_fp8, return_recv_hook, + num_per_channel, ): call = { "hidden_states": hidden_states, @@ -213,6 +216,7 @@ class _RecordingBuffer: "async_finish": async_finish, "dispatch_use_fp8": dispatch_use_fp8, "return_recv_hook": return_recv_hook, + "num_per_channel": num_per_channel, "hook_called": False, } self.low_latency_combine_two_stage_calls.append(call)