mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-07 09:31:35 +08:00
【BugFix】add moe noaux_tc tactics in triton backend (#3821)
* add moe noaux_tc tactics in triton backend * fix * add dp config
This commit is contained in:
@@ -24,6 +24,7 @@ from fastdeploy.model_executor.utils import TensorTracker, set_weight_attrs
|
|||||||
from fastdeploy.utils import ceil_div
|
from fastdeploy.utils import ceil_div
|
||||||
|
|
||||||
from ..quantization.quant_base import QuantMethodBase
|
from ..quantization.quant_base import QuantMethodBase
|
||||||
|
from .ep import get_moe_scores
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess_func
|
from fastdeploy.model_executor.ops.gpu import tritonmoe_preprocess_func
|
||||||
@@ -167,13 +168,24 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase):
|
|||||||
moe_intermediate_size = layer.moe_intermediate_size
|
moe_intermediate_size = layer.moe_intermediate_size
|
||||||
hidden_size = layer.hidden_size
|
hidden_size = layer.hidden_size
|
||||||
|
|
||||||
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
|
if layer.topk_method == "noaux_tc":
|
||||||
gate_out,
|
_, topk_weights, topk_ids = get_moe_scores(
|
||||||
layer.gate_correction_bias,
|
gate_out,
|
||||||
top_k,
|
layer.n_group,
|
||||||
True, # apply_norm_weight,
|
layer.topk_group,
|
||||||
False,
|
layer.top_k,
|
||||||
)
|
layer.routed_scaling_factor,
|
||||||
|
layer.gate_correction_bias,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
|
||||||
|
gate_out,
|
||||||
|
layer.gate_correction_bias,
|
||||||
|
layer.top_k,
|
||||||
|
True, # apply_norm_weight
|
||||||
|
False,
|
||||||
|
)
|
||||||
|
|
||||||
up_gate_proj_out = paddle.empty(
|
up_gate_proj_out = paddle.empty(
|
||||||
[token_num * top_k, moe_intermediate_size * 2],
|
[token_num * top_k, moe_intermediate_size * 2],
|
||||||
dtype=x.dtype,
|
dtype=x.dtype,
|
||||||
@@ -419,13 +431,25 @@ class TensorWiseFP8MoEMethod(QuantMethodBase):
|
|||||||
moe_intermediate_size = layer.moe_intermediate_size
|
moe_intermediate_size = layer.moe_intermediate_size
|
||||||
hidden_size = layer.hidden_size
|
hidden_size = layer.hidden_size
|
||||||
|
|
||||||
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
|
if layer.topk_method == "noaux_tc":
|
||||||
gate_out,
|
|
||||||
layer.gate_correction_bias,
|
_, topk_weights, topk_ids = get_moe_scores(
|
||||||
top_k,
|
gate_out,
|
||||||
True, # apply_norm_weight,
|
layer.n_group,
|
||||||
False,
|
layer.topk_group,
|
||||||
)
|
layer.top_k,
|
||||||
|
layer.routed_scaling_factor,
|
||||||
|
layer.gate_correction_bias,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
|
||||||
|
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
|
||||||
|
gate_out,
|
||||||
|
layer.gate_correction_bias,
|
||||||
|
top_k,
|
||||||
|
True, # apply_norm_weight,
|
||||||
|
False,
|
||||||
|
)
|
||||||
|
|
||||||
up_gate_proj_out = paddle.empty(
|
up_gate_proj_out = paddle.empty(
|
||||||
[token_num * top_k, moe_intermediate_size * 2],
|
[token_num * top_k, moe_intermediate_size * 2],
|
||||||
@@ -829,13 +853,23 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
|
|||||||
E, N1, _ = getattr(layer, self.added_weight_attrs[0]).shape
|
E, N1, _ = getattr(layer, self.added_weight_attrs[0]).shape
|
||||||
N2 = getattr(layer, self.added_weight_attrs[1]).shape[1]
|
N2 = getattr(layer, self.added_weight_attrs[1]).shape[1]
|
||||||
|
|
||||||
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
|
if layer.topk_method == "noaux_tc":
|
||||||
gate_out,
|
_, topk_weights, topk_ids = get_moe_scores(
|
||||||
layer.gate_correction_bias,
|
gate_out,
|
||||||
layer.top_k,
|
layer.n_group,
|
||||||
True, # apply_norm_weight
|
layer.topk_group,
|
||||||
False,
|
layer.top_k,
|
||||||
)
|
layer.routed_scaling_factor,
|
||||||
|
layer.gate_correction_bias,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select(
|
||||||
|
gate_out,
|
||||||
|
layer.gate_correction_bias,
|
||||||
|
layer.top_k,
|
||||||
|
True, # apply_norm_weight
|
||||||
|
False,
|
||||||
|
)
|
||||||
|
|
||||||
config = {
|
config = {
|
||||||
"BLOCK_SIZE_M": 64,
|
"BLOCK_SIZE_M": 64,
|
||||||
|
@@ -60,6 +60,7 @@ class RolloutModelConfig:
|
|||||||
early_stop_config: str = None,
|
early_stop_config: str = None,
|
||||||
local_rank: int = 0,
|
local_rank: int = 0,
|
||||||
moba_attention_config: str = None,
|
moba_attention_config: str = None,
|
||||||
|
data_parallel_size: int = 1,
|
||||||
):
|
):
|
||||||
# Required parameters
|
# Required parameters
|
||||||
self.model = model_name_or_path
|
self.model = model_name_or_path
|
||||||
@@ -95,6 +96,7 @@ class RolloutModelConfig:
|
|||||||
self.splitwise_role = splitwise_role
|
self.splitwise_role = splitwise_role
|
||||||
self.expert_parallel_size = expert_parallel_size
|
self.expert_parallel_size = expert_parallel_size
|
||||||
self.enable_expert_parallel = enable_expert_parallel
|
self.enable_expert_parallel = enable_expert_parallel
|
||||||
|
self.data_parallel_size = data_parallel_size
|
||||||
self.ori_vocab_size = ori_vocab_size
|
self.ori_vocab_size = ori_vocab_size
|
||||||
self.quantization = quantization
|
self.quantization = quantization
|
||||||
self.guided_decoding_backend = guided_decoding_backend
|
self.guided_decoding_backend = guided_decoding_backend
|
||||||
|
Reference in New Issue
Block a user