【Inference Optimize】Support wint2 Triton kernel via triton_utils_v2 (#2842)

* update supported_models doc
AIbin
2025-07-15 14:35:40 +08:00
committed by GitHub
parent 15c8c240b5
commit fd91da7b41
4 changed files with 398 additions and 6 deletions

@@ -126,7 +126,7 @@ class WINT2Config(QuantConfigBase):
             layer (Layer): The layer for which the quantization method should be retrieved.
 
         Returns:
-            QuantMethodBase: The quantization method associated with the given layer.
+            QuantMethodBase: The quantization method associated with the given layer.
         """
         if isinstance(layer, FusedMoE):
             if layer.layer_idx <= self.moe_w4_quant_end_layer:
@@ -135,8 +135,8 @@ class WINT2Config(QuantConfigBase):
                     {}).get_quant_method(layer)
             else:
                 from fastdeploy.model_executor.layers.moe.fused_moe_wint2_backend import \
-                    CutlassWint2FusedMoeMethod
-                return CutlassWint2FusedMoeMethod(self)
+                    TritonWint2FusedMoeMethod
+                return TritonWint2FusedMoeMethod(self)
         else:
             return get_quantization_config(self.dense_quant_type).from_config(
                 {}).get_quant_method(layer)
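
For readers skimming the diff, below is a minimal, runnable sketch of the dispatch the second hunk rewires. QuantMethodBase, FusedMoE, Wint4MoeMethod, and DenseQuantMethod here are hypothetical stand-ins rather than FastDeploy's real classes; only the branching mirrors the code above: MoE layers at or below moe_w4_quant_end_layer keep the WINT4 path, deeper MoE layers get the (now Triton-backed) WINT2 method, and everything else falls through to the dense quantization path.

    # Sketch only: stand-in classes, not FastDeploy's real implementations.
    from dataclasses import dataclass


    class QuantMethodBase:
        """Stand-in for the per-layer quantization method interface."""


    class Wint4MoeMethod(QuantMethodBase):
        """Stand-in for the WINT4 method used for the early MoE layers."""


    class TritonWint2FusedMoeMethod(QuantMethodBase):
        """Stand-in for the Triton-backed WINT2 MoE method this commit selects."""

        def __init__(self, quant_config):
            self.quant_config = quant_config


    class DenseQuantMethod(QuantMethodBase):
        """Stand-in for the dense_quant_type path taken by non-MoE layers."""


    @dataclass
    class FusedMoE:
        layer_idx: int


    @dataclass
    class Wint2ConfigSketch:
        moe_w4_quant_end_layer: int

        def get_quant_method(self, layer) -> QuantMethodBase:
            if isinstance(layer, FusedMoE):
                if layer.layer_idx <= self.moe_w4_quant_end_layer:
                    return Wint4MoeMethod()
                # This is the branch the commit rewires from Cutlass to Triton.
                return TritonWint2FusedMoeMethod(self)
            return DenseQuantMethod()


    config = Wint2ConfigSketch(moe_w4_quant_end_layer=6)
    assert isinstance(config.get_quant_method(FusedMoE(layer_idx=3)), Wint4MoeMethod)
    assert isinstance(config.get_quant_method(FusedMoE(layer_idx=20)), TritonWint2FusedMoeMethod)

Note that nothing outside that one branch changes in the diff: early MoE layers still resolve through the WINT4 quant config, and dense layers still resolve through dense_quant_type.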