[OPs] MoE support wfp8afp8(channelwise) and improve per_token_quant_fp8 (#4238)

This commit is contained in:
chen
2025-09-24 16:39:51 +08:00
committed by GitHub
parent 8b0ce8e3ab
commit 7c1fd19f0f
7 changed files with 683 additions and 33 deletions

View File

@@ -23,6 +23,7 @@ from fastdeploy.model_executor.layers.linear import (
MergedColumnParallelLinear,
QKVParallelLinear,
)
from fastdeploy.model_executor.layers.moe import FusedMoE
from fastdeploy.model_executor.layers.quantization.ops import (
cutlass_scaled_mm,
scaled_fp8_quant,
@@ -65,7 +66,14 @@ class WFP8AFP8Config(QuantConfigBase):
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
    """Select the quantization method object to use for ``layer``.

    Args:
        layer: The layer that is about to be quantized.

    Returns:
        A ``Wfp8Afp8MoEMethod`` when ``layer`` is a ``FusedMoE`` instance,
        otherwise a ``WFP8AFP8LinearMethod``. Either one is constructed
        with this config object as its only argument.
    """
    # The MoE check must run before the linear fallback; an unconditional
    # early return of the linear method would make this branch unreachable.
    if isinstance(layer, FusedMoE):
        # NOTE(review): imported at call time rather than at module scope,
        # presumably to avoid a circular import or eagerly loading the
        # Triton backend -- confirm before hoisting.
        from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
            Wfp8Afp8MoEMethod,
        )

        return Wfp8Afp8MoEMethod(self)
    return WFP8AFP8LinearMethod(self)
class WFP8AFP8LinearMethod(QuantMethodBase):