Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-06 17:17:14 +08:00.
[OPs] MoE support wfp8afp8(channelwise) and improve per_token_quant_fp8 (#4238)
This commit is contained in:
@@ -23,6 +23,7 @@ from fastdeploy.model_executor.layers.linear import (
     MergedColumnParallelLinear,
     QKVParallelLinear,
 )
+from fastdeploy.model_executor.layers.moe import FusedMoE
 from fastdeploy.model_executor.layers.quantization.ops import (
     cutlass_scaled_mm,
     scaled_fp8_quant,
@@ -65,7 +66,14 @@ class WFP8AFP8Config(QuantConfigBase):
|
||||
|
||||
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
    """Return the WFP8AFP8 quantization method appropriate for ``layer``.

    Args:
        layer: The layer being quantized. A ``FusedMoE`` layer is
            dispatched to the MoE-specific Triton backend method; any
            other layer gets the generic linear method.

    Returns:
        ``Wfp8Afp8MoEMethod`` for MoE layers, otherwise
        ``WFP8AFP8LinearMethod``, each bound to this config instance.
    """
    # BUG FIX: an unconditional `return WFP8AFP8LinearMethod(self)`
    # used to sit here, making the FusedMoE branch below unreachable.
    if isinstance(layer, FusedMoE):
        # Lazy import — presumably to avoid a circular dependency
        # between the quantization and MoE backend modules
        # (NOTE(review): confirm against the package layout).
        from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
            Wfp8Afp8MoEMethod,
        )

        return Wfp8Afp8MoEMethod(self)
    return WFP8AFP8LinearMethod(self)


class WFP8AFP8LinearMethod(QuantMethodBase):
|
||||
|
Reference in New Issue
Block a user