mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Intel HPU] enable tensor_wise_fp8 (#5324)
* [Intel HPU] enable tensor_wise_fp8
* update code based on comments
* fix code style issue
* fix bug about RP 5138
* mv kv_cache modifications to HPU backend
* fix FP8 Precision Issues
* fix FP8 Precision Issues
* Add quantization UT

---------

Co-authored-by: yanfeich <yanfei.cheng@intel.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
@@ -34,6 +34,7 @@ class KvCacheQuantzationTypes(str, Enum):
|
||||
|
||||
INT8 = "int8"
|
||||
FP8 = "float8_e4m3fn"
|
||||
FP8_E4M3 = "float8_e4m3"
|
||||
BLOCK_WISE_FP8 = "block_wise_fp8"
|
||||
INT8_ZP = "int8_zp"
|
||||
INT4_ZP = "int4_zp"
|
||||
@@ -65,6 +66,8 @@ class KvCacheQuantConfig(QuantConfigBase):
|
||||
if self.quant_type == KvCacheQuantzationTypes.INT8 or self.quant_type == KvCacheQuantzationTypes.INT8_ZP:
|
||||
self.max_bound = 127.0
|
||||
self.is_channel_wise = True
|
||||
elif self.quant_type == KvCacheQuantzationTypes.FP8_E4M3:
|
||||
self.max_bound = 240.0
|
||||
elif (
|
||||
self.quant_type == KvCacheQuantzationTypes.FP8
|
||||
or self.quant_type == KvCacheQuantzationTypes.FP8_ZP
|
||||
@@ -101,6 +104,12 @@ class KvCacheQuantConfig(QuantConfigBase):
|
||||
)
|
||||
|
||||
return XPUKVCacheMethodBase(self)
|
||||
elif current_platform.is_intel_hpu():
|
||||
from fastdeploy.model_executor.layers.backends.intel_hpu.quantization.kv_cache import (
|
||||
HPUKVCacheMethodBase,
|
||||
)
|
||||
|
||||
return HPUKVCacheMethodBase(self)
|
||||
else:
|
||||
return KVCacheMethodBase(self)
|
||||
|
||||
|
||||
@@ -19,6 +19,7 @@ from typing import Optional
|
||||
import paddle
|
||||
|
||||
from fastdeploy.model_executor.layers.moe import FusedMoE
|
||||
from fastdeploy.platforms import current_platform
|
||||
|
||||
from ..utils import get_tensor
|
||||
from .quant_base import QuantConfigBase, QuantMethodBase
|
||||
@@ -52,14 +53,28 @@ class TensorWiseFP8Config(QuantConfigBase):
|
||||
"""
|
||||
return method according to this config!
|
||||
"""
|
||||
if isinstance(layer, FusedMoE):
|
||||
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
|
||||
TensorWiseFP8MoEMethod,
|
||||
)
|
||||
if current_platform.is_intel_hpu():
|
||||
if isinstance(layer, FusedMoE):
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
HpuTensorWiseFP8MoEMethod,
|
||||
)
|
||||
|
||||
return TensorWiseFP8MoEMethod(self)
|
||||
return HpuTensorWiseFP8MoEMethod(self)
|
||||
else:
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
HpuTensorWiseFP8LinearMethod,
|
||||
)
|
||||
|
||||
return HpuTensorWiseFP8LinearMethod(self)
|
||||
else:
|
||||
return TensorWiseFP8LinearMethod(self)
|
||||
if isinstance(layer, FusedMoE):
|
||||
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
|
||||
TensorWiseFP8MoEMethod,
|
||||
)
|
||||
|
||||
return TensorWiseFP8MoEMethod(self)
|
||||
else:
|
||||
return TensorWiseFP8LinearMethod(self)
|
||||
|
||||
|
||||
class TensorWiseFP8LinearMethod(QuantMethodBase):
|
||||
|
||||
Reference in New Issue
Block a user