[Intel HPU] enable tensor_wise_fp8 (#5324)

* [Intel HPU] enable tensor_wise_fp8

* update code based on comments

* fix code style issue

* fix bug about PR 5138

* move kv_cache modifications to HPU backend

* fix FP8 Precision Issues

* fix FP8 Precision Issues

* Add quantization UT

---------

Co-authored-by: yanfeich <yanfei.cheng@intel.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
fmiao2372
2025-12-17 16:45:03 +08:00
committed by GitHub
parent 15f5112ecb
commit 404cf0ece4
17 changed files with 824 additions and 116 deletions

View File

@@ -34,6 +34,7 @@ class KvCacheQuantzationTypes(str, Enum):
INT8 = "int8"
FP8 = "float8_e4m3fn"
FP8_E4M3 = "float8_e4m3"
BLOCK_WISE_FP8 = "block_wise_fp8"
INT8_ZP = "int8_zp"
INT4_ZP = "int4_zp"
@@ -65,6 +66,8 @@ class KvCacheQuantConfig(QuantConfigBase):
if self.quant_type == KvCacheQuantzationTypes.INT8 or self.quant_type == KvCacheQuantzationTypes.INT8_ZP:
self.max_bound = 127.0
self.is_channel_wise = True
elif self.quant_type == KvCacheQuantzationTypes.FP8_E4M3:
self.max_bound = 240.0
elif (
self.quant_type == KvCacheQuantzationTypes.FP8
or self.quant_type == KvCacheQuantzationTypes.FP8_ZP
@@ -101,6 +104,12 @@ class KvCacheQuantConfig(QuantConfigBase):
)
return XPUKVCacheMethodBase(self)
elif current_platform.is_intel_hpu():
from fastdeploy.model_executor.layers.backends.intel_hpu.quantization.kv_cache import (
HPUKVCacheMethodBase,
)
return HPUKVCacheMethodBase(self)
else:
return KVCacheMethodBase(self)

View File

@@ -19,6 +19,7 @@ from typing import Optional
import paddle
from fastdeploy.model_executor.layers.moe import FusedMoE
from fastdeploy.platforms import current_platform
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
@@ -52,14 +53,28 @@ class TensorWiseFP8Config(QuantConfigBase):
"""
return method according to this config!
"""
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
TensorWiseFP8MoEMethod,
)
if current_platform.is_intel_hpu():
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.backends import (
HpuTensorWiseFP8MoEMethod,
)
return TensorWiseFP8MoEMethod(self)
return HpuTensorWiseFP8MoEMethod(self)
else:
from fastdeploy.model_executor.layers.backends import (
HpuTensorWiseFP8LinearMethod,
)
return HpuTensorWiseFP8LinearMethod(self)
else:
return TensorWiseFP8LinearMethod(self)
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
TensorWiseFP8MoEMethod,
)
return TensorWiseFP8MoEMethod(self)
else:
return TensorWiseFP8LinearMethod(self)
class TensorWiseFP8LinearMethod(QuantMethodBase):