[Intel HPU] enable tensor_wise_fp8 (#5324)

* [Intel HPU] enable tensor_wise_fp8

* update code based on comments

* fix code style issue

* fix bug about PR 5138

* move kv_cache modifications to HPU backend

* fix FP8 Precision Issues

* fix FP8 Precision Issues

* Add quantization UT

---------

Co-authored-by: yanfeich <yanfei.cheng@intel.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
This commit is contained in:
fmiao2372
2025-12-17 16:45:03 +08:00
committed by GitHub
parent 15f5112ecb
commit 404cf0ece4
17 changed files with 824 additions and 116 deletions

View File

@@ -34,6 +34,7 @@ class KvCacheQuantzationTypes(str, Enum):
INT8 = "int8"
FP8 = "float8_e4m3fn"
FP8_E4M3 = "float8_e4m3"
BLOCK_WISE_FP8 = "block_wise_fp8"
INT8_ZP = "int8_zp"
INT4_ZP = "int4_zp"
@@ -65,6 +66,8 @@ class KvCacheQuantConfig(QuantConfigBase):
if self.quant_type == KvCacheQuantzationTypes.INT8 or self.quant_type == KvCacheQuantzationTypes.INT8_ZP:
self.max_bound = 127.0
self.is_channel_wise = True
elif self.quant_type == KvCacheQuantzationTypes.FP8_E4M3:
self.max_bound = 240.0
elif (
self.quant_type == KvCacheQuantzationTypes.FP8
or self.quant_type == KvCacheQuantzationTypes.FP8_ZP
@@ -101,6 +104,12 @@ class KvCacheQuantConfig(QuantConfigBase):
)
return XPUKVCacheMethodBase(self)
elif current_platform.is_intel_hpu():
from fastdeploy.model_executor.layers.backends.intel_hpu.quantization.kv_cache import (
HPUKVCacheMethodBase,
)
return HPUKVCacheMethodBase(self)
else:
return KVCacheMethodBase(self)

View File

@@ -19,6 +19,7 @@ from typing import Optional
import paddle
from fastdeploy.model_executor.layers.moe import FusedMoE
from fastdeploy.platforms import current_platform
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
@@ -52,14 +53,28 @@ class TensorWiseFP8Config(QuantConfigBase):
"""
return method according to this config!
"""
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
TensorWiseFP8MoEMethod,
)
if current_platform.is_intel_hpu():
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.backends import (
HpuTensorWiseFP8MoEMethod,
)
return TensorWiseFP8MoEMethod(self)
return HpuTensorWiseFP8MoEMethod(self)
else:
from fastdeploy.model_executor.layers.backends import (
HpuTensorWiseFP8LinearMethod,
)
return HpuTensorWiseFP8LinearMethod(self)
else:
return TensorWiseFP8LinearMethod(self)
if isinstance(layer, FusedMoE):
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
TensorWiseFP8MoEMethod,
)
return TensorWiseFP8MoEMethod(self)
else:
return TensorWiseFP8LinearMethod(self)
class TensorWiseFP8LinearMethod(QuantMethodBase):