[LLM] First commit the llm deployment code

Author: jiangjiajun
Date: 2025-06-09 19:20:15 +08:00
parent 980c0a1d2c
commit 684703fd72
11814 changed files with 127294 additions and 1293102 deletions


@@ -0,0 +1,121 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from paddle import nn
from paddle.incubate.nn.functional import fused_bias_act

from fastdeploy.config import LLMConfig
from fastdeploy.platforms import current_platform


class SiluAndMul(nn.Layer):
"""
SiluAndMul Layer
"""

    def __init__(
        self,
        llm_config: LLMConfig,
        bias=None,
        act_method="gelu",
        dequant_scales=None,
        shift=None,
        smooth=None,
        quant_scale=-1,
    ):
"""
Initialize the activation layer with optional parameters for quantization, bias,
activation method, and more.
Args:
llm_config (Any): Arguments related to inference, including quantization
settings.
bias (Optional[Tensor]): Optional bias term to be added to the output.
act_method (str, optional): Activation method to be applied.
Defaults to "gelu".
dequant_scales (Optional[List[float]]): Dequantization scales, used in
quantization scenarios.
shift (Optional[float]): Shift factor, used in quantization scenarios.
smooth (Optional[float]): Smoothing factor, used for specific activation
functions.
quant_scale (float, optional): Quantization scale, used in quantization
scenarios. Defaults to -1, indicating no quantization.
Raises:
ValueError: If the default data type is not supported (only float32, float16,
and bfloat16 are supported).
"""
        super().__init__()
        if current_platform.is_cuda():
            self.forward = self.forward_cuda
        else:
            raise NotImplementedError

        self.bias = bias
        if act_method == "silu":
            act_method = "swiglu"
        self.act_method = act_method
        self.dequant_scales = dequant_scales
        self.shift = shift
        self.smooth = smooth
        self.quant_scale = quant_scale
        self.quant_round_type = llm_config.quant_config.quant_round_type
        self.quant_max_bound = llm_config.quant_config.quant_max_bound
        self.quant_min_bound = llm_config.quant_config.quant_min_bound

        self._dtype = self._helper.get_default_dtype()
        if self._dtype == "bfloat16":
            self._fuse_kernel_compute_dtype = "bf16"
        elif self._dtype == "float16":
            self._fuse_kernel_compute_dtype = "fp16"
        elif self._dtype == "float32":
            self._fuse_kernel_compute_dtype = "fp32"
        else:
            raise ValueError(
                "Only float32, float16 and bfloat16 are supported as the default "
                f"dtype, but received {self._dtype}")

        # fp8 does not support smooth quantization
        if "float8" in llm_config.model_config.act_dtype:
            self.dequant_scales = None
            self.shift = None
            self.smooth = None

    def forward_cuda(self, x):
        """
        Forward propagation of the custom activation layer.

        Args:
            x (Tensor): Input tensor to the activation layer.

        Returns:
            Tensor: Output tensor.
        """
        return fused_bias_act(
            x,
            bias=self.bias,
            act_method=self.act_method,
            compute_dtype=self._fuse_kernel_compute_dtype,
            dequant_scales=self.dequant_scales,
            shift=self.shift,
            smooth=self.smooth,
            quant_scale=self.quant_scale,
            quant_round_type=self.quant_round_type,
            quant_max_bound=self.quant_max_bound,
            quant_min_bound=self.quant_min_bound,
        )
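

# Usage sketch (illustrative only): a minimal smoke test of the layer, assuming
# a CUDA build of Paddle with the fused_bias_act op available. The stand-in
# config below only mimics the attributes __init__ reads (quant_config.* and
# model_config.act_dtype); real callers would pass a fastdeploy LLMConfig, and
# the shapes and config values here are hypothetical.
if __name__ == "__main__":
    from types import SimpleNamespace

    import paddle

    # Stand-in for LLMConfig: quantization disabled, non-fp8 activations.
    cfg = SimpleNamespace(
        quant_config=SimpleNamespace(
            quant_round_type=0, quant_max_bound=0, quant_min_bound=0),
        model_config=SimpleNamespace(act_dtype="float16"),
    )

    paddle.set_default_dtype("float16")  # read via _helper.get_default_dtype()
    act = SiluAndMul(cfg, act_method="silu")  # "silu" is mapped to "swiglu"

    # The input carries the gate and linear halves concatenated on the last axis.
    x = paddle.randn([2, 128, 2 * 4096], dtype="float32").cast("float16")
    out = act(x)  # with "swiglu", the last dimension is typically halved
    print(out.shape)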