Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-05 00:33:03 +08:00
[LLM] First commit the llm deployment code
fastdeploy/model_executor/layers/activation.py (new file, 121 lines)
@@ -0,0 +1,121 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# cipher_token=WjI1fQOvhN # do not edit this line
from paddle import nn
from paddle.incubate.nn.functional import fused_bias_act

from fastdeploy.config import LLMConfig
from fastdeploy.platforms import current_platform


class SiluAndMul(nn.Layer):
    """
    SiluAndMul Layer
    """

    def __init__(
        self,
        llm_config: LLMConfig,
        bias=None,
        act_method="gelu",
        dequant_scales=None,
        shift=None,
        smooth=None,
        quant_scale=-1,
    ):
        """
        Initialize the activation layer with optional parameters for quantization, bias,
        activation method, and more.

        Args:
            llm_config (LLMConfig): Inference-related configuration, including
                quantization settings.
            bias (Optional[Tensor]): Optional bias term to be added to the output.
            act_method (str, optional): Activation method to be applied.
                Defaults to "gelu".
            dequant_scales (Optional[List[float]]): Dequantization scales, used in
                quantization scenarios.
            shift (Optional[float]): Shift factor, used in quantization scenarios.
            smooth (Optional[float]): Smoothing factor, used for specific activation
                functions.
            quant_scale (float, optional): Quantization scale, used in quantization
                scenarios. Defaults to -1, indicating no quantization.

        Raises:
            ValueError: If the default data type is not supported (only float32,
                float16 and bfloat16 are supported).
        """
        super().__init__()

        if current_platform.is_cuda():
            self.forward = self.forward_cuda
        else:
            raise NotImplementedError

        self.bias = bias
        if act_method == "silu":
            act_method = "swiglu"

        self.act_method = act_method
        self.dequant_scales = dequant_scales
        self.shift = shift
        self.smooth = smooth
        self.quant_scale = quant_scale
        self.quant_round_type = llm_config.quant_config.quant_round_type
        self.quant_max_bound = llm_config.quant_config.quant_max_bound
        self.quant_min_bound = llm_config.quant_config.quant_min_bound

        self._dtype = self._helper.get_default_dtype()
        if self._dtype == "bfloat16":
            self._fuse_kernel_compute_dtype = "bf16"
        elif self._dtype == "float16":
            self._fuse_kernel_compute_dtype = "fp16"
        elif self._dtype == "float32":
            self._fuse_kernel_compute_dtype = "fp32"
        else:
            raise ValueError(
                f"Only float32, float16 and bfloat16 are supported as the "
                f"default dtype, but received {self._dtype}.")

        # fp8 does not support smooth quantization
        if "float8" in llm_config.model_config.act_dtype:
            self.dequant_scales = None
            self.shift = None
            self.smooth = None

    def forward_cuda(self, x):
        """
        Forward propagation of the custom activation layer.

        Args:
            x (Tensor): Input tensor to the activation layer.

        Returns:
            Tensor: Output tensor.
        """
        return fused_bias_act(
            x,
            bias=self.bias,
            act_method=self.act_method,
            compute_dtype=self._fuse_kernel_compute_dtype,
            dequant_scales=self.dequant_scales,
            shift=self.shift,
            smooth=self.smooth,
            quant_scale=self.quant_scale,
            quant_round_type=self.quant_round_type,
            quant_max_bound=self.quant_max_bound,
            quant_min_bound=self.quant_min_bound,
        )
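
For reference, the fused "swiglu" path selected above is commonly understood as computing silu(gate) * up, where the last dimension of the input holds the gate and up projections concatenated. The snippet below is a minimal unfused sketch of that reading in plain Paddle ops; the split layout is an assumption about fused_bias_act's swiglu semantics rather than something stated in this commit, and the quantization arguments are not modeled.

import paddle
import paddle.nn.functional as F


def silu_and_mul_reference(x, bias=None):
    """Unfused sketch: silu(gate) * up over the last axis.

    Assumes x has shape [..., 2 * d] with the gate and up halves
    concatenated along the last axis; the fused kernel's quantization
    arguments are ignored here.
    """
    if bias is not None:
        x = x + bias
    gate, up = paddle.chunk(x, chunks=2, axis=-1)
    return F.silu(gate) * up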
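
A rough usage sketch follows. The config object is a hypothetical stand-in that only fills the attributes __init__ above actually reads (quant_config.quant_round_type, quant_max_bound, quant_min_bound and model_config.act_dtype); the real fastdeploy.config.LLMConfig is constructed elsewhere in the codebase, and the layer only runs on a CUDA build of Paddle.

from types import SimpleNamespace

import paddle

from fastdeploy.model_executor.layers.activation import SiluAndMul

# Hypothetical stand-in for LLMConfig; only the fields read in __init__ are set.
llm_config = SimpleNamespace(
    quant_config=SimpleNamespace(quant_round_type=0,      # placeholder values,
                                 quant_max_bound=127.0,   # unused when quant_scale=-1
                                 quant_min_bound=-127.0),
    model_config=SimpleNamespace(act_dtype="bfloat16"),
)

paddle.set_default_dtype("float16")                # selects the "fp16" compute path
act = SiluAndMul(llm_config, act_method="silu")    # "silu" is rewritten to "swiglu"

x = paddle.randn([2, 8192], dtype="float32").astype("float16")  # last dim = 2 * hidden
y = act(x)                                         # shape [2, 4096]; CUDA only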