Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-08 18:11:00 +08:00
polish code with new pre-commit rule (#2923)
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+
 from .cutlass_scaled_mm import cutlass_scaled_mm
 from .scaled_fp8_quant import scaled_fp8_quant
 
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+
 from typing import Optional
 
 import paddle
@@ -20,12 +21,14 @@ import paddle
 import fastdeploy
 
 
-def cutlass_scaled_mm(a: paddle.Tensor,
-                      b: paddle.Tensor,
-                      scale_a: paddle.Tensor,
-                      scale_b: paddle.Tensor,
-                      out_dtype: paddle.dtype,
-                      bias: Optional[paddle.Tensor] = None) -> paddle.Tensor:
+def cutlass_scaled_mm(
+    a: paddle.Tensor,
+    b: paddle.Tensor,
+    scale_a: paddle.Tensor,
+    scale_b: paddle.Tensor,
+    out_dtype: paddle.dtype,
+    bias: Optional[paddle.Tensor] = None,
+) -> paddle.Tensor:
     """
     `cutlass_scaled_mm` implements a fused version of
     `output = paddle.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
@@ -48,9 +51,8 @@ def cutlass_scaled_mm(a: paddle.Tensor,
     scale_a.shape * [1, 128] == a.shape
     scale_b.shape * [128, 128] == b.shape
     """
-    assert (out_dtype == paddle.bfloat16 or out_dtype == paddle.float16)
-    assert bias is None or bias.shape[0] == b.shape[
-        0] and bias.dtype == out_dtype
+    assert out_dtype == paddle.bfloat16 or out_dtype == paddle.float16
+    assert bias is None or bias.shape[0] == b.shape[0] and bias.dtype == out_dtype
     # Ensure input tensors have valid shapes
     # assert a.numel() > 0, "Input tensor 'a' must not be empty"
     # assert b.numel() > 0, "Input tensor 'b' must not be empty"
@@ -59,12 +61,11 @@ def cutlass_scaled_mm(a: paddle.Tensor,
 
     m = a.shape[0]
     n = b.shape[0]
-    cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    cutlass_compatible_b = b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0
     assert cutlass_compatible_b
 
     out = paddle.empty([m, n], dtype=out_dtype)
-    fastdeploy.model_executor.ops.gpu.cutlass_scaled_mm(
-        out, a, b, scale_a, scale_b, bias)
+    fastdeploy.model_executor.ops.gpu.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
 
     return out
 
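Note on the hunks above: the docstring describes a fused dequantize-and-matmul, `output = paddle.mm((scale_a * a), (scale_b * b)).to(out_dtype)`, where the scales may be per-tensor, per-token, per-channel, or 128-blockwise as the `scale_a.shape * [1, 128] == a.shape` lines spell out. Below is a minimal unfused sketch of that contract, assuming `b` is stored as `[n, k]` (consistent with `n = b.shape[0]` above); the helper name `scaled_mm_reference` is ours for illustration, not a FastDeploy API.

import paddle

def scaled_mm_reference(a, b, scale_a, scale_b, out_dtype, bias=None):
    # Dequantize each operand, then matmul in float32 and cast down.
    a_deq = a.cast("float32") * scale_a                   # [m, k]; scale_a broadcasts
    b_deq = b.cast("float32") * scale_b                   # [n, k]; scale_b broadcasts
    out = paddle.matmul(a_deq, b_deq, transpose_y=True)   # [m, n]
    if bias is not None:
        out = out + bias.cast("float32")                  # bias has shape [n]
    return out.cast(out_dtype)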
@@ -100,7 +101,7 @@ def scaled_fp8_quant(
         scaling factor.
     """
     # This code assumes batch_dim and num_tokens are flattened
-    assert (input.ndim == 2)
+    assert input.ndim == 2
     shape = input.shape
     if num_token_padding:
         shape = (max(num_token_padding, input.shape[0]), shape[1])
@@ -109,18 +110,21 @@ def scaled_fp8_quant(
     if scale is None:
         if use_per_token_if_dynamic:
             scale = paddle.empty([shape[0], 1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_per_token_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import (
+                dynamic_per_token_scaled_fp8_quant,
+            )
+
             dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub)
         else:
             scale = paddle.zeros([1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import dynamic_scaled_fp8_quant
+
             dynamic_scaled_fp8_quant(output, input, scale)
     else:
         # num_token_padding not implemented for this case
         # assert (scale.numel() == 1 or num_token_padding is None)
         from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant
+
         static_scaled_fp8_quant(output, input, scale)
 
     return output, scale
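The branches above dispatch to one of three custom GPU ops: dynamic per-token, dynamic per-tensor, or static per-tensor quantization. A hedged usage sketch of the three modes follows; the wrapper's import path is assumed (only a relative re-export is visible in this commit), and keyword names follow the parameters shown in the hunks.

import paddle

# Import path assumed: scaled_fp8_quant is re-exported by the package
# __init__ in the first hunk, whose full path this diff does not show.
from fastdeploy.model_executor.layers.quantization.ops import scaled_fp8_quant

x = paddle.randn([16, 4096]).cast("bfloat16")

# Dynamic per-tensor: scale=None, the op fills a [1]-shaped scale.
out, scale = scaled_fp8_quant(x)

# Dynamic per-token: one scale per row, returned with shape [16, 1].
out, scale = scaled_fp8_quant(x, use_per_token_if_dynamic=True)

# Static: a precomputed per-tensor scale is used as-is.
s = paddle.to_tensor([0.05], dtype="float32")
out, scale = scaled_fp8_quant(x, scale=s)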
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+
 from typing import Optional
 
 import paddle
@@ -49,7 +50,7 @@ def scaled_fp8_quant(
         scaling factor.
     """
     # This code assumes batch_dim and num_tokens are flattened
-    assert (input.ndim == 2)
+    assert input.ndim == 2
     shape = input.shape
     if num_token_padding:
         shape = (max(num_token_padding, input.shape[0]), shape[1])
@@ -58,18 +59,21 @@ def scaled_fp8_quant(
     if scale is None:
         if use_per_token_if_dynamic:
             scale = paddle.empty([shape[0], 1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_per_token_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import (
+                dynamic_per_token_scaled_fp8_quant,
+            )
+
             dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub)
         else:
             scale = paddle.zeros([1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import dynamic_scaled_fp8_quant
+
             dynamic_scaled_fp8_quant(output, input, scale)
     else:
         # num_token_padding not implemented for this case
         # assert (scale.numel() == 1 or num_token_padding is None)
         from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant
+
         static_scaled_fp8_quant(output, input, scale)
 
     return output, scale
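For intuition about what `dynamic_per_token_scaled_fp8_quant` computes in both files above: conventional per-token FP8 (E4M3) quantization picks one scale per row so the scaled row fits the FP8 range, whose largest finite value is 448.0. A plain-Paddle sketch of that standard technique (our illustration, not the actual CUDA kernel):

import paddle

FP8_E4M3_MAX = 448.0  # largest finite float8 e4m3 value

def per_token_fp8_quant_reference(x):
    # One scale per row so that x / scale fits inside the FP8 range.
    amax = x.cast("float32").abs().max(axis=-1, keepdim=True)   # [m, 1]
    scale = paddle.clip(amax, min=1e-12) / FP8_E4M3_MAX         # [m, 1]
    q = paddle.clip(x.cast("float32") / scale, -FP8_E4M3_MAX, FP8_E4M3_MAX)
    # The real kernel would round/cast q to float8 here; we keep float32.
    return q, scale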