polish code with new pre-commit rule (#2923)

Zero Rains
2025-07-19 23:19:27 +08:00
committed by GitHub
parent b8676d71a8
commit 25698d56d1
424 changed files with 14307 additions and 13518 deletions

View File

@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from .cutlass_scaled_mm import cutlass_scaled_mm
from .scaled_fp8_quant import scaled_fp8_quant

View File

@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
import paddle
@@ -20,12 +21,14 @@ import paddle
 import fastdeploy
-def cutlass_scaled_mm(a: paddle.Tensor,
-                      b: paddle.Tensor,
-                      scale_a: paddle.Tensor,
-                      scale_b: paddle.Tensor,
-                      out_dtype: paddle.dtype,
-                      bias: Optional[paddle.Tensor] = None) -> paddle.Tensor:
+def cutlass_scaled_mm(
+    a: paddle.Tensor,
+    b: paddle.Tensor,
+    scale_a: paddle.Tensor,
+    scale_b: paddle.Tensor,
+    out_dtype: paddle.dtype,
+    bias: Optional[paddle.Tensor] = None,
+) -> paddle.Tensor:
     """
     `cutlass_scaled_mm` implements a fused version of
     `output = paddle.mm((scale_a * a), (scale_b * b)).to(out_dtype)`
@@ -48,9 +51,8 @@ def cutlass_scaled_mm(a: paddle.Tensor,
         scale_a.shape * [1, 128] == a.shape
         scale_b.shape * [128, 128] == b.shape
     """
-    assert (out_dtype == paddle.bfloat16 or out_dtype == paddle.float16)
-    assert bias is None or bias.shape[0] == b.shape[
-        0] and bias.dtype == out_dtype
+    assert out_dtype == paddle.bfloat16 or out_dtype == paddle.float16
+    assert bias is None or bias.shape[0] == b.shape[0] and bias.dtype == out_dtype
     # Ensure input tensors have valid shapes
     # assert a.numel() > 0, "Input tensor 'a' must not be empty"
     # assert b.numel() > 0, "Input tensor 'b' must not be empty"
@@ -59,12 +61,11 @@ def cutlass_scaled_mm(a: paddle.Tensor,
     m = a.shape[0]
     n = b.shape[0]
-    cutlass_compatible_b = (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
+    cutlass_compatible_b = b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0
     assert cutlass_compatible_b
     out = paddle.empty([m, n], dtype=out_dtype)
-    fastdeploy.model_executor.ops.gpu.cutlass_scaled_mm(
-        out, a, b, scale_a, scale_b, bias)
+    fastdeploy.model_executor.ops.gpu.cutlass_scaled_mm(out, a, b, scale_a, scale_b, bias)
     return out
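A minimal usage sketch of the reformatted cutlass_scaled_mm above. The package path, the FP8 operand dtype, and the concrete scale values are assumptions for illustration; only the [n, k] weight layout, the block-wise scale shapes, and the out_dtype/bias constraints come from the visible hunks:

import paddle

# Assumed import path; file names are not shown in this view.
from fastdeploy.model_executor.layers.quantization.ops import cutlass_scaled_mm

m, k, n = 16, 256, 512  # n and k are multiples of 16 to satisfy the CUTLASS compatibility assert
a = paddle.randn([m, k]).astype(paddle.float8_e4m3fn)  # quantized activations (dtype assumed)
b = paddle.randn([n, k]).astype(paddle.float8_e4m3fn)  # quantized weights, laid out as [n, k]

# Block-wise scales matching the docstring: scale_a.shape * [1, 128] == a.shape
# and scale_b.shape * [128, 128] == b.shape.
scale_a = paddle.full([m, k // 128], 0.01, dtype="float32")
scale_b = paddle.full([n // 128, k // 128], 0.02, dtype="float32")

out = cutlass_scaled_mm(a, b, scale_a, scale_b, out_dtype=paddle.bfloat16)
print(out.shape)  # [m, n]; an optional bias of shape [n] and dtype out_dtype is also accepted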
@@ -100,7 +101,7 @@ def scaled_fp8_quant(
         scaling factor.
     """
     # This code assumes batch_dim and num_tokens are flattened
-    assert (input.ndim == 2)
+    assert input.ndim == 2
     shape = input.shape
     if num_token_padding:
         shape = (max(num_token_padding, input.shape[0]), shape[1])
@@ -109,18 +110,21 @@ def scaled_fp8_quant(
     if scale is None:
         if use_per_token_if_dynamic:
             scale = paddle.empty([shape[0], 1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_per_token_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import (
+                dynamic_per_token_scaled_fp8_quant,
+            )
             dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub)
         else:
             scale = paddle.zeros([1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import dynamic_scaled_fp8_quant
             dynamic_scaled_fp8_quant(output, input, scale)
     else:
         # num_token_padding not implemented for this case
         # assert (scale.numel() == 1 or num_token_padding is None)
         from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant
         static_scaled_fp8_quant(output, input, scale)
     return output, scale
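For reference, a small sketch of how the three quantization branches above are selected. The package path, default argument values, and accepted input dtypes are assumptions, since only part of the function is visible in this hunk:

import paddle

from fastdeploy.model_executor.layers.quantization.ops import scaled_fp8_quant  # assumed path

x = paddle.randn([8, 128], dtype="float32")  # accepted input dtypes are an assumption

q, s = scaled_fp8_quant(x)  # scale=None -> dynamic per-tensor; s has shape [1]
q_tok, s_tok = scaled_fp8_quant(x, use_per_token_if_dynamic=True)  # dynamic per-token; s_tok has shape [8, 1]
q_stat, _ = scaled_fp8_quant(x, scale=s)  # static: reuse a precomputed per-tensor scale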

View File

@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Optional
import paddle
@@ -49,7 +50,7 @@ def scaled_fp8_quant(
         scaling factor.
     """
     # This code assumes batch_dim and num_tokens are flattened
-    assert (input.ndim == 2)
+    assert input.ndim == 2
     shape = input.shape
     if num_token_padding:
         shape = (max(num_token_padding, input.shape[0]), shape[1])
@@ -58,18 +59,21 @@ def scaled_fp8_quant(
     if scale is None:
         if use_per_token_if_dynamic:
             scale = paddle.empty([shape[0], 1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_per_token_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import (
+                dynamic_per_token_scaled_fp8_quant,
+            )
             dynamic_per_token_scaled_fp8_quant(output, input, scale, scale_ub)
         else:
             scale = paddle.zeros([1], dtype=paddle.float32)
-            from fastdeploy.model_executor.ops.gpu import \
-                dynamic_scaled_fp8_quant
+            from fastdeploy.model_executor.ops.gpu import dynamic_scaled_fp8_quant
             dynamic_scaled_fp8_quant(output, input, scale)
     else:
         # num_token_padding not implemented for this case
         # assert (scale.numel() == 1 or num_token_padding is None)
         from fastdeploy.model_executor.ops.gpu import static_scaled_fp8_quant
         static_scaled_fp8_quant(output, input, scale)
     return output, scale
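Finally, the num_token_padding branch shared by both scaled_fp8_quant variants above enlarges the working shape to at least num_token_padding rows before quantization. A hedged sketch, assuming the same package path and defaults as before and that the output tensor is allocated with the padded shape:

import paddle

from fastdeploy.model_executor.layers.quantization.ops import scaled_fp8_quant  # assumed path

x = paddle.randn([5, 128], dtype="float32")
q, s = scaled_fp8_quant(x, num_token_padding=16)
print(q.shape)  # expected [16, 128]: rows padded up to num_token_padding (inferred from the shape logic above)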