polish code with new pre-commit rule (#2923)

Author: Zero Rains
Date: 2025-07-19 23:19:27 +08:00
Committed by: GitHub
Parent: b8676d71a8
Commit: 25698d56d1
424 changed files with 14307 additions and 13518 deletions
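The hunks below all touch one attention-backend unit test and show the same pattern: manually wrapped calls are collapsed onto single lines where they fit, or re-wrapped with one argument per line and trailing commas, which is consistent with a black-style formatting hook. As a hedged illustration only (not part of this commit), the sketch below shows one way such a formatting-only change is typically reproduced locally; it assumes the `pre-commit` CLI is installed and that a `.pre-commit-config.yaml` with the new rule sits at the repository root, and the helper name `run_precommit_check` is hypothetical.

# Minimal sketch (hypothetical helper, not from this commit): re-run the
# repository's pre-commit hooks over every tracked file. Assumes the
# `pre-commit` CLI is installed and a .pre-commit-config.yaml exists at
# `repo_root`.
import subprocess
import sys


def run_precommit_check(repo_root: str = ".") -> int:
    """Run all configured pre-commit hooks against all tracked files."""
    result = subprocess.run(
        ["pre-commit", "run", "--all-files"],
        cwd=repo_root,
    )
    # pre-commit exits non-zero when any hook fails or rewrites files.
    return result.returncode


if __name__ == "__main__":
    sys.exit(run_precommit_check())

Once the tree matches the configured style, a second run should report no changes.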


@@ -19,13 +19,14 @@ import unittest
 import paddle
-from fastdeploy.model_executor.layers.attention import (
-    Attention, PaddleNativeAttnBackend)
 from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
+from fastdeploy.model_executor.layers.attention import (
+    Attention,
+    PaddleNativeAttnBackend,
+)


 class MockModelRunner:
     def __init__(
         self,
         page_size=1,
@@ -53,19 +54,15 @@ class MockModelRunner:
             (),
             {
                 # A typical max_bs * max_context_len for cuda graph decode
-                "size":
-                max_batch_size,
+                "size": max_batch_size,
                 # Add req_to_token attribute
-                "req_to_token":
-                paddle.zeros([max_batch_size, max_context_len],
-                             dtype=paddle.int32),
+                "req_to_token": paddle.zeros([max_batch_size, max_context_len], dtype=paddle.int32),
             },
         )
         self.page_size = page_size


 class TestNativePaddleAttentionBackend(unittest.TestCase):
     def setUp(self):
         # Test parameters
         self.batch_size = 2
@@ -90,11 +87,10 @@ class TestNativePaddleAttentionBackend(unittest.TestCase):
         # so we need to multiply the index by page_size.
         self.req_to_token = (
             paddle.arange(0, batch_size, dtype=paddle.int32)[:, None] * seq_len
-            + paddle.arange(0, seq_len, dtype=paddle.int32)[None, :] +
-            page_size)
-        self.model_runner.req_to_token_pool.req_to_token[:batch_size, :
-                                                         seq_len] = (
-            self.req_to_token)
+            + paddle.arange(0, seq_len, dtype=paddle.int32)[None, :]
+            + page_size
+        )
+        self.model_runner.req_to_token_pool.req_to_token[:batch_size, :seq_len] = self.req_to_token

     def _create_attention_layer(self):
         """Create attention layer for testing."""
@@ -114,15 +110,12 @@ class TestNativePaddleAttentionBackend(unittest.TestCase):
             paddle.randn(shape, dtype=self.dtype),
         )

-    def _run_reference_forward(self, mode, q, k, v, layer, forward_batch,
-                               expected_shape):
+    def _run_reference_forward(self, mode, q, k, v, layer, forward_batch, expected_shape):
         """Run reference forward pass using native backend."""
         if mode == ForwardMode.EXTEND:
-            output = self.ref_backend.forward_extend(q, k, v, layer,
-                                                     forward_batch)
+            output = self.ref_backend.forward_extend(q, k, v, layer, forward_batch)
         else:  # ForwardMode.DECODE
-            output = self.ref_backend.forward_decode(q, k, v, layer,
-                                                     forward_batch)
+            output = self.ref_backend.forward_decode(q, k, v, layer, forward_batch)
         return output.view(expected_shape)

     def _verify_output(self, output, expected_shape, output_ref=None):
@@ -133,33 +126,28 @@ class TestNativePaddleAttentionBackend(unittest.TestCase):
             f"Expected shape {expected_shape}, got {output.shape}",
         )
         self.assertEqual(output.dtype, self.dtype)
-        self.assertEqual(
-            paddle.isnan(output).sum().item(), 0, "Output contains NaN values")
+        self.assertEqual(paddle.isnan(output).sum().item(), 0, "Output contains NaN values")

         if output_ref is not None:
             if not paddle.allclose(output, output_ref, atol=1e-1, rtol=0.0):
                 # Check where the values differ beyond the given tolerances
-                diff_mask = ~paddle.isclose(
-                    output, output_ref, atol=1e-1, rtol=0.0)
+                diff_mask = ~paddle.isclose(output, output_ref, atol=1e-1, rtol=0.0)
                 # Find the first index where the difference occurs
                 if diff_mask.any():
                     first_mismatch_idx = diff_mask.nonzero()[0]
-                    print("First mismatch at index:",
-                          tuple(first_mismatch_idx.tolist()))
-                    print("output:",
-                          output[tuple(first_mismatch_idx.tolist())])
-                    print("output_ref:",
-                          output_ref[tuple(first_mismatch_idx.tolist())])
-                    raise AssertionError(
-                        "Attention output is not close to the torch native backend output"
-                    )
+                    print(
+                        "First mismatch at index:",
+                        tuple(first_mismatch_idx.tolist()),
+                    )
+                    print("output:", output[tuple(first_mismatch_idx.tolist())])
+                    print(
+                        "output_ref:",
+                        output_ref[tuple(first_mismatch_idx.tolist())],
+                    )
+                    raise AssertionError("Attention output is not close to the torch native backend output")

-    def _create_forward_batch(self,
-                              mode,
-                              q_len=None,
-                              prefix_len=0,
-                              page_size=1):
+    def _create_forward_batch(self, mode, q_len=None, prefix_len=0, page_size=1):
         """Create a forward batch for testing based on mode and lengths."""
         self._init_model_runner(page_size=page_size)
@@ -179,16 +167,11 @@ class TestNativePaddleAttentionBackend(unittest.TestCase):
                 forward_mode=mode,
                 req_pool_indices=paddle.arange(self.batch_size),
                 seq_lens=paddle.to_tensor([total_len] * self.batch_size),
-                extend_prefix_lens=paddle.to_tensor([prefix_len] *
-                                                    self.batch_size),
+                extend_prefix_lens=paddle.to_tensor([prefix_len] * self.batch_size),
                 extend_seq_lens=paddle.to_tensor([q_len] * self.batch_size),
-                seq_lens_cpu=paddle.to_tensor([total_len] * self.batch_size,
-                                              place="cpu"),
-                extend_prefix_lens_cpu=paddle.to_tensor([prefix_len] *
-                                                        self.batch_size,
-                                                        place="cpu"),
-                extend_seq_lens_cpu=paddle.to_tensor([q_len] * self.batch_size,
-                                                     place="cpu"),
+                seq_lens_cpu=paddle.to_tensor([total_len] * self.batch_size, place="cpu"),
+                extend_prefix_lens_cpu=paddle.to_tensor([prefix_len] * self.batch_size, place="cpu"),
+                extend_seq_lens_cpu=paddle.to_tensor([q_len] * self.batch_size, place="cpu"),
                 attn_backend=self.backend,
             )
         else:  # ForwardMode.DECODE
@@ -196,8 +179,7 @@ class TestNativePaddleAttentionBackend(unittest.TestCase):
             total_len = self.seq_len + decode_len
             if mode == ForwardMode.DECODE and page_size > 1:
                 # Get next page_size multiple of self.seq_len
-                out_cache_start = (self.batch_size * self.seq_len // page_size
-                                   + 1) * page_size
+                out_cache_start = (self.batch_size * self.seq_len // page_size + 1) * page_size
                 # out_cache_end is the start of the next block
                 out_cache_end = out_cache_start + decode_len * page_size
             else:
@@ -206,16 +188,13 @@ class TestNativePaddleAttentionBackend(unittest.TestCase):
             forward_batch = ForwardMeta(
                 batch_size=self.batch_size,
-                input_ids=paddle.randint(0, 100,
-                                         (self.batch_size, decode_len)),
-                out_cache_loc=paddle.to_tensor(
-                    [out_cache_start, out_cache_end]),
+                input_ids=paddle.randint(0, 100, (self.batch_size, decode_len)),
+                out_cache_loc=paddle.to_tensor([out_cache_start, out_cache_end]),
                 seq_lens_sum=self.batch_size * total_len,
                 forward_mode=mode,
                 req_pool_indices=paddle.arange(self.batch_size),
                 seq_lens=paddle.to_tensor([total_len] * self.batch_size),
-                seq_lens_cpu=paddle.to_tensor([total_len] * self.batch_size,
-                                              place="cpu"),
+                seq_lens_cpu=paddle.to_tensor([total_len] * self.batch_size, place="cpu"),
                 attn_backend=self.backend,
             )
@@ -223,8 +202,7 @@ class TestNativePaddleAttentionBackend(unittest.TestCase):
         forward_batch.req_to_token_pool = self.model_runner.req_to_token_pool

         # Write current batch's req_to_token to req_to_token_pool
-        self._mock_write_to_req_to_token_pool(self.batch_size, total_len,
-                                              page_size)
+        self._mock_write_to_req_to_token_pool(self.batch_size, total_len, page_size)

         # Add kv pool for this forward batch
         forward_batch.token_to_kv_pool = self.model_runner.token_to_kv_pool
@@ -236,10 +214,13 @@ class TestNativePaddleAttentionBackend(unittest.TestCase):
                 [self.batch_size * cache_len, self.num_heads, self.head_dim],
                 dtype=self.dtype,
             )
-            cache_v = (paddle.ones(
-                [self.batch_size * cache_len, self.num_heads, self.head_dim],
-                dtype=self.dtype,
-            ) * 2)
+            cache_v = (
+                paddle.ones(
+                    [self.batch_size * cache_len, self.num_heads, self.head_dim],
+                    dtype=self.dtype,
+                )
+                * 2
+            )

             # Set the prefix KV cache
             forward_batch.token_to_kv_pool.set_kv_buffer(
@@ -263,8 +244,7 @@ class TestNativePaddleAttentionBackend(unittest.TestCase):
         layer = self._create_attention_layer()

         # Create forward batch and set up
-        forward_batch = self._create_forward_batch(mode, q_len, prefix_len,
-                                                   page_size)
+        forward_batch = self._create_forward_batch(mode, q_len, prefix_len, page_size)

         # Create QKV tensors for the input
         q, k, v = self._create_qkv_tensors(self.batch_size * q_len)
@@ -291,8 +271,7 @@ class TestNativePaddleAttentionBackend(unittest.TestCase):
             expected_shape = [self.batch_size, self.num_heads * self.head_dim]
             output = self.backend.forward_decode(q, k, v, layer, forward_batch)

-        output_ref = self._run_reference_forward(mode, q, k, v, layer,
-                                                 forward_batch, expected_shape)
+        output_ref = self._run_reference_forward(mode, q, k, v, layer, forward_batch, expected_shape)
         self._verify_output(output, expected_shape, output_ref)
@@ -310,15 +289,11 @@ class TestNativePaddleAttentionBackend(unittest.TestCase):
         """Test extending from cached prefix tokens."""
         prefix_len = self.seq_len // 2
         extend_len = self.seq_len - prefix_len
-        self._run_attention_test(ForwardMode.EXTEND,
-                                 q_len=extend_len,
-                                 prefix_len=prefix_len)
+        self._run_attention_test(ForwardMode.EXTEND, q_len=extend_len, prefix_len=prefix_len)

     def test_forward_extend_with_page_size_greater_than_1(self):
         """Test extending from cached prefix tokens with page size greater than 1."""
-        self._run_attention_test(ForwardMode.EXTEND,
-                                 q_len=self.seq_len,
-                                 page_size=64)
+        self._run_attention_test(ForwardMode.EXTEND, q_len=self.seq_len, page_size=64)

     def test_forward_decode_with_page_size_greater_than_1(self):
         """Test decode operation with page size greater than 1."""