https://github.com/PaddlePaddle/FastDeploy.git
[Others] clean code (#5133)
@@ -311,16 +311,17 @@ void GetBlockShapeAndSplitKVBlock(
   if (mla_backend && group_size <= 64) {
     const int set_chunk_size = get_mla_dec_chunk_size(bsz);

-    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+    CUDA_CHECK(cudaMemsetAsync(
         decoder_chunk_size_device.data<int>(), 64, sizeof(int32_t), stream));

-    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+    CUDA_CHECK(cudaMemsetAsync(
         decoder_num_blocks_device.data<int>(), 0, sizeof(int32_t), stream));

     int device;
-    cudaGetDevice(&device);
+    CUDA_CHECK(cudaGetDevice(&device));
     int sm_cout;
-    cudaDeviceGetAttribute(&sm_cout, cudaDevAttrMultiProcessorCount, device);
+    CUDA_CHECK(cudaDeviceGetAttribute(
+        &sm_cout, cudaDevAttrMultiProcessorCount, device));
     constexpr int config_size =
         12;  // search space for chunk size:[64, 128, 256, ... 131072]

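CUDA_CHECK wraps calls that return a cudaError_t, so previously unchecked calls such as cudaGetDevice and cudaDeviceGetAttribute are now verified. The macro's actual definition is not part of this diff; a minimal sketch of such an error-checking macro, under a hypothetical name, is:

// Minimal sketch of a CUDA_CHECK-style macro (assumed shape; the repository's
// real definition is not shown in this diff). It evaluates a CUDA runtime call
// once and aborts with file/line context if the call did not return cudaSuccess.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK_SKETCH(call)                                      \
  do {                                                               \
    cudaError_t err__ = (call);                                      \
    if (err__ != cudaSuccess) {                                      \
      std::fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                   cudaGetErrorString(err__), __FILE__, __LINE__);   \
      std::abort();                                                  \
    }                                                                \
  } while (0)

// Usage mirrors the calls in the hunk above, e.g.:
//   CUDA_CHECK_SKETCH(cudaGetDevice(&device));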
@@ -341,16 +342,14 @@ void GetBlockShapeAndSplitKVBlock(
         decoder_chunk_size_device.copy_to(paddle::CPUPlace(), false);
     const int chunk_size = decoder_chunk_size_cpu.data<int>()[0];

-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaMemsetAsync(decoder_batch_ids.data<int>(),
-                        0,
-                        decoder_batch_ele_num * sizeof(int32_t),
-                        stream));
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(),
-                        0,
-                        decoder_batch_ele_num * sizeof(int32_t),
-                        stream));
+    CUDA_CHECK(cudaMemsetAsync(decoder_batch_ids.data<int>(),
+                               0,
+                               decoder_batch_ele_num * sizeof(int32_t),
+                               stream));
+    CUDA_CHECK(cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(),
+                               0,
+                               decoder_batch_ele_num * sizeof(int32_t),
+                               stream));

     split_block_for_mla<<<1, 32, 0, stream>>>(
         seq_lens_this_time.data<int>(),
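The search space mentioned in the config_size comment above covers 12 chunk-size candidates, each double the previous one, starting at 64. A quick check of that arithmetic (values follow the comment; the loop itself is only illustrative):

// 64 << 11 == 131072, so twelve doublings span [64, 128, 256, ..., 131072].
#include <cstdio>

int main() {
  constexpr int config_size = 12;
  for (int i = 0; i < config_size; ++i) {
    std::printf("%d ", 64 << i);  // prints 64 128 256 ... 131072
  }
  return 0;
}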
@@ -362,17 +361,15 @@ void GetBlockShapeAndSplitKVBlock(
         chunk_size);

   } else {
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaMemsetAsync(decoder_batch_ids.data<int>(),
-                        0,
-                        decoder_batch_ele_num * sizeof(int32_t),
-                        stream));
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(),
-                        0,
-                        decoder_batch_ele_num * sizeof(int32_t),
-                        stream));
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+    CUDA_CHECK(cudaMemsetAsync(decoder_batch_ids.data<int>(),
+                               0,
+                               decoder_batch_ele_num * sizeof(int32_t),
+                               stream));
+    CUDA_CHECK(cudaMemsetAsync(decoder_tile_ids_per_batch.data<int>(),
+                               0,
+                               decoder_batch_ele_num * sizeof(int32_t),
+                               stream));
+    CUDA_CHECK(cudaMemsetAsync(
         decoder_num_blocks_device.data<int>(), 0, sizeof(int32_t), stream));

     split_q_block<<<1, 32, 0, stream>>>(
@@ -391,8 +388,6 @@ void GetBlockShapeAndSplitKVBlock(
 #endif
     decoder_num_blocks_cpu.copy_(
         decoder_num_blocks_device, decoder_num_blocks_cpu.place(), false);
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
-        decoder_chunk_size_device.data<int>(), 64, sizeof(int32_t), stream));
   }
 }

@@ -401,19 +396,17 @@ void GetBlockShapeAndSplitKVBlock(
   const uint32_t max_tile_size_per_bs_kv =
       div_up(max_enc_dec_len_this_time, block_size);
   const uint32_t kv_batch_shape = bsz * max_tile_size_per_bs_kv;
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(
+  CUDA_CHECK(cudaMemsetAsync(
       kv_batch_ids.data<int>(), 0, kv_batch_shape * sizeof(int32_t), stream));
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      cudaMemsetAsync(kv_tile_ids_per_batch.data<int>(),
-                      0,
-                      kv_batch_shape * sizeof(int32_t),
-                      stream));
+  CUDA_CHECK(cudaMemsetAsync(kv_tile_ids_per_batch.data<int>(),
+                             0,
+                             kv_batch_shape * sizeof(int32_t),
+                             stream));
   auto kv_num_blocks_x =
       GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());

   split_kv_block<<<1, 32, 0, seq_lens_encoder.stream()>>>(
       seq_lens_decoder.data<int>(),
-      // sequence_lengths->data<int>(),
       seq_lens_encoder.data<int>(),
       kv_batch_ids.data<int>(),
       kv_tile_ids_per_batch.data<int>(),
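kv_batch_shape sizes the per-batch KV tile buffers cleared by the memsets above. The repository's div_up helper is not shown in this diff; a sketch of the usual ceiling-division definition, with hypothetical example values, is:

// Ceiling division as such helpers are commonly written (assumed; not the
// repository's actual div_up).
#include <cstdint>

constexpr uint32_t div_up_sketch(uint32_t a, uint32_t b) { return (a + b - 1) / b; }

// Hypothetical values: a max encoder/decoder length of 8192 tokens with a
// block_size of 64 gives 128 KV tiles per batch entry, so kv_batch_shape
// would be bsz * 128 int32 entries.
static_assert(div_up_sketch(8192, 64) == 128, "8192 tokens in 64-token blocks");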
@@ -428,16 +421,14 @@ void GetBlockShapeAndSplitKVBlock(
   const uint32_t encoder_max_tile_size_per_bs_q =
       div_up((max_enc_dec_len_this_time * group_size), encoder_block_shape_q);
   const uint32_t encoder_batch_shape = bsz * encoder_max_tile_size_per_bs_q;
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      cudaMemsetAsync(encoder_batch_ids.data<int>(),
-                      0,
-                      encoder_batch_shape * sizeof(int32_t),
-                      stream));
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      cudaMemsetAsync(encoder_tile_ids_per_batch.data<int>(),
-                      0,
-                      encoder_batch_shape * sizeof(int32_t),
-                      stream));
+  CUDA_CHECK(cudaMemsetAsync(encoder_batch_ids.data<int>(),
+                             0,
+                             encoder_batch_shape * sizeof(int32_t),
+                             stream));
+  CUDA_CHECK(cudaMemsetAsync(encoder_tile_ids_per_batch.data<int>(),
+                             0,
+                             encoder_batch_shape * sizeof(int32_t),
+                             stream));
   auto encoder_num_blocks_x =
       GetEmptyTensor({1}, paddle::DataType::INT32, seq_lens_encoder.place());
   split_q_block<<<1, 32, 0, stream>>>(seq_lens_encoder.data<int>(),

@@ -72,18 +72,21 @@ def allocate_launch_related_buffer(
     block_size,
 ):
     # Initialize AttentionBackend buffers
-    group_size = np.ceil(num_heads / kv_num_heads)
+    assert num_heads % kv_num_heads == 0
+    assert max_model_len % block_size == 0
+    assert max_model_len % encoder_block_shape_q == 0
+    group_size = num_heads // kv_num_heads

     # NOTE: (changwenbin) When using auto_chunk,
     # decode_max_tile_size must take into account the maximum case, where *1024 can cover 128K.
     decode_max_tile_size = (
-        1024 * max_batch_size * np.ceil((decoder_step_token_num * group_size) / decoder_block_shape_q)
+        1024 * max_batch_size * (int)(np.ceil(decoder_step_token_num * group_size / decoder_block_shape_q))
     )
-    encode_max_tile_size = max_batch_size * np.ceil((max_model_len * group_size) / encoder_block_shape_q)
-    kv_max_tile_size = max_batch_size * np.ceil(max_model_len / block_size)
+    encode_max_tile_size = max_batch_size * (max_model_len * group_size // encoder_block_shape_q)
+    kv_max_tile_size = max_batch_size * (max_model_len // block_size)
     res = {}
-    res["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
-    res["decoder_tile_ids_per_batch"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
+    res["decoder_batch_ids"] = paddle.full([decode_max_tile_size], 0, dtype="int32")
+    res["decoder_tile_ids_per_batch"] = paddle.full([decode_max_tile_size], 0, dtype="int32")
     res["decoder_num_blocks_cpu"] = paddle.full([1], 0, dtype="int32").pin_memory()
     # NOTE: (changwenbin) MLA kernel only needs decoder_num_blocks_device in place of GPU tensor,
     # adapted to cudagraph.
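With the new assertions, group_size and the encoder/KV tile counts are exact integer divisions, so the buffer shapes no longer need float-to-int casts. A worked check of the formulas with hypothetical parameter values (none of these numbers come from the diff):

// Worked check of the tile-count formulas above, using assumed values:
// max_batch_size = 64, max_model_len = 8192, num_heads = 32, kv_num_heads = 4,
// encoder_block_shape_q = 64, block_size = 64, decoder_step_token_num = 1,
// decoder_block_shape_q = 16.
#include <cstdint>

int main() {
  constexpr int64_t max_batch_size = 64, max_model_len = 8192;
  constexpr int64_t num_heads = 32, kv_num_heads = 4;
  constexpr int64_t encoder_block_shape_q = 64, block_size = 64;
  constexpr int64_t decoder_step_token_num = 1, decoder_block_shape_q = 16;

  static_assert(num_heads % kv_num_heads == 0, "asserted in the Python helper");
  constexpr int64_t group_size = num_heads / kv_num_heads;  // 8

  // ceil(1 * 8 / 16) == 1, so the decode buffer holds 1024 * 64 * 1 entries.
  constexpr int64_t decode_tiles =
      1024 * max_batch_size *
      ((decoder_step_token_num * group_size + decoder_block_shape_q - 1) /
       decoder_block_shape_q);
  // The divisibility asserts make these exact integer divisions.
  constexpr int64_t encode_tiles =
      max_batch_size * (max_model_len * group_size / encoder_block_shape_q);
  constexpr int64_t kv_tiles = max_batch_size * (max_model_len / block_size);

  static_assert(decode_tiles == 65536, "1024 * 64 * 1");
  static_assert(encode_tiles == 65536, "64 * 1024");
  static_assert(kv_tiles == 8192, "64 * 128");
  return 0;
}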
@@ -91,12 +94,12 @@ def allocate_launch_related_buffer(
     res["decoder_chunk_size_device"] = paddle.full([1], 64, dtype="int32")
     res["max_len_tensor_cpu"] = paddle.full([9], 0, dtype="int32").cpu()

-    res["encoder_batch_ids"] = paddle.full([int(encode_max_tile_size)], 0, dtype="int32")
-    res["encoder_tile_ids_per_batch"] = paddle.full([int(encode_max_tile_size)], 0, dtype="int32")
+    res["encoder_batch_ids"] = paddle.full([encode_max_tile_size], 0, dtype="int32")
+    res["encoder_tile_ids_per_batch"] = paddle.full([encode_max_tile_size], 0, dtype="int32")
     res["encoder_num_blocks_x_cpu"] = paddle.full([1], 0, dtype="int32").cpu()

-    res["kv_batch_ids"] = paddle.full([int(kv_max_tile_size)], 0, dtype="int32")
-    res["kv_tile_ids_per_batch"] = paddle.full([int(kv_max_tile_size)], 0, dtype="int32")
+    res["kv_batch_ids"] = paddle.full([kv_max_tile_size], 0, dtype="int32")
+    res["kv_tile_ids_per_batch"] = paddle.full([kv_max_tile_size], 0, dtype="int32")
     res["kv_num_blocks_x_cpu"] = paddle.full([1], 0, dtype="int32").cpu()
     return res

@@ -172,7 +172,7 @@ class TestEngineInitialization(unittest.TestCase):
         mock_args = Mock()
         mock_args.model = "test-model"
         mock_args.tokenizer = "test-tokenizer"
-        mock_args.max_model_len = 1000
+        mock_args.max_model_len = 1024
         mock_args.tensor_parallel_size = 1
         mock_args.engine_worker_queue_port = [8000]
         mock_args.local_data_parallel_id = 0
@@ -202,7 +202,7 @@ class TestEngineInitialization(unittest.TestCase):
     def test_create_serving_handlers(self, mock_chat_handler, mock_model_handler):
         """测试创建服务处理器"""
         mock_args = Mock()
-        mock_args.max_model_len = 1000
+        mock_args.max_model_len = 1024
         mock_args.ips = "127.0.0.1"
         mock_args.max_waiting_time = 60
         mock_args.enable_mm_output = False
@@ -1286,7 +1286,7 @@ class TestFastDeployBatch(unittest.TestCase):
             "--quantization",
             "wint4",
             "--max-model-len",
-            "4192",
+            "5120",
             "--max-num-seqs",
             "64",
             "--load-choices",
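The updated test values (1024 instead of 1000, 5120 instead of 4192) are consistent with the new max_model_len divisibility assertions, assuming a typical 64-token block size, which is an assumption here rather than a value shown in this diff:

// Old values are not multiples of an assumed 64-token block size; new values are.
static_assert(1000 % 64 != 0 && 4192 % 64 != 0, "old values would trip the new assert");
static_assert(1024 % 64 == 0 && 5120 % 64 == 0, "new values divide evenly");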