mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Others] Clean code && remove GPU sync code (#5548)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FD Image Build (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FD Image Build (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
This commit is contained in:
@@ -637,6 +637,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
batch_pooling_params = []
|
||||
for i in range(req_len):
|
||||
request = req_dicts[i]
|
||||
# assert isinstance(request, Request)
|
||||
idx = request.idx
|
||||
|
||||
if hasattr(request, "pooling_params") and request.pooling_params is not None:
|
||||
@@ -655,14 +656,14 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
logits_info, schemata_key = self._init_logits_processor(request)
|
||||
request.schemata_key = schemata_key
|
||||
|
||||
if self.scheduler_config.splitwise_role == "decode":
|
||||
if (
|
||||
hasattr(request, "prefill_end_index")
|
||||
and hasattr(request, "prompt_token_ids")
|
||||
and request.prefill_end_index > len(request.prompt_token_ids)
|
||||
):
|
||||
if hasattr(request, "output_token_ids"):
|
||||
prefill_tokens.extend(request.output_token_ids)
|
||||
if (
|
||||
self.scheduler_config.splitwise_role == "decode"
|
||||
and hasattr(request, "prefill_end_index")
|
||||
and hasattr(request, "prompt_token_ids")
|
||||
and request.prefill_end_index > len(request.prompt_token_ids)
|
||||
and hasattr(request, "output_token_ids")
|
||||
):
|
||||
prefill_tokens.extend(request.output_token_ids)
|
||||
|
||||
prefill_start_index = request.prefill_start_index
|
||||
prefill_end_index = request.prefill_end_index
|
||||
@@ -784,12 +785,12 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
|
||||
if request.get("bad_words_token_ids") is not None and len(request.get("bad_words_token_ids")) > 0:
|
||||
bad_words_len = len(request.get("bad_words_token_ids"))
|
||||
self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
|
||||
self.share_inputs["bad_tokens_len"][idx] = bad_words_len
|
||||
self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
|
||||
request.get("bad_words_token_ids"), dtype="int64"
|
||||
)
|
||||
else:
|
||||
self.share_inputs["bad_tokens_len"][idx : idx + 1] = 1
|
||||
self.share_inputs["bad_tokens_len"][idx] = 1
|
||||
self.share_inputs["bad_tokens"][idx : idx + 1, :] = np.array([-1], dtype="int64")
|
||||
|
||||
if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
|
||||
@@ -1007,12 +1008,12 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
|
||||
if request.get("bad_words_token_ids") is not None and len(request.get("bad_words_token_ids")) > 0:
|
||||
bad_words_len = len(request.get("bad_words_token_ids"))
|
||||
self.share_inputs["bad_tokens_len"][idx : idx + 1] = bad_words_len
|
||||
self.share_inputs["bad_tokens_len"][idx] = bad_words_len
|
||||
self.share_inputs["bad_tokens"][idx : idx + 1, :bad_words_len] = np.array(
|
||||
request.get("bad_words_token_ids"), dtype="int64"
|
||||
)
|
||||
else:
|
||||
self.share_inputs["bad_tokens_len"][idx : idx + 1] = 1
|
||||
self.share_inputs["bad_tokens_len"][idx] = 1
|
||||
self.share_inputs["bad_tokens"][idx : idx + 1, :] = np.array([-1], dtype="int64")
|
||||
|
||||
if request.get("stop_token_ids") is not None and request.get("stop_seqs_len") is not None:
|
||||
@@ -1217,7 +1218,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self.share_inputs["stop_nums"] = paddle.full([1], max_num_seqs, dtype="int64")
|
||||
|
||||
self.share_inputs["bad_tokens"] = paddle.full([max_num_seqs, self.model_config.vocab_size], -1, dtype="int64")
|
||||
self.share_inputs["bad_tokens_len"] = paddle.full([max_num_seqs], 1, dtype="int64")
|
||||
self.share_inputs["bad_tokens_len"] = [-1] * max_num_seqs
|
||||
self.share_inputs["next_tokens"] = paddle.full([max_num_seqs, 1], -1, dtype="int64")
|
||||
self.share_inputs["is_block_step"] = paddle.full([max_num_seqs], False, dtype="bool")
|
||||
self.share_inputs["is_chunk_step"] = paddle.full([max_num_seqs], False, dtype="bool").cpu()
|
||||
@@ -1447,7 +1448,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
self.share_inputs["output_padding_offset"].copy_(output_padding_offset, False)
|
||||
|
||||
# Update bad tokens len
|
||||
max_bad_tokens_len = np.max(self.share_inputs["bad_tokens_len"].numpy())
|
||||
max_bad_tokens_len = max(self.share_inputs["bad_tokens_len"])
|
||||
|
||||
# Initialize forward meta data
|
||||
self.initialize_forward_meta(is_dummy_or_profile_run=is_dummy_or_profile_run)
|
||||
|
||||
@@ -270,7 +270,7 @@ class TestAttentionPerformance(unittest.TestCase):
|
||||
partial_rotary_factor=fd_config.model_config.partial_rotary_factor,
|
||||
)
|
||||
|
||||
input_ids = paddle.zeros([batch_size, seq_len if mode == ForwardMode.EXTEND else 1], dtype="int64")
|
||||
input_ids = paddle.zeros([batch_size, fd_config.model_config.max_model_len], dtype="int64")
|
||||
token_num = np.sum(seq_lens_this_time)
|
||||
ids_remove_padding, batch_id_per_token, cu_seqlens_q, cu_seqlens_k = get_padding_offset(
|
||||
input_ids, seq_lens_this_time, token_num
|
||||
@@ -302,27 +302,22 @@ class TestAttentionPerformance(unittest.TestCase):
|
||||
# Test parameters
|
||||
test_steps = 100
|
||||
|
||||
# prefill_batch_size = 1
|
||||
# prefill_seq_len = 4096
|
||||
prefill_batch_size = 1
|
||||
prefill_seq_len = 2048
|
||||
|
||||
# prefill_hidden_states = paddle.randn(
|
||||
# [prefill_batch_size * prefill_seq_len, self.fd_config.model_config.hidden_size],
|
||||
# dtype=act_tensor_dtype,
|
||||
# )
|
||||
forward_meta, prefill_hidden_states = self.create_forward_meta(
|
||||
batch_size=prefill_batch_size,
|
||||
seq_len=prefill_seq_len,
|
||||
mode=ForwardMode.EXTEND,
|
||||
fd_config=self.fd_config,
|
||||
attn_backend=self.attn_backend,
|
||||
cache_quant_type_str=self.cache_quant_type_str,
|
||||
)
|
||||
|
||||
# forward_meta = self.create_forward_meta(
|
||||
# batch_size=prefill_batch_size,
|
||||
# seq_len=prefill_seq_len,
|
||||
# mode=ForwardMode.EXTEND,
|
||||
# fd_config=self.fd_config,
|
||||
# attn_backend=self.attn_backend,
|
||||
# cache_quant_type_str=self.cache_quant_type_str,
|
||||
# )
|
||||
self.attn_backend.init_attention_metadata(forward_meta)
|
||||
self.attn_forward(forward_meta, prefill_hidden_states)
|
||||
|
||||
# self.attn_backend.init_attention_metadata(forward_meta)
|
||||
# self.attn_forward(forward_meta, prefill_hidden_states)
|
||||
|
||||
# paddle.device.synchronize()
|
||||
paddle.device.synchronize()
|
||||
|
||||
# import paddle.profiler as profiler
|
||||
# p = profiler.Profiler(
|
||||
@@ -332,22 +327,22 @@ class TestAttentionPerformance(unittest.TestCase):
|
||||
# p.start()
|
||||
# p.step()
|
||||
|
||||
# start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
|
||||
# end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
|
||||
# for i in range(test_steps):
|
||||
# start_events[i].record()
|
||||
start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
|
||||
end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
|
||||
for i in range(test_steps):
|
||||
start_events[i].record()
|
||||
|
||||
# self.attn_forward(forward_meta, prefill_hidden_states)
|
||||
self.attn_forward(forward_meta, prefill_hidden_states)
|
||||
|
||||
# end_events[i].record()
|
||||
# paddle.device.synchronize()
|
||||
|
||||
# times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
|
||||
# print(times[-5:])
|
||||
# return
|
||||
end_events[i].record()
|
||||
paddle.device.synchronize()
|
||||
|
||||
times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
|
||||
print(times[-5:])
|
||||
# p.stop()
|
||||
|
||||
return
|
||||
|
||||
# p = profiler.Profiler(
|
||||
# targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
|
||||
# on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
|
||||
|
||||
Reference in New Issue
Block a user