Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 08:37:06 +08:00)
[Code Simplification] remove cum_offsets (#3410)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
@@ -583,7 +583,6 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["min_dec_len"][idx : idx + 1] = max_dec_len
         self.share_inputs["stop_flags"][idx : idx + 1] = False
         self.share_inputs["temperature"][idx : idx + 1] = 1

         self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1]
         self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length
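For orientation: these assignments write per-request values into pre-allocated, batch-sized buffers keyed by scheduler slot. A minimal sketch of that pattern follows; the dict name, dtypes, and shapes are assumptions mirroring the keys above, not the runner's actual allocation code.

import paddle

max_num_seqs = 4  # assumed batch capacity

# Pre-allocated per-request buffers, one row per scheduler slot.
share_inputs = {
    "temperature": paddle.full([max_num_seqs, 1], 1.0, dtype="float32"),
    "stop_flags": paddle.full([max_num_seqs, 1], True, dtype="bool"),
    "min_dec_len": paddle.full([max_num_seqs, 1], 0, dtype="int64"),
}

# Admitting a request into slot idx only rewrites that slot's row,
# which is what the idx : idx + 1 slices in the hunk above do.
idx = 2
share_inputs["min_dec_len"][idx : idx + 1] = 8
share_inputs["stop_flags"][idx : idx + 1] = False
share_inputs["temperature"][idx : idx + 1] = 1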
@@ -680,7 +679,6 @@ class GPUModelRunner(ModelRunnerBase):
             0,
             dtype="int64",
         )
-        self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
         self.share_inputs["batch_id_per_token"] = paddle.full(
             [max_num_seqs * self.parallel_config.max_model_len, 1], 0, dtype="int32"
         )
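A rough numpy sketch (invented values, not code from this file) of what the remaining index buffers encode: batch_id_per_token and the cu_seqlens prefix sums already describe the packed, padding-free token layout, and the per-request padding offsets that cum_offsets stored can be recomputed from them, which is presumably the redundancy the commit title refers to.

import numpy as np

seq_lens_this_time = np.array([5, 0, 3, 1], dtype=np.int32)  # tokens per request this step
max_model_len = 8                                            # assumed padded row length
num_seqs = len(seq_lens_this_time)

# cu_seqlens: request boundaries in the packed (padding-free) token stream.
cu_seqlens = np.concatenate(([0], np.cumsum(seq_lens_this_time))).astype(np.int32)

# batch_id_per_token: for every packed token, the request it belongs to.
batch_id_per_token = np.repeat(np.arange(num_seqs, dtype=np.int32), seq_lens_this_time)

# The deleted cum_offsets buffer held, per request, how many padding slots
# precede its tokens in the [num_seqs, max_model_len] layout (an assumption
# based on its name and usage); it is fully derivable from the arrays above.
cum_offsets = np.arange(num_seqs, dtype=np.int32) * max_model_len - cu_seqlens[:-1]

print(cu_seqlens)          # [0 5 5 8 9]
print(batch_id_per_token)  # [0 0 0 0 0 2 2 2 3]
print(cum_offsets)         # [ 0  3 11 16]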
@@ -803,7 +801,6 @@ class GPUModelRunner(ModelRunnerBase):
         # Remove padding
         (
             ids_remove_padding,
-            cum_offsets,
             batch_id_per_token,
             cu_seqlens_q,
             cu_seqlens_k,
@@ -819,7 +816,6 @@ class GPUModelRunner(ModelRunnerBase):
         )

         self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False)
-        self.share_inputs["cum_offsets"].copy_(cum_offsets, False)
         self.share_inputs["batch_id_per_token"].copy_(batch_id_per_token, False)
         self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False)
         self.share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False)
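A toy numpy sketch of the "remove padding" step that feeds these copies: the padded [num_seqs, max_model_len] input_ids are packed into a 1-D ids_remove_padding tensor plus the boundary metadata, with no cum_offsets in the result, matching the new tuple above. Values, shapes, and the cu_seqlens_k construction are assumptions, not the fused kernel's actual code.

import numpy as np

max_model_len = 8
input_ids = np.zeros((3, max_model_len), dtype=np.int64)  # padded prompt ids (toy values)
input_ids[0, :4] = [101, 7, 8, 9]
input_ids[1, :1] = [101]
input_ids[2, :2] = [101, 42]
seq_lens_this_time = np.array([4, 1, 2], dtype=np.int32)  # new tokens per request this step
seq_lens_decoder = np.array([0, 6, 0], dtype=np.int32)    # tokens already held in the KV cache

# Pack: drop the padding columns, keep only real tokens in request order.
ids_remove_padding = np.concatenate(
    [input_ids[i, :n] for i, n in enumerate(seq_lens_this_time)]
)

# Per-token ownership and per-request boundaries in the packed stream.
batch_id_per_token = np.repeat(np.arange(3, dtype=np.int32), seq_lens_this_time)
cu_seqlens_q = np.concatenate(([0], np.cumsum(seq_lens_this_time))).astype(np.int32)
# Assumed: key lengths also count what is already cached for decoding requests.
cu_seqlens_k = np.concatenate(
    ([0], np.cumsum(seq_lens_this_time + seq_lens_decoder))
).astype(np.int32)

print(ids_remove_padding)  # [101   7   8   9 101 101  42]
print(cu_seqlens_q)        # [0 4 5 7]
print(cu_seqlens_k)        # [ 0  4 11 13]

The copy_ calls in the hunk then write these results into the matching pre-allocated share_inputs buffers in place.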
@@ -965,7 +961,6 @@ class GPUModelRunner(ModelRunnerBase):
                 cache_kvs_list.append(value_cache)

             self.share_inputs["caches"] = cache_kvs_list

         else:
             for i in range(self.model_config.num_hidden_layers):
                 cache_kvs[f"key_caches_{i}"] = paddle.full(
@@ -1071,7 +1066,7 @@ class GPUModelRunner(ModelRunnerBase):

         hidden_states = rebuild_padding(
             model_output,
-            self.share_inputs["cum_offsets"],
+            self.share_inputs["cu_seqlens_q"],
             self.share_inputs["seq_lens_this_time"],
             self.share_inputs["seq_lens_decoder"],
             self.share_inputs["seq_lens_encoder"],
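To see why cu_seqlens_q can stand in for cum_offsets here, a minimal numpy sketch of the rebuild_padding idea: the real op is a fused kernel that also consults seq_lens_encoder/seq_lens_decoder to tell prefill from decode, so this only shows the core indexing under that simplification.

import numpy as np

hidden_size = 4
cu_seqlens_q = np.array([0, 4, 5, 7], dtype=np.int32)         # from the packing sketch above
model_output = np.random.rand(cu_seqlens_q[-1], hidden_size)  # packed per-token hidden states

num_seqs = len(cu_seqlens_q) - 1
hidden_states = np.zeros((num_seqs, hidden_size), dtype=model_output.dtype)
for i in range(num_seqs):
    start, end = int(cu_seqlens_q[i]), int(cu_seqlens_q[i + 1])
    if end > start:  # requests with no tokens this step keep zeros
        # Keep only the last token's hidden state; that is what sampling needs.
        hidden_states[i] = model_output[end - 1]

Each request's last packed token is located purely from the prefix sums, with no reference to the padded layout, so the padded-layout offsets that cum_offsets carried are not needed here.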
@@ -1336,7 +1331,7 @@ class GPUModelRunner(ModelRunnerBase):
         )
         hidden_states = rebuild_padding(
             model_output,
-            self.share_inputs["cum_offsets"],
+            self.share_inputs["cu_seqlens_q"],
             self.share_inputs["seq_lens_this_time"],
             self.share_inputs["seq_lens_decoder"],
             self.share_inputs["seq_lens_encoder"],
@@ -1436,6 +1431,7 @@ class GPUModelRunner(ModelRunnerBase):
         # 7. Updata 'infer_seed' and step_cuda()
         self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
         self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED

         if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
             step_cuda(
                 self.share_inputs,
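The seed bookkeeping in this last hunk is plain wrap-around arithmetic: every request's sampling seed is advanced by a fixed increment and reduced modulo MAX_INFER_SEED so it stays bounded. A toy sketch with made-up values (the bound and increment here are assumptions, not the runner's constants):

import numpy as np

MAX_INFER_SEED = 2**63 - 1                                    # assumed bound
infer_seed = np.array([12345, 67890, 24680], dtype=np.int64)  # one sampling seed per slot
infer_seed_increment = np.int64(1)                            # assumed step size

# Advance every seed and wrap so it stays within [0, MAX_INFER_SEED).
infer_seed = (infer_seed + infer_seed_increment) % MAX_INFER_SEED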