[Code Simplification] remove cum_offsets (#3410)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled

This commit is contained in:
lizexu123
2025-08-18 20:21:25 +08:00
committed by GitHub
parent 2cf96ddd68
commit 32b39620bc
9 changed files with 73 additions and 87 deletions

View File

@@ -583,7 +583,6 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["min_dec_len"][idx : idx + 1] = max_dec_len
self.share_inputs["stop_flags"][idx : idx + 1] = False
self.share_inputs["temperature"][idx : idx + 1] = 1
self.share_inputs["first_token_ids"][idx : idx + 1] = self.share_inputs["input_ids"][idx : idx + 1, :1]
self.share_inputs["ori_seq_lens_encoder"][idx : idx + 1] = input_length
@@ -680,7 +679,6 @@ class GPUModelRunner(ModelRunnerBase):
0,
dtype="int64",
)
self.share_inputs["cum_offsets"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
self.share_inputs["batch_id_per_token"] = paddle.full(
[max_num_seqs * self.parallel_config.max_model_len, 1], 0, dtype="int32"
)
@@ -803,7 +801,6 @@ class GPUModelRunner(ModelRunnerBase):
# Remove padding
(
ids_remove_padding,
cum_offsets,
batch_id_per_token,
cu_seqlens_q,
cu_seqlens_k,
@@ -819,7 +816,6 @@ class GPUModelRunner(ModelRunnerBase):
)
self.share_inputs["ids_remove_padding"].copy_(ids_remove_padding, False)
self.share_inputs["cum_offsets"].copy_(cum_offsets, False)
self.share_inputs["batch_id_per_token"].copy_(batch_id_per_token, False)
self.share_inputs["cu_seqlens_q"].copy_(cu_seqlens_q, False)
self.share_inputs["cu_seqlens_k"].copy_(cu_seqlens_k, False)
@@ -965,7 +961,6 @@ class GPUModelRunner(ModelRunnerBase):
cache_kvs_list.append(value_cache)
self.share_inputs["caches"] = cache_kvs_list
else:
for i in range(self.model_config.num_hidden_layers):
cache_kvs[f"key_caches_{i}"] = paddle.full(
@@ -1071,7 +1066,7 @@ class GPUModelRunner(ModelRunnerBase):
hidden_states = rebuild_padding(
model_output,
self.share_inputs["cum_offsets"],
self.share_inputs["cu_seqlens_q"],
self.share_inputs["seq_lens_this_time"],
self.share_inputs["seq_lens_decoder"],
self.share_inputs["seq_lens_encoder"],
@@ -1336,7 +1331,7 @@ class GPUModelRunner(ModelRunnerBase):
)
hidden_states = rebuild_padding(
model_output,
self.share_inputs["cum_offsets"],
self.share_inputs["cu_seqlens_q"],
self.share_inputs["seq_lens_this_time"],
self.share_inputs["seq_lens_decoder"],
self.share_inputs["seq_lens_encoder"],
@@ -1436,6 +1431,7 @@ class GPUModelRunner(ModelRunnerBase):
# 7. Updata 'infer_seed' and step_cuda()
self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED
if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
step_cuda(
self.share_inputs,