[OPs] Universal optimization and Fix early_stop cuda 700 (#3375)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled

* delete nonzero

* delete setup_ops_base.py

* check if

* check gcp infer_seed.cpu()

* fix repetition_early_stopper_kernel cuda 700
This commit is contained in:
chen
2025-08-14 22:40:44 +08:00
committed by GitHub
parent 09c979f3dd
commit f0f00a6025
15 changed files with 102 additions and 71 deletions

View File

@@ -361,7 +361,7 @@ class XPUModelRunner(ModelRunnerBase):
shape=[self.parallel_config.max_num_seqs, 1],
fill_value=4,
dtype="int64",
)
).cpu()
# Initialize attention Backend
# Note(gonshaotian): Currently, all attention layers share one attention backend instance.
@@ -435,6 +435,10 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7)
self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0)
self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0)
self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95)
self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0)
self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0)
@@ -476,7 +480,9 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["pre_ids"][idx : idx + 1] = -1
self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7)
self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0)
self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0)
self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95)
self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0)
self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0)
@@ -547,7 +553,9 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64")
self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32")
self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
self.share_inputs["top_k_list"] = [0] * max_num_seqs
self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32")
self.share_inputs["min_p_list"] = [0.0] * max_num_seqs
self.share_inputs["temperature"] = paddle.full(
[max_num_seqs, 1], self.model_config.temperature, dtype="float32"
)
@@ -674,7 +682,9 @@ class XPUModelRunner(ModelRunnerBase):
temperature=self.share_inputs["temperature"],
top_p=self.share_inputs["top_p"],
top_k=self.share_inputs["top_k"],
top_k_list=self.share_inputs["top_k_list"],
min_p=self.share_inputs["min_p"],
min_p_list=self.share_inputs["min_p_list"],
seed=self.share_inputs["infer_seed"],
step_idx=self.share_inputs["step_idx"],
pre_token_ids=self.share_inputs["pre_ids"],