[Fix] Fix top_k_top_p sampling (#2801)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled

* fix topk-topp

* update

* add base_non_truncated sampling class
This commit is contained in:
Sunny-bot1
2025-07-10 22:35:10 +08:00
committed by GitHub
parent 59071268b6
commit 240d6236bc
8 changed files with 23 additions and 123 deletions

View File

@@ -52,7 +52,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"FD_ATTENTION_BACKEND":
lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"),
# Sampling class ("base", "air", or "rejection")
# Sampling class ("base", "base_non_truncated", "air", or "rejection")
"FD_SAMPLING_CLASS":
lambda: os.getenv("FD_SAMPLING_CLASS", "base"),

View File

@@ -51,7 +51,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"FD_ATTENTION_BACKEND":
lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"),
# 设置采样类别,当前可设置为 "base"、"air" 或 "rejection"
# 设置采样类别,当前可设置为 "base"、"base_non_truncated"、"air" 或 "rejection"
"FD_SAMPLING_CLASS":
lambda: os.getenv("FD_SAMPLING_CLASS", "base"),

View File

@@ -74,7 +74,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"FD_ATTENTION_BACKEND":
lambda: os.getenv("FD_ATTENTION_BACKEND", "APPEND_ATTN"),
# Set sampling class. "base", "air" and "rejection" can be set currently.
# Set sampling class. "base", "base_non_truncated", "air" and "rejection" can be set currently.
"FD_SAMPLING_CLASS":
lambda: os.getenv("FD_SAMPLING_CLASS", "base"),

View File

@@ -71,6 +71,14 @@ def top_k_top_p_sampling(
elif top_p_class == "rejection":
ids = rejection_top_p_sampling(x, top_p, top_k, seed, order)
_ = None
elif top_p_class == "base_non_truncated":
_, ids = paddle.tensor.top_p_sampling(x,
top_p,
threshold=threshold,
topp_seed=topp_seed,
seed=seed,
k=k,
mode="non-truncated")
else:
if current_platform.is_gcu():
_, ids = gcu_top_p_sampling(x, top_p)
@@ -81,7 +89,7 @@ def top_k_top_p_sampling(
topp_seed=topp_seed,
seed=seed,
k=k,
mode=mode)
mode="truncated")
return _, ids
@@ -109,26 +117,25 @@ def air_top_p_sampling(
def rejection_top_p_sampling(
x: paddle.Tensor,
top_p: paddle.Tensor,
top_k: Optional[paddle.Tensor] = None,
top_k: paddle.Tensor,
seed: int = -1,
order: Literal['top_k_first', 'joint'] = "top_k_first",
) -> paddle.Tensor:
"""
rejection_top_p_sampling
"""
assert top_p is not None, "Top_p should not be none when FD_SAMPLING_CLASS is rejection"
try:
from fastdeploy.model_executor.ops.gpu import (
rejection_top_p_sampling, top_k_renorm_probs)
if top_k is None:
if paddle.count_nonzero(top_k) == 0:
ids = rejection_top_p_sampling(
x,
top_p,
None,
seed,
)
elif top_k is not None and top_p is not None:
else:
if order == "top_k_first":
renorm_probs = top_k_renorm_probs(x, top_k)
ids = rejection_top_p_sampling(
@@ -144,10 +151,6 @@ def rejection_top_p_sampling(
top_k,
seed,
)
else:
raise ValueError(
"Top_p cannot be none."
)
except ImportError:
raise RuntimeError("Cannot import rejection_top_p_sampling op.")
return ids

View File

@@ -155,29 +155,12 @@ class GCUModelRunner(ModelRunnerBase):
-1].disaggregate_info["role"] == "prefill":
os.environ['PREFILL_NODE_ONE_STEP_STOP'] = "1"
top_k_reqs = []
top_p_reqs = []
max_num_seqs = self.parallel_config.max_num_seqs
top_p_buffer = paddle.full([max_num_seqs, 1],
self.model_config.top_p,
dtype='float32')
top_k_buffer = paddle.full([max_num_seqs, 1],
0,
dtype='int64')
req_len = len(req_dicts)
for i in range(req_len):
request = req_dicts[i]
idx = request.idx
length = len(request.prompt_token_ids)
if sampling_params := request.sampling_params:
if sampling_params.top_p < 1:
top_p_reqs.append(idx)
top_k = sampling_params.top_k
if top_k > 0:
top_k_reqs.append(idx)
prefill_tokens = []
if (request.guided_json is not None
or request.guided_regex is not None
@@ -252,8 +235,8 @@ class GCUModelRunner(ModelRunnerBase):
request.eos_token_ids.append(request.eos_token_ids[0])
self.share_inputs["eos_token_id"][:] = np.array(
request.eos_token_ids, dtype="int64").reshape(-1, 1)
top_p_buffer[idx:idx + 1] = request.get("top_p", 1.0)
top_k_buffer[idx:idx + 1] = request.get("top_k", 0)
self.share_inputs["top_p"][idx:idx + 1] = request.get("top_p", 1.0)
self.share_inputs["top_k"][idx:idx + 1] = request.get("top_k", 0)
self.share_inputs["temperature"][idx:idx + 1] = request.get(
"temperature", 0.95)
self.share_inputs["penalty_score"][idx:idx + 1] = request.get(
@@ -304,16 +287,6 @@ class GCUModelRunner(ModelRunnerBase):
if self.speculative_method in ["mtp"]:
self.proposer.insert_prefill_inputs(req_dicts)
if len(top_k_reqs) == 0:
self.share_inputs["top_k"] = None
else:
self.share_inputs["top_k"] = top_k_buffer
if len(top_p_reqs) == 0:
self.share_inputs["top_p"] = None
else:
self.share_inputs["top_p"] = top_p_buffer
def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int,
expected_decode_len: int):
""" Set dummy prefill inputs to share_inputs """

View File

@@ -164,15 +164,6 @@ class GPUModelRunner(ModelRunnerBase):
-1].disaggregate_info["role"] == "prefill":
os.environ['PREFILL_NODE_ONE_STEP_STOP'] = "1"
top_k_reqs = []
top_p_reqs = []
max_num_seqs = self.parallel_config.max_num_seqs
top_p_buffer = paddle.full([max_num_seqs, 1],
self.model_config.top_p,
dtype='float32')
top_k_buffer = paddle.full([max_num_seqs, 1],
0,
dtype='int64')
req_len = len(req_dicts)
for i in range(req_len):
request = req_dicts[i]
@@ -180,13 +171,6 @@ class GPUModelRunner(ModelRunnerBase):
length = len(request.prompt_token_ids)
assert length > 0, "The prompt requested must not be empty."
if sampling_params := request.sampling_params:
if sampling_params.top_p < 1:
top_p_reqs.append(idx)
top_k = sampling_params.top_k
if top_k > 0:
top_k_reqs.append(idx)
prefill_tokens = []
if (request.guided_json is not None
or request.guided_regex is not None
@@ -261,8 +245,8 @@ class GPUModelRunner(ModelRunnerBase):
request.eos_token_ids.append(request.eos_token_ids[0])
self.share_inputs["eos_token_id"][:] = np.array(
request.eos_token_ids, dtype="int64").reshape(-1, 1)
top_p_buffer[idx:idx + 1] = request.get("top_p", 1.0)
top_k_buffer[idx:idx + 1] = request.get("top_k", 0)
self.share_inputs["top_p"][idx:idx + 1] = request.get("top_p", 1.0)
self.share_inputs["top_k"][idx:idx + 1] = request.get("top_k", 0)
self.share_inputs["temperature"][idx:idx + 1] = request.get(
"temperature", 0.95)
self.share_inputs["penalty_score"][idx:idx + 1] = request.get(
@@ -313,16 +297,6 @@ class GPUModelRunner(ModelRunnerBase):
if self.speculative_method in ["mtp"]:
self.proposer.insert_prefill_inputs(req_dicts)
if len(top_k_reqs) == 0:
self.share_inputs["top_k"] = None
else:
self.share_inputs["top_k"] = top_k_buffer
if len(top_p_reqs) == 0:
self.share_inputs["top_p"] = None
else:
self.share_inputs["top_p"] = top_p_buffer
def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int,
expected_decode_len: int):
""" Set dummy prefill inputs to share_inputs """

View File

@@ -144,29 +144,12 @@ class IluvatarModelRunner(ModelRunnerBase):
-1].disaggregate_info["role"] == "prefill":
os.environ['PREFILL_NODE_ONE_STEP_STOP'] = "1"
top_k_reqs = []
top_p_reqs = []
max_num_seqs = self.parallel_config.max_num_seqs
top_p_buffer = paddle.full([max_num_seqs, 1],
self.model_config.top_p,
dtype='float32')
top_k_buffer = paddle.full([max_num_seqs, 1],
0,
dtype='int64')
req_len = len(req_dicts)
for i in range(req_len):
request = req_dicts[i]
idx = request.idx
length = len(request.prompt_token_ids)
if sampling_params := request.sampling_params:
if sampling_params.top_p < 1:
top_p_reqs.append(idx)
top_k = sampling_params.top_k
if top_k > 0:
top_k_reqs.append(idx)
prefill_tokens = []
if (request.guided_json is not None
or request.guided_regex is not None
@@ -241,8 +224,8 @@ class IluvatarModelRunner(ModelRunnerBase):
request.eos_token_ids.append(request.eos_token_ids[0])
self.share_inputs["eos_token_id"][:] = np.array(
request.eos_token_ids, dtype="int64").reshape(-1, 1)
top_p_buffer[idx:idx + 1] = request.get("top_p", 1.0)
top_k_buffer[idx:idx + 1] = request.get("top_k", 0)
self.share_inputs["top_p"][idx:idx + 1] = request.get("top_p", 1.0)
self.share_inputs["top_k"][idx:idx + 1] = request.get("top_k", 0)
self.share_inputs["temperature"][idx:idx + 1] = request.get(
"temperature", 0.95)
self.share_inputs["penalty_score"][idx:idx + 1] = request.get(
@@ -289,15 +272,6 @@ class IluvatarModelRunner(ModelRunnerBase):
idx, request.get("logits_processor"), prefill_tokens)
self.share_inputs["not_need_stop"][0] = True
if len(top_k_reqs) == 0:
self.share_inputs["top_k"] = None
else:
self.share_inputs["top_k"] = top_k_buffer
if len(top_p_reqs) == 0:
self.share_inputs["top_p"] = None
else:
self.share_inputs["top_p"] = top_p_buffer
def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int,
expected_decode_len: int):

View File

@@ -282,26 +282,11 @@ class XPUModelRunner(ModelRunnerBase):
def process_prefill_inputs(self, req_dicts: List[Request]):
""" Process inputs for prefill tasks and update share_inputs buffer """
top_k_reqs = []
top_p_reqs = []
max_num_seqs = self.parallel_config.max_num_seqs
top_p_buffer = paddle.full([max_num_seqs, 1],
self.model_config.top_p,
dtype='float32')
top_k_buffer = paddle.full([max_num_seqs, 1],
0,
dtype='int64')
req_len = len(req_dicts)
for i in range(req_len):
request = req_dicts[i]
idx = request.idx
length = request.prompt_token_ids_len
if sampling_params := request.sampling_params:
if sampling_params.top_p < 1:
top_p_reqs.append(idx)
top_k = sampling_params.top_k
if top_k > 0:
top_k_reqs.append(idx)
self.share_inputs["input_ids"][idx:idx + 1, :length] = np.array(
request.prompt_token_ids)
if len(request.eos_token_ids
@@ -310,8 +295,8 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["eos_token_id"][:] = np.array(
request.eos_token_ids, dtype="int64").reshape(-1, 1)
self.share_inputs["pre_ids"][idx:idx + 1] = -1
top_p_buffer[idx:idx + 1] = request.get("top_p", 1.0)
top_k_buffer[idx:idx + 1] = request.get("top_k", 0)
self.share_inputs["top_p"][idx:idx + 1] = request.get("top_p", 1.0)
self.share_inputs["top_k"][idx:idx + 1] = request.get("top_k", 0)
self.share_inputs["temperature"][idx:idx + 1] = request.get(
"temperature", 0.95)
self.share_inputs["penalty_score"][idx:idx + 1] = request.get(
@@ -360,15 +345,6 @@ class XPUModelRunner(ModelRunnerBase):
request.get("stop_token_ids"), dtype="int64")
self.share_inputs["not_need_stop"][0] = True
if len(top_k_reqs) == 0:
self.share_inputs["top_k"] = None
else:
self.share_inputs["top_k"] = top_k_buffer
if len(top_p_reqs) == 0:
self.share_inputs["top_p"] = None
else:
self.share_inputs["top_p"] = top_p_buffer
def _init_share_inputs(self, max_num_seqs: int):
"""Initialize all share buffers for model inputs.