Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-05 08:37:06 +08:00
[OPs] Universal optimization and Fix early_stop cuda 700 (#3375)
Some checks failed: Deploy GitHub Pages / deploy (push) was cancelled.
* delete nonzero
* delete setup_ops_base.py
* check if
* check gcp infer_seed.cpu()
* fix repetition_early_stopper_kernel cuda 700
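The "delete nonzero" bullets correspond to the hunks below that replace device-side emptiness checks such as paddle.count_nonzero(...) == 0 with a check over a plain Python list kept alongside the tensor (top_k_list, min_p_arr_cpu). Evaluating a GPU tensor inside an if statement has to copy a scalar back to the host and stalls the launch queue; the list check stays entirely on the CPU. A minimal sketch of the pattern, with hypothetical variable names (not FastDeploy code):

# Minimal sketch of the host-side check pattern (hypothetical names).
import paddle

top_k_list = [0, 20, 0, 50]                          # host mirror, one entry per request
top_k = paddle.to_tensor(top_k_list, dtype="int64")  # device tensor consumed by the kernels

# Device-side check: materializes a scalar on the GPU and blocks the host
# until the stream catches up before the `if` can be evaluated.
if paddle.count_nonzero(top_k) == 0:
    print("top_k disabled for every request (synchronous check)")

# Host-side check: pure Python, no GPU->CPU copy, no stream synchronization.
if not any(k > 0 for k in top_k_list):
    print("top_k disabled for every request (no sync)")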
@@ -90,10 +90,10 @@ class RepetitionEarlyStopper(EarlyStopper):
         )
 
         B, W = self.trunc_scores.shape
-        V = probs.shape[1]
+        real_bsz, V = probs.shape
         BLOCK_W = triton.next_power_of_2(W)
 
-        grid = (B,)
+        grid = (real_bsz,)
         repetition_early_stopper_kernel[grid](
             self.trunc_scores,
             probs,
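The grid change above is the cuda 700 (illegal memory access) fix: the kernel runs one program per row of probs, so the grid must be sized from probs' real batch size rather than from trunc_scores, whose leading dimension B can exceed the number of rows actually present in probs. A toy sketch of that launch pattern, assuming Triton plus a CUDA build of Paddle whose tensors are accepted by the Triton launcher (the kernel below is made up, not repetition_early_stopper_kernel):

# Toy sketch: one Triton program per *actual* row of `probs`. A grid larger than
# probs' row count would let the extra programs read and write past the end of
# the buffer, which is the classic cause of CUDA error 700.
import paddle
import triton
import triton.language as tl


@triton.jit
def row_double_kernel(x_ptr, out_ptr, n_cols, BLOCK: tl.constexpr):
    row = tl.program_id(0)                 # row handled by this program
    cols = tl.arange(0, BLOCK)
    mask = cols < n_cols                   # guard the padded tail of the block
    x = tl.load(x_ptr + row * n_cols + cols, mask=mask, other=0.0)
    tl.store(out_ptr + row * n_cols + cols, 2.0 * x, mask=mask)


probs = paddle.rand([3, 1000])             # only 3 rows really exist this step
out = paddle.empty_like(probs)
real_bsz, V = probs.shape
grid = (real_bsz,)                         # size the grid from probs, not from a padded max-batch tensor
row_double_kernel[grid](probs, out, V, BLOCK=triton.next_power_of_2(V))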
@@ -42,7 +42,9 @@ class SamplingMetadata:
     top_p: paddle.Tensor
     top_k: Optional[paddle.Tensor] = None
+    top_k_list: Optional[list] = None
     min_p: Optional[paddle.Tensor] = None
+    min_p_list: Optional[list] = None
     seed: Optional[paddle.Tensor] = None
     max_num_logprobs: Optional[int] = None
     enable_early_stop: Optional[int] = False
@@ -29,6 +29,7 @@ def top_k_top_p_sampling(
     x: paddle.Tensor,
     top_p: paddle.Tensor,
     top_k: Optional[paddle.Tensor] = None,
+    top_k_list: Optional[list] = None,
     threshold: Optional[paddle.Tensor] = None,
     topp_seed: Optional[paddle.Tensor] = None,
     seed: int = -1,
@@ -64,7 +65,7 @@ def top_k_top_p_sampling(
     if top_p_class == "air":
         _, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)
     elif top_p_class == "rejection":
-        ids = rejection_top_p_sampling(x, top_p, top_k, seed, order)
+        ids = rejection_top_p_sampling(x, top_p, top_k, top_k_list, seed, order)
         _ = None
     elif top_p_class == "base_non_truncated":
         _, ids = paddle.tensor.top_p_sampling(
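A hedged usage sketch of the updated call shape, with the argument order taken from the diff above; the import path, tensor shapes, and dtypes are assumptions and may differ from what FastDeploy actually expects:

import paddle
from fastdeploy.model_executor.layers.sample.ops import top_k_top_p_sampling  # assumed path

bsz, vocab = 4, 32000
probs = paddle.nn.functional.softmax(paddle.randn([bsz, vocab]), axis=-1)
top_p = paddle.full([bsz, 1], 0.8, dtype=probs.dtype)       # shape assumed
top_k_list = [0, 20, 0, 50]                                  # host mirror used for cheap checks
top_k = paddle.to_tensor(top_k_list, dtype="int64")          # shape/dtype assumed

# New signature: top_k_list rides along right after top_k.
_, next_tokens = top_k_top_p_sampling(probs, top_p, top_k, top_k_list)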
@@ -121,6 +122,7 @@ def rejection_top_p_sampling(
     x: paddle.Tensor,
     top_p: paddle.Tensor,
     top_k: paddle.Tensor,
+    top_k_list: list,
     seed: int = -1,
     order: Literal["top_k_first", "joint"] = "top_k_first",
 ) -> paddle.Tensor:
@@ -139,7 +141,7 @@ def rejection_top_p_sampling(
             top_k_renorm_probs,
         )
 
-        if paddle.count_nonzero(top_k) == 0:
+        if not any(x > 0 for x in top_k_list):
             ids = rejection_top_p_sampling(
                 x,
                 top_p,
@@ -170,11 +172,12 @@ def rejection_top_p_sampling(
 def min_p_sampling(
     probs: paddle.tensor,
     min_p_arr: Optional[paddle.Tensor],
+    min_p_arr_cpu: Optional[list],
 ) -> tuple[paddle.Tensor, paddle.Tensor]:
     """
     min_p_sampling
     """
-    if paddle.count_nonzero(min_p_arr) == 0:
+    if not any(x > 0 for x in min_p_arr_cpu):
         return probs
     else:
         if current_platform.is_cuda():
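For reference, min-p filtering keeps only tokens whose probability is at least min_p times the largest probability in their row and then renormalizes. The sketch below restates that rule in plain Paddle; it is not FastDeploy's fused kernel, and the helper name is made up:

# Reference sketch of the min-p rule (hypothetical helper, not FastDeploy's kernel).
import paddle

def min_p_filter(probs: paddle.Tensor, min_p: paddle.Tensor) -> paddle.Tensor:
    # probs: [bsz, vocab], rows sum to 1; min_p: [bsz, 1], 0 means "disabled" for that row
    max_probs = probs.max(axis=-1, keepdim=True)
    threshold = min_p * max_probs
    kept = paddle.where(probs >= threshold, probs, paddle.zeros_like(probs))
    return kept / kept.sum(axis=-1, keepdim=True)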
@@ -281,10 +281,13 @@ class Sampler(nn.Layer):
 
         probs = F.softmax(logits)
 
-        probs = min_p_sampling(probs, sampling_metadata.min_p)
+        probs = min_p_sampling(probs, sampling_metadata.min_p, sampling_metadata.min_p_list)
 
         _, next_tokens = top_k_top_p_sampling(
-            probs, sampling_metadata.top_p, sampling_metadata.top_k, seed=sampling_metadata.seed[0, 0]
+            probs,
+            sampling_metadata.top_p,
+            sampling_metadata.top_k,
+            sampling_metadata.top_k_list,
+            seed=sampling_metadata.seed[0, 0],
         )
 
         logprobs_tensors = (
@@ -19,7 +19,6 @@ from fastdeploy.import_ops import import_custom_ops
 
 PACKAGE = "fastdeploy.model_executor.ops.gpu"
 
-import_custom_ops(PACKAGE, "..base.fastdeploy_base_ops", globals())
 import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())
 
 
@@ -17,7 +17,6 @@ from fastdeploy.import_ops import import_custom_ops
 
 PACKAGE = "fastdeploy.model_executor.ops.iluvatar"
 
-import_custom_ops(PACKAGE, "..base.fastdeploy_base_ops", globals())
 import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())
 
 from .moe_ops import iluvatar_moe_expert_ffn as moe_expert_ffn  # noqa: F401