From f0f00a60254282a107e48dacc236b72f9ea4bd1c Mon Sep 17 00:00:00 2001
From: chen <103103266+ckl117@users.noreply.github.com>
Date: Thu, 14 Aug 2025 22:40:44 +0800
Subject: [PATCH] [OPs] Universal optimization and Fix early_stop cuda 700
 (#3375)

* delete nonzero
* delete setup_ops_base.py
* check if
* check gcu infer_seed.cpu()
* fix repetition_early_stopper_kernel cuda 700
---
 build.sh                                       | 20 +++--------
 custom_ops/gpu_ops/cpp_extensions.cc           | 24 +++++++++++++
 custom_ops/gpu_ops/get_output_ep.cc            |  8 ++---
 .../rejection_top_p_sampling.cu                |  4 +--
 custom_ops/setup_ops.py                        | 21 ++++++++++++
 custom_ops/setup_ops_base.py                   | 34 -------------------
 .../layers/sample/early_stopper.py             |  4 +--
 .../model_executor/layers/sample/meta_data.py  |  2 ++
 .../layers/sample/ops/top_k_top_p_sampling.py  |  9 +++--
 .../model_executor/layers/sample/sampler.py    |  9 +++--
 fastdeploy/model_executor/ops/gpu/__init__.py  |  1 -
 .../model_executor/ops/iluvatar/__init__.py    |  1 -
 fastdeploy/worker/gcu_model_runner.py          | 10 ++++--
 fastdeploy/worker/gpu_model_runner.py          | 14 ++++++--
 fastdeploy/worker/xpu_model_runner.py          | 12 ++++++-
 15 files changed, 102 insertions(+), 71 deletions(-)
 delete mode 100644 custom_ops/setup_ops_base.py

diff --git a/build.sh b/build.sh
index 86ec3cedb..e37fa2bdc 100644
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,6 @@ EGG_DIR="fastdeploy.egg-info"

 # custom_ops directory config
 OPS_SRC_DIR="custom_ops"
-OPS_TMP_DIR_BASE="tmp_base"
 OPS_TMP_DIR="tmp"

 # command line log config
@@ -71,25 +70,20 @@ function copy_ops(){
     PY_VERSION="py${PY_MAIN_VERSION}.${PY_SUB_VERSION}"
     SYSTEM_VERSION=`${python} -c "import platform; print(platform.system().lower())"`
     PROCESSOR_VERSION=`${python} -c "import platform; print(platform.processor())"`
-    WHEEL_BASE_NAME="fastdeploy_base_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
     WHEEL_NAME="fastdeploy_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
     WHEEL_CPU_NAME="fastdeploy_cpu_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
     is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
     if [ "$is_rocm" = "True" ]; then
         DEVICE_TYPE="rocm"
-        mkdir -p ../fastdeploy/model_executor/ops/base
-        cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
-        echo -e "BASE and ROCM ops have been copy to fastdeploy"
+        echo -e "ROCM ops have been copied to fastdeploy"
         return
     fi
-    mkdir -p ../fastdeploy/model_executor/ops/base
     is_cuda=`$python -c "import paddle; print(paddle.is_compiled_with_cuda())"`
     if [ "$is_cuda" = "True" ]; then
         DEVICE_TYPE="gpu"
-        cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
-        echo -e "BASE and CUDA ops have been copy to fastdeploy"
+        echo -e "CUDA ops have been copied to fastdeploy"
         return
     fi
@@ -112,9 +106,8 @@ function copy_ops(){
     if_corex=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device(\"iluvatar_gpu\"))"`
     if [ "$if_corex" = "True" ]; then
         DEVICE_TYPE="iluvatar-gpu"
-        cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/iluvatar
-        echo -e "BASE and Iluvatar ops have been copy to fastdeploy"
+        echo -e "Iluvatar ops have been copied to fastdeploy"
         return
     fi
@@ -137,19 +130,15 @@ function copy_ops(){
     fi

     DEVICE_TYPE="cpu"
-    cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
     cd ../../../../
     cp -r ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/* ../fastdeploy/model_executor/ops/cpu
-    echo -e "BASE and CPU ops have been copy to fastdeploy"
+    echo -e "CPU ops have been copied to fastdeploy"
     return
 }

 function build_and_install_ops() {
     cd $OPS_SRC_DIR
     export no_proxy=bcebos.com,paddlepaddle.org.cn,${no_proxy}
-    echo -e "${BLUE}[build]${NONE} build and install fastdeploy_base_ops..."
-    ${python} setup_ops_base.py install --install-lib ${OPS_TMP_DIR_BASE}
-    find ${OPS_TMP_DIR_BASE} -type f -name "*.o" -exec rm -f {} \;
     echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
     TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
     is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
@@ -223,7 +212,6 @@ function cleanup() {
     fi

     rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
-    rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR_BASE
     rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR
 }

diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
index d43a4af5c..17911252a 100644
--- a/custom_ops/gpu_ops/cpp_extensions.cc
+++ b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -776,6 +776,22 @@ void MergePrefillDecodeOutput(
     const int head_dim,
     const int max_token);

+std::vector<paddle::Tensor> TopPSamplingReject(const paddle::Tensor &probs,
+                                               const paddle::Tensor &top_p,
+                                               const paddle::optional<paddle::Tensor> &top_k,
+                                               int64_t seed);
+
+std::vector<paddle::Tensor> TopKRenorm(const paddle::Tensor &probs,
+                                       const paddle::Tensor &top_k);
+
+std::vector<paddle::Tensor> MinPSamplingFromProbs(const paddle::Tensor &probs,
+                                                  const paddle::Tensor &min_p);
+
+void SaveOutMmsgStatic(const paddle::Tensor& x,
+                       const paddle::Tensor& not_need_stop,
+                       int64_t rank_id,
+                       bool save_each_rank);
+
 PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("get_expert_token_num", &GetExpertTokenNum,
         py::arg("topk_ids"),
@@ -1128,4 +1144,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("speculate_step_paddle",&SpeculateStepPaddle, "speculate_step_paddle function");

   m.def("merge_prefill_decode_output", &MergePrefillDecodeOutput, "merge_prefill_decode_output function");
+
+  m.def("rejection_top_p_sampling", &TopPSamplingReject, "rejection_top_p_sampling function");
+
+  m.def("top_k_renorm_probs", &TopKRenorm, "top_k_renorm_probs function");
+
+  m.def("min_p_sampling", &MinPSamplingFromProbs, "min_p_sampling function");
+
+  m.def("save_output", &SaveOutMmsgStatic, "save_output function");
 }
diff --git a/custom_ops/gpu_ops/get_output_ep.cc b/custom_ops/gpu_ops/get_output_ep.cc
index f5f742022..68730615f 100644
--- a/custom_ops/gpu_ops/get_output_ep.cc
+++ b/custom_ops/gpu_ops/get_output_ep.cc
@@ -109,11 +109,11 @@ void GetOutputEp(const paddle::Tensor& x,
   return;
 }

-void GetOutputStatic(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) {
+void GetOutputEPStatic(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) {
   GetOutputEp(x, rank_id, wait_flag, 1);
 }

-void GetOutputDynamic(const paddle::Tensor& x,
+void GetOutputEPDynamic(const paddle::Tensor& x,
                       int64_t rank_id,
                       bool wait_flag,
                       int msg_queue_id) {
@@ -125,11 +125,11 @@ PD_BUILD_STATIC_OP(get_output_ep)
     .Inputs({"x"})
     .Attrs({"rank_id: int64_t", "wait_flag: bool"})
     .Outputs({"x_out"})
     .SetInplaceMap({{"x", "x_out"}})
-    .SetKernelFn(PD_KERNEL(GetOutputStatic));
+    .SetKernelFn(PD_KERNEL(GetOutputEPStatic));

 PD_BUILD_STATIC_OP(get_output_ep_dynamic)
     .Inputs({"x"})
     .Attrs({"rank_id: int64_t", "wait_flag: bool", "msg_queue_id: int"})
     .Outputs({"x_out"})
     .SetInplaceMap({{"x", "x_out"}})
-    .SetKernelFn(PD_KERNEL(GetOutputDynamic));
+    .SetKernelFn(PD_KERNEL(GetOutputEPDynamic));
diff --git a/custom_ops/gpu_ops/sample_kernels/rejection_top_p_sampling.cu b/custom_ops/gpu_ops/sample_kernels/rejection_top_p_sampling.cu
index dbc5b52e4..99c87d36f 100644
--- a/custom_ops/gpu_ops/sample_kernels/rejection_top_p_sampling.cu
+++ b/custom_ops/gpu_ops/sample_kernels/rejection_top_p_sampling.cu
@@ -19,7 +19,7 @@
 std::vector<paddle::Tensor> TopPSamplingReject(const paddle::Tensor &probs,
                                                const paddle::Tensor &top_p,
                                                const paddle::optional<paddle::Tensor> &top_k,
-                                               int seed) {
+                                               int64_t seed) {
   std::vector<int64_t> probs_shape = probs.shape();
   unsigned int batch_size = probs_shape[0];
   unsigned int vocab_size = probs_shape[1];
@@ -82,7 +82,7 @@ TopPSamplingRejectInferDtype(const paddle::DataType &probs_dtype,
 PD_BUILD_STATIC_OP(rejection_top_p_sampling)
     .Inputs({"probs", "top_p", paddle::Optional("top_k")})
     .Outputs({"samples"})
-    .Attrs({"seed: int"})
+    .Attrs({"seed: int64_t"})
     .SetKernelFn(PD_KERNEL(TopPSamplingReject))
     .SetInferShapeFn(PD_INFER_SHAPE(TopPSamplingRejectInferShape))
     .SetInferDtypeFn(PD_INFER_DTYPE(TopPSamplingRejectInferDtype));
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index de4202bc2..a94c22f48 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -199,6 +199,11 @@ if paddle.is_compiled_with_rocm():
     if not os.listdir(json_dir):
         raise ValueError("Git clone nlohmann_json failed!")
     sources = [
+        "gpu_ops/save_with_output_msg.cc",
+        "gpu_ops/get_output.cc",
+        "gpu_ops/get_output_msg_with_topk.cc",
+        "gpu_ops/save_output_msg_with_topk.cc",
+        "gpu_ops/transfer_output.cc",
         "gpu_ops/set_value_by_flags.cu",
         "gpu_ops/token_penalty_multi_scores.cu",
         "gpu_ops/stop_generation.cu",
@@ -250,6 +255,11 @@ elif paddle.is_compiled_with_cuda():
     sources = [
+        "gpu_ops/save_with_output_msg.cc",
+        "gpu_ops/get_output.cc",
+        "gpu_ops/get_output_msg_with_topk.cc",
+        "gpu_ops/save_output_msg_with_topk.cc",
+        "gpu_ops/transfer_output.cc",
         "gpu_ops/set_mask_value.cu",
         "gpu_ops/set_value_by_flags.cu",
         "gpu_ops/ngram_mask.cu",
@@ -532,6 +542,11 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
             ]
         },
         sources=[
+            "gpu_ops/save_with_output_msg.cc",
+            "gpu_ops/get_output.cc",
+            "gpu_ops/get_output_msg_with_topk.cc",
+            "gpu_ops/save_output_msg_with_topk.cc",
+            "gpu_ops/transfer_output.cc",
             "gpu_ops/get_padding_offset.cu",
             "gpu_ops/set_value_by_flags.cu",
             "gpu_ops/rebuild_padding.cu",
@@ -653,6 +668,12 @@ else:
         name="fastdeploy_cpu_ops",
         ext_modules=CppExtension(
             sources=[
+                "gpu_ops/save_with_output_msg.cc",
+                "gpu_ops/get_output.cc",
+                "gpu_ops/get_output_msg_with_topk.cc",
+                "gpu_ops/save_output_msg_with_topk.cc",
+                "gpu_ops/transfer_output.cc",
+                "cpu_ops/rebuild_padding.cc",
                 "cpu_ops/simd_sort.cc",
                 "cpu_ops/set_value_by_flags.cc",
                 "cpu_ops/token_penalty_multi_scores.cc",
diff --git a/custom_ops/setup_ops_base.py b/custom_ops/setup_ops_base.py
deleted file mode 100644
index 2386fee19..000000000
--- a/custom_ops/setup_ops_base.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""setup for FASTDEPLOY base ops"""
-
-from paddle.utils.cpp_extension import CppExtension, setup
-
-setup(
-    name="fastdeploy_base_ops",
-    ext_modules=CppExtension(
-        sources=[
-            "gpu_ops/save_with_output_msg.cc",
-            "gpu_ops/get_output.cc",
-            "gpu_ops/get_output_msg_with_topk.cc",
-            "gpu_ops/save_output_msg_with_topk.cc",
-            "gpu_ops/transfer_output.cc",
-            "cpu_ops/rebuild_padding.cc",
-        ],
-        extra_compile_args=[
-            "-DPy_LIMITED_API=0x03090000",
-            "-DPADDLE_ON_INFERENCE",
-        ],
-    ),
-)
diff --git a/fastdeploy/model_executor/layers/sample/early_stopper.py b/fastdeploy/model_executor/layers/sample/early_stopper.py
index 9ca4707d3..3ac0daf2f 100644
--- a/fastdeploy/model_executor/layers/sample/early_stopper.py
+++ b/fastdeploy/model_executor/layers/sample/early_stopper.py
@@ -90,10 +90,10 @@ class RepetitionEarlyStopper(EarlyStopper):
         )

         B, W = self.trunc_scores.shape
-        V = probs.shape[1]
+        real_bsz, V = probs.shape
         BLOCK_W = triton.next_power_of_2(W)

-        grid = (B,)
+        grid = (real_bsz,)
         repetition_early_stopper_kernel[grid](
             self.trunc_scores,
             probs,
diff --git a/fastdeploy/model_executor/layers/sample/meta_data.py b/fastdeploy/model_executor/layers/sample/meta_data.py
index 06281a5a5..2f79dc48b 100644
--- a/fastdeploy/model_executor/layers/sample/meta_data.py
+++ b/fastdeploy/model_executor/layers/sample/meta_data.py
@@ -42,7 +42,9 @@ class SamplingMetadata:

     top_p: paddle.Tensor
     top_k: Optional[paddle.Tensor] = None
+    top_k_list: Optional[list] = None
     min_p: Optional[paddle.Tensor] = None
+    min_p_list: Optional[list] = None
     seed: Optional[paddle.Tensor] = None
     max_num_logprobs: Optional[int] = None
     enable_early_stop: Optional[int] = False
diff --git a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
index 9e9e4cf9e..2b0e522cc 100644
--- a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
+++ b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
@@ -29,6 +29,7 @@ def top_k_top_p_sampling(
     x: paddle.Tensor,
     top_p: paddle.Tensor,
     top_k: Optional[paddle.Tensor] = None,
+    top_k_list: Optional[list] = None,
     threshold: Optional[paddle.Tensor] = None,
     topp_seed: Optional[paddle.Tensor] = None,
     seed: int = -1,
@@ -64,7 +65,7 @@
     if top_p_class == "air":
         _, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)
     elif top_p_class == "rejection":
-        ids = rejection_top_p_sampling(x, top_p, top_k, seed, order)
+        ids = rejection_top_p_sampling(x, top_p, top_k, top_k_list, seed, order)
         _ = None
     elif top_p_class == "base_non_truncated":
         _, ids = paddle.tensor.top_p_sampling(
@@ -121,6 +122,7 @@ def rejection_top_p_sampling(
     x: paddle.Tensor,
     top_p: paddle.Tensor,
     top_k: paddle.Tensor,
+    top_k_list: list,
     seed: int = -1,
     order: Literal["top_k_first", "joint"] = "top_k_first",
 ) -> paddle.Tensor:
@@ -139,7 +141,7 @@
         top_k_renorm_probs,
     )

-    if paddle.count_nonzero(top_k) == 0:
+    if not any(x > 0 for x in top_k_list):
         ids = rejection_top_p_sampling(
             x,
             top_p,
@@ -170,11 +172,12 @@
 def min_p_sampling(
     probs: paddle.tensor,
     min_p_arr: Optional[paddle.Tensor],
+    min_p_arr_cpu: Optional[list],
 ) -> tuple[paddle.Tensor, paddle.Tensor]:
     """
     min_p_sampling
     """
-    if paddle.count_nonzero(min_p_arr) == 0:
+    if not any(x > 0 for x in min_p_arr_cpu):
         return probs
     else:
         if current_platform.is_cuda():
diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
index cece8f870..1cc26e4fb 100644
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -281,10 +281,13 @@ class Sampler(nn.Layer):

         probs = F.softmax(logits)

-        probs = min_p_sampling(probs, sampling_metadata.min_p)
-
+        probs = min_p_sampling(probs, sampling_metadata.min_p, sampling_metadata.min_p_list)
         _, next_tokens = top_k_top_p_sampling(
-            probs, sampling_metadata.top_p, sampling_metadata.top_k, seed=sampling_metadata.seed[0, 0]
+            probs,
+            sampling_metadata.top_p,
+            sampling_metadata.top_k,
+            sampling_metadata.top_k_list,
+            seed=sampling_metadata.seed[0, 0],
         )

         logprobs_tensors = (
diff --git a/fastdeploy/model_executor/ops/gpu/__init__.py b/fastdeploy/model_executor/ops/gpu/__init__.py
index 49ed5e0ea..1e9ae2949 100644
--- a/fastdeploy/model_executor/ops/gpu/__init__.py
+++ b/fastdeploy/model_executor/ops/gpu/__init__.py
@@ -19,7 +19,6 @@ from fastdeploy.import_ops import import_custom_ops

 PACKAGE = "fastdeploy.model_executor.ops.gpu"

-import_custom_ops(PACKAGE, "..base.fastdeploy_base_ops", globals())
 import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())
diff --git a/fastdeploy/model_executor/ops/iluvatar/__init__.py b/fastdeploy/model_executor/ops/iluvatar/__init__.py
index 83b42f661..8d07acf0c 100644
--- a/fastdeploy/model_executor/ops/iluvatar/__init__.py
+++ b/fastdeploy/model_executor/ops/iluvatar/__init__.py
@@ -17,7 +17,6 @@ from fastdeploy.import_ops import import_custom_ops

 PACKAGE = "fastdeploy.model_executor.ops.iluvatar"

-import_custom_ops(PACKAGE, "..base.fastdeploy_base_ops", globals())
 import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())

 from .moe_ops import iluvatar_moe_expert_ffn as moe_expert_ffn  # noqa: F401
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index c1d7fe60c..6065b496a 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -94,7 +94,7 @@ class GCUModelRunner(ModelRunnerBase):
             shape=[self.parallel_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
-        )
+        ).cpu()
         self.restore_chunked_prefill_request = dict()

         # Initialize attention Backend
@@ -239,7 +239,9 @@
             self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
             self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7)
             self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
+            self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
             self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0)
+            self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0)
             self.share_inputs["temperature"][idx : idx + 1] = get_attr_from_request(request, "temperature", 0.95)

             self.share_inputs["penalty_score"][idx : idx + 1] = get_attr_from_request(
@@ -361,7 +363,9 @@
         self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64")
         self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32")
         self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
+        self.share_inputs["top_k_list"] = [0] * max_num_seqs
         self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32")
+        self.share_inputs["min_p_list"] = [0.0] * max_num_seqs
self.share_inputs["temperature"] = paddle.full( [max_num_seqs, 1], self.model_config.temperature, dtype="float32" ) @@ -408,7 +412,7 @@ class GCUModelRunner(ModelRunnerBase): self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") self.share_inputs["need_block_len"] = paddle.full([1], 0, dtype="int32") self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32") - self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64").cpu() self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -539,7 +543,9 @@ class GCUModelRunner(ModelRunnerBase): temperature=self.share_inputs["temperature"], top_p=self.share_inputs["top_p"], top_k=self.share_inputs["top_k"], + top_k_list=self.share_inputs["top_k_list"], min_p=self.share_inputs["min_p"], + min_p_list=self.share_inputs["min_p_list"], seed=self.share_inputs["infer_seed"], step_idx=self.share_inputs["step_idx"], pre_token_ids=self.share_inputs["pre_ids"], diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index b039a545a..c8e9f5d87 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -138,7 +138,7 @@ class GPUModelRunner(ModelRunnerBase): shape=[self.parallel_config.max_num_seqs, 1], fill_value=4, dtype="int64", - ) + ).cpu() self.restore_chunked_prefill_request = dict() @@ -315,6 +315,10 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7) + self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) + self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) + self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0) self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95) self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0) self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0) @@ -478,7 +482,9 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7) self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) + self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0) self.share_inputs["temperature"][idx : idx + 1] = get_attr_from_request(request, "temperature", 0.95) self.share_inputs["penalty_score"][idx : idx + 1] = get_attr_from_request( @@ -612,7 +618,9 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64") self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32") self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["top_k_list"] = [0] * max_num_seqs 
self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32") + self.share_inputs["min_p_list"] = [0.0] * max_num_seqs self.share_inputs["temperature"] = paddle.full( [max_num_seqs, 1], self.model_config.temperature, dtype="float32" ) @@ -661,7 +669,7 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") self.share_inputs["need_block_len"] = paddle.full([1], 0, dtype="int32") self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32") - self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64").cpu() self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -830,7 +838,9 @@ class GPUModelRunner(ModelRunnerBase): temperature=self.share_inputs["temperature"], top_p=self.share_inputs["top_p"], top_k=self.share_inputs["top_k"], + top_k_list=self.share_inputs["top_k_list"], min_p=self.share_inputs["min_p"], + min_p_list=self.share_inputs["min_p_list"], seed=self.share_inputs["infer_seed"], step_idx=self.share_inputs["step_idx"], pre_token_ids=self.share_inputs["pre_ids"], diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 8c06481de..570eb5018 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -361,7 +361,7 @@ class XPUModelRunner(ModelRunnerBase): shape=[self.parallel_config.max_num_seqs, 1], fill_value=4, dtype="int64", - ) + ).cpu() # Initialize attention Backend # Note(gonshaotian): Currently, all attention layers share one attention backend instance. 
@@ -435,6 +435,10 @@ class XPUModelRunner(ModelRunnerBase): self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7) + self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) + self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) + self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0) self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95) self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0) self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0) @@ -476,7 +480,9 @@ class XPUModelRunner(ModelRunnerBase): self.share_inputs["pre_ids"][idx : idx + 1] = -1 self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7) self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) + self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0) self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95) self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0) self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0) @@ -547,7 +553,9 @@ class XPUModelRunner(ModelRunnerBase): self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64") self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32") self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["top_k_list"] = [0] * max_num_seqs self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32") + self.share_inputs["min_p_list"] = [0.0] * max_num_seqs self.share_inputs["temperature"] = paddle.full( [max_num_seqs, 1], self.model_config.temperature, dtype="float32" ) @@ -674,7 +682,9 @@ class XPUModelRunner(ModelRunnerBase): temperature=self.share_inputs["temperature"], top_p=self.share_inputs["top_p"], top_k=self.share_inputs["top_k"], + top_k_list=self.share_inputs["top_k_list"], min_p=self.share_inputs["min_p"], + min_p_list=self.share_inputs["min_p_list"], seed=self.share_inputs["infer_seed"], step_idx=self.share_inputs["step_idx"], pre_token_ids=self.share_inputs["pre_ids"],
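
The pattern behind most of these hunks: each sampling parameter that kernels read from a GPU tensor (top_k, min_p) now also has a plain Python mirror in share_inputs (top_k_list, min_p_list), so "is this feature enabled for any request?" checks become host-side any(...) loops instead of paddle.count_nonzero() on device memory, and infer_seed is allocated with .cpu(), presumably so that indexing seed[0, 0] in the sampler reads host memory rather than forcing a device-to-host sync. Below is a minimal sketch of that idea; it borrows only the share_inputs naming from this patch, and the update_request helper is hypothetical, not code from the repository.

    import paddle

    max_num_seqs = 4

    # Device tensor consumed by the sampling kernels, plus a host-side Python mirror.
    share_inputs = {
        "top_k": paddle.full([max_num_seqs, 1], 0, dtype="int64"),
        "top_k_list": [0] * max_num_seqs,
        # Seed kept on CPU so reading it does not require a device-to-host copy.
        "infer_seed": paddle.full([max_num_seqs, 1], 0, dtype="int64").cpu(),
    }

    def update_request(idx: int, top_k: int) -> None:
        # Hypothetical helper: keep the tensor view and the list view in sync per slot.
        share_inputs["top_k"][idx : idx + 1] = top_k
        share_inputs["top_k_list"][idx] = top_k

    update_request(1, 20)

    # Old-style check: count_nonzero on a GPU tensor can force a sync every step.
    needs_top_k_old = bool(paddle.count_nonzero(share_inputs["top_k"]) > 0)
    # New-style check: pure Python over the mirrored list, no device access.
    needs_top_k_new = any(k > 0 for k in share_inputs["top_k_list"])
    assert needs_top_k_old == needs_top_k_new

Keeping the list and tensor views in sync at request-insert time is what the added top_k_list/min_p_list assignments in the gcu/gpu/xpu model runners do; the tensors still feed the kernels, while the lists feed the cheap host-side checks in top_k_top_p_sampling and min_p_sampling.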