From f0f00a60254282a107e48dacc236b72f9ea4bd1c Mon Sep 17 00:00:00 2001
From: chen <103103266+ckl117@users.noreply.github.com>
Date: Thu, 14 Aug 2025 22:40:44 +0800
Subject: [PATCH] [OPs] Universal optimization and Fix early_stop cuda 700
 (#3375)

* delete nonzero
* delete setup_ops_base.py
* check if
* check gcu infer_seed.cpu()
* fix repetition_early_stopper_kernel cuda 700
---
 build.sh                                       | 20 +++--------
 custom_ops/gpu_ops/cpp_extensions.cc           | 24 +++++++++++++
 custom_ops/gpu_ops/get_output_ep.cc            |  8 ++---
 .../rejection_top_p_sampling.cu                |  4 +--
 custom_ops/setup_ops.py                        | 21 ++++++++++++
 custom_ops/setup_ops_base.py                   | 34 -------------------
 .../layers/sample/early_stopper.py             |  4 +--
 .../model_executor/layers/sample/meta_data.py  |  2 ++
 .../layers/sample/ops/top_k_top_p_sampling.py  |  9 +++--
 .../model_executor/layers/sample/sampler.py    |  9 +++--
 fastdeploy/model_executor/ops/gpu/__init__.py  |  1 -
 .../model_executor/ops/iluvatar/__init__.py    |  1 -
 fastdeploy/worker/gcu_model_runner.py          | 10 ++++--
 fastdeploy/worker/gpu_model_runner.py          | 14 ++++++--
 fastdeploy/worker/xpu_model_runner.py          | 12 ++++++-
 15 files changed, 102 insertions(+), 71 deletions(-)
 delete mode 100644 custom_ops/setup_ops_base.py

diff --git a/build.sh b/build.sh
index 86ec3cedb..e37fa2bdc 100644
--- a/build.sh
+++ b/build.sh
@@ -34,7 +34,6 @@ EGG_DIR="fastdeploy.egg-info"

 # custom_ops directory config
 OPS_SRC_DIR="custom_ops"
-OPS_TMP_DIR_BASE="tmp_base"
 OPS_TMP_DIR="tmp"

 # command line log config
@@ -71,25 +70,20 @@ function copy_ops(){
     PY_VERSION="py${PY_MAIN_VERSION}.${PY_SUB_VERSION}"
     SYSTEM_VERSION=`${python} -c "import platform; print(platform.system().lower())"`
     PROCESSOR_VERSION=`${python} -c "import platform; print(platform.processor())"`
-    WHEEL_BASE_NAME="fastdeploy_base_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
     WHEEL_NAME="fastdeploy_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
     WHEEL_CPU_NAME="fastdeploy_cpu_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
     is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
     if [ "$is_rocm" = "True" ]; then
         DEVICE_TYPE="rocm"
-        mkdir -p ../fastdeploy/model_executor/ops/base
-        cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
-        echo -e "BASE and ROCM ops have been copy to fastdeploy"
+        echo -e "ROCM ops have been copied to fastdeploy"
         return
     fi
-    mkdir -p ../fastdeploy/model_executor/ops/base
     is_cuda=`$python -c "import paddle; print(paddle.is_compiled_with_cuda())"`
     if [ "$is_cuda" = "True" ]; then
         DEVICE_TYPE="gpu"
-        cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
-        echo -e "BASE and CUDA ops have been copy to fastdeploy"
+        echo -e "CUDA ops have been copied to fastdeploy"
         return
     fi
@@ -112,9 +106,8 @@ function copy_ops(){
     if_corex=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device(\"iluvatar_gpu\"))"`
     if [ "$if_corex" = "True" ]; then
         DEVICE_TYPE="iluvatar-gpu"
-        cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/iluvatar
-        echo -e "BASE and Iluvatar ops have been copy to fastdeploy"
+        echo -e "Iluvatar ops have been copied to fastdeploy"
         return
     fi
@@ -137,19 +130,15 @@ function copy_ops(){
     fi

     DEVICE_TYPE="cpu"
-    cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
     cd ../../../../
     cp -r ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/* ../fastdeploy/model_executor/ops/cpu
-    echo -e "BASE and CPU ops have been copy to fastdeploy"
+    echo -e "CPU ops have been copied to fastdeploy"
     return
 }

 function build_and_install_ops() {
     cd $OPS_SRC_DIR
     export no_proxy=bcebos.com,paddlepaddle.org.cn,${no_proxy}
-    echo -e "${BLUE}[build]${NONE} build and install fastdeploy_base_ops..."
-    ${python} setup_ops_base.py install --install-lib ${OPS_TMP_DIR_BASE}
-    find ${OPS_TMP_DIR_BASE} -type f -name "*.o" -exec rm -f {} \;
     echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
     TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
     is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
@@ -223,7 +212,6 @@ function cleanup() {
     fi

     rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
-    rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR_BASE
     rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR
 }

diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc
index d43a4af5c..17911252a 100644
--- a/custom_ops/gpu_ops/cpp_extensions.cc
+++ b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -776,6 +776,22 @@ void MergePrefillDecodeOutput(
     const int head_dim,
     const int max_token);

+std::vector<paddle::Tensor> TopPSamplingReject(const paddle::Tensor &probs,
+                                               const paddle::Tensor &top_p,
+                                               const paddle::optional<paddle::Tensor> &top_k,
+                                               int64_t seed);
+
+std::vector<paddle::Tensor> TopKRenorm(const paddle::Tensor &probs,
+                                       const paddle::Tensor &top_k);
+
+std::vector<paddle::Tensor> MinPSamplingFromProbs(const paddle::Tensor &probs,
+                                                  const paddle::Tensor &min_p);
+
+void SaveOutMmsgStatic(const paddle::Tensor& x,
+                       const paddle::Tensor& not_need_stop,
+                       int64_t rank_id,
+                       bool save_each_rank);
+
 PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("get_expert_token_num", &GetExpertTokenNum,
         py::arg("topk_ids"),
@@ -1128,4 +1144,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("speculate_step_paddle",&SpeculateStepPaddle, "speculate_step_paddle function");

   m.def("merge_prefill_decode_output", &MergePrefillDecodeOutput, "merge_prefill_decode_output function");
+
+  m.def("rejection_top_p_sampling", &TopPSamplingReject, "rejection_top_p_sampling function");
+
+  m.def("top_k_renorm_probs", &TopKRenorm, "top_k_renorm_probs function");
+
+  m.def("min_p_sampling", &MinPSamplingFromProbs, "min_p_sampling function");
+
+  m.def("save_output", &SaveOutMmsgStatic, "save_output function");
 }
diff --git a/custom_ops/gpu_ops/get_output_ep.cc b/custom_ops/gpu_ops/get_output_ep.cc
index f5f742022..68730615f 100644
--- a/custom_ops/gpu_ops/get_output_ep.cc
+++ b/custom_ops/gpu_ops/get_output_ep.cc
@@ -109,11 +109,11 @@ void GetOutputEp(const paddle::Tensor& x,
   return;
 }

-void GetOutputStatic(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) {
+void GetOutputEPStatic(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) {
   GetOutputEp(x, rank_id, wait_flag, 1);
 }

-void GetOutputDynamic(const paddle::Tensor& x,
+void GetOutputEPDynamic(const paddle::Tensor& x,
                       int64_t rank_id,
                       bool wait_flag,
                       int msg_queue_id) {
@@ -125,11 +125,11 @@ PD_BUILD_STATIC_OP(get_output_ep)
     .Inputs({"x"})
     .Attrs({"rank_id: int64_t", "wait_flag: bool"})
     .Outputs({"x_out"})
     .SetInplaceMap({{"x", "x_out"}})
-    .SetKernelFn(PD_KERNEL(GetOutputStatic));
+    .SetKernelFn(PD_KERNEL(GetOutputEPStatic));

 PD_BUILD_STATIC_OP(get_output_ep_dynamic)
     .Inputs({"x"})
     .Attrs({"rank_id: int64_t", "wait_flag: bool", "msg_queue_id: int"})
     .Outputs({"x_out"})
     .SetInplaceMap({{"x", "x_out"}})
-    .SetKernelFn(PD_KERNEL(GetOutputDynamic));
+    .SetKernelFn(PD_KERNEL(GetOutputEPDynamic));
diff --git a/custom_ops/gpu_ops/sample_kernels/rejection_top_p_sampling.cu b/custom_ops/gpu_ops/sample_kernels/rejection_top_p_sampling.cu
index dbc5b52e4..99c87d36f 100644
--- a/custom_ops/gpu_ops/sample_kernels/rejection_top_p_sampling.cu
+++ b/custom_ops/gpu_ops/sample_kernels/rejection_top_p_sampling.cu
@@ -19,7 +19,7 @@
 std::vector<paddle::Tensor> TopPSamplingReject(const paddle::Tensor &probs,
                                                const paddle::Tensor &top_p,
                                                const paddle::optional<paddle::Tensor> &top_k,
-                                               int seed) {
+                                               int64_t seed) {
   std::vector<int64_t> probs_shape = probs.shape();
   unsigned int batch_size = probs_shape[0];
   unsigned int vocab_size = probs_shape[1];
@@ -82,7 +82,7 @@ TopPSamplingRejectInferDtype(const paddle::DataType &probs_dtype,
 PD_BUILD_STATIC_OP(rejection_top_p_sampling)
     .Inputs({"probs", "top_p", paddle::Optional("top_k")})
     .Outputs({"samples"})
-    .Attrs({"seed: int"})
+    .Attrs({"seed: int64_t"})
     .SetKernelFn(PD_KERNEL(TopPSamplingReject))
     .SetInferShapeFn(PD_INFER_SHAPE(TopPSamplingRejectInferShape))
     .SetInferDtypeFn(PD_INFER_DTYPE(TopPSamplingRejectInferDtype));
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index de4202bc2..a94c22f48 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -199,6 +199,11 @@ if paddle.is_compiled_with_rocm():
     if not os.listdir(json_dir):
         raise ValueError("Git clone nlohmann_json failed!")
     sources = [
+        "gpu_ops/save_with_output_msg.cc",
+        "gpu_ops/get_output.cc",
+        "gpu_ops/get_output_msg_with_topk.cc",
+        "gpu_ops/save_output_msg_with_topk.cc",
+        "gpu_ops/transfer_output.cc",
         "gpu_ops/set_value_by_flags.cu",
         "gpu_ops/token_penalty_multi_scores.cu",
         "gpu_ops/stop_generation.cu",
@@ -250,6 +255,11 @@ elif paddle.is_compiled_with_cuda():
     sources = [
+        "gpu_ops/save_with_output_msg.cc",
+        "gpu_ops/get_output.cc",
+        "gpu_ops/get_output_msg_with_topk.cc",
+        "gpu_ops/save_output_msg_with_topk.cc",
+        "gpu_ops/transfer_output.cc",
         "gpu_ops/set_mask_value.cu",
         "gpu_ops/set_value_by_flags.cu",
         "gpu_ops/ngram_mask.cu",
@@ -532,6 +542,11 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
             ]
         },
         sources=[
+            "gpu_ops/save_with_output_msg.cc",
+            "gpu_ops/get_output.cc",
+            "gpu_ops/get_output_msg_with_topk.cc",
+            "gpu_ops/save_output_msg_with_topk.cc",
+            "gpu_ops/transfer_output.cc",
             "gpu_ops/get_padding_offset.cu",
             "gpu_ops/set_value_by_flags.cu",
             "gpu_ops/rebuild_padding.cu",
@@ -653,6 +668,12 @@ else:
         name="fastdeploy_cpu_ops",
         ext_modules=CppExtension(
             sources=[
+                "gpu_ops/save_with_output_msg.cc",
+                "gpu_ops/get_output.cc",
+                "gpu_ops/get_output_msg_with_topk.cc",
+                "gpu_ops/save_output_msg_with_topk.cc",
+                "gpu_ops/transfer_output.cc",
+                "cpu_ops/rebuild_padding.cc",
                 "cpu_ops/simd_sort.cc",
                 "cpu_ops/set_value_by_flags.cc",
                 "cpu_ops/token_penalty_multi_scores.cc",
diff --git a/custom_ops/setup_ops_base.py b/custom_ops/setup_ops_base.py
deleted file mode 100644
index 2386fee19..000000000
--- a/custom_ops/setup_ops_base.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""setup for FASTDEPLOY base ops"""
-
-from paddle.utils.cpp_extension import CppExtension, setup
-
-setup(
-    name="fastdeploy_base_ops",
-    ext_modules=CppExtension(
-        sources=[
-            "gpu_ops/save_with_output_msg.cc",
-            "gpu_ops/get_output.cc",
-            "gpu_ops/get_output_msg_with_topk.cc",
-            "gpu_ops/save_output_msg_with_topk.cc",
-            "gpu_ops/transfer_output.cc",
-            "cpu_ops/rebuild_padding.cc",
-        ],
-        extra_compile_args=[
-            "-DPy_LIMITED_API=0x03090000",
-            "-DPADDLE_ON_INFERENCE",
-        ],
-    ),
-)
diff --git a/fastdeploy/model_executor/layers/sample/early_stopper.py b/fastdeploy/model_executor/layers/sample/early_stopper.py
index 9ca4707d3..3ac0daf2f 100644
--- a/fastdeploy/model_executor/layers/sample/early_stopper.py
+++ b/fastdeploy/model_executor/layers/sample/early_stopper.py
@@ -90,10 +90,10 @@ class RepetitionEarlyStopper(EarlyStopper):
         )

         B, W = self.trunc_scores.shape
-        V = probs.shape[1]
+        real_bsz, V = probs.shape
         BLOCK_W = triton.next_power_of_2(W)

-        grid = (B,)
+        grid = (real_bsz,)
         repetition_early_stopper_kernel[grid](
             self.trunc_scores,
             probs,
diff --git a/fastdeploy/model_executor/layers/sample/meta_data.py b/fastdeploy/model_executor/layers/sample/meta_data.py
index 06281a5a5..2f79dc48b 100644
--- a/fastdeploy/model_executor/layers/sample/meta_data.py
+++ b/fastdeploy/model_executor/layers/sample/meta_data.py
@@ -42,7 +42,9 @@ class SamplingMetadata:

     top_p: paddle.Tensor
     top_k: Optional[paddle.Tensor] = None
+    top_k_list: Optional[list] = None
     min_p: Optional[paddle.Tensor] = None
+    min_p_list: Optional[list] = None
     seed: Optional[paddle.Tensor] = None
     max_num_logprobs: Optional[int] = None
     enable_early_stop: Optional[int] = False
diff --git a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
index 9e9e4cf9e..2b0e522cc 100644
--- a/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
+++ b/fastdeploy/model_executor/layers/sample/ops/top_k_top_p_sampling.py
@@ -29,6 +29,7 @@ def top_k_top_p_sampling(
     x: paddle.Tensor,
     top_p: paddle.Tensor,
     top_k: Optional[paddle.Tensor] = None,
+    top_k_list: Optional[list] = None,
     threshold: Optional[paddle.Tensor] = None,
     topp_seed: Optional[paddle.Tensor] = None,
     seed: int = -1,
@@ -64,7 +65,7 @@
     if top_p_class == "air":
         _, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)
     elif top_p_class == "rejection":
-        ids = rejection_top_p_sampling(x, top_p, top_k, seed, order)
+        ids = rejection_top_p_sampling(x, top_p, top_k, top_k_list, seed, order)
         _ = None
     elif top_p_class == "base_non_truncated":
         _, ids = paddle.tensor.top_p_sampling(
@@ -121,6 +122,7 @@ def rejection_top_p_sampling(
     x: paddle.Tensor,
     top_p: paddle.Tensor,
     top_k: paddle.Tensor,
+    top_k_list: list,
     seed: int = -1,
     order: Literal["top_k_first", "joint"] = "top_k_first",
 ) -> paddle.Tensor:
@@ -139,7 +141,7 @@
         top_k_renorm_probs,
     )

-    if paddle.count_nonzero(top_k) == 0:
+    if not any(x > 0 for x in top_k_list):
         ids = rejection_top_p_sampling(
             x,
             top_p,
@@ -170,11 +172,12 @@
 def min_p_sampling(
     probs: paddle.tensor,
     min_p_arr: Optional[paddle.Tensor],
+    min_p_arr_cpu: Optional[list],
 ) -> tuple[paddle.Tensor, paddle.Tensor]:
     """
     min_p_sampling
     """
-    if paddle.count_nonzero(min_p_arr) == 0:
+    if not any(x > 0 for x in min_p_arr_cpu):
         return probs
     else:
         if current_platform.is_cuda():
diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
index cece8f870..1cc26e4fb 100644
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -281,10 +281,13 @@ class Sampler(nn.Layer):

         probs = F.softmax(logits)

-        probs = min_p_sampling(probs, sampling_metadata.min_p)
-
+        probs = min_p_sampling(probs, sampling_metadata.min_p, sampling_metadata.min_p_list)
         _, next_tokens = top_k_top_p_sampling(
-            probs, sampling_metadata.top_p, sampling_metadata.top_k, seed=sampling_metadata.seed[0, 0]
+            probs,
+            sampling_metadata.top_p,
+            sampling_metadata.top_k,
+            sampling_metadata.top_k_list,
+            seed=sampling_metadata.seed[0, 0],
         )

         logprobs_tensors = (
diff --git a/fastdeploy/model_executor/ops/gpu/__init__.py b/fastdeploy/model_executor/ops/gpu/__init__.py
index 49ed5e0ea..1e9ae2949 100644
--- a/fastdeploy/model_executor/ops/gpu/__init__.py
+++ b/fastdeploy/model_executor/ops/gpu/__init__.py
@@ -19,7 +19,6 @@ from fastdeploy.import_ops import import_custom_ops

 PACKAGE = "fastdeploy.model_executor.ops.gpu"

-import_custom_ops(PACKAGE, "..base.fastdeploy_base_ops", globals())
 import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())
diff --git a/fastdeploy/model_executor/ops/iluvatar/__init__.py b/fastdeploy/model_executor/ops/iluvatar/__init__.py
index 83b42f661..8d07acf0c 100644
--- a/fastdeploy/model_executor/ops/iluvatar/__init__.py
+++ b/fastdeploy/model_executor/ops/iluvatar/__init__.py
@@ -17,7 +17,6 @@ from fastdeploy.import_ops import import_custom_ops

 PACKAGE = "fastdeploy.model_executor.ops.iluvatar"

-import_custom_ops(PACKAGE, "..base.fastdeploy_base_ops", globals())
 import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())

 from .moe_ops import iluvatar_moe_expert_ffn as moe_expert_ffn  # noqa: F401
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index c1d7fe60c..6065b496a 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -94,7 +94,7 @@ class GCUModelRunner(ModelRunnerBase):
             shape=[self.parallel_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
-        )
+        ).cpu()
         self.restore_chunked_prefill_request = dict()

         # Initialize attention Backend
@@ -239,7 +239,9 @@
             self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
             self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7)
             self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
+            self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
             self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0)
+            self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0)
             self.share_inputs["temperature"][idx : idx + 1] = get_attr_from_request(request, "temperature", 0.95)

             self.share_inputs["penalty_score"][idx : idx + 1] = get_attr_from_request(
@@ -361,7 +363,9 @@
         self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64")
         self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32")
         self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
+        self.share_inputs["top_k_list"] = [0] * max_num_seqs
         self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32")
+        self.share_inputs["min_p_list"] = [0.0] * max_num_seqs
self.share_inputs["temperature"] = paddle.full( [max_num_seqs, 1], self.model_config.temperature, dtype="float32" ) @@ -408,7 +412,7 @@ class GCUModelRunner(ModelRunnerBase): self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") self.share_inputs["need_block_len"] = paddle.full([1], 0, dtype="int32") self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32") - self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64").cpu() self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -539,7 +543,9 @@ class GCUModelRunner(ModelRunnerBase): temperature=self.share_inputs["temperature"], top_p=self.share_inputs["top_p"], top_k=self.share_inputs["top_k"], + top_k_list=self.share_inputs["top_k_list"], min_p=self.share_inputs["min_p"], + min_p_list=self.share_inputs["min_p_list"], seed=self.share_inputs["infer_seed"], step_idx=self.share_inputs["step_idx"], pre_token_ids=self.share_inputs["pre_ids"], diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index b039a545a..c8e9f5d87 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -138,7 +138,7 @@ class GPUModelRunner(ModelRunnerBase): shape=[self.parallel_config.max_num_seqs, 1], fill_value=4, dtype="int64", - ) + ).cpu() self.restore_chunked_prefill_request = dict() @@ -315,6 +315,10 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7) + self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) + self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) + self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0) self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95) self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0) self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0) @@ -478,7 +482,9 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7) self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) + self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0) self.share_inputs["temperature"][idx : idx + 1] = get_attr_from_request(request, "temperature", 0.95) self.share_inputs["penalty_score"][idx : idx + 1] = get_attr_from_request( @@ -612,7 +618,9 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64") self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32") self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["top_k_list"] = [0] * max_num_seqs 
self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32") + self.share_inputs["min_p_list"] = [0.0] * max_num_seqs self.share_inputs["temperature"] = paddle.full( [max_num_seqs, 1], self.model_config.temperature, dtype="float32" ) @@ -661,7 +669,7 @@ class GPUModelRunner(ModelRunnerBase): self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32") self.share_inputs["need_block_len"] = paddle.full([1], 0, dtype="int32") self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32") - self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64").cpu() self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64") self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") @@ -830,7 +838,9 @@ class GPUModelRunner(ModelRunnerBase): temperature=self.share_inputs["temperature"], top_p=self.share_inputs["top_p"], top_k=self.share_inputs["top_k"], + top_k_list=self.share_inputs["top_k_list"], min_p=self.share_inputs["min_p"], + min_p_list=self.share_inputs["min_p_list"], seed=self.share_inputs["infer_seed"], step_idx=self.share_inputs["step_idx"], pre_token_ids=self.share_inputs["pre_ids"], diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 8c06481de..570eb5018 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -361,7 +361,7 @@ class XPUModelRunner(ModelRunnerBase): shape=[self.parallel_config.max_num_seqs, 1], fill_value=4, dtype="int64", - ) + ).cpu() # Initialize attention Backend # Note(gonshaotian): Currently, all attention layers share one attention backend instance. 
@@ -435,6 +435,10 @@ class XPUModelRunner(ModelRunnerBase): self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1) self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7) + self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) + self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) + self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0) self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95) self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0) self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0) @@ -476,7 +480,9 @@ class XPUModelRunner(ModelRunnerBase): self.share_inputs["pre_ids"][idx : idx + 1] = -1 self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7) self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0) + self.share_inputs["top_k_list"][idx] = request.get("top_k", 0) self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0) + self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0) self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95) self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0) self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0) @@ -547,7 +553,9 @@ class XPUModelRunner(ModelRunnerBase): self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64") self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32") self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64") + self.share_inputs["top_k_list"] = [0] * max_num_seqs self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32") + self.share_inputs["min_p_list"] = [0.0] * max_num_seqs self.share_inputs["temperature"] = paddle.full( [max_num_seqs, 1], self.model_config.temperature, dtype="float32" ) @@ -674,7 +682,9 @@ class XPUModelRunner(ModelRunnerBase): temperature=self.share_inputs["temperature"], top_p=self.share_inputs["top_p"], top_k=self.share_inputs["top_k"], + top_k_list=self.share_inputs["top_k_list"], min_p=self.share_inputs["min_p"], + min_p_list=self.share_inputs["min_p_list"], seed=self.share_inputs["infer_seed"], step_idx=self.share_inputs["step_idx"], pre_token_ids=self.share_inputs["pre_ids"],
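
The pattern behind most of these hunks: each sampling parameter that kernels read from a GPU tensor (top_k, min_p) now also has a plain Python mirror in share_inputs (top_k_list, min_p_list), so "is this feature enabled for any request?" checks become host-side any(...) loops instead of paddle.count_nonzero() on device memory, and infer_seed is allocated with .cpu(), presumably so that indexing seed[0, 0] in the sampler reads host memory rather than forcing a device-to-host sync. Below is a minimal sketch of that idea; it borrows only the share_inputs naming from this patch, and the update_request helper is hypothetical, not code from the repository.

    import paddle

    max_num_seqs = 4

    # Device tensor consumed by the sampling kernels, plus a host-side Python mirror.
    share_inputs = {
        "top_k": paddle.full([max_num_seqs, 1], 0, dtype="int64"),
        "top_k_list": [0] * max_num_seqs,
        # Seed kept on CPU so reading it does not require a device-to-host copy.
        "infer_seed": paddle.full([max_num_seqs, 1], 0, dtype="int64").cpu(),
    }

    def update_request(idx: int, top_k: int) -> None:
        # Hypothetical helper: keep the tensor view and the list view in sync per slot.
        share_inputs["top_k"][idx : idx + 1] = top_k
        share_inputs["top_k_list"][idx] = top_k

    update_request(1, 20)

    # Old-style check: count_nonzero on a GPU tensor can force a sync every step.
    needs_top_k_old = bool(paddle.count_nonzero(share_inputs["top_k"]) > 0)
    # New-style check: pure Python over the mirrored list, no device access.
    needs_top_k_new = any(k > 0 for k in share_inputs["top_k_list"])
    assert needs_top_k_old == needs_top_k_new

Keeping the list and tensor views in sync at request-insert time is what the added top_k_list/min_p_list assignments in the gcu/gpu/xpu model runners do; the tensors still feed the kernels, while the lists feed the cheap host-side checks in top_k_top_p_sampling and min_p_sampling.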