[OPs] Universal optimization and Fix early_stop cuda 700 (#3375)

* delete nonzero

* delete setup_ops_base.py

* check if

* check gcp infer_seed.cpu()

* fix repetition_early_stopper_kernel cuda 700
Authored by chen, 2025-08-14 22:40:44 +08:00, committed by GitHub
parent 09c979f3dd
commit f0f00a6025
15 changed files with 102 additions and 71 deletions

View File

@@ -776,6 +776,22 @@ void MergePrefillDecodeOutput(
     const int head_dim,
     const int max_token);
+std::vector<paddle::Tensor> TopPSamplingReject(const paddle::Tensor &probs,
+                                               const paddle::Tensor &top_p,
+                                               const paddle::optional<paddle::Tensor> &top_k,
+                                               int64_t seed);
+std::vector<paddle::Tensor> TopKRenorm(const paddle::Tensor &probs,
+                                       const paddle::Tensor &top_k);
+std::vector<paddle::Tensor> MinPSamplingFromProbs(const paddle::Tensor &probs,
+                                                  const paddle::Tensor &min_p);
+void SaveOutMmsgStatic(const paddle::Tensor& x,
+                       const paddle::Tensor& not_need_stop,
+                       int64_t rank_id,
+                       bool save_each_rank);
 PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),

@@ -1128,4 +1144,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("speculate_step_paddle",&SpeculateStepPaddle, "speculate_step_paddle function");
   m.def("merge_prefill_decode_output", &MergePrefillDecodeOutput, "merge_prefill_decode_output function");
+  m.def("rejection_top_p_sampling", &TopPSamplingReject, "rejection_top_p_sampling function");
+  m.def("top_k_renorm_probs", &TopKRenorm, "top_k_renorm_probs function");
+  m.def("min_p_sampling", &MinPSamplingFromProbs, "min_p_sampling function");
+  m.def("save_output", &SaveOutMmsgStatic, "save_output function");
 }
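For orientation, here is a minimal sketch of how the newly bound sampling op might be called once the extension is built. Only the names in the m.def lines above come from the patch; the import path, tensor shapes, and passing None for the optional top_k are assumptions:

# Hypothetical usage sketch; assumes the module builds as `fastdeploy_ops`
# and that paddle's pybind casters accept None for a paddle::optional input.
import paddle
from fastdeploy_ops import rejection_top_p_sampling

probs = paddle.rand([4, 32000], dtype="float32")  # [batch_size, vocab_size]
probs = probs / probs.sum(axis=-1, keepdim=True)  # each row a distribution
top_p = paddle.full([4], 0.9, dtype="float32")

# `seed` is a 64-bit attribute after this patch, so values past 2**31 - 1
# are passed through without truncation.
samples = rejection_top_p_sampling(probs, top_p, None, 2**40 + 7)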

View File

@@ -109,11 +109,11 @@ void GetOutputEp(const paddle::Tensor& x,
   return;
 }
-void GetOutputStatic(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) {
+void GetOutputEPStatic(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) {
   GetOutputEp(x, rank_id, wait_flag, 1);
 }
-void GetOutputDynamic(const paddle::Tensor& x,
+void GetOutputEPDynamic(const paddle::Tensor& x,
                       int64_t rank_id,
                       bool wait_flag,
                       int msg_queue_id) {

@@ -125,11 +125,11 @@ PD_BUILD_STATIC_OP(get_output_ep)
     .Attrs({"rank_id: int64_t", "wait_flag: bool"})
     .Outputs({"x_out"})
     .SetInplaceMap({{"x", "x_out"}})
-    .SetKernelFn(PD_KERNEL(GetOutputStatic));
+    .SetKernelFn(PD_KERNEL(GetOutputEPStatic));
 PD_BUILD_STATIC_OP(get_output_ep_dynamic)
     .Inputs({"x"})
     .Attrs({"rank_id: int64_t", "wait_flag: bool", "msg_queue_id: int"})
     .Outputs({"x_out"})
     .SetInplaceMap({{"x", "x_out"}})
-    .SetKernelFn(PD_KERNEL(GetOutputDynamic));
+    .SetKernelFn(PD_KERNEL(GetOutputEPDynamic));
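Note that only the C++ symbols are renamed (GetOutputStatic -> GetOutputEPStatic, GetOutputDynamic -> GetOutputEPDynamic); the registered op names get_output_ep and get_output_ep_dynamic are untouched, so Python call sites keep working. A plausible motive for the suffix: gpu_ops/get_output.cc, which is added to every sources list below, defines its own static/dynamic wrappers, and linking both files into one extension would otherwise risk clashing symbols. A sketch under those assumptions, with a hypothetical import path:

# Sketch: the op names are unchanged, only their kernel functions were renamed.
# The import path below is illustrative, not part of this patch.
import paddle
from fastdeploy.model_executor.ops.gpu import get_output_ep, get_output_ep_dynamic

x = paddle.full([256], -1, dtype="int64")  # message buffer; layout assumed
get_output_ep(x, 0, False)                 # now dispatches to GetOutputEPStatic
get_output_ep_dynamic(x, 0, False, 2)      # now dispatches to GetOutputEPDynamic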

View File

@@ -19,7 +19,7 @@
 std::vector<paddle::Tensor> TopPSamplingReject(const paddle::Tensor &probs,
                                                const paddle::Tensor &top_p,
                                                const paddle::optional<paddle::Tensor> &top_k,
-                                               int seed) {
+                                               int64_t seed) {
   std::vector<int64_t> probs_shape = probs.shape();
   unsigned int batch_size = probs_shape[0];
   unsigned int vocab_size = probs_shape[1];

@@ -82,7 +82,7 @@ TopPSamplingRejectInferDtype(const paddle::DataType &probs_dtype,
 PD_BUILD_STATIC_OP(rejection_top_p_sampling)
     .Inputs({"probs", "top_p", paddle::Optional("top_k")})
     .Outputs({"samples"})
-    .Attrs({"seed: int"})
+    .Attrs({"seed: int64_t"})
     .SetKernelFn(PD_KERNEL(TopPSamplingReject))
     .SetInferShapeFn(PD_INFER_SHAPE(TopPSamplingRejectInferShape))
     .SetInferDtypeFn(PD_INFER_DTYPE(TopPSamplingRejectInferDtype));
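The int -> int64_t widening is more than cosmetic: a 32-bit attribute truncates any seed above 2**31 - 1, so two distinct 64-bit seeds can collide and replay the same sampling stream. A self-contained illustration of the truncation (plain Python, no FastDeploy API involved):

seed = 2**40 + 123             # a seed that needs more than 32 bits
truncated = seed & 0xFFFFFFFF  # what the old `seed: int` attr would keep
print(truncated)               # 123 -- collides with the unrelated seed 123
print(seed)                    # 1099511627899 -- intact under `seed: int64_t`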

View File

@@ -199,6 +199,11 @@ if paddle.is_compiled_with_rocm():
     if not os.listdir(json_dir):
         raise ValueError("Git clone nlohmann_json failed!")
     sources = [
+        "gpu_ops/save_with_output_msg.cc",
+        "gpu_ops/get_output.cc",
+        "gpu_ops/get_output_msg_with_topk.cc",
+        "gpu_ops/save_output_msg_with_topk.cc",
+        "gpu_ops/transfer_output.cc",
         "gpu_ops/set_value_by_flags.cu",
         "gpu_ops/token_penalty_multi_scores.cu",
         "gpu_ops/stop_generation.cu",

@@ -250,6 +255,11 @@ if paddle.is_compiled_with_rocm():
     )
 elif paddle.is_compiled_with_cuda():
     sources = [
+        "gpu_ops/save_with_output_msg.cc",
+        "gpu_ops/get_output.cc",
+        "gpu_ops/get_output_msg_with_topk.cc",
+        "gpu_ops/save_output_msg_with_topk.cc",
+        "gpu_ops/transfer_output.cc",
         "gpu_ops/set_mask_value.cu",
         "gpu_ops/set_value_by_flags.cu",
         "gpu_ops/ngram_mask.cu",

@@ -532,6 +542,11 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
             ]
         },
         sources=[
+            "gpu_ops/save_with_output_msg.cc",
+            "gpu_ops/get_output.cc",
+            "gpu_ops/get_output_msg_with_topk.cc",
+            "gpu_ops/save_output_msg_with_topk.cc",
+            "gpu_ops/transfer_output.cc",
             "gpu_ops/get_padding_offset.cu",
             "gpu_ops/set_value_by_flags.cu",
             "gpu_ops/rebuild_padding.cu",

@@ -653,6 +668,12 @@ else:
         name="fastdeploy_cpu_ops",
         ext_modules=CppExtension(
             sources=[
+                "gpu_ops/save_with_output_msg.cc",
+                "gpu_ops/get_output.cc",
+                "gpu_ops/get_output_msg_with_topk.cc",
+                "gpu_ops/save_output_msg_with_topk.cc",
+                "gpu_ops/transfer_output.cc",
                 "cpu_ops/rebuild_padding.cc",
                 "cpu_ops/simd_sort.cc",
                 "cpu_ops/set_value_by_flags.cc",
                 "cpu_ops/token_penalty_multi_scores.cc",

View File

@@ -1,34 +0,0 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""setup for FASTDEPLOY base ops"""
from paddle.utils.cpp_extension import CppExtension, setup
setup(
name="fastdeploy_base_ops",
ext_modules=CppExtension(
sources=[
"gpu_ops/save_with_output_msg.cc",
"gpu_ops/get_output.cc",
"gpu_ops/get_output_msg_with_topk.cc",
"gpu_ops/save_output_msg_with_topk.cc",
"gpu_ops/transfer_output.cc",
"cpu_ops/rebuild_padding.cc",
],
extra_compile_args=[
"-DPy_LIMITED_API=0x03090000",
"-DPADDLE_ON_INFERENCE",
],
),
)
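Because this file is removed, the standalone fastdeploy_base_ops extension no longer exists; its sources are compiled into each backend-specific module instead. A hypothetical compatibility shim for downstream code; both import paths here are assumptions, not part of the patch:

# Hypothetical migration shim: prefer the per-backend module, fall back to the
# old standalone module only on builds that predate this commit.
try:
    from fastdeploy_cpu_ops import save_output   # folded in by this commit
except ImportError:
    from fastdeploy_base_ops import save_output  # pre-#3375 standalone build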