Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-12-24 13:28:13 +08:00
[OPs] Universal optimization and Fix early_stop cuda 700 (#3375)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* delete nonzero
* delete setup_ops_base.py
* check if
* check gcp infer_seed.cpu()
* fix repetition_early_stopper_kernel cuda 700
build.sh
@@ -34,7 +34,6 @@ EGG_DIR="fastdeploy.egg-info"
 # custom_ops directory config
 OPS_SRC_DIR="custom_ops"
-OPS_TMP_DIR_BASE="tmp_base"
 OPS_TMP_DIR="tmp"
 
 # command line log config
@@ -71,25 +70,20 @@ function copy_ops(){
     PY_VERSION="py${PY_MAIN_VERSION}.${PY_SUB_VERSION}"
     SYSTEM_VERSION=`${python} -c "import platform; print(platform.system().lower())"`
     PROCESSOR_VERSION=`${python} -c "import platform; print(platform.processor())"`
-    WHEEL_BASE_NAME="fastdeploy_base_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
     WHEEL_NAME="fastdeploy_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
     WHEEL_CPU_NAME="fastdeploy_cpu_ops-${OPS_VERSION}-${PY_VERSION}-${SYSTEM_VERSION}-${PROCESSOR_VERSION}.egg"
     is_rocm=`$python -c "import paddle; print(paddle.is_compiled_with_rocm())"`
     if [ "$is_rocm" = "True" ]; then
         DEVICE_TYPE="rocm"
-        mkdir -p ../fastdeploy/model_executor/ops/base
-        cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
-        echo -e "BASE and ROCM ops have been copy to fastdeploy"
+        echo -e "ROCM ops have been copy to fastdeploy"
         return
     fi
-    mkdir -p ../fastdeploy/model_executor/ops/base
     is_cuda=`$python -c "import paddle; print(paddle.is_compiled_with_cuda())"`
     if [ "$is_cuda" = "True" ]; then
         DEVICE_TYPE="gpu"
-        cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/gpu
-        echo -e "BASE and CUDA ops have been copy to fastdeploy"
+        echo -e "CUDA ops have been copy to fastdeploy"
         return
     fi
 
@@ -112,9 +106,8 @@ function copy_ops(){
     if_corex=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device(\"iluvatar_gpu\"))"`
     if [ "$if_corex" = "True" ]; then
         DEVICE_TYPE="iluvatar-gpu"
-        cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
         cp -r ./${OPS_TMP_DIR}/${WHEEL_NAME}/* ../fastdeploy/model_executor/ops/iluvatar
-        echo -e "BASE and Iluvatar ops have been copy to fastdeploy"
+        echo -e "Iluvatar ops have been copy to fastdeploy"
         return
     fi
 
@@ -137,19 +130,15 @@ function copy_ops(){
     fi
 
     DEVICE_TYPE="cpu"
-    cp -r ./${OPS_TMP_DIR_BASE}/${WHEEL_BASE_NAME}/* ../fastdeploy/model_executor/ops/base
     cd ../../../../
     cp -r ${OPS_TMP_DIR}/${WHEEL_CPU_NAME}/* ../fastdeploy/model_executor/ops/cpu
-    echo -e "BASE and CPU ops have been copy to fastdeploy"
+    echo -e "CPU ops have been copy to fastdeploy"
     return
 }
 
 function build_and_install_ops() {
     cd $OPS_SRC_DIR
     export no_proxy=bcebos.com,paddlepaddle.org.cn,${no_proxy}
-    echo -e "${BLUE}[build]${NONE} build and install fastdeploy_base_ops..."
-    ${python} setup_ops_base.py install --install-lib ${OPS_TMP_DIR_BASE}
-    find ${OPS_TMP_DIR_BASE} -type f -name "*.o" -exec rm -f {} \;
     echo -e "${BLUE}[build]${NONE} build and install fastdeploy_ops..."
     TMP_DIR_REAL_PATH=`readlink -f ${OPS_TMP_DIR}`
     is_xpu=`$python -c "import paddle; print(paddle.is_compiled_with_xpu())"`
@@ -223,7 +212,6 @@ function cleanup() {
     fi
 
     rm -rf $OPS_SRC_DIR/$BUILD_DIR $OPS_SRC_DIR/$EGG_DIR
-    rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR_BASE
    rm -rf $OPS_SRC_DIR/$OPS_TMP_DIR
 }
 
@@ -776,6 +776,22 @@ void MergePrefillDecodeOutput(
     const int head_dim,
     const int max_token);
 
+std::vector<paddle::Tensor> TopPSamplingReject(const paddle::Tensor &probs,
+                                               const paddle::Tensor &top_p,
+                                               const paddle::optional<paddle::Tensor> &top_k,
+                                               int64_t seed);
+
+std::vector<paddle::Tensor> TopKRenorm(const paddle::Tensor &probs,
+                                       const paddle::Tensor &top_k);
+
+std::vector<paddle::Tensor> MinPSamplingFromProbs(const paddle::Tensor &probs,
+                                                  const paddle::Tensor &min_p);
+
+void SaveOutMmsgStatic(const paddle::Tensor& x,
+                       const paddle::Tensor& not_need_stop,
+                       int64_t rank_id,
+                       bool save_each_rank);
+
 PYBIND11_MODULE(fastdeploy_ops, m) {
 
   m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"),
@@ -1128,4 +1144,12 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
   m.def("speculate_step_paddle",&SpeculateStepPaddle, "speculate_step_paddle function");
 
   m.def("merge_prefill_decode_output", &MergePrefillDecodeOutput, "merge_prefill_decode_output function");
+
+  m.def("rejection_top_p_sampling", &TopPSamplingReject, "rejection_top_p_sampling function");
+
+  m.def("top_k_renorm_probs", &TopKRenorm, "top_k_renorm_probs function");
+
+  m.def("min_p_sampling", &MinPSamplingFromProbs, "min_p_sampling function");
+
+  m.def("save_output", &SaveOutMmsgStatic, "save_output function");
 }
@@ -109,11 +109,11 @@ void GetOutputEp(const paddle::Tensor& x,
   return;
 }
 
-void GetOutputStatic(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) {
+void GetOutputEPStatic(const paddle::Tensor& x, int64_t rank_id, bool wait_flag) {
   GetOutputEp(x, rank_id, wait_flag, 1);
 }
 
-void GetOutputDynamic(const paddle::Tensor& x,
+void GetOutputEPDynamic(const paddle::Tensor& x,
                       int64_t rank_id,
                       bool wait_flag,
                       int msg_queue_id) {
@@ -125,11 +125,11 @@ PD_BUILD_STATIC_OP(get_output_ep)
     .Attrs({"rank_id: int64_t", "wait_flag: bool"})
     .Outputs({"x_out"})
     .SetInplaceMap({{"x", "x_out"}})
-    .SetKernelFn(PD_KERNEL(GetOutputStatic));
+    .SetKernelFn(PD_KERNEL(GetOutputEPStatic));
 
 PD_BUILD_STATIC_OP(get_output_ep_dynamic)
     .Inputs({"x"})
     .Attrs({"rank_id: int64_t", "wait_flag: bool", "msg_queue_id: int"})
     .Outputs({"x_out"})
     .SetInplaceMap({{"x", "x_out"}})
-    .SetKernelFn(PD_KERNEL(GetOutputDynamic));
+    .SetKernelFn(PD_KERNEL(GetOutputEPDynamic));
@@ -19,7 +19,7 @@
 std::vector<paddle::Tensor> TopPSamplingReject(const paddle::Tensor &probs,
                                                const paddle::Tensor &top_p,
                                                const paddle::optional<paddle::Tensor> &top_k,
-                                               int seed) {
+                                               int64_t seed) {
   std::vector<int64_t> probs_shape = probs.shape();
   unsigned int batch_size = probs_shape[0];
   unsigned int vocab_size = probs_shape[1];
@@ -82,7 +82,7 @@ TopPSamplingRejectInferDtype(const paddle::DataType &probs_dtype,
 PD_BUILD_STATIC_OP(rejection_top_p_sampling)
     .Inputs({"probs", "top_p", paddle::Optional("top_k")})
     .Outputs({"samples"})
-    .Attrs({"seed: int"})
+    .Attrs({"seed: int64_t"})
     .SetKernelFn(PD_KERNEL(TopPSamplingReject))
     .SetInferShapeFn(PD_INFER_SHAPE(TopPSamplingRejectInferShape))
     .SetInferDtypeFn(PD_INFER_DTYPE(TopPSamplingRejectInferDtype));
@@ -199,6 +199,11 @@ if paddle.is_compiled_with_rocm():
     if not os.listdir(json_dir):
         raise ValueError("Git clone nlohmann_json failed!")
     sources = [
+        "gpu_ops/save_with_output_msg.cc",
+        "gpu_ops/get_output.cc",
+        "gpu_ops/get_output_msg_with_topk.cc",
+        "gpu_ops/save_output_msg_with_topk.cc",
+        "gpu_ops/transfer_output.cc",
         "gpu_ops/set_value_by_flags.cu",
         "gpu_ops/token_penalty_multi_scores.cu",
         "gpu_ops/stop_generation.cu",
@@ -250,6 +255,11 @@ if paddle.is_compiled_with_rocm():
     )
 elif paddle.is_compiled_with_cuda():
     sources = [
+        "gpu_ops/save_with_output_msg.cc",
+        "gpu_ops/get_output.cc",
+        "gpu_ops/get_output_msg_with_topk.cc",
+        "gpu_ops/save_output_msg_with_topk.cc",
+        "gpu_ops/transfer_output.cc",
         "gpu_ops/set_mask_value.cu",
         "gpu_ops/set_value_by_flags.cu",
         "gpu_ops/ngram_mask.cu",
@@ -532,6 +542,11 @@ elif paddle.is_compiled_with_custom_device("iluvatar_gpu"):
             ]
         },
         sources=[
+            "gpu_ops/save_with_output_msg.cc",
+            "gpu_ops/get_output.cc",
+            "gpu_ops/get_output_msg_with_topk.cc",
+            "gpu_ops/save_output_msg_with_topk.cc",
+            "gpu_ops/transfer_output.cc",
             "gpu_ops/get_padding_offset.cu",
             "gpu_ops/set_value_by_flags.cu",
             "gpu_ops/rebuild_padding.cu",
@@ -653,6 +668,12 @@ else:
         name="fastdeploy_cpu_ops",
         ext_modules=CppExtension(
             sources=[
+                "gpu_ops/save_with_output_msg.cc",
+                "gpu_ops/get_output.cc",
+                "gpu_ops/get_output_msg_with_topk.cc",
+                "gpu_ops/save_output_msg_with_topk.cc",
+                "gpu_ops/transfer_output.cc",
+                "cpu_ops/rebuild_padding.cc",
                 "cpu_ops/simd_sort.cc",
                 "cpu_ops/set_value_by_flags.cc",
                 "cpu_ops/token_penalty_multi_scores.cc",
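The hunks above fold the former fastdeploy_base_ops sources into every device-specific build, which is what lets the standalone base package (the file deleted below) go away. A minimal reading of the resulting structure, as a sketch with a hypothetical shared list; the actual setup script inlines the paths per device rather than sharing a variable:

# Hypothetical sketch: the messaging ops that used to live in
# fastdeploy_base_ops now ride along in each device's source list.
BASE_SOURCES = [
    "gpu_ops/save_with_output_msg.cc",
    "gpu_ops/get_output.cc",
    "gpu_ops/get_output_msg_with_topk.cc",
    "gpu_ops/save_output_msg_with_topk.cc",
    "gpu_ops/transfer_output.cc",
]
cuda_sources = BASE_SOURCES + ["gpu_ops/set_mask_value.cu"]          # CUDA build
cpu_sources = BASE_SOURCES + ["cpu_ops/rebuild_padding.cc",
                              "cpu_ops/simd_sort.cc"]                # CPU build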
setup_ops_base.py (deleted)
@@ -1,34 +0,0 @@
-# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""setup for FASTDEPLOY base ops"""
-
-from paddle.utils.cpp_extension import CppExtension, setup
-
-setup(
-    name="fastdeploy_base_ops",
-    ext_modules=CppExtension(
-        sources=[
-            "gpu_ops/save_with_output_msg.cc",
-            "gpu_ops/get_output.cc",
-            "gpu_ops/get_output_msg_with_topk.cc",
-            "gpu_ops/save_output_msg_with_topk.cc",
-            "gpu_ops/transfer_output.cc",
-            "cpu_ops/rebuild_padding.cc",
-        ],
-        extra_compile_args=[
-            "-DPy_LIMITED_API=0x03090000",
-            "-DPADDLE_ON_INFERENCE",
-        ],
-    ),
-)
@@ -90,10 +90,10 @@ class RepetitionEarlyStopper(EarlyStopper):
         )
 
         B, W = self.trunc_scores.shape
-        V = probs.shape[1]
+        real_bsz, V = probs.shape
         BLOCK_W = triton.next_power_of_2(W)
 
-        grid = (B,)
+        grid = (real_bsz,)
         repetition_early_stopper_kernel[grid](
             self.trunc_scores,
             probs,
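This hunk is the cuda 700 fix from the commit title. trunc_scores is preallocated for the maximum batch B, but probs only has rows for the requests running this step, so launching one Triton program per preallocated row sent programs with a row index at or past real_bsz out of bounds in probs, which surfaces as an illegal memory access (CUDA error 700). A minimal NumPy stand-in for the indexing argument, with made-up sizes:

# Sketch only: illustrates why the launch grid must track the live batch
# size rather than the preallocated one (sizes here are hypothetical).
import numpy as np

B, W, V = 8, 4, 1024       # preallocated max batch, window width, vocab size
real_bsz = 3               # requests actually running this step

trunc_scores = np.zeros((B, W), dtype=np.float32)        # preallocated [B, W]
probs = np.random.rand(real_bsz, V).astype(np.float32)   # live rows only

# Old launch: grid = (B,) means programs 3..7 would read probs[row] with
# row >= real_bsz, i.e. out of bounds (CUDA error 700 on the GPU).
# New launch: grid = (real_bsz,) keeps every program on a valid row.
for row in range(real_bsz):
    assert row < probs.shape[0]  # every launched program maps to a live row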
@@ -42,7 +42,9 @@ class SamplingMetadata:
 
     top_p: paddle.Tensor
     top_k: Optional[paddle.Tensor] = None
+    top_k_list: Optional[list] = None
     min_p: Optional[paddle.Tensor] = None
+    min_p_list: Optional[list] = None
     seed: Optional[paddle.Tensor] = None
     max_num_logprobs: Optional[int] = None
     enable_early_stop: Optional[int] = False
@@ -29,6 +29,7 @@ def top_k_top_p_sampling(
     x: paddle.Tensor,
     top_p: paddle.Tensor,
     top_k: Optional[paddle.Tensor] = None,
+    top_k_list: Optional[list] = None,
     threshold: Optional[paddle.Tensor] = None,
     topp_seed: Optional[paddle.Tensor] = None,
     seed: int = -1,
@@ -64,7 +65,7 @@ def top_k_top_p_sampling(
     if top_p_class == "air":
         _, ids = air_top_p_sampling(x, top_p, threshold, topp_seed, seed=seed, k=k, mode=mode)
     elif top_p_class == "rejection":
-        ids = rejection_top_p_sampling(x, top_p, top_k, seed, order)
+        ids = rejection_top_p_sampling(x, top_p, top_k, top_k_list, seed, order)
         _ = None
     elif top_p_class == "base_non_truncated":
         _, ids = paddle.tensor.top_p_sampling(
@@ -121,6 +122,7 @@ def rejection_top_p_sampling(
     x: paddle.Tensor,
     top_p: paddle.Tensor,
     top_k: paddle.Tensor,
+    top_k_list: list,
     seed: int = -1,
     order: Literal["top_k_first", "joint"] = "top_k_first",
 ) -> paddle.Tensor:
@@ -139,7 +141,7 @@ def rejection_top_p_sampling(
         top_k_renorm_probs,
     )
 
-    if paddle.count_nonzero(top_k) == 0:
+    if not any(x > 0 for x in top_k_list):
         ids = rejection_top_p_sampling(
             x,
             top_p,
@@ -170,11 +172,12 @@ def rejection_top_p_sampling(
 def min_p_sampling(
     probs: paddle.tensor,
     min_p_arr: Optional[paddle.Tensor],
+    min_p_arr_cpu: Optional[list],
 ) -> tuple[paddle.Tensor, paddle.Tensor]:
     """
     min_p_sampling
     """
-    if paddle.count_nonzero(min_p_arr) == 0:
+    if not any(x > 0 for x in min_p_arr_cpu):
         return probs
     else:
         if current_platform.is_cuda():
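Both count_nonzero rewrites above serve the "delete nonzero" item in the commit message: paddle.count_nonzero runs on the device and forces a device-to-host synchronization just to decide a Python branch, while the new top_k_list / min_p_list mirrors (populated by the model runners further down) make the same check a pure host operation. A minimal sketch of the pattern, with illustrative names and values:

# Sketch: keep a host-side mirror of per-request sampling knobs so control
# flow never reads back from the GPU. Sizes and values are hypothetical.
import paddle

max_num_seqs = 4
top_k = paddle.full([max_num_seqs, 1], 0, dtype="int64")  # device tensor for kernels
top_k_list = [0] * max_num_seqs                           # host-side mirror

# When a request arrives, update both views (idx and value illustrative):
idx, requested_k = 1, 20
top_k[idx : idx + 1] = requested_k
top_k_list[idx] = requested_k

# Old: paddle.count_nonzero(top_k) == 0 -> kernel launch + D2H sync per step.
# New: a plain Python check, no device traffic at all.
if not any(k > 0 for k in top_k_list):
    print("no request uses top-k; take the top-p-only path")
else:
    print("at least one request uses top-k")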
@@ -281,10 +281,13 @@ class Sampler(nn.Layer):
 
         probs = F.softmax(logits)
 
-        probs = min_p_sampling(probs, sampling_metadata.min_p)
+        probs = min_p_sampling(probs, sampling_metadata.min_p, sampling_metadata.min_p_list)
 
         _, next_tokens = top_k_top_p_sampling(
-            probs, sampling_metadata.top_p, sampling_metadata.top_k, seed=sampling_metadata.seed[0, 0]
+            probs,
+            sampling_metadata.top_p,
+            sampling_metadata.top_k,
+            sampling_metadata.top_k_list,
+            seed=sampling_metadata.seed[0, 0],
         )
 
         logprobs_tensors = (
@@ -19,7 +19,6 @@ from fastdeploy.import_ops import import_custom_ops
 
 PACKAGE = "fastdeploy.model_executor.ops.gpu"
 
-import_custom_ops(PACKAGE, "..base.fastdeploy_base_ops", globals())
 import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())
 
 
@@ -17,7 +17,6 @@ from fastdeploy.import_ops import import_custom_ops
 
 PACKAGE = "fastdeploy.model_executor.ops.iluvatar"
 
-import_custom_ops(PACKAGE, "..base.fastdeploy_base_ops", globals())
 import_custom_ops(PACKAGE, ".fastdeploy_ops", globals())
 
 from .moe_ops import iluvatar_moe_expert_ffn as moe_expert_ffn  # noqa: F401
@@ -94,7 +94,7 @@ class GCUModelRunner(ModelRunnerBase):
             shape=[self.parallel_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
-        )
+        ).cpu()
         self.restore_chunked_prefill_request = dict()
 
         # Initialize attention Backend
@@ -239,7 +239,9 @@ class GCUModelRunner(ModelRunnerBase):
         self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
         self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7)
         self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
+        self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
         self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0)
+        self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0)
 
         self.share_inputs["temperature"][idx : idx + 1] = get_attr_from_request(request, "temperature", 0.95)
         self.share_inputs["penalty_score"][idx : idx + 1] = get_attr_from_request(
@@ -361,7 +363,9 @@ class GCUModelRunner(ModelRunnerBase):
         self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64")
         self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32")
         self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
+        self.share_inputs["top_k_list"] = [0] * max_num_seqs
         self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32")
+        self.share_inputs["min_p_list"] = [0.0] * max_num_seqs
         self.share_inputs["temperature"] = paddle.full(
             [max_num_seqs, 1], self.model_config.temperature, dtype="float32"
         )
@@ -408,7 +412,7 @@ class GCUModelRunner(ModelRunnerBase):
         self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32")
         self.share_inputs["need_block_len"] = paddle.full([1], 0, dtype="int32")
         self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32")
-        self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
+        self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64").cpu()
         self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64")
         self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
         self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
@@ -539,7 +543,9 @@ class GCUModelRunner(ModelRunnerBase):
             temperature=self.share_inputs["temperature"],
             top_p=self.share_inputs["top_p"],
             top_k=self.share_inputs["top_k"],
+            top_k_list=self.share_inputs["top_k_list"],
             min_p=self.share_inputs["min_p"],
+            min_p_list=self.share_inputs["min_p_list"],
             seed=self.share_inputs["infer_seed"],
             step_idx=self.share_inputs["step_idx"],
             pre_token_ids=self.share_inputs["pre_ids"],
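The two .cpu() calls in this runner (and their twins in the GPU and XPU runners below) correspond to the infer_seed item in the commit message: the sampler only ever reads seed[0, 0] as a scalar, so allocating infer_seed on the host up front turns what was a per-step GPU-to-CPU copy into a plain memory read. A small sketch of the placement, assuming only that Paddle is installed:

# Sketch: allocate the seed tensor once on the host so reading seed[0, 0]
# each step never synchronizes the device. max_num_seqs is illustrative.
import paddle

max_num_seqs = 4
infer_seed = paddle.full([max_num_seqs, 1], 0, dtype="int64").cpu()

seed_scalar = infer_seed[0, 0]  # host read, no device synchronization
print(infer_seed.place)         # Place(cpu)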
@@ -138,7 +138,7 @@ class GPUModelRunner(ModelRunnerBase):
             shape=[self.parallel_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
-        )
+        ).cpu()
 
         self.restore_chunked_prefill_request = dict()
 
@@ -315,6 +315,10 @@ class GPUModelRunner(ModelRunnerBase):
             self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
 
             self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7)
+            self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
+            self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
+            self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0)
+            self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0)
             self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95)
             self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0)
             self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0)
@@ -478,7 +482,9 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
         self.share_inputs["top_p"][idx : idx + 1] = get_attr_from_request(request, "top_p", 0.7)
         self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
+        self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
         self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0)
+        self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0)
 
         self.share_inputs["temperature"][idx : idx + 1] = get_attr_from_request(request, "temperature", 0.95)
         self.share_inputs["penalty_score"][idx : idx + 1] = get_attr_from_request(
@@ -612,7 +618,9 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64")
         self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32")
         self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
+        self.share_inputs["top_k_list"] = [0] * max_num_seqs
         self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32")
+        self.share_inputs["min_p_list"] = [0.0] * max_num_seqs
         self.share_inputs["temperature"] = paddle.full(
             [max_num_seqs, 1], self.model_config.temperature, dtype="float32"
         )
@@ -661,7 +669,7 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["need_block_list"] = paddle.full([max_num_seqs], -1, dtype="int32")
         self.share_inputs["need_block_len"] = paddle.full([1], 0, dtype="int32")
         self.share_inputs["used_list_len"] = paddle.full([max_num_seqs], 0, dtype="int32")
-        self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
+        self.share_inputs["infer_seed"] = paddle.full([max_num_seqs, 1], 0, dtype="int64").cpu()
         self.share_inputs["first_token_ids"] = paddle.full([max_num_seqs, 1], -1, dtype="int64")
         self.share_inputs["ori_seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
         self.share_inputs["system_lens"] = paddle.full([max_num_seqs, 1], 0, dtype="int32")
@@ -830,7 +838,9 @@ class GPUModelRunner(ModelRunnerBase):
             temperature=self.share_inputs["temperature"],
             top_p=self.share_inputs["top_p"],
             top_k=self.share_inputs["top_k"],
+            top_k_list=self.share_inputs["top_k_list"],
             min_p=self.share_inputs["min_p"],
+            min_p_list=self.share_inputs["min_p_list"],
             seed=self.share_inputs["infer_seed"],
             step_idx=self.share_inputs["step_idx"],
             pre_token_ids=self.share_inputs["pre_ids"],
@@ -361,7 +361,7 @@ class XPUModelRunner(ModelRunnerBase):
             shape=[self.parallel_config.max_num_seqs, 1],
             fill_value=4,
             dtype="int64",
-        )
+        ).cpu()
 
         # Initialize attention Backend
         # Note(gonshaotian): Currently, all attention layers share one attention backend instance.
@@ -435,6 +435,10 @@ class XPUModelRunner(ModelRunnerBase):
             self.share_inputs["eos_token_id"][:] = np.array(request.eos_token_ids, dtype="int64").reshape(-1, 1)
 
             self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7)
+            self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
+            self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
+            self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0)
+            self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0)
             self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95)
             self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0)
             self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0)
@@ -476,7 +480,9 @@ class XPUModelRunner(ModelRunnerBase):
         self.share_inputs["pre_ids"][idx : idx + 1] = -1
         self.share_inputs["top_p"][idx : idx + 1] = request.get("top_p", 0.7)
         self.share_inputs["top_k"][idx : idx + 1] = request.get("top_k", 0)
+        self.share_inputs["top_k_list"][idx] = request.get("top_k", 0)
         self.share_inputs["min_p"][idx : idx + 1] = request.get("min_p", 0.0)
+        self.share_inputs["min_p_list"][idx] = request.get("min_p", 0.0)
         self.share_inputs["temperature"][idx : idx + 1] = request.get("temperature", 0.95)
         self.share_inputs["penalty_score"][idx : idx + 1] = request.get("repetition_penalty", 1.0)
         self.share_inputs["frequency_score"][idx : idx + 1] = request.get("frequency_penalty", 0.0)
@@ -547,7 +553,9 @@ class XPUModelRunner(ModelRunnerBase):
         self.share_inputs["eos_token_id"] = paddle.full([self.model_config.eos_tokens_lens, 1], 0, dtype="int64")
         self.share_inputs["top_p"] = paddle.full([max_num_seqs, 1], self.model_config.top_p, dtype="float32")
         self.share_inputs["top_k"] = paddle.full([max_num_seqs, 1], 0, dtype="int64")
+        self.share_inputs["top_k_list"] = [0] * max_num_seqs
         self.share_inputs["min_p"] = paddle.full([max_num_seqs, 1], 0.0, dtype="float32")
+        self.share_inputs["min_p_list"] = [0.0] * max_num_seqs
         self.share_inputs["temperature"] = paddle.full(
             [max_num_seqs, 1], self.model_config.temperature, dtype="float32"
         )
@@ -674,7 +682,9 @@ class XPUModelRunner(ModelRunnerBase):
             temperature=self.share_inputs["temperature"],
             top_p=self.share_inputs["top_p"],
             top_k=self.share_inputs["top_k"],
+            top_k_list=self.share_inputs["top_k_list"],
             min_p=self.share_inputs["min_p"],
+            min_p_list=self.share_inputs["min_p_list"],
             seed=self.share_inputs["infer_seed"],
             step_idx=self.share_inputs["step_idx"],
             pre_token_ids=self.share_inputs["pre_ids"],