mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Feat] ernie4_5_vl_moe support CudaGraph (#3226)
* delete dynamic control flow for decode * code-style * fix scatter/gather typos and use input stream instead of default stream * support 0-Size Tensor * update runner and model * using static mem address as input * fix mem leak * refine code * update mm_buffer * fix typo * fix buffersize * fix unk token * refine code * refine * support other arch * open cudagraph in vlci * fix * update * update * update * fix cmd * update --------- Co-authored-by: aquagull <hongyuh@qq.com> Co-authored-by: Yuanle Liu <yuanlehome@163.com>
This commit is contained in:
@@ -36,6 +36,9 @@ void MoeDispatchKernel(
|
||||
paddle::Tensor *topk_idx, paddle::Tensor *expert_idx_per_token) {
|
||||
using namespace phi;
|
||||
|
||||
if (num_rows == 0){
|
||||
return;
|
||||
}
|
||||
typedef PDTraits<T> traits_;
|
||||
typedef typename traits_::DataType DataType_;
|
||||
typedef typename traits_::data_t data_t;
|
||||
@@ -185,6 +188,15 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
|
||||
auto expert_idx_per_token =
|
||||
GetEmptyTensor({num_rows * moe_topk}, paddle::DataType::INT32, place);
|
||||
|
||||
if (token_rows == 0){
|
||||
return {permute_input,
|
||||
tokens_expert_prefix_sum,
|
||||
permute_indices_per_token,
|
||||
topk_weight,
|
||||
topk_idx,
|
||||
expert_idx_per_token};
|
||||
}
|
||||
|
||||
switch (input_type) {
|
||||
case paddle::DataType::BFLOAT16:
|
||||
MoeDispatchKernel<paddle::DataType::BFLOAT16>(
|
||||
|
||||
Reference in New Issue
Block a user