FastDeploy/custom_ops/gpu_ops/moe/ep_moe_prefill_func.cu
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Ignore CUTLASS warnings about type punning
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wunused-function"
#pragma once
#include "moe/fused_moe_helper.h"
#include "moe/fused_moe_op.h"
#pragma GCC diagnostic pop
#include "helper.h"
#include <cooperative_groups.h>
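// Dispatch a runtime num_experts_per_rank value to a compile-time constant
// NUM_EXPERTS_PER_RANK so the kernels below can size shared-memory arrays and
// unroll loops; unsupported values raise std::invalid_argument.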
#define DISPATCH_NUM_EXPERTS_PER_RANK(num_experts_per_rank, NUM_EXPERTS_PER_RANK, ...) \
switch (num_experts_per_rank) { \
case 2: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 2; \
__VA_ARGS__ \
break; \
} \
case 6: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 6; \
__VA_ARGS__ \
break; \
} \
case 8: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 8; \
__VA_ARGS__ \
break; \
} \
case 9: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 9; \
__VA_ARGS__ \
break; \
} \
case 16: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 16; \
__VA_ARGS__ \
break; \
} \
case 32: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 32; \
__VA_ARGS__ \
break; \
} \
case 48: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 48; \
__VA_ARGS__ \
break; \
} \
case 64: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 64; \
__VA_ARGS__ \
break; \
} \
case 128: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 128; \
__VA_ARGS__ \
break; \
} \
case 160: { \
constexpr size_t NUM_EXPERTS_PER_RANK = 160; \
__VA_ARGS__ \
break; \
} \
default: { \
std::ostringstream err_msg; \
err_msg << "Unsupported num_experts_per_rank: " << num_experts_per_rank; \
throw std::invalid_argument(err_msg.str()); \
} \
}
namespace cg = cooperative_groups;
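// Butterfly warp reduction: sums `val` across the 32 lanes of a warp; the result is valid in lane 0.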
template<typename T>
__device__ T warpReduceSum(T val){
for(int lane_mask = 16; lane_mask > 0; lane_mask /=2){
val += __shfl_down_sync(0xffffffff, val, lane_mask);
}
return val;
}
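// Cooperative kernel: counts how many (token, expert) assignments in topk_ids land on each
// expert, the same counts rounded up to 128-row blocks, and the two grand totals.
// Workspace layout: [num_experts counts | num_experts padded counts | total | padded total].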
__global__ void get_expert_token_num(
int64_t* topk_ids,
int *out_workspace, // num_experts * 2 + 2
const int token_num,
const int moe_topk,
const int num_experts
) {
cg::grid_group grid = cg::this_grid();
constexpr int KNWARPS = 512 / 32;
__shared__ int warp_sum[KNWARPS * 2];
int *expert_token_num = out_workspace;
int *expert_token_num_padded = out_workspace + num_experts;
int *token_num_all = out_workspace + num_experts * 2;
int *token_num_all_padded = out_workspace + num_experts * 2 + 1;
const int global_idx = blockIdx.x * blockDim.x + threadIdx.x;
for (int i = global_idx; i < num_experts; i += blockDim.x * gridDim.x) {
expert_token_num[i] = 0;
expert_token_num_padded[i] = 0;
}
grid.sync();
for (int i = global_idx; i < token_num * moe_topk; i += blockDim.x * gridDim.x) {
const int topk_idx = topk_ids[i];
atomicAdd(&expert_token_num[topk_idx], 1);
}
grid.sync();
for (int i = global_idx; i < num_experts; i += blockDim.x * gridDim.x) {
const int token_num_per_expert = expert_token_num[i];
if (token_num_per_expert > 0) {
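// Round the per-expert count up to a multiple of 128; note that counts which are already
// a multiple of 128 still gain a full extra 128-slot block here.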
expert_token_num_padded[i] = 128 - token_num_per_expert % 128 + token_num_per_expert;
}
}
grid.sync();
if (blockIdx.x == 0) {
int token_num_now = 0;
int token_num_padded = 0;
if (threadIdx.x < num_experts) {
token_num_now = expert_token_num[threadIdx.x];
token_num_padded = expert_token_num_padded[threadIdx.x];
}
const int laneId = threadIdx.x % 32;
const int warpId = threadIdx.x / 32;
int sum = warpReduceSum<int>(token_num_now);
int sum_padded = warpReduceSum<int>(token_num_padded);
__syncthreads();
if (laneId == 0) {
warp_sum[warpId] = sum;
warp_sum[warpId + KNWARPS] = sum_padded;
}
__syncthreads();
sum = (threadIdx.x < KNWARPS) ? warp_sum[laneId] : 0;
sum_padded = (threadIdx.x < KNWARPS) ? warp_sum[laneId + KNWARPS] : 0;
if (warpId == 0) {
sum = warpReduceSum<int>(sum);
sum_padded = warpReduceSum<int>(sum_padded);
}
if (threadIdx.x == 0) {
*token_num_all = sum;
*token_num_all_padded = sum_padded;
}
}
}
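// Host wrapper: launches get_expert_token_num cooperatively (grid.sync() requires
// cudaLaunchCooperativeKernel), copies the workspace back to the host, and splits it into
// {per-expert counts, per-expert padded counts, {total, padded total}}.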
std::vector<std::vector<int>> GetExpertTokenNum(
const paddle::Tensor& topk_ids,
const int num_experts) {
const int token_num = topk_ids.dims()[0];
const int moe_topk = topk_ids.dims()[1];
auto out_workspace = GetEmptyTensor({num_experts * 2 + 2}, paddle::DataType::INT32, topk_ids.place());
const int block_size = 512;
const int grid_size = min(132 * 4, div_up(token_num * moe_topk, block_size));
int64_t *topk_ids_ptr = const_cast<int64_t*>(topk_ids.data<int64_t>());
int *out_workspace_ptr = out_workspace.data<int>();
void* kernel_args[] = {
(void*)(&topk_ids_ptr),
(void*)(&out_workspace_ptr),
(void*)&token_num,
(void*)&moe_topk,
(void*)&num_experts
};
cudaLaunchCooperativeKernel((void*)get_expert_token_num, dim3(grid_size), dim3(block_size), kernel_args, 0, topk_ids.stream());
auto out_workspace_host = out_workspace.copy_to(paddle::CPUPlace(), true);
int *out_workspace_host_ptr = out_workspace_host.data<int>();
std::vector<int> expert_token_num(out_workspace_host_ptr, out_workspace_host_ptr + num_experts);
std::vector<int> expert_token_num_padded(out_workspace_host_ptr + num_experts, out_workspace_host_ptr + num_experts * 2);
std::vector<int> token_num_all(out_workspace_host_ptr + num_experts * 2, out_workspace_host_ptr + num_experts * 2 + 2);
return {expert_token_num, expert_token_num_padded, token_num_all};
}
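// Combine step: for each original token, gather its rows from the permuted FFN output
// (one row per expert that received the token), scale each row by its routing weight from
// dst_weights, optionally add that expert's down_proj bias, and accumulate the result into
// the un-permuted output row.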
template <typename T>
__global__ void combine_prmt_back_kernel(
const T* expanded_permuted_rows,
T* reduced_unpermuted_output,
const T* bias,
const float* dst_weights,
const int* expanded_source_row_to_expanded_dest_row, // permute_indices_per_token
const int* expert_for_source_row, // dst_idx
const int64_t cols,
const int64_t k,
const int64_t compute_bias,
const bool norm_topk_prob,
const float routed_scaling_factor,
const int64_t num_rows) {
static constexpr int VEC_SIZE = sizeof(int4) / sizeof(T);
AlignedVector<T, VEC_SIZE> load_vec;
AlignedVector<T, VEC_SIZE> bias_vec;
AlignedVector<T, VEC_SIZE> res_vec;
const int cols_int4 = cols / VEC_SIZE;
for (int original_row = blockIdx.x; original_row < num_rows; original_row += gridDim.x) {
T* reduced_row_ptr = reduced_unpermuted_output + original_row * cols;
for (int tid = threadIdx.x; tid < cols_int4; tid += blockDim.x) {
#pragma unroll
for (int vid = 0; vid < VEC_SIZE; vid++) {
res_vec[vid] = 0;
}
for (int k_idx = 0; k_idx < k; ++k_idx) { // k is num_experts_per_rank
const int expanded_original_row = original_row + k_idx * num_rows;
const int expanded_permuted_row =
expanded_source_row_to_expanded_dest_row[expanded_original_row];
if (expanded_permuted_row < 0) continue;
const int64_t k_offset = original_row * k + k_idx;
const float row_scale = dst_weights[expanded_permuted_row];
const T* expanded_permuted_rows_row_ptr =
expanded_permuted_rows + expanded_permuted_row * cols; // row of the permuted FFN output for this (token, expert) pair
Load<T, VEC_SIZE>(expanded_permuted_rows_row_ptr + tid * VEC_SIZE, &load_vec);
const int expert_idx = expert_for_source_row[k_offset]; // expert assigned to this (token, k) slot
const T* bias_ptr = bias ? bias + expert_idx * cols : nullptr; // down_proj bias of that expert
if (bias_ptr) {
Load<T, VEC_SIZE>(bias_ptr + tid * VEC_SIZE, &bias_vec);
#pragma unroll
for (int vid = 0; vid < VEC_SIZE; vid++) {
res_vec[vid] += static_cast<T>(
row_scale * static_cast<float>(load_vec[vid]) +
static_cast<float>(bias_vec[vid]));
}
} else {
#pragma unroll
for (int vid = 0; vid < VEC_SIZE; vid++) {
res_vec[vid] += static_cast<T>(
row_scale * static_cast<float>(load_vec[vid]));
}
}
}
Store<T, VEC_SIZE>(res_vec, reduced_row_ptr + tid * VEC_SIZE);
}
}
}
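// Typed launcher for combine_prmt_back_kernel. Note that the `k` passed to the kernel is the
// number of local experts (top_k_indices.dims()[1]), not the routing top-k.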
template <paddle::DataType T>
void MoeCombineKernel(const paddle::Tensor& ffn_out,
const paddle::Tensor& expert_scales_float,
const paddle::Tensor& permute_indices_per_token,
const paddle::Tensor& top_k_indices,
const paddle::optional<paddle::Tensor>& down_proj_bias,
const bool norm_topk_prob,
const float routed_scaling_factor,
const int num_rows,
const int hidden_size,
paddle::Tensor* output) {
using namespace phi;
typedef PDTraits<T> traits_;
typedef typename traits_::DataType DataType_;
typedef typename traits_::data_t data_t;
auto stream = ffn_out.stream();
const int threads = 1024;
const int gridx = min(132 * 8, num_rows);
const int num_experts_per_rank = top_k_indices.dims()[1];
combine_prmt_back_kernel<<<gridx, threads, 0, stream>>>(
ffn_out.data<data_t>(),
output->data<data_t>(),
down_proj_bias ? down_proj_bias->data<data_t>() : nullptr,
expert_scales_float.data<float>(),
permute_indices_per_token.data<int32_t>(),
top_k_indices.data<int>(),
hidden_size,
num_experts_per_rank,
static_cast<int>(1), // compute bias
norm_topk_prob,
routed_scaling_factor,
num_rows);
}
std::vector<paddle::Tensor> EPMoeExpertCombine(
const paddle::Tensor& ffn_out,
const paddle::Tensor& expert_scales_float, // dst_weights
const paddle::Tensor& permute_indices_per_token, // permute_indices_per_token
const paddle::Tensor& top_k_indices, // dst_indices
const paddle::optional<paddle::Tensor>& down_proj_bias,
const bool norm_topk_prob,
const float routed_scaling_factor) {
const auto input_type = ffn_out.dtype();
auto place = ffn_out.place();
const int num_rows = top_k_indices.dims()[0];
const int hidden_size = ffn_out.dims()[1];
auto output = GetEmptyTensor({num_rows, hidden_size}, input_type, place);
switch (input_type) {
case paddle::DataType::BFLOAT16:
MoeCombineKernel<paddle::DataType::BFLOAT16>(
ffn_out,
expert_scales_float,
permute_indices_per_token,
top_k_indices,
down_proj_bias,
norm_topk_prob,
routed_scaling_factor,
num_rows,
hidden_size,
&output);
break;
case paddle::DataType::FLOAT16:
MoeCombineKernel<paddle::DataType::FLOAT16>(
ffn_out,
expert_scales_float,
permute_indices_per_token,
top_k_indices,
down_proj_bias,
norm_topk_prob,
routed_scaling_factor,
num_rows,
hidden_size,
&output);
break;
default:
PD_THROW("Unsupported data type for MoeDispatchKernel");
}
return {output};
}
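// Dispatch/permute kernel: scatters each token's hidden vector into per-expert contiguous
// chunks (chunk offsets come from the prefix sums of token_nums_per_expert), records the
// destination index and routing weight for every (token, expert) pair, and optionally
// quantizes the activations to int8 (w4a8) or fp8 (w4afp8) using up_gate_proj_in_scale.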
template <typename T, typename OutT, int NUM_EXPERTS_PER_RANK = 8, int Kthread = 512, int RoundType = 1>
__global__ void permute_x_kernel(const T *src_x,
const int64_t *topk_idx,
const float *topk_weights,
const int *token_nums_per_expert,
const float *up_gate_proj_in_scale,
const int moe_topk,
const int num_rows,
const int token_nums_this_rank,
const int hidden_size,
OutT *permute_x, // [token_nums_this_rank, hidden_size]
int *permute_indices_per_token, // [moe_topk, num_rows]
float *dst_weights, // [token_nums_this_rank]
int *dst_indices,
int *cumsum_idx_gpu,
int64_t *token_nums_per_expert_cumsum,
int64_t *expert_idx_per_token, // [num_rows, moe_topk]
float max_bound = 127.0,
float min_bound = -127.0) {
const int src_token_idx = blockIdx.x;
const int tid = threadIdx.x;
constexpr int vec_size = sizeof(int4) / sizeof(T);
__shared__ int write_idx; // cumsum start idx
__shared__ int token_nums_per_expert_cum[NUM_EXPERTS_PER_RANK];
AlignedVector<T, vec_size> src_vec;
AlignedVector<OutT, vec_size> res_vec;
if (tid == 0) {
int sum_now = 0;
for (int i = 0; i < NUM_EXPERTS_PER_RANK; i++) {
sum_now += token_nums_per_expert[i];
token_nums_per_expert_cum[i] = sum_now;
if (blockIdx.x == 0) {
token_nums_per_expert_cumsum[i] = sum_now;
}
}
}
__syncthreads();
const int hidden_size_int4 = hidden_size / vec_size;
for (int s_token_idx = src_token_idx; s_token_idx < num_rows; s_token_idx += gridDim.x) {
const int64_t *topk_idx_now = topk_idx + s_token_idx * moe_topk;
#pragma unroll
for (int expert_idx = 0; expert_idx < moe_topk; expert_idx++) {
int expert_now = static_cast<int>(topk_idx_now[expert_idx]);
if (expert_now == -1) continue;
const int dst_chunk_start_idx = expert_now == 0 ? 0 : token_nums_per_expert_cum[expert_now - 1];
if (tid == 0) {
const int offset_now = atomicAdd(cumsum_idx_gpu + expert_now, 1);
write_idx = offset_now;
}
__syncthreads();
const int token_offset_now = write_idx;
const int dst_token_idx = dst_chunk_start_idx + token_offset_now;
permute_indices_per_token[expert_now * num_rows + s_token_idx] = dst_token_idx;
dst_weights[dst_token_idx] = topk_weights[s_token_idx * moe_topk + expert_idx];
dst_indices[s_token_idx * NUM_EXPERTS_PER_RANK + expert_now] = expert_now;
// cp x
for (int v_id = tid; v_id < hidden_size_int4; v_id += blockDim.x) {
Load<T, vec_size>(&src_x[s_token_idx * hidden_size + v_id * vec_size], &src_vec);
if (up_gate_proj_in_scale) {
for (int i = 0; i < vec_size; i++) {
float quant_value = max_bound * up_gate_proj_in_scale[expert_now] * static_cast<float>(src_vec[i]);
if constexpr (std::is_same<OutT, int8_t>::value) {
// w4aint8
if (RoundType == 0) {
res_vec[i] = static_cast<OutT>(ClipFunc<float>(rint(quant_value), min_bound, max_bound));
} else {
res_vec[i] = static_cast<OutT>(ClipFunc<float>(round(quant_value), min_bound, max_bound));
}
} else {
// w4afp8
float value = ClipFunc<float>(quant_value, min_bound, max_bound);
res_vec[i] = static_cast<OutT>(value);
}
}
} else {
for (int i = 0; i < vec_size; i++) {
res_vec[i] = static_cast<OutT>(src_vec[i]);
}
}
Store<OutT, vec_size>(res_vec, &permute_x[dst_token_idx * hidden_size + v_id * vec_size]);
}
expert_idx_per_token[dst_token_idx] = expert_now;
}
}
}
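// Typed launcher: picks the output element type (int8 for "w4a8", fp8 for "w4afp8",
// otherwise the input dtype) and instantiates permute_x_kernel for the supported
// num_experts_per_rank values (8 and 16); other values fall through without launching.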
template <paddle::DataType T>
void EPMoeDispatchKernel(const paddle::Tensor& input,
const paddle::Tensor& topk_ids,
const paddle::Tensor& topk_weights,
const paddle::Tensor& token_nums_per_expert,
const paddle::optional<paddle::Tensor>& up_gate_proj_in_scale,
const std::string& moe_quant_type,
const int moe_topk,
const int num_rows,
const int token_nums_this_rank,
const int hidden_size,
const int num_experts_per_rank,
paddle::Tensor* permute_input,
paddle::Tensor* permute_indices_per_token,
paddle::Tensor* dst_weights,
paddle::Tensor* dst_indices,
paddle::Tensor* cumsum_idx_gpu,
paddle::Tensor* token_nums_per_expert_cumsum,
paddle::Tensor* expert_idx_per_token) {
using namespace phi;
typedef PDTraits<T> traits_;
typedef typename traits_::DataType DataType_;
typedef typename traits_::data_t data_t;
typedef PDTraits<paddle::DataType::FLOAT8_E4M3FN> traits_fp8;
typedef typename traits_fp8::DataType DataType_fp8;
typedef typename traits_fp8::data_t data_t_fp8;
auto stream = input.stream();
auto place = input.place();
const int gridx = min(132 * 8, num_rows);
if (moe_quant_type == "w4a8") {
if (num_experts_per_rank == 8) {
permute_x_kernel<data_t, int8_t, 8><<<gridx, 512, 0, stream>>>(
input.data<data_t>(),
topk_ids.data<int64_t>(),
topk_weights.data<float>(),
token_nums_per_expert.data<int>(),
up_gate_proj_in_scale ? up_gate_proj_in_scale.get().data<float>() : nullptr,
moe_topk,
num_rows,
token_nums_this_rank,
hidden_size,
permute_input->data<int8_t>(),
permute_indices_per_token->data<int>(),
dst_weights->data<float>(),
dst_indices->data<int>(),
cumsum_idx_gpu->data<int>(),
token_nums_per_expert_cumsum->data<int64_t>(),
expert_idx_per_token->data<int64_t>(),
127.0,
-127.0
);
} else if (num_experts_per_rank == 16) {
permute_x_kernel<data_t, int8_t, 16><<<gridx, 512, 0, stream>>>(
input.data<data_t>(),
topk_ids.data<int64_t>(),
topk_weights.data<float>(),
token_nums_per_expert.data<int>(),
up_gate_proj_in_scale ? up_gate_proj_in_scale.get().data<float>() : nullptr,
moe_topk,
num_rows,
token_nums_this_rank,
hidden_size,
permute_input->data<int8_t>(),
permute_indices_per_token->data<int>(),
dst_weights->data<float>(),
dst_indices->data<int>(),
cumsum_idx_gpu->data<int>(),
token_nums_per_expert_cumsum->data<int64_t>(),
expert_idx_per_token->data<int64_t>(),
127.0,
-127.0
);
}
} else if (moe_quant_type == "w4afp8") {
if (num_experts_per_rank == 8) {
permute_x_kernel<data_t, data_t_fp8, 8, 512><<<gridx, 512, 0, stream>>>(
input.data<data_t>(),
topk_ids.data<int64_t>(),
topk_weights.data<float>(),
token_nums_per_expert.data<int>(),
up_gate_proj_in_scale ? up_gate_proj_in_scale.get().data<float>() : nullptr,
moe_topk,
num_rows,
token_nums_this_rank,
hidden_size,
permute_input->data<data_t_fp8>(),
permute_indices_per_token->data<int>(),
dst_weights->data<float>(),
dst_indices->data<int>(),
cumsum_idx_gpu->data<int>(),
token_nums_per_expert_cumsum->data<int64_t>(),
expert_idx_per_token->data<int64_t>(),
448.0f,
-448.0f
);
} else if (num_experts_per_rank == 16) {
permute_x_kernel<data_t, data_t_fp8, 16, 512><<<gridx, 512, 0, stream>>>(
input.data<data_t>(),
topk_ids.data<int64_t>(),
topk_weights.data<float>(),
token_nums_per_expert.data<int>(),
up_gate_proj_in_scale ? up_gate_proj_in_scale.get().data<float>() : nullptr,
moe_topk,
num_rows,
token_nums_this_rank,
hidden_size,
permute_input->data<data_t_fp8>(),
permute_indices_per_token->data<int>(),
dst_weights->data<float>(),
dst_indices->data<int>(),
cumsum_idx_gpu->data<int>(),
token_nums_per_expert_cumsum->data<int64_t>(),
expert_idx_per_token->data<int64_t>(),
448.0f,
-448.0f
);
}
} else {
if (num_experts_per_rank == 8) {
permute_x_kernel<data_t, data_t, 8><<<gridx, 512, 0, stream>>>(
input.data<data_t>(),
topk_ids.data<int64_t>(),
topk_weights.data<float>(),
token_nums_per_expert.data<int>(),
up_gate_proj_in_scale ? up_gate_proj_in_scale.get().data<float>() : nullptr,
moe_topk,
num_rows,
token_nums_this_rank,
hidden_size,
permute_input->data<data_t>(),
permute_indices_per_token->data<int>(),
dst_weights->data<float>(),
dst_indices->data<int>(),
cumsum_idx_gpu->data<int>(),
token_nums_per_expert_cumsum->data<int64_t>(),
expert_idx_per_token->data<int64_t>(),
127.0,
-127.0
);
} else if (num_experts_per_rank == 16) {
permute_x_kernel<data_t, data_t, 16><<<gridx, 512, 0, stream>>>(
input.data<data_t>(),
topk_ids.data<int64_t>(),
topk_weights.data<float>(),
token_nums_per_expert.data<int>(),
up_gate_proj_in_scale ? up_gate_proj_in_scale.get().data<float>() : nullptr,
moe_topk,
num_rows,
token_nums_this_rank,
hidden_size,
permute_input->data<data_t>(),
permute_indices_per_token->data<int>(),
dst_weights->data<float>(),
dst_indices->data<int>(),
cumsum_idx_gpu->data<int>(),
token_nums_per_expert_cumsum->data<int64_t>(),
expert_idx_per_token->data<int64_t>(),
127.0,
-127.0
);
}
}
}
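// Custom op entry point: allocates the permuted activation buffer and bookkeeping tensors,
// copies the host-side token_nums_per_expert list to the device, and launches the dispatch
// kernel for bf16/fp16 inputs.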
std::vector<paddle::Tensor> EPMoeExpertDispatch(
const paddle::Tensor& input,
const paddle::Tensor& topk_ids,
const paddle::Tensor& topk_weights,
const paddle::optional<paddle::Tensor>& up_gate_proj_in_scale,
const std::vector<int>& token_nums_per_expert,
const int token_nums_this_rank,
const std::string& moe_quant_type) {
const auto input_type = input.dtype();
const int moe_topk = topk_ids.dims()[1];
auto place = input.place();
int token_rows = 0;
auto input_dims = input.dims();
if (input_dims.size() == 3) {
token_rows = input_dims[0] * input_dims[1];
} else {
token_rows = input_dims[0];
}
const int num_rows = token_rows;
const int hidden_size = input.dims()[input_dims.size() - 1];
const int num_experts_per_rank = token_nums_per_expert.size();
auto permute_input = GetEmptyTensor(
{token_nums_this_rank, hidden_size},
moe_quant_type == "w4a8" ? paddle::DataType::INT8 : moe_quant_type == "w4afp8" ? paddle::DataType::FLOAT8_E4M3FN : input_type,
place);
auto num_experts_per_rank_tensor = GetEmptyTensor(
{num_experts_per_rank},
paddle::DataType::INT32,
place);
auto expert_idx_per_token = GetEmptyTensor(
{token_nums_this_rank}, paddle::DataType::INT64, place);
cudaMemcpyAsync(num_experts_per_rank_tensor.data<int>(), token_nums_per_expert.data(), num_experts_per_rank * sizeof(int), cudaMemcpyHostToDevice, input.stream());
// cudaMemcpy(num_experts_per_rank_tensor.data<int>(), token_nums_per_expert.data(), num_experts_per_rank * sizeof(int), cudaMemcpyHostToDevice);
auto token_nums_per_expert_cumsum = GetEmptyTensor({num_experts_per_rank}, paddle::DataType::INT64, place);
auto dst_weights = GetEmptyTensor({token_nums_this_rank}, paddle::DataType::FLOAT32, place);
auto dst_indices = GetEmptyTensor({num_rows, num_experts_per_rank}, paddle::DataType::INT32, place);
auto permute_indices_per_token = paddle::full({num_experts_per_rank, num_rows}, -1, paddle::DataType::INT32, place);
auto cumsum_idx_gpu = paddle::full({num_experts_per_rank}, 0, paddle::DataType::INT32, place);
switch (input_type) {
case paddle::DataType::BFLOAT16:
EPMoeDispatchKernel<paddle::DataType::BFLOAT16>(input,
topk_ids,
topk_weights,
num_experts_per_rank_tensor,
up_gate_proj_in_scale,
moe_quant_type,
moe_topk,
num_rows,
token_nums_this_rank,
hidden_size,
num_experts_per_rank,
&permute_input,
&permute_indices_per_token,
&dst_weights,
&dst_indices,
&cumsum_idx_gpu,
&token_nums_per_expert_cumsum,
&expert_idx_per_token);
break;
case paddle::DataType::FLOAT16:
EPMoeDispatchKernel<paddle::DataType::FLOAT16>(input,
topk_ids,
topk_weights,
num_experts_per_rank_tensor,
up_gate_proj_in_scale,
moe_quant_type,
moe_topk,
num_rows,
token_nums_this_rank,
hidden_size,
num_experts_per_rank,
&permute_input,
&permute_indices_per_token,
&dst_weights,
&dst_indices,
&cumsum_idx_gpu,
&token_nums_per_expert_cumsum,
&expert_idx_per_token);
break;
default:
PD_THROW("Unsupported data type for EPMoeDispatchKernel");
}
return {permute_input,
permute_indices_per_token,
token_nums_per_expert_cumsum,
dst_weights,
dst_indices,
cumsum_idx_gpu,
expert_idx_per_token};
}
std::vector<std::vector<int64_t>> EPMoeExpertDispatchInferShape(
const std::vector<int64_t>& input_shape,
const std::vector<int64_t>& topk_ids_shape,
const std::vector<int64_t>& topk_weights_shape,
const paddle::optional<std::vector<int64_t>>& up_gate_proj_in_scale_shape,
const std::vector<int>& token_nums_per_expert,
const int token_nums_this_rank) {
int token_rows = -1;
int moe_topk = topk_ids_shape[1];
if (input_shape.size() == 3) {
token_rows = input_shape[0] * input_shape[1];
} else {
token_rows = input_shape[0];
}
const int expert_num = token_nums_per_expert.size(); // number of local experts on this rank
const int num_rows = token_rows;
const int hidden_size = input_shape[input_shape.size() - 1];
return {{token_nums_this_rank, hidden_size},
{expert_num, num_rows},
{expert_num},
{token_nums_this_rank},
{num_rows, expert_num},
{expert_num},
{token_nums_this_rank}}; // expert_idx_per_token
}
std::vector<paddle::DataType> EPMoeExpertDispatchInferDtype(
const paddle::DataType& input_dtype,
const paddle::DataType& topk_ids_dtype,
const paddle::DataType& topk_weights_dtype,
const std::vector<int>& token_nums_per_expert,
const int token_nums_this_rank,
const std::string& moe_quant_type) {
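// NOTE: EPMoeExpertDispatch allocates a FLOAT8_E4M3FN permute_input for "w4afp8"
// (see above); only "w4a8" is special-cased in this infer-dtype function.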
return {moe_quant_type == "w4a8" ? paddle::DataType::INT8 : input_dtype,
paddle::DataType::INT32,
paddle::DataType::INT64,
paddle::DataType::FLOAT32,
paddle::DataType::INT32,
paddle::DataType::INT32,
paddle::DataType::INT64};
}
PD_BUILD_STATIC_OP(ep_moe_expert_dispatch)
.Inputs({"input", "topk_ids", "topk_weights",
paddle::Optional("up_gate_proj_in_scale")})
.Outputs({"permute_input",
"permute_indices_per_token",
"token_nums_per_expert_cumsum",
"dst_weights",
"dst_indices",
"cumsum_idx_gpu",
"expert_idx_per_token"})
.Attrs({
"token_nums_per_expert: std::vector<int>",
"token_nums_this_rank: int",
"moe_quant_type: std::string"
})
.SetKernelFn(PD_KERNEL(EPMoeExpertDispatch))
.SetInferShapeFn(PD_INFER_SHAPE(EPMoeExpertDispatchInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(EPMoeExpertDispatchInferDtype));
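// FP8 variant of the dispatch kernel: expert chunks are laid out using the padded counts
// (token_nums_per_expert_padded) so each chunk length is a multiple of 128, the per-128-channel
// quantization scales are permuted alongside the activations, and m_indices records which
// expert owns each non-padding row of the padded buffer (padding rows stay -1).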
template <typename T, int NUM_EXPERTS_PER_RANK = 8>
__global__ void permute_x_fp8_kernel(const T *src_x,
const float *scale,
const int64_t *topk_idx,
const float *topk_weights,
const int *token_nums_per_expert,
const int *token_nums_per_expert_padded,
const int moe_topk,
const int num_rows,
const int token_nums_this_rank,
const int token_nums_this_rank_padded,
const int64_t hidden_size,
T *permute_x, // [token_nums_this_rank, hidden_size]
float *permute_scale,
int *permute_indices_per_token, // [moe_topk, num_rows]
float *dst_weights, // [token_nums_this_rank]
int *dst_indices,
int *cumsum_idx_gpu,
int64_t *token_nums_per_expert_cumsum,
int64_t *token_nums_per_expert_padded_cumsum,
int *m_indices) { // [num_rows, moe_topk]
const int64_t src_token_idx = blockIdx.x;
const int tid = threadIdx.x;
constexpr int vec_size = sizeof(int4) / sizeof(T);
constexpr int scale_vec_size = sizeof(int4) / sizeof(float);
__shared__ int write_idx; // cumsum start idx
__shared__ int token_nums_per_expert_cum[NUM_EXPERTS_PER_RANK];
if (tid == 0) {
int sum_now = 0;
int sum_now_padded = 0;
for (int i = 0; i < NUM_EXPERTS_PER_RANK; i++) {
sum_now += token_nums_per_expert[i];
sum_now_padded += token_nums_per_expert_padded[i];
token_nums_per_expert_cum[i] = sum_now_padded;
if (blockIdx.x == 0) {
token_nums_per_expert_cumsum[i] = sum_now;
token_nums_per_expert_padded_cumsum[i] = sum_now_padded;
}
}
}
__syncthreads();
const int hidden_size_int4 = hidden_size / vec_size;
const int hidden_size_scale = hidden_size / 128;
const int hidden_size_scale_int4 = hidden_size_scale / scale_vec_size;
const int token_nums_feed_to_ffn = token_nums_per_expert_cum[NUM_EXPERTS_PER_RANK-1];
// prmt
for (int64_t s_token_idx = src_token_idx; s_token_idx < token_nums_feed_to_ffn; s_token_idx += gridDim.x) {
// m_indices[s_token_idx] must be a value `i` in [0, NUM_EXPERTS_PER_RANK);
// here we search for that `i` in parallel across threads.
for (int i = threadIdx.x; i < NUM_EXPERTS_PER_RANK; i+= blockDim.x) {
const int start_idx = i == 0 ? 0 : token_nums_per_expert_cum[i - 1];
const int end_idx = token_nums_per_expert_cum[i];
if (s_token_idx >= start_idx && s_token_idx < end_idx) {
if ((s_token_idx - start_idx) < token_nums_per_expert[i]) m_indices[s_token_idx] = i;
break;
}
}
if (s_token_idx < num_rows) {
const int64_t *topk_idx_now = topk_idx + s_token_idx * moe_topk;
#pragma unroll
for (int expert_idx = 0; expert_idx < moe_topk; expert_idx++) {
int expert_now = static_cast<int>(topk_idx_now[expert_idx]);
if (expert_now == -1) continue;
const int dst_chunk_start_idx = expert_now == 0 ? 0 : token_nums_per_expert_cum[expert_now - 1];
if (tid == 0) {
const int offset_now = atomicAdd(cumsum_idx_gpu + expert_now, 1);
write_idx = offset_now;
}
__syncthreads();
const int token_offset_now = write_idx;
const int64_t dst_token_idx = dst_chunk_start_idx + token_offset_now;
permute_indices_per_token[expert_now * num_rows + s_token_idx] = dst_token_idx;
dst_weights[dst_token_idx] = topk_weights[s_token_idx * moe_topk + expert_idx];
// m_indices[dst_token_idx] = expert_now; // not needed? (already filled by the search above)
dst_indices[s_token_idx * NUM_EXPERTS_PER_RANK + expert_now] = expert_now;
// cp x
for (int64_t v_id = tid; v_id < hidden_size_int4; v_id += blockDim.x) {
*(reinterpret_cast<int4*>(permute_x + dst_token_idx * hidden_size) + v_id) = *(reinterpret_cast<const int4*>(src_x + s_token_idx * hidden_size) + v_id);
}
// cp scale
for (int v_id = tid; v_id < hidden_size_scale_int4; v_id += blockDim.x) {
*(reinterpret_cast<int4*>(permute_scale + dst_token_idx * hidden_size_scale) + v_id) = *(reinterpret_cast<const int4*>(scale + s_token_idx * hidden_size_scale) + v_id);
}
}
}
}
}
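// Launcher for the FP8 dispatch kernel, specialized over num_experts_per_rank via
// DISPATCH_NUM_EXPERTS_PER_RANK.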
void EPMoeDispatchFP8Kernel(const paddle::Tensor& input,
const paddle::Tensor& scale,
const paddle::Tensor& topk_ids,
const paddle::Tensor& topk_weights,
const paddle::Tensor& token_nums_per_expert,
const paddle::Tensor& token_nums_per_expert_padded,
const int moe_topk,
const int num_rows,
const int token_nums_this_rank,
const int token_nums_this_rank_padded,
const int hidden_size,
const int num_experts_per_rank,
paddle::Tensor* permute_input,
paddle::Tensor* permute_scale,
paddle::Tensor* permute_indices_per_token,
paddle::Tensor* dst_weights,
paddle::Tensor* dst_indices,
paddle::Tensor* cumsum_idx_gpu,
paddle::Tensor* token_nums_per_expert_cumsum,
paddle::Tensor* token_nums_per_expert_padded_cumsum,
paddle::Tensor* m_indices) {
auto stream = input.stream();
auto place = input.place();
// const int gridx = min(132 * 8, num_rows);
const int gridx = 132 * 8;
DISPATCH_NUM_EXPERTS_PER_RANK(num_experts_per_rank, NUM_EXPERTS_PER_RANK,
permute_x_fp8_kernel<phi::dtype::float8_e4m3fn, NUM_EXPERTS_PER_RANK><<<gridx, 512, 0, stream>>>(
input.data<phi::dtype::float8_e4m3fn>(),
scale.data<float>(),
topk_ids.data<int64_t>(),
topk_weights.data<float>(),
token_nums_per_expert.data<int>(),
token_nums_per_expert_padded.data<int>(),
moe_topk,
num_rows,
token_nums_this_rank,
token_nums_this_rank_padded,
hidden_size,
permute_input->data<phi::dtype::float8_e4m3fn>(),
permute_scale->data<float>(),
permute_indices_per_token->data<int>(),
dst_weights->data<float>(),
dst_indices->data<int>(),
cumsum_idx_gpu->data<int>(),
token_nums_per_expert_cumsum->data<int64_t>(),
token_nums_per_expert_padded_cumsum->data<int64_t>(),
m_indices->data<int>()
);)
}
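// Custom op entry point for the FP8 path. When use_in_ep is false, the output buffer is sized
// for the worst case: every token routed moe_topk times plus up to 127 padding rows per local
// expert.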
std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
const paddle::Tensor& input,
const paddle::Tensor& scale,
const paddle::Tensor& topk_ids,
const paddle::Tensor& topk_weights,
const paddle::Tensor& num_experts_per_rank_tensor,
const paddle::Tensor& num_experts_per_rank_padded_tensor,
const bool use_in_ep,
const int token_nums_this_rank_padded) {
const auto input_type = input.dtype();
const int moe_topk = topk_ids.dims()[1];
auto place = input.place();
int token_rows = 0;
auto input_dims = input.dims();
if (input_dims.size() == 3) {
token_rows = input_dims[0] * input_dims[1];
} else {
token_rows = input_dims[0];
}
const int num_rows = token_rows;
const int hidden_size = input.dims()[input_dims.size() - 1];
const int num_experts_per_rank = num_experts_per_rank_tensor.dims()[0];
int32_t token_nums_feed_to_ffn = use_in_ep ? token_nums_this_rank_padded : token_rows * moe_topk + num_experts_per_rank * (128-1);
auto permute_input = GetEmptyTensor(
{token_nums_feed_to_ffn, hidden_size},
input_type,
place);
auto permute_scale = GetEmptyTensor(
{token_nums_feed_to_ffn, hidden_size / 128},
paddle::DataType::FLOAT32,
place);
auto m_indices = paddle::full({token_nums_feed_to_ffn}, -1, paddle::DataType::INT32, place);
auto token_nums_per_expert_cumsum = GetEmptyTensor({num_experts_per_rank}, paddle::DataType::INT64, place);
auto token_nums_per_expert_padded_cumsum = GetEmptyTensor({num_experts_per_rank}, paddle::DataType::INT64, place);
auto dst_weights = GetEmptyTensor({token_nums_feed_to_ffn}, paddle::DataType::FLOAT32, place);
auto dst_indices = GetEmptyTensor({num_rows, num_experts_per_rank}, paddle::DataType::INT32, place);
auto permute_indices_per_token = paddle::full({num_experts_per_rank, num_rows}, -1, paddle::DataType::INT32, place);
auto cumsum_idx_gpu = paddle::full({num_experts_per_rank}, 0, paddle::DataType::INT32, place);
EPMoeDispatchFP8Kernel(input,
scale,
topk_ids,
topk_weights,
num_experts_per_rank_tensor,
num_experts_per_rank_padded_tensor,
moe_topk,
num_rows,
-1,
-1,
hidden_size,
num_experts_per_rank,
&permute_input,
&permute_scale,
&permute_indices_per_token,
&dst_weights,
&dst_indices,
&cumsum_idx_gpu,
&token_nums_per_expert_cumsum,
&token_nums_per_expert_padded_cumsum,
&m_indices);
return {permute_input,
permute_scale,
permute_indices_per_token,
token_nums_per_expert_cumsum,
token_nums_per_expert_padded_cumsum,
dst_weights,
dst_indices,
cumsum_idx_gpu,
m_indices};
}
PD_BUILD_STATIC_OP(ep_moe_expert_dispatch_fp8)
.Inputs({"input", "scale", "topk_ids", "topk_weights", "num_experts_per_rank_tensor", "num_experts_per_rank_padded_tensor"})
.Outputs({"permute_input",
"permute_scale",
"permute_indices_per_token",
"token_nums_per_expert_cumsum",
"token_nums_per_expert_padded_cumsum",
"dst_weights",
"dst_indices",
"cumsum_idx_gpu",
"m_indices"})
.Attrs({"use_in_ep:bool", "token_nums_this_rank_padded:int"})
.SetKernelFn(PD_KERNEL(EPMoeExpertDispatchFP8));