diff --git a/custom_ops/gpu_ops/moe/moe_fast_hardamard_kernel.cu b/custom_ops/gpu_ops/moe/moe_fast_hardamard_kernel.cu
index 02302b0a0..763eb5d10 100644
--- a/custom_ops/gpu_ops/moe/moe_fast_hardamard_kernel.cu
+++ b/custom_ops/gpu_ops/moe/moe_fast_hardamard_kernel.cu
@@ -228,3 +228,22 @@ template void MoeFastHardamardWrapper(
     const int hadamard_block_size,
     int8_t *out,
     cudaStream_t &stream);
+
+template void
+MoeFastHardamardWrapper(
+    const phi::dtype::bfloat16 *x_data,
+    const int64_t *expert_idx_per_token,
+    const int64_t *recv_expert_count,
+    const phi::dtype::bfloat16 *shift,
+    const phi::dtype::bfloat16 *smooth,
+    const float *quant_scales,
+    const int quant_round_type,
+    const float quant_max_bound,
+    const float quant_min_bound,
+    const int64_t token_num,
+    const int64_t dim,
+    const int num_max_tokens_per_expert,
+    bool used_in_ep_low_latency,
+    const int hadamard_block_size,
+    phi::dtype::float8_e4m3fn *out,
+    cudaStream_t &stream);
diff --git a/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py b/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
index 8ad706d81..86268380e 100644
--- a/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
+++ b/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
@@ -174,7 +174,7 @@ for type in dtype:
     template_head_file.write(
         """
 } else { \\
-    PADDLE_THROW(phi::errors::Unimplemented("W4aFp8 not supported m=%d k=%d experts=%d token_padding_size=%d kBlockN=%d groupsize=%d, please add [%d, %d, %d, %d, %d, %d] to the gemm_case array in the custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py file and recompile it\\n", _M, _K, _EXPERTS, _TokenPaddingSize, _kBlockN, _GROUPSIZE, _M, _K, _EXPERTS, _TokenPaddingSize, _kBlockN, _GROUPSIZE)); \\
+    PADDLE_THROW(phi::errors::Unimplemented("W4aFp8 not supported m=%d k=%d experts=%d token_padding_size=%d kBlockN=%d groupsize=%d, please add [%d, %d, %d, %d, %d] to the gemm_case array in the custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py file and recompile it\\n", _M, _K, _EXPERTS, _TokenPaddingSize, _kBlockN, _GROUPSIZE, _M, _K, _EXPERTS, _TokenPaddingSize, _GROUPSIZE)); \\
 } \\
 }"""
     )
diff --git a/fastdeploy/model_executor/layers/moe/ep.py b/fastdeploy/model_executor/layers/moe/ep.py
index e8facc2b6..54bf6ef10 100644
--- a/fastdeploy/model_executor/layers/moe/ep.py
+++ b/fastdeploy/model_executor/layers/moe/ep.py
@@ -295,7 +295,7 @@ class DeepEPEngine:
             use_fp8=use_fp8,
             async_finish=False,
             return_recv_hook=True,
-            num_per_channel=quant_group_size,
+            # num_per_channel=quant_group_size,
         )

         return packed_recv_x, recv_expert_count, handle, dispatch_hook