diff --git a/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cu b/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cu index 8ce591f39..ad501752a 100644 --- a/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cu +++ b/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cu @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include "helper.h" #include "mla_cache_kernel.cuh" template @@ -259,7 +260,7 @@ std::vector DecodeMLAWriteCacheKernel( } -PD_BUILD_OP(prefill_mla_write_cache) +PD_BUILD_STATIC_OP(prefill_mla_write_cache) .Inputs({"kv_nope", "kv_pe", "kv_cache", @@ -274,7 +275,7 @@ PD_BUILD_OP(prefill_mla_write_cache) "max_seq_len: int"}) .SetKernelFn(PD_KERNEL(PrefillMLAWriteCacheKernel)); -PD_BUILD_OP(decode_mla_write_cache) +PD_BUILD_STATIC_OP(decode_mla_write_cache) .Inputs({"kv_nope", "kv_pe", "kv_cache", diff --git a/custom_ops/gpu_ops/fused_rotary_position_encoding.cu b/custom_ops/gpu_ops/fused_rotary_position_encoding.cu index 41670fec8..a0462ba34 100644 --- a/custom_ops/gpu_ops/fused_rotary_position_encoding.cu +++ b/custom_ops/gpu_ops/fused_rotary_position_encoding.cu @@ -15,6 +15,7 @@ #include "helper.h" #include "paddle/extension.h" + template inline __device__ void apply_token_rotary_embedding_kernel( T* __restrict__ arr, @@ -138,7 +139,7 @@ void FusedRotaryPositionEncoding( }); } -PD_BUILD_OP(fused_rotary_position_encoding) +PD_BUILD_STATIC_OP(fused_rotary_position_encoding) .Inputs({"query", "key", "position_ids", "cos_sin_cache"}) .Outputs({"query_out", "key_out"}) .Attrs({"head_size: int", "is_neox: bool"}) diff --git a/custom_ops/gpu_ops/get_position_ids_and_mask_encoder_batch.cu b/custom_ops/gpu_ops/get_position_ids_and_mask_encoder_batch.cu index f58705d9f..9ddc1732e 100644 --- a/custom_ops/gpu_ops/get_position_ids_and_mask_encoder_batch.cu +++ b/custom_ops/gpu_ops/get_position_ids_and_mask_encoder_batch.cu @@ -15,6 +15,7 @@ #include "helper.h" #include "paddle/extension.h" + __global__ void GetPositionIdsAndMaskEncoderBatchKernel( const int* seq_lens_encoder, // [bsz] 每个批次的 encoder 长度 const int* seq_lens_decoder, // [bsz] 每个批次的 decoder 长度 @@ -74,7 +75,7 @@ void GetPositionIdsAndMaskEncoderBatch( bsz); } -PD_BUILD_OP(get_position_ids_and_mask_encoder_batch) +PD_BUILD_STATIC_OP(get_position_ids_and_mask_encoder_batch) .Inputs({"seq_lens_encoder", "seq_lens_decoder", "seq_lens_this_time", diff --git a/custom_ops/gpu_ops/moe/tritonmoe_preprocess.cu b/custom_ops/gpu_ops/moe/tritonmoe_preprocess.cu index 3b031a29d..ee27f566c 100644 --- a/custom_ops/gpu_ops/moe/tritonmoe_preprocess.cu +++ b/custom_ops/gpu_ops/moe/tritonmoe_preprocess.cu @@ -12,9 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "helper.h" #include "paddle/extension.h" - #define CEILDIV(a,b) (((a+b-1)/b)) template @@ -189,7 +189,7 @@ std::vector tritonmoe_preprocess_kernel(const paddle::Tensor& to return {sorted_ids, expert_ids, num_tokens_post_pad}; } -PD_BUILD_OP(tritonmoe_preprocess) +PD_BUILD_STATIC_OP(tritonmoe_preprocess) .Inputs({"topk_ids"}) .Attrs({"num_experts: int64_t", "GEMM_BLOCK_SIZE_M: int64_t"}) .Outputs({"sorted_ids", "expert_ids", "num_tokens_post_pad"}) diff --git a/custom_ops/gpu_ops/multi_head_latent_attention.cu b/custom_ops/gpu_ops/multi_head_latent_attention.cu index d8e8ae718..98a61e838 100644 --- a/custom_ops/gpu_ops/multi_head_latent_attention.cu +++ b/custom_ops/gpu_ops/multi_head_latent_attention.cu @@ -13,6 +13,7 @@ // limitations under the License. #include "append_attn/multi_head_latent_attention_kernel.h" +#include "helper.h" #include "mla_attn/batch_mla_with_paged_kv_cache.h" template @@ -410,7 +411,7 @@ std::vector MultiHeadLatentAttentionInferDtype( } } -PD_BUILD_OP(multi_head_latent_attention) +PD_BUILD_STATIC_OP(multi_head_latent_attention) .Inputs({"query", "key_cache", "value_cache", diff --git a/custom_ops/gpu_ops/noaux_tc.cu b/custom_ops/gpu_ops/noaux_tc.cu index d98b5b4b4..c92822eb9 100644 --- a/custom_ops/gpu_ops/noaux_tc.cu +++ b/custom_ops/gpu_ops/noaux_tc.cu @@ -18,6 +18,7 @@ #include #include +#include "helper.h" #include "noauxtc_kernel.h" std::vector NoauxTc(paddle::Tensor& scores, @@ -60,7 +61,7 @@ std::vector> NoauxTcInferShape( return {scores_shape}; } -PD_BUILD_OP(noaux_tc) +PD_BUILD_STATIC_OP(noaux_tc) .Inputs({"scores", "scores_with_bias"}) .Outputs({"output_tensor"}) .Attrs({"n_group: int",