diff --git a/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cu b/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cu
index 8ce591f39..ad501752a 100644
--- a/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cu
+++ b/custom_ops/gpu_ops/append_attn/mla_cache_kernel.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 #pragma once
 
+#include "helper.h"
 #include "mla_cache_kernel.cuh"
 
 template <paddle::DataType T>
@@ -259,7 +260,7 @@ std::vector<paddle::Tensor> DecodeMLAWriteCacheKernel(
 }
 
 
-PD_BUILD_OP(prefill_mla_write_cache)
+PD_BUILD_STATIC_OP(prefill_mla_write_cache)
     .Inputs({"kv_nope",
              "kv_pe",
              "kv_cache",
@@ -274,7 +275,7 @@ PD_BUILD_OP(prefill_mla_write_cache)
             "max_seq_len: int"})
     .SetKernelFn(PD_KERNEL(PrefillMLAWriteCacheKernel));
 
-PD_BUILD_OP(decode_mla_write_cache)
+PD_BUILD_STATIC_OP(decode_mla_write_cache)
     .Inputs({"kv_nope",
              "kv_pe",
              "kv_cache",
diff --git a/custom_ops/gpu_ops/fused_rotary_position_encoding.cu b/custom_ops/gpu_ops/fused_rotary_position_encoding.cu
index 41670fec8..a0462ba34 100644
--- a/custom_ops/gpu_ops/fused_rotary_position_encoding.cu
+++ b/custom_ops/gpu_ops/fused_rotary_position_encoding.cu
@@ -15,6 +15,7 @@
 #include "helper.h"
 #include "paddle/extension.h"
 
+
 template <typename T, bool IS_NEOX>
 inline __device__ void apply_token_rotary_embedding_kernel(
     T* __restrict__ arr,
@@ -138,7 +139,7 @@ void FusedRotaryPositionEncoding(
       });
 }
 
-PD_BUILD_OP(fused_rotary_position_encoding)
+PD_BUILD_STATIC_OP(fused_rotary_position_encoding)
     .Inputs({"query", "key", "position_ids", "cos_sin_cache"})
     .Outputs({"query_out", "key_out"})
     .Attrs({"head_size: int", "is_neox: bool"})
diff --git a/custom_ops/gpu_ops/get_position_ids_and_mask_encoder_batch.cu b/custom_ops/gpu_ops/get_position_ids_and_mask_encoder_batch.cu
index f58705d9f..9ddc1732e 100644
--- a/custom_ops/gpu_ops/get_position_ids_and_mask_encoder_batch.cu
+++ b/custom_ops/gpu_ops/get_position_ids_and_mask_encoder_batch.cu
@@ -15,6 +15,7 @@
 #include "helper.h"
 #include "paddle/extension.h"
 
+
 __global__ void GetPositionIdsAndMaskEncoderBatchKernel(
     const int* seq_lens_encoder,  // [bsz] 每个批次的 encoder 长度
     const int* seq_lens_decoder,  // [bsz] 每个批次的 decoder 长度
@@ -74,7 +75,7 @@ void GetPositionIdsAndMaskEncoderBatch(
       bsz);
 }
 
-PD_BUILD_OP(get_position_ids_and_mask_encoder_batch)
+PD_BUILD_STATIC_OP(get_position_ids_and_mask_encoder_batch)
     .Inputs({"seq_lens_encoder",
              "seq_lens_decoder",
              "seq_lens_this_time",
diff --git a/custom_ops/gpu_ops/moe/tritonmoe_preprocess.cu b/custom_ops/gpu_ops/moe/tritonmoe_preprocess.cu
index 3b031a29d..ee27f566c 100644
--- a/custom_ops/gpu_ops/moe/tritonmoe_preprocess.cu
+++ b/custom_ops/gpu_ops/moe/tritonmoe_preprocess.cu
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "helper.h"
 #include "paddle/extension.h"
 
-
 #define CEILDIV(a,b) (((a+b-1)/b))
 
 template <typename scalar_t>
@@ -189,7 +189,7 @@ std::vector<paddle::Tensor> tritonmoe_preprocess_kernel(const paddle::Tensor& to
     return {sorted_ids, expert_ids, num_tokens_post_pad};
 }
 
-PD_BUILD_OP(tritonmoe_preprocess)
+PD_BUILD_STATIC_OP(tritonmoe_preprocess)
     .Inputs({"topk_ids"})
     .Attrs({"num_experts: int64_t", "GEMM_BLOCK_SIZE_M: int64_t"})
     .Outputs({"sorted_ids", "expert_ids", "num_tokens_post_pad"})
diff --git a/custom_ops/gpu_ops/multi_head_latent_attention.cu b/custom_ops/gpu_ops/multi_head_latent_attention.cu
index d8e8ae718..98a61e838 100644
--- a/custom_ops/gpu_ops/multi_head_latent_attention.cu
+++ b/custom_ops/gpu_ops/multi_head_latent_attention.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "append_attn/multi_head_latent_attention_kernel.h"
+#include "helper.h"
 #include "mla_attn/batch_mla_with_paged_kv_cache.h"
 
 template <paddle::DataType D>
@@ -410,7 +411,7 @@ std::vector<paddle::DataType> MultiHeadLatentAttentionInferDtype(
   }
 }
 
-PD_BUILD_OP(multi_head_latent_attention)
+PD_BUILD_STATIC_OP(multi_head_latent_attention)
     .Inputs({"query",
              "key_cache",
              "value_cache",
diff --git a/custom_ops/gpu_ops/noaux_tc.cu b/custom_ops/gpu_ops/noaux_tc.cu
index d98b5b4b4..c92822eb9 100644
--- a/custom_ops/gpu_ops/noaux_tc.cu
+++ b/custom_ops/gpu_ops/noaux_tc.cu
@@ -18,6 +18,7 @@
 #include <algorithm>
 #include <optional>
 
+#include "helper.h"
 #include "noauxtc_kernel.h"
 
 std::vector<paddle::Tensor> NoauxTc(paddle::Tensor& scores,
@@ -60,7 +61,7 @@ std::vector<std::vector<int64_t>> NoauxTcInferShape(
   return {scores_shape};
 }
 
-PD_BUILD_OP(noaux_tc)
+PD_BUILD_STATIC_OP(noaux_tc)
     .Inputs({"scores", "scores_with_bias"})
     .Outputs({"output_tensor"})
     .Attrs({"n_group: int",