diff --git a/custom_ops/gpu_ops/sample_kernels/sampling.cuh b/custom_ops/gpu_ops/sample_kernels/sampling.cuh
index 99ccc42bb..f5845bea9 100644
--- a/custom_ops/gpu_ops/sample_kernels/sampling.cuh
+++ b/custom_ops/gpu_ops/sample_kernels/sampling.cuh
@@ -48,14 +48,15 @@ constexpr BlockReduceAlgorithm REDUCE_ALGO = BLOCK_REDUCE_WARP_REDUCTIONS;
 #define SAMPLING_CUB_SUBTRACTLEFT_DEFINED
 #endif
 
-template <typename T> struct Pair {
+template <typename T>
+struct Pair {
   T value;
   int count;
 
-  __device__ Pair operator+(const Pair &other) const {
+  __device__ Pair operator+(const Pair& other) const {
     return {value + other.value, count + other.count};
   }
-  __device__ Pair &operator+=(const Pair &other) {
+  __device__ Pair& operator+=(const Pair& other) {
     value += other.value;
     count += other.count;
     return *this;
@@ -78,22 +79,25 @@ struct ValueCount {
 };
 
 struct BoolDiffOp {
-  __device__ __forceinline__ bool operator()(const bool &lhs,
-                                             const bool &rhs) const {
+  __device__ __forceinline__ bool operator()(const bool& lhs,
+                                             const bool& rhs) const {
     return lhs != rhs;
   }
 };
 
-template <uint32_t BLOCK_THREADS, BlockScanAlgorithm SCAN_ALGORITHM,
+template <uint32_t BLOCK_THREADS,
+          BlockScanAlgorithm SCAN_ALGORITHM,
           BlockReduceAlgorithm REDUCE_ALGORITHM>
 struct SamplingTempStorage {
   union {
     float deterministic_scan[BLOCK_THREADS / 32];
     typename BlockScan<float, BLOCK_THREADS, SCAN_ALGORITHM>::TempStorage scan;
-    typename BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage reduce;
-    typename BlockReduce<int, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage reduce_int;
-    typename BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage
-        reduce_value_count;
+    typename BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage
+        reduce;
+    typename BlockReduce<int, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage
+        reduce_int;
+    typename BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>::
+        TempStorage reduce_value_count;
     typename BlockAdjacentDifference<bool, BLOCK_THREADS>::TempStorage adj_diff;
   } block_prim;
   struct {
@@ -112,14 +116,17 @@ struct SamplingTempStorage {
  * algorithm. \note This implementation is slower than the cub::BlockScan, but
  * it is deterministic.
  */
-template <uint32_t VEC_SIZE, uint32_t BLOCK_THREADS,
+template <uint32_t VEC_SIZE,
+          uint32_t BLOCK_THREADS,
           BlockScanAlgorithm SCAN_ALGORITHM,
-          BlockReduceAlgorithm REDUCE_ALGORITHM, typename T>
-__device__ __forceinline__ void
-DeterministicInclusiveSum(const T *in_data, T *out_data,
-                          SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM,
-                                              REDUCE_ALGORITHM> *temp_storage) {
-  T *smem_prefix_sum = temp_storage->block_prim.deterministic_scan;
+          BlockReduceAlgorithm REDUCE_ALGORITHM,
+          typename T>
+__device__ __forceinline__ void DeterministicInclusiveSum(
+    const T* in_data,
+    T* out_data,
+    SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>*
+        temp_storage) {
+  T* smem_prefix_sum = temp_storage->block_prim.deterministic_scan;
   T thread_data[VEC_SIZE];
   T thread_sum = 0;
 #pragma unroll
@@ -138,8 +145,8 @@ DeterministicInclusiveSum(const T *in_data, T *out_data,
     }
   }
 
-  T warp_sum = __shfl_sync(0xffffffff, thread_exclusive_prefix_sum,
-                           threadIdx.x | 0xffffffff);
+  T warp_sum = __shfl_sync(
+      0xffffffff, thread_exclusive_prefix_sum, threadIdx.x | 0xffffffff);
   if (threadIdx.x % 32 == 31) {
     thread_exclusive_prefix_sum = 0;
   }
@@ -197,12 +204,21 @@ DeterministicInclusiveSum(const T *in_data, T *out_data,
   }
 }
 
-template <uint32_t VEC_SIZE, uint32_t BLOCK_THREADS, BlockScanAlgorithm SCAN_ALGORITHM,
-          BlockReduceAlgorithm REDUCE_ALGORITHM, bool DETERMINISTIC, typename Predicate>
+template <uint32_t VEC_SIZE,
+          uint32_t BLOCK_THREADS,
+          BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM,
+          bool DETERMINISTIC,
+          typename Predicate>
 __device__ __forceinline__ void DeviceSamplingFromProb(
-    uint32_t i, uint32_t d, Predicate pred, float u, vec_t<float, VEC_SIZE> prob_vec,
+    uint32_t i,
+    uint32_t d,
+    Predicate pred,
+    float u,
+    vec_t<float, VEC_SIZE> prob_vec,
     float& aggregate,
-    SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>* temp_storage) {
+    SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>*
+        temp_storage) {
   const uint32_t tx = threadIdx.x;
   float prob_greater_than_threshold[VEC_SIZE];
   float inclusive_cdf[VEC_SIZE];
@@ -212,14 +228,14 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
     prob_greater_than_threshold[j] = pred(prob_vec[j]) ? prob_vec[j] : 0;
     valid[j] = pred(prob_vec[j]) && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d;
   }
-#ifdef PADDLE_WITH_COREX
-  float aggregate_local =
-      BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage->block_prim.reduce)
-          .Sum(prob_greater_than_threshold);
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
+  float aggregate_local = BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                              temp_storage->block_prim.reduce)
+                              .Sum(prob_greater_than_threshold);
 #else
-  float aggregate_local =
-      BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage->block_prim.reduce)
-          .Sum<VEC_SIZE>(prob_greater_than_threshold);
+  float aggregate_local = BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                              temp_storage->block_prim.reduce)
+                              .Sum<VEC_SIZE>(prob_greater_than_threshold);
 #endif
   if (tx == 0) {
     temp_storage->block_aggregate.value = aggregate_local;
@@ -229,14 +245,19 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
 
   if (aggregate + aggregate_local > u) {
     if constexpr (DETERMINISTIC) {
-      DeterministicInclusiveSum<VEC_SIZE, BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>(
+      DeterministicInclusiveSum<VEC_SIZE,
+                                BLOCK_THREADS,
+                                SCAN_ALGORITHM,
+                                REDUCE_ALGORITHM>(
           prob_greater_than_threshold, inclusive_cdf, temp_storage);
     } else {
-#ifdef PADDLE_WITH_COREX
-      BlockScan<float, BLOCK_THREADS, SCAN_ALGORITHM>(temp_storage->block_prim.scan)
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
+      BlockScan<float, BLOCK_THREADS, SCAN_ALGORITHM>(
+          temp_storage->block_prim.scan)
           .InclusiveSum(prob_greater_than_threshold, inclusive_cdf);
 #else
-      BlockScan<float, BLOCK_THREADS, SCAN_ALGORITHM>(temp_storage->block_prim.scan)
+      BlockScan<float, BLOCK_THREADS, SCAN_ALGORITHM>(
+          temp_storage->block_prim.scan)
           .InclusiveSum<VEC_SIZE>(prob_greater_than_threshold, inclusive_cdf);
 #endif
 
@@ -250,28 +271,35 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
 
     bool greater_than_u_diff[VEC_SIZE];
 #ifdef SAMPLING_CUB_SUBTRACTLEFT_DEFINED
-    #ifdef PADDLE_WITH_COREX
-      BlockAdjacentDifference<bool, BLOCK_THREADS>(temp_storage->block_prim.adj_diff)
-          .SubtractLeft(greater_than_u, greater_than_u_diff, BoolDiffOp());
-    #else
-      BlockAdjacentDifference<bool, BLOCK_THREADS>(temp_storage->block_prim.adj_diff)
-          .SubtractLeft<VEC_SIZE>(greater_than_u, greater_than_u_diff, BoolDiffOp());
-    #endif
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
+    BlockAdjacentDifference<bool, BLOCK_THREADS>(
+        temp_storage->block_prim.adj_diff)
+        .SubtractLeft(greater_than_u, greater_than_u_diff, BoolDiffOp());
 #else
-    #ifdef PADDLE_WITH_COREX
-      BlockAdjacentDifference<bool, BLOCK_THREADS>(temp_storage->block_prim.adj_diff)
-          .FlagHeads(greater_than_u_diff, greater_than_u, BoolDiffOp(), 0);
-    #else
-      BlockAdjacentDifference<bool, BLOCK_THREADS>(temp_storage->block_prim.adj_diff)
-          .FlagHeads<VEC_SIZE>(greater_than_u_diff, greater_than_u, BoolDiffOp(), 0);
-    #endif
+    BlockAdjacentDifference<bool, BLOCK_THREADS>(
+        temp_storage->block_prim.adj_diff)
+        .SubtractLeft<VEC_SIZE>(
+            greater_than_u, greater_than_u_diff, BoolDiffOp());
+#endif
+#else
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
+    BlockAdjacentDifference<bool, BLOCK_THREADS>(
+        temp_storage->block_prim.adj_diff)
+        .FlagHeads(greater_than_u_diff, greater_than_u, BoolDiffOp(), 0);
+#else
+    BlockAdjacentDifference<bool, BLOCK_THREADS>(
+        temp_storage->block_prim.adj_diff)
+        .FlagHeads<VEC_SIZE>(
+            greater_than_u_diff, greater_than_u, BoolDiffOp(), 0);
+#endif
 #endif
     __syncthreads();
 
 #pragma unroll
     for (uint32_t j = 0; j < VEC_SIZE; ++j) {
       if (greater_than_u_diff[j]) {
-        atomicMin(&(temp_storage->sampled_id), (i * BLOCK_THREADS + tx) * VEC_SIZE + j);
+        atomicMin(&(temp_storage->sampled_id),
+                  (i * BLOCK_THREADS + tx) * VEC_SIZE + j);
       }
     }
     __syncthreads();
@@ -287,9 +315,9 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
       valid_index[j] = -1;
     }
   }
-  int max_valid_index =
-      BlockReduce<int, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage->block_prim.reduce_int)
-          .Reduce(valid_index, cub::Max());
+  int max_valid_index = BlockReduce<int, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                            temp_storage->block_prim.reduce_int)
+                            .Reduce(valid_index, cub::Max());
   if (tx == 0 && max_valid_index != -1) {
     temp_storage->last_valid_id = max_valid_index;
   }
@@ -297,15 +325,19 @@ __device__ __forceinline__ void DeviceSamplingFromProb(
   aggregate += aggregate_local;
 }
 
-
-
-
-template <uint32_t BLOCK_THREADS, BlockScanAlgorithm SCAN_ALGORITHM,
-          BlockReduceAlgorithm REDUCE_ALGORITHM, uint32_t VEC_SIZE, bool DETERMINISTIC,
-          typename DType, typename IdType>
-__global__ void TopKTopPSamplingFromProbKernel(DType* probs, IdType* output,
-                                               float* top_p_arr, IdType* top_k_arr,
-                                               uint32_t d, uint64_t philox_seed,
+template <uint32_t BLOCK_THREADS,
+          BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM,
+          uint32_t VEC_SIZE,
+          bool DETERMINISTIC,
+          typename DType,
+          typename IdType>
+__global__ void TopKTopPSamplingFromProbKernel(DType* probs,
+                                               IdType* output,
+                                               float* top_p_arr,
+                                               IdType* top_k_arr,
+                                               uint32_t d,
+                                               uint64_t philox_seed,
                                                uint64_t philox_offset) {
   const uint32_t batch_size = gridDim.x;
   const uint32_t bx = blockIdx.x, tx = threadIdx.x;
@@ -315,12 +347,12 @@ __global__ void TopKTopPSamplingFromProbKernel(DType* probs, IdType* output,
   const uint32_t k = top_k_arr[row_idx] == 0 ? d : top_k_arr[row_idx];
   const float p = top_p_arr[row_idx];
 
-  extern __shared__ __align__(
-      alignof(SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>))
+  extern __shared__ __align__(alignof(
+      SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>))
       uint8_t smem_sampling[];
-  auto& temp_storage =
-      reinterpret_cast<SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>&>(
-          smem_sampling);
+  auto& temp_storage = reinterpret_cast<
+      SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>&>(
+      smem_sampling);
 
   vec_t<float, VEC_SIZE> probs_vec;
   float aggregate;
@@ -336,12 +368,22 @@ __global__ void TopKTopPSamplingFromProbKernel(DType* probs, IdType* output,
     for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
       probs_vec.fill(0);
       if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
-        probs_vec.cast_load(probs + row_idx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+        probs_vec.cast_load(probs + row_idx * d +
+                            (i * BLOCK_THREADS + tx) * VEC_SIZE);
       }
 
-      DeviceSamplingFromProb<VEC_SIZE, BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM,
+      DeviceSamplingFromProb<VEC_SIZE,
+                             BLOCK_THREADS,
+                             SCAN_ALGORITHM,
+                             REDUCE_ALGORITHM,
                              DETERMINISTIC>(
-          i, d, [&](float x) { return x > low; }, u, probs_vec, aggregate, &temp_storage);
+          i,
+          d,
+          [&](float x) { return x > low; },
+          u,
+          probs_vec,
+          aggregate,
+          &temp_storage);
       if (aggregate > u) {
         break;
       }
@@ -362,28 +404,29 @@ __global__ void TopKTopPSamplingFromProbKernel(DType* probs, IdType* output,
     for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
       probs_vec.fill(0);
       if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
-        probs_vec.cast_load(probs + row_idx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+        probs_vec.cast_load(probs + row_idx * d +
+                            (i * BLOCK_THREADS + tx) * VEC_SIZE);
       }
 
       ValueCount<float> probs_gt_pivot_0[VEC_SIZE], probs_gt_pivot_1[VEC_SIZE];
 #pragma unroll
       for (uint32_t j = 0; j < VEC_SIZE; ++j) {
-        probs_gt_pivot_0[j] = {
-            (probs_vec[j] > pivot_0) ? probs_vec[j] : 0,
-            (probs_vec[j] > pivot_0 && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
-        probs_gt_pivot_1[j] = {
-            (probs_vec[j] > pivot_1) ? probs_vec[j] : 0,
-            (probs_vec[j] > pivot_1 && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
+        probs_gt_pivot_0[j] = {(probs_vec[j] > pivot_0) ? probs_vec[j] : 0,
+                               (probs_vec[j] > pivot_0 &&
+                                (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
+        probs_gt_pivot_1[j] = {(probs_vec[j] > pivot_1) ? probs_vec[j] : 0,
+                               (probs_vec[j] > pivot_1 &&
+                                (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
       }
 
-#ifdef PADDLE_WITH_COREX
-      aggregate_gt_pivot_0 +=
-          BlockReduce<ValueCount<float>, BLOCK_THREADS>(temp_storage.block_prim.reduce_value_count)
-              .Sum(probs_gt_pivot_0);
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
+      aggregate_gt_pivot_0 += BlockReduce<ValueCount<float>, BLOCK_THREADS>(
+                                  temp_storage.block_prim.reduce_value_count)
+                                  .Sum(probs_gt_pivot_0);
 #else
-      aggregate_gt_pivot_0 +=
-          BlockReduce<ValueCount<float>, BLOCK_THREADS>(temp_storage.block_prim.reduce_value_count)
-              .Sum<VEC_SIZE>(probs_gt_pivot_0);
+      aggregate_gt_pivot_0 += BlockReduce<ValueCount<float>, BLOCK_THREADS>(
+                                  temp_storage.block_prim.reduce_value_count)
+                                  .Sum<VEC_SIZE>(probs_gt_pivot_0);
 #endif
       if (tx == 0) {
         temp_storage.block_aggregate.pair = aggregate_gt_pivot_0;
@@ -391,14 +434,14 @@ __global__ void TopKTopPSamplingFromProbKernel(DType* probs, IdType* output,
       __syncthreads();
       aggregate_gt_pivot_0 = temp_storage.block_aggregate.pair;
 
-#ifdef PADDLE_WITH_COREX
-      aggregate_gt_pivot_1 +=
-          BlockReduce<ValueCount<float>, BLOCK_THREADS>(temp_storage.block_prim.reduce_value_count)
-              .Sum(probs_gt_pivot_1);
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
+      aggregate_gt_pivot_1 += BlockReduce<ValueCount<float>, BLOCK_THREADS>(
+                                  temp_storage.block_prim.reduce_value_count)
+                                  .Sum(probs_gt_pivot_1);
 #else
-      aggregate_gt_pivot_1 +=
-          BlockReduce<ValueCount<float>, BLOCK_THREADS>(temp_storage.block_prim.reduce_value_count)
-              .Sum<VEC_SIZE>(probs_gt_pivot_1);
+      aggregate_gt_pivot_1 += BlockReduce<ValueCount<float>, BLOCK_THREADS>(
+                                  temp_storage.block_prim.reduce_value_count)
+                                  .Sum<VEC_SIZE>(probs_gt_pivot_1);
 #endif
       if (tx == 0) {
         temp_storage.block_aggregate.pair = aggregate_gt_pivot_1;
@@ -427,14 +470,19 @@ __global__ void TopKTopPSamplingFromProbKernel(DType* probs, IdType* output,
   }
 }
 
-
-
-template <uint32_t BLOCK_THREADS, BlockScanAlgorithm SCAN_ALGORITHM,
-          BlockReduceAlgorithm REDUCE_ALGORITHM, uint32_t VEC_SIZE,
-          bool DETERMINISTIC, typename DType, typename IdType>
-__global__ void TopPSamplingFromProbKernel(DType* probs, IdType* output,
-                                           float* top_p_arr, uint32_t d,
-                                           uint64_t philox_seed, uint64_t philox_offset) {
+template <uint32_t BLOCK_THREADS,
+          BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM,
+          uint32_t VEC_SIZE,
+          bool DETERMINISTIC,
+          typename DType,
+          typename IdType>
+__global__ void TopPSamplingFromProbKernel(DType* probs,
+                                           IdType* output,
+                                           float* top_p_arr,
+                                           uint32_t d,
+                                           uint64_t philox_seed,
+                                           uint64_t philox_offset) {
   const uint32_t batch_size = gridDim.x;
   const uint32_t bx = blockIdx.x, tx = threadIdx.x;
   curandStatePhilox4_32_10_t state;
@@ -442,12 +490,12 @@ __global__ void TopPSamplingFromProbKernel(DType* probs, IdType* output,
   const uint32_t row_idx = bx;
   float top_p = top_p_arr[row_idx];
 
-  extern __shared__ __align__(
-      alignof(SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>))
+  extern __shared__ __align__(alignof(
+      SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>))
       uint8_t smem_sampling[];
-  auto& temp_storage =
-      reinterpret_cast<SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>&>(
-          smem_sampling);
+  auto& temp_storage = reinterpret_cast<
+      SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>&>(
+      smem_sampling);
 
   vec_t<float, VEC_SIZE> probs_vec;
   float aggregate;
@@ -463,12 +511,22 @@ __global__ void TopPSamplingFromProbKernel(DType* probs, IdType* output,
     for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
       probs_vec.fill(0);
       if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
-        probs_vec.cast_load(probs + row_idx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+        probs_vec.cast_load(probs + row_idx * d +
+                            (i * BLOCK_THREADS + tx) * VEC_SIZE);
       }
 
-      DeviceSamplingFromProb<VEC_SIZE, BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM,
+      DeviceSamplingFromProb<VEC_SIZE,
+                             BLOCK_THREADS,
+                             SCAN_ALGORITHM,
+                             REDUCE_ALGORITHM,
                              DETERMINISTIC>(
-          i, d, [&](float x) { return x > low; }, u, probs_vec, aggregate, &temp_storage);
+          i,
+          d,
+          [&](float x) { return x > low; },
+          u,
+          probs_vec,
+          aggregate,
+          &temp_storage);
       if (aggregate > u) {
         break;
       }
@@ -489,7 +547,8 @@ __global__ void TopPSamplingFromProbKernel(DType* probs, IdType* output,
     for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
       probs_vec.fill(0);
       if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
-        probs_vec.cast_load(probs + row_idx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+        probs_vec.cast_load(probs + row_idx * d +
+                            (i * BLOCK_THREADS + tx) * VEC_SIZE);
       }
 
       float probs_gt_pivot_0[VEC_SIZE], probs_gt_pivot_1[VEC_SIZE];
@@ -499,12 +558,14 @@ __global__ void TopPSamplingFromProbKernel(DType* probs, IdType* output,
         probs_gt_pivot_1[j] = (probs_vec[j] > pivot_1) ? probs_vec[j] : 0;
       }
 
-#ifdef PADDLE_WITH_COREX
-      aggregate_gt_pivot_0 += BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
-                                  .Sum(probs_gt_pivot_0);
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
+      aggregate_gt_pivot_0 +=
+          BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
+              .Sum(probs_gt_pivot_0);
 #else
-      aggregate_gt_pivot_0 += BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
-                                  .Sum<VEC_SIZE>(probs_gt_pivot_0);
+      aggregate_gt_pivot_0 +=
+          BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
+              .Sum<VEC_SIZE>(probs_gt_pivot_0);
 #endif
       if (tx == 0) {
         temp_storage.block_aggregate.value = aggregate_gt_pivot_0;
@@ -512,12 +573,14 @@ __global__ void TopPSamplingFromProbKernel(DType* probs, IdType* output,
       __syncthreads();
       aggregate_gt_pivot_0 = temp_storage.block_aggregate.value;
 
-#ifdef PADDLE_WITH_COREX
-      aggregate_gt_pivot_1 += BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
-                                  .Sum(probs_gt_pivot_1);
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
+      aggregate_gt_pivot_1 +=
+          BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
+              .Sum(probs_gt_pivot_1);
 #else
-      aggregate_gt_pivot_1 += BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
-                                  .Sum<VEC_SIZE>(probs_gt_pivot_1);
+      aggregate_gt_pivot_1 +=
+          BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce)
+              .Sum<VEC_SIZE>(probs_gt_pivot_1);
 #endif
       if (tx == 0) {
         temp_storage.block_aggregate.value = aggregate_gt_pivot_1;
@@ -546,9 +609,13 @@ __global__ void TopPSamplingFromProbKernel(DType* probs, IdType* output,
   }
 }
 
-template <uint32_t VEC_SIZE, uint32_t BLOCK_THREADS, BlockReduceAlgorithm REDUCE_ALGORITHM,
+template <uint32_t VEC_SIZE,
+          uint32_t BLOCK_THREADS,
+          BlockReduceAlgorithm REDUCE_ALGORITHM,
           typename TempStorage>
-__device__ __forceinline__ float GetMaxValue(float* in_data, uint32_t row_idx, uint32_t d,
+__device__ __forceinline__ float GetMaxValue(float* in_data,
+                                             uint32_t row_idx,
+                                             uint32_t d,
                                              TempStorage& temp_storage) {
   const uint32_t tx = threadIdx.x;
   vec_t<float, VEC_SIZE> in_data_vec;
@@ -557,21 +624,24 @@ __device__ __forceinline__ float GetMaxValue(float* in_data, uint32_t row_idx, u
   for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
     in_data_vec.fill(0);
     if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
-      in_data_vec.cast_load(in_data + row_idx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+      in_data_vec.cast_load(in_data + row_idx * d +
+                            (i * BLOCK_THREADS + tx) * VEC_SIZE);
     }
     float in_data_[VEC_SIZE];
 #pragma unroll
     for (uint32_t j = 0; j < VEC_SIZE; ++j) {
       in_data_[j] = in_data_vec[j];
     }
-#ifdef PADDLE_WITH_COREX
-    max_val = max(
-        max_val, BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
-                     .Reduce(in_data_, cub::Max()));
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
+    max_val = max(max_val,
+                  BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                      temp_storage.block_prim.reduce)
+                      .Reduce(in_data_, cub::Max()));
 #else
-    max_val = max(
-        max_val, BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
-                     .Reduce<VEC_SIZE>(in_data_, cub::Max()));
+    max_val = max(max_val,
+                  BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                      temp_storage.block_prim.reduce)
+                      .Reduce<VEC_SIZE>(in_data_, cub::Max()));
 #endif
     __syncthreads();
   }
@@ -585,10 +655,12 @@ __device__ __forceinline__ float GetMaxValue(float* in_data, uint32_t row_idx, u
 template <uint32_t BLOCK_THREADS, BlockReduceAlgorithm REDUCE_ALGORITHM>
 struct RenormTempStorage {
   union {
-    typename BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage reduce;
-    typename BlockReduce<int, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage reduce_int;
-    typename BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage
-        reduce_value_count;
+    typename BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage
+        reduce;
+    typename BlockReduce<int, BLOCK_THREADS, REDUCE_ALGORITHM>::TempStorage
+        reduce_int;
+    typename BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>::
+        TempStorage reduce_value_count;
   } block_prim;
   struct {
     float max_val;
@@ -607,24 +679,33 @@ struct RenormTempStorage {
   };
 };
 
-template <uint32_t BLOCK_THREADS, BlockScanAlgorithm SCAN_ALGORITHM,
-          BlockReduceAlgorithm REDUCE_ALGORITHM, uint32_t VEC_SIZE, bool DETERMINISTIC,
-          typename DType,typename IdType>
-__global__ void MinPSamplingFromProbKernel(DType* probs, const float* min_p_arr,
-                                            DType* renormed_prob,uint32_t d) {
+template <uint32_t BLOCK_THREADS,
+          BlockScanAlgorithm SCAN_ALGORITHM,
+          BlockReduceAlgorithm REDUCE_ALGORITHM,
+          uint32_t VEC_SIZE,
+          bool DETERMINISTIC,
+          typename DType,
+          typename IdType>
+__global__ void MinPSamplingFromProbKernel(DType* probs,
+                                           const float* min_p_arr,
+                                           DType* renormed_prob,
+                                           uint32_t d) {
   const uint32_t bx = blockIdx.x, tx = threadIdx.x;
   float p = (min_p_arr == nullptr) ? 0 : min_p_arr[bx];
   const uint32_t row_idx = bx;
 
-  extern __shared__ __align__(
-      alignof(SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>))
+  extern __shared__ __align__(alignof(
+      SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>))
       uint8_t smem_sampling[];
-  auto& temp_storage =
-      reinterpret_cast<SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>&>(
-          smem_sampling);
+  auto& temp_storage = reinterpret_cast<
+      SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>&>(
+      smem_sampling);
 
-  float max_val = GetMaxValue<VEC_SIZE, BLOCK_THREADS, REDUCE_ALGORITHM,
-                              SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>>(
+  float max_val = GetMaxValue<
+      VEC_SIZE,
+      BLOCK_THREADS,
+      REDUCE_ALGORITHM,
+      SamplingTempStorage<BLOCK_THREADS, SCAN_ALGORITHM, REDUCE_ALGORITHM>>(
       probs, row_idx, d, temp_storage);
   float pivot = max_val * p;
 
@@ -633,7 +714,8 @@ __global__ void MinPSamplingFromProbKernel(DType* probs, const float* min_p_arr,
   for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
     probs_vec.fill(0);
     if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
-      probs_vec.cast_load(probs + row_idx * d + (i * BLOCK_THREADS + tx) * VEC_SIZE);
+      probs_vec.cast_load(probs + row_idx * d +
+                          (i * BLOCK_THREADS + tx) * VEC_SIZE);
     }
 
 #pragma unroll
@@ -641,42 +723,51 @@ __global__ void MinPSamplingFromProbKernel(DType* probs, const float* min_p_arr,
       probs_vec[j] = (probs_vec[j] >= pivot) ? probs_vec[j] : 0;
     }
     if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
-      probs_vec.store(renormed_prob + row_idx * d + i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
+      probs_vec.store(renormed_prob + row_idx * d +
+                      i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
     }
-
   }
 }
 
-
-template <uint32_t BLOCK_THREADS, BlockReduceAlgorithm REDUCE_ALGORITHM, uint32_t VEC_SIZE,
-          typename DType, typename IdType>
-__global__ void TopKRenormProbKernel(DType* probs, DType* renormed_prob, IdType* top_k_arr, uint32_t d) {
+template <uint32_t BLOCK_THREADS,
+          BlockReduceAlgorithm REDUCE_ALGORITHM,
+          uint32_t VEC_SIZE,
+          typename DType,
+          typename IdType>
+__global__ void TopKRenormProbKernel(DType* probs,
+                                     DType* renormed_prob,
+                                     IdType* top_k_arr,
+                                     uint32_t d) {
   const uint32_t bx = blockIdx.x, tx = threadIdx.x;
   const uint32_t row_idx = bx;
   const uint32_t k = top_k_arr[row_idx] == 0 ? d : top_k_arr[row_idx];
-#ifdef PADDLE_WITH_COREX
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
   double pivot = std::numeric_limits<float>::infinity(), normalizer = 1;
 #else
   double pivot = -cuda::std::numeric_limits<float>::infinity(), normalizer = 1;
 #endif
   vec_t<float, VEC_SIZE> probs_vec;
   if (k < d) {
-    extern __shared__ __align__(alignof(RenormTempStorage<BLOCK_THREADS, REDUCE_ALGO>))
-        uint8_t smem_renorm[];
+    extern __shared__ __align__(alignof(
+        RenormTempStorage<BLOCK_THREADS, REDUCE_ALGO>)) uint8_t smem_renorm[];
     auto& temp_storage =
-        reinterpret_cast<RenormTempStorage<BLOCK_THREADS, REDUCE_ALGO>&>(smem_renorm);
+        reinterpret_cast<RenormTempStorage<BLOCK_THREADS, REDUCE_ALGO>&>(
+            smem_renorm);
     temp_storage.max_val = 0;
 
-    float max_val = GetMaxValue<VEC_SIZE, BLOCK_THREADS, REDUCE_ALGORITHM,
-                                RenormTempStorage<BLOCK_THREADS, REDUCE_ALGORITHM>>(
-        probs, row_idx, d, temp_storage);
+    float max_val =
+        GetMaxValue<VEC_SIZE,
+                    BLOCK_THREADS,
+                    REDUCE_ALGORITHM,
+                    RenormTempStorage<BLOCK_THREADS, REDUCE_ALGORITHM>>(
+            probs, row_idx, d, temp_storage);
 
     double low = 0, high = max_val;
     float min_gt_low, max_le_high;
     float sum_low = 1;
     // f(x) = len(nonzero(probs > x)), f(x) is non-increasing
-    // min_gt_low = min{p \in probs | p > low}, max_le_high = max{p \in probs | p <= high}
-    // loop invariant:
+    // min_gt_low = min{p \in probs | p > low},
+    // max_le_high = max{p \in probs | p <= high}; loop invariant:
     // - f(low) >= k, f(high) < k
     // - f(low) > f(min_gt_low) >= f(max_le_high) == f(high)
     // stopping condition: min_gt_low == max_le_high
@@ -692,55 +783,65 @@ __global__ void TopKRenormProbKernel(DType* probs, DType* renormed_prob, IdType*
       for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
         probs_vec.fill(0);
         if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
-          probs_vec.cast_load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
+          probs_vec.cast_load(probs + row_idx * d +
+                              i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
         }
-        ValueCount<float> probs_gt_pivot_0_pair[VEC_SIZE], probs_gt_pivot_1_pair[VEC_SIZE];
+        ValueCount<float> probs_gt_pivot_0_pair[VEC_SIZE],
+            probs_gt_pivot_1_pair[VEC_SIZE];
 #pragma unroll
         for (uint32_t j = 0; j < VEC_SIZE; ++j) {
           probs_gt_pivot_0_pair[j] = {
               (probs_vec[j] > pivot_0) ? probs_vec[j] : 0,
-              (probs_vec[j] > pivot_0 && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
+              (probs_vec[j] > pivot_0 &&
+               (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
           probs_gt_pivot_1_pair[j] = {
               (probs_vec[j] > pivot_1) ? probs_vec[j] : 0,
-              (probs_vec[j] > pivot_1 && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
+              (probs_vec[j] > pivot_1 &&
+               (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d)};
 
-          if (probs_vec[j] > low && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
+          if (probs_vec[j] > low &&
+              (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
             min_gt_low = min(min_gt_low, probs_vec[j]);
           }
-          if (probs_vec[j] <= high && (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
+          if (probs_vec[j] <= high &&
+              (i * BLOCK_THREADS + tx) * VEC_SIZE + j < d) {
             max_le_high = max(max_le_high, probs_vec[j]);
           }
         }
 
-#ifdef PADDLE_WITH_COREX
-        aggregate_gt_pivot_0 += BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
-                                    temp_storage.block_prim.reduce_value_count)
-                                    .Sum(probs_gt_pivot_0_pair);
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
+        aggregate_gt_pivot_0 +=
+            BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                temp_storage.block_prim.reduce_value_count)
+                .Sum(probs_gt_pivot_0_pair);
 #else
-        aggregate_gt_pivot_0 += BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
-                                    temp_storage.block_prim.reduce_value_count)
-                                    .Sum<VEC_SIZE>(probs_gt_pivot_0_pair);
+        aggregate_gt_pivot_0 +=
+            BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                temp_storage.block_prim.reduce_value_count)
+                .Sum<VEC_SIZE>(probs_gt_pivot_0_pair);
 #endif
         __syncthreads();
 
-#ifdef PADDLE_WITH_COREX
-        aggregate_gt_pivot_1 += BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
-                                    temp_storage.block_prim.reduce_value_count)
-                                    .Sum(probs_gt_pivot_1_pair);
+#if defined(PADDLE_WITH_COREX) || defined(PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU)
+        aggregate_gt_pivot_1 +=
+            BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                temp_storage.block_prim.reduce_value_count)
+                .Sum(probs_gt_pivot_1_pair);
 #else
-        aggregate_gt_pivot_1 += BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
-                                    temp_storage.block_prim.reduce_value_count)
-                                    .Sum<VEC_SIZE>(probs_gt_pivot_1_pair);
+        aggregate_gt_pivot_1 +=
+            BlockReduce<ValueCount<float>, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                temp_storage.block_prim.reduce_value_count)
+                .Sum<VEC_SIZE>(probs_gt_pivot_1_pair);
 #endif
         __syncthreads();
       }
-      min_gt_low =
-          BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
-              .Reduce(min_gt_low, cub::Min());
+      min_gt_low = BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                       temp_storage.block_prim.reduce)
+                       .Reduce(min_gt_low, cub::Min());
       __syncthreads();
-      max_le_high =
-          BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(temp_storage.block_prim.reduce)
-              .Reduce(max_le_high, cub::Max());
+      max_le_high = BlockReduce<float, BLOCK_THREADS, REDUCE_ALGORITHM>(
+                        temp_storage.block_prim.reduce)
+                        .Reduce(max_le_high, cub::Max());
       if (tx == 0) {
         temp_storage.block_aggregate.pairs[0] = aggregate_gt_pivot_0;
         temp_storage.block_aggregate.pairs[1] = aggregate_gt_pivot_1;
@@ -774,23 +875,29 @@ __global__ void TopKRenormProbKernel(DType* probs, DType* renormed_prob, IdType*
   for (uint32_t i = 0; i < ceil_div(d, BLOCK_THREADS * VEC_SIZE); ++i) {
     probs_vec.fill(0);
     if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
-      probs_vec.cast_load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
+      probs_vec.cast_load(probs + row_idx * d + i * BLOCK_THREADS * VEC_SIZE +
+                          tx * VEC_SIZE);
     }
 #pragma unroll
     for (uint32_t j = 0; j < VEC_SIZE; ++j) {
       probs_vec[j] = (probs_vec[j] > pivot) ? probs_vec[j] * normalizer : 0;
     }
     if ((i * BLOCK_THREADS + tx) * VEC_SIZE < d) {
-      probs_vec.store(renormed_prob + row_idx * d + i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
+      probs_vec.store(renormed_prob + row_idx * d +
+                      i * BLOCK_THREADS * VEC_SIZE + tx * VEC_SIZE);
     }
   }
 }
 
 template <typename T, typename IdType>
-cudaError_t TopPSamplingFromProb(T *probs, IdType *output,
-                                 uint32_t batch_size, const T *top_p_val,
-                                 uint32_t d, bool deterministic,
-                                 uint64_t philox_seed, uint64_t philox_offset,
+cudaError_t TopPSamplingFromProb(T* probs,
+                                 IdType* output,
+                                 uint32_t batch_size,
+                                 const T* top_p_val,
+                                 uint32_t d,
+                                 bool deterministic,
+                                 uint64_t philox_seed,
+                                 uint64_t philox_offset,
                                  cudaStream_t stream = 0) {
   constexpr uint32_t BLOCK_THREADS = 1024;
   const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
@@ -799,99 +906,139 @@ cudaError_t TopPSamplingFromProb(T *probs, IdType *output,
       sizeof(SamplingTempStorage<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
   dim3 nblks(batch_size);
   dim3 nthrs(BLOCK_THREADS);
-  void* args[] = {&probs,     &output,       &top_p_val,
-                  &d,         &philox_seed,  &philox_offset};
+  void* args[] = {
+      &probs, &output, &top_p_val, &d, &philox_seed, &philox_offset};
 
   DISPATCH_ALIGNED_VEC_SIZE(
-      vec_size, VEC_SIZE,
+      vec_size,
+      VEC_SIZE,
       {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
-        auto kernel =
-            TopPSamplingFromProbKernel<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO,
-                                       VEC_SIZE, DETERMINISTIC, T, IdType>;
+        auto kernel = TopPSamplingFromProbKernel<BLOCK_THREADS,
+                                                 SCAN_ALGO,
+                                                 REDUCE_ALGO,
+                                                 VEC_SIZE,
+                                                 DETERMINISTIC,
+                                                 T,
+                                                 IdType>;
         CUDA_CALL(cudaFuncSetAttribute(
             kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-        CUDA_CALL(cudaLaunchKernel((void *)kernel, nblks, nthrs, args,
-                                   smem_size, stream));
+        CUDA_CALL(cudaLaunchKernel(
+            (void*)kernel, nblks, nthrs, args, smem_size, stream));
       })});
   return cudaSuccess;
 }
 
-template <typename T,typename IdType>
-cudaError_t MinPSamplingFromProb(T *probs, const T* min_p_arr,T *renormed_prob,
+template <typename T, typename IdType>
+cudaError_t MinPSamplingFromProb(T* probs,
+                                 const T* min_p_arr,
+                                 T* renormed_prob,
                                  uint32_t batch_size,
-                                 uint32_t d, bool deterministic,
-                                 cudaStream_t stream = 0){
+                                 uint32_t d,
+                                 bool deterministic,
+                                 cudaStream_t stream = 0) {
   constexpr uint32_t BLOCK_THREADS = 1024;
   const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
 
-  const uint32_t smem_size = sizeof(SamplingTempStorage<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
+  const uint32_t smem_size =
+      sizeof(SamplingTempStorage<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
   dim3 nblks(batch_size);
   dim3 nthrs(BLOCK_THREADS);
-  void* args[] = {&probs, &min_p_arr,&renormed_prob,&d};
+  void* args[] = {&probs, &min_p_arr, &renormed_prob, &d};
   DISPATCH_ALIGNED_VEC_SIZE(
-      vec_size, VEC_SIZE,
+      vec_size,
+      VEC_SIZE,
       {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
-        auto kernel =
-            MinPSamplingFromProbKernel<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO,
-                                       VEC_SIZE, DETERMINISTIC, T,IdType>;
+        auto kernel = MinPSamplingFromProbKernel<BLOCK_THREADS,
+                                                 SCAN_ALGO,
+                                                 REDUCE_ALGO,
+                                                 VEC_SIZE,
+                                                 DETERMINISTIC,
+                                                 T,
+                                                 IdType>;
         CUDA_CALL(cudaFuncSetAttribute(
             kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-        CUDA_CALL(cudaLaunchKernel((void *)kernel, nblks, nthrs, args,
-                                   smem_size, stream));
+        CUDA_CALL(cudaLaunchKernel(
+            (void*)kernel, nblks, nthrs, args, smem_size, stream));
       })});
   return cudaSuccess;
 }
 
-
 template <typename T, typename IdType>
-cudaError_t TopKTopPSamplingFromProb(T *probs, IdType *output,
-                                     uint32_t batch_size, const T *top_p_val, const IdType *top_k_val,
-                                     uint32_t d, bool deterministic,
-                                     uint64_t philox_seed, uint64_t philox_offset,
+cudaError_t TopKTopPSamplingFromProb(T* probs,
+                                     IdType* output,
+                                     uint32_t batch_size,
+                                     const T* top_p_val,
+                                     const IdType* top_k_val,
+                                     uint32_t d,
+                                     bool deterministic,
+                                     uint64_t philox_seed,
+                                     uint64_t philox_offset,
                                      cudaStream_t stream = 0) {
   const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
 
   auto compute_capacity = GetCudaComputeCapability();
   DISPATCH_COMPUTE_CAP_NUM_THREADS(compute_capacity, BLOCK_THREADS, {
-    const uint32_t smem_size = sizeof(SamplingTempStorage<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
+    const uint32_t smem_size =
+        sizeof(SamplingTempStorage<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO>);
     dim3 nblks(batch_size);
     dim3 nthrs(BLOCK_THREADS);
-    void* args[] = {&probs,     &output,       &top_p_val, &top_k_val,
-                    &d,         &philox_seed,  &philox_offset};
+    void* args[] = {&probs,
+                    &output,
+                    &top_p_val,
+                    &top_k_val,
+                    &d,
+                    &philox_seed,
+                    &philox_offset};
 
     DISPATCH_ALIGNED_VEC_SIZE(
-        vec_size, VEC_SIZE, {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
-          auto kernel = TopKTopPSamplingFromProbKernel<BLOCK_THREADS, SCAN_ALGO, REDUCE_ALGO,
-                                                       VEC_SIZE, DETERMINISTIC, T, IdType>;
-          CUDA_CALL(
-              cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-          CUDA_CALL(
-              cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
+        vec_size,
+        VEC_SIZE,
+        {DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, {
+          auto kernel = TopKTopPSamplingFromProbKernel<BLOCK_THREADS,
+                                                       SCAN_ALGO,
+                                                       REDUCE_ALGO,
+                                                       VEC_SIZE,
+                                                       DETERMINISTIC,
+                                                       T,
+                                                       IdType>;
+          CUDA_CALL(cudaFuncSetAttribute(
+              kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+          CUDA_CALL(cudaLaunchKernel(
+              (void*)kernel, nblks, nthrs, args, smem_size, stream));
         })});
     return cudaSuccess;
   });
 }
 
 template <typename DType, typename IdType>
-cudaError_t TopKRenormProb(DType* probs, DType* renormed_prob, IdType* top_k_arr,
-                           uint32_t batch_size, uint32_t d,
+cudaError_t TopKRenormProb(DType* probs,
+                           DType* renormed_prob,
+                           IdType* top_k_arr,
+                           uint32_t batch_size,
+                           uint32_t d,
                            cudaStream_t stream = 0) {
   const uint32_t vec_size = std::gcd(16 / sizeof(DType), d);
 
   auto compute_capacity = GetCudaComputeCapability();
   DISPATCH_COMPUTE_CAP_NUM_THREADS(compute_capacity, BLOCK_THREADS, {
-    const uint32_t smem_size = sizeof(RenormTempStorage<BLOCK_THREADS, REDUCE_ALGO>);
+    const uint32_t smem_size =
+        sizeof(RenormTempStorage<BLOCK_THREADS, REDUCE_ALGO>);
     dim3 nblks(batch_size);
     dim3 nthrs(BLOCK_THREADS);
     void* args[] = {&probs, &renormed_prob, &top_k_arr, &d};
     DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
-      auto kernel = TopKRenormProbKernel<BLOCK_THREADS, REDUCE_ALGO, VEC_SIZE, DType, IdType>;
-      CUDA_CALL(
-          cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
-      CUDA_CALL(cudaLaunchKernel((void*)kernel, nblks, nthrs, args, smem_size, stream));
+      auto kernel = TopKRenormProbKernel<BLOCK_THREADS,
+                                         REDUCE_ALGO,
+                                         VEC_SIZE,
+                                         DType,
+                                         IdType>;
+      CUDA_CALL(cudaFuncSetAttribute(
+          kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
+      CUDA_CALL(cudaLaunchKernel(
+          (void*)kernel, nblks, nthrs, args, smem_size, stream));
     });
     return cudaSuccess;
   });
 }
 
-} // namespace sampling
+}  // namespace sampling
diff --git a/custom_ops/gpu_ops/sample_kernels/utils.cuh b/custom_ops/gpu_ops/sample_kernels/utils.cuh
index 1de480ab8..3488eb42b 100644
--- a/custom_ops/gpu_ops/sample_kernels/utils.cuh
+++ b/custom_ops/gpu_ops/sample_kernels/utils.cuh
@@ -23,221 +23,235 @@
 #include <cuda_device_runtime_api.h>
 #include <cuda_runtime.h>
 
+#include <curand.h>
+#include <curand_kernel.h>
+#include <curand_philox4x32_x.h>
 #include <cstdint>
 #include <iostream>
 #include <sstream>
 #include <stdexcept>
 #include <vector>
-#include <curand.h>
-#include <curand_kernel.h>
-#include <curand_philox4x32_x.h>
 
 /******************* utils *******************/
 #define STR_HELPER(x) #x
 #define STR(x) STR_HELPER(x)
 
 #ifndef NDEBUG
-#define CUDA_CALL(func, ...)                                                   \
-  {                                                                            \
-    cudaError_t e = (func);                                                    \
-    if (e != cudaSuccess) {                                                    \
-      std::cerr << "CUDA Error: " << cudaGetErrorString(e) << " (" << e        \
-                << ") " << __FILE__ << ": line " << __LINE__                   \
-                << " at function " << STR(func) << std::endl;                  \
-      return e;                                                                \
-    }                                                                          \
+#define CUDA_CALL(func, ...)                                            \
+  {                                                                     \
+    cudaError_t e = (func);                                             \
+    if (e != cudaSuccess) {                                             \
+      std::cerr << "CUDA Error: " << cudaGetErrorString(e) << " (" << e \
+                << ") " << __FILE__ << ": line " << __LINE__            \
+                << " at function " << STR(func) << std::endl;           \
+      return e;                                                         \
+    }                                                                   \
   }
 #else
-#define CUDA_CALL(func, ...)                                                   \
-  {                                                                            \
-    cudaError_t e = (func);                                                    \
-    if (e != cudaSuccess) {                                                    \
-      return e;                                                                \
-    }                                                                          \
+#define CUDA_CALL(func, ...) \
+  {                          \
+    cudaError_t e = (func);  \
+    if (e != cudaSuccess) {  \
+      return e;              \
+    }                        \
   }
 #endif
 
-#define DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, ...)              \
-  if (deterministic) {                                                         \
-    constexpr bool DETERMINISTIC = true;                                       \
-    __VA_ARGS__                                                                \
-  } else {                                                                     \
-    constexpr bool DETERMINISTIC = false;                                      \
-    __VA_ARGS__                                                                \
+#define DISPATCH_DETERMINISTIC(deterministic, DETERMINISTIC, ...) \
+  if (deterministic) {                                            \
+    constexpr bool DETERMINISTIC = true;                          \
+    __VA_ARGS__                                                   \
+  } else {                                                        \
+    constexpr bool DETERMINISTIC = false;                         \
+    __VA_ARGS__                                                   \
   }
 
-#define DISPATCH_ALIGNED_VEC_SIZE(aligned_vec_size, ALIGNED_VEC_SIZE, ...)     \
-  switch (aligned_vec_size) {                                                  \
-  case 16: {                                                                   \
-    constexpr size_t ALIGNED_VEC_SIZE = 16;                                    \
-    __VA_ARGS__                                                                \
-    break;                                                                     \
-  }                                                                            \
-  case 8: {                                                                    \
-    constexpr size_t ALIGNED_VEC_SIZE = 8;                                     \
-    __VA_ARGS__                                                                \
-    break;                                                                     \
-  }                                                                            \
-  case 4: {                                                                    \
-    constexpr size_t ALIGNED_VEC_SIZE = 4;                                     \
-    __VA_ARGS__                                                                \
-    break;                                                                     \
-  }                                                                            \
-  case 2: {                                                                    \
-    constexpr size_t ALIGNED_VEC_SIZE = 2;                                     \
-    __VA_ARGS__                                                                \
-    break;                                                                     \
-  }                                                                            \
-  case 1: {                                                                    \
-    constexpr size_t ALIGNED_VEC_SIZE = 1;                                     \
-    __VA_ARGS__                                                                \
-    break;                                                                     \
-  }                                                                            \
-  default: {                                                                   \
-    std::ostringstream err_msg;                                                \
-    err_msg << "Unsupported aligned_vec_size: " << aligned_vec_size;           \
-    throw std::invalid_argument(err_msg.str());                                \
-  }                                                                            \
+#define DISPATCH_ALIGNED_VEC_SIZE(aligned_vec_size, ALIGNED_VEC_SIZE, ...) \
+  switch (aligned_vec_size) {                                              \
+    case 16: {                                                             \
+      constexpr size_t ALIGNED_VEC_SIZE = 16;                              \
+      __VA_ARGS__                                                          \
+      break;                                                               \
+    }                                                                      \
+    case 8: {                                                              \
+      constexpr size_t ALIGNED_VEC_SIZE = 8;                               \
+      __VA_ARGS__                                                          \
+      break;                                                               \
+    }                                                                      \
+    case 4: {                                                              \
+      constexpr size_t ALIGNED_VEC_SIZE = 4;                               \
+      __VA_ARGS__                                                          \
+      break;                                                               \
+    }                                                                      \
+    case 2: {                                                              \
+      constexpr size_t ALIGNED_VEC_SIZE = 2;                               \
+      __VA_ARGS__                                                          \
+      break;                                                               \
+    }                                                                      \
+    case 1: {                                                              \
+      constexpr size_t ALIGNED_VEC_SIZE = 1;                               \
+      __VA_ARGS__                                                          \
+      break;                                                               \
+    }                                                                      \
+    default: {                                                             \
+      std::ostringstream err_msg;                                          \
+      err_msg << "Unsupported aligned_vec_size: " << aligned_vec_size;     \
+      throw std::invalid_argument(err_msg.str());                          \
+    }                                                                      \
   }
 
 /******************* vec_t<float> *******************/
 #define SAMPLING_INLINE inline __attribute__((always_inline)) __device__
-template <typename float_t, size_t vec_size> struct vec_t {
-  SAMPLING_INLINE float_t &operator[](size_t i);
-  SAMPLING_INLINE const float_t &operator[](size_t i) const;
+template <typename float_t, size_t vec_size>
+struct vec_t {
+  SAMPLING_INLINE float_t& operator[](size_t i);
+  SAMPLING_INLINE const float_t& operator[](size_t i) const;
   SAMPLING_INLINE void fill(float_t val);
-  SAMPLING_INLINE void load(const float_t *ptr);
-  SAMPLING_INLINE void store(float_t *ptr) const;
+  SAMPLING_INLINE void load(const float_t* ptr);
+  SAMPLING_INLINE void store(float_t* ptr) const;
   template <typename T>
-  SAMPLING_INLINE void cast_from(const vec_t<T, vec_size> &src);
-  template <typename T> SAMPLING_INLINE void cast_load(const T *ptr);
-  template <typename T> SAMPLING_INLINE void cast_store(T *ptr) const;
-  SAMPLING_INLINE static void memcpy(float_t *dst, const float_t *src);
-  SAMPLING_INLINE float_t *ptr();
+  SAMPLING_INLINE void cast_from(const vec_t<T, vec_size>& src);
+  template <typename T>
+  SAMPLING_INLINE void cast_load(const T* ptr);
+  template <typename T>
+  SAMPLING_INLINE void cast_store(T* ptr) const;
+  SAMPLING_INLINE static void memcpy(float_t* dst, const float_t* src);
+  SAMPLING_INLINE float_t* ptr();
 };
 
 // float x 1
-template <> struct vec_t<float, 1> {
+template <>
+struct vec_t<float, 1> {
   float data;
 
-  SAMPLING_INLINE float &operator[](size_t i) { return ((float *)(&data))[i]; }
-  SAMPLING_INLINE const float &operator[](size_t i) const {
-    return ((const float *)(&data))[i];
+  SAMPLING_INLINE float& operator[](size_t i) { return ((float*)(&data))[i]; }
+  SAMPLING_INLINE const float& operator[](size_t i) const {
+    return ((const float*)(&data))[i];
   }
-  SAMPLING_INLINE float *ptr() { return reinterpret_cast<float *>(&data); }
+  SAMPLING_INLINE float* ptr() { return reinterpret_cast<float*>(&data); }
   SAMPLING_INLINE void fill(float val);
-  SAMPLING_INLINE void load(const float *ptr);
-  SAMPLING_INLINE void store(float *ptr) const;
-  template <typename T> SAMPLING_INLINE void cast_from(const vec_t<T, 1> &src) {
+  SAMPLING_INLINE void load(const float* ptr);
+  SAMPLING_INLINE void store(float* ptr) const;
+  template <typename T>
+  SAMPLING_INLINE void cast_from(const vec_t<T, 1>& src) {
     cast_from_impl(*this, src);
   }
-  template <typename T> SAMPLING_INLINE void cast_load(const T *ptr) {
+  template <typename T>
+  SAMPLING_INLINE void cast_load(const T* ptr) {
     cast_load_impl(*this, ptr);
   }
-  template <typename T> SAMPLING_INLINE void cast_store(T *ptr) const {
+  template <typename T>
+  SAMPLING_INLINE void cast_store(T* ptr) const {
     cast_store_impl(ptr, *this);
   }
-  SAMPLING_INLINE static void memcpy(float *dst, const float *src);
+  SAMPLING_INLINE static void memcpy(float* dst, const float* src);
 };
 
 SAMPLING_INLINE void vec_t<float, 1>::fill(float val) { data = val; }
 
-SAMPLING_INLINE void vec_t<float, 1>::load(const float *ptr) { data = *ptr; }
+SAMPLING_INLINE void vec_t<float, 1>::load(const float* ptr) { data = *ptr; }
 
-SAMPLING_INLINE void vec_t<float, 1>::store(float *ptr) const { *ptr = data; }
+SAMPLING_INLINE void vec_t<float, 1>::store(float* ptr) const { *ptr = data; }
 
-SAMPLING_INLINE void vec_t<float, 1>::memcpy(float *dst, const float *src) {
+SAMPLING_INLINE void vec_t<float, 1>::memcpy(float* dst, const float* src) {
   *dst = *src;
 }
 
 // float x 2
-template <> struct vec_t<float, 2> {
+template <>
+struct vec_t<float, 2> {
   float2 data;
 
-  SAMPLING_INLINE float &operator[](size_t i) { return ((float *)(&data))[i]; }
-  SAMPLING_INLINE const float &operator[](size_t i) const {
-    return ((const float *)(&data))[i];
+  SAMPLING_INLINE float& operator[](size_t i) { return ((float*)(&data))[i]; }
+  SAMPLING_INLINE const float& operator[](size_t i) const {
+    return ((const float*)(&data))[i];
   }
-  SAMPLING_INLINE float *ptr() { return reinterpret_cast<float *>(&data); }
+  SAMPLING_INLINE float* ptr() { return reinterpret_cast<float*>(&data); }
   SAMPLING_INLINE void fill(float val);
-  SAMPLING_INLINE void load(const float *ptr);
-  SAMPLING_INLINE void store(float *ptr) const;
-  template <typename T> SAMPLING_INLINE void cast_from(const vec_t<T, 2> &src) {
+  SAMPLING_INLINE void load(const float* ptr);
+  SAMPLING_INLINE void store(float* ptr) const;
+  template <typename T>
+  SAMPLING_INLINE void cast_from(const vec_t<T, 2>& src) {
     cast_from_impl(*this, src);
   }
-  template <typename T> SAMPLING_INLINE void cast_load(const T *ptr) {
+  template <typename T>
+  SAMPLING_INLINE void cast_load(const T* ptr) {
     cast_load_impl(*this, ptr);
   }
-  template <typename T> SAMPLING_INLINE void cast_store(T *ptr) const {
+  template <typename T>
+  SAMPLING_INLINE void cast_store(T* ptr) const {
     cast_store_impl(ptr, *this);
   }
-  SAMPLING_INLINE static void memcpy(float *dst, const float *src);
+  SAMPLING_INLINE static void memcpy(float* dst, const float* src);
 };
 
 SAMPLING_INLINE void vec_t<float, 2>::fill(float val) {
   data = make_float2(val, val);
 }
 
-SAMPLING_INLINE void vec_t<float, 2>::load(const float *ptr) {
-  data = *((float2 *)ptr);
+SAMPLING_INLINE void vec_t<float, 2>::load(const float* ptr) {
+  data = *((float2*)ptr);
 }
 
-SAMPLING_INLINE void vec_t<float, 2>::store(float *ptr) const {
-  *((float2 *)ptr) = data;
+SAMPLING_INLINE void vec_t<float, 2>::store(float* ptr) const {
+  *((float2*)ptr) = data;
 }
 
-SAMPLING_INLINE void vec_t<float, 2>::memcpy(float *dst, const float *src) {
-  *((float2 *)dst) = *((float2 *)src);
+SAMPLING_INLINE void vec_t<float, 2>::memcpy(float* dst, const float* src) {
+  *((float2*)dst) = *((float2*)src);
 }
 
 // float x 4 or more
-template <size_t vec_size> struct vec_t<float, vec_size> {
+template <size_t vec_size>
+struct vec_t<float, vec_size> {
   float4 data[vec_size / 4];
 
-  SAMPLING_INLINE float &operator[](size_t i) { return ((float *)(data))[i]; }
-  SAMPLING_INLINE const float &operator[](size_t i) const {
-    return ((const float *)(data))[i];
+  SAMPLING_INLINE float& operator[](size_t i) { return ((float*)(data))[i]; }
+  SAMPLING_INLINE const float& operator[](size_t i) const {
+    return ((const float*)(data))[i];
   }
-  SAMPLING_INLINE float *ptr() { return reinterpret_cast<float *>(&data); }
+  SAMPLING_INLINE float* ptr() { return reinterpret_cast<float*>(&data); }
   SAMPLING_INLINE void fill(float val) {
 #pragma unroll
     for (size_t i = 0; i < vec_size / 4; ++i) {
       data[i] = make_float4(val, val, val, val);
     }
   }
-  SAMPLING_INLINE void load(const float *ptr) {
+  SAMPLING_INLINE void load(const float* ptr) {
 #pragma unroll
     for (size_t i = 0; i < vec_size / 4; ++i) {
-      data[i] = ((float4 *)ptr)[i];
+      data[i] = ((float4*)ptr)[i];
     }
   }
-  SAMPLING_INLINE void store(float *ptr) const {
+  SAMPLING_INLINE void store(float* ptr) const {
 #pragma unroll
     for (size_t i = 0; i < vec_size / 4; ++i) {
-      ((float4 *)ptr)[i] = data[i];
+      ((float4*)ptr)[i] = data[i];
     }
   }
   template <typename T>
-  SAMPLING_INLINE void cast_from(const vec_t<T, vec_size> &src) {
+  SAMPLING_INLINE void cast_from(const vec_t<T, vec_size>& src) {
     cast_from_impl(*this, src);
   }
-  template <typename T> SAMPLING_INLINE void cast_load(const T *ptr) {
+  template <typename T>
+  SAMPLING_INLINE void cast_load(const T* ptr) {
     cast_load_impl(*this, ptr);
   }
-  template <typename T> SAMPLING_INLINE void cast_store(T *ptr) const {
+  template <typename T>
+  SAMPLING_INLINE void cast_store(T* ptr) const {
     cast_store_impl(ptr, *this);
   }
-  SAMPLING_INLINE static void memcpy(float *dst, const float *src) {
+  SAMPLING_INLINE static void memcpy(float* dst, const float* src) {
 #pragma unroll
     for (size_t i = 0; i < vec_size / 4; ++i) {
-      ((float4 *)dst)[i] = ((float4 *)src)[i];
+      ((float4*)dst)[i] = ((float4*)src)[i];
     }
   }
 };
 
 template <typename src_float_t, typename tgt_float_t, size_t vec_size>
 SAMPLING_INLINE void cast_load_impl(vec_t<tgt_float_t, vec_size>& dst,
-                                      const src_float_t* src_ptr) {
+                                    const src_float_t* src_ptr) {
   if constexpr (std::is_same_v<src_float_t, tgt_float_t>) {
     dst.load(src_ptr);
   } else {
@@ -260,11 +274,16 @@ inline std::pair<int, int> GetCudaComputeCapability() {
 __forceinline__ __device__ float ptx_rcp(float x) {
 #ifdef PADDLE_WITH_COREX
   return __ivcorex_rcpf(x);
+#else
+#ifdef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
+  return __frcp_rn(x);
 #else
   float y;
   asm volatile("rcp.approx.ftz.f32 %0, %1;" : "=f"(y) : "f"(x));
   return y;
 #endif
+
+#endif
 }
 
 template <typename T1, typename T2>
diff --git a/custom_ops/metax_ops/apply_rope.cu b/custom_ops/metax_ops/apply_rope.cu
deleted file mode 100644
index 4e820e425..000000000
--- a/custom_ops/metax_ops/apply_rope.cu
+++ /dev/null
@@ -1,291 +0,0 @@
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <cuda_runtime.h>
-#include <paddle/extension.h>
-#include <algorithm>
-#include "helper.h"
-
-#define THREADS_PER_BLOCK 128
-
-template <typename T>
-struct Converter;
-
-template <>
-struct Converter<__half> {
-  // __half -> float
-  __device__ static float to_float(__half val) { return __half2float(val); }
-  // float -> __half
-  __device__ static __half from_float(float val) {
-    return __float2half_rn(val);
-  }
-  // int -> __half
-  __device__ static __half from_int(float val) { return __int2half_rn(val); }
-};
-
-template <>
-struct Converter<__nv_bfloat16> {
-  // __nv_bfloat16 -> float
-  __device__ static float to_float(__nv_bfloat16 val) {
-    return __bfloat162float(val);
-  }
-  // float -> __nv_bfloat16
-  __device__ static __nv_bfloat16 from_float(float val) {
-    return __float2bfloat16_rn(val);
-  }
-  // int -> __nv_bfloat16
-  __device__ static __nv_bfloat16 from_int(int val) {
-    return __int2bfloat16_rn(val);
-  }
-};
-
-template <typename T>
-__device__ void RotateQKVec4(const T* qk_ptr,
-                             const T* rot_cos_ptr,
-                             const T* rot_sin_ptr,
-                             const int head_num,
-                             const int base_idx,
-                             const int rot_base_idx,
-                             T* out) {
-  using VecT = AlignedVector<T, 4>;
-
-  VecT qk_vec;
-  Load(qk_ptr + base_idx, &qk_vec);
-  VecT rot_half_vec = {-qk_vec[1], qk_vec[0], -qk_vec[3], qk_vec[2]};
-  VecT cos_vec, sin_vec;
-  Load(rot_cos_ptr + rot_base_idx, &cos_vec);
-  Load(rot_sin_ptr + rot_base_idx, &sin_vec);
-#pragma unroll
-  for (int i = 0; i < 4; ++i) {
-    *(out + base_idx + i) =
-        qk_vec[i] * cos_vec[i] + rot_half_vec[i] * sin_vec[i];
-  }
-}
-
-template <typename T>
-__device__ void RotateQKVec4(const T* qk_ptr,
-                             const float* rot_cos_ptr,
-                             const float* rot_sin_ptr,
-                             const int head_num,
-                             const int base_idx,
-                             const int rot_base_idx,
-                             T* out) {
-  using VecT = AlignedVector<T, 4>;
-  using VecF = AlignedVector<float, 4>;
-  auto to_float = [] __device__(T val) -> float {
-    return Converter<T>::to_float(val);
-  };
-  auto from_float = [] __device__(float val) -> T {
-    return Converter<T>::from_float(val);
-  };
-
-  VecT qk_vec;
-  Load(qk_ptr + base_idx, &qk_vec);
-  VecF rot_half_vec = {-to_float(qk_vec[1]),
-                       to_float(qk_vec[0]),
-                       -to_float(qk_vec[3]),
-                       to_float(qk_vec[2])};
-  VecF cos_vec, sin_vec;
-  Load(rot_cos_ptr + rot_base_idx, &cos_vec);
-  Load(rot_sin_ptr + rot_base_idx, &sin_vec);
-#pragma unroll
-  for (int i = 0; i < 4; ++i) {
-    *(out + base_idx + i) = from_float(to_float(qk_vec[i]) * cos_vec[i] +
-                                       rot_half_vec[i] * sin_vec[i]);
-  }
-}
-
-// qk and rope have a same type
-template <typename T>
-__global__ void DispatchApplyRopeVec4Kernel(const T* q,
-                                            const T* k,
-                                            const T* rot_cos,
-                                            const T* rot_sin,
-                                            const int q_num_elements,
-                                            const int k_num_elements,
-                                            const int q_head_num,
-                                            const int k_head_num,
-                                            const int head_dim,
-                                            T* q_out,
-                                            T* k_out) {
-  int idx = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
-  int head_dim_idx = idx % head_dim;
-
-  if (idx < q_num_elements) {
-    int rot_idx = idx / (q_head_num * head_dim) * head_dim + head_dim_idx;
-    RotateQKVec4(q, rot_cos, rot_sin, q_head_num, idx, rot_idx, q_out);
-  }
-
-  if (idx < k_num_elements) {
-    int rot_idx = idx / (k_head_num * head_dim) * head_dim + head_dim_idx;
-    RotateQKVec4(k, rot_cos, rot_sin, k_head_num, idx, rot_idx, k_out);
-  }
-}
-
-// rope dtype is float32
-template <typename T>
-__global__ void DispatchApplyRopeVec4Kernel(const T* q,
-                                            const T* k,
-                                            const float* rot_cos,
-                                            const float* rot_sin,
-                                            const int q_num_elements,
-                                            const int k_num_elements,
-                                            const int q_head_num,
-                                            const int k_head_num,
-                                            const int head_dim,
-                                            T* q_out,
-                                            T* k_out) {
-  int idx = (blockIdx.x * blockDim.x + threadIdx.x) * 4;
-  int head_dim_idx = idx % head_dim;
-
-  if (idx < q_num_elements) {
-    int rot_idx = idx / (q_head_num * head_dim) * head_dim + head_dim_idx;
-    RotateQKVec4(q, rot_cos, rot_sin, q_head_num, idx, rot_idx, q_out);
-  }
-
-  if (idx < k_num_elements) {
-    int rot_idx = idx / (k_head_num * head_dim) * head_dim + head_dim_idx;
-    RotateQKVec4(k, rot_cos, rot_sin, k_head_num, idx, rot_idx, k_out);
-  }
-}
-
-template <paddle::DataType D>
-void ApplyRopeKernel(const paddle::Tensor& q,
-                     const paddle::Tensor& k,
-                     const paddle::Tensor& rot_cos,
-                     const paddle::Tensor& rot_sin,
-                     paddle::Tensor& q_out,
-                     paddle::Tensor& k_out) {
-  typedef PDTraits<D> traits_;
-  typedef typename traits_::DataType DataType_;
-  typedef typename traits_::data_t data_t;
-
-  const auto q_num_elements = q.numel();
-  const auto k_num_elements = k.numel();
-  const auto q_shape = q.shape();
-  const auto k_shape = k.shape();
-  const auto dims = q_shape.size();
-  const auto q_head_num = q_shape[dims - 2];
-  const auto k_head_num = k_shape[dims - 2];
-  const auto head_dim = q_shape.back();
-  int block_num =
-      (std::max(q_num_elements, k_num_elements) + (THREADS_PER_BLOCK * 4) - 1) /
-      (THREADS_PER_BLOCK * 4);
-  auto stream = q.stream();
-
-  if (q.dtype() == rot_cos.dtype()) {
-    DispatchApplyRopeVec4Kernel<DataType_>
-        <<<block_num, THREADS_PER_BLOCK, 0, stream>>>(
-            reinterpret_cast<const DataType_*>(q.data<data_t>()),
-            reinterpret_cast<const DataType_*>(k.data<data_t>()),
-            reinterpret_cast<const DataType_*>(rot_cos.data<data_t>()),
-            reinterpret_cast<const DataType_*>(rot_sin.data<data_t>()),
-            q_num_elements,
-            k_num_elements,
-            q_head_num,
-            k_head_num,
-            head_dim,
-            reinterpret_cast<DataType_*>(q_out.data<data_t>()),
-            reinterpret_cast<DataType_*>(k_out.data<data_t>()));
-  } else if (rot_cos.dtype() == paddle::DataType::FLOAT32) {
-    DispatchApplyRopeVec4Kernel<DataType_>
-        <<<block_num, THREADS_PER_BLOCK, 0, stream>>>(
-            reinterpret_cast<const DataType_*>(q.data<data_t>()),
-            reinterpret_cast<const DataType_*>(k.data<data_t>()),
-            reinterpret_cast<const float*>(rot_cos.data<float>()),
-            reinterpret_cast<const float*>(rot_sin.data<float>()),
-            q_num_elements,
-            k_num_elements,
-            q_head_num,
-            k_head_num,
-            head_dim,
-            reinterpret_cast<DataType_*>(q_out.data<data_t>()),
-            reinterpret_cast<DataType_*>(k_out.data<data_t>()));
-  } else {
-    PD_THROW("Unsupported qk dtype and rope dtype.");
-  }
-}
-
-std::vector<paddle::Tensor> ApplyRope(const paddle::Tensor& q,
-                                      const paddle::Tensor& k,
-                                      const paddle::Tensor& rot_cos,
-                                      const paddle::Tensor& rot_sin) {
-  auto q_shape = q.shape();
-  auto cos_shape = rot_cos.shape();
-
-  auto q_out = paddle::empty_like(q);
-  auto k_out = paddle::empty_like(k);
-
-  if (q.numel() == 0 || k.numel() == 0) {
-    return {q_out, k_out};
-  }
-
-  PADDLE_ENFORCE_EQ(
-      q_shape.back() % 2,
-      0,
-      "The last dimension (head_dim) of qk must be an even number "
-      "for RoPE, but got %d",
-      q_shape.back());
-  PADDLE_ENFORCE_EQ(q_shape.size(),
-                    cos_shape.size(),
-                    "The shape size of cos mismatches the shape size of q, "
-                    "expect %d but got %d",
-                    q_shape.size(),
-                    cos_shape.size());
-  PADDLE_ENFORCE_EQ(q_shape.back(),
-                    cos_shape.back(),
-                    "The shape.back() of cos mismatches the shape.back() of q, "
-                    "expect %d but got %d",
-                    q_shape.back(),
-                    cos_shape.back());
-
-  auto input_type = q.dtype();
-  switch (input_type) {
-    case paddle::DataType::BFLOAT16:
-      ApplyRopeKernel<paddle::DataType::BFLOAT16>(
-          q, k, rot_cos, rot_sin, q_out, k_out);
-      break;
-    case paddle::DataType::FLOAT16:
-      ApplyRopeKernel<paddle::DataType::FLOAT16>(
-          q, k, rot_cos, rot_sin, q_out, k_out);
-      break;
-    default:
-      PD_THROW("Only support qk dtype of BF16 and F16");
-  }
-
-  return {q_out, k_out};
-}
-
-std::vector<std::vector<int64_t>> ApplyRopeInferShape(
-    const std::vector<int64_t>& q_shape,
-    const std::vector<int64_t>& k_shape,
-    const std::vector<int64_t>& cos_shape,
-    const std::vector<int64_t>& sin_shape) {
-  return {q_shape, k_shape, cos_shape, sin_shape};
-}
-
-std::vector<paddle::DataType> ApplyRopeInferDtype(
-    const paddle::DataType& q_dtype,
-    const paddle::DataType& k_dtype,
-    const paddle::DataType& cos_dtype,
-    const paddle::DataType& sin_dtype) {
-  return {q_dtype, k_dtype, cos_dtype, sin_dtype};
-}
-
-PD_BUILD_OP(apply_rope)
-    .Inputs({"q", "k", "rot_cos", "rot_sin"})
-    .Outputs({"q_out", "k_out"})
-    .SetKernelFn(PD_KERNEL(ApplyRope))
-    .SetInferShapeFn(PD_INFER_SHAPE(ApplyRopeInferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(ApplyRopeInferDtype));
diff --git a/custom_ops/metax_ops/apply_rope_qkv.cu b/custom_ops/metax_ops/apply_rope_qkv.cu
new file mode 100644
index 000000000..3c7679e07
--- /dev/null
+++ b/custom_ops/metax_ops/apply_rope_qkv.cu
@@ -0,0 +1,329 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda_runtime.h>
+#include <paddle/extension.h>
+#include <algorithm>
+#include "helper.h"
+
+template <typename T>
+struct Converter;
+
+template <>
+struct Converter<__half> {
+  // __half -> float
+  __device__ static float to_float(__half val) { return __half2float(val); }
+  // float -> __half
+  __device__ static __half from_float(float val) {
+    return __float2half_rn(val);
+  }
+  // int -> __half (takes int, matching the __nv_bfloat16 specialization)
+  __device__ static __half from_int(int val) { return __int2half_rn(val); }
+};
+
+template <>
+struct Converter<__nv_bfloat16> {
+  // __nv_bfloat16 -> float
+  __device__ static float to_float(__nv_bfloat16 val) {
+    return __bfloat162float(val);
+  }
+  // float -> __nv_bfloat16
+  __device__ static __nv_bfloat16 from_float(float val) {
+    return __float2bfloat16_rn(val);
+  }
+  // int -> __nv_bfloat16
+  __device__ static __nv_bfloat16 from_int(int val) {
+    return __int2bfloat16_rn(val);
+  }
+};
+
+struct ApplyRopeQKVParams {
+  int head_dim;
+  int token_stride;
+  int head_stride;
+  int q_stride;
+  int kv_stride;
+  int q_head_offset;
+  int k_head_offset;
+  int v_head_offset;
+  int q_head_num;
+  int kv_head_num;
+};
+
+template <typename T>
+__device__ __forceinline__ void RotateQKVec4(const T* qkv_ptr,
+                                             const T* rot_cos_ptr,
+                                             const T* rot_sin_ptr,
+                                             const int load_idx,
+                                             const int store_idx,
+                                             const int rot_base_idx,
+                                             T* out) {
+  using VecT = AlignedVector<T, 4>;
+
+  VecT qk_vec;
+  Load(qkv_ptr + load_idx, &qk_vec);
+  VecT rot_half_vec = {-qk_vec[1], qk_vec[0], -qk_vec[3], qk_vec[2]};
+  VecT cos_vec, sin_vec;
+  Load(rot_cos_ptr + rot_base_idx, &cos_vec);
+  Load(rot_sin_ptr + rot_base_idx, &sin_vec);
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    *(out + store_idx + i) =
+        qk_vec[i] * cos_vec[i] + rot_half_vec[i] * sin_vec[i];
+  }
+}
+
+template <typename T>
+__device__ __forceinline__ void RotateQKVec4(const T* qkv_ptr,
+                                             const float* rot_cos_ptr,
+                                             const float* rot_sin_ptr,
+                                             const int load_idx,
+                                             const int store_idx,
+                                             const int rot_base_idx,
+                                             T* out) {
+  using VecT = AlignedVector<T, 4>;
+  using VecF = AlignedVector<float, 4>;
+  auto to_float = [] __device__(T val) -> float {
+    return Converter<T>::to_float(val);
+  };
+  auto from_float = [] __device__(float val) -> T {
+    return Converter<T>::from_float(val);
+  };
+
+  VecT qk_vec;
+  Load(qkv_ptr + load_idx, &qk_vec);
+  VecF rot_half_vec = {-to_float(qk_vec[1]),
+                       to_float(qk_vec[0]),
+                       -to_float(qk_vec[3]),
+                       to_float(qk_vec[2])};
+  VecF cos_vec, sin_vec;
+  Load(rot_cos_ptr + rot_base_idx, &cos_vec);
+  Load(rot_sin_ptr + rot_base_idx, &sin_vec);
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    *(out + store_idx + i) = from_float(to_float(qk_vec[i]) * cos_vec[i] +
+                                        rot_half_vec[i] * sin_vec[i]);
+  }
+}
+
+template <typename T>
+__device__ __forceinline__ void StoreValue(const T* qkv_ptr,
+                                           const int load_idx,
+                                           const int store_idx,
+                                           T* out) {
+  using VecT = AlignedVector<T, 4>;
+  VecT v_vec;
+  Load(qkv_ptr + load_idx, &v_vec);
+  Store(v_vec, out + store_idx);
+}
+
+template <typename T, typename WeightType>
+__global__ void DispatchApplyRopeQKVVec4Kernel(const T* qkv,
+                                               const WeightType* rot_cos,
+                                               const WeightType* rot_sin,
+                                               ApplyRopeQKVParams param,
+                                               T* q_out,
+                                               T* k_out,
+                                               T* v_out) {
+  const int token_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int head_idx = blockIdx.y * blockDim.y + threadIdx.y;
+  const int head_dim_idx = (blockIdx.z * blockDim.z + threadIdx.z) * 4;
+  int rot_idx = token_idx * param.head_dim + head_dim_idx;
+  int load_idx, store_idx;
+
+  if (head_idx < param.q_head_num && head_dim_idx < param.head_dim) {  // q
+    load_idx = token_idx * param.token_stride +
+               (head_idx + param.q_head_offset) * param.head_stride +
+               head_dim_idx;
+    store_idx =
+        token_idx * param.q_stride + head_idx * param.head_dim + head_dim_idx;
+    RotateQKVec4(qkv, rot_cos, rot_sin, load_idx, store_idx, rot_idx, q_out);
+  }
+
+  if (head_idx < param.kv_head_num && head_dim_idx < param.head_dim) {  // kv
+    load_idx = token_idx * param.token_stride +
+               (head_idx + param.k_head_offset) * param.head_stride +
+               head_dim_idx;
+    store_idx =
+        token_idx * param.kv_stride + head_idx * param.head_dim + head_dim_idx;
+    RotateQKVec4(qkv, rot_cos, rot_sin, load_idx, store_idx, rot_idx, k_out);
+    load_idx = token_idx * param.token_stride +
+               (head_idx + param.v_head_offset) * param.head_stride +
+               head_dim_idx;
+    StoreValue(qkv, load_idx, store_idx, v_out);
+  }
+}
+
+template <paddle::DataType D>
+void ApplyRopeQKVKernel(const paddle::Tensor& qkv,
+                        const paddle::Tensor& rot_cos,
+                        const paddle::Tensor& rot_sin,
+                        const int q_head_num,
+                        const int kv_head_num,
+                        const int head_dim,
+                        paddle::Tensor& q_out,
+                        paddle::Tensor& k_out,
+                        paddle::Tensor& v_out) {
+  typedef PDTraits<D> traits_;
+  typedef typename traits_::DataType DataType_;
+  typedef typename traits_::data_t data_t;
+
+  const int all_num_elements = qkv.numel();  // NOTE(review): narrowed to int
+  const int all_num_head = q_head_num + 2 * kv_head_num;
+  auto stream = qkv.stream();
+
+  dim3 block_dims(1, 4, 32);
+  dim3 grid_dims(all_num_elements / (all_num_head * head_dim),  // token
+                 (std::max(q_head_num, kv_head_num) + block_dims.y - 1) /
+                     block_dims.y,  // head
+                 (head_dim + (block_dims.z * 4) - 1) /
+                     (block_dims.z * 4)  // dim: load vec4 at a time
+  );
+
+  // Launch layout: x = one token per block, y = 4 heads per block,
+  // z = 32 threads per block along head_dim, each covering 4 elements.
+
+  ApplyRopeQKVParams param;
+  param.head_dim = head_dim;
+  param.token_stride = all_num_head * head_dim;  // elements per token in qkv
+  param.head_stride = head_dim;
+  param.q_stride = q_head_num * head_dim;    // elements per token in q_out
+  param.kv_stride = kv_head_num * head_dim;  // elements per token in k/v_out
+  param.q_head_offset = 0;                   // q heads come first in qkv
+  param.k_head_offset = q_head_num;          // then the k heads
+  param.v_head_offset = q_head_num + kv_head_num;  // then the v heads
+  param.q_head_num = q_head_num;
+  param.kv_head_num = kv_head_num;
+
+  if (qkv.dtype() == rot_cos.dtype()) {  // rope tables share the qkv dtype
+    DispatchApplyRopeQKVVec4Kernel<DataType_, DataType_>
+        <<<grid_dims, block_dims, 0, stream>>>(
+            reinterpret_cast<const DataType_*>(qkv.data<data_t>()),
+            reinterpret_cast<const DataType_*>(rot_cos.data<data_t>()),
+            reinterpret_cast<const DataType_*>(rot_sin.data<data_t>()),
+            param,
+            reinterpret_cast<DataType_*>(q_out.data<data_t>()),
+            reinterpret_cast<DataType_*>(k_out.data<data_t>()),
+            reinterpret_cast<DataType_*>(v_out.data<data_t>()));
+  } else if (rot_cos.dtype() == paddle::DataType::FLOAT32) {  // fp32 tables
+    DispatchApplyRopeQKVVec4Kernel<DataType_, float>
+        <<<grid_dims, block_dims, 0, stream>>>(
+            reinterpret_cast<const DataType_*>(qkv.data<data_t>()),
+            reinterpret_cast<const float*>(rot_cos.data<float>()),
+            reinterpret_cast<const float*>(rot_sin.data<float>()),
+            param,
+            reinterpret_cast<DataType_*>(q_out.data<data_t>()),
+            reinterpret_cast<DataType_*>(k_out.data<data_t>()),
+            reinterpret_cast<DataType_*>(v_out.data<data_t>()));
+  } else {
+    PD_THROW("Unsupported qk dtype and rope dtype.");
+  }
+}
+
+std::vector<paddle::Tensor> ApplyRopeQKV(const paddle::Tensor& qkv,
+                                         const paddle::Tensor& rot_cos,
+                                         const paddle::Tensor& rot_sin,
+                                         const int q_head_num,
+                                         const int kv_head_num,
+                                         const int head_dim) {
+  auto qkv_shape = qkv.shape();
+  auto token_num = qkv_shape[0];
+  auto place = qkv.place();
+  auto dtype = qkv.dtype();
+  common::DDim q_out_shape, kv_out_shape;
+  if (rot_cos.shape().size() == 3) {  // 3-D rope table -> 3-D outputs
+    q_out_shape = {token_num, q_head_num, head_dim};
+    kv_out_shape = {token_num, kv_head_num, head_dim};
+  } else {  // otherwise the outputs carry a unit axis after token_num
+    q_out_shape = {token_num, 1, q_head_num, head_dim};
+    kv_out_shape = {token_num, 1, kv_head_num, head_dim};
+  }
+  auto q_out = GetEmptyTensor(q_out_shape, dtype, place);
+  auto k_out = GetEmptyTensor(kv_out_shape, dtype, place);
+  auto v_out = GetEmptyTensor(kv_out_shape, dtype, place);
+
+  if (token_num == 0) {  // empty batch: nothing to launch
+    return {q_out, k_out, v_out};
+  }
+
+  PADDLE_ENFORCE_EQ(qkv_shape.back(),
+                    ((q_head_num + 2 * kv_head_num) * head_dim),
+                    "The last dimension of qkv [%d] must equal to (q_head_num "
+                    "+ 2 * kv_head_num) * head_dim [%d].",
+                    qkv_shape.back(),
+                    ((q_head_num + 2 * kv_head_num) * head_dim));
+  PADDLE_ENFORCE_EQ(
+      head_dim % 4,  // each kernel thread loads/stores 4 elements at once
+      0,
+      "The last dimension (head_dim) of qkv must be a multiple of 4 "
+      "for the vec4 RoPE kernel, but got %d",
+      head_dim);
+  PADDLE_ENFORCE_EQ(q_out.shape().back(),
+                    rot_cos.shape().back(),
+                    "The last dimension of cos mismatches that of q, "
+                    "expect %d but got %d",
+                    q_out.shape().back(),
+                    rot_cos.shape().back());
+
+  switch (dtype) {  // only half-precision qkv is supported
+    case paddle::DataType::BFLOAT16:
+      ApplyRopeQKVKernel<paddle::DataType::BFLOAT16>(qkv,
+                                                     rot_cos,
+                                                     rot_sin,
+                                                     q_head_num,
+                                                     kv_head_num,
+                                                     head_dim,
+                                                     q_out,
+                                                     k_out,
+                                                     v_out);
+      break;
+    case paddle::DataType::FLOAT16:
+      ApplyRopeQKVKernel<paddle::DataType::FLOAT16>(qkv,
+                                                    rot_cos,
+                                                    rot_sin,
+                                                    q_head_num,
+                                                    kv_head_num,
+                                                    head_dim,
+                                                    q_out,
+                                                    k_out,
+                                                    v_out);
+      break;
+    default:
+      PD_THROW("Only support qk dtype of BF16 and F16");
+  }
+
+  return {q_out, k_out, v_out};
+}
+
+std::vector<std::vector<int64_t>> ApplyRopeQKVInferShape(
+    const std::vector<int64_t>& qkv_shape,
+    const std::vector<int64_t>& cos_shape,
+    const std::vector<int64_t>& sin_shape) {
+  return {qkv_shape, cos_shape, sin_shape};
+}
+
+std::vector<paddle::DataType> ApplyRopeQKVInferDtype(
+    const paddle::DataType& qkv_dtype,
+    const paddle::DataType& cos_dtype,
+    const paddle::DataType& sin_dtype) {
+  return {qkv_dtype, qkv_dtype, qkv_dtype};  // q_out, k_out, v_out
+}
+
+PD_BUILD_OP(apply_rope_qkv)
+    .Inputs({"qkv", "rot_cos", "rot_sin"})
+    .Outputs({"q_out", "k_out", "v_out"})
+    .Attrs({"q_head_num:int", "kv_head_num:int", "head_dim:int"})
+    .SetKernelFn(PD_KERNEL(ApplyRopeQKV))
+    .SetInferShapeFn(PD_INFER_SHAPE(ApplyRopeQKVInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(ApplyRopeQKVInferDtype));
diff --git a/custom_ops/metax_ops/cache_kv_with_rope.cu b/custom_ops/metax_ops/cache_kv_with_rope.cu
new file mode 100644
index 000000000..0f3e9a54e
--- /dev/null
+++ b/custom_ops/metax_ops/cache_kv_with_rope.cu
@@ -0,0 +1,477 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda_runtime.h>
+#include <paddle/extension.h>
+#include <algorithm>
+#include "helper.h"
+
+template <typename T>
+struct Converter;
+
+template <>
+struct Converter<__half> {
+  // __half -> float
+  __device__ static float to_float(__half val) { return __half2float(val); }
+  // float -> __half
+  __device__ static __half from_float(float val) {
+    return __float2half_rn(val);
+  }
+  // int -> __half (takes int, matching the __nv_bfloat16 specialization)
+  __device__ static __half from_int(int val) { return __int2half_rn(val); }
+};
+
+template <>
+struct Converter<__nv_bfloat16> {
+  // __nv_bfloat16 -> float
+  __device__ static float to_float(__nv_bfloat16 val) {
+    return __bfloat162float(val);
+  }
+  // float -> __nv_bfloat16
+  __device__ static __nv_bfloat16 from_float(float val) {
+    return __float2bfloat16_rn(val);
+  }
+  // int -> __nv_bfloat16
+  __device__ static __nv_bfloat16 from_int(int val) {
+    return __int2bfloat16_rn(val);
+  }
+};
+
+struct CacheKVWithRopeParams {
+  int head_dim;
+  int block_size;
+  int block_num;
+  int cache_stride;
+  int token_stride;
+  int head_stride;
+  int q_stride;
+  int kv_stride;
+  int q_head_offset;
+  int k_head_offset;
+  int v_head_offset;
+  int q_head_num;
+  int kv_head_num;
+};
+
+template <typename T, int VecSize = 4, bool WriteCache = true>
+__device__ __forceinline__ void RotateQKVec(const T* qkv_ptr,
+                                            const T* rotary_cos_ptr,
+                                            const T* rotary_sin_ptr,
+                                            const int load_idx,
+                                            const int store_idx,
+                                            const int cache_store_idx,
+                                            const int rot_base_idx,
+                                            T* caches,
+                                            T* out) {
+  using VecT = AlignedVector<T, VecSize>;
+
+  VecT qk_vec;
+  Load(qkv_ptr + load_idx, &qk_vec);
+  VecT rot_half_vec;
+  int flag;
+#pragma unroll
+  for (int i = 0; i < VecSize; ++i) {
+    flag = 1 - 2 * (i % 2);
+    rot_half_vec[i] = -qk_vec[i + flag] * Converter<T>::from_int(flag);
+  }
+  VecT cos_vec, sin_vec;
+  Load(rotary_cos_ptr + rot_base_idx, &cos_vec);
+  Load(rotary_sin_ptr + rot_base_idx, &sin_vec);
+#pragma unroll
+  for (int i = 0; i < VecSize; ++i) {
+    T result = qk_vec[i] * cos_vec[i] + rot_half_vec[i] * sin_vec[i];
+    *(out + store_idx + i) = result;
+
+    if (WriteCache) {
+      *(caches + cache_store_idx + i) = result;
+    }
+  }
+}
+
+template <typename T, int VecSize = 4, bool WriteCache = true>
+__device__ __forceinline__ void RotateQKVec(const T* qkv_ptr,
+                                            const float* rotary_cos_ptr,
+                                            const float* rotary_sin_ptr,
+                                            const int load_idx,
+                                            const int store_idx,
+                                            const int cache_store_idx,
+                                            const int rot_base_idx,
+                                            T* caches,
+                                            T* out) {
+  using VecT = AlignedVector<T, VecSize>;
+  using VecF = AlignedVector<float, VecSize>;
+  auto to_float = [] __device__(T val) -> float {
+    return Converter<T>::to_float(val);
+  };
+  auto from_float = [] __device__(float val) -> T {
+    return Converter<T>::from_float(val);
+  };
+
+  VecT qk_vec;
+  Load(qkv_ptr + load_idx, &qk_vec);
+  VecF rot_half_vec;
+  int flag;
+#pragma unroll
+  for (int i = 0; i < VecSize; ++i) {
+    flag = 1 - 2 * (i % 2);
+    rot_half_vec[i] = -to_float(qk_vec[i + flag]) * static_cast<float>(flag);
+  }
+  VecF cos_vec, sin_vec;
+  Load(rotary_cos_ptr + rot_base_idx, &cos_vec);
+  Load(rotary_sin_ptr + rot_base_idx, &sin_vec);
+#pragma unroll
+  for (int i = 0; i < VecSize; ++i) {
+    T result = from_float(to_float(qk_vec[i]) * cos_vec[i] +
+                          rot_half_vec[i] * sin_vec[i]);
+    *(out + store_idx + i) = result;
+    if (WriteCache) {
+      *(caches + cache_store_idx + i) = result;
+    }
+  }
+}
+
+template <typename T, int VecSize = 4>
+__device__ __forceinline__ void StoreValue(const T* qkv_ptr,
+                                           const int load_idx,
+                                           const int store_idx,
+                                           const int cache_store_idx,
+                                           T* caches,
+                                           T* out) {
+  using VecT = AlignedVector<T, VecSize>;
+  VecT v_vec;
+  Load(qkv_ptr + load_idx, &v_vec);
+  Store(v_vec, out + store_idx);
+  Store(v_vec, caches + cache_store_idx);
+}
+
+template <typename T, typename WeightType, int VecSize>
+__global__ void DispatchCacheKVWithRopeVecKernel(const T* qkv,
+                                                 T* caches_k,
+                                                 T* caches_v,
+                                                 const int* block_tables,
+                                                 const WeightType* rotary_cos,
+                                                 const WeightType* rotary_sin,
+                                                 const int* cu_seqlens_q,
+                                                 const int* batch_ids_q,
+                                                 CacheKVWithRopeParams param,
+                                                 T* q_out,
+                                                 T* k_out,
+                                                 T* v_out) {
+  const int token_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int head_idx = blockIdx.y * blockDim.y + threadIdx.y;
+  const int head_dim_idx = (blockIdx.z * blockDim.z + threadIdx.z) * VecSize;
+  // x -> token, y -> head, z -> VecSize-wide chunk of head_dim.
+  int load_idx, store_idx, cache_store_idx;
+  int rot_idx = token_idx * param.head_dim + head_dim_idx;
+  // No token_idx bound check here; these reads rely on the launch grid (review).
+  const int batch_idx = *(batch_ids_q + token_idx);
+  const int inter_batch_token_offset = token_idx - *(cu_seqlens_q + batch_idx);
+  const int inter_batch_block_idx = inter_batch_token_offset / param.block_size;
+  const int inter_block_offset = inter_batch_token_offset % param.block_size;
+  const int block_idx =
+      *(block_tables + batch_idx * param.block_num + inter_batch_block_idx);
+  // NOTE(review): -1 presumably marks an unassigned block-table entry.
+  assert(block_idx != -1);
+
+  if (head_dim_idx < param.head_dim) {
+    if (head_idx < param.q_head_num) {  // q
+      load_idx = token_idx * param.token_stride +
+                 (head_idx + param.q_head_offset) * param.head_stride +
+                 head_dim_idx;
+      store_idx =
+          token_idx * param.q_stride + head_idx * param.head_dim + head_dim_idx;
+      RotateQKVec<T, VecSize, false>(qkv,
+                                     rotary_cos,
+                                     rotary_sin,
+                                     load_idx,
+                                     store_idx,
+                                     -1,
+                                     rot_idx,
+                                     static_cast<T*>(nullptr),
+                                     q_out);
+    }
+
+    if (head_idx < param.kv_head_num) {  // kv
+      load_idx = token_idx * param.token_stride +
+                 (head_idx + param.k_head_offset) * param.head_stride +
+                 head_dim_idx;
+      store_idx = token_idx * param.kv_stride + head_idx * param.head_dim +
+                  head_dim_idx;
+      cache_store_idx = block_idx * param.cache_stride +
+                        inter_block_offset * param.kv_stride +
+                        head_idx * param.head_dim + head_dim_idx;
+      // cache_store_idx: block_idx picks the cache block (cache_stride
+      // elements each), inter_block_offset the token slot inside it
+      // (kv_stride elements each), then the head and chunk offsets follow.
+      RotateQKVec<T, VecSize, true>(qkv,
+                                    rotary_cos,
+                                    rotary_sin,
+                                    load_idx,
+                                    store_idx,
+                                    cache_store_idx,
+                                    rot_idx,
+                                    caches_k,
+                                    k_out);
+      // V is copied without rotation and reuses K's store/cache offsets.
+      load_idx = token_idx * param.token_stride +
+                 (head_idx + param.v_head_offset) * param.head_stride +
+                 head_dim_idx;
+      StoreValue<T, VecSize>(
+          qkv, load_idx, store_idx, cache_store_idx, caches_v, v_out);
+    }
+  }
+}
+
+template <paddle::DataType D, int VecSize = 4>
+void CacheKVWithRopeKernel(
+    const paddle::Tensor& qkv,  // token_num, head_num * head_dim
+    paddle::Tensor&
+        caches_k,  // max_block_num, block_size, kv_head_num, head_dim
+    paddle::Tensor&
+        caches_v,  // max_block_num, block_size, kv_head_num, head_dim
+    const paddle::Tensor& block_tables,  // bs, block_num
+    const paddle::Tensor& rotary_cos,
+    const paddle::Tensor& rotary_sin,
+    const paddle::Tensor& cu_seqlens_q,  // bs + 1
+    const paddle::Tensor& batch_ids_q,   // token_num
+    const int q_head_num,
+    const int kv_head_num,
+    const int head_dim,
+    const int block_size,
+    paddle::Tensor& q_out,
+    paddle::Tensor& k_out,
+    paddle::Tensor& v_out) {
+  typedef PDTraits<D> traits_;
+  typedef typename traits_::DataType DataType_;
+  typedef typename traits_::data_t data_t;
+  // Host launcher: fills the params struct and starts the RoPE/cache kernel.
+  const int all_num_elements = qkv.numel();
+  const int all_num_heads = q_head_num + 2 * kv_head_num;
+  auto stream = qkv.stream();
+
+  dim3 block_dims(1, 4, (head_dim + VecSize - 1) / VecSize);
+  dim3 grid_dims(all_num_elements / (all_num_heads * head_dim),  // token
+                 (std::max(q_head_num, kv_head_num) + block_dims.y - 1) /
+                     block_dims.y,  // head
+                 (head_dim + (block_dims.z * VecSize) - 1) /
+                     (block_dims.z * VecSize)  // dim: load Vec at a time
+  );
+
+  // Launch layout: grid.x walks tokens (one per block), y covers heads in
+  // groups of block_dims.y, z covers head_dim in VecSize-wide chunks.
+
+  CacheKVWithRopeParams param;
+  param.head_dim = head_dim;
+  param.block_size = block_size;
+  param.block_num = static_cast<int>(block_tables.shape().back());
+  param.cache_stride = block_size * kv_head_num * head_dim;
+  param.token_stride = all_num_heads * head_dim;
+  param.head_stride = head_dim;
+  param.q_stride = q_head_num * head_dim;
+  param.kv_stride = kv_head_num * head_dim;
+  param.q_head_offset = 0;
+  param.k_head_offset = q_head_num;
+  param.v_head_offset = q_head_num + kv_head_num;
+  param.q_head_num = q_head_num;
+  param.kv_head_num = kv_head_num;
+  // The rotary tables may share qkv's dtype or be float32; nothing else runs.
+  if (qkv.dtype() == rotary_cos.dtype()) {
+    DispatchCacheKVWithRopeVecKernel<DataType_, DataType_, VecSize>
+        <<<grid_dims, block_dims, 0, stream>>>(
+            reinterpret_cast<const DataType_*>(qkv.data<data_t>()),
+            reinterpret_cast<DataType_*>(caches_k.data<data_t>()),
+            reinterpret_cast<DataType_*>(caches_v.data<data_t>()),
+            reinterpret_cast<const int*>(block_tables.data<int>()),
+            reinterpret_cast<const DataType_*>(rotary_cos.data<data_t>()),
+            reinterpret_cast<const DataType_*>(rotary_sin.data<data_t>()),
+            reinterpret_cast<const int*>(cu_seqlens_q.data<int>()),
+            reinterpret_cast<const int*>(batch_ids_q.data<int>()),
+            param,
+            reinterpret_cast<DataType_*>(q_out.data<data_t>()),
+            reinterpret_cast<DataType_*>(k_out.data<data_t>()),
+            reinterpret_cast<DataType_*>(v_out.data<data_t>()));
+  } else if (rotary_cos.dtype() == paddle::DataType::FLOAT32) {
+    DispatchCacheKVWithRopeVecKernel<DataType_, float, VecSize>
+        <<<grid_dims, block_dims, 0, stream>>>(
+            reinterpret_cast<const DataType_*>(qkv.data<data_t>()),
+            reinterpret_cast<DataType_*>(caches_k.data<data_t>()),
+            reinterpret_cast<DataType_*>(caches_v.data<data_t>()),
+            reinterpret_cast<const int*>(block_tables.data<int>()),
+            reinterpret_cast<const float*>(rotary_cos.data<float>()),
+            reinterpret_cast<const float*>(rotary_sin.data<float>()),
+            reinterpret_cast<const int*>(cu_seqlens_q.data<int>()),
+            reinterpret_cast<const int*>(batch_ids_q.data<int>()),
+            param,
+            reinterpret_cast<DataType_*>(q_out.data<data_t>()),
+            reinterpret_cast<DataType_*>(k_out.data<data_t>()),
+            reinterpret_cast<DataType_*>(v_out.data<data_t>()));
+  } else {
+    PD_THROW("Unsupported qk dtype and rope dtype.");
+  }
+  // Surface launch failures to the caller instead of only printing them.
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    PD_THROW("CUDA Error: ", cudaGetErrorString(err));
+  }
+}
+
+std::vector<paddle::Tensor> CacheKVWithRope(
+    const paddle::Tensor& qkv,  // token_num, head_num * head_dim
+    paddle::Tensor&
+        caches_k,  // max_block_num, block_size, kv_head_num, head_dim
+    paddle::Tensor&
+        caches_v,  // max_block_num, block_size, kv_head_num, head_dim
+    const paddle::Tensor& block_tables,  // bs, block_num
+    const paddle::Tensor& rotary_cos,
+    const paddle::Tensor& rotary_sin,
+    const paddle::Tensor& cu_seqlens_q,  // bs + 1
+    const paddle::Tensor& batch_ids_q,   // token_num
+    const int q_head_num,
+    const int kv_head_num,
+    const int head_dim,
+    const int block_size) {
+  auto qkv_shape = qkv.shape();
+  auto token_num = qkv_shape[0];
+  auto place = qkv.place();
+  auto dtype = qkv.dtype();
+  common::DDim q_out_shape, kv_out_shape;
+  if (rotary_cos.shape().size() == 3) {  // rank-3 cos -> rank-3 outputs
+    q_out_shape = {token_num, q_head_num, head_dim};
+    kv_out_shape = {token_num, kv_head_num, head_dim};
+  } else {  // otherwise a unit axis is inserted after the token axis
+    q_out_shape = {token_num, 1, q_head_num, head_dim};
+    kv_out_shape = {token_num, 1, kv_head_num, head_dim};
+  }
+  auto q_out = GetEmptyTensor(q_out_shape, dtype, place);
+  auto k_out = GetEmptyTensor(kv_out_shape, dtype, place);
+  auto v_out = GetEmptyTensor(kv_out_shape, dtype, place);
+  // Nothing to rotate or cache for an empty batch; skip checks and launch.
+  if (token_num == 0) {
+    return {q_out, k_out, v_out};
+  }
+
+  PADDLE_ENFORCE_EQ(qkv_shape.back(),
+                    ((q_head_num + 2 * kv_head_num) * head_dim),
+                    "The last dimension of qkv [%d] must equal to (q_head_num "
+                    "+ 2 * kv_head_num) * head_dim [%d].",
+                    qkv_shape.back(),
+                    ((q_head_num + 2 * kv_head_num) * head_dim));
+  PADDLE_ENFORCE_EQ(
+      head_dim % 2,
+      0,
+      "The last dimension (head_dim) of qkv must be an even number "
+      "for RoPE, but got %d",
+      head_dim);
+  PADDLE_ENFORCE_EQ(q_out.shape().back(),
+                    rotary_cos.shape().back(),
+                    "The last dimension of cos mismatches that of q, "
+                    "expect %d but got %d",
+                    q_out.shape().back(),
+                    rotary_cos.shape().back());
+  // Only BF16 and F16 qkv are instantiated; any other dtype is rejected.
+  switch (dtype) {
+    case paddle::DataType::BFLOAT16:
+      CacheKVWithRopeKernel<paddle::DataType::BFLOAT16>(qkv,
+                                                        caches_k,
+                                                        caches_v,
+                                                        block_tables,
+                                                        rotary_cos,
+                                                        rotary_sin,
+                                                        cu_seqlens_q,
+                                                        batch_ids_q,
+                                                        q_head_num,
+                                                        kv_head_num,
+                                                        head_dim,
+                                                        block_size,
+                                                        q_out,
+                                                        k_out,
+                                                        v_out);
+      break;
+    case paddle::DataType::FLOAT16:
+      CacheKVWithRopeKernel<paddle::DataType::FLOAT16>(qkv,
+                                                       caches_k,
+                                                       caches_v,
+                                                       block_tables,
+                                                       rotary_cos,
+                                                       rotary_sin,
+                                                       cu_seqlens_q,
+                                                       batch_ids_q,
+                                                       q_head_num,
+                                                       kv_head_num,
+                                                       head_dim,
+                                                       block_size,
+                                                       q_out,
+                                                       k_out,
+                                                       v_out);
+      break;
+    default:
+      PD_THROW("Only support qk dtype of BF16 and F16");
+  }
+
+  return {q_out, k_out, v_out};
+}
+
+std::vector<std::vector<int64_t>> CacheKVWithRopeInferShape(
+    const std::vector<int64_t>& qkv_shape,
+    const std::vector<int64_t>& caches_k_shape,
+    const std::vector<int64_t>& caches_v_shape,
+    const std::vector<int64_t>& block_tables_shape,
+    const std::vector<int64_t>& cos_shape,
+    const std::vector<int64_t>& sin_shape,
+    const std::vector<int64_t>& cu_seqlens_q_shape,
+    const std::vector<int64_t>& batch_ids_q_shape) {
+  return {qkv_shape,           // NOTE(review): the eight input shapes are
+          caches_k_shape,      // echoed back, but the op registers three
+          caches_v_shape,      // outputs (q_out, k_out, v_out) whose shapes
+          block_tables_shape,  // CacheKVWithRope builds from token_num, the
+          cos_shape,           // head counts and head_dim. Confirm how the
+          sin_shape,           // framework consumes this list; deriving the
+          cu_seqlens_q_shape,  // three output shapes here would need the op
+          batch_ids_q_shape};  // attrs in this signature.
+}
+
+std::vector<paddle::DataType> CacheKVWithRopeInferDtype(
+    const paddle::DataType& qkv_dtype,
+    const paddle::DataType& caches_k_dtype,
+    const paddle::DataType& caches_v_dtype,
+    const paddle::DataType& block_tables_dtype,
+    const paddle::DataType& cos_dtype,
+    const paddle::DataType& sin_dtype,
+    const paddle::DataType& cu_seqlens_q_dtype,
+    const paddle::DataType& batch_ids_q_dtype) {
+  // Dtype inference for cache_kv_with_rope: one entry per registered output.
+  // The op declares three outputs (q_out, k_out, v_out) and CacheKVWithRope
+  // allocates each of them with qkv.dtype(), so qkv_dtype is reported three
+  // times. Echoing the eight input dtypes instead would not line up with
+  // the output list (block_tables, for example, is an int tensor). The
+  // remaining parameters are unused; they stay so the signature keeps one
+  // dtype argument per op input, in registration order.
+  return {qkv_dtype, qkv_dtype, qkv_dtype};
+}
+
+PD_BUILD_OP(cache_kv_with_rope)  // registers CacheKVWithRope as a custom op
+    .Inputs({"qkv",
+             "caches_k",
+             "caches_v",
+             "block_tables",
+             "rotary_cos",
+             "rotary_sin",
+             "cu_seqlen_q",  // NOTE(review): C++ parameter is cu_seqlens_q
+             "batch_ids_q"})
+    .Outputs({"q_out", "k_out", "v_out"})  // the three tensors returned
+    .Attrs(
+        {"q_head_num:int", "kv_head_num:int", "head_dim:int", "block_size:int"})
+    .SetKernelFn(PD_KERNEL(CacheKVWithRope))
+    .SetInferShapeFn(PD_INFER_SHAPE(CacheKVWithRopeInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(CacheKVWithRopeInferDtype));
diff --git a/custom_ops/metax_ops/fused_moe.cu b/custom_ops/metax_ops/fused_moe.cu
index c1cdf14e7..30a134e0c 100644
--- a/custom_ops/metax_ops/fused_moe.cu
+++ b/custom_ops/metax_ops/fused_moe.cu
@@ -14,9 +14,10 @@
 
 #pragma once
 
-#include "fused_moe_op.h"
+#include "fused_moe_helper.h"
 #include "helper.h"
-#include "mc_fused_moe_helper.h"
+
+namespace phi {
 
 __global__ void compute_total_rows_before_expert_kernel(
     int* sorted_experts,
@@ -42,58 +43,61 @@ void compute_total_rows_before_expert(int* sorted_indices,
       sorted_indices, total_indices, num_experts, total_rows_before_expert);
 }
 
-template <paddle::DataType T,
-          typename ElementA,
-          typename ElementB,
-          typename ElementC>
+}  // namespace phi
+
+template <paddle::DataType T>
 void FusedMoeKernel(const paddle::Tensor& input,
                     const paddle::Tensor& gate_weight,
-                    const paddle::Tensor& ffn1_weight,
-                    const paddle::optional<paddle::Tensor>& ffn1_scale,
-                    const paddle::optional<paddle::Tensor>& ffn1_bias,
-                    const paddle::Tensor& ffn2_weight,
-                    const paddle::optional<paddle::Tensor>& ffn2_scale,
-                    const paddle::optional<paddle::Tensor>& ffn2_bias,
+                    const paddle::Tensor& up_gate_proj_weight,
+                    const paddle::optional<paddle::Tensor>& up_gate_proj_scale,
+                    const paddle::optional<paddle::Tensor>& up_gate_proj_bias,
+                    const paddle::Tensor& down_proj_weight,
+                    const paddle::optional<paddle::Tensor>& down_proj_scale,
+                    const paddle::optional<paddle::Tensor>& down_proj_bias,
                     const std::string& quant_method,
                     const int moe_topk,
                     const bool group_moe,
                     const bool norm_topk_prob,
                     paddle::Tensor* output) {
+  using namespace phi;
   typedef PDTraits<T> traits_;
   typedef typename traits_::DataType DataType_;
   typedef typename traits_::data_t data_t;
 
   auto* output_data = output->data<data_t>();
 
-  auto moe_compute =
-      McMoeHelper<data_t, ElementA, ElementB, ElementC>(quant_method);
+  auto int8_moe_gemm_runner = McMoeGemmRunner<DataType_, int8_t>();
 
-  moe_compute.computeFFN(&input,
-                         &gate_weight,
-                         &ffn1_weight,
-                         ffn1_scale ? ffn1_scale.get_ptr() : nullptr,
-                         ffn1_bias ? ffn1_bias.get_ptr() : nullptr,
-                         &ffn2_weight,
-                         ffn2_scale ? ffn2_scale.get_ptr() : nullptr,
-                         ffn2_bias ? ffn2_bias.get_ptr() : nullptr,
-                         nullptr,
-                         moe_topk,
-                         group_moe,
-                         norm_topk_prob,
-                         1.0,  // ComputeFFN
-                         "ffn",
-                         output);
+  auto moe_compute =
+      McMoeHelper<data_t, DataType_>(quant_method, &int8_moe_gemm_runner);
+
+  moe_compute.computeFFN(
+      &input,
+      &gate_weight,
+      &up_gate_proj_weight,
+      up_gate_proj_scale ? up_gate_proj_scale.get_ptr() : nullptr,
+      up_gate_proj_bias ? up_gate_proj_bias.get_ptr() : nullptr,
+      &down_proj_weight,
+      down_proj_scale ? down_proj_scale.get_ptr() : nullptr,
+      down_proj_bias ? down_proj_bias.get_ptr() : nullptr,
+      nullptr,
+      moe_topk,
+      group_moe,
+      norm_topk_prob,
+      1.0,  // ComputeFFN
+      "ffn",
+      output);
 }
 
 std::vector<paddle::Tensor> FusedExpertMoe(
     const paddle::Tensor& input,
     const paddle::Tensor& gate_weight,
-    const paddle::Tensor& ffn1_weight,
-    const paddle::Tensor& ffn2_weight,
-    const paddle::optional<paddle::Tensor>& ffn1_bias,
-    const paddle::optional<paddle::Tensor>& ffn1_scale,
-    const paddle::optional<paddle::Tensor>& ffn2_bias,
-    const paddle::optional<paddle::Tensor>& ffn2_scale,
+    const paddle::Tensor& up_gate_proj_weight,
+    const paddle::Tensor& down_proj_weight,
+    const paddle::optional<paddle::Tensor>& up_gate_proj_bias,
+    const paddle::optional<paddle::Tensor>& up_gate_proj_scale,
+    const paddle::optional<paddle::Tensor>& down_proj_bias,
+    const paddle::optional<paddle::Tensor>& down_proj_scale,
     const std::string& quant_method,
     const int moe_topk,
     const bool norm_topk_prob,
@@ -107,40 +111,22 @@ std::vector<paddle::Tensor> FusedExpertMoe(
 
   switch (input_type) {
     case paddle::DataType::BFLOAT16:
-      FusedMoeKernel<paddle::DataType::BFLOAT16,
-                     maca_bfloat16,
-                     int8_t,
-                     maca_bfloat16>(input,
-                                    gate_weight,
-                                    ffn1_weight,
-                                    ffn1_scale,
-                                    ffn1_bias,
-                                    ffn2_weight,
-                                    ffn2_scale,
-                                    ffn2_bias,
-                                    quant_method,
-                                    moe_topk,
-                                    group_moe,
-                                    norm_topk_prob,
-                                    &output);
+      FusedMoeKernel<paddle::DataType::BFLOAT16>(input,
+                                                 gate_weight,
+                                                 up_gate_proj_weight,
+                                                 up_gate_proj_scale,
+                                                 up_gate_proj_bias,
+                                                 down_proj_weight,
+                                                 down_proj_scale,
+                                                 down_proj_bias,
+                                                 quant_method,
+                                                 moe_topk,
+                                                 group_moe,
+                                                 norm_topk_prob,
+                                                 &output);
       break;
-    // case paddle::DataType::FLOAT16:
-    //   FusedMoeKernel<paddle::DataType::FLOAT16>(input,
-    //                                             gate_weight,
-    //                                             ffn1_weight,
-    //                                             ffn1_scale,
-    //                                             ffn1_bias,
-    //                                             ffn2_weight,
-    //                                             ffn2_scale,
-    //                                             ffn2_bias,
-    //                                             quant_method,
-    //                                             moe_topk,
-    //                                             group_moe,
-    //                                             norm_topk_prob,
-    //                                             &output);
-    //   break;
     default:
-      PD_THROW("Only support bf16 for FusedMoeKernel");
+      PD_THROW("Unsupported data type for FusedMoeKernel");
   }
   return {output};
 }
@@ -148,36 +134,36 @@ std::vector<paddle::Tensor> FusedExpertMoe(
 std::vector<std::vector<int64_t>> FusedExpertMoeInferShape(
     const std::vector<int64_t>& input_shape,
     const std::vector<int64_t>& gate_weight_shape,
-    const std::vector<int64_t>& ffn1_weight_shape,
-    const std::vector<int64_t>& ffn2_weight_shape,
-    const paddle::optional<std::vector<int64_t>>& ffn1_bias_shape,
-    const paddle::optional<std::vector<int64_t>>& ffn1_scale_shape,
-    const paddle::optional<std::vector<int64_t>>& ffn2_bias_shape,
-    const paddle::optional<std::vector<int64_t>>& ffn2_scale_shape) {
+    const std::vector<int64_t>& up_gate_proj_weight_shape,
+    const std::vector<int64_t>& down_proj_weight_shape,
+    const paddle::optional<std::vector<int64_t>>& up_gate_proj_bias_shape,
+    const paddle::optional<std::vector<int64_t>>& up_gate_proj_scale_shape,
+    const paddle::optional<std::vector<int64_t>>& down_proj_bias_shape,
+    const paddle::optional<std::vector<int64_t>>& down_proj_scale_shape) {
   return {input_shape};
 }
 
 std::vector<paddle::DataType> FusedExpertMoeInferDtype(
     const paddle::DataType& input_dtype,
     const paddle::DataType& gate_weight_dtype,
-    const paddle::DataType& ffn1_weight_dtype,
-    const paddle::DataType& ffn2_weight_dtype,
-    const paddle::optional<paddle::DataType>& ffn1_bias_dtype,
-    const paddle::optional<paddle::DataType>& ffn1_scale_dtype,
-    const paddle::optional<paddle::DataType>& ffn2_bias_dtype,
-    const paddle::optional<paddle::DataType>& ffn2_scale_dtype) {
+    const paddle::DataType& up_gate_proj_weight_dtype,
+    const paddle::DataType& down_proj_weight_dtype,
+    const paddle::optional<paddle::DataType>& up_gate_proj_bias_dtype,
+    const paddle::optional<paddle::DataType>& up_gate_proj_scale_dtype,
+    const paddle::optional<paddle::DataType>& down_proj_bias_dtype,
+    const paddle::optional<paddle::DataType>& down_proj_scale_dtype) {
   return {input_dtype};
 }
 
-PD_BUILD_OP(fused_expert_moe)
+PD_BUILD_STATIC_OP(fused_expert_moe)
     .Inputs({"input",
              "gate_weight",
-             "ffn1_weight",
-             "ffn2_weight",
-             paddle::Optional("ffn1_bias"),
-             paddle::Optional("ffn1_scale"),
-             paddle::Optional("ffn2_bias"),
-             paddle::Optional("ffn2_scale")})
+             "up_gate_proj_weight",
+             "down_proj_weight",
+             paddle::Optional("up_gate_proj_bias"),
+             paddle::Optional("up_gate_proj_scale"),
+             paddle::Optional("down_proj_bias"),
+             paddle::Optional("down_proj_scale")})
     .Outputs({"output"})
     .Attrs({"quant_method:std::string",
             "moe_topk:int",
diff --git a/custom_ops/metax_ops/fused_moe_gemm_kernels.h b/custom_ops/metax_ops/fused_moe_gemm_kernels.h
new file mode 100644
index 000000000..177f81307
--- /dev/null
+++ b/custom_ops/metax_ops/fused_moe_gemm_kernels.h
@@ -0,0 +1,199 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "mctlass/numeric_conversion.h"
+#include "mctlassEx/mctlassEx.h"
+
+namespace phi {
+
+template <typename T>
+struct mctlassExDataTraits;
+
+template <>
+struct mctlassExDataTraits<maca_bfloat16> {
+  static constexpr mctlassExDataType type =
+      mctlassExDataType::MCTLASS_EX_DATATYPE_BF16;
+};
+
+template <>
+struct mctlassExDataTraits<int8_t> {
+  static constexpr mctlassExDataType type =
+      mctlassExDataType::MCTLASS_EX_DATATYPE_INT8;
+};
+
+template <typename T, typename WeightType>
+class McMoeGemmRunner {
+ public:
+  McMoeGemmRunner() {}
+  // Runs one contiguous grouped GEMM over all experts through mctlassEx.
+  void mc_grouped_gemm_basic_kernel(const T* ptrA,
+                                    mctlassExOrder_t majorA,
+                                    const WeightType* ptrB,
+                                    mctlassExOrder_t majorB,
+                                    const T* ptrScale,
+                                    const T* ptrBias,  // may be null: no bias
+                                    T* ptrC,
+                                    mctlassExOrder_t majorC,
+                                    const int* ptrSegInd,
+                                    int* ptrMNumTilesInd,
+                                    int numExperts,
+                                    int m,  // expanded_active_expert_rows
+                                    int n,  // inter_dim
+                                    int k,  // hidden_size
+                                    mcStream_t stream) {
+    mctlassExHandle_t handle;  // created and destroyed on every call
+    mctlassExHandleCreate(&handle);
+    // NOTE(review): results of the mctlassEx* calls below are not inspected.
+    mctlassExDataType DataType_ = mctlassExDataTraits<T>::type;
+    mctlassExDataType WeightType_ = mctlassExDataTraits<WeightType>::type;
+
+    mctlassExMatrixLayout_t matLayoutA;
+    mctlassExMatrixLayout_t matLayoutB;
+    mctlassExMatrixLayout_t matLayoutC;
+
+    // mat A: (m, k)
+    mctlassExMatrixLayoutCreate(&matLayoutA, DataType_, m, k, k);
+    mctlassExMatrixLayoutSetAttribute(
+        matLayoutA,
+        mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
+        &majorA,
+        sizeof(mctlassExOrder_t));
+    mctlassExMatrixLayoutSetAttribute(
+        matLayoutA,
+        mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_BATCH_COUNT,
+        &numExperts,
+        sizeof(int));
+    // mat B: (num_experts, n, k)
+    mctlassExMatrixLayoutCreate(&matLayoutB, WeightType_, k, n, k);
+    mctlassExMatrixLayoutSetAttribute(
+        matLayoutB,
+        mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
+        &majorB,
+        sizeof(mctlassExOrder_t));
+    mctlassExMatrixLayoutSetAttribute(
+        matLayoutB,
+        mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_BATCH_COUNT,
+        &numExperts,
+        sizeof(int));
+    // mat C: (m, n)
+    mctlassExMatrixLayoutCreate(&matLayoutC, DataType_, m, n, n);
+    mctlassExMatrixLayoutSetAttribute(
+        matLayoutC,
+        mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
+        &majorC,
+        sizeof(mctlassExOrder_t));
+    mctlassExMatrixLayoutSetAttribute(
+        matLayoutC,
+        mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_BATCH_COUNT,
+        &numExperts,
+        sizeof(int));
+    // bias: (num_experts, n)
+    // scale: (num, n)
+
+    mctlassExDesc_t mctlass_desc;
+    mctlassExCreateDesc(&mctlass_desc);
+    mctlassExDataType input_type = DataType_;
+    mctlassExDataType scale_type = WeightType_;  // NOTE(review): never read
+    mctlassExDataType compute_type =
+        mctlassExDataType::MCTLASS_EX_DATATYPE_FP32;
+    mctlassExEpilogueType epilogue_type =
+        mctlassExEpilogueType::MCTLASS_EX_EPILOGUE_TYPE_DEFAULT;
+    if (ptrBias) {
+      epilogue_type = mctlassExEpilogueType::MCTLASS_EX_EPILOGUE_TYPE_BIAS;
+    }
+    // set scale (the scale type is taken from input_type, i.e. T)
+    mctlassExDescSetAttribute(
+        mctlass_desc,
+        mctlassExDescAttributes_t::MCTLASS_EX_DESC_B_SCALE_POINTER,
+        &ptrScale,
+        sizeof(ptrScale));
+    mctlassExDescSetAttribute(
+        mctlass_desc,
+        mctlassExDescAttributes_t::MCTLASS_EX_DESC_B_SCALE_TYPE,
+        &input_type,
+        sizeof(mctlassExDataType));
+    // set bias
+    if (ptrBias) {
+      mctlassExDescSetAttribute(
+          mctlass_desc,
+          mctlassExDescAttributes_t::MCTLASS_EX_DESC_BIAS_POINTER,
+          &ptrBias,
+          sizeof(ptrBias));
+    }
+    // set compute type
+    mctlassExDescSetAttribute(
+        mctlass_desc,
+        mctlassExDescAttributes_t::MCTLASS_EX_DESC_COMPUTE_TYPE,
+        &compute_type,
+        sizeof(mctlassExDataType));
+    // set epilogue type
+    mctlassExDescSetAttribute(
+        mctlass_desc,
+        mctlassExDescAttributes_t::MCTLASS_EX_DESC_EPILOGUE_TYPE,
+        &epilogue_type,
+        sizeof(mctlassExEpilogueType));
+
+    const mctlassExContiguousGroupedGemmAlgo_t algo =
+        mctlassExContiguousGroupedGemmAlgo_t::
+            MCTLASS_EX_CONTIGUOUS_GROUPED_ALGO_DEFAULT;
+    mctlassExContiguousGroupedDesc_t contiguous_group_desc;
+    mctlassExContiguousGroupedDescCreate(
+        &contiguous_group_desc, ptrSegInd, nullptr, ptrMNumTilesInd, 1);
+    int blocksizeM;
+    mctlassExContiguousGroupedGemmGetBlocksizeM(handle,
+                                                mctlass_desc,
+                                                matLayoutA,
+                                                matLayoutB,
+                                                matLayoutC,
+                                                &algo,
+                                                &blocksizeM);
+    mctlassExContiguousGroupedGemmComputeMNumTilesIndptr(handle,
+                                                         mctlass_desc,
+                                                         matLayoutA,
+                                                         matLayoutB,
+                                                         matLayoutC,
+                                                         &algo,
+                                                         contiguous_group_desc,
+                                                         numExperts,
+                                                         blocksizeM,
+                                                         stream);
+    // Launch the grouped GEMM on `stream` (no workspace is passed).
+    mctlassExContiguousGroupedGemmBasic(handle,
+                                        mctlass_desc,
+                                        ptrA,
+                                        matLayoutA,
+                                        ptrB,
+                                        matLayoutB,
+                                        ptrC,
+                                        matLayoutC,
+                                        contiguous_group_desc,
+                                        &algo,
+                                        nullptr,
+                                        0,
+                                        stream);
+    // Release every object created above before returning.
+    mctlassExHandleDestroy(handle);
+    mctlassExMatrixLayoutDestroy(matLayoutA);
+    mctlassExMatrixLayoutDestroy(matLayoutB);
+    mctlassExMatrixLayoutDestroy(matLayoutC);
+    mctlassExContiguousGroupedDescDestroy(contiguous_group_desc);
+    mctlassExDestroyDesc(mctlass_desc);
+  }
+};
+
+template class McMoeGemmRunner<maca_bfloat16, int8_t>;
+
+}  // namespace phi
diff --git a/custom_ops/metax_ops/fused_moe_helper.h b/custom_ops/metax_ops/fused_moe_helper.h
index 67c616ce4..9d7842ce2 100644
--- a/custom_ops/metax_ops/fused_moe_helper.h
+++ b/custom_ops/metax_ops/fused_moe_helper.h
@@ -14,14 +14,17 @@
 
 #pragma once
 
-#include "cutlass_kernels/moe_gemm/fused_moe_gemm_kernels.h"
+#include "fused_moe_gemm_kernels.h"
+#include "fused_moe_imp_op.h"
 #include "fused_moe_op.h"
+#include "mctlass/numeric_conversion.h"
+#include "mctlassEx/mctlassEx.h"
 
-using namespace phi;
+namespace phi {
 
 template <typename T, int VecSize>
-__global__ void moe_token_type_ids_kernel(T *gating_output,
-                                          const int *moe_token_type_ids_out,
+__global__ void moe_token_type_ids_kernel(T* gating_output,
+                                          const int* moe_token_type_ids_out,
                                           const int num_rows,
                                           const int num_experts,
                                           const int k) {
@@ -40,8 +43,8 @@ __global__ void moe_token_type_ids_kernel(T *gating_output,
 }
 
 template <typename T>
-void moe_token_type_ids_kernelLauncher(T *gating_output,
-                                       const int *moe_token_type_ids_out,
+void moe_token_type_ids_kernelLauncher(T* gating_output,
+                                       const int* moe_token_type_ids_out,
                                        const int num_rows,
                                        const int num_experts,
                                        const int k,
@@ -51,3 +54,338 @@ void moe_token_type_ids_kernelLauncher(T *gating_output,
   moe_token_type_ids_kernel<T, 1><<<blocks, 512, 0, stream>>>(
       gating_output, moe_token_type_ids_out, num_rows, num_experts, k);
 }
+
+template <typename T, typename MacaType>
+class McMoeHelper {  // Host-side driver for the weight-only-int8 MoE FFN path.
+ public:
+  McMoeHelper(const std::string gemm_method,
+              McMoeGemmRunner<MacaType, int8_t>* int8_moe_gemm_runner)
+      : int8_moe_gemm_runner_(int8_moe_gemm_runner),  // declaration order
+        gemm_method_(gemm_method) {}
+
+  // Returns the byte size of the scratch buffer that computeFFN partitions.
+  template <typename KeyT>
+  size_t getWorkspaceSize(const int64_t num_rows,
+                          const int64_t hidden_size,
+                          const int64_t inter_size,
+                          const int64_t num_experts,
+                          const int64_t k) {
+    const size_t buf_size = AlignTo16(k * num_rows * hidden_size);
+    const size_t interbuf_size = AlignTo16(k * num_rows * inter_size);
+    const size_t padded_experts = AlignTo16(num_experts);
+    const size_t num_moe_inputs = AlignTo16(k * num_rows);
+    // Five int arrays of num_moe_inputs entries each: expert_for_source_row,
+    // source_rows_, permuted_rows_, permuted_experts_ and
+    // expanded_source_row_to_expanded_dest_row.
+    size_t total_ws_bytes =
+        5 * num_moe_inputs *
+        sizeof(int);  // the five index arrays listed above
+    total_ws_bytes += buf_size * sizeof(KeyT);  // permuted_data
+    total_ws_bytes +=
+        padded_experts * sizeof(int32_t);  // Hold total_rows_before_expert_
+    const size_t bytes_for_fc1_result = interbuf_size * sizeof(KeyT);
+    // Size the sorter for the k * num_rows pairs computeFFN actually sorts.
+    sorter_.update_num_experts(num_experts);
+    const size_t sorter_ws_size_bytes =
+        AlignTo16(sorter_.getWorkspaceSize(k * num_rows));
+
+    int64_t bytes_for_intermediate_and_sorting = bytes_for_fc1_result;
+    if (sorter_ws_size_bytes > bytes_for_fc1_result) {
+      int64_t remaining_bytes =
+          AlignTo16(sorter_ws_size_bytes - bytes_for_fc1_result);
+      bytes_for_intermediate_and_sorting += remaining_bytes;
+    }
+
+    total_ws_bytes +=
+        bytes_for_intermediate_and_sorting;  // intermediate (fc1) output + cub
+                                             // sorting workspace
+    // A separate softmax buffer is reserved only for these expert counts.
+    int64_t num_softmax_outs = 0;
+    const bool is_pow_2 =
+        (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
+    if (!is_pow_2 || num_experts > 256) {
+      num_softmax_outs = AlignTo16(num_rows * num_experts);
+    }
+
+    total_ws_bytes += num_softmax_outs * sizeof(float);
+
+    return total_ws_bytes;
+  }
+  // Gating matmul, top-k routing, expert GEMM(s), then finalize into output.
+  void computeFFN(const paddle::Tensor* input,
+                  const paddle::Tensor* gate_weight,
+                  const paddle::Tensor* up_gate_proj_weight,
+                  const paddle::Tensor* up_gate_proj_scale,
+                  const paddle::Tensor* up_gate_proj_bias,
+                  const paddle::Tensor* down_proj_weight,
+                  const paddle::Tensor* down_proj_scale,
+                  const paddle::Tensor* down_proj_bias,
+                  const paddle::Tensor* moe_token_type_ids,
+                  const int moe_topk,
+                  const bool group_moe,
+                  const bool norm_topk_prob,
+                  const float routed_scaling_factor,
+                  const std::string moe_type,
+                  paddle::Tensor* output) {
+    auto* input_activations = input->data<T>();
+    auto* gating_weights = gate_weight->data<float>();
+    const T* fc1_expert_biases =
+        up_gate_proj_bias ? up_gate_proj_bias->data<T>() : nullptr;
+    const T* fc2_expert_biases =
+        down_proj_bias ? down_proj_bias->data<T>() : nullptr;
+
+    auto* output_ = output->data<T>();
+    auto stream = input->stream();
+    auto place = input->place();
+    auto input_type = input->dtype();
+
+    auto input_dims = input->dims();
+    auto up_gate_proj_dims = up_gate_proj_weight->dims();
+    int64_t token_num = 0;
+    if (input_dims.size() == 3) {
+      token_num = input_dims[0] * input_dims[1];
+    } else {
+      token_num = input_dims[0];
+    }
+    const int64_t num_rows = token_num;
+    // up_gate_proj_weight dims are read as [num_experts, inter, hidden, ...].
+    const int64_t hidden_size = up_gate_proj_dims[2];
+    int64_t inter_dim = 0;
+    if (moe_type == "qkv") {
+      inter_dim =
+          up_gate_proj_dims[2] * up_gate_proj_dims[3] * up_gate_proj_dims[4];
+    } else {
+      inter_dim = up_gate_proj_dims[1];
+    }
+
+    // if (gemm_method_ == "weight_only_int4") {
+    //   inter_dim = inter_dim * 2;
+    // }
+
+    const int64_t inter_size = inter_dim;
+    const int64_t num_experts = up_gate_proj_dims[0];
+    const int64_t k = moe_topk;
+
+    int64_t bytes =
+        getWorkspaceSize<T>(num_rows, hidden_size, inter_size, num_experts, k);
+
+    // Sub-buffers carved out of the single workspace allocation below.
+    int* expert_for_source_row;
+    int* source_rows_;
+    int* permuted_rows_;
+    int* permuted_experts_;
+    int* expanded_source_row_to_expanded_dest_row;
+
+    T* permuted_data_;
+    int32_t* total_rows_before_expert_;
+    T* fc1_result_;
+    float* softmax_out_;
+
+    paddle::Tensor ws_ptr_tensor =
+        GetEmptyTensor({bytes}, paddle::DataType::INT8, place);
+    int8_t* ws_ptr = ws_ptr_tensor.data<int8_t>();
+
+    const int64_t buf_size = AlignTo16(k * num_rows * hidden_size);
+    const int64_t interbuf_size = AlignTo16(k * num_rows * inter_size);
+    const int64_t padded_experts = AlignTo16(num_experts);
+    const int64_t num_moe_inputs = AlignTo16(k * num_rows);
+    // This layout has to stay in step with getWorkspaceSize's byte accounting.
+    expert_for_source_row = reinterpret_cast<int*>(ws_ptr);
+    source_rows_ = expert_for_source_row + num_moe_inputs;
+    permuted_rows_ = source_rows_ + num_moe_inputs;
+    permuted_experts_ = permuted_rows_ + num_moe_inputs;
+    expanded_source_row_to_expanded_dest_row =
+        permuted_experts_ + num_moe_inputs;
+    permuted_data_ = reinterpret_cast<T*>(
+        expanded_source_row_to_expanded_dest_row + num_moe_inputs);
+    total_rows_before_expert_ =
+        reinterpret_cast<int32_t*>(permuted_data_ + buf_size);
+    fc1_result_ =
+        reinterpret_cast<T*>(total_rows_before_expert_ + padded_experts);
+
+    const bool is_pow_2 =
+        (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
+    if (!is_pow_2 || num_experts > 256) {
+      softmax_out_ = reinterpret_cast<float*>(fc1_result_ + interbuf_size);
+    } else {
+      softmax_out_ = nullptr;
+    }
+
+    paddle::Tensor expert_scales_float_tensor =
+        GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
+    float* expert_scales_float = expert_scales_float_tensor.data<float>();
+    // Function scope: the pointer is handed to a launcher after the if-block.
+    paddle::Tensor softmax_max_prob_tensor;
+    float* softmax_max_prob = nullptr;
+    if (group_moe) {
+      softmax_max_prob_tensor = GetEmptyTensor(
+          {num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
+      paddle::experimental::fill(softmax_max_prob_tensor, 0.f);  // TODO: check
+      softmax_max_prob = softmax_max_prob_tensor.data<float>();
+    }
+
+    paddle::Tensor fc1_out_tensor =
+        GetEmptyTensor({num_rows * k, inter_size}, input_type, place);
+    T* fc1_out = fc1_out_tensor.data<T>();
+
+    auto input_cast_tensor =
+        paddle::experimental::cast(*input, paddle::DataType::FLOAT32);
+    auto gate_tensor =
+        paddle::experimental::matmul(input_cast_tensor, *gate_weight);
+    float* gating_output = gate_tensor.data<float>();
+
+    if (moe_token_type_ids) {
+      auto* moe_token_type_ids_out = moe_token_type_ids->data<int>();
+      moe_token_type_ids_kernelLauncher<float>(gating_output,
+                                               moe_token_type_ids_out,
+                                               num_rows,
+                                               num_experts,
+                                               k,
+                                               stream);
+    }
+
+    topk_gating_softmax_kernelLauncher<float, int>(gating_output,
+                                                   nullptr,
+                                                   expert_scales_float,
+                                                   softmax_out_,
+                                                   expert_for_source_row,
+                                                   source_rows_,
+                                                   softmax_max_prob,
+                                                   num_rows,
+                                                   num_experts,
+                                                   k,
+                                                   group_moe,
+                                                   stream);
+
+    const int64_t sorter_ws_size_bytes =
+        AlignTo16(sorter_.getWorkspaceSize(int64_t(k * num_rows)));
+    // fc1_result_ is passed as the sorter's buffer; k * num_rows pairs sorted.
+    sorter_.run(fc1_result_,
+                sorter_ws_size_bytes,
+                expert_for_source_row,
+                permuted_experts_,
+                source_rows_,
+                permuted_rows_,
+                k * num_rows,
+                false,
+                stream);
+
+    initialize_moe_routing_kernelLauncher(
+        input_activations,
+        permuted_data_,
+        permuted_rows_,
+        nullptr,
+        nullptr,
+        expanded_source_row_to_expanded_dest_row,
+        num_rows,
+        num_rows,
+        hidden_size,
+        k,
+        stream);
+
+    const int64_t expanded_active_expert_rows = k * num_rows;
+
+    compute_total_rows_before_expert(permuted_experts_,
+                                     expanded_active_expert_rows,
+                                     num_experts,
+                                     total_rows_before_expert_,
+                                     stream);
+
+    mctlassExOrder_t row_major = mctlassExOrder_t::MCTLASS_EX_ORDER_ROW_MAJOR;
+    mctlassExOrder_t column_major =
+        mctlassExOrder_t::MCTLASS_EX_ORDER_COLUMN_MAJOR;
+    auto m_num_tile =
+        GetEmptyTensor({num_experts}, paddle::DataType::INT32, place);
+    int* m_num_tile_ptr = reinterpret_cast<int*>(m_num_tile.data<int>());
+    // Only "weight_only_int8" is implemented; any other method throws.
+    if (gemm_method_ == "weight_only_int8") {
+      int8_moe_gemm_runner_->mc_grouped_gemm_basic_kernel(
+          reinterpret_cast<const MacaType*>(permuted_data_),
+          row_major,
+          reinterpret_cast<const int8_t*>(up_gate_proj_weight->data<int8_t>()),
+          column_major,
+          reinterpret_cast<const MacaType*>(up_gate_proj_scale->data<T>()),
+          reinterpret_cast<const MacaType*>(fc1_expert_biases),
+          reinterpret_cast<MacaType*>(fc1_out),
+          row_major,
+          total_rows_before_expert_,
+          m_num_tile_ptr,
+          num_experts,
+          expanded_active_expert_rows,
+          inter_size,
+          hidden_size,
+          stream);
+    } else {
+      throw std::runtime_error("Unsupported gemm method: " + gemm_method_);
+    }
+    // "ffn": swiglu + second GEMM; otherwise the first GEMM output is final.
+    if (moe_type == "ffn") {
+      auto act_out_tensor =
+          paddle::experimental::swiglu(fc1_out_tensor, nullptr);
+      auto act_out = act_out_tensor.data<T>();
+
+      paddle::Tensor fc2_output_tensor =
+          GetEmptyTensor({k * num_rows, hidden_size}, input_type, place);
+      T* fc2_result = fc2_output_tensor.data<T>();
+
+      if (gemm_method_ == "weight_only_int8") {
+        int8_moe_gemm_runner_->mc_grouped_gemm_basic_kernel(
+            reinterpret_cast<const MacaType*>(act_out),
+            row_major,
+            reinterpret_cast<const int8_t*>(down_proj_weight->data<int8_t>()),
+            column_major,
+            reinterpret_cast<const MacaType*>(down_proj_scale->data<T>()),
+            nullptr,
+            reinterpret_cast<MacaType*>(fc2_result),
+            row_major,
+            total_rows_before_expert_,
+            m_num_tile_ptr,
+            num_experts,
+            expanded_active_expert_rows,
+            hidden_size,
+            inter_size / 2,
+            stream);
+      } else {
+        throw std::runtime_error("Unsupported gemm method: " + gemm_method_);
+      }
+
+      finalize_moe_routing_kernelLauncher(
+          fc2_result,
+          output_,
+          fc2_expert_biases,
+          reinterpret_cast<float*>(expert_scales_float),
+          expanded_source_row_to_expanded_dest_row,
+          expert_for_source_row,
+          num_rows,
+          hidden_size,
+          k,
+          static_cast<int>(1),
+          norm_topk_prob,
+          routed_scaling_factor,
+          stream);
+    } else {
+      finalize_moe_routing_kernelLauncher(
+          // fc2_result,
+          fc1_out,
+          output_,
+          fc1_expert_biases,  // fc2_expert_biases,
+          reinterpret_cast<float*>(expert_scales_float),
+          expanded_source_row_to_expanded_dest_row,
+          expert_for_source_row,
+          num_rows,
+          inter_size,
+          k,
+          static_cast<int>(0),
+          norm_topk_prob,
+          routed_scaling_factor,
+          stream);
+    }
+  }
+
+ private:
+  McMoeGemmRunner<MacaType, int8_t>* int8_moe_gemm_runner_;
+  std::string gemm_method_;
+  CubKeyValueSorter sorter_;
+};
+
+}  // namespace phi
diff --git a/custom_ops/metax_ops/fused_moe_imp_op.h b/custom_ops/metax_ops/fused_moe_imp_op.h
index 99aabaf8a..3108df789 100644
--- a/custom_ops/metax_ops/fused_moe_imp_op.h
+++ b/custom_ops/metax_ops/fused_moe_imp_op.h
@@ -20,6 +20,8 @@
 #include <string>
 #include "cub/cub.cuh"
 
+namespace phi {
+
 static const float HALF_FLT_MAX = 65504.F;
 static const float HALF_FLT_MIN = -65504.F;
 static inline size_t AlignTo16(const size_t& input) {
@@ -121,3 +123,5 @@ class CubKeyValueSorter {
   int num_experts_;
   int num_bits_;
 };
+
+}  // namespace phi
diff --git a/custom_ops/metax_ops/fused_moe_op.h b/custom_ops/metax_ops/fused_moe_op.h
index 00ed38115..1a7d32cdb 100644
--- a/custom_ops/metax_ops/fused_moe_op.h
+++ b/custom_ops/metax_ops/fused_moe_op.h
@@ -1,28 +1,27 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
- * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+// /*
+//  * SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION &
+//  * AFFILIATES. All rights reserved. SPDX-License-Identifier: Apache-2.0
+//  *
+//  * Licensed under the Apache License, Version 2.0 (the "License");
+//  * you may not use this file except in compliance with the License.
+//  * You may obtain a copy of the License at
+//  *
+//  * http://www.apache.org/licenses/LICENSE-2.0
+//  *
+//  * Unless required by applicable law or agreed to in writing, software
+//  * distributed under the License is distributed on an "AS IS" BASIS,
+//  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//  * See the License for the specific language governing permissions and
+//  * limitations under the License.
+//  */
 
 #pragma once
 
 #include <cuda.h>
 #include <cuda_fp16.h>
-#include "fused_moe_helper.h"
-#include "fused_moe_imp_op.h"
-#include "mctlass/numeric_conversion.h"  // BUILD_MARK
-// Ignore mctlass warnings about type punning
+#include "mctlass/functional.h"
+#include "mctlass/numeric_conversion.h"
+// Ignore CUTLASS warnings about type punning
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
 #pragma GCC diagnostic ignored "-Wunused-function"
@@ -34,6 +33,8 @@
 
 #define WARP_SIZE 32
 
+namespace phi {
+
 struct GpuLaunchConfig {
   dim3 block_per_grid;
   dim3 thread_per_block;
@@ -55,6 +56,324 @@ inline GpuLaunchConfig Get1DBlocksAnd2DGridsMoe(const int64_t cols) {
   return config;
 }
 
+constexpr static int FINALIZE_THREADS_PER_BLOCK = 256;
+template <class T, class U>  // element-wise static_cast of array-like T into U
+__host__ __device__ constexpr static U arrayConvert(T const& input) {
+  static_assert(T::kElements == U::kElements);  // element counts must match
+  typedef typename U::Element DstElem;  // element type of the destination
+  U converted;
+#pragma unroll
+  for (int pos = 0; pos < U::kElements; ++pos) {
+    converted[pos] = static_cast<DstElem>(input[pos]);
+  }
+  return converted;
+}
+
+struct uint8 {  // pair of uint4; BytesToType<32> asserts sizeof is 32 bytes
+  uint4 u;  // first uint4 member
+  uint4 v;  // second uint4 member
+};
+
+template <int BYTES>
+struct BytesToType {};  // primary template: no Type for unlisted widths
+// 32 bytes -> the two-uint4 struct uint8.
+template <>
+struct BytesToType<32> {
+  typedef uint8 Type;
+  static_assert(sizeof(Type) == 32);
+};
+// 16 bytes -> uint4.
+template <>
+struct BytesToType<16> {
+  typedef uint4 Type;
+  static_assert(sizeof(Type) == 16);
+};
+// 8 bytes -> uint64_t.
+template <>
+struct BytesToType<8> {
+  typedef uint64_t Type;
+  static_assert(sizeof(Type) == 8);
+};
+// 4 bytes -> uint32_t.
+template <>
+struct BytesToType<4> {
+  typedef uint32_t Type;
+  static_assert(sizeof(Type) == 4);
+};
+// 2 bytes -> uint16_t.
+template <>
+struct BytesToType<2> {
+  typedef uint16_t Type;
+  static_assert(sizeof(Type) == 2);
+};
+// 1 byte -> uint8_t.
+template <>
+struct BytesToType<1> {
+  typedef uint8_t Type;
+  static_assert(sizeof(Type) == 1);
+};
+
+template <template <typename> class ReductionOp, typename T, int block_size>
+__inline__ __device__ T BlockAllReduce(T val) {  // returns one value to all
+  typedef cub::BlockReduce<T, block_size> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage temp_storage;
+  __shared__ T result_broadcast;  // slot that carries thread 0's value
+  T result = BlockReduce(temp_storage).Reduce(val, ReductionOp<T>());
+  if (threadIdx.x == 0) {  // only thread 0's result is published
+    result_broadcast = result;
+  }
+  __syncthreads();  // barrier between the write above and the reads below
+  return result_broadcast;
+}
+
+template <typename T>
+struct SumOp {  // addition functor; used as ReductionOp for BlockAllReduce
+  __device__ __forceinline__ T operator()(T const& lhs, T const& rhs) {
+    return lhs + rhs;  // relies on T's operator+
+  }
+};
+
+template <typename InType, typename OutType>  // scale, clip, cast to OutType
+__forceinline__ __device__ OutType QuantHelperFunc(const InType input,
+                                                   const float scale,
+                                                   const float max_bound,
+                                                   const float min_bound) {
+  const float gain = max_bound * scale;  // multiplier applied before clipping
+  const float clipped = ClipFunc<float>(
+      gain * static_cast<float>(input), min_bound, max_bound);
+  return static_cast<OutType>(clipped);
+}
+
+template <typename T, typename OutT, int VecSize, int Kthread>
+__global__ void masked_quantize_moe_input_kernel(  // quantize valid rows only
+    const T* permuted_inputs,             // rows of `dim` elements, read by row
+    const int64_t* expert_idx_per_token,  // per row: index into quant_scales
+    const float* quant_scales,
+    const float quant_max_bound,
+    const float quant_min_bound,
+    const int64_t token_num,  // upper bound of the row loop
+    const int64_t dim,        // elements per row; loop covers dim / VecSize
+    float* permuted_input_row_sum,      // per row: scaled sum of quantized row
+    const int64_t* recv_expert_count,   // per expert: number of rows processed
+    const int num_max_tokens_per_expert,  // row slots per expert region
+    OutT* out) {                          // quantized rows, same offsets as in
+  using LoadT = AlignedVector<T, VecSize>;
+  using LoadOutT = AlignedVector<OutT, VecSize>;
+  LoadT input_vec;
+  LoadOutT output_vec;
+  float scale_factor = -7.0f / 512.0f;  // applied to every row sum
+  using vec_t = typename BytesToType<sizeof(OutT) * VecSize>::Type;
+  for (int token_idx = blockIdx.x; token_idx < token_num;
+       token_idx += gridDim.x) {  // one row per block per iteration
+    const auto token_idx_in_expert = token_idx % num_max_tokens_per_expert;
+    const auto expert_id = token_idx / num_max_tokens_per_expert;
+    if (token_idx_in_expert >= recv_expert_count[expert_id]) {  // unused slot
+      auto next_expert_start_idx = (expert_id + 1) * num_max_tokens_per_expert;
+      auto num_iters_to_next_expert =  // whole strides that can be skipped
+          (next_expert_start_idx - token_idx - 1) / gridDim.x;
+      token_idx += num_iters_to_next_expert * gridDim.x;
+      continue;  // loop increment then lands at or past the next region start
+    }
+    int64_t expert_idx = expert_idx_per_token[token_idx];
+    float quant_scale = quant_scales[expert_idx];
+    float thread_row_sum = 0.0f;  // this thread's share of the row sum
+    for (int idx = threadIdx.x; idx < dim / VecSize; idx += blockDim.x) {
+      int64_t offset = token_idx * dim + idx * VecSize;
+      Load<T, VecSize>(&permuted_inputs[offset], &input_vec);
+#pragma unroll
+      for (int i = 0; i < VecSize; i++) {
+        output_vec[i] = QuantHelperFunc<T, OutT>(
+            input_vec[i], quant_scale, quant_max_bound, quant_min_bound);
+        thread_row_sum += static_cast<float>(output_vec[i]);  // sum quantized
+      }
+      *(reinterpret_cast<vec_t*>(&out[offset])) =  // one vec_t-wide store
+          *(reinterpret_cast<const vec_t*>(&output_vec));
+    }
+    float block_row_sum = BlockAllReduce<SumOp, float, Kthread>(thread_row_sum);
+    permuted_input_row_sum[token_idx] = block_row_sum * scale_factor;
+  }
+}
+
+template <typename T, typename OutT, int VecSize, int Kthread>
+__global__ void quantize_moe_input_kernel(const T* permuted_inputs,
+                                          const int64_t* expert_idx_per_token,
+                                          const float* quant_scales,
+                                          const float quant_max_bound,
+                                          const float quant_min_bound,
+                                          const int64_t token_num,
+                                          const int64_t dim,
+                                          float* permuted_input_row_sum,
+                                          const int64_t* recv_expert_count,
+                                          const int num_max_tokens_per_expert,
+                                          OutT* out) {
+  typedef AlignedVector<T, VecSize> InVec;
+  typedef AlignedVector<OutT, VecSize> OutVec;
+  typedef typename BytesToType<sizeof(OutT) * VecSize>::Type PackedOut;
+  const float row_sum_scale = -7.0f / 512.0f;
+  InVec src;
+  OutVec dst;
+  for (int row = blockIdx.x; row < token_num; row += gridDim.x) {
+    // Each block handles one row per iteration; threads split its vectors.
+    const int64_t expert = expert_idx_per_token[row];
+    const float scale = quant_scales[expert];
+    float partial = 0.0f;
+    for (int v = threadIdx.x; v < dim / VecSize; v += blockDim.x) {
+      const int64_t base = row * dim + v * VecSize;
+      Load<T, VecSize>(&permuted_inputs[base], &src);
+#pragma unroll
+      for (int e = 0; e < VecSize; ++e) {
+        dst[e] = QuantHelperFunc<T, OutT>(
+            src[e], scale, quant_max_bound, quant_min_bound);
+        partial += static_cast<float>(dst[e]);
+      }
+      *(reinterpret_cast<PackedOut*>(&out[base])) =
+          *(reinterpret_cast<const PackedOut*>(&dst));
+    }
+    const float row_total = BlockAllReduce<SumOp, float, Kthread>(partial);
+    permuted_input_row_sum[row] = row_total * row_sum_scale;
+  }
+}
+
+template <typename T, typename OutT>  // host launcher for the quantize kernels
+void quantize_moe_input(const T* permuted_inputs,
+                        const int64_t* expert_idx_per_token,
+                        const float* quant_scales,
+                        const float quant_max_bound,
+                        const float quant_min_bound,
+                        const int64_t token_num,
+                        const int64_t dim,  // must be a multiple of VecSize
+                        float* permuted_input_row_sum,
+                        const int64_t* recv_expert_count,
+                        const int num_max_tokens_per_expert,
+                        bool used_in_ep_low_latency,  // true: masked kernel
+                        OutT* out,
+                        cudaStream_t stream) {
+  constexpr int VecSize = 16 / sizeof(T);
+  constexpr int threads_per_block = 128;
+  int dev_id = 0, sm_count = 0, act_blocks_per_sm = 0;
+  cudaGetDevice(&dev_id);  // query the current device instead of device 0
+  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
+  assert(dim % VecSize == 0);
+  if (token_num <= 0) return;  // grid.x would be 0: invalid launch config
+  auto kernel =
+      used_in_ep_low_latency
+          ? masked_quantize_moe_input_kernel<T,
+                                             OutT,
+                                             VecSize,
+                                             threads_per_block>
+          : quantize_moe_input_kernel<T, OutT, VecSize, threads_per_block>;
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &act_blocks_per_sm, kernel, threads_per_block, 0);
+  const int num_blocks_per_wave = sm_count * act_blocks_per_sm;
+  dim3 grid;  // at most one resident wave; kernels stride over the rest
+  grid.x = min(static_cast<int64_t>(num_blocks_per_wave), token_num);
+  kernel<<<grid, threads_per_block, 0, stream>>>(permuted_inputs,
+                                                 expert_idx_per_token,
+                                                 quant_scales,
+                                                 quant_max_bound,
+                                                 quant_min_bound,
+                                                 token_num,
+                                                 dim,
+                                                 permuted_input_row_sum,
+                                                 recv_expert_count,
+                                                 num_max_tokens_per_expert,
+                                                 out);
+}
+
+template <typename T, int VecSize, int Kthread>
+__global__ void masked_compute_row_sum_kernel(  // row sums for valid rows only
+    const T* permuted_inputs,  // rows of `dim` elements, read by row
+    const int64_t token_num,   // upper bound of the row loop
+    const int64_t dim,         // elements per row; loop covers dim / VecSize
+    float* permuted_input_row_sum,      // per row: sum * scale_factor
+    const int64_t* recv_expert_count,   // per expert: number of rows processed
+    const int num_max_tokens_per_expert) {  // row slots per expert region
+  using LoadT = AlignedVector<T, VecSize>;
+  LoadT input_vec;
+  float scale_factor = -7.0f / 512.0f;  // applied to every row sum
+  for (int token_idx = blockIdx.x; token_idx < token_num;
+       token_idx += gridDim.x) {  // one row per block per iteration
+    const auto token_idx_in_expert = token_idx % num_max_tokens_per_expert;
+    const auto expert_id = token_idx / num_max_tokens_per_expert;
+    if (token_idx_in_expert >= recv_expert_count[expert_id]) {  // unused slot
+      auto next_expert_start_idx = (expert_id + 1) * num_max_tokens_per_expert;
+      auto num_iters_to_next_expert =  // whole strides that can be skipped
+          (next_expert_start_idx - token_idx - 1) / gridDim.x;
+      token_idx += num_iters_to_next_expert * gridDim.x;
+      continue;  // loop increment then lands at or past the next region start
+    }
+    float thread_row_sum = 0.0f;  // this thread's share of the row sum
+    for (int idx = threadIdx.x; idx < dim / VecSize; idx += blockDim.x) {
+      int64_t offset = token_idx * dim + idx * VecSize;
+      Load<T, VecSize>(&permuted_inputs[offset], &input_vec);
+#pragma unroll
+      for (int i = 0; i < VecSize; i++) {
+        thread_row_sum += static_cast<float>(input_vec[i]);
+      }
+    }
+    float block_row_sum = BlockAllReduce<SumOp, float, Kthread>(thread_row_sum);
+    permuted_input_row_sum[token_idx] = block_row_sum * scale_factor;
+  }
+}
+
+template <typename T, int VecSize, int Kthread>
+__global__ void compute_row_sum_kernel(const T* permuted_inputs,
+                                       const int64_t token_num,
+                                       const int64_t dim,
+                                       float* permuted_input_row_sum,
+                                       const int64_t* recv_expert_count,
+                                       const int num_max_tokens_per_expert) {
+  typedef AlignedVector<T, VecSize> InVec;
+  const float row_sum_scale = -7.0f / 512.0f;
+  InVec src;
+  // Each block handles one row per iteration; threads split its vectors.
+  for (int row = blockIdx.x; row < token_num; row += gridDim.x) {
+    float partial = 0.0f;
+    for (int v = threadIdx.x; v < dim / VecSize; v += blockDim.x) {
+      const int64_t base = row * dim + v * VecSize;
+      Load<T, VecSize>(&permuted_inputs[base], &src);
+#pragma unroll
+      for (int e = 0; e < VecSize; ++e) {
+        partial += static_cast<float>(src[e]);
+      }
+    }
+    const float row_total = BlockAllReduce<SumOp, float, Kthread>(partial);
+    permuted_input_row_sum[row] = row_total * row_sum_scale;
+  }
+}
+
+template <typename T>  // host launcher for the row-sum kernels
+void compute_row_sum(const T* permuted_inputs,
+                     const int64_t token_num,
+                     const int64_t dim,  // must be a multiple of VecSize
+                     float* permuted_input_row_sum,
+                     const int64_t* recv_expert_count,
+                     const int num_max_tokens_per_expert,
+                     bool used_in_ep_low_latency,  // true: masked kernel
+                     cudaStream_t stream) {
+  constexpr int VecSize = 16 / sizeof(T);
+  constexpr int threads_per_block = 128;
+  int dev_id = 0, sm_count = 0, act_blocks_per_sm = 0;
+  cudaGetDevice(&dev_id);  // query the current device instead of device 0
+  cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id);
+  assert(dim % VecSize == 0);
+  if (token_num <= 0) return;  // grid.x would be 0: invalid launch config
+  auto kernel =
+      used_in_ep_low_latency
+          ? masked_compute_row_sum_kernel<T, VecSize, threads_per_block>
+          : compute_row_sum_kernel<T, VecSize, threads_per_block>;
+  cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &act_blocks_per_sm, kernel, threads_per_block, 0);
+  const int num_blocks_per_wave = sm_count * act_blocks_per_sm;
+  dim3 grid;  // at most one resident wave; kernels stride over the rest
+  grid.x = min(static_cast<int64_t>(num_blocks_per_wave), token_num);
+  kernel<<<grid, threads_per_block, 0, stream>>>(permuted_inputs,
+                                                 token_num,
+                                                 dim,
+                                                 permuted_input_row_sum,
+                                                 recv_expert_count,
+                                                 num_max_tokens_per_expert);
+}
+
 // ====================== Softmax things ===============================
 // We have our own implementation of softmax here so we can support transposing
 // the output in the softmax kernel when we extend this module to support
@@ -132,64 +451,6 @@ __launch_bounds__(TPB) __global__
   }
 }
 
-template <typename T, int TPB>
-__launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax,
-                                                 T* output,
-                                                 int* indices,
-                                                 int* source_rows,
-                                                 T* softmax_max_prob,
-                                                 const int64_t num_experts,
-                                                 const int64_t k,
-                                                 const int64_t num_rows) {
-  using cub_kvp = cub::KeyValuePair<int, T>;
-  using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
-  __shared__ typename BlockReduce::TempStorage tmpStorage;
-
-  cub_kvp thread_kvp;
-  cub::ArgMax arg_max;
-
-  const int block_row = blockIdx.x + blockIdx.y * gridDim.x;
-  if (block_row >= num_rows) {
-    return;
-  }
-
-  const bool should_process_row = true;
-  const int thread_read_offset = block_row * num_experts;
-
-  for (int k_idx = 0; k_idx < k; ++k_idx) {
-    thread_kvp.key = 0;
-    thread_kvp.value = T(-1.f);  // This is OK because inputs are probabilities
-
-    cub_kvp inp_kvp;
-    for (int expert = threadIdx.x; expert < num_experts; expert += TPB) {
-      const int idx = thread_read_offset + expert;
-      inp_kvp.key = expert;
-      inp_kvp.value = inputs_after_softmax[idx];
-
-      for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
-        const int prior_winning_expert = indices[k * block_row + prior_k];
-
-        if (prior_winning_expert == expert) {
-          inp_kvp = thread_kvp;
-        }
-      }
-
-      thread_kvp = arg_max(inp_kvp, thread_kvp);
-    }
-
-    const cub_kvp result_kvp =
-        BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
-    if (threadIdx.x == 0) {
-      const int idx = k * block_row + k_idx;
-      // restore normalized probes
-      output[idx] = result_kvp.value / T(softmax_max_prob[idx]);
-      indices[idx] = should_process_row ? result_kvp.key : num_experts;
-      source_rows[idx] = k_idx * num_rows + block_row;
-    }
-    __syncthreads();
-  }
-}
-
 template <typename T, int TPB>
 __launch_bounds__(TPB) __global__ void moe_softmax(const T* input,
                                                    T* output,
@@ -243,14 +504,16 @@ __launch_bounds__(TPB) __global__ void moe_softmax(const T* input,
   }
 }
 
-template <typename T, int TPB>
-__launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax,
-                                                 T* output,
-                                                 int* indices,
-                                                 int* source_rows,
-                                                 const int64_t num_experts,
-                                                 const int64_t k,
-                                                 const int64_t num_rows) {
+template <typename T, int TPB, typename IdxT = int>
+__launch_bounds__(TPB) __global__
+    void group_moe_top_k(const T* inputs_after_softmax,
+                         T* output,
+                         IdxT* indices,
+                         int* source_rows,
+                         T* softmax_max_prob,
+                         const int64_t num_experts,
+                         const int64_t k,
+                         const int64_t num_rows) {
   using cub_kvp = cub::KeyValuePair<int, T>;
   using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
   __shared__ typename BlockReduce::TempStorage tmpStorage;
@@ -276,6 +539,72 @@ __launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax,
       inp_kvp.key = expert;
       inp_kvp.value = inputs_after_softmax[idx];
 
+      for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
+        const IdxT prior_winning_expert = indices[k * block_row + prior_k];
+
+        if (prior_winning_expert == expert) {
+          inp_kvp = thread_kvp;
+        }
+      }
+
+      thread_kvp = arg_max(inp_kvp, thread_kvp);
+    }
+
+    const cub_kvp result_kvp =
+        BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
+    if (threadIdx.x == 0) {
+      const int idx = k * block_row + k_idx;
+      // restore normalized probes
+      output[idx] = result_kvp.value / T(softmax_max_prob[idx]);
+      indices[idx] = should_process_row ? result_kvp.key : num_experts;
+      source_rows[idx] = k_idx * num_rows + block_row;
+    }
+    __syncthreads();
+  }
+}
+
+template <typename T, int TPB, bool NormWeights = false, typename IdxT = int>
+__launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax,
+                                                 const T* bias,
+                                                 T* output,
+                                                 IdxT* indices,
+                                                 int* source_rows,
+                                                 const int64_t num_experts,
+                                                 const int64_t k,
+                                                 const int64_t num_rows) {
+  using cub_kvp = cub::KeyValuePair<int, T>;
+  using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
+  __shared__ typename BlockReduce::TempStorage tmpStorage;
+
+  cub_kvp thread_kvp;
+  cub::ArgMax arg_max;
+
+  const int block_row = blockIdx.x + blockIdx.y * gridDim.x;
+  if (block_row >= num_rows) {
+    return;
+  }
+
+  const bool should_process_row = true;
+  const int thread_read_offset = block_row * num_experts;
+  T weight_sum = static_cast<T>(0);
+  T* row_outputs = nullptr;
+
+  if constexpr (NormWeights) {
+    extern __shared__ char smem[];
+    row_outputs = reinterpret_cast<T*>(smem);
+  }
+
+  for (int k_idx = 0; k_idx < k; ++k_idx) {
+    thread_kvp.key = 0;
+    thread_kvp.value = T(-1.f);  // This is OK because inputs are probabilities
+
+    cub_kvp inp_kvp;
+    for (int expert = threadIdx.x; expert < num_experts; expert += TPB) {
+      const int idx = thread_read_offset + expert;
+      inp_kvp.key = expert;
+      inp_kvp.value = bias ? inputs_after_softmax[idx] + bias[expert]
+                           : inputs_after_softmax[idx];
+
       for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
         const int prior_winning_expert = indices[k * block_row + prior_k];
 
@@ -291,12 +620,253 @@ __launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax,
         BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
     if (threadIdx.x == 0) {
       const int idx = k * block_row + k_idx;
-      output[idx] = result_kvp.value;
       indices[idx] = should_process_row ? result_kvp.key : num_experts;
       source_rows[idx] = k_idx * num_rows + block_row;
+
+      if constexpr (NormWeights) {
+        T row_out =
+            bias ? inputs_after_softmax[thread_read_offset + result_kvp.key]
+                 : result_kvp.value;
+        row_outputs[k_idx] = row_out;
+        weight_sum += row_out;
+      } else {
+        output[idx] =
+            bias ? inputs_after_softmax[thread_read_offset + result_kvp.key]
+                 : result_kvp.value;
+      }
     }
     __syncthreads();
   }
+  if constexpr (NormWeights) {
+    if (threadIdx.x < WARP_SIZE) {
+      weight_sum = __shfl_sync(0xffffffff, weight_sum, 0);
+    }
+    if (threadIdx.x < k) {
+      output[k * block_row + threadIdx.x] =
+          row_outputs[threadIdx.x] / weight_sum;
+    }
+  }
+}
+
+template <typename T, int TPB, bool NormWeights = false, typename IdxT = int>
+__launch_bounds__(TPB) __global__
+    void moe_softmax_top_k_fused(const T* input,
+                                 const T* bias,
+                                 T* output,
+                                 IdxT* indices,
+                                 int* source_rows,
+                                 const int64_t num_experts,
+                                 const int64_t k,
+                                 const int64_t num_rows) {
+  // Softmax stage: one block per row, each thread holds at most one expert.
+  using BlockReduce = cub::BlockReduce<float, TPB>;
+  __shared__ typename BlockReduce::TempStorage tmpStorage;
+
+  __shared__ float normalizing_factor;
+  __shared__ float float_max;
+
+  int globalIdx = blockIdx.x + blockIdx.y * gridDim.x;
+  if (globalIdx >= num_rows) {
+    return;
+  }
+  const int64_t thread_row_offset = globalIdx * num_experts;
+  const int64_t idx = thread_row_offset + threadIdx.x;
+
+  cub::Sum sum;
+
+  float threadData =
+      (threadIdx.x < num_experts) ? static_cast<float>(input[idx]) : (-FLT_MAX);
+  // Row maximum; it is subtracted below before exponentiation.
+  const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
+  if (threadIdx.x == 0) {
+    float_max = maxElem;
+  }
+  __syncthreads();
+
+  float threadDataSub = threadData - float_max;
+  float threadDataExp = exp(threadDataSub);
+  // Z is the block-wide sum of the exponentials (tmpStorage is reused).
+  const auto Z = BlockReduce(tmpStorage).Reduce(threadDataExp, sum);
+
+  if (threadIdx.x == 0) {
+    normalizing_factor = 1.f / Z;
+  }
+
+  __syncthreads();
+
+  T val = T(threadDataExp * normalizing_factor);
+
+  // Top-k stage: k rounds of block-wide arg-max over val (+ bias).
+  using cub_kvp = cub::KeyValuePair<int, T>;
+  using BlockReduceP = cub::BlockReduce<cub_kvp, TPB>;
+  __shared__ typename BlockReduceP::TempStorage tmpStorageP;
+
+  cub_kvp thread_kvp;
+  cub::ArgMax arg_max;
+  // With NormWeights the selected weights are staged in dynamic shared memory.
+  T weight_sum = static_cast<T>(0);
+  T* row_outputs = nullptr;
+  if constexpr (NormWeights) {
+    extern __shared__ char smem[];
+    row_outputs = reinterpret_cast<T*>(smem);
+  }
+
+  for (int k_idx = 0; k_idx < k; ++k_idx) {
+    thread_kvp.key = 0;
+    thread_kvp.value = T(-1.f);  // sentinel; assumes val (+ bias) > -1 (TODO confirm)
+
+    if (threadIdx.x < num_experts) {
+      cub_kvp inp_kvp;
+      int expert = threadIdx.x;
+      inp_kvp.key = expert;
+      inp_kvp.value = bias ? val + bias[expert] : val;
+      // Experts chosen in earlier rounds are replaced by the sentinel.
+      for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
+        const IdxT prior_winning_expert = indices[k * globalIdx + prior_k];
+
+        if (prior_winning_expert == expert) {
+          inp_kvp = thread_kvp;
+        }
+      }
+      thread_kvp = arg_max(inp_kvp, thread_kvp);
+    }
+
+    const cub_kvp result_kvp =
+        BlockReduceP(tmpStorageP).Reduce(thread_kvp, arg_max);
+    if (threadIdx.x == 0) {
+      const int cur_idx = k * globalIdx + k_idx;
+
+      indices[cur_idx] = result_kvp.key;
+      source_rows[cur_idx] = k_idx * num_rows + globalIdx;
+      // The reported weight has the ranking bias subtracted again.
+      if constexpr (NormWeights) {
+        T row_out =
+            bias ? (result_kvp.value - bias[result_kvp.key]) : result_kvp.value;
+        row_outputs[k_idx] = row_out;
+        weight_sum += row_out;
+      } else {
+        output[cur_idx] =
+            bias ? (result_kvp.value - bias[result_kvp.key]) : result_kvp.value;
+      }
+    }
+    __syncthreads();
+  }
+  if constexpr (NormWeights) {
+    if (threadIdx.x < WARP_SIZE) {
+      weight_sum = __shfl_sync(0xffffffff, weight_sum, 0);
+    }
+    // NOTE(review): only the first warp gets weight_sum; assumes k <= WARP_SIZE?
+    if (threadIdx.x < k) {
+      output[k * globalIdx + threadIdx.x] =
+          row_outputs[threadIdx.x] / weight_sum;
+    }
+  }
+}
+
+inline __device__ unsigned int xorwow_moe(unsigned int& state) {
+  unsigned int x = state ? state : 0x9E3779B9u;  // 0 would stay 0 forever
+  x ^= x >> 7;
+  x ^= x << 9;
+  return state = x ^ (x >> 13);
+}
+
+// Per-row top-k over inputs_after_softmax (+ bias), one block per row. For
+// each winner, one of its expert_in_rank_num_list[expert] entries in
+// expert_id_to_ep_rank_array is picked pseudo-randomly and stored in indices;
+// the logical expert id goes to indices_tmp and bumps its stats counter.
+// output receives the un-biased weights divided by their sum. Requires at
+// least k * sizeof(T) bytes of dynamic shared memory for row_outputs.
+template <typename T, int TPB, typename IdxT = int>
+__launch_bounds__(TPB) __global__
+    void moe_redundant_top_k_normed(const T* inputs_after_softmax,
+                                    const T* bias,
+                                    const int* expert_id_to_ep_rank_array,
+                                    const int* expert_in_rank_num_list,
+                                    int* tokens_per_expert_stats_list,
+                                    T* output,
+                                    IdxT* indices,
+                                    IdxT* indices_tmp,
+                                    int* source_rows,
+                                    const int64_t num_experts,
+                                    const int64_t k,
+                                    const int64_t num_rows,
+                                    const int redundant_ep_rank_num_plus_one) {
+  using cub_kvp = cub::KeyValuePair<int, T>;
+  using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
+  __shared__ typename BlockReduce::TempStorage tmpStorage;
+
+  cub_kvp thread_kvp;
+  cub::ArgMax arg_max;
+
+  const int block_row = blockIdx.x + blockIdx.y * gridDim.x;
+  unsigned int state = block_row + blockIdx.x * blockDim.x;
+
+  if (block_row >= num_rows) {
+    return;
+  }
+
+  const bool should_process_row = true;
+  const int thread_read_offset = block_row * num_experts;
+  T weight_sum = static_cast<T>(0);
+  extern __shared__ char smem[];
+  T* row_outputs = reinterpret_cast<T*>(smem);
+
+  for (int k_idx = 0; k_idx < k; ++k_idx) {
+    thread_kvp.key = 0;
+    thread_kvp.value = T(-1.f);  // sentinel; assumes scores (+ bias) > -1
+
+    cub_kvp inp_kvp;
+    for (int expert = threadIdx.x; expert < num_experts; expert += TPB) {
+      const int idx = thread_read_offset + expert;
+      inp_kvp.key = expert;
+      inp_kvp.value = bias ? inputs_after_softmax[idx] + bias[expert]
+                           : inputs_after_softmax[idx];
+
+      for (int prior_k = 0; prior_k < k_idx; ++prior_k) {
+        const int prior_winning_expert = indices_tmp[k * block_row + prior_k];
+
+        if (prior_winning_expert == expert) {
+          inp_kvp = thread_kvp;
+        }
+      }
+
+      thread_kvp = arg_max(inp_kvp, thread_kvp);
+    }
+
+    const cub_kvp result_kvp =
+        BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
+    if (threadIdx.x == 0) {
+      const int idx = k * block_row + k_idx;
+      source_rows[idx] = k_idx * num_rows + block_row;
+      int expert_topk = should_process_row ? result_kvp.key : num_experts;
+
+      // Redundancy: pick one replica slot; unsigned modulo keeps it in range.
+      int len = expert_in_rank_num_list[expert_topk];
+      int select = (int)(xorwow_moe(state) % (unsigned int)len);
+      int selected_rank =
+          expert_id_to_ep_rank_array[expert_topk *
+                                         redundant_ep_rank_num_plus_one +
+                                     select];
+
+      indices[idx] = (IdxT)selected_rank;
+      indices_tmp[idx] = result_kvp.key;
+      atomicAdd(&tokens_per_expert_stats_list[result_kvp.key], 1);
+
+      T row_out =
+          bias ? inputs_after_softmax[thread_read_offset + result_kvp.key]
+               : result_kvp.value;
+      row_outputs[k_idx] = row_out;
+      weight_sum += row_out;
+    }
+    __syncthreads();
+  }
+  if (threadIdx.x < WARP_SIZE) {
+    weight_sum = __shfl_sync(0xffffffff, weight_sum, 0);
+  }
+  // NOTE(review): only the first warp gets weight_sum; assumes k <= WARP_SIZE?
+  if (threadIdx.x < k) {
+    output[k * block_row + threadIdx.x] = row_outputs[threadIdx.x] / weight_sum;
+  }
 }
 
 // ====================== TopK softmax things ===============================
@@ -319,12 +889,15 @@ template <typename T,
           int VPT,
           int NUM_EXPERTS,
           int WARPS_PER_CTA,
-          int BYTES_PER_LDG>
+          int BYTES_PER_LDG,
+          bool Norm_Weights = false,
+          typename IdxT = int>
 __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
     void topk_gating_softmax(const T* input,
+                             const T* bias,
                              T* output,
                              const int64_t num_rows,
-                             int* indices,
+                             IdxT* indices,
                              int* source_rows,
                              const int64_t k) {
   // We begin by enforcing compile time assertions and setting up compile time
@@ -377,6 +950,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
   // We compute row offset for each thread sub-group
   const int thread_row_in_warp = threadIdx.x / THREADS_PER_ROW;
   const int thread_row = warp_base_row + thread_row_in_warp;
+  const int thread_row_in_cta = thread_row - cta_base_row;
 
   // Threads with indices out of bounds should early exit here.
   if (thread_row >= num_rows) return;
@@ -392,6 +966,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
   const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
   const T* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
 
+  T weight_sum = static_cast<T>(0);
+  extern __shared__ T row_output[];
+
   // Determine the pointer type to use to read in the data depending on the
   // BYTES_PER_LDG template param. In theory, this can support all powers of 2
   // up to 16.
@@ -460,7 +1037,9 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
 
 #pragma unroll
   for (int ii = 0; ii < VPT; ++ii) {
-    row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
+    row_chunk[ii] = bias ? row_chunk[ii] * reciprocal_row_sum +
+                               bias[first_elt_read_by_thread + ii]
+                         : row_chunk[ii] * reciprocal_row_sum;
   }
 
   // Now, softmax_res contains the softmax of the row chunk. Now, I want to find
@@ -509,12 +1088,19 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
     }
 
     // Write the max for this k iteration to global memory.
+    T final_val = bias ? T(max_val) - bias[expert] : T(max_val);
     if (thread_group_idx == 0) {
       // The lead thread from each sub-group will write out the final results to
       // global memory. (This will be a single) thread per row of the
       // input/output matrices.
       const int idx = k * thread_row + k_idx;
-      output[idx] = T(max_val);
+      if constexpr (Norm_Weights) {
+        const int idx_in_cta = k * thread_row_in_cta + k_idx;
+        row_output[idx_in_cta] = final_val;
+        weight_sum += final_val;
+      } else {
+        output[idx] = final_val;
+      }
       indices[idx] = should_process_row ? expert : NUM_EXPERTS;
       source_rows[idx] = k_idx * num_rows + thread_row;
     }
@@ -537,6 +1123,16 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
       }
     }
   }
+  if constexpr (Norm_Weights) {
+#pragma unroll
+    for (int k_idx = 0; k_idx < k; ++k_idx) {
+      if (thread_group_idx == 0) {
+        const int idx = k * thread_row + k_idx;
+        const int idx_in_cta = k * thread_row_in_cta + k_idx;
+        output[idx] = row_output[idx_in_cta] / weight_sum;
+      }
+    }
+  }
 }
 
 namespace detail {
@@ -556,10 +1152,15 @@ struct TopkConstants {
 };
 }  // namespace detail
 
-template <typename T, int EXPERTS, int WARPS_PER_TB>
+template <typename T,
+          int EXPERTS,
+          int WARPS_PER_TB,
+          bool Norm_Weights = false,
+          typename IdxT = int>
 void topk_gating_softmax_launcher_helper(const T* input,
+                                         const T* bias,
                                          T* output,
-                                         int* indices,
+                                         IdxT* indices,
                                          int* source_row,
                                          const int64_t num_rows,
                                          const int64_t num_experts,
@@ -575,16 +1176,23 @@ void topk_gating_softmax_launcher_helper(const T* input,
   const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
 
   dim3 block_dim(WARP_SIZE, WARPS_PER_TB);
-  topk_gating_softmax<T, VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG>
-      <<<num_blocks, block_dim, 0, stream>>>(
-          input, output, num_rows, indices, source_row, k);
+  static constexpr int ROWS_PER_CTA = WARPS_PER_TB * ROWS_PER_WARP;
+  topk_gating_softmax<T,
+                      VPT,
+                      EXPERTS,
+                      WARPS_PER_TB,
+                      BYTES_PER_LDG,
+                      Norm_Weights>
+      <<<num_blocks, block_dim, ROWS_PER_CTA * k * sizeof(T), stream>>>(
+          input, bias, output, num_rows, indices, source_row, k);
 }
 
-template <typename T>
+template <typename T, typename IdxT = int>
 void topk_gating_softmax_kernelLauncher(const T* input,
+                                        const T* gating_correction_bias,
                                         T* output,
                                         T* softmax,
-                                        int* indices,
+                                        IdxT* indices,
                                         int* source_row,
                                         T* softmax_max_prob,
                                         const int64_t num_rows,
@@ -596,19 +1204,36 @@ void topk_gating_softmax_kernelLauncher(const T* input,
   if (topk_only_mode) {
     static constexpr int TPB = 256;
     const auto config_topk = Get1DBlocksAnd2DGridsMoe(num_rows);
-    moe_top_k<T, TPB><<<config_topk.block_per_grid, TPB, 0, stream>>>(
-        input, output, indices, source_row, num_experts, k, num_rows);
+    moe_top_k<T, TPB>
+        <<<config_topk.block_per_grid, TPB, 0, stream>>>(input,
+                                                         gating_correction_bias,
+                                                         output,
+                                                         indices,
+                                                         source_row,
+                                                         num_experts,
+                                                         k,
+                                                         num_rows);
     return;
   }
   static constexpr int WARPS_PER_TB = 4;
 
-#define LAUNCH_TOPK_GATING_SOFTMAX_HELPER(N)                                   \
-  case N: {                                                                    \
-    topk_gating_softmax_launcher_helper<T, N, WARPS_PER_TB>(                   \
-        input, output, indices, source_row, num_rows, num_experts, k, stream); \
-    break;                                                                     \
+#define LAUNCH_TOPK_GATING_SOFTMAX_HELPER(N)                 \
+  case N: {                                                  \
+    topk_gating_softmax_launcher_helper<T, N, WARPS_PER_TB>( \
+        input,                                               \
+        gating_correction_bias,                              \
+        output,                                              \
+        indices,                                             \
+        source_row,                                          \
+        num_rows,                                            \
+        num_experts,                                         \
+        k,                                                   \
+        stream);                                             \
+    break;                                                   \
   }
-  switch (num_experts) {
+  int64_t tem_num_experts = num_experts;
+  if (gating_correction_bias != nullptr) tem_num_experts = 0;
+  switch (tem_num_experts) {
     LAUNCH_TOPK_GATING_SOFTMAX_HELPER(2)
     LAUNCH_TOPK_GATING_SOFTMAX_HELPER(4)
     LAUNCH_TOPK_GATING_SOFTMAX_HELPER(8)
@@ -632,7 +1257,7 @@ void topk_gating_softmax_kernelLauncher(const T* input,
                 group_experts,
                 softmax_num_rows);
         const auto config_topk = Get1DBlocksAnd2DGridsMoe(num_rows);
-        moe_top_k<T, TPB>
+        group_moe_top_k<T, TPB>
             <<<config_topk.block_per_grid, TPB, 0, stream>>>(softmax,
                                                              output,
                                                              indices,
@@ -646,7 +1271,14 @@ void topk_gating_softmax_kernelLauncher(const T* input,
         moe_softmax<T, TPB><<<config_topk.block_per_grid, TPB, 0, stream>>>(
             input, softmax, num_experts, num_rows);
         moe_top_k<T, TPB><<<config_topk.block_per_grid, TPB, 0, stream>>>(
-            softmax, output, indices, source_row, num_experts, k, num_rows);
+            softmax,
+            gating_correction_bias,
+            output,
+            indices,
+            source_row,
+            num_experts,
+            k,
+            num_rows);
       }
     }
   }
@@ -669,11 +1301,13 @@ void topk_gating_softmax_kernelLauncher(const T* input,
 // to row 0 in the original matrix. Thus, to know where to read in the source
 // matrix, we simply take the modulus of the expanded index.
 
-template <typename T, int VecSize>
+template <typename T, int VecSize, typename OutT = T>
 __global__ void initialize_moe_routing_kernel(
     const T* unpermuted_input,
-    T* permuted_output,
+    OutT* permuted_output,
     const int* expanded_dest_row_to_expanded_source_row,
+    const int* expert_idx_per_token,
+    const float* w4a8_in_scale,
     int* expanded_source_row_to_expanded_dest_row,
     const int64_t num_rows,
     const int64_t active_rows,
@@ -696,27 +1330,60 @@ __global__ void initialize_moe_routing_kernel(
         expanded_dest_row;
   }
 
-  if ((blockIdx.x + blockIdx.y * gridDim.x) < active_rows) {
-    // Duplicate and permute rows
+  if (expanded_dest_row < active_rows) {
+    const int expert_idx = expert_idx_per_token[expanded_dest_row];
+    const float scale = w4a8_in_scale ? w4a8_in_scale[expert_idx] : -1;
     const int source_row = expanded_source_row % num_rows;
 
     const T* source_row_ptr = unpermuted_input + source_row * cols;
-    T* dest_row_ptr = permuted_output + expanded_dest_row * cols;
+    OutT* dest_row_ptr = permuted_output + expanded_dest_row * cols;
 
     for (int tid = threadIdx.x * VecSize; tid < cols;
          tid += blockDim.x * VecSize) {
       // dest_row_ptr[tid] = source_row_ptr[tid];
       Load<T, VecSize>(&source_row_ptr[tid], &src_vec);
-      Store<T, VecSize>(src_vec, &dest_row_ptr[tid]);
+
+      if constexpr (std::is_same<OutT, int8_t>::value) {
+        using StoreT = AlignedVector<OutT, VecSize>;
+        StoreT dest_vec;
+        const float max_bound = 127.f;
+        const float min_bound = -127.f;
+        for (int j = 0; j < VecSize; j++) {
+          float quant_value =
+              max_bound * scale * static_cast<float>(src_vec[j]);
+          quant_value = quant_value > max_bound ? max_bound : quant_value;
+          quant_value = quant_value < min_bound ? min_bound : quant_value;
+          dest_vec[j] = static_cast<int8_t>(round(quant_value));
+        }
+        Store<OutT, VecSize>(dest_vec, &dest_row_ptr[tid]);
+      } else if constexpr (std::is_same<OutT,
+                                        phi::dtype::float8_e4m3fn>::value) {
+        using StoreT = AlignedVector<OutT, VecSize>;
+        StoreT dest_vec;
+        const float max_bound = 448.f;
+        const float min_bound = -448.f;
+        for (int j = 0; j < VecSize; j++) {
+          float quant_value =
+              max_bound * scale * static_cast<float>(src_vec[j]);
+          quant_value = quant_value > max_bound ? max_bound : quant_value;
+          quant_value = quant_value < min_bound ? min_bound : quant_value;
+          dest_vec[j] = static_cast<phi::dtype::float8_e4m3fn>(quant_value);
+        }
+        Store<phi::dtype::float8_e4m3fn, VecSize>(dest_vec, &dest_row_ptr[tid]);
+      } else {
+        Store<T, VecSize>(src_vec, &dest_row_ptr[tid]);
+      }
     }
   }
 }
 
-template <typename T>
+template <typename T, typename OutT = T>
 void initialize_moe_routing_kernelLauncher(
     const T* unpermuted_input,
-    T* permuted_output,
+    OutT* permuted_output,
     const int* expanded_dest_row_to_expanded_source_row,
+    const int* expert_idx_per_token,
+    const float* w4a8_in_scale,
     int* expanded_source_row_to_expanded_dest_row,
     const int64_t num_rows,
     const int64_t active_rows,
@@ -732,6 +1399,8 @@ void initialize_moe_routing_kernelLauncher(
             unpermuted_input,
             permuted_output,
             expanded_dest_row_to_expanded_source_row,
+            expert_idx_per_token,
+            w4a8_in_scale,
             expanded_source_row_to_expanded_dest_row,
             num_rows,
             k * active_rows,
@@ -743,6 +1412,8 @@ void initialize_moe_routing_kernelLauncher(
             unpermuted_input,
             permuted_output,
             expanded_dest_row_to_expanded_source_row,
+            expert_idx_per_token,
+            w4a8_in_scale,
             expanded_source_row_to_expanded_dest_row,
             num_rows,
             k * active_rows,
@@ -793,43 +1464,66 @@ __global__ void finalize_moe_routing_kernel(
     const bool norm_topk_prob,
     const float routed_scaling_factor,
     const int64_t num_rows) {
-  const int original_row = blockIdx.x + blockIdx.y * gridDim.x;
-  // const int original_row = blockIdx.x;
-  // const int num_rows = gridDim.x;
-  if (original_row >= num_rows) return;
-  T* reduced_row_ptr = reduced_unpermuted_output + original_row * cols;
+  const int original_row = blockIdx.x;
+  auto const offset = original_row * cols;
 
-  for (int tid = threadIdx.x; tid < cols; tid += blockDim.x) {
-    T thread_output{0.f};
+  T* reduced_row_ptr = reduced_unpermuted_output + offset;
+  constexpr int64_t FINALIZE_ELEM_PER_THREAD =
+      128 / mctlass::sizeof_bits<T>::value;
+  int64_t const start_offset = threadIdx.x;
+  int64_t const stride = FINALIZE_THREADS_PER_BLOCK;
+  int64_t const num_elems_in_col = cols / FINALIZE_ELEM_PER_THREAD;
+
+  using BiasElem = mctlass::Array<T, FINALIZE_ELEM_PER_THREAD>;
+  using InputElem = mctlass::Array<T, FINALIZE_ELEM_PER_THREAD>;
+  using OutputElem = mctlass::Array<T, FINALIZE_ELEM_PER_THREAD>;
+  using ComputeElem = mctlass::Array<float, FINALIZE_ELEM_PER_THREAD>;
+  using SharedOutputElem = mctlass::Array<T, FINALIZE_ELEM_PER_THREAD>;
+
+  auto const* bias_v = reinterpret_cast<BiasElem const*>(bias);
+  auto const* expanded_permuted_rows_v =
+      reinterpret_cast<InputElem const*>(expanded_permuted_rows);
+  auto* reduced_row_ptr_v = reinterpret_cast<OutputElem*>(reduced_row_ptr);
+
+#pragma unroll
+  for (int elem_index = start_offset; elem_index < num_elems_in_col;
+       elem_index += stride) {
+    ComputeElem thread_output;
+    thread_output.fill(0);
     float row_rescale{0.f};
     for (int k_idx = 0; k_idx < k; ++k_idx) {
-      const int expanded_original_row = original_row + k_idx * num_rows;
-      const int expanded_permuted_row =
+      int64_t const expanded_original_row = original_row + k_idx * num_rows;
+      int64_t const expanded_permuted_row =
           expanded_source_row_to_expanded_dest_row[expanded_original_row];
-
-      const int64_t k_offset = original_row * k + k_idx;
+      int64_t const k_offset = original_row * k + k_idx;
       const float row_scale = scales[k_offset];
       row_rescale = row_rescale + row_scale;
 
-      const T* expanded_permuted_rows_row_ptr =
-          expanded_permuted_rows + expanded_permuted_row * cols;
+      auto const* expanded_permuted_rows_row_ptr =
+          expanded_permuted_rows_v + expanded_permuted_row * num_elems_in_col;
 
-      const int expert_idx = expert_for_source_row[k_offset];
-      const T* bias_ptr = bias ? bias + expert_idx * cols : nullptr;
-      const T bias_value = bias_ptr ? bias_ptr[tid] : T{0.f};
+      int const expert_idx = expert_for_source_row[k_offset];
+      auto const* bias_ptr = bias_v + expert_idx * num_elems_in_col;
 
-      thread_output =
-          static_cast<float>(thread_output) +
-          row_scale * static_cast<float>(
-                          expanded_permuted_rows_row_ptr[tid] +
-                          bias_value *
-                              static_cast<T>(static_cast<float>(compute_bias)));
+      ComputeElem bias_value;
+      if (bias) {
+        bias_value = arrayConvert<BiasElem, ComputeElem>(bias_ptr[elem_index]);
+      } else {
+        bias_value.fill(0);
+      }
+
+      ComputeElem expert_result = arrayConvert<InputElem, ComputeElem>(
+          expanded_permuted_rows_row_ptr[elem_index]);
+
+      thread_output = thread_output + row_scale * (expert_result + bias_value);
     }
-
-    thread_output = static_cast<float>(thread_output) /
-                    (norm_topk_prob ? row_rescale : 1.0f) *
-                    routed_scaling_factor;
-    reduced_row_ptr[tid] = thread_output;
+    for (auto& elem : thread_output) {
+      elem =
+          elem / (norm_topk_prob ? row_rescale : 1.0f) * routed_scaling_factor;
+    }
+    OutputElem output_elem =
+        arrayConvert<ComputeElem, OutputElem>(thread_output);
+    reduced_row_ptr_v[elem_index] = output_elem;
   }
 }
 
@@ -848,136 +1542,21 @@ void finalize_moe_routing_kernelLauncher(
     const bool norm_topk_prob,
     const float routed_scaling_factor,
     cudaStream_t stream) {
-  const int threads = std::min(cols, int64_t(1024));
-  const auto config_final = Get1DBlocksAnd2DGridsMoe(num_rows);
+  const int blocks = num_rows;
+  const int threads = FINALIZE_THREADS_PER_BLOCK;
 
   finalize_moe_routing_kernel<T, 1>
-      <<<config_final.block_per_grid, threads, 0, stream>>>(
-          expanded_permuted_rows,
-          reduced_unpermuted_output,
-          bias,
-          scales,
-          expanded_source_row_to_expanded_dest_row,
-          expert_for_source_row,
-          cols,
-          k,
-          compute_bias,
-          norm_topk_prob,
-          routed_scaling_factor,
-          num_rows);
+      <<<blocks, threads, 0, stream>>>(expanded_permuted_rows,
+                                       reduced_unpermuted_output,
+                                       bias,
+                                       scales,
+                                       expanded_source_row_to_expanded_dest_row,
+                                       expert_for_source_row,
+                                       cols,
+                                       k,
+                                       compute_bias,
+                                       norm_topk_prob,
+                                       routed_scaling_factor,
+                                       num_rows);
 }
-
-// ========================= TopK Softmax specializations
-// ===========================
-template void topk_gating_softmax_kernelLauncher(const float*,
-                                                 float*,
-                                                 float*,
-                                                 int*,
-                                                 int*,
-                                                 float*,
-                                                 const int64_t,
-                                                 const int64_t,
-                                                 const int64_t,
-                                                 const bool,
-                                                 cudaStream_t,
-                                                 const bool);
-template void topk_gating_softmax_kernelLauncher(const half*,
-                                                 half*,
-                                                 half*,
-                                                 int*,
-                                                 int*,
-                                                 half*,
-                                                 const int64_t,
-                                                 const int64_t,
-                                                 const int64_t,
-                                                 const bool,
-                                                 cudaStream_t,
-                                                 const bool);
-#ifdef PADDLE_CUDA_BF16
-template void topk_gating_softmax_kernelLauncher(const __nv_bfloat16*,
-                                                 __nv_bfloat16*,
-                                                 __nv_bfloat16*,
-                                                 int*,
-                                                 int*,
-                                                 __nv_bfloat16*,
-                                                 const int64_t,
-                                                 const int64_t,
-                                                 const int64_t,
-                                                 const bool,
-                                                 cudaStream_t,
-                                                 const bool);
-#endif
-// ===================== Specializations for init routing
-// =========================
-template void initialize_moe_routing_kernelLauncher(const float*,
-                                                    float*,
-                                                    const int*,
-                                                    int*,
-                                                    const int64_t,
-                                                    const int64_t,
-                                                    const int64_t,
-                                                    const int64_t,
-                                                    cudaStream_t);
-template void initialize_moe_routing_kernelLauncher(const half*,
-                                                    half*,
-                                                    const int*,
-                                                    int*,
-                                                    const int64_t,
-                                                    const int64_t,
-                                                    const int64_t,
-                                                    const int64_t,
-                                                    cudaStream_t);
-#ifdef PADDLE_CUDA_BF16
-template void initialize_moe_routing_kernelLauncher(const __nv_bfloat16*,
-                                                    __nv_bfloat16*,
-                                                    const int*,
-                                                    int*,
-                                                    const int64_t,
-                                                    const int64_t,
-                                                    const int64_t,
-                                                    const int64_t,
-                                                    cudaStream_t);
-#endif
-// ==================== Specializations for final routing
-// ===================================
-template void finalize_moe_routing_kernelLauncher(const float*,
-                                                  float*,
-                                                  const float*,
-                                                  const float*,
-                                                  const int*,
-                                                  const int*,
-                                                  const int64_t,
-                                                  const int64_t,
-                                                  const int64_t,
-                                                  const int64_t,
-                                                  const bool,
-                                                  const float,
-                                                  cudaStream_t);
-template void finalize_moe_routing_kernelLauncher(const half*,
-                                                  half*,
-                                                  const half*,
-                                                  const float*,
-                                                  const int*,
-                                                  const int*,
-                                                  const int64_t,
-                                                  const int64_t,
-                                                  const int64_t,
-                                                  const int64_t,
-                                                  const bool,
-                                                  const float,
-                                                  cudaStream_t);
-#ifdef PADDLE_CUDA_BF16
-template void finalize_moe_routing_kernelLauncher(const __nv_bfloat16*,
-                                                  __nv_bfloat16*,
-                                                  const __nv_bfloat16*,
-                                                  const float*,
-                                                  const int*,
-                                                  const int*,
-                                                  const int64_t,
-                                                  const int64_t,
-                                                  const int64_t,
-                                                  const int64_t,
-                                                  const bool,
-                                                  const float,
-                                                  cudaStream_t);
-#endif
+}  // namespace phi
diff --git a/custom_ops/metax_ops/mc_fused_moe_helper.h b/custom_ops/metax_ops/mc_fused_moe_helper.h
deleted file mode 100644
index 002c2b87e..000000000
--- a/custom_ops/metax_ops/mc_fused_moe_helper.h
+++ /dev/null
@@ -1,486 +0,0 @@
-// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "fused_moe_helper.h"
-#include "mctlass/numeric_conversion.h"
-#include "mctlassEx/mctlassEx.h"
-
-template <typename ElementA, typename ElementB, typename ElementC>
-void mc_grouped_gemm_basic_kernel(const ElementA* ptrA,
-                                  mctlassExOrder_t majorA,
-                                  const ElementB* ptrB,
-                                  mctlassExOrder_t majorB,
-                                  const ElementA* ptrScale,
-                                  const ElementA* ptrBias,
-                                  ElementC* ptrC,
-                                  mctlassExOrder_t majorC,
-                                  const int* ptrSegInd,
-                                  int* ptrMNumTilesInd,
-                                  int numExperts,
-                                  int m,  // expanded_active_expert_rows
-                                  int n,  // inter_dim
-                                  int k,  // hidden_size
-                                  mcStream_t stream) {
-  mctlassExHandle_t handle;
-  mctlassExHandleCreate(&handle);
-
-  mctlassExMatrixLayout_t matLayoutA;
-  mctlassExMatrixLayout_t matLayoutB;
-  mctlassExMatrixLayout_t matLayoutC;
-
-  // mat A: (m, k)
-  mctlassExMatrixLayoutCreate(
-      &matLayoutA, mctlassExDataType::MCTLASS_EX_DATATYPE_BF16, m, k, k);
-  mctlassExMatrixLayoutSetAttribute(
-      matLayoutA,
-      mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
-      &majorA,
-      sizeof(mctlassExOrder_t));
-  mctlassExMatrixLayoutSetAttribute(
-      matLayoutA,
-      mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_BATCH_COUNT,
-      &numExperts,
-      sizeof(int));
-  // mat B: (num_experts, n, k)
-  mctlassExMatrixLayoutCreate(
-      &matLayoutB, mctlassExDataType::MCTLASS_EX_DATATYPE_INT8, k, n, k);
-  mctlassExMatrixLayoutSetAttribute(
-      matLayoutB,
-      mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
-      &majorB,
-      sizeof(mctlassExOrder_t));
-  mctlassExMatrixLayoutSetAttribute(
-      matLayoutB,
-      mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_BATCH_COUNT,
-      &numExperts,
-      sizeof(int));
-  // mat C: (m, n)
-  mctlassExMatrixLayoutCreate(
-      &matLayoutC, mctlassExDataType::MCTLASS_EX_DATATYPE_BF16, m, n, n);
-  mctlassExMatrixLayoutSetAttribute(
-      matLayoutC,
-      mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_ORDER,
-      &majorC,
-      sizeof(mctlassExOrder_t));
-  mctlassExMatrixLayoutSetAttribute(
-      matLayoutC,
-      mctlassExMatrixLayoutAttribute_t::MCTLASS_EX_MATRIX_LAYOUT_BATCH_COUNT,
-      &numExperts,
-      sizeof(int));
-  // bias: (num_experts, n)
-  // scale: (num, n)
-
-  mctlassExDesc_t mctlass_desc;
-  mctlassExCreateDesc(&mctlass_desc);
-  mctlassExDataType input_type = mctlassExDataType::MCTLASS_EX_DATATYPE_BF16;
-  mctlassExDataType scale_type = mctlassExDataType::MCTLASS_EX_DATATYPE_INT8;
-  mctlassExDataType compute_type = mctlassExDataType::MCTLASS_EX_DATATYPE_FP32;
-  mctlassExEpilogueType epilogue_type =
-      mctlassExEpilogueType::MCTLASS_EX_EPILOGUE_TYPE_DEFAULT;
-  if (ptrBias) {
-    epilogue_type = mctlassExEpilogueType::MCTLASS_EX_EPILOGUE_TYPE_BIAS;
-  }
-  // set scale
-  mctlassExDescSetAttribute(
-      mctlass_desc,
-      mctlassExDescAttributes_t::MCTLASS_EX_DESC_B_SCALE_POINTER,
-      &ptrScale,
-      sizeof(ptrScale));
-  mctlassExDescSetAttribute(
-      mctlass_desc,
-      mctlassExDescAttributes_t::MCTLASS_EX_DESC_B_SCALE_TYPE,
-      &input_type,
-      sizeof(mctlassExDataType));
-  // set bias
-  if (ptrBias) {
-    mctlassExDescSetAttribute(
-        mctlass_desc,
-        mctlassExDescAttributes_t::MCTLASS_EX_DESC_BIAS_POINTER,
-        &ptrBias,
-        sizeof(ptrBias));
-  }
-  // set coumpute type
-  mctlassExDescSetAttribute(
-      mctlass_desc,
-      mctlassExDescAttributes_t::MCTLASS_EX_DESC_COMPUTE_TYPE,
-      &compute_type,
-      sizeof(mctlassExDataType));
-  // set epilogue type
-  mctlassExDescSetAttribute(
-      mctlass_desc,
-      mctlassExDescAttributes_t::MCTLASS_EX_DESC_EPILOGUE_TYPE,
-      &epilogue_type,
-      sizeof(mctlassExEpilogueType));
-
-  const mctlassExContiguousGroupedGemmAlgo_t algo =
-      mctlassExContiguousGroupedGemmAlgo_t::
-          MCTLASS_EX_CONTIGUOUS_GROUPED_ALGO_DEFAULT;
-  mctlassExContiguousGroupedDesc_t contiguous_group_desc;
-  mctlassExContiguousGroupedDescCreate(
-      &contiguous_group_desc, ptrSegInd, nullptr, ptrMNumTilesInd, 1);
-  int blocksizeM;
-  mctlassExContiguousGroupedGemmGetBlocksizeM(handle,
-                                              mctlass_desc,
-                                              matLayoutA,
-                                              matLayoutB,
-                                              matLayoutC,
-                                              &algo,
-                                              &blocksizeM);
-  mctlassExContiguousGroupedGemmComputeMNumTilesIndptr(handle,
-                                                       mctlass_desc,
-                                                       matLayoutA,
-                                                       matLayoutB,
-                                                       matLayoutC,
-                                                       &algo,
-                                                       contiguous_group_desc,
-                                                       numExperts,
-                                                       blocksizeM,
-                                                       stream);
-
-  mctlassExContiguousGroupedGemmBasic(handle,
-                                      mctlass_desc,
-                                      ptrA,
-                                      matLayoutA,
-                                      ptrB,
-                                      matLayoutB,
-                                      ptrC,
-                                      matLayoutC,
-                                      contiguous_group_desc,
-                                      &algo,
-                                      nullptr,
-                                      0,
-                                      stream);
-
-  mctlassExHandleDestroy(handle);
-  mctlassExMatrixLayoutDestroy(matLayoutA);
-  mctlassExMatrixLayoutDestroy(matLayoutB);
-  mctlassExMatrixLayoutDestroy(matLayoutC);
-  mctlassExContiguousGroupedDescDestroy(contiguous_group_desc);
-  mctlassExDestroyDesc(mctlass_desc);
-}
-
-template <typename T, typename ElementA, typename ElementB, typename ElementC>
-class McMoeHelper {
- public:
-  McMoeHelper(const std::string gemm_method) : gemm_method_(gemm_method) {}
-
-  // --------      getWorkspaceSize      -------- //
-  template <typename KeyT>
-  size_t getWorkspaceSize(const int64_t num_rows,
-                          const int64_t hidden_size,
-                          const int64_t inter_size,
-                          const int64_t num_experts,
-                          const int64_t k) {
-    const size_t buf_size = AlignTo16(k * num_rows * hidden_size);
-    const size_t interbuf_size = AlignTo16(k * num_rows * inter_size);
-    const size_t padded_experts = AlignTo16(num_experts);
-    const size_t num_moe_inputs = AlignTo16(k * num_rows);
-    // softmax output, permuted_rows and permuted_experts have moved to outside
-    // of moe kernel, allocate them in Encoder or Decoder before invoking
-    // FfnLayer forward.
-    size_t total_ws_bytes =
-        5 * num_moe_inputs *
-        sizeof(int);  // source_rows_, permuted_rows_, permuted_experts_
-    total_ws_bytes += buf_size * sizeof(KeyT);  // permuted_data
-    total_ws_bytes +=
-        padded_experts * sizeof(int32_t);  // Hold total_rows_before_expert_
-
-    const size_t bytes_for_fc1_result = interbuf_size * sizeof(KeyT);
-    const size_t sorter_ws_size_bytes =
-        AlignTo16(sorter_.getWorkspaceSize(num_rows));
-    sorter_.update_num_experts(num_experts);
-
-    int64_t bytes_for_intermediate_and_sorting = bytes_for_fc1_result;
-    if (sorter_ws_size_bytes > bytes_for_fc1_result) {
-      int64_t remaining_bytes =
-          AlignTo16(sorter_ws_size_bytes - bytes_for_fc1_result);
-      bytes_for_intermediate_and_sorting += remaining_bytes;
-    }
-
-    total_ws_bytes +=
-        bytes_for_intermediate_and_sorting;  // intermediate (fc1) output + cub
-                                             // sorting workspace
-
-    int64_t num_softmax_outs = 0;
-    const bool is_pow_2 =
-        (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
-    if (!is_pow_2 || num_experts > 256) {
-      num_softmax_outs = AlignTo16(num_rows * num_experts);
-    }
-
-    total_ws_bytes += num_softmax_outs * sizeof(float);
-
-    return total_ws_bytes;
-  }
-
-  void computeFFN(const paddle::Tensor* input,
-                  const paddle::Tensor* gate_weight,
-                  const paddle::Tensor* ffn1_weight,
-                  const paddle::Tensor* ffn1_scale,
-                  const paddle::Tensor* ffn1_bias,
-                  const paddle::Tensor* ffn2_weight,
-                  const paddle::Tensor* ffn2_scale,
-                  const paddle::Tensor* ffn2_bias,
-                  const paddle::Tensor* moe_token_type_ids,
-                  const int moe_topk,
-                  const bool group_moe,
-                  const bool norm_topk_prob,
-                  const float routed_scaling_factor,
-                  const std::string moe_type,
-                  paddle::Tensor* output) {
-    auto* input_activations = input->data<T>();
-    auto* gating_weights = gate_weight->data<float>();
-    const T* fc1_expert_biases = ffn1_bias ? ffn1_bias->data<T>() : nullptr;
-    const T* fc2_expert_biases = ffn2_bias ? ffn2_bias->data<T>() : nullptr;
-
-    auto* output_ = output->data<T>();
-    auto stream = input->stream();
-    auto place = input->place();
-    auto input_type = input->dtype();
-
-    auto input_dims = input->dims();
-    auto ffn1_dims = ffn1_weight->dims();
-    int64_t token_num = 0;
-    if (input_dims.size() == 3) {
-      token_num = input_dims[0] * input_dims[1];
-    } else {
-      token_num = input_dims[0];
-    }
-    const int64_t num_rows = token_num;
-
-    const int64_t hidden_size = ffn1_dims[2];
-    int64_t inter_dim = 0;
-    if (moe_type == "qkv") {
-      inter_dim = ffn1_dims[2] * ffn1_dims[3] * ffn1_dims[4];
-    } else {
-      inter_dim = ffn1_dims[1];
-    }
-
-    // if (gemm_method == "weight_only_int4") {
-    //   inter_dim = inter_dim * 2;
-    // }
-
-    const int64_t inter_size = inter_dim;
-    const int64_t num_experts = ffn1_dims[0];
-    const int64_t k = moe_topk;
-
-    int64_t bytes =
-        getWorkspaceSize<T>(num_rows, hidden_size, inter_size, num_experts, k);
-
-    // Pointers
-    int* expert_for_source_row;
-    int* source_rows_;
-    int* permuted_rows_;
-    int* permuted_experts_;
-    int* expanded_source_row_to_expanded_dest_row;
-
-    T* permuted_data_;
-    int32_t* total_rows_before_expert_;
-    T* fc1_result_;
-    float* softmax_out_;
-
-    paddle::Tensor ws_ptr_tensor =
-        GetEmptyTensor({bytes}, paddle::DataType::INT8, place);
-    int8_t* ws_ptr = ws_ptr_tensor.data<int8_t>();
-
-    const int64_t buf_size = AlignTo16(k * num_rows * hidden_size);
-    const int64_t interbuf_size = AlignTo16(k * num_rows * inter_size);
-    const int64_t padded_experts = AlignTo16(num_experts);
-    const int64_t num_moe_inputs = AlignTo16(k * num_rows);
-
-    expert_for_source_row = reinterpret_cast<int*>(ws_ptr);
-    source_rows_ = expert_for_source_row + num_moe_inputs;
-    permuted_rows_ = source_rows_ + num_moe_inputs;
-    permuted_experts_ = permuted_rows_ + num_moe_inputs;
-    expanded_source_row_to_expanded_dest_row =
-        permuted_experts_ + num_moe_inputs;
-    permuted_data_ = reinterpret_cast<T*>(
-        expanded_source_row_to_expanded_dest_row + num_moe_inputs);
-    total_rows_before_expert_ =
-        reinterpret_cast<int32_t*>(permuted_data_ + buf_size);
-    fc1_result_ =
-        reinterpret_cast<T*>(total_rows_before_expert_ + padded_experts);
-
-    const bool is_pow_2 =
-        (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
-    if (!is_pow_2 || num_experts > 256) {
-      softmax_out_ = reinterpret_cast<float*>(fc1_result_ + interbuf_size);
-    } else {
-      softmax_out_ = nullptr;
-    }
-
-    paddle::Tensor expert_scales_float_tensor =
-        GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
-    float* expert_scales_float = expert_scales_float_tensor.data<float>();
-
-    float* softmax_max_prob = nullptr;
-    if (group_moe) {
-      paddle::Tensor softmax_max_prob_tensor = GetEmptyTensor(
-          {num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
-      // (TODO: check fill success ?)
-      paddle::experimental::fill(softmax_max_prob_tensor, 0.f);
-      softmax_max_prob = softmax_max_prob_tensor.data<float>();
-    }
-
-    paddle::Tensor fc1_out_tensor =
-        GetEmptyTensor({num_rows * k, inter_size}, input_type, place);
-    T* fc1_out = fc1_out_tensor.data<T>();
-
-    auto input_cast_tensor =
-        paddle::experimental::cast(*input, paddle::DataType::FLOAT32);
-    auto gate_tensor =
-        paddle::experimental::matmul(input_cast_tensor, *gate_weight);
-    float* gating_output = gate_tensor.data<float>();
-
-    if (moe_token_type_ids) {
-      auto* moe_token_type_ids_out = moe_token_type_ids->data<int>();
-      moe_token_type_ids_kernelLauncher<float>(gating_output,
-                                               moe_token_type_ids_out,
-                                               num_rows,
-                                               num_experts,
-                                               k,
-                                               stream);
-    }
-
-    topk_gating_softmax_kernelLauncher<float>(gating_output,
-                                              expert_scales_float,
-                                              softmax_out_,
-                                              expert_for_source_row,
-                                              source_rows_,
-                                              softmax_max_prob,
-                                              num_rows,
-                                              num_experts,
-                                              k,
-                                              group_moe,
-                                              stream);
-
-    const int64_t sorter_ws_size_bytes =
-        AlignTo16(sorter_.getWorkspaceSize(int64_t(k * num_rows)));
-
-    sorter_.run(fc1_result_,
-                sorter_ws_size_bytes,
-                expert_for_source_row,
-                permuted_experts_,
-                source_rows_,
-                permuted_rows_,
-                k * num_rows,
-                false,
-                stream);
-
-    initialize_moe_routing_kernelLauncher(
-        input_activations,
-        permuted_data_,
-        permuted_rows_,
-        expanded_source_row_to_expanded_dest_row,
-        num_rows,
-        num_rows,
-        hidden_size,
-        k,
-        stream);
-
-    const int64_t expanded_active_expert_rows = k * num_rows;
-
-    compute_total_rows_before_expert(permuted_experts_,
-                                     expanded_active_expert_rows,
-                                     num_experts,
-                                     total_rows_before_expert_,
-                                     stream);
-
-    mctlassExOrder_t row_major = mctlassExOrder_t::MCTLASS_EX_ORDER_ROW_MAJOR;
-    mctlassExOrder_t column_major =
-        mctlassExOrder_t::MCTLASS_EX_ORDER_COLUMN_MAJOR;
-    auto m_num_tile =
-        GetEmptyTensor({num_experts}, paddle::DataType::INT32, place);
-    int* m_num_tile_ptr = reinterpret_cast<int*>(m_num_tile.data<int>());
-
-    mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
-        reinterpret_cast<const ElementA*>(permuted_data_),
-        row_major,
-        reinterpret_cast<const ElementB*>(ffn1_weight->data<ElementB>()),
-        column_major,
-        reinterpret_cast<const ElementA*>(ffn1_scale->data<T>()),
-        reinterpret_cast<const ElementA*>(fc1_expert_biases),
-        reinterpret_cast<ElementC*>(fc1_out),
-        row_major,
-        total_rows_before_expert_,
-        m_num_tile_ptr,
-        num_experts,
-        expanded_active_expert_rows,
-        inter_size,
-        hidden_size,
-        stream);
-
-    if (moe_type == "ffn") {
-      auto act_out_tensor =
-          paddle::experimental::swiglu(fc1_out_tensor, nullptr);
-      auto act_out = act_out_tensor.data<T>();
-
-      paddle::Tensor fc2_output_tensor =
-          GetEmptyTensor({k * num_rows, hidden_size}, input_type, place);
-      T* fc2_result = fc2_output_tensor.data<T>();
-
-      mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
-          reinterpret_cast<const ElementA*>(act_out),
-          row_major,
-          reinterpret_cast<const ElementB*>(ffn2_weight->data<ElementB>()),
-          column_major,
-          reinterpret_cast<const ElementA*>(ffn2_scale->data<T>()),
-          nullptr,
-          reinterpret_cast<ElementC*>(fc2_result),
-          row_major,
-          total_rows_before_expert_,
-          m_num_tile_ptr,
-          num_experts,
-          expanded_active_expert_rows,
-          hidden_size,
-          inter_size / 2,
-          stream);
-
-      finalize_moe_routing_kernelLauncher(
-          fc2_result,
-          output_,
-          fc2_expert_biases,
-          reinterpret_cast<float*>(expert_scales_float),
-          expanded_source_row_to_expanded_dest_row,
-          expert_for_source_row,
-          num_rows,
-          hidden_size,
-          k,
-          static_cast<int>(1),
-          norm_topk_prob,
-          routed_scaling_factor,
-          stream);
-    } else {
-      finalize_moe_routing_kernelLauncher(
-          // fc2_result,
-          fc1_out,
-          output_,
-          fc1_expert_biases,  // fc2_expert_biases,
-          reinterpret_cast<float*>(expert_scales_float),
-          expanded_source_row_to_expanded_dest_row,
-          expert_for_source_row,
-          num_rows,
-          inter_size,
-          k,
-          static_cast<int>(0),
-          norm_topk_prob,
-          routed_scaling_factor,
-          stream);
-    }
-  }
-
- private:
-  std::string gemm_method_;
-  CubKeyValueSorter sorter_;
-};
diff --git a/custom_ops/metax_ops/moe_dispatch.cu b/custom_ops/metax_ops/moe_dispatch.cu
index 717c797ad..5b474db3c 100644
--- a/custom_ops/metax_ops/moe_dispatch.cu
+++ b/custom_ops/metax_ops/moe_dispatch.cu
@@ -17,26 +17,35 @@
 #pragma GCC diagnostic ignored "-Wunused-function"
 #pragma once
 
-#include "fused_moe_helper.h"
+#include "fused_moe_imp_op.h"
 #include "fused_moe_op.h"
 #pragma GCC diagnostic pop
 
 #include "helper.h"
 
 template <paddle::DataType T>
-void MoeDispatchKernel(const paddle::Tensor& input,
-                       const paddle::Tensor& gating_output,
-                       const int moe_topk,
-                       const bool group_moe,
-                       const bool topk_only_mode,
-                       const int num_rows,
-                       const int hidden_size,
-                       const int expert_num,
-                       paddle::Tensor* permute_input,
-                       paddle::Tensor* tokens_expert_prefix_sum,
-                       paddle::Tensor* permute_indices_per_token,
-                       paddle::Tensor* top_k_weight,
-                       paddle::Tensor* top_k_indices) {
+void MoeDispatchKernel(
+    const paddle::Tensor& input,
+    const paddle::Tensor& gating_output,
+    const paddle::optional<paddle::Tensor>& gating_correction_bias,
+    const paddle::optional<paddle::Tensor>& w4a8_in_scale,
+    const int moe_topk,
+    const bool group_moe,
+    const bool topk_only_mode,
+    const int num_rows,
+    const int hidden_size,
+    const int expert_num,
+    paddle::Tensor* permute_input,
+    paddle::Tensor* tokens_expert_prefix_sum,
+    paddle::Tensor* permute_indices_per_token,
+    paddle::Tensor* topk_weight,
+    paddle::Tensor* topk_idx,
+    paddle::Tensor* expert_idx_per_token) {
+  using namespace phi;
+
+  if (num_rows == 0) {
+    return;
+  }
   typedef PDTraits<T> traits_;
   typedef typename traits_::DataType DataType_;
   typedef typename traits_::data_t data_t;
@@ -78,7 +87,7 @@ void MoeDispatchKernel(const paddle::Tensor& input,
       reinterpret_cast<int*>(sorter_ws_ptr + sorter_ws_size_bytes);
   int* permuted_rows_ = permuted_experts_ + num_moe_inputs;
 
-  int* expert_for_source_row = top_k_indices->data<int>();
+  int* topk_idx_ptr = topk_idx->data<int>();
 
   float* softmax_max_prob = nullptr;
   if (group_moe) {
@@ -103,23 +112,25 @@ void MoeDispatchKernel(const paddle::Tensor& input,
     softmax_out_ = nullptr;
   }
 
-  topk_gating_softmax_kernelLauncher<float>(gating_output.data<float>(),
-                                            top_k_weight->data<float>(),
-                                            softmax_out_,
-                                            expert_for_source_row,
-                                            source_rows_,
-                                            softmax_max_prob,
-                                            num_rows,
-                                            expert_num,
-                                            moe_topk,
-                                            group_moe,
-                                            stream,
-                                            topk_only_mode);
+  topk_gating_softmax_kernelLauncher(
+      gating_output.data<float>(),
+      static_cast<const float*>(nullptr),  // no gating_correction_bias
+      topk_weight->data<float>(),
+      softmax_out_,
+      topk_idx_ptr,
+      source_rows_,
+      softmax_max_prob,
+      num_rows,
+      expert_num,
+      moe_topk,
+      group_moe,
+      stream,
+      topk_only_mode);
 
   sorter_.run(reinterpret_cast<void*>(sorter_ws_ptr),
               sorter_ws_size_bytes,
-              expert_for_source_row,
-              permuted_experts_,
+              topk_idx_ptr,
+              expert_idx_per_token->data<int32_t>(),
               source_rows_,
               permuted_rows_,
               moe_topk * num_rows,
@@ -130,6 +141,8 @@ void MoeDispatchKernel(const paddle::Tensor& input,
       input.data<data_t>(),
       permute_input->data<data_t>(),
       permuted_rows_,
+      expert_idx_per_token->data<int32_t>(),
+      nullptr,
       permute_indices_per_token->data<int32_t>(),
       num_rows,
       num_rows,
@@ -137,7 +150,7 @@ void MoeDispatchKernel(const paddle::Tensor& input,
       moe_topk,
       stream);
 
-  compute_total_rows_before_expert(permuted_experts_,
+  compute_total_rows_before_expert(expert_idx_per_token->data<int32_t>(),
                                    moe_topk * num_rows,
                                    expert_num,
                                    tokens_expert_prefix_sum->data<int32_t>(),
@@ -147,8 +160,11 @@ void MoeDispatchKernel(const paddle::Tensor& input,
 std::vector<paddle::Tensor> MoeExpertDispatch(
     const paddle::Tensor& input,
     const paddle::Tensor& gating_output,
+    const paddle::optional<paddle::Tensor>& gating_correction_bias,
+    const paddle::optional<paddle::Tensor>& w4a8_in_scale,
     const int moe_topk,
     const bool group_moe,
+    const std::string& moe_quant_type,
     const bool topk_only_mode) {
   const auto input_type = input.dtype();
   auto place = input.place();
@@ -168,9 +184,9 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
   auto permute_input =
       GetEmptyTensor({moe_topk * num_rows, hidden_size}, input_type, place);
   // correspond to the weighted coefficients of the results from each expert.
-  auto top_k_weight =
+  auto topk_weight =
       GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
-  auto top_k_indices =
+  auto topk_idx =
       GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::INT32, place);
 
   auto tokens_expert_prefix_sum =
@@ -178,18 +194,24 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
   auto permute_indices_per_token =
       GetEmptyTensor({moe_topk, num_rows}, paddle::DataType::INT32, place);
 
+  auto expert_idx_per_token =
+      GetEmptyTensor({num_rows * moe_topk}, paddle::DataType::INT32, place);
+
   if (token_rows == 0) {
     return {permute_input,
             tokens_expert_prefix_sum,
             permute_indices_per_token,
-            top_k_weight,
-            top_k_indices};
+            topk_weight,
+            topk_idx,
+            expert_idx_per_token};
   }
 
   switch (input_type) {
     case paddle::DataType::BFLOAT16:
       MoeDispatchKernel<paddle::DataType::BFLOAT16>(input,
                                                     gating_output,
+                                                    gating_correction_bias,
+                                                    w4a8_in_scale,
                                                     moe_topk,
                                                     group_moe,
                                                     topk_only_mode,
@@ -199,37 +221,25 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
                                                     &permute_input,
                                                     &tokens_expert_prefix_sum,
                                                     &permute_indices_per_token,
-                                                    &top_k_weight,
-                                                    &top_k_indices);
+                                                    &topk_weight,
+                                                    &topk_idx,
+                                                    &expert_idx_per_token);
       break;
-    // case paddle::DataType::FLOAT16:
-    //   MoeDispatchKernel<paddle::DataType::FLOAT16>(input,
-    //                                                gating_output,
-    //                                                moe_topk,
-    //                                                group_moe,
-    //                                                topk_only_mode,
-    //                                                num_rows,
-    //                                                hidden_size,
-    //                                                expert_num,
-    //                                                &permute_input,
-    //                                                &tokens_expert_prefix_sum,
-    //                                                &permute_indices_per_token,
-    //                                                &top_k_weight,
-    //                                                &top_k_indices);
-    //   break;
     default:
-      PD_THROW("Only support bf16 for MoeDispatchKernel");
+      PD_THROW("Unsupported data type for MoeDispatchKernel");
   }
   return {permute_input,
           tokens_expert_prefix_sum,
           permute_indices_per_token,
-          top_k_weight,
-          top_k_indices};
+          topk_weight,
+          topk_idx,
+          expert_idx_per_token};
 }
 
 std::vector<std::vector<int64_t>> MoeExpertDispatchInferShape(
     const std::vector<int64_t>& input_shape,
     const std::vector<int64_t>& gating_output_shape,
+    const paddle::optional<std::vector<int64_t>>& bias_shape,
     const int moe_topk) {
   int token_rows = -1;
 
@@ -241,33 +251,44 @@ std::vector<std::vector<int64_t>> MoeExpertDispatchInferShape(
   const int expert_num = gating_output_shape[gating_output_shape.size() - 1];
   const int num_rows = token_rows;
   const int hidden_size = input_shape[input_shape.size() - 1];
+  const int permuted_rows = num_rows == -1 ? -1 : moe_topk * num_rows;
 
-  return {{moe_topk * num_rows, hidden_size},
+  return {{permuted_rows, hidden_size},
           {expert_num},
           {moe_topk, num_rows},
           {num_rows, moe_topk},
-          {num_rows, moe_topk}};
+          {num_rows, moe_topk},
+          {permuted_rows}};
 }
 
 std::vector<paddle::DataType> MoeExpertDispatchInferDtype(
     const paddle::DataType& input_dtype,
     const paddle::DataType& gating_output_dtype,
+    const paddle::optional<paddle::DataType>& bias_type,
     const int moe_topk) {
   return {input_dtype,
           paddle::DataType::INT64,
           paddle::DataType::INT32,
           paddle::DataType::FLOAT32,
+          paddle::DataType::INT32,
           paddle::DataType::INT32};
 }
 
-PD_BUILD_OP(moe_expert_dispatch)
-    .Inputs({"input", "gating_output"})
+PD_BUILD_STATIC_OP(moe_expert_dispatch)
+    .Inputs({"input",
+             "gating_output",
+             paddle::Optional("gating_correction_bias"),
+             paddle::Optional("w4a8_in_scale")})
     .Outputs({"permute_input",
               "tokens_expert_prefix_sum",
               "permute_indices_per_token",
-              "top_k_weight",
-              "top_k_indices"})
-    .Attrs({"moe_topk:int", "group_moe:bool", "topk_only_mode:bool"})
+              "topk_weight",
+              "topk_idx",
+              "expert_idx_per_token"})
+    .Attrs({"moe_topk:int",
+            "group_moe:bool",
+            "moe_quant_type:std::string",
+            "topk_only_mode:bool"})
     .SetKernelFn(PD_KERNEL(MoeExpertDispatch))
     .SetInferShapeFn(PD_INFER_SHAPE(MoeExpertDispatchInferShape))
     .SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertDispatchInferDtype));
diff --git a/custom_ops/metax_ops/moe_ffn.cu b/custom_ops/metax_ops/moe_ffn.cu
index f4a5dbcd4..df00f722b 100644
--- a/custom_ops/metax_ops/moe_ffn.cu
+++ b/custom_ops/metax_ops/moe_ffn.cu
@@ -12,23 +12,21 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// BUILD_MARK
 #pragma once
+#include "fused_moe_helper.h"
 #include "helper.h"
-#include "mc_fused_moe_helper.h"
 
-template <paddle::DataType T,
-          typename ElementA,
-          typename ElementB,
-          typename ElementC>
-void McMoeFFNKernel(paddle::Tensor& permute_input,
-                    const paddle::Tensor& tokens_expert_prefix_sum,
-                    const paddle::Tensor& ffn1_weight,
-                    const paddle::Tensor& ffn2_weight,
-                    const paddle::optional<paddle::Tensor>& ffn1_bias,
-                    const paddle::optional<paddle::Tensor>& ffn1_scale,
-                    const paddle::optional<paddle::Tensor>& ffn2_scale,
-                    const std::string& quant_method) {
+template <paddle::DataType T>
+void MoeFFNKernel(paddle::Tensor& permute_input,
+                  const paddle::Tensor& tokens_expert_prefix_sum,
+                  const paddle::Tensor& up_gate_proj_weight,
+                  const paddle::Tensor& down_proj_weight,
+                  const paddle::optional<paddle::Tensor>& up_gate_proj_bias,
+                  const paddle::optional<paddle::Tensor>& up_gate_proj_scale,
+                  const paddle::optional<paddle::Tensor>& down_proj_scale,
+                  const paddle::optional<paddle::Tensor>& expert_idx_per_token,
+                  const std::string& quant_method) {
+  using namespace phi;
   typedef PDTraits<T> traits_;
   typedef typename traits_::DataType DataType_;
   typedef typename traits_::data_t data_t;
@@ -38,11 +36,13 @@ void McMoeFFNKernel(paddle::Tensor& permute_input,
   auto input_type = permute_input.dtype();
   auto stream = permute_input.stream();
 
+  auto int8_moe_gemm_runner = McMoeGemmRunner<DataType_, int8_t>();
+
   const int expanded_active_expert_rows =
-      permute_input.dims()[0];                    // permute_input.dims(): m, k
-  const int num_experts = ffn1_weight.dims()[0];  // batchsize
-  const int hidden_size = ffn1_weight.dims()[2];  // n
-  int inter_dim = ffn1_weight.dims()[1];          // k
+      permute_input.dims()[0];  // permute_input.dims(): m, k
+  const int num_experts = up_gate_proj_weight.dims()[0];  // batchsize
+  const int hidden_size = up_gate_proj_weight.dims()[2];  // n
+  int inter_dim = up_gate_proj_weight.dims()[1];          // k
 
   const int64_t inter_size = inter_dim;  // since weight_only_int_8
   paddle::Tensor fc1_out_tensor = GetEmptyTensor(
@@ -58,60 +58,71 @@ void McMoeFFNKernel(paddle::Tensor& permute_input,
 
   // ffn1
   auto fc1_expert_biases =
-      ffn1_bias
-          ? const_cast<paddle::Tensor*>(ffn1_bias.get_ptr())->data<data_t>()
+      up_gate_proj_bias
+          ? const_cast<paddle::Tensor*>(up_gate_proj_bias.get_ptr())
+                ->data<data_t>()
           : nullptr;
   auto fc1_expert_scales =
-      const_cast<paddle::Tensor*>(ffn1_scale.get_ptr())->data<data_t>();
-  mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
-      reinterpret_cast<const ElementA*>(permuted_input_ptr),
-      row_major,
-      reinterpret_cast<const ElementB*>(ffn1_weight.data<ElementB>()),
-      column_major,
-      reinterpret_cast<const ElementA*>(fc1_expert_scales),
-      reinterpret_cast<const ElementA*>(fc1_expert_biases),
-      reinterpret_cast<ElementC*>(fc1_out_ptr),
-      row_major,
-      tokens_expert_prefix_sum.data<int>(),
-      m_num_tile_ptr,
-      num_experts,
-      expanded_active_expert_rows,
-      inter_dim,
-      hidden_size,
-      stream);
+      const_cast<paddle::Tensor*>(up_gate_proj_scale.get_ptr())->data<data_t>();
+  if (quant_method == "weight_only_int8") {
+    int8_moe_gemm_runner.mc_grouped_gemm_basic_kernel(
+        reinterpret_cast<const DataType_*>(permuted_input_ptr),
+        row_major,
+        reinterpret_cast<const int8_t*>(up_gate_proj_weight.data<int8_t>()),
+        column_major,
+        reinterpret_cast<const DataType_*>(fc1_expert_scales),
+        reinterpret_cast<const DataType_*>(fc1_expert_biases),
+        reinterpret_cast<DataType_*>(fc1_out_ptr),
+        row_major,
+        tokens_expert_prefix_sum.data<int>(),
+        m_num_tile_ptr,
+        num_experts,
+        expanded_active_expert_rows,
+        inter_dim,
+        hidden_size,
+        stream);
+  } else {
+    throw std::runtime_error("Unsupported gemm method: " + quant_method);
+  }
 
   // swiglu
   auto act_out_tensor = paddle::experimental::swiglu(fc1_out_tensor, nullptr);
   auto act_out = act_out_tensor.data<data_t>();
 
   auto fc2_expert_scales =
-      const_cast<paddle::Tensor*>(ffn2_scale.get_ptr())->data<data_t>();
-  mc_grouped_gemm_basic_kernel<ElementA, ElementB, ElementC>(
-      reinterpret_cast<const ElementA*>(act_out),
-      row_major,
-      reinterpret_cast<const ElementB*>(ffn2_weight.data<ElementB>()),
-      column_major,
-      reinterpret_cast<const ElementA*>(fc2_expert_scales),
-      nullptr,
-      reinterpret_cast<ElementC*>(permuted_input_ptr),
-      row_major,
-      tokens_expert_prefix_sum.data<int>(),
-      m_num_tile_ptr,
-      num_experts,
-      expanded_active_expert_rows,
-      hidden_size,
-      inter_dim / 2,
-      stream);
+      const_cast<paddle::Tensor*>(down_proj_scale.get_ptr())->data<data_t>();
+
+  if (quant_method == "weight_only_int8") {
+    int8_moe_gemm_runner.mc_grouped_gemm_basic_kernel(
+        reinterpret_cast<const DataType_*>(act_out),
+        row_major,
+        reinterpret_cast<const int8_t*>(down_proj_weight.data<int8_t>()),
+        column_major,
+        reinterpret_cast<const DataType_*>(fc2_expert_scales),
+        nullptr,
+        reinterpret_cast<DataType_*>(permuted_input_ptr),
+        row_major,
+        tokens_expert_prefix_sum.data<int>(),
+        m_num_tile_ptr,
+        num_experts,
+        expanded_active_expert_rows,
+        hidden_size,
+        inter_dim / 2,
+        stream);
+  } else {
+    throw std::runtime_error("Unsupported gemm method: " + quant_method);
+  }
 }
 
 std::vector<paddle::Tensor> MoeExpertFFN(
     paddle::Tensor& permute_input,
     const paddle::Tensor& tokens_expert_prefix_sum,
-    const paddle::Tensor& ffn1_weight,
-    const paddle::Tensor& ffn2_weight,
-    const paddle::optional<paddle::Tensor>& ffn1_bias,
-    const paddle::optional<paddle::Tensor>& ffn1_scale,
-    const paddle::optional<paddle::Tensor>& ffn2_scale,
+    const paddle::Tensor& up_gate_proj_weight,
+    const paddle::Tensor& down_proj_weight,
+    const paddle::optional<paddle::Tensor>& up_gate_proj_bias,
+    const paddle::optional<paddle::Tensor>& up_gate_proj_scale,
+    const paddle::optional<paddle::Tensor>& down_proj_scale,
+    const paddle::optional<paddle::Tensor>& expert_idx_per_token,
     const std::string& quant_method) {
   assert(quant_method == "weight_only_int8");
   const auto input_type = permute_input.dtype();
@@ -122,31 +133,18 @@ std::vector<paddle::Tensor> MoeExpertFFN(
 
   switch (input_type) {
     case paddle::DataType::BFLOAT16:
-      McMoeFFNKernel<paddle::DataType::BFLOAT16,
-                     maca_bfloat16,
-                     int8_t,
-                     maca_bfloat16>(permute_input,
-                                    tokens_expert_prefix_sum,
-                                    ffn1_weight,
-                                    ffn2_weight,
-                                    ffn1_bias,
-                                    ffn1_scale,
-                                    ffn2_scale,
-                                    quant_method);
+      MoeFFNKernel<paddle::DataType::BFLOAT16>(permute_input,
+                                               tokens_expert_prefix_sum,
+                                               up_gate_proj_weight,
+                                               down_proj_weight,
+                                               up_gate_proj_bias,
+                                               up_gate_proj_scale,
+                                               down_proj_scale,
+                                               expert_idx_per_token,
+                                               quant_method);
       break;
-    // case paddle::DataType::FLOAT16:
-    //   MoeFFNKernel<paddle::DataType::FLOAT16>(permute_input,
-    //                                           tokens_expert_prefix_sum,
-    //                                           ffn1_weight,
-    //                                           ffn2_weight,
-    //                                           ffn1_bias,
-    //                                           ffn1_scale,
-    //                                           ffn2_scale,
-    //                                           quant_method,
-    //                                           ffn_out);
-    //   break;
     default:
-      PD_THROW("Unsupported data type for MoeExpertFFN");
+      PD_THROW("Unsupported data type for MoeFFNKernel");
   }
   return {permute_input};
 }
@@ -154,33 +152,37 @@ std::vector<paddle::Tensor> MoeExpertFFN(
 std::vector<std::vector<int64_t>> MoeExpertFFNInferShape(
     const std::vector<int64_t>& permute_input_shape,
     const std::vector<int64_t>& tokens_expert_prefix_sum_shape,
-    const std::vector<int64_t>& ffn1_weight_shape,
-    const std::vector<int64_t>& ffn2_weight_shape,
-    const paddle::optional<std::vector<int64_t>>& ffn1_bias_shape,
-    const paddle::optional<std::vector<int64_t>>& ffn1_scale_shape,
-    const paddle::optional<std::vector<int64_t>>& ffn2_scale_shape) {
+    const std::vector<int64_t>& up_gate_proj_weight_shape,
+    const std::vector<int64_t>& down_proj_weight_shape,
+    const paddle::optional<std::vector<int64_t>>& up_gate_proj_bias_shape,
+    const paddle::optional<std::vector<int64_t>>& up_gate_proj_scale_shape,
+    const paddle::optional<std::vector<int64_t>>& down_proj_scale_shape,
+    const paddle::optional<std::vector<int64_t>>& expert_idx_per_token_shape,
+    const std::string& quant_method) {
   return {permute_input_shape};
 }
 
 std::vector<paddle::DataType> MoeExpertFFNInferDtype(
     const paddle::DataType& permute_input_dtype,
     const paddle::DataType& tokens_expert_prefix_sum_dtype,
-    const paddle::DataType& ffn1_weight_dtype,
-    const paddle::DataType& ffn2_weight_dtype,
-    const paddle::optional<paddle::DataType>& ffn1_bias_dtype,
-    const paddle::optional<paddle::DataType>& ffn1_scale_dtype,
-    const paddle::optional<paddle::DataType>& ffn2_scale_dtype) {
+    const paddle::DataType& up_gate_proj_weight_dtype,
+    const paddle::DataType& down_proj_weight_dtype,
+    const paddle::optional<paddle::DataType>& up_gate_proj_bias_dtype,
+    const paddle::optional<paddle::DataType>& up_gate_proj_scale_dtype,
+    const paddle::optional<paddle::DataType>& down_proj_scale_dtype,
+    const paddle::optional<paddle::DataType>& expert_idx_per_token_dtype) {
   return {permute_input_dtype};
 }
 
 PD_BUILD_OP(moe_expert_ffn)
     .Inputs({"permute_input",
              "tokens_expert_prefix_sum",
-             "ffn1_weight",
-             "ffn2_weight",
-             paddle::Optional("ffn1_bias"),
-             paddle::Optional("ffn1_scale"),
-             paddle::Optional("ffn2_scale")})
+             "up_gate_proj_weight",
+             "down_proj_weight",
+             paddle::Optional("up_gate_proj_bias"),
+             paddle::Optional("up_gate_proj_scale"),
+             paddle::Optional("down_proj_scale"),
+             paddle::Optional("expert_idx_per_token")})
     .Outputs({"output_tensor"})
     .Attrs({"quant_method:std::string"})
     .SetKernelFn(PD_KERNEL(MoeExpertFFN))
diff --git a/custom_ops/metax_ops/moe_reduce.cu b/custom_ops/metax_ops/moe_reduce.cu
index 7ec694215..c5132f65e 100644
--- a/custom_ops/metax_ops/moe_reduce.cu
+++ b/custom_ops/metax_ops/moe_reduce.cu
@@ -14,7 +14,6 @@
 
 #pragma once
 
-#include "fused_moe_helper.h"
 #include "fused_moe_op.h"
 #include "helper.h"
 
@@ -23,13 +22,14 @@ void MoeReduceKernel(const paddle::Tensor& ffn_out,
                      const paddle::Tensor& top_k_weight,
                      const paddle::Tensor& permute_indices_per_token,
                      const paddle::Tensor& top_k_indices,
-                     const paddle::optional<paddle::Tensor>& ffn2_bias,
+                     const paddle::optional<paddle::Tensor>& down_proj_bias,
                      const bool norm_topk_prob,
                      const float routed_scaling_factor,
                      const int num_rows,
                      const int hidden_size,
                      const int topk,
                      paddle::Tensor* output) {
+  using namespace phi;
   typedef PDTraits<T> traits_;
   typedef typename traits_::DataType DataType_;
   typedef typename traits_::data_t data_t;
@@ -38,7 +38,7 @@ void MoeReduceKernel(const paddle::Tensor& ffn_out,
   finalize_moe_routing_kernelLauncher(
       ffn_out.data<data_t>(),
       output->data<data_t>(),
-      ffn2_bias ? ffn2_bias->data<data_t>() : nullptr,
+      down_proj_bias ? down_proj_bias->data<data_t>() : nullptr,
       top_k_weight.data<float>(),
       permute_indices_per_token.data<int32_t>(),
       top_k_indices.data<int>(),
@@ -56,7 +56,7 @@ std::vector<paddle::Tensor> MoeExpertReduce(
     const paddle::Tensor& top_k_weight,
     const paddle::Tensor& permute_indices_per_token,
     const paddle::Tensor& top_k_indices,
-    const paddle::optional<paddle::Tensor>& ffn2_bias,
+    const paddle::optional<paddle::Tensor>& down_proj_bias,
     const bool norm_topk_prob,
     const float routed_scaling_factor) {
   const auto input_type = ffn_out.dtype();
@@ -69,7 +69,7 @@ std::vector<paddle::Tensor> MoeExpertReduce(
   auto output = GetEmptyTensor({num_rows, hidden_size}, input_type, place);
 
   // Avoids ‘invalid configuration argument’ when we launch the kernel.
-  if (ffn_out.dims()[0] == 0) return {output};
+  if (num_rows == 0) return {output};
 
   switch (input_type) {
     case paddle::DataType::BFLOAT16:
@@ -77,7 +77,7 @@ std::vector<paddle::Tensor> MoeExpertReduce(
                                                   top_k_weight,
                                                   permute_indices_per_token,
                                                   top_k_indices,
-                                                  ffn2_bias,
+                                                  down_proj_bias,
                                                   norm_topk_prob,
                                                   routed_scaling_factor,
                                                   num_rows,
@@ -85,21 +85,8 @@ std::vector<paddle::Tensor> MoeExpertReduce(
                                                   topk,
                                                   &output);
       break;
-    // case paddle::DataType::FLOAT16:
-    //   MoeReduceKernel<paddle::DataType::FLOAT16>(ffn_out,
-    //                                               top_k_weight,
-    //                                               permute_indices_per_token,
-    //                                               top_k_indices,
-    //                                               ffn2_bias,
-    //                                               norm_topk_prob,
-    //                                               routed_scaling_factor,
-    //                                               num_rows,
-    //                                               hidden_size,
-    //                                               topk,
-    //                                               &output);
-    //   break;
     default:
-      PD_THROW("Only support bf16 for MoeDispatchKernel");
+      PD_THROW("Unsupported data type for MoeReduceKernel");
   }
   return {output};
 }
@@ -109,7 +96,7 @@ std::vector<std::vector<int64_t>> MoeExpertReduceInferShape(
     const std::vector<int64_t>& top_k_weight_shape,
     const std::vector<int64_t>& permute_indices_per_token_shape,
     const std::vector<int64_t>& top_k_indices_shape,
-    const paddle::optional<std::vector<int64_t>>& ffn2_bias_shape) {
+    const paddle::optional<std::vector<int64_t>>& down_proj_bias_shape) {
   const int topk = top_k_indices_shape[1];
   std::vector<int64_t> fused_moe_out_shape = {ffn_out_shape[0] / topk,
                                               ffn_out_shape[1]};
@@ -122,7 +109,7 @@ std::vector<paddle::DataType> MoeExpertReduceInferDtype(
     const paddle::DataType& top_k_weight_dtype,
     const paddle::DataType& permute_indices_per_token_dtype,
     const paddle::DataType& top_k_indices_dtype,
-    const paddle::optional<paddle::DataType>& ffn2_bias_dtype) {
+    const paddle::optional<paddle::DataType>& down_proj_bias_dtype) {
   return {ffn_out_dtype};
 }
 
@@ -131,7 +118,7 @@ PD_BUILD_OP(moe_expert_reduce)
              "top_k_weight",
              "permute_indices_per_token",
              "top_k_indices",
-             paddle::Optional("ffn2_bias")})
+             paddle::Optional("down_proj_bias")})
     .Outputs({"output"})
     .Attrs({"norm_topk_prob:bool", "routed_scaling_factor:float"})
     .SetKernelFn(PD_KERNEL(MoeExpertReduce))
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index 40900b187..2bbde8814 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -627,11 +627,17 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
         "gpu_ops/append_attn/get_block_shape_and_split_kv_block.cu",
         "gpu_ops/moe/tritonmoe_preprocess.cu",
         "gpu_ops/moe/moe_topk_select.cu",
+        "gpu_ops/get_img_boundaries.cc",
+        "gpu_ops/remote_cache_kv_ipc.cc",
+        "gpu_ops/sample_kernels/rejection_top_p_sampling.cu",
+        "gpu_ops/sample_kernels/top_k_renorm_probs.cu",
+        "gpu_ops/sample_kernels/min_p_sampling_from_probs.cu",
         "metax_ops/moe_dispatch.cu",
         "metax_ops/moe_ffn.cu",
         "metax_ops/moe_reduce.cu",
         "metax_ops/fused_moe.cu",
-        "metax_ops/apply_rope.cu",
+        "metax_ops/apply_rope_qkv.cu",
+        "metax_ops/cache_kv_with_rope.cu",
     ]
 
     sources += find_end_files("gpu_ops/speculate_decoding", ".cu")
@@ -657,6 +663,11 @@ elif paddle.device.is_compiled_with_custom_device("metax_gpu"):
                 os.path.join(maca_path, "include"),
                 os.path.join(maca_path, "include/mcr"),
                 os.path.join(maca_path, "include/common"),
+                os.path.join(maca_path, "include/mcfft"),
+                os.path.join(maca_path, "include/mcrand"),
+                os.path.join(maca_path, "include/mcsparse"),
+                os.path.join(maca_path, "include/mcblas"),
+                os.path.join(maca_path, "include/mcsolver"),
             ],
         ),
     )
diff --git a/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py
index 306b11bf0..646997e31 100644
--- a/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/backends/metax/attention/flash_attn_backend.py
@@ -31,7 +31,7 @@ from fastdeploy.model_executor.layers.backends.metax.attention.flash_attention_i
     flash_attn_kvcache_func,
     flash_attn_unpadded_func,
 )
-from fastdeploy.model_executor.ops.gpu import apply_rope
+from fastdeploy.model_executor.ops.gpu import apply_rope_qkv, cache_kv_with_rope
 
 
 @dataclass
@@ -127,15 +127,14 @@ class FlashAttentionBackend(AttentionBackend):
         self.rank, self.device_id = init_rank_and_device_id(fd_config)
         self.enable_mm = fd_config.model_config.enable_mm
         max_num_seqs = fd_config.scheduler_config.max_num_seqs
-        if self.enable_mm:
-            self.attention_metadata.rotary_cos_decode = paddle.empty(
-                shape=[max_num_seqs, 1, 1, self.head_dim],
-                dtype="float32",
-            )
-            self.attention_metadata.rotary_sin_decode = paddle.empty(
-                shape=[max_num_seqs, 1, 1, self.head_dim],
-                dtype="float32",
-            )
+        self.attention_metadata.rotary_cos_decode = paddle.empty(
+            shape=[max_num_seqs, 1, 1, self.head_dim],
+            dtype=self.dtype,
+        )
+        self.attention_metadata.rotary_sin_decode = paddle.empty(
+            shape=[max_num_seqs, 1, 1, self.head_dim],
+            dtype=self.dtype,
+        )
 
     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attntion metadata hence all layers in the forward pass can reuse it."""
@@ -245,6 +244,12 @@ class FlashAttentionBackend(AttentionBackend):
         seq_lens_this_time = forward_meta.seq_lens_this_time[batch_ids]
         cached_kv_lens = forward_meta.seq_lens_decoder[batch_ids, 0]
 
+        self.block_table_prefill = forward_meta.block_tables[batch_ids, :]
+        # mapping token idx to batch idx
+        self.batch_ids_q = paddle.repeat_interleave(
+            paddle.arange(0, batch_ids.shape[0], dtype="int32"), repeats=seq_lens_this_time, axis=0
+        )
+
         all_indices = []
         for i in range(len(batch_ids)):
             start_pos = cached_kv_lens[i]
@@ -285,19 +290,25 @@ class FlashAttentionBackend(AttentionBackend):
         self.attention_metadata.rotary_sin_prefill = paddle.repeat_interleave(rot_sin, repeats=2, axis=-1)
 
     def update_rotary_embs_decoder(self, forward_meta: ForwardMeta):
-        if not self.enable_mm:  # only initialize once for text-only model
-            if self.attention_metadata.rotary_cos_decode is None or self.attention_metadata.rotary_sin_decode is None:
-                self.attention_metadata.rotary_cos_decode = forward_meta.rotary_embs[0, 0, :, 0, :].astype(self.dtype)
-                self.attention_metadata.rotary_sin_decode = forward_meta.rotary_embs[1, 0, :, 0, :].astype(self.dtype)
-        elif self.batch_ids_decode.shape[0] > 0:
-            bs = self.batch_ids_decode.shape[0]
+        if self.batch_ids_decode.shape[0] == 0:
+            return
+
+        bs = self.batch_ids_decode.shape[0]
+        if self.enable_mm:
             index = paddle.concat(
                 [self.batch_ids_decode.view([-1, 1]), self.seq_lens_dec.to("int64").view([-1, 1])], axis=1
             )
             rot_cos = paddle.gather_nd(forward_meta.rotary_embs[:, 0, 0, :, 0, :], index).view([bs, 1, 1, -1])
             rot_sin = paddle.gather_nd(forward_meta.rotary_embs[:, 1, 0, :, 0, :], index).view([bs, 1, 1, -1])
-            self.attention_metadata.rotary_cos_decode[:bs].copy_(paddle.repeat_interleave(rot_cos, repeats=2, axis=-1))
-            self.attention_metadata.rotary_sin_decode[:bs].copy_(paddle.repeat_interleave(rot_sin, repeats=2, axis=-1))
+        else:
+            rot_cos = paddle.gather(forward_meta.rotary_embs[0, 0, :, 0, :], self.seq_lens_dec).view([bs, 1, 1, -1])
+            rot_sin = paddle.gather(forward_meta.rotary_embs[1, 0, :, 0, :], self.seq_lens_dec).view([bs, 1, 1, -1])
+        self.attention_metadata.rotary_cos_decode[:bs].copy_(
+            paddle.repeat_interleave(rot_cos, repeats=2, axis=-1).astype(self.dtype)
+        )
+        self.attention_metadata.rotary_sin_decode[:bs].copy_(
+            paddle.repeat_interleave(rot_sin, repeats=2, axis=-1).astype(self.dtype)
+        )
 
     def get_attntion_meta(self) -> AttentionMetadata:
         """get_attntion_meta"""
@@ -395,6 +406,25 @@ class FlashAttentionBackend(AttentionBackend):
                             }
                     # non last block: seq_lens_this_time > block_size
                     else:
+                        if bool(self.num_layers_draft_model) and (
+                            seq_len < self.block_size and i < cur_used_num_blocks - 1
+                        ):
+                            cache_end = seq_len - cache_start
+                            assert cache_end <= self.block_size
+
+                            forward_meta.caches[k_cache_id][block_id, 0:cache_end, :, :] = slice_trans_k[
+                                cache_start:seq_len, :, :
+                            ]
+                            forward_meta.caches[v_cache_id][block_id, 0:cache_end, :, :] = slice_trans_v[
+                                cache_start:seq_len, :, :
+                            ]
+                            if layer_id == self.num_layers - 1:
+                                self.record_block_table_metadata[batch_idx] = {
+                                    "block_id": block_id.item(),
+                                    "cache_end": cache_end,
+                                }
+                            break
+
                         assert seq_len > self.block_size
                         cache_end = cache_start + self.block_size
                         forward_meta.caches[k_cache_id][block_id] = slice_trans_k[cache_start:cache_end, :, :]
@@ -403,9 +433,20 @@ class FlashAttentionBackend(AttentionBackend):
             tensor_start = tensor_end
 
     def forward_prefill(self, prefill_qkv, layer_id, k_cache_id, v_cache_id, forward_meta: ForwardMeta):
-        qkv = prefill_qkv.view([-1, self.num_heads + self.kv_num_heads * 2, self.head_dim])
-        q, k, v = qkv.split(num_or_sections=[self.num_heads, self.kv_num_heads, self.kv_num_heads], axis=-2)
-        q, k = apply_rope(q, k, self.attention_metadata.rotary_cos_prefill, self.attention_metadata.rotary_sin_prefill)
+        q, k, v = cache_kv_with_rope(
+            prefill_qkv,
+            forward_meta.caches[k_cache_id],
+            forward_meta.caches[v_cache_id],
+            self.block_table_prefill,
+            self.attention_metadata.rotary_cos_prefill,
+            self.attention_metadata.rotary_sin_prefill,
+            self.prefill_info_dict["cu_seqlens_q"],
+            self.batch_ids_q,
+            self.num_heads,
+            self.kv_num_heads,
+            self.head_dim,
+            self.block_size,
+        )
 
         prefill_out = flash_attn_unpadded_func(
             q,
@@ -419,23 +460,17 @@ class FlashAttentionBackend(AttentionBackend):
             causal=self.causal,
         )[0]
 
-        self.update_kv_cache(k, v, k_cache_id, v_cache_id, layer_id, forward_meta, self.batch_ids_prefill)
-
         return prefill_out
 
     def forward_decode(self, decode_qkv, k_cache_id, v_cache_id, forward_meta: ForwardMeta):
-        qkv = decode_qkv.view([-1, 1, self.num_heads + self.kv_num_heads * 2, self.head_dim])
-        q, k, v = qkv.split(num_or_sections=[self.num_heads, self.kv_num_heads, self.kv_num_heads], axis=-2)
-
-        if self.enable_mm:  # vl
-            q, k = apply_rope(
-                q, k, self.attention_metadata.rotary_cos_decode, self.attention_metadata.rotary_sin_decode
-            )
-            rotary_cos = None
-            rotary_sin = None
-        else:
-            rotary_cos = self.attention_metadata.rotary_cos_decode
-            rotary_sin = self.attention_metadata.rotary_sin_decode
+        q, k, v = apply_rope_qkv(
+            decode_qkv,
+            self.attention_metadata.rotary_cos_decode,
+            self.attention_metadata.rotary_sin_decode,
+            self.num_heads,
+            self.kv_num_heads,
+            self.head_dim,
+        )
 
         decode_out = flash_attn_kvcache_func(
             q,
@@ -445,8 +480,8 @@ class FlashAttentionBackend(AttentionBackend):
             self.block_table_dec,
             k,
             v,
-            rotary_cos=rotary_cos,
-            rotary_sin=rotary_sin,
+            rotary_cos=None,
+            rotary_sin=None,
             causal=self.causal,
             is_rotary_interleaved=True,
         )[0].squeeze(1)
diff --git a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py
index d803e3d31..9b65e073c 100644
--- a/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py
+++ b/fastdeploy/model_executor/layers/backends/metax/moe/fused_moe_cutlass_metax_backend.py
@@ -209,7 +209,8 @@ class MetaxCutlassMoEMethod(MoEMethodBase):
             None,
             (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
             (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
-            "weight_only_int8",
+            expert_idx_per_token,  # expert_idx_per_token: only for w4a8
+            self.moe_quant_type,
         )
 
     def apply_ep_prefill(
@@ -262,15 +263,26 @@ class MetaxCutlassMoEMethod(MoEMethodBase):
                 permute_indices_per_token,
                 topk_weights,
                 topk_idx,
+                expert_idx_per_token,  # only for w4a8
             ) = moe_expert_dispatch(
                 x,
                 gate_out,
+                None,  # Use layer.gate_correction_bias in get_moe_scores.
+                None,  # if set, permute_input will be int8_t
                 layer.top_k,
                 False,
+                self.moe_quant_type,
                 True,
             )
 
-            ffn_out = self.compute_ffn(layer, permute_input, token_nums_per_expert, None)
+            if not layer.with_bias and self.moe_quant_type != "w4a8" and self.moe_quant_type != "w4afp8":
+                # expert_idx_per_token is only needed for w4a8/w4afp8 or when the layer has a bias;
+                # no other configuration uses this tensor, so we set it to None.
+                expert_idx_per_token = None
+            else:
+                expert_idx_per_token = expert_idx_per_token.cast("int64")
+
+            ffn_out = self.compute_ffn(layer, permute_input, token_nums_per_expert, expert_idx_per_token)
 
             fused_moe_out = moe_expert_reduce(
                 ffn_out,
@@ -291,7 +303,7 @@ class MetaxCutlassMoEMethod(MoEMethodBase):
                 (layer.up_gate_proj_weight_scale if hasattr(layer, "up_gate_proj_weight_scale") else None),
                 None,
                 (layer.down_proj_weight_scale if hasattr(layer, "down_proj_weight_scale") else None),
-                "weight_only_int8",
+                self.moe_quant_type,
                 layer.top_k,
                 True,
                 False,
diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py
index e88ee0e5b..17556c441 100644
--- a/fastdeploy/model_executor/pre_and_post_process.py
+++ b/fastdeploy/model_executor/pre_and_post_process.py
@@ -344,21 +344,12 @@ def post_process_normal(
         model_output.stop_flags,
     )
 
-    if current_platform.is_cuda() or current_platform.is_iluvatar() or current_platform.is_dcu():
-        set_stop_value_multi_ends(
-            sampler_output.sampled_token_ids,
-            model_output.stop_flags,
-            model_output.seq_lens_this_time,
-            model_output.eos_token_id,
-            model_output.next_tokens,
-            model_output.pre_ids,
-            model_output.step_idx,
-            model_output.stop_token_ids,
-            model_output.stop_seqs_len,
-            model_output.min_tokens,
-            False,
-        )  # multi ends
-    elif current_platform.is_maca():
+    if (
+        current_platform.is_cuda()
+        or current_platform.is_iluvatar()
+        or current_platform.is_dcu()
+        or current_platform.is_maca()
+    ):
         set_stop_value_multi_ends(
             sampler_output.sampled_token_ids,
             model_output.stop_flags,
diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py
index bcb558fc6..46c71faf5 100644
--- a/fastdeploy/worker/metax_model_runner.py
+++ b/fastdeploy/worker/metax_model_runner.py
@@ -1819,7 +1819,7 @@ class MetaxModelRunner(ModelRunnerBase):
         )
 
         post_process(
-            sampler_output=sampler_output,
+            sampler_or_pooler_output=sampler_output,
             model_output=model_output_data,
             share_inputs=self.share_inputs,
             block_size=self.cache_config.block_size,