fix cutlass ep (#5337)

2025-12-24 13:28:13 +08:00 · 2025-12-03 14:06:01 +08:00
parent 690bcb8e50
commit d5a9b75b4e
1 changed file with 89 additions and 70 deletions
--- a/custom_ops/gpu_ops/moe/moe_topk_select.cu
+++ b/custom_ops/gpu_ops/moe/moe_topk_select.cu
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 // Ignore CUTLASS warnings about type punning
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
@@ -39,20 +38,35 @@ void moe_topk_select_kernel(const T* input,
                            const int64_t k,
                            cudaStream_t stream,
                            const bool apply_norm_weight = false,
-                            const bool enable_softmax_top_k_fused = false
+                            const bool enable_softmax_top_k_fused = false) {
                          ) {
  static constexpr int WARPS_PER_TB = 4;
-  #define LAUNCH_TOPK_GATING_SOFTMAX_HELPER(N)                                   \
+#define LAUNCH_TOPK_GATING_SOFTMAX_HELPER(N)                          \
-  case N: {                                                                      \
+  case N: {                                                           \
-    if (apply_norm_weight) {                                                     \
+    if (apply_norm_weight) {                                          \
-      topk_gating_softmax_launcher_helper<T, N, WARPS_PER_TB, true>(             \
+      topk_gating_softmax_launcher_helper<T, N, WARPS_PER_TB, true>(  \
-          input, bias, output, indices, source_row, num_rows, num_experts, k, stream); \
+          input,                                                      \
-    } else {                                                                     \
+          bias,                                                       \
-      topk_gating_softmax_launcher_helper<T, N, WARPS_PER_TB, false>(            \
+          output,                                                     \
-          input, bias, output, indices, source_row, num_rows, num_experts, k, stream); \
+          indices,                                                    \
-    }                                                                            \
+          source_row,                                                 \
-    break;                                                                       \
+          num_rows,                                                   \
          num_experts,                                                \
          k,                                                          \
          stream);                                                    \
    } else {                                                          \
      topk_gating_softmax_launcher_helper<T, N, WARPS_PER_TB, false>( \
          input,                                                      \
          bias,                                                       \
          output,                                                     \
          indices,                                                    \
          source_row,                                                 \
          num_rows,                                                   \
          num_experts,                                                \
          k,                                                          \
          stream);                                                    \
    }                                                                 \
    break;                                                            \
  }
  switch (num_experts) {
    LAUNCH_TOPK_GATING_SOFTMAX_HELPER(2)
@@ -68,56 +82,56 @@ void moe_topk_select_kernel(const T* input,
      static constexpr int TPB = 256;
      const auto config_topk = Get1DBlocksAnd2DGridsMoe(num_rows);
      if (!enable_softmax_top_k_fused) {
-          moe_softmax<T, TPB><<<config_topk.block_per_grid, TPB, 0, stream>>>(
+        moe_softmax<T, TPB><<<config_topk.block_per_grid, TPB, 0, stream>>>(
-              input, softmax, num_experts, num_rows);
+            input, softmax, num_experts, num_rows);
-          if (apply_norm_weight) {
+        if (apply_norm_weight) {
-            moe_top_k<T, TPB, true>
+          moe_top_k<T, TPB, true>
-                <<<config_topk.block_per_grid, TPB, k * sizeof(T), stream>>>(softmax,
+              <<<config_topk.block_per_grid, TPB, k * sizeof(T), stream>>>(
-                                                                 bias,
+                  softmax,
-                                                                 output,
+                  bias,
-                                                                 indices,
+                  output,
-                                                                 source_row,
+                  indices,
-                                                                 num_experts,
+                  source_row,
-                                                                 k,
+                  num_experts,
-                                                                 num_rows);
+                  k,
-          } else {
+                  num_rows);
-            moe_top_k<T, TPB, false>
+        } else {
-                <<<config_topk.block_per_grid, TPB, 0, stream>>>(softmax,
+          moe_top_k<T, TPB, false>
-                                                                  bias,
+              <<<config_topk.block_per_grid, TPB, 0, stream>>>(softmax,
-                                                                  output,
+                                                               bias,
-                                                                  indices,
+                                                               output,
-                                                                  source_row,
+                                                               indices,
-                                                                  num_experts,
+                                                               source_row,
-                                                                  k,
+                                                               num_experts,
-                                                                  num_rows);
+                                                               k,
-          }
+                                                               num_rows);
-          cudaGetLastError();
+        }
        cudaGetLastError();
      } else {
        assert(k <= TPB);
        if (apply_norm_weight) {
          moe_softmax_top_k_fused<T, TPB, true>
              <<<config_topk.block_per_grid, TPB, k * sizeof(T), stream>>>(
                  input,
                  bias,
                  output,
                  indices,
                  source_row,
                  num_experts,
                  k,
                  num_rows);
        } else {
          moe_softmax_top_k_fused<T, TPB, false>
              <<<config_topk.block_per_grid, TPB, 0, stream>>>(input,
                                                               bias,
                                                               output,
                                                               indices,
                                                               source_row,
                                                               num_experts,
                                                               k,
                                                               num_rows);
        }
      }
      else {
          assert(k<=TPB);
          if (apply_norm_weight) {
            moe_softmax_top_k_fused<T, TPB, true>
                <<<config_topk.block_per_grid, TPB, k * sizeof(T), stream>>>(input,
                                                                 bias,
                                                                 output,
                                                                 indices,
                                                                 source_row,
                                                                 num_experts,
                                                                 k,
                                                                 num_rows);
          } else {
            moe_softmax_top_k_fused<T, TPB, false>
                <<<config_topk.block_per_grid, TPB, 0, stream>>>(input,
                                                                  bias,
                                                                  output,
                                                                  indices,
                                                                  source_row,
                                                                  num_experts,
                                                                  k,
                                                                  num_rows);
          }
      }
    }
  }
 }
@@ -146,6 +160,13 @@ std::vector<paddle::Tensor> MoETopKSelectKernel(
  auto topk_weights =
      GetEmptyTensor({num_rows, moe_topk}, paddle::DataType::FLOAT32, place);
  // NOTE(sunxin): Avoid "invalid configuration argument" error caused by empty
  // tensors.
  if (gating_dims[0] == 0) {
    cudaGetLastError();
    return {topk_ids, topk_weights};
  }
  const int num_moe_inputs = AlignTo16(num_rows * moe_topk);
  const int bytes = num_moe_inputs * sizeof(int);
@@ -213,8 +234,7 @@ std::vector<std::vector<int64_t>> MoETopKSelectKernelInferShape(
  }
  const int num_rows = token_rows;
-  return {{num_rows, moe_topk},
+  return {{num_rows, moe_topk}, {num_rows, moe_topk}};
          {num_rows, moe_topk}};
 }
 std::vector<paddle::DataType> MoETopKSelectKernelInferDtype(
@@ -223,16 +243,15 @@ std::vector<paddle::DataType> MoETopKSelectKernelInferDtype(
    const int moe_topk,
    const bool apply_norm_weight,
    const bool enable_softmax_top_k_fused) {
-  return {paddle::DataType::INT64,
+  return {paddle::DataType::INT64, paddle::DataType::FLOAT32};
          paddle::DataType::FLOAT32};
 }
 PD_BUILD_STATIC_OP(moe_topk_select)
    .Inputs({"gating_logits", paddle::Optional("bias")})
-    .Outputs({"topk_ids",
+    .Outputs({"topk_ids", "topk_weights"})
-              "topk_weights"})
+    .Attrs({"moe_topk:int",
-    .Attrs({"moe_topk:int", "apply_norm_weight:bool",  "enable_softmax_top_k_fused:bool"})
+            "apply_norm_weight:bool",
            "enable_softmax_top_k_fused:bool"})
    .SetKernelFn(PD_KERNEL(MoETopKSelectKernel))
    .SetInferShapeFn(PD_INFER_SHAPE(MoETopKSelectKernelInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(MoETopKSelectKernelInferDtype));