Mirror of https://github.com/PaddlePaddle/FastDeploy.git
Supports DP+TP+EP hybrid parallel deployment strategy (#3489)
* Support DP+TP+EP hybrid parallel deployment strategy
* fix conflict
* add moe_tp_ep function split_allgather_out
* del tp_group in moe_cutlass_backend
* for ci
* fix parallel_config for ci
* del log
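The 32 and 48 cases added in the first hunk below line up with how a MoE layer's experts are sharded once expert parallelism is in play: each EP rank only owns total_experts / ep_size experts. The C++ sketch below only illustrates that arithmetic; the function name and the example expert counts are assumptions for illustration, not FastDeploy code.

#include <cassert>
#include <cstddef>
#include <cstdio>

// Illustrative only: how many experts each expert-parallel rank owns.
// The equivalent bookkeeping in FastDeploy lives in its MoE/parallel config;
// this helper name is hypothetical.
static size_t NumExpertsPerRank(size_t total_experts, size_t ep_size) {
  assert(ep_size > 0 && total_experts % ep_size == 0);
  return total_experts / ep_size;
}

int main() {
  // e.g. a 64-expert model: EP=1 keeps 64 experts per rank, EP=2 gives 32,
  // which is why the dispatch macro in the hunk below gains a `case 32`.
  printf("%zu\n", NumExpertsPerRank(64, 1));  // 64
  printf("%zu\n", NumExpertsPerRank(64, 2));  // 32
  printf("%zu\n", NumExpertsPerRank(96, 2));  // 48
  return 0;
}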
@@ -43,11 +43,16 @@
       __VA_ARGS__                                    \
       break;                                         \
     }                                                \
-    case 48: {                                       \
-      constexpr size_t NUM_EXPERTS_PER_RANK = 48;    \
-      __VA_ARGS__                                    \
-      break;                                         \
-    }                                                \
+    case 32: {                                       \
+      constexpr size_t NUM_EXPERTS_PER_RANK = 32;    \
+      __VA_ARGS__                                    \
+      break;                                         \
+    }                                                \
+    case 48: {                                       \
+      constexpr size_t NUM_EXPERTS_PER_RANK = 48;    \
+      __VA_ARGS__                                    \
+      break;                                         \
+    }                                                \
     case 64: {                                       \
       constexpr size_t NUM_EXPERTS_PER_RANK = 64;    \
       __VA_ARGS__                                    \
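The hunk above extends what looks like a switch-based dispatch macro: a runtime experts-per-rank value is mapped onto a constexpr NUM_EXPERTS_PER_RANK so that the body pasted in through __VA_ARGS__ can be compiled once per supported size. Below is a minimal, self-contained sketch of that pattern; the macro name, kernel stub, and default branch are hypothetical, not the repository's actual definitions.

#include <cstddef>
#include <cstdio>

// Hypothetical dispatch macro in the style of the hunk above: switch on a
// runtime value and re-expose it to the body as a compile-time constant.
#define DISPATCH_NUM_EXPERTS_PER_RANK(num_experts_per_rank, ...)        \
  switch (num_experts_per_rank) {                                       \
    case 32: {                                                          \
      constexpr size_t NUM_EXPERTS_PER_RANK = 32;                       \
      __VA_ARGS__                                                       \
      break;                                                            \
    }                                                                   \
    case 48: {                                                          \
      constexpr size_t NUM_EXPERTS_PER_RANK = 48;                       \
      __VA_ARGS__                                                       \
      break;                                                            \
    }                                                                   \
    case 64: {                                                          \
      constexpr size_t NUM_EXPERTS_PER_RANK = 64;                       \
      __VA_ARGS__                                                       \
      break;                                                            \
    }                                                                   \
    default:                                                            \
      printf("unsupported experts per rank: %zu\n",                     \
             static_cast<size_t>(num_experts_per_rank));                \
      break;                                                            \
  }

// Stand-in for the templated kernel launch that the real macro wraps.
template <size_t kExpertsPerRank>
void RunMoeKernelStub() {
  printf("kernel specialized for %zu experts per rank\n", kExpertsPerRank);
}

int main() {
  size_t experts_per_rank = 48;  // e.g. 96 experts sharded over EP=2 ranks
  DISPATCH_NUM_EXPERTS_PER_RANK(experts_per_rank,
                                RunMoeKernelStub<NUM_EXPERTS_PER_RANK>(););
  return 0;
}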
@@ -105,7 +105,8 @@ void SaveOutMmsg(const paddle::Tensor& x,
                  int64_t rank_id,
                  int msg_queue_id,
                  bool save_each_rank) {
-  if (!save_each_rank && rank_id > 0) {
+  // don't use save_each_rank now!
+  if (rank_id > 0) {
     return;
   }
   if (x.place() == paddle::CPUPlace()) {
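The hunk above hard-codes the rank gate in SaveOutMmsg: regardless of the save_each_rank flag, only rank 0 pushes output to the message queue, presumably so that TP/EP replicas holding identical sampled tokens do not emit duplicate messages. The sketch below only illustrates that early-return gating; the queue helper, token layout, and function name are hypothetical stand-ins for the real code.

#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative stand-in for the message-queue push that SaveOutMmsg performs.
static void PushToMsgQueue(const std::vector<int64_t>& tokens, int msg_queue_id) {
  printf("queue %d: pushed %zu tokens\n", msg_queue_id, tokens.size());
}

// Sketch of the gating after this change: save_each_rank is accepted but
// ignored, and only rank 0 ever writes to the queue.
static void SaveOutSketch(const std::vector<int64_t>& tokens,
                          int64_t rank_id,
                          int msg_queue_id,
                          bool /*save_each_rank*/) {
  // don't use save_each_rank now!
  if (rank_id > 0) {
    return;
  }
  PushToMsgQueue(tokens, msg_queue_id);
}

int main() {
  std::vector<int64_t> tokens = {101, 202, 303};
  SaveOutSketch(tokens, /*rank_id=*/0, /*msg_queue_id=*/0, /*save_each_rank=*/false);
  SaveOutSketch(tokens, /*rank_id=*/1, /*msg_queue_id=*/0, /*save_each_rank=*/false);  // no-op
  return 0;
}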