[MetaxGPU] Support FastDeploy on metax gpu (#3241)

* [MetaxGPU] Support FastDeploy on metax gpu

* Update metax_worker.py

1. Change the worker log;
2. Remove custom allreduce; adapt it later (the guarded PTX helpers appear in the diff below);
3. Remove CUDA graph (a capture/replay sketch follows below).
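
For context, a minimal sketch of the stream-capture-and-replay pattern that "CUDA graph" refers to, using the standard CUDA runtime API; the kernel and launch sizes are illustrative, not taken from metax_worker.py:

#include <cuda_runtime.h>

__global__ void dummy_step(float *x) { x[threadIdx.x] += 1.0f; }

void capture_and_replay(float *d_x) {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  cudaGraph_t graph;
  cudaGraphExec_t graph_exec;

  // Record the launch into a graph instead of executing it immediately.
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
  dummy_step<<<1, 32, 0, stream>>>(d_x);
  cudaStreamEndCapture(stream, &graph);

  // Instantiate once, then replay with low per-launch overhead.
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);
  cudaGraphLaunch(graph_exec, stream);
  cudaStreamSynchronize(stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
  cudaStreamDestroy(stream);
}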

* Update __init__.py

1. Remove Metax's keyword comment.

* Update __init__.py

1. Remove Metax's keyword comment;
2. Add the fused_moe_kernel_paddle import.

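The diff below wraps the inline-PTX flag helpers used by the custom allreduce in #ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU, since NVIDIA PTX does not compile for Metax. As a hedged sketch of what such acquire/release flag helpers typically look like on CUDA (the bodies here are illustrative; only the signatures appear in the hunk, and the mode parameter is omitted):

#include <cstdint>

// System-scope acquire load: reads after this cannot be reordered before it.
__forceinline__ __device__ uint32_t ld_flag_acquire_sketch(uint32_t *flag_addr) {
  uint32_t flag;
  asm volatile("ld.acquire.sys.global.u32 %0, [%1];"
               : "=r"(flag)
               : "l"(flag_addr));
  return flag;
}

// System-scope release store: prior writes are visible once the flag is seen.
__forceinline__ __device__ void st_flag_release_sketch(uint32_t *flag_addr,
                                                       uint32_t flag) {
  asm volatile("st.release.sys.global.u32 [%0], %1;"
               :
               : "l"(flag_addr), "r"(flag));
}
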
---------

Co-authored-by: yongqiangma <xing.wo@163.com>
Author: Kane2011
Committed by: GitHub
Date: 2025-08-13 11:11:54 +08:00
Parent: ed6bff215a
Commit: b4fef2cf29
29 changed files with 3224 additions and 11 deletions


@@ -509,6 +509,7 @@ static void PrintMatrix3(const T *mat_d, int num, std::string name) {
 }
 #ifndef PADDLE_WITH_HIP
+#ifndef PADDLE_WITH_CUSTOM_DEVICE_METAX_GPU
 __forceinline__ __device__ uint32_t ld_flag_acquire(uint32_t *flag_addr,
                                                     int mode = 0) {
   uint32_t flag;
@@ -541,7 +542,7 @@ __forceinline__ __device__ void st_flag_release(uint32_t *flag_addr,
                "l"(flag_addr));
   }
 }
+#endif
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
   int max_shared_mem_per_block_opt_in = 0;
   cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
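
The truncated hunk above queries the opt-in shared-memory limit. For reference, a minimal sketch of how that attribute is typically queried and then applied to a kernel with the standard CUDA runtime API; the kernel name is illustrative, not from this repo:

#include <cuda_runtime.h>

__global__ void my_kernel(float *out);  // illustrative kernel

// Read the device's opt-in per-block shared memory cap, then raise the
// kernel's dynamic shared memory limit to that cap.
int set_max_dynamic_smem(int device) {
  int max_shared_mem_per_block_opt_in = 0;
  cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin,
                         device);
  cudaFuncSetAttribute(my_kernel,
                       cudaFuncAttributeMaxDynamicSharedMemorySize,
                       max_shared_mem_per_block_opt_in);
  return max_shared_mem_per_block_opt_in;
}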