[Iluvatar GPU] Optimize attention and moe performance (#3234)

2025-10-18 06:31:17 +08:00 · 2025-08-08 10:51:24 +08:00
parent 37569cca86
commit fbdd6b0663
24 changed files with 1130 additions and 1653 deletions
--- a/custom_ops/gpu_ops/sample_kernels/utils.cuh
+++ b/custom_ops/gpu_ops/sample_kernels/utils.cuh
@@ -258,9 +258,13 @@ inline std::pair<int, int> GetCudaComputeCapability() {

 /******************* math *******************/
 __forceinline__ __device__ float ptx_rcp(float x) {
+#ifdef PADDLE_WITH_COREX  // CoreX build: skip the inline PTX path below
+  return __ivcorex_rcpf(x);  // NOTE(review): presumably the CoreX reciprocal intrinsic; confirm accuracy/FTZ behavior matches rcp.approx.ftz
+#else  // all other builds: PTX rcp.approx.ftz.f32 through inline asm
  float y;
  asm volatile("rcp.approx.ftz.f32 %0, %1;" : "=f"(y) : "f"(x));
  return y;
+#endif  // PADDLE_WITH_COREX
 }

 template <typename T1, typename T2>