[Iluvatar GPU] Optimize attention and moe performance (#3234)

This commit is contained in:
yzwu
2025-08-08 10:51:24 +08:00
committed by GitHub
parent 37569cca86
commit fbdd6b0663
24 changed files with 1130 additions and 1653 deletions

View File

@@ -258,9 +258,13 @@ inline std::pair<int, int> GetCudaComputeCapability() {
/******************* math *******************/
// Fast reciprocal of `x` for device code.
//
// Without PADDLE_WITH_COREX the value is produced by the PTX instruction
// `rcp.approx.ftz.f32` (approximate, flush-to-zero single-precision
// reciprocal). With PADDLE_WITH_COREX defined the call is forwarded to
// `__ivcorex_rcpf` instead.
// NOTE(review): `__ivcorex_rcpf` is not declared in this view; its accuracy
// and denormal handling are presumably comparable to the PTX path -- confirm.
__forceinline__ __device__ float ptx_rcp(float x) {
#ifndef PADDLE_WITH_COREX
  float reciprocal;
  // "=f" binds the output to a 32-bit float register, "f" the input likewise.
  asm volatile("rcp.approx.ftz.f32 %0, %1;" : "=f"(reciprocal) : "f"(x));
  return reciprocal;
#else
  return __ivcorex_rcpf(x);
#endif
}
template <typename T1, typename T2>