[Optimize] Optimize tensorwise fp8 performance (#2729)

* [Optimize] Optimize tensorwise fp8 performance
2025-10-05 08:37:06 +08:00 · 2025-07-07 20:06:28 +08:00
parent 1b54a2831e
commit ef6649a577
6 changed files with 318 additions and 88 deletions
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -442,6 +442,7 @@ elif paddle.is_compiled_with_cuda():
            "gpu_ops/scaled_gemm_f8_i4_f16_weight_quantize.cu",
            "gpu_ops/cutlass_kernels/cutlass_heuristic.cu",
            "gpu_ops/cutlass_kernels/cutlass_preprocessors.cu",
+            "gpu_ops/fused_hadamard_quant_fp8.cu"
        ]

        sources += find_end_files(fp8_auto_gen_directory, ".cu")