support w4afp8 EP inference (#3044)

2025-10-05 16:48:03 +08:00 · 2025-08-25 11:27:45 +08:00
parent 46664985fc
commit 9205c88da1
17 changed files with 995 additions and 99 deletions
--- a/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
+++ b/custom_ops/utils/auto_gen_w4afp8_gemm_kernel.py
@@ -36,8 +36,8 @@ void w4afp8_gemm_M{M}_N{N}_TAILN{TAILN}_K{K}_B{BATCH}_P{PADDING}_{TYPE}(
    {cutlass_type} * out,
    const float *weight_scale,
    const float *input_row_sum,
-    const int *tokens,
-    const int max_tokens,
+    const int64_t *tokens,
+    const int64_t max_tokens,
    cudaStream_t stream);
 """

@@ -54,8 +54,8 @@ void w4afp8_gemm_M{M}_N{N}_TAILN{TAILN}_K{K}_B{BATCH}_P{PADDING}_{TYPE}(
        {cutlass_type} * out,
        const float *weight_scale,
        const float *input_row_sum,
-        const int *tokens,
-        const int max_tokens,
+        const int64_t *tokens,
+        const int64_t max_tokens,
        cudaStream_t stream) {{

    constexpr static int M = {M};