support w4afp8 EP inference (#3382)

Author: Yuan Xiaolan
Date: 2025-08-13 21:41:34 +08:00
Committed by: GitHub
Parent commit: 4dbaa3d74c
Commit: 2513cd929b
17 changed files with 944 additions and 100 deletions


@@ -36,8 +36,8 @@ void w4afp8_gemm_M{M}_N{N}_TAILN{TAILN}_K{K}_B{BATCH}_P{PADDING}_{TYPE}(
     {cutlass_type} * out,
     const float *weight_scale,
     const float *input_row_sum,
-    const int *tokens,
-    const int max_tokens,
+    const int64_t *tokens,
+    const int64_t max_tokens,
     cudaStream_t stream);
 """
@@ -54,8 +54,8 @@ void w4afp8_gemm_M{M}_N{N}_TAILN{TAILN}_K{K}_B{BATCH}_P{PADDING}_{TYPE}(
     {cutlass_type} * out,
     const float *weight_scale,
     const float *input_row_sum,
-    const int *tokens,
-    const int max_tokens,
+    const int64_t *tokens,
+    const int64_t max_tokens,
     cudaStream_t stream) {{
     constexpr static int M = {M};
@@ -204,4 +204,4 @@ for type in dtype:
 }"""
 )
-template_head_file.close()
+template_head_file.close()
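
A hedged sketch of the surrounding generator loop follows, under stated assumptions: the shape list, output file name, and dtype-to-CUTLASS mapping below are made up for illustration, and only the for type in dtype: loop header and the single template_head_file.close() after it mirror the diff's last hunk. It reuses the gemm_template_head string from the sketch above.

# Assumed driver loop: one specialization is appended to the generated header
# per (dtype, tile-shape) combination, and the header file is closed once.
dtype = ["BF16", "FP16"]              # assumed instantiation list
tiles = [(128, 256, 64, 512)]         # assumed (M, N, TAILN, K) tile shapes

template_head_file = open("w4afp8_gemm_template.h", "w")  # assumed file name
for type in dtype:                    # mirrors the loop variable in the diff
    cutlass_type = ("cutlass::bfloat16_t" if type == "BF16"
                    else "cutlass::half_t")
    for M, N, TAILN, K in tiles:
        template_head_file.write(gemm_template_head.format(
            M=M, N=N, TAILN=TAILN, K=K, BATCH=8, PADDING=0,
            TYPE=type, cutlass_type=cutlass_type,
        ))
template_head_file.close()            # closed once after the loop, as in the diff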