FastDeploy/custom_ops/gpu_ops/moe/template_config.json

{
  "moe_fast_hardamard_impl": {
    "name": "moe_fast_hardamard_impl",
    "function_name": "MoeFastHardamardImplWrapper",
    "impl_file": "moe_fast_hardamard_impl.cuh",
    "template_params": [
      "T",
      "OutT",
      "kLogN",
      "VecSize",
      "kNChunks",
      "kThreads",
      "UseDiagonalBlockMatrix"
    ],
    "dispatch_params": {},
    "data_types": [
      ["phi::dtype::float16", "phi::dtype::float16", "float16_float16"],
      ["phi::dtype::float16", "int8_t", "float16_int8"],
      ["phi::dtype::bfloat16", "phi::dtype::bfloat16", "bfloat16_bfloat16"],
      ["phi::dtype::bfloat16", "int8_t", "bfloat16_int8"]
    ],
    "max_instances_per_file": 16,
    "file_prefix": "moe_fast_hardamard_impl_",
    "function_signature": "template void {function_name}{template_args}(\n    const T *x,\n    const int64_t *expert_idx_per_token,\n    const int64_t *recv_expert_count,\n    const T *shift,\n    const T *smooth,\n    const float* quant_scales,\n    const int quant_round_type,\n    const float quant_max_bound,\n    const float quant_min_bound,\n    const int64_t token_num,\n    const int64_t dim,\n    const int num_max_tokens_per_expert,\n    bool used_in_ep_low_latency,\n    OutT* out,\n    cudaStream_t stream);\n\n"
  }
}