[Graph Optimization][Speculative Decoding] Update yaml and fix typo (#4612)

This commit is contained in:
RAM
2025-10-28 11:43:26 +08:00
committed by GitHub
parent b2c6c41447
commit 86d5006a57
2 changed files with 4 additions and 4 deletions

View File

@@ -1,6 +1,6 @@
max_model_len: 32768
max_num_seqs: 96
-gpu_memory_utilization: 0.9
+gpu_memory_utilization: 0.85
kv_cache_ratio: 0.71
tensor_parallel_size: 4
quantization: wint4

View File

@@ -74,7 +74,7 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel(
const int token_id = linear_index / hidden_size;
const int ori_bi = batch_id_per_token[token_id];
-if (ori_bi == -1) return; // NOTE(gongshaotian): For CUDAGraph padding
+if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding
if (seq_lens_decoder[ori_bi] == 0) continue;
const int bias = linear_index % hidden_size;
const int hi = bias / head_size; // q + k + v
@@ -378,7 +378,7 @@ __global__ void append_speculate_cache_rope_kernel(
linear_index += step) {
const int token_id = linear_index / hidden_size;
const int ori_bi = batch_id_per_token[token_id];
-if (ori_bi == -1) return; // NOTE(gongshaotian): For CUDAGraph padding
+if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding
if (seq_lens_decoder[ori_bi] == 0) continue;
const int bias = linear_index % hidden_size;
@@ -508,7 +508,7 @@ __global__ void append_speculate_cache_neox_rope_kernel(
linear_index += step) {
const int token_id = linear_index / half_hidden_size;
const int ori_bi = batch_id_per_token[token_id];
-if (ori_bi == -1) return; // NOTE(gongshaotian): For CUDAGraph padding
+if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding
if (seq_lens_decoder[ori_bi] == 0) continue;
const int bias = linear_index % half_hidden_size;
const int hi = bias / half_head_size; // q + k + v