diff --git a/benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml b/benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml index c609fba49..974c2eaf7 100644 --- a/benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml +++ b/benchmarks/yaml/eb45-32k-wint4-mtp-h800-tp4.yaml @@ -1,6 +1,6 @@ max_model_len: 32768 max_num_seqs: 96 -gpu_memory_utilization: 0.9 +gpu_memory_utilization: 0.85 kv_cache_ratio: 0.71 tensor_parallel_size: 4 quantization: wint4 diff --git a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh index 56026157a..d5d62dbf2 100644 --- a/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh +++ b/custom_ops/gpu_ops/append_attn/speculate_write_cache_with_rope_impl.cuh @@ -74,7 +74,7 @@ __global__ void append_speculate_cache_T_rope_qk_norm_kernel( const int token_id = linear_index / hidden_size; const int ori_bi = batch_id_per_token[token_id]; - if (ori_bi == -1) return; // NOTE(gongshaotian): For CUDAGraph padding + if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding if (seq_lens_decoder[ori_bi] == 0) continue; const int bias = linear_index % hidden_size; const int hi = bias / head_size; // q + k + v @@ -378,7 +378,7 @@ __global__ void append_speculate_cache_rope_kernel( linear_index += step) { const int token_id = linear_index / hidden_size; const int ori_bi = batch_id_per_token[token_id]; - if (ori_bi == -1) return; // NOTE(gongshaotian): For CUDAGraph padding + if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding if (seq_lens_decoder[ori_bi] == 0) continue; const int bias = linear_index % hidden_size; @@ -508,7 +508,7 @@ __global__ void append_speculate_cache_neox_rope_kernel( linear_index += step) { const int token_id = linear_index / half_hidden_size; const int ori_bi = batch_id_per_token[token_id]; - if (ori_bi == -1) return; // NOTE(gongshaotian): For CUDAGraph padding + if (ori_bi == -1) continue; // NOTE(gongshaotian): For CUDAGraph padding if (seq_lens_decoder[ori_bi] == 0) continue; const int bias = linear_index % half_hidden_size; const int hi = bias / half_head_size; // q + k + v