From e38709b499a9340c96195232fb7699adff57cd30 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Wed, 10 Dec 2025 11:03:19 +0800 Subject: [PATCH] [BugFix] Fix limit_thinking early return logic in CUDA kernels (#5471) * Initial plan * [BugFix] Fix limit_thinking bug - change AND to OR in condition checks Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com> * Update Chinese comments to reflect OR logic instead of AND Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: yuanlehome <23653004+yuanlehome@users.noreply.github.com> --- custom_ops/gpu_ops/limit_thinking_content_length_v1.cu | 4 ++-- custom_ops/gpu_ops/limit_thinking_content_length_v2.cu | 4 ++-- .../speculate_limit_thinking_content_length_v1.cu | 4 ++-- .../speculate_limit_thinking_content_length_v2.cu | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu index 45bf8f704..89e1e1c66 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu @@ -32,8 +32,8 @@ __global__ void limit_thinking_content_length_kernel_v1( const int max_think_len = max_think_lens[bid]; if (max_think_len < 0) return; int current_limit_think_status = limit_think_status[bid]; - // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行 - if (current_limit_think_status == 2 && stop_flags[bid]) { + // 如果在回复阶段, 或者已经触发停止标志, 则直接返回, 无需多余执行 + if (current_limit_think_status == 2 || stop_flags[bid]) { return; } diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu index ea5f8c9c4..8ff42058f 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu @@ -34,8 +34,8 @@ __global__ void limit_thinking_content_length_kernel_v2( const int max_think_len = max_think_lens[bid]; if (max_think_len < 0) return; int current_limit_think_status = limit_think_status[bid]; - // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行 - if (current_limit_think_status == 3 && stop_flags[bid]) { + // 如果在回复阶段, 或者已经触发停止标志, 则直接返回, 无需多余执行 + if (current_limit_think_status == 3 || stop_flags[bid]) { return; } diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu index 0a703639c..a18f3b2bc 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu @@ -38,8 +38,8 @@ __global__ void speculate_limit_thinking_content_length_kernel_v1( const int max_think_len = max_think_lens[bid]; if (max_think_len < 0) return; int current_limit_think_status = limit_think_status[bid]; - // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行 - if (current_limit_think_status == 2 && stop_flags[bid]) { + // 如果在回复阶段, 或者已经触发停止标志, 则直接返回, 无需多余执行 + if (current_limit_think_status == 2 || stop_flags[bid]) { return; } diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu index 709911d2b..270ab07e1 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu @@ -41,8 +41,8 @@ __global__ void speculate_limit_thinking_content_length_kernel_v2( const int max_think_len = max_think_lens[bid]; if (max_think_len < 0) return; int current_limit_think_status = limit_think_status[bid]; - // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. - if (current_limit_think_status == 3 && stop_flags[bid]) { + // 如果在回复阶段, 或者已经触发停止标志, 则直接返回, 无需多余执行. + if (current_limit_think_status == 3 || stop_flags[bid]) { return; }