diff --git a/custom_ops/gpu_ops/rebuild_padding.cu b/custom_ops/gpu_ops/rebuild_padding.cu
index 381524ef5..a80db0303 100644
--- a/custom_ops/gpu_ops/rebuild_padding.cu
+++ b/custom_ops/gpu_ops/rebuild_padding.cu
@@ -80,30 +80,10 @@ __global__ void RebuildAppendPaddingKernel(T *output_data,
          &src_vec);
     Store(src_vec, &output_data[i]);
 
-    // printf(
-    //     "[normal] out_token_id: %d, ori_token_id: %d, input_token_id: %d "
-    //     "bias_idx: %d, bid: %d, seq_id: %d\n",
-    //     out_token_id,
-    //     ori_token_id,
-    //     input_token_id,
-    //     bias_idx,
-    //     bi,
-    //     seq_id);
-
     if (enable_logprob && seq_len_encoder[bi] > 0) {
       int first_token_seq_id = seq_len_encoder[bi] - 2;
       const int first_token_id =
           ori_token_id - cum_offset_bi + first_token_seq_id;
-      // printf(
-      //     "[first token] out_token_id: %d, ori_token_id: %d, "
-      //     "first_token_id: %d, bias_idx: %d, bid: %d, "
-      //     "first_token_seq_id: %d\n",
-      //     out_token_id,
-      //     ori_token_id,
-      //     first_token_id,
-      //     bias_idx,
-      //     bi,
-      //     first_token_seq_id);
       Load(&input_data[first_token_id * dim_embed + bias_idx],
            &src_vec);
       Store(src_vec, &first_token_out[i]);
@@ -153,9 +133,6 @@ std::vector rebuild_padding(
                        0,
                        D,
                        tmp_out.place());
-    // printf("token_num: %d, need_delete_token_num: %d\n",
-    //        token_num,
-    //        need_delete_token_num);
   } else {
     out = paddle::full(
         {bsz, dim_embed}, 0, tmp_out.dtype(), tmp_out.place());
@@ -169,10 +146,6 @@ std::vector rebuild_padding(
   printf("elem_nums: %d\n", elem_nums);
 
   if (output_padding_offset) {
-    // if (first_token_out.is_initialized()) {
-    //   printf("first_token_out is initialized, enable_logprob: %d\n",
-    //          enable_logprob);
-    // }
     RebuildAppendPaddingKernel
         <<>>(
             reinterpret_cast(out.data()),
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_get_output_with_topk.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_get_output_with_topk.cc
index 922198f95..1dc7e416c 100644
--- a/custom_ops/gpu_ops/speculate_decoding/speculate_get_output_with_topk.cc
+++ b/custom_ops/gpu_ops/speculate_decoding/speculate_get_output_with_topk.cc
@@ -26,7 +26,7 @@
 #define MAX_BSZ 512
 #define K 20
 #define MAX_DRAFT_TOKEN_NUM 6
-#define SPECULATE_GET_WITH_OUTPUT_DEBUG
+// #define SPECULATE_GET_WITH_OUTPUT_DEBUG
 
 struct batch_msgdata {
   int tokens[MAX_DRAFT_TOKEN_NUM * (K + 1)];
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc
index 10f547cea..b345893c9 100644
--- a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc
+++ b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc
@@ -26,7 +26,7 @@
 #define MAX_BSZ 512
 #define K 20
 #define MAX_DRAFT_TOKEN_NUM 6
-#define SPECULATE_SAVE_WITH_OUTPUT_DEBUG
+// #define SPECULATE_SAVE_WITH_OUTPUT_DEBUG
 
 struct batch_msgdata {
   int tokens[MAX_DRAFT_TOKEN_NUM * (K + 1)];
@@ -134,7 +134,6 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids,
     for (int j = 0; j < cur_token_num; j++) {
       auto* cur_tokens = &cur_batch_msg_sed->tokens[j * (K + 1)];
       auto* cur_scores = &cur_batch_msg_sed->scores[j * (K + 1)];
-      std::cout << "token_offset: " << token_offset << std::endl;
       for (int k = 0; k < K + 1; k++) {
         if (k == 0) {
           cur_tokens[k] =
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index c08128317..e75396530 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -403,8 +403,6 @@ class EngineArgs:
         if self.dynamic_load_weight:
             self.enable_prefix_caching = False
         if self.enable_logprob:
-            # if self.speculative_config is not None:
-            #     raise NotImplementedError("Logprob does not support speculation_config.")
             if not current_platform.is_cuda():
                 raise NotImplementedError("Only CUDA platform supports logprob.")
             if self.splitwise_role != "mixed":
diff --git a/fastdeploy/model_executor/layers/sample/sampler.py b/fastdeploy/model_executor/layers/sample/sampler.py
index bc1092caf..8c5e7190d 100644
--- a/fastdeploy/model_executor/layers/sample/sampler.py
+++ b/fastdeploy/model_executor/layers/sample/sampler.py
@@ -290,11 +290,8 @@ class Sampler(nn.Layer):
         # Get with the logprob of the prompt or sampled token.
         token_logprobs = paddle.take_along_axis(logprobs, token_ids, axis=-1)
-        print(f"[Sampler] logprobs: {logprobs}")
-        print(f"[Sampler] token_logprobs: {token_logprobs}")
 
         # Compute the ranks of the actual token.
         token_ranks = (logprobs >= token_logprobs).sum(-1)
-        print(f"[Sampler] token_ranks: {token_ranks}")
 
         if num_logprobs >= 1:
             # Find the topK values.
@@ -363,7 +360,6 @@ class Sampler(nn.Layer):
             sampled_token_ids=next_tokens,
             logprobs_tensors=logprobs_tensors,
         )
-        print(f"[Sampler] sampler_output: {sampler_output}")
 
         return sampler_output
 
@@ -407,11 +403,8 @@ class SpeculativeSampler(nn.Layer):
         share_inputs = sampling_metadata.share_inputs
         last_logits = logits
         real_bsz = share_inputs["seq_lens_this_time"].shape[0]
-        print(f"[SpeculativeSampler][compute] seq_lens_this_time: {share_inputs['seq_lens_this_time']}")
-        print(f"[SpeculativeSampler][compute] seq_lens_encoder: {share_inputs['seq_lens_encoder']}")
 
         batch_token_num = share_inputs["batch_token_num"]
-        print(f"[SpeculativeSampler][compute] batch_token_num: {batch_token_num}")
         temp_scaled_logprobs = sampling_metadata.temp_scaled_logprobs
         top_p_normalized_logprobs = sampling_metadata.top_p_normalized_logprobs
         if temp_scaled_logprobs is not None:
@@ -479,11 +472,8 @@ class SpeculativeSampler(nn.Layer):
         # Get with the logprob of the prompt or sampled token.
         token_logprobs = paddle.take_along_axis(logprobs, token_ids, axis=-1)
-        print(f"[SpeculativeSampler] logprobs: {logprobs}")
-        print(f"[SpeculativeSampler] token_logprobs: {token_logprobs}")
 
         # Compute the ranks of the actual token.
         token_ranks = (logprobs >= token_logprobs).sum(-1)
-        print(f"[SpeculativeSampler] token_ranks: {token_ranks}")
 
         if num_logprobs >= 1:
             # Find the topK values.
@@ -534,9 +524,6 @@ class SpeculativeSampler(nn.Layer):
             max_model_len,
         )
 
-        print(f"[SpeculativeSampler] verify_tokens: {verify_tokens}")
-        print(f"[SpeculativeSampler] actual_candidate_len: {actual_candidate_len}")
-
         speculate_verify(
             share_inputs["accept_tokens"],
             share_inputs["accept_num"],
@@ -562,10 +549,7 @@ class SpeculativeSampler(nn.Layer):
             True,  # enable_topp
             self.speculative_benchmark_mode,
         )
-        print(f"[SpeculativeSampler] accept_num: {share_inputs['accept_num']}")
-        print(f"[SpeculativeSampler] accept_tokens: {share_inputs['accept_tokens']}")
-        print(f"[SpeculativeSampler] logits: {logits}")
 
         num_logprobs = sampling_metadata.max_num_logprobs
         if num_logprobs is not None:
             real_bsz = share_inputs["seq_lens_this_time"].shape[0]
@@ -575,15 +559,13 @@ class SpeculativeSampler(nn.Layer):
                 share_inputs["accept_num"][:real_bsz].unsqueeze(1),
             ).squeeze(1)
             share_inputs["batch_token_num"] = batch_token_num
-            print(f"[SpeculativeSampler] batch_token_num: {share_inputs['batch_token_num']}")
             ori_cu_batch_token_offset = paddle.concat([paddle.to_tensor([0]), paddle.cumsum(batch_token_num)]).astype(
                 "int32"
             )
             cu_batch_token_offset = paddle.concat(
                 [paddle.to_tensor([0]), paddle.cumsum(share_inputs["accept_num"])]
             ).astype("int32")
-            print(f"[SpeculativeSampler] ori_cu_batch_token_offset: {ori_cu_batch_token_offset}")
-            print(f"[SpeculativeSampler] cu_batch_token_offset: {cu_batch_token_offset}")
+            share_inputs["cu_batch_token_offset"] = cu_batch_token_offset
             target_logtis = paddle.empty([share_inputs["accept_num"].sum(), logits.shape[1]], dtype=logits.dtype)
             speculate_get_target_logits(
                 target_logtis,
@@ -594,9 +576,7 @@ class SpeculativeSampler(nn.Layer):
                 share_inputs["seq_lens_encoder"],
                 share_inputs["accept_num"],
             )
-            print(f"[SpeculativeSampler] target_logtis: {target_logtis}")
             raw_logprobs = self.compute_logprobs(target_logtis, sampling_metadata)
-            print(f"[SpeculativeSampler] raw_logprobs: {raw_logprobs}")
 
         sampler_output = None
         if num_logprobs is not None:
@@ -608,7 +588,6 @@ class SpeculativeSampler(nn.Layer):
                     for i in range(share_inputs["accept_num"].shape[0])
                 ]
             )
-            print(f"[SpeculativeSampler] token_ids: {token_ids}")
             logprobs_tensors = self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=token_ids)
 
             sampler_output = SamplerOutput(
@@ -617,8 +596,6 @@ class SpeculativeSampler(nn.Layer):
                 token_num_per_batch=batch_token_num,
             )
 
-            print(f"[SpeculativeSampler] sampler_output: {sampler_output}")
-
         return sampler_output
 
 
@@ -656,7 +633,6 @@ class MTPSampler(nn.Layer):
         share_inputs = sampling_metadata.share_inputs
         real_bsz = share_inputs["seq_lens_this_time"].shape[0]
         last_logits = logits
-        # print(f"[MTPSampler][compute] real_bsz: {real_bsz}")
         temp_scaled_logprobs = sampling_metadata.temp_scaled_logprobs
         top_p_normalized_logprobs = sampling_metadata.top_p_normalized_logprobs
         if temp_scaled_logprobs is not None:
@@ -669,17 +645,12 @@ class MTPSampler(nn.Layer):
                 .astype("bool")
             )
             temperature = temperature.squeeze(1).repeat_interleave(share_inputs["batch_token_num"])
-            # print(f"[MTPSampler][compute] real_bsz_temp_scaled: {real_bsz_temp_scaled}")
-            # print(f"[MTPSampler][compute] temperature: {temperature}")
             temp_temperature = paddle.where(
                 real_bsz_temp_scaled, temperature, paddle.ones_like(temperature)
             ).unsqueeze(1)
-            # print(f"[MTPSampler][compute] temp_temperature: {temp_temperature}")
             last_logits = last_logits / temp_temperature
-            # print(f"[MTPSampler][compute] last_logits: {last_logits}")
 
         last_logprobs = F.log_softmax(last_logits, axis=-1)
-        # print(f"[MTPSampler][compute] last_logits: {last_logits}")
 
         top_p_logprob = None
         top_p_token_mask = None
@@ -690,7 +661,6 @@ class MTPSampler(nn.Layer):
                 .repeat_interleave(share_inputs["batch_token_num"])
                 .unsqueeze(1)
             )
-            # print(f"[MTPSampler][compute] real_token_top_p: {real_token_top_p}")
             top_p_normalized_logprobs = (
                 top_p_normalized_logprobs[:real_bsz]
                 .astype("int32")
@@ -699,17 +669,12 @@ class MTPSampler(nn.Layer):
                 .astype("bool")
                 .unsqueeze(1)
             )
-            # print(f"[MTPSampler][compute] top_p_normalized_logprobs: {top_p_normalized_logprobs}")
             top_p_token_mask = paddle.logical_and(top_p_normalized_logprobs, real_token_top_p != 1.0)
-            # print(f"[MTPSampler][compute] top_p_token_mask: {top_p_token_mask}")
             if top_p_token_mask.any():
                 probs = F.softmax(last_logits, axis=-1)
-                # print(f"[MTPSampler][compute] probs: {probs}")
                 probs = top_p_normalize_probs_paddle(probs, real_token_top_p)
-                # print(f"[MTPSampler][compute] probs: {probs}")
                 top_p_logprob = paddle.log(probs)
-                # print(f"[MTPSampler][compute] top_p_logprob: {top_p_logprob}")
 
         if top_p_logprob is not None:
             last_logprobs = paddle.where(top_p_token_mask, top_p_logprob, last_logprobs)
         return last_logprobs
@@ -767,7 +732,6 @@ class MTPSampler(nn.Layer):
         num_logprobs = sampling_metadata.max_num_logprobs
         if num_logprobs is not None and share_inputs["substep"] == 0:
             raw_logprobs = self.compute_logprobs(share_inputs["draft_logits"], sampling_metadata)
-            print(f"[MTPSampler] raw_logprobs: {raw_logprobs}")
 
         logits = apply_speculative_penalty_multi_scores(
             sampling_metadata.pre_token_ids,
@@ -803,8 +767,6 @@ class MTPSampler(nn.Layer):
                 share_inputs["seq_lens_this_time"],
                 share_inputs["seq_lens_encoder"],
             )
-            print(f"[MTPSampler] token_ids: {token_ids}")
-            print(f"[MTPSampler] total_token_num: {share_inputs['batch_token_num'].sum()}")
             logprobs_tensors = self.gather_logprobs(raw_logprobs, num_logprobs, token_ids=token_ids)
 
             sampler_output = SamplerOutput(
@@ -813,7 +775,5 @@ class MTPSampler(nn.Layer):
                 logprobs_tensors=logprobs_tensors,
                 token_num_per_batch=share_inputs["batch_token_num"],
             )
-            print(f"[MTPSampler] sampler_output: {sampler_output}")
-            print(f"[MTPSampler] next_tokens: {next_tokens}")
 
         return next_tokens, sampler_output
diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py
index a4262a299..e48260fc6 100644
--- a/fastdeploy/output/token_processor.py
+++ b/fastdeploy/output/token_processor.py
@@ -158,7 +158,6 @@ class TokenProcessor:
             get_output_ep,
             get_output_topk,
             speculate_get_output,
-            speculate_get_output_topk,
         )
 
         rank_id = self.cfg.parallel_config.local_data_parallel_id
@@ -166,24 +165,9 @@ class TokenProcessor:
             try:
                 is_blocking = True
                 if self.speculative_decoding:
-                    if self.use_logprobs:
-                        speculate_get_output_topk(
-                            self.output_tokens,
-                            self.output_scores,
-                            self.output_ranks,
-                            K,
-                            rank_id,
-                            is_blocking,
-                        )
-                        print(f"[TokenProcessor] output_tokens: {self.output_tokens}")
-                        print(f"[TokenProcessor] output_scores: {self.output_scores}")
-                        print(f"[TokenProcessor] output_ranks: {self.output_ranks}")
-                        if self.output_tokens[0, 0] == -2:
-                            continue
-                    else:
-                        speculate_get_output(self.output_tokens, rank_id, is_blocking, False)
-                        if self.output_tokens[0] == -2:
-                            continue
+                    speculate_get_output(self.output_tokens, rank_id, is_blocking, False)
+                    if self.output_tokens[0] == -2:
+                        continue
 
                 else:
                     if self.use_logprobs:
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index bf2571b2c..a02fbc05d 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -626,7 +626,6 @@ class MTPProposer(Proposer):
         """
         for substep in range(self.num_model_steps):
             if self.model_inputs["not_need_stop"]:
-                print(f"[MTPProposer] ******************** substep: {substep} ********************")
                 self.model_inputs["substep"] = substep
                 # Remove padding
                 (
@@ -682,19 +681,12 @@ class MTPProposer(Proposer):
                     previous_hidden_states=target_hidden_states,
                     forward_meta=self.forward_meta,
                 )
-                print(f"[MTPProposer] model_output: {model_output}")
 
                 if self.enable_logprob and substep == 0:
                     first_token_hidden_states = paddle.empty(
                         [self.max_num_seqs, self.model_config.hidden_size], dtype=model_output.dtype
                     )
-                    print(f"[MTPProposer] cu_seqlens_q: {self.model_inputs['cu_seqlens_q']}")
-                    print(f"[MTPProposer] seq_lens_this_time: {self.model_inputs['seq_lens_this_time']}")
-                    print(f"[MTPProposer] seq_lens_encoder: {self.model_inputs['seq_lens_encoder']}")
-                    print(f"[MTPProposer] seq_lens_decoder: {self.model_inputs['seq_lens_decoder']}")
-                    print(f"[MTPProposer] output_cum_offsets: {self.model_inputs['output_cum_offsets']}")
-                    print(f"[MTPProposer] output_padding_offset: {self.model_inputs['output_padding_offset']}")
 
                 hidden_states = rebuild_padding(
                     model_output,
                     self.model_inputs["cu_seqlens_q"],
@@ -706,16 +698,11 @@ class MTPProposer(Proposer):
                     first_token_hidden_states if substep == 0 else None,
                     self.enable_logprob if substep == 0 else False,
                 )
-                print(f"[MTPProposer] hidden_states: {hidden_states}")
-                print(f"[MTPProposer] first_token_hidden_states: {first_token_hidden_states}")
 
                 # 4. Compute logits, Sample
                 logits = self.model.compute_logits(hidden_states)
                 if self.enable_logprob and substep == 0:
                     first_token_logits = self.model.compute_logits(first_token_hidden_states)
-                print(f"[MTPProposer] logits: {logits}")
-                print(f"[MTPProposer] first_token_logits: {first_token_logits}")
-                print(f"[MTPProposer] output_padding_offset: {self.model_inputs['output_padding_offset']}")
 
                 draft_logits, batch_token_num, cu_batch_token_offset = speculate_get_logits(
                     logits,
@@ -727,9 +714,6 @@ class MTPProposer(Proposer):
                     self.model_inputs["draft_logits"] = draft_logits
                     self.model_inputs["batch_token_num"] = batch_token_num
                     self.model_inputs["cu_batch_token_offset"] = cu_batch_token_offset
-                    print(f"[MTPProposer] draft_logits: {draft_logits}")
-                    print(f"[MTPProposer] batch_token_num: {batch_token_num}")
-                    print(f"[MTPProposer] cu_batch_token_offset: {cu_batch_token_offset}")
 
                 sampled_token_ids, sampler_output = self.sampler(
                     logits,