diff --git a/custom_ops/gpu_ops/get_output_msg_with_topk.cc b/custom_ops/gpu_ops/get_output_msg_with_topk.cc
index 5da88dc1d..4d6b5f56b 100644
--- a/custom_ops/gpu_ops/get_output_msg_with_topk.cc
+++ b/custom_ops/gpu_ops/get_output_msg_with_topk.cc
@@ -39,9 +39,6 @@ void GetOutputTopK(const paddle::Tensor& x,
                    int k,
                    int64_t rank_id,
                    bool wait_flag) {
-  if (rank_id > 0) {
-    return;
-  }
   static struct msgdata msg_rcv;
 
   int msg_queue_id = 1;
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 2dcbde647..aa3e86f2d 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -401,8 +401,6 @@ class EngineArgs:
         if self.enable_logprob:
             if self.speculative_config is not None:
                 raise NotImplementedError("Logprob does not support speculation_config.")
-            if self.enable_expert_parallel:
-                raise NotImplementedError("Logprob does not support enable_expert_parallel.")
             if not current_platform.is_cuda():
                 raise NotImplementedError("Only CUDA platform supports logprob.")
         if self.speculative_config is not None:
diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py
index 57375941e..4f8a9c15d 100644
--- a/fastdeploy/output/token_processor.py
+++ b/fastdeploy/output/token_processor.py
@@ -303,24 +303,23 @@ class TokenProcessor:
                         continue
 
                 else:
-                    if (
+                    if self.use_logprobs:
+                        get_output_topk(
+                            self.output_tokens,
+                            self.output_scores,
+                            self.output_ranks,
+                            K,
+                            rank_id,
+                            is_blocking,
+                        )
+                    elif (
                         self.cfg.parallel_config.enable_expert_parallel
                         and self.cfg.parallel_config.data_parallel_size > 1
                     ):
                         get_output_ep(self.output_tokens, rank_id, is_blocking)
 
                     else:
-                        if self.use_logprobs:
-                            get_output_topk(
-                                self.output_tokens,
-                                self.output_scores,
-                                self.output_ranks,
-                                K,
-                                rank_id,
-                                is_blocking,
-                            )
-                        else:
-                            get_output(self.output_tokens, rank_id, is_blocking)
+                        get_output(self.output_tokens, rank_id, is_blocking)
 
                     if self.output_tokens[0, 0] == -2:
                         continue