ep support logprob (#4089) (#4151)

This commit is contained in:
chen
2025-09-19 14:07:31 +08:00
committed by GitHub
parent a685e5ad35
commit 66a98b44ed
3 changed files with 11 additions and 17 deletions

View File

@@ -39,9 +39,6 @@ void GetOutputTopK(const paddle::Tensor& x,
                    int k,
                    int64_t rank_id,
                    bool wait_flag) {
-  if (rank_id > 0) {
-    return;
-  }
   static struct msgdata msg_rcv;
   int msg_queue_id = 1;

View File

@@ -401,8 +401,6 @@ class EngineArgs:
         if self.enable_logprob:
             if self.speculative_config is not None:
                 raise NotImplementedError("Logprob does not support speculation_config.")
-            if self.enable_expert_parallel:
-                raise NotImplementedError("Logprob does not support enable_expert_parallel.")
             if not current_platform.is_cuda():
                 raise NotImplementedError("Only CUDA platform supports logprob.")
         if self.speculative_config is not None:

View File

@@ -303,24 +303,23 @@ class TokenProcessor:
                     continue
                 else:
-                    if (
+                    if self.use_logprobs:
+                        get_output_topk(
+                            self.output_tokens,
+                            self.output_scores,
+                            self.output_ranks,
+                            K,
+                            rank_id,
+                            is_blocking,
+                        )
+                    elif (
                         self.cfg.parallel_config.enable_expert_parallel
                         and self.cfg.parallel_config.data_parallel_size > 1
                     ):
                         get_output_ep(self.output_tokens, rank_id, is_blocking)
                     else:
-                        if self.use_logprobs:
-                            get_output_topk(
-                                self.output_tokens,
-                                self.output_scores,
-                                self.output_ranks,
-                                K,
-                                rank_id,
-                                is_blocking,
-                            )
-                        else:
-                            get_output(self.output_tokens, rank_id, is_blocking)
+                        get_output(self.output_tokens, rank_id, is_blocking)
                 if self.output_tokens[0, 0] == -2:
                     continue