diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 5178e2b8c..da4e5d946 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -112,10 +112,12 @@ class GPUModelRunner(ModelRunnerBase):
         self.speculative_method = self.fd_config.speculative_config.method
         self.speculative_decoding = self.speculative_method is not None
         self.enable_logprob = fd_config.model_config.enable_logprob
-        self.max_logprobs = fd_config.model_config.max_logprobs
         self.enable_early_stop = self.fd_config.early_stop_config.enable_early_stop
         self.is_pooling_model = self.fd_config.model_config.runner_type == "pooling"
-        self.vocal_size = self.fd_config.model_config.vocab_size
+        self.ori_vocab_size = self.fd_config.model_config.ori_vocab_size
+        self.max_logprobs = (
+            self.ori_vocab_size if fd_config.model_config.max_logprobs == -1 else fd_config.model_config.max_logprobs
+        )
         self.prompt_logprobs_reqs: dict[str, Request] = {}
         self.in_progress_prompt_logprobs: dict[str, LogprobsTensors] = {}

@@ -2722,7 +2724,7 @@ class GPUModelRunner(ModelRunnerBase):
             if request.prompt_token_ids is None or num_prompt_logprobs is None:
                 continue
             if num_prompt_logprobs == -1:
-                num_prompt_logprobs = self.vocal_size
+                num_prompt_logprobs = self.ori_vocab_size
             num_tokens = request.prefill_end_index - request.prefill_start_index
             num_prompt_tokens = len(request.prompt_token_ids)
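Note: the net effect of this diff is to treat `max_logprobs == -1` (and likewise `num_prompt_logprobs == -1`) as a sentinel meaning "logprobs over the full original vocabulary", resolved once against `ori_vocab_size` (presumably the unpadded tokenizer vocabulary) instead of the misspelled `vocal_size` attribute. A minimal standalone sketch of that sentinel resolution follows; `ORI_VOCAB_SIZE` and `resolve_max_logprobs` are hypothetical stand-ins for `fd_config.model_config.ori_vocab_size` and the inline conditional in `__init__`, not names from FastDeploy:

# Minimal sketch of the "-1 means full vocabulary" sentinel introduced above.
# ORI_VOCAB_SIZE and resolve_max_logprobs are illustrative stand-ins only.

ORI_VOCAB_SIZE = 151_936  # assumed original (pre-padding) vocab size

def resolve_max_logprobs(max_logprobs: int, ori_vocab_size: int = ORI_VOCAB_SIZE) -> int:
    """Expand the -1 sentinel to the full original vocabulary size."""
    return ori_vocab_size if max_logprobs == -1 else max_logprobs

assert resolve_max_logprobs(-1) == ORI_VOCAB_SIZE  # sentinel expands to the whole vocab
assert resolve_max_logprobs(20) == 20              # explicit caps pass through unchanged

Resolving the sentinel once in `__init__` means later call sites only need a plain integer, though as the second hunk shows, the per-request `num_prompt_logprobs` still checks `-1` at use time because it comes from each request's sampling params rather than the static config.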