Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-04 08:16:42 +08:00
[Feature] Online Chat API Support Return logprobs (#2777)
* online chat support logprobs
* check xpu
* check vl_gpu_model_runner and xpu_model_runner
* get_worker() check platform
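From the client side, this feature surfaces through the logprobs fields of the chat completion request and response. The snippet below is a minimal sketch of how a caller might ask for per-token log probabilities, assuming FastDeploy exposes an OpenAI-compatible /v1/chat/completions endpoint; the host, port, and model name are placeholders, and the response layout shown is the standard OpenAI schema rather than anything confirmed by this diff.

import openai

# Placeholder endpoint and credentials; adjust to the actual deployment.
client = openai.OpenAI(base_url="http://127.0.0.1:8180/v1", api_key="none")

resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hello!"}],
    logprobs=True,    # ask the server to return per-token log probabilities
    top_logprobs=5,   # alternatives per position (the runner caps requests at 20)
)

# Each returned token carries its own logprob plus the top alternatives.
for tok in resp.choices[0].logprobs.content:
    print(tok.token, tok.logprob, [(t.token, t.logprob) for t in tok.top_logprobs])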
@@ -42,8 +42,10 @@ from fastdeploy.model_executor.pre_and_post_process import (post_process,
                                                              rebuild_padding,
                                                              step_cuda)
 from fastdeploy.platforms import current_platform

 if not current_platform.is_dcu():
     from fastdeploy.spec_decode import MTPProposer, NgramProposer

 from fastdeploy.worker.forward_meta import ForwardMeta
 from fastdeploy.worker.model_runner_base import ModelRunnerBase
 from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
@@ -64,6 +66,7 @@ class GPUModelRunner(ModelRunnerBase):
         self.device_id = device_id
         self.speculative_method = self.fd_config.speculative_config.method
         self.speculative_decoding = self.speculative_method is not None
+        self.enable_logprob = fd_config.model_config.enable_logprob

         self.guided_backend = None
         if self.fd_config.parallel_config.guided_decoding_backend != "off":
@@ -612,6 +615,7 @@ class GPUModelRunner(ModelRunnerBase):
             min_dec_lens=self.share_inputs["min_dec_len"],
             bad_words_token_ids=self.share_inputs["bad_tokens"],
             eos_token_ids=self.share_inputs["eos_token_id"],
+            max_num_logprobs=20 if self.enable_logprob else None,
         )

     def load_model(self) -> None:
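The new max_num_logprobs field caps how many candidate tokens per position the sampler reports, defaulting to 20 when logprobs are enabled. The standard way to produce such a report from raw logits is a log-softmax followed by a top-k; the snippet below is a generic Paddle illustration of that computation, not the FastDeploy sampler itself.

import paddle
import paddle.nn.functional as F

def topk_logprobs(logits, max_num_logprobs):
    # logits: [batch, vocab_size] raw next-token scores.
    # Convert to log probabilities, then keep only the k most likely entries.
    logprobs = F.log_softmax(logits, axis=-1)
    top_values, top_ids = paddle.topk(logprobs, k=max_num_logprobs, axis=-1)
    return top_values, top_ids

# Example: report up to 20 alternatives per position, mirroring the
# "20 if self.enable_logprob else None" default in the hunk above.
logits = paddle.randn([2, 32000])
values, ids = topk_logprobs(logits, 20)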
@@ -807,15 +811,15 @@ class GPUModelRunner(ModelRunnerBase):
                 self.share_inputs["step_idx"],
                 self.share_inputs["stop_flags"],
             )
-            sampled_token_ids = self.sampler(logits,
+            sampler_output = self.sampler(logits,
                                           self.sampling_metadata)
             if self.parallel_config.tensor_parallel_degree > 1:
-                paddle.distributed.broadcast(sampled_token_ids, 0)
+                paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
         else:
             self.sampler(logits, self.sampling_metadata,
                          self.parallel_config.max_model_len,
                          self.share_inputs)
-            sampled_token_ids = None
+            sampler_output = None
             if self.parallel_config.tensor_parallel_degree > 1:
                 paddle.distributed.broadcast(
                     self.share_inputs["accept_tokens"], 0)
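The rename from sampled_token_ids to sampler_output reflects that the sampler now returns a structured result rather than a bare tensor, which is why the tensor-parallel broadcast reads sampler_output.sampled_token_ids. The container below is a hypothetical sketch of such an object for illustration only; apart from the sampled_token_ids attribute visible in the diff, the field names are placeholders, not FastDeploy's actual class definition.

from dataclasses import dataclass
from typing import Optional
import paddle

@dataclass
class SamplerOutputSketch:
    # Token ids chosen by the sampler, one per running request.
    sampled_token_ids: paddle.Tensor
    # Optional logprob payload (e.g. top-k values and candidate ids);
    # None when the request did not ask for logprobs.
    logprobs: Optional[tuple] = None

# Only the token-id tensor needs to cross tensor-parallel ranks, so the
# broadcast targets sampler_output.sampled_token_ids rather than the whole object.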
@@ -855,7 +859,7 @@ class GPUModelRunner(ModelRunnerBase):
             accept_num=self.share_inputs["accept_num"]
             if self.speculative_decoding else None)

-        post_process(sampled_token_ids=sampled_token_ids,
+        post_process(sampler_output=sampler_output,
                      model_output=model_output_data,
                      speculative_decoding=self.speculative_decoding,
                      skip_save_output=True)
@@ -1038,18 +1042,18 @@ class GPUModelRunner(ModelRunnerBase):
                 self.share_inputs["step_idx"],
                 self.share_inputs["stop_flags"],
             )
-            sampled_token_ids = self.sampler(
+            sampler_output = self.sampler(
                 logits,
                 self.sampling_metadata,
                 skip_idx_list,
             )
             if self.parallel_config.tensor_parallel_degree > 1:
-                paddle.distributed.broadcast(sampled_token_ids, 0)
+                paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)

         else:
             self.sampler(logits, self.sampling_metadata,
                          self.parallel_config.max_model_len, self.share_inputs)
-            sampled_token_ids = None
+            sampler_output = None
             if self.parallel_config.tensor_parallel_degree > 1:
                 paddle.distributed.broadcast(
                     self.share_inputs["accept_tokens"], 0)
@@ -1092,7 +1096,7 @@ class GPUModelRunner(ModelRunnerBase):
             skip_save_output = True
         else:
             skip_save_output = False
-        post_process(sampled_token_ids=sampled_token_ids,
+        post_process(sampler_output=sampler_output,
                      model_output=model_output_data,
                      save_each_rank=self.parallel_config.use_ep,
                      speculative_decoding=self.speculative_decoding,