[Feature] Support return logprob of generated tokens (#2784)
* online chat support logprobs
* check xpu
* check vl_gpu_model_runner
* only cuda support logprob
* get_worker() check platform

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
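For context, a minimal client-side sketch of what "online chat support logprobs" enables. This assumes an OpenAI-compatible chat endpoint with the standard `logprobs` / `top_logprobs` request fields; the host, port, and model name are placeholders, not values confirmed by this commit:

    import requests

    # Hypothetical endpoint and model name; adjust to your deployment.
    resp = requests.post(
        "http://localhost:8188/v1/chat/completions",
        json={
            "model": "ernie-4.5-vl",
            "messages": [{"role": "user", "content": "Hello"}],
            "logprobs": True,   # ask the server to return logprobs
            "top_logprobs": 5,  # top-k alternatives per generated token
        },
    )
    print(resp.json())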
@@ -45,6 +45,7 @@ from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import (
     ScatterOp, VariableResolutionResamplerModel)
 from fastdeploy.platforms import current_platform
 from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.worker.output import SamplerOutput
 from fastdeploy.worker.utils import check_safetensors_model
 from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase
 
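The newly imported `SamplerOutput` is the structure threaded through `generate()` and `post_process()` below. Judging only from the fields this diff accesses, it looks roughly like the following sketch; the field types and shapes are assumptions, not the actual definition in `fastdeploy/worker/output.py`:

    from dataclasses import dataclass
    from typing import Optional

    import paddle

    @dataclass
    class LogprobsTensors:
        logprob_token_ids: paddle.Tensor     # assumed: [num_tokens, 1 + top_k] token ids
        logprobs: paddle.Tensor              # assumed: [num_tokens, 1 + top_k] logprob values
        selected_token_ranks: paddle.Tensor  # assumed: [num_tokens] rank of each sampled token

    @dataclass
    class SamplerOutput:
        sampled_token_ids: paddle.Tensor             # the tokens chosen by the sampler
        logprobs_tensors: Optional[LogprobsTensors]  # None when logprobs are disabled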
@@ -52,7 +53,7 @@ if current_platform.is_cuda() and current_platform.available():
     from fastdeploy.model_executor.layers.utils import (
         remove_padding, speculate_remove_padding)
 
-    from fastdeploy.model_executor.ops.gpu import (save_output,
+    from fastdeploy.model_executor.ops.gpu import (save_output, save_output_topk,
                                                    set_stop_value_multi_ends,
                                                    set_value_by_flags_and_idx,
                                                    update_inputs)
@@ -84,7 +85,7 @@ class GPUVLModelRunner(VLModelRunnerBase):
         self.mp_group = hcg.get_model_parallel_group()
         self.is_safetensors_model = check_safetensors_model(
             args.model_name_or_path)
-
+        self.enable_logprob = args.enable_logprob
         model_path = os.path.dirname(args.model_name_or_path)
         args.llm_model_name_or_path = args.model_name_or_path
         if not self.is_safetensors_model:
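The commit message notes "only cuda support logprob" and "get_worker() check platform". A plausible sketch of such a guard, reusing the `current_platform` helper already imported above; the real check's wording and exact location are not shown in this diff, so the helper name here is hypothetical:

    from fastdeploy.platforms import current_platform

    def check_logprob_support(enable_logprob: bool) -> None:
        # Hypothetical guard: reject logprob requests on non-CUDA platforms,
        # mirroring the "only cuda support logprob" note in the commit message.
        if enable_logprob and not current_platform.is_cuda():
            raise NotImplementedError("Returning logprobs is only supported on CUDA.")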
@@ -825,6 +826,7 @@ class GPUVLModelRunner(VLModelRunnerBase):
             min_dec_lens=self.share_inputs["min_dec_len"],
             bad_words_token_ids=self.share_inputs["bad_tokens"],
             eos_token_ids=self.share_inputs["eos_token_id"],
+            max_num_logprobs=20 if self.enable_logprob else None,
         )
 
     def generate(self) -> None:
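`max_num_logprobs=20` caps how many top alternatives the sampler gathers per step (20 matches the ceiling the OpenAI API uses for `top_logprobs`). A minimal sketch of how a sampler can compute such tensors in Paddle, independent of FastDeploy's actual sampler implementation:

    import paddle
    import paddle.nn.functional as F

    def topk_logprobs(logits: paddle.Tensor, sampled_ids: paddle.Tensor, k: int = 20):
        # logits: [batch, vocab]; sampled_ids: [batch, 1]
        logprobs = F.log_softmax(logits, axis=-1)
        topk_vals, topk_ids = paddle.topk(logprobs, k=k, axis=-1)
        # Rank of each sampled token = count of tokens with strictly higher logprob.
        sampled_lp = paddle.take_along_axis(logprobs, sampled_ids, axis=-1)
        ranks = (logprobs > sampled_lp).astype("int32").sum(axis=-1)
        return topk_vals, topk_ids, ranks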
@@ -846,17 +848,17 @@ class GPUVLModelRunner(VLModelRunnerBase):
             self.share_inputs["stop_flags"],
         )
         # sampler & save_output
-        next_tokens = self.sampler(logits, self.sampling_metadata)
+        sampler_output = self.sampler(logits, self.sampling_metadata)
         if self.fd_config.parallel_config.tensor_parallel_degree > 1:
-            paddle.distributed.broadcast(next_tokens, 0)
-        self.post_process(next_tokens)
+            paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
+        self.post_process(sampler_output)
 
-    def post_process(self, next_tokens: paddle.Tensor) -> None:
+    def post_process(self, sampler_output: SamplerOutput) -> None:
         """
         post_process
         """
         if self.share_inputs["enable_thinking"]:
-            exists_think_end = next_tokens == self.model_cfg.think_end_id
+            exists_think_end = sampler_output.sampled_token_ids == self.model_cfg.think_end_id
             paddle.assign(
                 paddle.where(
                     exists_think_end,
@@ -872,12 +874,12 @@ class GPUVLModelRunner(VLModelRunnerBase):
                 ), self.share_inputs["reasoning_index"])
 
             stop_wo_think = (
-                (next_tokens == self.share_inputs["eos_token_id"]) |
+                (sampler_output.sampled_token_ids == self.share_inputs["eos_token_id"]) |
                 (self.share_inputs["reasoning_index"] == 0)) & (
                     self.share_inputs["need_think_end"] > 0)
-            next_tokens = paddle.where(stop_wo_think,
+            sampler_output.sampled_token_ids = paddle.where(stop_wo_think,
                                        self.model_cfg.think_end_id,
-                                       next_tokens)
+                                       sampler_output.sampled_token_ids)
             paddle.assign(
                 paddle.where(
                     stop_wo_think,
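The `stop_wo_think` branch force-writes `think_end_id` when a sequence hits EOS (or exhausts its reasoning budget) while thinking mode is still active. A tiny standalone example of the `paddle.where` masking pattern used here, with made-up values for illustration:

    import paddle

    next_tokens = paddle.to_tensor([[11], [42]])
    stop_wo_think = paddle.to_tensor([[True], [False]])
    think_end_id = 7

    # Where the mask is True, replace the sampled token with think_end_id.
    forced = paddle.where(stop_wo_think,
                          paddle.full_like(next_tokens, think_end_id),
                          next_tokens)
    print(forced.numpy())  # [[7], [42]]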
@@ -900,7 +902,7 @@ class GPUVLModelRunner(VLModelRunnerBase):
             )
 
         set_stop_value_multi_ends(
-            next_tokens,
+            sampler_output.sampled_token_ids,
             self.share_inputs["stop_flags"],
             self.share_inputs["seq_lens_this_time"],
             self.share_inputs["eos_token_id"],
@@ -917,15 +919,25 @@ class GPUVLModelRunner(VLModelRunnerBase):
             self.share_inputs["seq_lens_decoder"],
             self.share_inputs["input_ids"],
             self.share_inputs["stop_nums"],
-            next_tokens,
+            sampler_output.sampled_token_ids,
             self.share_inputs["is_block_step"],
         )
-        save_output(
-            next_tokens,
-            self.share_inputs["not_need_stop"],
-            self.rank,
-            False,  # use_ep
-        )
+        if sampler_output.logprobs_tensors is None:
+            save_output(
+                sampler_output.sampled_token_ids,
+                self.share_inputs["not_need_stop"],
+                self.rank,
+                False,  # use_ep
+            )
+        else:
+            save_output_topk(
+                sampler_output.sampled_token_ids,
+                sampler_output.logprobs_tensors.logprob_token_ids,
+                sampler_output.logprobs_tensors.logprobs,
+                sampler_output.logprobs_tensors.selected_token_ranks,
+                self.share_inputs["not_need_stop"],
+                self.rank,
+            )
 
     def _cal_theortical_kvcache(self):
         """
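Net effect of the new branch: when no logprobs were requested, the original `save_output` path is untouched; when they were, `save_output_topk` ships the sampled ids together with the top-k ids, their logprobs, and each sampled token's rank. A rough sketch of how a consumer on the other end might assemble a per-token record; the dictionary layout is illustrative, not FastDeploy's actual output schema:

    def build_token_record(token_id, topk_ids, topk_logprobs, rank):
        # Hypothetical assembly of one generated token's logprob info.
        return {
            "token_id": int(token_id),
            "rank": int(rank),  # where the sampled token ranked in the full vocab
            "top_logprobs": [
                {"token_id": int(t), "logprob": float(lp)}
                for t, lp in zip(topk_ids, topk_logprobs)
            ],
        }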