mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-31 11:56:44 +08:00)

commit dade19d7a4
[Feature] support logprobs in chat/completions and completions endpoints

* Temporarily comment out text_offset due to incorrect logic
* Clean up temporary debug prints
* [Feature] support logprobs in offline mode via SamplingParams
* fix: serialize Logprob as dict before zmq send to fix msgpack error
* refactor: remove redundant methods to simplify codebase
* Fix missing fields in CompletionOutput.to_dict affecting msgpack serialization
* refactor: centralize param validation in engine_client to reduce duplication
* revert: rollback changes in offline_demo.py
* [bugfix] fix parameter validation for logprobs

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
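The commit adds logprobs both to the OpenAI-compatible chat/completions and completions endpoints and to the offline path via SamplingParams. Below is a minimal sketch of the offline usage; the import path, the `logprobs` parameter, and the shape of the returned output are assumptions inferred from the commit message rather than a confirmed FastDeploy API.

# Hedged sketch: import path, parameter names, and output fields are assumptions
# based on the commit message above, not a confirmed FastDeploy API.
from fastdeploy import LLM, SamplingParams  # assumed import path

llm = LLM(model="path/to/model")
# Ask for the top-5 logprobs of every generated token (parameter name assumed).
params = SamplingParams(max_tokens=64, logprobs=5)

outputs = llm.generate(["Hello, how are you?"], params)
for output in outputs:
    # Each completion is expected to carry per-token logprobs matching the
    # SampleLogprobs structure defined in the file below: list[dict[int, Logprob]].
    print(output)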
		
			
				
	
	
		
279 lines · 6.1 KiB · Python
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from dataclasses import dataclass
from typing import NamedTuple, Optional

import paddle


class Logprob(NamedTuple):
    """
    A named tuple containing information about a token's log probability.
    """

    logprob: float
    rank: Optional[int] = None
    decoded_token: Optional[str] = None


# [{token_id, logprob}] for tokens sampled from the top-k
SampleLogprobs = list[dict[int, Logprob]]

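# Illustrative example (not part of the source file): one generated token whose
# sampled candidate was token id 101, decoded as "Hello", with log-probability -0.1:
#
#     sample_logprobs: SampleLogprobs = [
#         {101: Logprob(logprob=-0.1, rank=1, decoded_token="Hello")}
#     ]
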
class LogprobsLists(NamedTuple):
    """
    Per-request logprob results stored as plain Python lists.
    """

    # [num_reqs, max_num_logprobs + 1]
    logprob_token_ids: list[list[int]]
    # [num_reqs, max_num_logprobs + 1]
    logprobs: list[list[float]]
    # [num_reqs]
    sampled_token_ranks: list[int]

    def slice(self, start: int, end: int):
        """Slice along the request dimension."""
        return LogprobsLists(
            self.logprob_token_ids[start:end],
            self.logprobs[start:end],
            self.sampled_token_ranks[start:end],
        )

    def slice_columns(self, start: int, end: int):
        """
        Slice columns (per-row top-k logprobs and token IDs).
        Keeps the number of requests unchanged.
        """
        return LogprobsLists(
            [row[start:end] for row in self.logprob_token_ids],
            [row[start:end] for row in self.logprobs],
            self.sampled_token_ranks,  # unchanged
        )

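# Illustrative usage (not part of the source file): two requests, each holding the
# sampled token plus one alternative (max_num_logprobs = 1).
#
#     lists = LogprobsLists(
#         logprob_token_ids=[[11, 7], [42, 3]],
#         logprobs=[[-0.1, -2.3], [-0.5, -1.8]],
#         sampled_token_ranks=[1, 1],
#     )
#     lists.slice(0, 1)          # keeps only the first request
#     lists.slice_columns(0, 1)  # keeps only the first column of every row
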
class LogprobsTensors(NamedTuple):
    """
    Per-request logprob results stored as paddle.Tensor objects.
    """

    # [num_reqs, max_num_logprobs + 1]
    logprob_token_ids: paddle.Tensor
    # [num_reqs, max_num_logprobs + 1]
    logprobs: paddle.Tensor
    # [num_reqs]
    selected_token_ranks: paddle.Tensor

    def tolists(self):
        """Convert to lists."""
        return LogprobsLists(
            self.logprob_token_ids.tolist(),
            self.logprobs.tolist(),
            self.selected_token_ranks.tolist(),
        )

    @staticmethod
    def empty_cpu(num_positions: int, num_tokens_per_position: int) -> "LogprobsTensors":
        """Create empty LogprobsTensors on CPU."""

        logprob_token_ids = paddle.empty([num_positions, num_tokens_per_position], dtype=paddle.int64).cpu()
        logprobs = paddle.empty_like(logprob_token_ids, dtype=paddle.float32)
        selected_token_ranks = paddle.empty([num_positions], dtype=paddle.int64).cpu()
        return LogprobsTensors(
            logprob_token_ids=logprob_token_ids,
            logprobs=logprobs,
            selected_token_ranks=selected_token_ranks,
        )

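# Illustrative usage (not part of the source file): allocate uninitialized buffers
# for 4 requests, each with the sampled token plus top-5 alternatives, then convert
# them to Python lists.
#
#     tensors = LogprobsTensors.empty_cpu(num_positions=4, num_tokens_per_position=6)
#     lists = tensors.tolists()  # LogprobsLists with 4 rows of 6 entries each
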
@dataclass
class SamplerOutput:
    """
    Output of a sampling step: the sampled token ids and, optionally, their logprobs.
    """

    # [num_reqs, max_num_generated_tokens]
    # Different requests can have different numbers of generated tokens.
    # All requests are padded to max_num_generated_tokens.
    # PLACEHOLDER_TOKEN_ID (-1 by default) is used for padding.
    sampled_token_ids: paddle.Tensor
    logprobs_tensors: Optional[LogprobsTensors]

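# Illustrative example (not part of the source file): two requests where the first
# generated 3 tokens and the second only 1; the shorter row is padded with -1
# (PLACEHOLDER_TOKEN_ID) up to max_num_generated_tokens.
#
#     sampled_token_ids = paddle.to_tensor([[11, 7, 9],
#                                           [42, -1, -1]])
#     output = SamplerOutput(sampled_token_ids=sampled_token_ids, logprobs_tensors=None)
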
@dataclass
class ModelOutputData:
    """
    Output data produced by execute_model
    """

    """
        Tokens generated in the previous step
    """
    next_tokens: paddle.Tensor

    """
        Flags indicating whether decoding should stop
    """
    stop_flags: paddle.Tensor

    """
        Index of the current decoding step
    """
    step_idx: int

    """
        Maximum decoding length
    """
    max_dec_len: int

    """
        Previous ids used for decoding
    """
    pre_ids: paddle.Tensor

    """
        Sequence lengths for this step
    """
    seq_lens_this_time: paddle.Tensor

    """
        EOS token ID
    """
    eos_token_id: paddle.Tensor

    """
        Indicates if stopping conditions should be ignored
    """
    not_need_stop: bool

    """
        Sequence lengths of the encoder
    """
    seq_lens_encoder: paddle.Tensor

    """
        Sequence lengths of the decoder
    """
    seq_lens_decoder: paddle.Tensor

    """
        Indicates if this is a blocking step
    """
    is_block_step: bool

    """
        The ID of the message queue.
    """
    msg_queue_id: int

    """
        The model parallel rank
    """
    mp_rank: int

    """
        Whether EP parallelism is used
    """
    use_ep: bool

    """
        Input ids
    """
    input_ids: paddle.Tensor

    """
        Stop nums for every sequence
    """
    stop_nums: paddle.Tensor

    """
        For speculative decoding:
        full hidden states before lm_head
    """
    full_hidden_states: paddle.Tensor

    """
        Draft tokens for every sequence
    """
    draft_tokens: paddle.Tensor

    """
        Number of draft tokens for every sequence
    """
    actual_draft_token_num: paddle.Tensor

    """
        Accepted tokens in the current step
    """
    accept_tokens: paddle.Tensor

    """
        Number of accepted tokens in the current step
    """
    accept_num: paddle.Tensor

    """
        Whether thinking is enabled for the VL model
    """
    enable_thinking: paddle.Tensor = None

    """
        Think-end token ID for the VL model
    """
    think_end_id: int = -1

    """
        Whether the VL model needs to think
    """
    need_think_end: paddle.Tensor = None

    """
        Reasoning index for the VL model
    """
    reasoning_index: paddle.Tensor = None

    """
        Token ids of the stop sequences
    """
    stop_token_ids: paddle.Tensor = None

    """
        Lengths of the stop sequences
    """
    stop_seqs_len: paddle.Tensor = None

@dataclass
class ModelRunnerOutput:
    """
    [WIP] ModelRunnerOutput is serialized and sent to the scheduler process.
    """

    """
        [num_reqs]
    """
    req_ids: list[str]

    """
        req_id -> index
    """
    req_id_to_index: dict[str, int]

    """
        [num_reqs, num_generated_tokens]
    """
    sampled_token_ids: list[list[int]]

    """
        [num_reqs, num_spec_tokens]
    """
    spec_token_ids: Optional[list[list[int]]]