diff --git a/docs/online_serving/README.md b/docs/online_serving/README.md
index 0858d6cb5..437f64655 100644
--- a/docs/online_serving/README.md
+++ b/docs/online_serving/README.md
@@ -223,6 +223,9 @@ include_draft_logprobs: Optional[bool] = False
 # Whether to return log probabilities during draft stages (e.g., pre-generation or intermediate steps)
 # for debugging or analysis of the generation process (default False means not returned).
 
+include_logprobs_decode_token: Optional[bool] = True
+# Whether to include the decoded token in the logprobs/prompt_logprobs results (default True means the decoded token is always included in the results).
+
 logits_processors_args: Optional[Dict] = None
 # Additional arguments for logits processors, enabling customization of generation logic
 # (e.g., dynamically adjusting probability distributions).
@@ -481,6 +484,9 @@ include_draft_logprobs: Optional[bool] = False
 # Whether to return log probabilities during draft stages (e.g., pre-generation or intermediate steps)
 # for debugging or analysis of the generation process (default False means not returned).
 
+include_logprobs_decode_token: Optional[bool] = True
+# Whether to include the decoded token in the prompt_logprobs results (default True means the decoded token is always included in the results).
+
 logits_processors_args: Optional[Dict] = None
 # Additional arguments for logits processors, enabling customization of generation logic
 # (e.g., dynamically adjusting probability distributions).
diff --git a/docs/zh/online_serving/README.md b/docs/zh/online_serving/README.md
index 2a794b5ec..1747ef639 100644
--- a/docs/zh/online_serving/README.md
+++ b/docs/zh/online_serving/README.md
@@ -218,6 +218,9 @@ top_p_normalized_logprobs: Optional[bool] = False
 include_draft_logprobs: Optional[bool] = False
 # 是否在预生成或中间步骤返回对数概率(log probabilities),用于调试或分析生成过程(默认 False 表示不返回)。
 
+include_logprobs_decode_token: Optional[bool] = True
+# 是否在 logprobs/prompt_logprobs 结果中返回解码后的 token(默认 True 表示总是在结果中返回)。
+
 logits_processors_args: Optional[Dict] = None
 # 传递给 logits 处理器(logits processors)的额外参数,用于自定义生成过程中的逻辑(如动态调整概率分布)。
 
@@ -469,6 +472,9 @@ top_p_normalized_logprobs: Optional[bool] = False
 include_draft_logprobs: Optional[bool] = False
 # 是否在预生成或中间步骤返回对数概率(log probabilities),用于调试或分析生成过程(默认 False 表示不返回)。
 
+include_logprobs_decode_token: Optional[bool] = True
+# 是否在 prompt_logprobs 结果中返回解码后的 token(默认 True 表示总是在结果中返回)。
+
 logits_processors_args: Optional[Dict] = None
 # 传递给 logits 处理器(logits processors)的额外参数,用于自定义生成过程中的逻辑(如动态调整概率分布)。
 
diff --git a/fastdeploy/entrypoints/openai/protocol.py b/fastdeploy/entrypoints/openai/protocol.py
index a401e4db8..199dc4487 100644
--- a/fastdeploy/entrypoints/openai/protocol.py
+++ b/fastdeploy/entrypoints/openai/protocol.py
@@ -457,6 +457,7 @@ class CompletionRequest(BaseModel):
     frequency_penalty: Optional[float] = Field(default=None, ge=-2, le=2)
     logprobs: Optional[int] = None
     include_draft_logprobs: Optional[bool] = False
+    include_logprobs_decode_token: Optional[bool] = True
     prompt_logprobs: Optional[int] = None
     # For logits and logprobs post processing
     temp_scaled_logprobs: bool = False
@@ -620,6 +621,7 @@ class ChatCompletionRequest(BaseModel):
     top_logprobs: Optional[int] = None
     prompt_logprobs: Optional[int] = None
     include_draft_logprobs: Optional[bool] = False
+    include_logprobs_decode_token: Optional[bool] = True
 
     # For logits and logprobs post processing
     temp_scaled_logprobs: bool = False
diff --git a/fastdeploy/entrypoints/openai/serving_chat.py b/fastdeploy/entrypoints/openai/serving_chat.py
index 01dd47ca5..50876b2cf 100644
--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -303,7 +303,7 @@ class OpenAIServingChat:
                         else self.engine_client.ori_vocab_size
                     )
                     prompt_logprobs_res = self._build_prompt_logprobs(
-                        prompt_logprobs_tensors, num_prompt_logprobs
+                        prompt_logprobs_tensors, num_prompt_logprobs, request.include_logprobs_decode_token
                    )
                     choice = ChatCompletionResponseStreamChoice(
                         index=i,
@@ -370,12 +370,18 @@ class OpenAIServingChat:
                         request.top_logprobs if request.top_logprobs != -1 else self.engine_client.ori_vocab_size
                     )
                     logprobs_res = self._create_chat_logprobs(
-                        output_top_logprobs, request.logprobs, num_top_logprobs
+                        output_top_logprobs,
+                        request.logprobs,
+                        num_top_logprobs,
+                        request.include_logprobs_decode_token,
                     )
 
                     if request.include_draft_logprobs and output_draft_top_logprobs is not None:
                         draft_logprobs_res = self._create_chat_logprobs(
-                            output_draft_top_logprobs, request.logprobs, num_top_logprobs
+                            output_draft_top_logprobs,
+                            request.logprobs,
+                            num_top_logprobs,
+                            request.include_logprobs_decode_token,
                         )
 
                 delta_message = DeltaMessage(
@@ -577,7 +583,10 @@ class OpenAIServingChat:
                 )
                 # logprobs
                 logprobs_res = self._create_chat_logprobs(
-                    output_top_logprobs, request.logprobs, num_top_logprobs
+                    output_top_logprobs,
+                    request.logprobs,
+                    num_top_logprobs,
+                    request.include_logprobs_decode_token,
                 )
                 if logprobs_res and logprobs_res.content is not None:
                     logprob_contents[idx].extend(logprobs_res.content)
@@ -585,7 +594,10 @@ class OpenAIServingChat:
                 # draft_logprobs
                 if request.include_draft_logprobs and output_draft_top_logprobs is not None:
                     draft_logprobs_res = self._create_chat_logprobs(
-                        output_draft_top_logprobs, request.logprobs, num_top_logprobs
+                        output_draft_top_logprobs,
+                        request.logprobs,
+                        num_top_logprobs,
+                        request.include_logprobs_decode_token,
                     )
                     if draft_logprobs_res and draft_logprobs_res.content is not None:
                         draft_logprob_contents[idx].extend(draft_logprobs_res.content)
@@ -596,7 +608,9 @@ class OpenAIServingChat:
                     if request.prompt_logprobs != -1
                     else self.engine_client.ori_vocab_size
                 )
-                prompt_logprobs_res = self._build_prompt_logprobs(prompt_logprobs_tensors, num_prompt_logprobs)
+                prompt_logprobs_res = self._build_prompt_logprobs(
+                    prompt_logprobs_tensors, num_prompt_logprobs, request.include_logprobs_decode_token
+                )
                 if prompt_logprobs_res:
                     prompt_logprobs_res_list[idx].extend(clamp_prompt_logprobs(prompt_logprobs_res))
             if data["finished"]:
@@ -738,6 +752,7 @@ class OpenAIServingChat:
         output_top_logprobs,
         request_logprobs: Optional[bool] = None,
         request_top_logprobs: Optional[int] = None,
+        request_decode_flag: Optional[bool] = True,
     ) -> Optional[LogProbs]:
         """Create OpenAI-style logprobs for chat completions."""
         if output_top_logprobs is None or len(output_top_logprobs) < 3 or any(not lst for lst in output_top_logprobs):
@@ -755,6 +770,7 @@ class OpenAIServingChat:
                 request_logprobs=request_logprobs,
                 response_logprobs=top_logprobs,
                 request_top_logprobs=request_top_logprobs,
+                request_decode_flag=request_decode_flag,
             )
             if logprobs_res is None:
                 logprobs_res = step_logprobs_res
@@ -767,6 +783,7 @@ class OpenAIServingChat:
         request_logprobs: bool,
         response_logprobs: Optional[LogprobsLists],
         request_top_logprobs: int,
+        request_decode_flag: bool,
     ) -> Optional[LogProbs]:
         """
         Construct a logprobs response object in line with the OpenAI style.
@@ -796,12 +813,16 @@ class OpenAIServingChat:
         # Construct the candidate token structure (LogProbEntry) of topk
         top_logprob_entries: List[LogProbEntry] = []
         for tid, lp in zip(topk_token_ids, topk_logprobs):
-            token_str = self.engine_client.data_processor.process_logprob_response(
-                [tid], clean_up_tokenization_spaces=False
-            )
-            token_bytes = token_str.encode("utf-8", errors="replace")
-            if "\ufffd" in token_str:
-                token_str = "bytes:" + "".join(f"\\x{byte:02x}" for byte in token_bytes)
+            if request_decode_flag:
+                token_str = self.engine_client.data_processor.process_logprob_response(
+                    [tid], clean_up_tokenization_spaces=False
+                )
+                token_bytes = token_str.encode("utf-8", errors="replace")
+                if "\ufffd" in token_str:
+                    token_str = "bytes:" + "".join(f"\\x{byte:02x}" for byte in token_bytes)
+            else:
+                token_str = ""
+                token_bytes = []
             entry = LogProbEntry(token=token_str, logprob=lp, bytes=list(token_bytes))
             top_logprob_entries.append(entry)
         # Construct the sampled token object (avoid sharing references with top_logprob_entries)
@@ -823,6 +844,7 @@ class OpenAIServingChat:
         self,
         prompt_logprobs_tensors: LogprobsTensors,
         num_prompt_logprobs: int,
+        include_logprobs_decode_token: bool,
     ):
         """Update with prompt logprobs from worker.
         Args:
@@ -834,10 +856,13 @@ class OpenAIServingChat:
 
         # Detokenize non-incrementally.
         # Output is flat: [num_tok, num_lps] -> [num_tok * num_lps]
-        decoded_tokens = [
-            self.engine_client.data_processor.process_logprob_response(token_id)
-            for token_id in token_ids.flatten().tolist()
-        ]
+        if include_logprobs_decode_token:
+            decoded_tokens = [
+                self.engine_client.data_processor.process_logprob_response(token_id)
+                for token_id in token_ids.flatten().tolist()
+            ]
+        else:
+            decoded_tokens = None
 
         # Recover shapes.
         num_prompt_tokens, num_logprobs = logprobs.shape
diff --git a/fastdeploy/entrypoints/openai/serving_completion.py b/fastdeploy/entrypoints/openai/serving_completion.py
index e02fa22be..c8a9f1efe 100644
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -452,7 +452,7 @@ class OpenAIServingCompletion:
                         else self.engine_client.ori_vocab_size
                     )
                     prompt_logprobs_res = self._build_prompt_logprobs(
-                        prompt_logprobs_tensors, num_prompt_logprobs
+                        prompt_logprobs_tensors, num_prompt_logprobs, request.include_logprobs_decode_token
                     )
                     if request.return_token_ids:
                         chunk = CompletionStreamResponse(
@@ -651,7 +651,9 @@ class OpenAIServingCompletion:
             num_prompt_logprobs = (
                 request.prompt_logprobs if request.prompt_logprobs != -1 else self.engine_client.ori_vocab_size
             )
-            prompt_logprobs_res = self._build_prompt_logprobs(prompt_logprobs_tensors, num_prompt_logprobs)
+            prompt_logprobs_res = self._build_prompt_logprobs(
+                prompt_logprobs_tensors, num_prompt_logprobs, request.include_logprobs_decode_token
+            )
             if request.echo:
                 prompt_text = self._echo_back_prompt(request, idx // (1 if request.n is None else request.n))
                 token_ids = [*prompt_token_ids, *output["token_ids"]]
@@ -817,6 +819,7 @@ class OpenAIServingCompletion:
         self,
         prompt_logprobs_tensors: LogprobsTensors,
         num_prompt_logprobs: int,
+        include_logprobs_decode_token: bool,
     ):
         """Update with prompt logprobs from worker.
         Args:
@@ -828,10 +831,13 @@ class OpenAIServingCompletion:
 
         # Detokenize non-incrementally.
         # Output is flat: [num_tok, num_lps] -> [num_tok * num_lps]
-        decoded_tokens = [
-            self.engine_client.data_processor.process_logprob_response(token_id)
-            for token_id in token_ids.flatten().tolist()
-        ]
+        if include_logprobs_decode_token:
+            decoded_tokens = [
+                self.engine_client.data_processor.process_logprob_response(token_id)
+                for token_id in token_ids.flatten().tolist()
+            ]
+        else:
+            decoded_tokens = None
 
         # Recover shapes.
         num_prompt_tokens, num_logprobs = logprobs.shape
diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py
index ccb21c436..40b4a3f4f 100644
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
@@ -60,7 +60,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
         ) as mock_decode:
             mock_decode.side_effect = ["token1", "token2", "token3", "token4", "token5", "token6"]
 
-            result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs)
+            result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True)
 
             # Verify result structure (first element is None, then actual results)
             self.assertEqual(len(result), num_prompt_tokens + 1)
@@ -98,7 +98,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
         ) as mock_decode:
             mock_decode.side_effect = ["hello", "world"]
 
-            result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, -1)
+            result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, -1, True)
 
             self.assertEqual(len(result), num_prompt_tokens + 1)
             self.assertIsNone(result[0])
@@ -125,7 +125,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
         ) as mock_decode:
             mock_decode.return_value = "single_token"
 
-            result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs)
+            result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True)
 
             self.assertEqual(len(result), num_prompt_tokens + 1)
             self.assertIsNone(result[0])
@@ -154,7 +154,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
         ) as mock_decode:
             mock_decode.side_effect = ["t1", "t2", "t3", "t4", "t5", "t6"]
 
-            result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs)
+            result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True)
 
             self.assertEqual(len(result), num_prompt_tokens + 1)
             self.assertIsNone(result[0])
@@ -188,7 +188,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
 
         prompt_logprobs_tensors = LogprobsTensors(token_ids, logprobs, ranks)
 
-        result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs)
+        result = self.chat_completion_handler._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True)
 
         self.assertEqual(len(result), num_prompt_tokens + 1)
         self.assertIsNone(result[0])
diff --git a/tests/entrypoints/openai/test_serving_completion.py b/tests/entrypoints/openai/test_serving_completion.py
index fdefd1cc3..680d775bd 100644
--- a/tests/entrypoints/openai/test_serving_completion.py
+++ b/tests/entrypoints/openai/test_serving_completion.py
@@ -208,7 +208,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
         ) as mock_decode:
             mock_decode.side_effect = ["token1", "token2", "token3", "token4", "token5", "token6"]
 
-            result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs)
+            result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True)
 
             # Verify result structure (first element is None, then actual results)
             self.assertEqual(len(result), num_prompt_tokens + 1)
@@ -246,7 +246,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
         ) as mock_decode:
             mock_decode.side_effect = ["hello", "world"]
 
-            result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, -1)
+            result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, -1, True)
 
             self.assertEqual(len(result), num_prompt_tokens + 1)
             self.assertIsNone(result[0])
@@ -273,7 +273,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
         ) as mock_decode:
             mock_decode.return_value = "single_token"
 
-            result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs)
+            result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True)
 
             self.assertEqual(len(result), num_prompt_tokens + 1)
             self.assertIsNone(result[0])
@@ -302,7 +302,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
         ) as mock_decode:
             mock_decode.side_effect = ["t1", "t2", "t3", "t4", "t5", "t6"]
 
-            result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs)
+            result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True)
 
             self.assertEqual(len(result), num_prompt_tokens + 1)
             self.assertIsNone(result[0])
@@ -336,7 +336,7 @@ class TestOpenAIServingCompletion(unittest.IsolatedAsyncioTestCase):
 
         prompt_logprobs_tensors = LogprobsTensors(token_ids, logprobs, ranks)
 
-        result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs)
+        result = self.serving_completion._build_prompt_logprobs(prompt_logprobs_tensors, num_logprobs, True)
 
         self.assertEqual(len(result), num_prompt_tokens + 1)
         self.assertIsNone(result[0])
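
Usage sketch (not part of the patch): the snippet below shows how a client could exercise the new `include_logprobs_decode_token` request field against FastDeploy's OpenAI-compatible endpoint. The field name comes from the protocol change above; the base URL, API key, model name, and prompt are placeholder assumptions, and `extra_body` is simply the standard openai-python mechanism for passing fields that are not part of the upstream OpenAI schema.

```python
# Hypothetical client-side example: disable token decoding in logprob results.
# Assumes a FastDeploy OpenAI-compatible server at http://localhost:8000/v1;
# base_url, api_key, and model are placeholders, not defined by this patch.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Hello"}],
    logprobs=True,
    top_logprobs=5,
    # Non-OpenAI field added by this patch; when False, logprob entries keep
    # token ids and logprob values but skip detokenized token strings.
    extra_body={"include_logprobs_decode_token": False},
)

for entry in resp.choices[0].logprobs.content:
    # With decoding disabled, entry.token is expected to be empty.
    print(entry.logprob, entry.token)
```

Skipping detokenization is mainly a throughput/debugging knob: the logprob values and token ids are unchanged, only the per-candidate decode step in `_create_chat_logprobs` / `_build_prompt_logprobs` is bypassed.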