mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-09-27 04:46:16 +08:00
[Bug fix] Fix bug for supporting max think len (#4267)
* fix bug for supporting max thinking lens
* fix max_think_lens
This commit is contained in:
@@ -72,6 +72,7 @@ class Request:
 structural_tag: Optional[Any] = None,
 guided_json_object: Optional[bool] = None,
 enable_thinking: Optional[bool] = True,
+reasoning_max_tokens: Optional[int] = None,
 trace_carrier: dict = dict(),
 dp_rank: Optional[int] = None,
 chat_template: Optional[str] = None,
@@ -121,6 +122,7 @@ class Request:
 self.multimodal_img_boundaries = None
 
 self.enable_thinking = enable_thinking
+self.reasoning_max_tokens = reasoning_max_tokens
 self.trace_carrier = trace_carrier
 
 self.chat_template = chat_template
@@ -178,7 +180,8 @@ class Request:
 guided_grammar=d.get("guided_grammar", None),
 structural_tag=d.get("structural_tag", None),
 guided_json_object=d.get("guided_json_object", None),
-enable_thinking=d.get("enable_thinking", True),
+enable_thinking=d.get("enable_thinking", False),
+reasoning_max_tokens=d.get("reasoning_max_tokens", None),
 trace_carrier=d.get("trace_carrier", {}),
 chat_template=d.get("chat_template", None),
 num_computed_tokens=d.get("num_computed_tokens", 0),
@@ -229,6 +232,7 @@ class Request:
 "disaggregate_info": self.disaggregate_info,
 "draft_token_ids": self.draft_token_ids,
 "enable_thinking": self.enable_thinking,
+"reasoning_max_tokens": self.reasoning_max_tokens,
 "trace_carrier": self.trace_carrier,
 "chat_template": self.chat_template,
 "num_computed_tokens": self.num_computed_tokens,
|
@@ -672,6 +672,8 @@ class ResourceManagerV1(ResourceManager):
 return False
 if self.available_batch() == 0:
     return False
+if request.reasoning_max_tokens is not None:
+    request.reasoning_max_tokens -= 1
 request.need_prefill_tokens = len(request.prompt_token_ids)
 need_prealloc_prefill_blocks = (
     request.need_prefill_tokens + self.config.cache_config.block_size - 1
|
Reference in New Issue
Block a user