[Feature] mm and thinking model support structured output (#2749)
* mm support structured output
* update code
* update code
* update format
* update code
* update code
* add enable_thinking default
* update code
* add structured_outputs test case
* add ci install xgrammar
* add ci timeout time
* update test for structured_outputs
* update code
* add error traceback info
* update error msg
* update structured output code
* update code
* update code
* update config
* update torch version

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
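The change lets multimodal ("mm") and thinking-style models produce grammar-constrained (structured) output, with thinking enabled by default. A minimal client-side sketch of how such a request might look against an OpenAI-compatible FastDeploy endpoint is shown below; the base URL, model name, and the `guided_json` / `chat_template_kwargs` / `enable_thinking` fields are assumptions for illustration, not the confirmed serving API.

```python
# Hypothetical client-side sketch (field names and endpoint are assumptions).
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}, "confidence": {"type": "number"}},
    "required": ["answer", "confidence"],
}

resp = client.chat.completions.create(
    model="ERNIE-4.5-VL",  # placeholder model name
    messages=[{"role": "user", "content": "Summarize the image in JSON."}],
    extra_body={
        "guided_json": schema,                              # constrain the final answer to this schema
        "chat_template_kwargs": {"enable_thinking": True},  # allow a free-form thinking segment first
    },
)
print(resp.choices[0].message.content)
```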
@@ -29,9 +29,9 @@ from fastdeploy.model_executor.graph_optimization.utils import (
     profile_run_guard,
     sot_warmup_guard,
 )
-from fastdeploy.model_executor.guided_decoding import get_guided_backend
-from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
+from fastdeploy.model_executor.guided_decoding import (
     LogitsProcessorBase,
+    get_guided_backend,
 )
 from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
@@ -97,10 +97,6 @@ class GPUModelRunner(ModelRunnerBase):
         self.enable_logprob = fd_config.model_config.enable_logprob
         self.enable_early_stop = self.fd_config.early_stop_config.enable_early_stop

-        self.guided_backend = None
-        if self.fd_config.parallel_config.guided_decoding_backend != "off":
-            self.guided_backend = get_guided_backend(fd_config=self.fd_config)
-
         # VL model config:
         if self.enable_mm:
             if "ernie" in self.fd_config.model_config.model_type:
@@ -129,6 +125,11 @@ class GPUModelRunner(ModelRunnerBase):
         else:
             self.sampler = SpeculativeSampler(fd_config)

+        self.guided_backend = None
+        if self.fd_config.parallel_config.guided_decoding_backend != "off":
+            self.guided_backend = get_guided_backend(fd_config=self.fd_config)
+            self.sampler.set_reasoning_parser(self.guided_backend.get_reasoning_parser())
+
         # Lazy initialize kv cache after model loading
         # self.kv_caches: list[paddle.Tensor] = []

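This hunk moves guided-backend construction to after the sampler is created so the sampler can be handed the backend's reasoning parser. The point is that grammar constraints should only kick in once the model's free-form thinking segment has ended. A rough standalone sketch of that idea follows; apart from `set_reasoning_parser()` / `get_reasoning_parser()`, all names are invented for illustration and are not the FastDeploy implementation.

```python
# Hypothetical sketch: the sampler holds a reasoning parser and only applies
# the grammar-constrained logits processor after the thinking segment closes.
class ThinkTagParser:
    def is_reasoning_end(self, text: str) -> bool:
        return "</think>" in text  # assume a </think> closing tag


class SamplerSketch:
    def __init__(self):
        self.reasoning_parser = None

    def set_reasoning_parser(self, parser) -> None:
        self.reasoning_parser = parser

    def maybe_apply_mask(self, logits, processor, generated_text: str):
        # While the model is still thinking, leave the logits unconstrained.
        if self.reasoning_parser and not self.reasoning_parser.is_reasoning_end(generated_text):
            return logits
        return processor(logits)  # hypothetical processor call


sampler = SamplerSketch()
sampler.set_reasoning_parser(ThinkTagParser())
print(sampler.maybe_apply_mask([0.1, 0.9], lambda l: l, "<think>still reasoning"))
```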
@@ -206,7 +207,16 @@ class GPUModelRunner(ModelRunnerBase):
         elif request.structural_tag is not None:
             schemata_key = ("structural_tag", request.structural_tag)

-        return self.guided_backend.get_logits_processor(schemata_key=schemata_key), schemata_key
+        enable_thinking = request.get("enable_thinking", True)
+        enable_thinking = enable_thinking if enable_thinking is not None else True
+
+        return (
+            self.guided_backend.get_logits_processor(
+                schemata_key=schemata_key,
+                enable_thinking=enable_thinking,
+            ),
+            schemata_key,
+        )

     def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = None):
         """
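The two added assignments default `enable_thinking` to True both when the key is absent and when it is present but explicitly None. A standalone illustration of the pattern, using a plain dict in place of the `Request` object:

```python
# The second assignment matters because an explicit None would otherwise
# slip past the .get() default.
for payload in ({}, {"enable_thinking": None}, {"enable_thinking": False}):
    enable_thinking = payload.get("enable_thinking", True)
    enable_thinking = enable_thinking if enable_thinking is not None else True
    print(payload, "->", enable_thinking)
# {} -> True, {'enable_thinking': None} -> True, {'enable_thinking': False} -> False
```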
@@ -1336,10 +1346,10 @@ class GPUModelRunner(ModelRunnerBase):
         Returns:
             A list of indices corresponding to the requests that need to be skipped.
         """
-        skip_idx_list = []
-        if not self.cache_config.enable_chunked_prefill or self.guided_backend is None:
-            return skip_idx_list
+        if not self.cache_config.enable_chunked_prefill or self.guided_backend is None or model_forward_batch is None:
+            return []

+        skip_idx_list = []
         for task in model_forward_batch:
             if task.get("prefill_chunk_info", None) is None or task.chunk_idx >= len(task.prefill_chunk_info):
                 continue
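The guard now also bails out when the batch itself is None. Requests that are still in the middle of a chunked prefill have not emitted a kept token this step, so guided decoding should not advance their grammar state; they end up in the skip list. A rough standalone sketch of that filter, with simplified dict-shaped tasks and a hypothetical `idx` field:

```python
# Rough sketch of the skip-index idea; task layout and field names are
# simplified/assumed, not the FastDeploy Request structure.
def get_skip_idx(tasks, enable_chunked_prefill=True, guided_backend=object()):
    if not enable_chunked_prefill or guided_backend is None or tasks is None:
        return []
    skip_idx_list = []
    for i, task in enumerate(tasks):
        chunks = task.get("prefill_chunk_info")
        if chunks is None or task["chunk_idx"] >= len(chunks):
            continue  # not chunked, or prefill already finished: do not skip
        skip_idx_list.append(task.get("idx", i))  # hypothetical index field
    return skip_idx_list


print(get_skip_idx([
    {"prefill_chunk_info": [0, 1], "chunk_idx": 1, "idx": 0},  # mid-prefill -> skipped
    {"prefill_chunk_info": None, "chunk_idx": 0, "idx": 1},    # not chunked -> kept
]))  # -> [0]
```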
@@ -1505,6 +1515,8 @@ class GPUModelRunner(ModelRunnerBase):
             speculative_decoding=self.speculative_decoding,
             skip_save_output=skip_save_output,
         )
+        if self.guided_backend is not None and sampler_output is not None:
+            self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list)

         # 6. Speculative decode
         if self.speculative_decoding:
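After sampling, the sampler is given a chance to feed the newly sampled tokens back into each request's grammar state so the next step's mask is computed from the updated state, while requests in `skip_idx_list` are left untouched. A hypothetical sketch of what such a hook could look like; only the name `post_process` comes from the diff, the matcher API is assumed:

```python
# Hypothetical sketch of a post-sampling hook: advance each request's grammar
# matcher with its sampled token, skipping indices in skip_idx_list.
def post_process(sampled_token_ids, matchers, skip_idx_list):
    skip = set(skip_idx_list)
    for i, token_id in enumerate(sampled_token_ids):
        if i in skip or matchers[i] is None:
            continue
        matchers[i].accept_token(int(token_id))  # assumed matcher API
```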
@@ -1538,7 +1550,7 @@ class GPUModelRunner(ModelRunnerBase):
         """
         Add cache for guided decoding.
         """
-        if self.guided_backend is None:
+        if self.guided_backend is None or model_forward_batch is None:
             return

         for request in model_forward_batch: