[Feature] mm and thinking model support structured output (#2749)

* mm support structured output

* update code

* update code

* update format

* update code

* update code

* add enable_thinking default

* update code

* add structured_outputs test case

* add ci install xgrammar

* add ci timeout time

* update test for structured_outputs

* update code

* add error traceback info

* update error msg

* update structured output code

* update code

* update code

* update config

* update torch version

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
kevin authored on 2025-09-02 16:21:09 +08:00, committed by GitHub
parent 0e4df5a6f4
commit 1908465542
17 changed files with 1168 additions and 83 deletions
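
For context on what the feature enables (not part of the diff below): a multimodal or thinking model served through FastDeploy's OpenAI-compatible API can now combine reasoning with schema-constrained output. A minimal client-side sketch, assuming a local deployment; the port, model name, and the chat_template_kwargs field used to toggle thinking are illustrative assumptions, not values taken from this commit:

from openai import OpenAI

# Assumed local OpenAI-compatible FastDeploy endpoint; adjust base_url/model to your deployment.
client = OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
}

resp = client.chat.completions.create(
    model="default",  # assumption: the served model name
    messages=[{"role": "user", "content": "Answer as JSON with a single 'answer' field."}],
    response_format={"type": "json_schema",
                     "json_schema": {"name": "answer", "schema": schema}},
    extra_body={"chat_template_kwargs": {"enable_thinking": True}},  # assumption: how thinking is toggled
)
print(resp.choices[0].message.content)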


@@ -29,9 +29,9 @@ from fastdeploy.model_executor.graph_optimization.utils import (
     profile_run_guard,
     sot_warmup_guard,
 )
-from fastdeploy.model_executor.guided_decoding import get_guided_backend
-from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
-    LogitsProcessorBase,
-)
+from fastdeploy.model_executor.guided_decoding import (
+    LogitsProcessorBase,
+    get_guided_backend,
+)
 from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
@@ -97,10 +97,6 @@ class GPUModelRunner(ModelRunnerBase):
         self.enable_logprob = fd_config.model_config.enable_logprob
         self.enable_early_stop = self.fd_config.early_stop_config.enable_early_stop
-        self.guided_backend = None
-        if self.fd_config.parallel_config.guided_decoding_backend != "off":
-            self.guided_backend = get_guided_backend(fd_config=self.fd_config)
         # VL model config:
         if self.enable_mm:
             if "ernie" in self.fd_config.model_config.model_type:
@@ -129,6 +125,11 @@
         else:
             self.sampler = SpeculativeSampler(fd_config)
+        self.guided_backend = None
+        if self.fd_config.parallel_config.guided_decoding_backend != "off":
+            self.guided_backend = get_guided_backend(fd_config=self.fd_config)
+            self.sampler.set_reasoning_parser(self.guided_backend.get_reasoning_parser())
         # Lazy initialize kv cache after model loading
         # self.kv_caches: list[paddle.Tensor] = []
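
The sampler now receives the backend's reasoning parser, which suggests grammar constraints are only enforced once the model leaves its thinking segment. A rough sketch of that idea, with hypothetical class and method names (this is not the actual FastDeploy or xgrammar interface):

# Illustrative only: a grammar-backed logits processor that defers enforcement
# until the reasoning segment has closed.
class ThinkingAwareProcessor:
    def __init__(self, grammar_matcher, reasoning_parser, enable_thinking=True):
        self.matcher = grammar_matcher          # e.g. a compiled grammar matcher
        self.reasoning_parser = reasoning_parser
        self.in_thinking = enable_thinking      # start unconstrained while "thinking"

    def apply(self, token_ids, logits):
        if self.in_thinking:
            # Leave logits untouched until the parser reports end of reasoning.
            if self.reasoning_parser.is_reasoning_end(token_ids):
                self.in_thinking = False
            return logits
        # Once thinking is over, mask tokens that violate the grammar.
        return self.matcher.mask_invalid(logits)  # hypothetical matcher API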
@@ -206,7 +207,16 @@
         elif request.structural_tag is not None:
             schemata_key = ("structural_tag", request.structural_tag)
-        return self.guided_backend.get_logits_processor(schemata_key=schemata_key), schemata_key
+        enable_thinking = request.get("enable_thinking", True)
+        enable_thinking = enable_thinking if enable_thinking is not None else True
+        return (
+            self.guided_backend.get_logits_processor(
+                schemata_key=schemata_key,
+                enable_thinking=enable_thinking,
+            ),
+            schemata_key,
+        )

     def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = None):
         """
@@ -1336,10 +1346,10 @@
         Returns:
             A list of indices corresponding to the requests that need to be skipped.
         """
-        skip_idx_list = []
-        if not self.cache_config.enable_chunked_prefill or self.guided_backend is None:
-            return skip_idx_list
+        if not self.cache_config.enable_chunked_prefill or self.guided_backend is None or model_forward_batch is None:
+            return []
+        skip_idx_list = []
         for task in model_forward_batch:
             if task.get("prefill_chunk_info", None) is None or task.chunk_idx >= len(task.prefill_chunk_info):
                 continue
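
A condensed sketch of what this helper appears to compute: requests that are still mid chunked-prefill are excluded from guided-decoding post-processing so their grammar state is not advanced on a partial prompt. Field names mirror the diff; the collected index attribute and the function signature are assumptions, and tasks are assumed to be Request-like objects exposing .get() and attribute access:

def guided_decoding_skip_indices(tasks, enable_chunked_prefill, guided_backend):
    # No skipping needed when chunked prefill or guided decoding is off.
    if not enable_chunked_prefill or guided_backend is None or tasks is None:
        return []
    skip = []
    for task in tasks:
        chunks = task.get("prefill_chunk_info", None)
        if chunks is None or task.chunk_idx >= len(chunks):
            continue  # no chunked prefill, or prefill already finished
        skip.append(task.idx)  # assumption: each request exposes a batch slot index
    return skip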
@@ -1505,6 +1515,8 @@
             speculative_decoding=self.speculative_decoding,
             skip_save_output=skip_save_output,
         )
+        if self.guided_backend is not None and sampler_output is not None:
+            self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list)
         # 6. Speculative decode
         if self.speculative_decoding:
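
The new post_process hook runs right after sampling; a typical implementation feeds each sampled token back into the per-request grammar matcher so the next step's allowed-token mask reflects the updated state, skipping the chunked-prefill indices collected earlier. A hypothetical sketch, not the actual sampler API:

def post_process(sampled_token_ids, logits_processors, skip_idx_list):
    # Advance each request's grammar state with the token it just emitted.
    skip = set(skip_idx_list)
    for idx, processor in enumerate(logits_processors):
        if processor is None or idx in skip:
            continue  # request has no grammar, or is mid chunked-prefill
        processor.accept_token(sampled_token_ids[idx])  # hypothetical matcher call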
@@ -1538,7 +1550,7 @@
         """
         Add cache for guided decoding.
         """
-        if self.guided_backend is None:
+        if self.guided_backend is None or model_forward_batch is None:
             return
         for request in model_forward_batch:
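
This guard sits in the hook that caches guided-decoding artifacts per request. As a rough illustration of the caching idea only (names are hypothetical, not the FastDeploy backend API), compiled processors can be memoized by their schemata key so identical schemas are not recompiled:

class GuidedDecodingCache:
    def __init__(self):
        self._cache = {}  # schemata_key -> compiled logits processor

    def add(self, schemata_key, processor):
        # Keep the first compiled processor seen for this schema.
        self._cache.setdefault(schemata_key, processor)

    def get(self, schemata_key):
        return self._cache.get(schemata_key)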