[Feature] mm and thinking model support structured output (#2749)

* mm support structured output

* update code

* update code

* update format

* update code

* update code

* add enable_thinking default

* update code

* add structured_outputs test case

* add ci install xgrammar

* add ci timeout time

* update test for structured_outputs

* update code

* add error traceback info

* update error msg

* update structured output code

* update code

* update code

* update config

* update torch version

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
kevin authored on 2025-09-02 16:21:09 +08:00, committed by GitHub
parent 0e4df5a6f4
commit 1908465542
17 changed files with 1168 additions and 83 deletions
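
For context on what the feature enables (not part of the diff below): a multimodal or thinking model served through FastDeploy's OpenAI-compatible API can now combine reasoning with schema-constrained output. A minimal client-side sketch, assuming a local deployment; the port, model name, and the chat_template_kwargs field used to toggle thinking are illustrative assumptions, not values taken from this commit:

from openai import OpenAI

# Assumed local OpenAI-compatible FastDeploy endpoint; adjust base_url/model to your deployment.
client = OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

schema = {
    "type": "object",
    "properties": {"answer": {"type": "string"}},
    "required": ["answer"],
}

resp = client.chat.completions.create(
    model="default",  # assumption: the served model name
    messages=[{"role": "user", "content": "Answer as JSON with a single 'answer' field."}],
    response_format={"type": "json_schema",
                     "json_schema": {"name": "answer", "schema": schema}},
    extra_body={"chat_template_kwargs": {"enable_thinking": True}},  # assumption: how thinking is toggled
)
print(resp.choices[0].message.content)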


@@ -29,9 +29,9 @@ from fastdeploy.model_executor.graph_optimization.utils import (
     profile_run_guard,
     sot_warmup_guard,
 )
-from fastdeploy.model_executor.guided_decoding import get_guided_backend
-from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
-    LogitsProcessorBase,
-)
+from fastdeploy.model_executor.guided_decoding import (
+    LogitsProcessorBase,
+    get_guided_backend,
+)
 from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
@@ -97,10 +97,6 @@ class GPUModelRunner(ModelRunnerBase):
         self.enable_logprob = fd_config.model_config.enable_logprob
         self.enable_early_stop = self.fd_config.early_stop_config.enable_early_stop
-        self.guided_backend = None
-        if self.fd_config.parallel_config.guided_decoding_backend != "off":
-            self.guided_backend = get_guided_backend(fd_config=self.fd_config)
         # VL model config:
         if self.enable_mm:
             if "ernie" in self.fd_config.model_config.model_type:
@@ -129,6 +125,11 @@
         else:
             self.sampler = SpeculativeSampler(fd_config)
+        self.guided_backend = None
+        if self.fd_config.parallel_config.guided_decoding_backend != "off":
+            self.guided_backend = get_guided_backend(fd_config=self.fd_config)
+            self.sampler.set_reasoning_parser(self.guided_backend.get_reasoning_parser())
         # Lazy initialize kv cache after model loading
         # self.kv_caches: list[paddle.Tensor] = []
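
The sampler now receives the backend's reasoning parser, which suggests grammar constraints are only enforced once the model leaves its thinking segment. A rough sketch of that idea, with hypothetical class and method names (this is not the actual FastDeploy or xgrammar interface):

# Illustrative only: a grammar-backed logits processor that defers enforcement
# until the reasoning segment has closed.
class ThinkingAwareProcessor:
    def __init__(self, grammar_matcher, reasoning_parser, enable_thinking=True):
        self.matcher = grammar_matcher          # e.g. a compiled grammar matcher
        self.reasoning_parser = reasoning_parser
        self.in_thinking = enable_thinking      # start unconstrained while "thinking"

    def apply(self, token_ids, logits):
        if self.in_thinking:
            # Leave logits untouched until the parser reports end of reasoning.
            if self.reasoning_parser.is_reasoning_end(token_ids):
                self.in_thinking = False
            return logits
        # Once thinking is over, mask tokens that violate the grammar.
        return self.matcher.mask_invalid(logits)  # hypothetical matcher API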
@@ -206,7 +207,16 @@
         elif request.structural_tag is not None:
             schemata_key = ("structural_tag", request.structural_tag)
-        return self.guided_backend.get_logits_processor(schemata_key=schemata_key), schemata_key
+        enable_thinking = request.get("enable_thinking", True)
+        enable_thinking = enable_thinking if enable_thinking is not None else True
+        return (
+            self.guided_backend.get_logits_processor(
+                schemata_key=schemata_key,
+                enable_thinking=enable_thinking,
+            ),
+            schemata_key,
+        )

     def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = None):
         """
@@ -1336,10 +1346,10 @@
         Returns:
             A list of indices corresponding to the requests that need to be skipped.
         """
-        skip_idx_list = []
-        if not self.cache_config.enable_chunked_prefill or self.guided_backend is None:
-            return skip_idx_list
+        if not self.cache_config.enable_chunked_prefill or self.guided_backend is None or model_forward_batch is None:
+            return []
+        skip_idx_list = []
         for task in model_forward_batch:
             if task.get("prefill_chunk_info", None) is None or task.chunk_idx >= len(task.prefill_chunk_info):
                 continue
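
A condensed sketch of what this helper appears to compute: requests that are still mid chunked-prefill are excluded from guided-decoding post-processing so their grammar state is not advanced on a partial prompt. Field names mirror the diff; the collected index attribute and the function signature are assumptions, and tasks are assumed to be Request-like objects exposing .get() and attribute access:

def guided_decoding_skip_indices(tasks, enable_chunked_prefill, guided_backend):
    # No skipping needed when chunked prefill or guided decoding is off.
    if not enable_chunked_prefill or guided_backend is None or tasks is None:
        return []
    skip = []
    for task in tasks:
        chunks = task.get("prefill_chunk_info", None)
        if chunks is None or task.chunk_idx >= len(chunks):
            continue  # no chunked prefill, or prefill already finished
        skip.append(task.idx)  # assumption: each request exposes a batch slot index
    return skip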
@@ -1505,6 +1515,8 @@
             speculative_decoding=self.speculative_decoding,
             skip_save_output=skip_save_output,
         )
+        if self.guided_backend is not None and sampler_output is not None:
+            self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list)
         # 6. Speculative decode
         if self.speculative_decoding:
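
The new post_process hook runs right after sampling; a typical implementation feeds each sampled token back into the per-request grammar matcher so the next step's allowed-token mask reflects the updated state, skipping the chunked-prefill indices collected earlier. A hypothetical sketch, not the actual sampler API:

def post_process(sampled_token_ids, logits_processors, skip_idx_list):
    # Advance each request's grammar state with the token it just emitted.
    skip = set(skip_idx_list)
    for idx, processor in enumerate(logits_processors):
        if processor is None or idx in skip:
            continue  # request has no grammar, or is mid chunked-prefill
        processor.accept_token(sampled_token_ids[idx])  # hypothetical matcher call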
@@ -1538,7 +1550,7 @@
         """
         Add cache for guided decoding.
         """
-        if self.guided_backend is None:
+        if self.guided_backend is None or model_forward_batch is None:
             return
         for request in model_forward_batch:
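
This guard sits in the hook that caches guided-decoding artifacts per request. As a rough illustration of the caching idea only (names are hypothetical, not the FastDeploy backend API), compiled processors can be memoized by their schemata key so identical schemas are not recompiled:

class GuidedDecodingCache:
    def __init__(self):
        self._cache = {}  # schemata_key -> compiled logits processor

    def add(self, schemata_key, processor):
        # Keep the first compiled processor seen for this schema.
        self._cache.setdefault(schemata_key, processor)

    def get(self, schemata_key):
        return self._cache.get(schemata_key)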