[Optimization] xgrammar async compile, multi thread, speed up (#4835)

* xgrammar async compile, multi thread, speed up

* fix test_sampler.py & pre-commit errors

* add redis version check and fix request.llm_engine_recv_req_timestamp

* xgrammar prefill & decode & v0

* fix test_gpu_prompt_logprobs.py

* add test_guided_decoding.py

* Update fastdeploy/scheduler/splitwise_scheduler.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update fastdeploy/model_executor/guided_decoding/xgrammar_backend.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update fastdeploy/model_executor/guided_decoding/xgrammar_backend.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix torch xgrammar unittest env

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Daci
2025-11-14 18:05:26 +08:00
committed by GitHub
parent b925533051
commit 5fc12eddfe
11 changed files with 810 additions and 373 deletions

View File

@@ -14,9 +14,10 @@
# limitations under the License.
"""
import multiprocessing
import os
import traceback
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import Future, ThreadPoolExecutor
from fastdeploy.config import ErnieArchitectures, FDConfig
from fastdeploy.engine.request import Request
@@ -135,9 +136,9 @@ class BackendBase:
"""
def __init__(self, fd_config: FDConfig):
self.cache = {}
self.fd_config = fd_config
self.executor = ThreadPoolExecutor()
max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
self.executor = ThreadPoolExecutor(max_workers=max_workers)
self.max_cache_size = 2048
self.reasoning_parser = None
@@ -263,7 +264,7 @@ class BackendBase:
self,
schemata_key: tuple[str, str],
enable_thinking: bool = False,
) -> tuple[LogitsProcessorBase, bool]:
) -> Future[LogitsProcessorBase]:
"""
get logits processor by key from cache or create new one.
@@ -275,13 +276,8 @@ class BackendBase:
- LogitsProcessorBase: The logits processor instance
- bool: True if processor was from cache, False if newly created
"""
value = self.cache.get(schemata_key, None)
if value:
value_copy = value.copy()
value_copy.enable_reasoning = enable_thinking
return value_copy, True
value = self.executor.submit(self._init_logits_processor, schemata_key, enable_thinking)
return value, False
return value
def _get_tokenizer_hf(self):
"""
@@ -303,7 +299,7 @@ class BackendBase:
tokenizer = AutoTokenizer.from_pretrained(
self.fd_config.model_config.model,
use_fast=False,
use_fast=True,
)
if not isinstance(tokenizer, PreTrainedTokenizerFast):
@@ -334,21 +330,6 @@ class BackendBase:
except Exception as e:
raise Exception(f"Fail to initialize hf tokenizer: {e}, {str(traceback.format_exc())}")
def add_cache(self, schemata_key: tuple[str, str], processor: LogitsProcessorBase) -> None:
"""
add logits processor to cache.
Args:
schemata_key (tuple[str, str]): Tuple containing processor type and schema string
processor (LogitsProcessorBase): Logits processor instance to cache
Returns:
None: No return value
"""
if len(self.cache) >= self.max_cache_size:
return
self.cache[schemata_key] = processor.copy()
class BaseChecker:
"""