[Speculative Decoding][MTP] Support static CacheKV C8 quantization and optimize memory usage (#5155)

* Support static CacheKV C8 quantization in MTP mode
* Optimize memory allocation
2025-12-24 13:28:13 +08:00 · 2025-11-21 15:10:13 +08:00
parent 3c36283d7d
commit 2d1dade5e2
6 changed files with 350 additions and 295 deletions
--- a/fastdeploy/scheduler/config.py
+++ b/fastdeploy/scheduler/config.py
@@ -268,7 +268,9 @@ class SchedulerConfig:
            Exception: If invalid scheduler type is specified
        """
        self.name = "local"  # "local" for LocalScheduler or "global" for GlobalScheduler
-        self.max_num_batched_tokens = 2048
+        self.max_num_batched_tokens = 2048  # base token_num for text inputs
+        self.max_extra_num_batched_tokens = 16384  # extra token_num for multimodal inputs
+        self.max_chunk_len = 18432  # max supported token_num = max_num_batched_tokens + max_extra_num_batched_tokens
        self.max_num_seqs = 34
        self.splitwise_role = "mixed"
        self.config = None