mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Speculative Decoding][MTP] Support static CacheKV C8 quantization and optimize memory usage (#5155)
* Support static CacheKV C8 quantization in MTP mode
* Optimize memory allocation
This commit is contained in:
@@ -268,7 +268,9 @@ class SchedulerConfig:
|
||||
Exception: If invalid scheduler type is specified
|
||||
"""
|
||||
self.name = "local" # "local" for LocalScheduler or "global" for GlobalScheduler
|
||||
self.max_num_batched_tokens = 2048
|
||||
self.max_num_batched_tokens = 2048 # base token_num for text inputs
|
||||
self.max_extra_num_batched_tokens = 16384 # extra token_num for multimodal inputs
|
||||
self.max_chunk_len = 18432 # max supported token_num = max_num_batched_tokens + max_extra_num_batched_tokens
|
||||
self.max_num_seqs = 34
|
||||
self.splitwise_role = "mixed"
|
||||
self.config = None
|
||||
|
||||
Reference in New Issue
Block a user