[Feature] Support block scheduler v1 for FD (#2928)

* Support FD block scheduler v1

* Support FD block scheduler v1

* Support FD block scheduler v1

* Fix according to copilot review

* Fix according to review

* Remove is_dummy

* Fix bug when real_bsz=1

* Fix infer first token cost time

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
This commit is contained in:
chenjian
2025-07-23 20:31:31 +08:00
committed by GitHub
parent ca0f71bd39
commit 85a78d695d
16 changed files with 898 additions and 40 deletions

View File

@@ -130,6 +130,11 @@ class EngineArgs:
Ratio of tokens to process in a block.
"""
prealloc_dec_block_slot_num_threshold: int = 5
"""
Token slot threshold for preallocating decoder blocks.
"""
dist_init_ip: Optional[str] = None
"""
The master node IP for multi-node deployment
@@ -525,10 +530,14 @@ class EngineArgs:
)
cache_group.add_argument(
"--swap-space",
type=float,
default=EngineArgs.swap_space,
help="The amount of CPU memory to offload to.",
"--swap-space", type=float, default=EngineArgs.swap_space, help="The amount of CPU memory to offload to."
)
cache_group.add_argument(
"--prealloc-dec-block-slot-num-threshold",
type=int,
default=5,
help="Token slot threshold below which the next blocks are preallocated for decoding.",
)
cache_group.add_argument(
@@ -784,6 +793,7 @@ class EngineArgs:
gpu_memory_utilization=self.gpu_memory_utilization,
num_gpu_blocks_override=self.num_gpu_blocks_override,
kv_cache_ratio=self.kv_cache_ratio,
prealloc_dec_block_slot_num_threshold=self.prealloc_dec_block_slot_num_threshold,
enable_prefix_caching=self.enable_prefix_caching,
swap_space=self.swap_space,
cache_queue_port=self.cache_queue_port,