mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 08:37:06 +08:00
[Feature][MTP]Support MTP for rl-model (#4009)
* qk norm for speculate decode C16 * support mtp in v1_scheduler mode * support mtp rope_3d * support mtp features * add unit test && del some log --------- Co-authored-by: yuanxiaolan <yuanxiaolan01@baidu.com> Co-authored-by: xiaoxiaohehe001 <hiteezsf@163.com>
This commit is contained in:
@@ -84,10 +84,14 @@ class ResourceManagerV1(ResourceManager):
|
||||
return len(request.block_tables) * self.config.cache_config.block_size
|
||||
|
||||
def get_new_block_nums(self, request: Request, num_new_tokens: int):
|
||||
return (
|
||||
block_num = (
|
||||
request.num_computed_tokens + num_new_tokens + self.config.cache_config.block_size - 1
|
||||
) // self.config.cache_config.block_size - len(request.block_tables)
|
||||
|
||||
if self.config.speculative_config.method is not None:
|
||||
block_num = min(block_num + 1, self.config.cache_config.max_block_num_per_seq)
|
||||
return block_num
|
||||
|
||||
def _prepare_prefill_task(self, request, new_token_num):
|
||||
request.prefill_start_index = request.num_computed_tokens
|
||||
request.prefill_end_index = request.num_computed_tokens + new_token_num
|
||||
|
Reference in New Issue
Block a user