Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[Cherry-Pick][Bug Fix] Fix the bug for real size 0 in cudagraph (#3888)
* fix the bug for real size 0 in cudagraph
* fix cache_messager

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
@@ -163,7 +163,7 @@ class CacheMessager:
         try:
             prefilled_step_idx_data = np.zeros(shape=[1], dtype=np.int32)
             prefilled_layer_idx_data = np.zeros(shape=[1], dtype=np.int32)
-            prefilled_layer_name = f"splitwise_complete_prefilled_step_{self.dp_rank_id}.{self.gpu_id}"
+            prefilled_layer_name = f"splitwise_complete_prefilled_layer_{self.dp_rank_id}.{self.gpu_id}"
             prefilled_step_name = f"splitwise_complete_prefilled_step_{self.dp_rank_id}.{self.gpu_id}"
             step_shm_value = IPCSignal(
                 name=f"splitwise_complete_prefilled_step_{self.dp_rank_id}",
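The CacheMessager change fixes a copy-paste error: the layer-progress signal reused the step signal's name, so the two IPC signals aliased the same shared-memory region and overwrote each other's values. A minimal sketch of that failure mode, using Python's standard `multiprocessing.shared_memory` as a stand-in for FastDeploy's `IPCSignal` (the name and size here are illustrative):

```python
# Sketch: two logically distinct signals created under the SAME name alias
# the same shared-memory block, so writing one clobbers the other.
import numpy as np
from multiprocessing import shared_memory

name = "splitwise_complete_prefilled_step_0.0"  # duplicated name (the bug)

step_shm = shared_memory.SharedMemory(name=name, create=True, size=4)
layer_shm = shared_memory.SharedMemory(name=name)  # attaches to the same block

step_sig = np.ndarray((1,), dtype=np.int32, buffer=step_shm.buf)
layer_sig = np.ndarray((1,), dtype=np.int32, buffer=layer_shm.buf)

step_sig[0] = 7     # writer thinks it is updating the step counter
layer_sig[0] = 42   # ...but the layer writer silently overwrites it
print(step_sig[0])  # prints 42, not 7: the two signals alias each other

layer_shm.close()
step_shm.close()
step_shm.unlink()
```

With distinct names (`..._prefilled_layer_...` vs `..._prefilled_step_...`), each signal gets its own region and the cross-talk disappears.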
@@ -42,6 +42,7 @@ from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
 from fastdeploy.model_executor.layers.sample.sampler import Sampler, SpeculativeSampler
 from fastdeploy.model_executor.model_loader import get_model_loader
 from fastdeploy.platforms import current_platform
+from fastdeploy.utils import ceil_div
 
 if current_platform.is_iluvatar():
     from fastdeploy.model_executor.ops.iluvatar import set_value_by_flags_and_idx
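For reference, `ceil_div` is an integer ceiling-division helper; a minimal equivalent of what `fastdeploy.utils` presumably provides (assuming positive integers) is:

```python
def ceil_div(x: int, y: int) -> int:
    """Integer ceiling division: smallest n such that n * y >= x, for y > 0."""
    return (x + y - 1) // y

assert ceil_div(3, 4) == 1   # floor division (3 // 4) would give 0
assert ceil_div(8, 4) == 2
```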
@@ -588,17 +589,16 @@ class GPUModelRunner(ModelRunnerBase):
         """Set dummy prefill inputs to share_inputs"""
         # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
         max_dec_len = expected_decode_len + 1
-        full_length = min(
-            num_tokens // batch_size,
+        input_length = min(
+            ceil_div(num_tokens, batch_size),
             self.parallel_config.max_model_len - max_dec_len,
         )
 
         # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size will not be enough to cause the result to appear nan.
         # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
         if self.fd_config.parallel_config.enable_expert_parallel:
-            full_length = min(full_length, 32)
+            input_length = min(input_length, 32)
 
-        input_length = int(full_length * self.cache_config.kv_cache_ratio)
         block_num = (
             input_length + self.cache_config.block_size - 1
         ) // self.cache_config.block_size + self.cache_config.enc_dec_block_num
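The switch from floor to ceiling division is what fixes the "real size 0" bug: when `num_tokens < batch_size`, `num_tokens // batch_size` is 0, so the dummy prefill request used for CUDA Graph capture gets a zero-length input. Dropping the `kv_cache_ratio` scaling removes a second path to the same zero, since a small `full_length` multiplied by a ratio below 1 truncates to 0. A quick check of the arithmetic, with illustrative numbers:

```python
# Illustrative numbers only: a capture size smaller than the batch.
num_tokens, batch_size, kv_cache_ratio = 3, 4, 0.75

# Old path: floor division, then scaling by kv_cache_ratio.
full_length = num_tokens // batch_size                 # 0
old_input_length = int(full_length * kv_cache_ratio)   # int(0.0) == 0

# New path: ceiling division, no ratio scaling.
new_input_length = -(-num_tokens // batch_size)        # same as ceil_div(3, 4) == 1

print(old_input_length, new_input_length)              # 0 1
```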