mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
[Bug fix] Fix cudagraph when using ep. (#3130)
* fix cudagraph when using ep * fix typo * reduce full length to accommodate large bsz such as 128/256
This commit is contained in:
@@ -525,6 +525,12 @@ class GPUModelRunner(ModelRunnerBase):
|
|||||||
num_tokens // batch_size,
|
num_tokens // batch_size,
|
||||||
self.parallel_config.max_model_len - max_dec_len,
|
self.parallel_config.max_model_len - max_dec_len,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer size is insufficient, which causes NaN values to appear in the result.
|
||||||
|
# TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
|
||||||
|
if self.fd_config.parallel_config.enable_expert_parallel:
|
||||||
|
full_length = min(full_length, 32)
|
||||||
|
|
||||||
input_length = int(full_length * self.cache_config.kv_cache_ratio)
|
input_length = int(full_length * self.cache_config.kv_cache_ratio)
|
||||||
block_num = (
|
block_num = (
|
||||||
input_length + self.cache_config.block_size - 1
|
input_length + self.cache_config.block_size - 1
|
||||||
|
Reference in New Issue
Block a user