[Bug fix] Fix cudagraph when using EP. (#3130)

* fix cudagraph when using ep * fix typo * reduce full length to adapt to large bsz such as 128/256
2025-10-05 16:48:03 +08:00 · 2025-08-04 18:06:18 +08:00
parent 2bd8a50649
commit 01d7586661
1 changed files with 6 additions and 0 deletions
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -525,6 +525,12 @@ class GPUModelRunner(ModelRunnerBase):
            num_tokens // batch_size,
            self.parallel_config.max_model_len - max_dec_len,
        )
        # NOTE(wanglongzhi): When the full length is too large, DeepEP's buffer is not large enough, which causes NaN to appear in the results.
        # TODO(wanglongzhi): Figure out the accurate buffer size of DeepEP.
        if self.fd_config.parallel_config.enable_expert_parallel:
            full_length = min(full_length, 32)
        input_length = int(full_length * self.cache_config.kv_cache_ratio)
        block_num = (
            input_length + self.cache_config.block_size - 1