Fix EP (expert-parallel) prefill (#2762)

This commit is contained in:
RichardWooSJTU
2025-07-09 14:03:05 +08:00
committed by GitHub
parent c4718fd693
commit fee544e808
7 changed files with 66 additions and 32 deletions

View File

@@ -34,6 +34,7 @@ from fastdeploy.model_executor.layers.attention.base_attention_backend import (
from fastdeploy.model_executor.layers.attention.ops import (
get_block_shape_and_split_kv_block, gqa_rope_write_cache,
init_signal_layerwise, open_shm_and_get_meta_signal, pre_cache_len_concat)
from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
from fastdeploy.worker.forward_meta import ForwardMeta
@@ -100,22 +101,16 @@ class FlashAttentionBackend(AttentionBackend):
self.use_speculate = self.speculative_method is not None
self.speculate_max_draft_token_num = fd_config.speculative_config.num_speculative_tokens
self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
self.rank: int = fd_config.parallel_config.tensor_parallel_rank
# pd_disaggregation
self.use_pd_disaggregation: int = int(
os.getenv("FLAGS_use_pd_disaggregation", 0))
self.start_layer_index: int = fd_config.model_config.start_layer_index
self.device_id: int = os.getenv("CUDA_VISIBLE_DEVICES", None)
if fd_config.parallel_config.expert_parallel_rank is None:
fd_config.parallel_config.expert_parallel_rank = 0
device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
fd_config.parallel_config.expert_parallel_rank
if self.device_id is None:
self.device_id = device_id
else:
self.device_id = self.device_id.split(",")[device_id]
self.rank, self.device_id = init_rank_and_device_id(fd_config)
def get_attntion_meta(self):
"""get_attntion_meta"""