mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[BugFix] [PD Disaggregation] fix v1 scheduler prefill node profile run & ipc transfer protocol (#5132)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* [fix] fix v1 scheduler profile run for append attention in prefill node
* [fix] skip send_signal if kv signal not inited for gpu and xpu
* [fix] extend fix to flash_attn & mla_attn
* [fix] fix v1 pd run in ipc transfer protocol
* [ci] add test for v1 pd profile run using ipc transfer protocol
* [style] fix code style check
* [style] fix code style again
* [fix] fix profile run
* [update] remove --num-gpu-blocks-override in example script
* [chore] rename forward_meta is_profiling to is_dummy_or_profile_run
This commit is contained in:
@@ -1229,7 +1229,7 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["mask_rollback"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")

-    def _prepare_inputs(self) -> None:
+    def _prepare_inputs(self, is_dummy_or_profile_run=False) -> None:
         """Prepare the model inputs"""
         if envs.ENABLE_V1_KVCACHE_SCHEDULER:
             recover_decode_task(
@@ -1280,7 +1280,7 @@ class GPUModelRunner(ModelRunnerBase):
         max_bad_tokens_len = np.max(self.share_inputs["bad_tokens_len"].numpy())

         # Initialize forward meta data
-        self.initialize_forward_meta()
+        self.initialize_forward_meta(is_dummy_or_profile_run=is_dummy_or_profile_run)

         # Get sampling metadata
         self.sampling_metadata = SamplingMetadata(
@@ -1334,7 +1334,7 @@ class GPUModelRunner(ModelRunnerBase):
         """Get current model"""
         return self.model

-    def initialize_forward_meta(self):
+    def initialize_forward_meta(self, is_dummy_or_profile_run=False):
        """
        Initialize forward meta and attention meta data
        """
@@ -1386,6 +1386,9 @@ class GPUModelRunner(ModelRunnerBase):
             only_prefill_use_cudagraph if self.cudagraph_only_prefill else only_decode_use_cudagraph
         )

+        # Set forward_meta.is_dummy_or_profile_run to True to skip init_kv_signal_per_query for attention backends
+        self.forward_meta.is_dummy_or_profile_run = is_dummy_or_profile_run
+
         # Initialzie attention meta data
         for attn_backend in self.attn_backends:
             attn_backend.init_attention_metadata(self.forward_meta)
@@ -1778,7 +1781,7 @@ class GPUModelRunner(ModelRunnerBase):

         while True:
             # 1. Initialize forward meta and attention meta data
-            self._prepare_inputs()
+            self._prepare_inputs(is_dummy_or_profile_run=True)

             # 2. Padding inputs for cuda graph
             self.forward_meta.step_use_cudagraph = in_capturing and self.forward_meta.step_use_cudagraph
Reference in New Issue
Block a user