[BugFix] [PD Disaggregation] fix v1 scheduler prefill node profile run & ipc transfer protocol (#5132)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled

* [fix] fix v1 scheduler profile run for append attention in prefill node

* [fix] skip send_signal if kv signal not inited for gpu and xpu

* [fix] extend fix to flash_attn & mla_attn

* [fix] fix v1 pd run in ipc transfer protocol

* [ci] add test for v1 pd profile run using ipc transfer protocol

* [style] fix code style check

* [style] fix code style again

* [fix] fix profile run

* [update] remove --num-gpu-blocks-override in example script

* [chore] rename forward_meta.is_profiling to is_dummy_or_profile_run
This commit is contained in:
Yonghua Li
2025-11-20 21:39:22 +08:00
committed by GitHub
parent 01c30f6b87
commit 43097a512a
12 changed files with 512 additions and 94 deletions

View File

@@ -1229,7 +1229,7 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["mask_rollback"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")
def _prepare_inputs(self) -> None:
def _prepare_inputs(self, is_dummy_or_profile_run=False) -> None:
"""Prepare the model inputs"""
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
recover_decode_task(
@@ -1280,7 +1280,7 @@ class GPUModelRunner(ModelRunnerBase):
max_bad_tokens_len = np.max(self.share_inputs["bad_tokens_len"].numpy())
# Initialize forward meta data
self.initialize_forward_meta()
self.initialize_forward_meta(is_dummy_or_profile_run=is_dummy_or_profile_run)
# Get sampling metadata
self.sampling_metadata = SamplingMetadata(
@@ -1334,7 +1334,7 @@ class GPUModelRunner(ModelRunnerBase):
"""Get current model"""
return self.model
def initialize_forward_meta(self):
def initialize_forward_meta(self, is_dummy_or_profile_run=False):
"""
Initialize forward meta and attention meta data
"""
@@ -1386,6 +1386,9 @@ class GPUModelRunner(ModelRunnerBase):
only_prefill_use_cudagraph if self.cudagraph_only_prefill else only_decode_use_cudagraph
)
# Propagate is_dummy_or_profile_run so attention backends can skip init_kv_signal_per_query during dummy/profile runs
self.forward_meta.is_dummy_or_profile_run = is_dummy_or_profile_run
# Initialize attention metadata
for attn_backend in self.attn_backends:
attn_backend.init_attention_metadata(self.forward_meta)
@@ -1778,7 +1781,7 @@ class GPUModelRunner(ModelRunnerBase):
while True:
# 1. Initialize forward meta and attention meta data
self._prepare_inputs()
self._prepare_inputs(is_dummy_or_profile_run=True)
# 2. Padding inputs for cuda graph
self.forward_meta.step_use_cudagraph = in_capturing and self.forward_meta.step_use_cudagraph