mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[BugFix] [PD Disaggregation] fix v1 scheduler prefill node profile run & ipc transfer protocol (#5132)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
* [fix] fix v1 scheduler profile run for append attention in prefill node
* [fix] skip send_signal if kv signal not inited for gpu and xpu
* [fix] extend fix to flash_attn & mla_attn
* [fix] fix v1 pd run in ipc transfer protocol
* [ci] add test for v1 pd profile run using ipc transfer protocol
* [style] fix code style check
* [style] fix code style again
* [fix] fix profile run
* [update] remove --num-gpu-blocks-override in example script
* [chore] rename forward_meta is_profiling to is_dummy_or_profile_run
This commit is contained in:
@@ -1229,7 +1229,7 @@ class GPUModelRunner(ModelRunnerBase):
         self.share_inputs["mask_rollback"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32")

-    def _prepare_inputs(self) -> None:
+    def _prepare_inputs(self, is_dummy_or_profile_run=False) -> None:
         """Prepare the model inputs"""
         if envs.ENABLE_V1_KVCACHE_SCHEDULER:
             recover_decode_task(
@@ -1280,7 +1280,7 @@ class GPUModelRunner(ModelRunnerBase):
         max_bad_tokens_len = np.max(self.share_inputs["bad_tokens_len"].numpy())

         # Initialize forward meta data
-        self.initialize_forward_meta()
+        self.initialize_forward_meta(is_dummy_or_profile_run=is_dummy_or_profile_run)

         # Get sampling metadata
         self.sampling_metadata = SamplingMetadata(
@@ -1334,7 +1334,7 @@ class GPUModelRunner(ModelRunnerBase):
         """Get current model"""
         return self.model

-    def initialize_forward_meta(self):
+    def initialize_forward_meta(self, is_dummy_or_profile_run=False):
        """
        Initialize forward meta and attention meta data
        """
@@ -1386,6 +1386,9 @@ class GPUModelRunner(ModelRunnerBase):
             only_prefill_use_cudagraph if self.cudagraph_only_prefill else only_decode_use_cudagraph
         )

+        # Set forward_meta.is_dummy_or_profile_run to True to skip init_kv_signal_per_query for attention backends
+        self.forward_meta.is_dummy_or_profile_run = is_dummy_or_profile_run
+
         # Initialzie attention meta data
         for attn_backend in self.attn_backends:
             attn_backend.init_attention_metadata(self.forward_meta)
@@ -1778,7 +1781,7 @@ class GPUModelRunner(ModelRunnerBase):

         while True:
             # 1. Initialize forward meta and attention meta data
-            self._prepare_inputs()
+            self._prepare_inputs(is_dummy_or_profile_run=True)

             # 2. Padding inputs for cuda graph
             self.forward_meta.step_use_cudagraph = in_capturing and self.forward_meta.step_use_cudagraph
Reference in New Issue
Block a user