[GCU] Enable gcu CI (#3190)

* [GCU] Update to the latest version
* [GCU] Enable CI
2025-10-04 08:16:42 +08:00 · 2025-08-13 11:48:24 +08:00
parent ce9180241e
commit d1a92e3e17
6 changed files with 87 additions and 45 deletions
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -295,7 +295,7 @@ class GCUModelRunner(ModelRunnerBase):

        if self.speculative_method in ["mtp"]:
            self.proposer.insert_prefill_inputs(req_dicts)
-        self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests]
+        self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer

    def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decode_len: int):
        """Set dummy prefill inputs to share_inputs"""
@@ -675,7 +675,7 @@ class GCUModelRunner(ModelRunnerBase):
        )
        self.share_inputs["decoder_batch_ids"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
        self.share_inputs["decoder_tile_ids_per_batch"] = paddle.full([int(decode_max_tile_size)], 0, dtype="int32")
-        self.share_inputs["decoder_num_blocks_cpu"] = paddle.full([1], 0, dtype="int32").pin_memory()
+        self.share_inputs["decoder_num_blocks_cpu"] = paddle.full([1], 0, dtype="int32").cpu()
        self.share_inputs["max_len_tensor_cpu"] = paddle.full([8], 0, dtype="int32").cpu()

        # Get the attention backend
@@ -1062,9 +1062,7 @@ class GCUModelRunner(ModelRunnerBase):

        self._update_chunked_prefill(model_forward_batch)
        self._add_cache(model_forward_batch)
-        self.seq_lens_this_time_buffer[:num_running_requests].copy_(
-            self.share_inputs["seq_lens_this_time"][:num_running_requests], False
-        )
+        self.seq_lens_this_time_buffer.copy_(self.share_inputs["seq_lens_this_time"], False)
        return None

    def _add_cache(self, model_forward_batch) -> None: