[Bug fix] Fix bug when D blocks are not enough (#3479)

* Support batched tokens for EP
* Support batched tokens for EP
* Support batched tokens for EP
* Support batched tokens for EP
* Support batched tokens for EP and fix bug
* Support batched tokens for EP and fix bug
* Support batched tokens for EP and fix bug
* Support batched tokens for EP and fix bug
* Fix bug for memory allocation
* Fix bug when D blocks are not enough
* Fix bug when D blocks are not enough
* Fix bug when D blocks are not enough
* Fix cache message recycle step
* Fix cache message recycle step
* Fix step_idx recycle
2025-10-06 00:57:33 +08:00 · 2025-08-21 11:36:16 +08:00
parent c487b62ee0
commit 6854506533
4 changed files with 120 additions and 51 deletions
--- a/fastdeploy/cache_manager/cache_messager.py
+++ b/fastdeploy/cache_manager/cache_messager.py
@@ -252,6 +252,9 @@ class CacheMessager:
            self.last_step_idx = -1
            self.last_layer_idx = -1  # int32

+            max_step_idx = 100003
+            engine_recycled_count = 0
+
            while True:

                cache_info = self.engine_worker_queue.get_cache_info()
@@ -271,7 +274,6 @@ class CacheMessager:
                                current_info["status"] = "init"
                                logger.info(f"start cache_infos: {current_info}")
                            self.cache_info[info["request_id"]] = current_info
-                            self.last_step_idx = min(self.last_step_idx, current_info["current_id"])
                        else:
                            self.cache_info[info["request_id"]] = info
                prefilled_layer_idx = layer_shm_value.value[0]
@@ -287,7 +289,18 @@ class CacheMessager:
                if not self.cache_info:
                    time.sleep(0.001)
                    continue
-                logger.debug(f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx: {prefilled_step_idx}")
+                if self.last_step_idx > prefilled_step_idx:
+                    engine_recycled_count += 1
+                self.last_step_idx = prefilled_step_idx  # only copy value read from shm memory
+                prefilled_step_idx = (
+                    prefilled_step_idx + max_step_idx * engine_recycled_count
+                )  # remap prefilled_step_idx for comparison
+
+                logger.debug(
+                    f"prefilled_layer_idx: {prefilled_layer_idx}, prefilled_step_idx in shm: {self.last_step_idx},"
+                    f"prefilled_step_idx: {prefilled_step_idx} engine_recycled_count {engine_recycled_count}"
+                )
+
                for req_id, item in list(self.cache_info.items()):
                    if "status" not in item:
                        continue
@@ -318,7 +331,8 @@ class CacheMessager:
                    if item["current_id"] < prefilled_step_idx:
                        current_layer_idx = self.num_hidden_layers
                    else:
-                        current_layer_idx = prefilled_layer_idx + 1
+                        if item["current_id"] == prefilled_step_idx:
+                            current_layer_idx = prefilled_layer_idx + 1

                    for layer_idx in range(item["layer_idx"], current_layer_idx):
                        tic = time.time()
@@ -361,9 +375,7 @@ class CacheMessager:
                            self.engine_worker_queue.put_finished_req([(item["request_id"], "finished")])
                            logger.info(f"put write cache {item['request_id']}")
                        del self.cache_info[req_id]
-
-                    self.last_step_idx = prefilled_step_idx
-                    self.last_layer_idx = prefilled_layer_idx
+                self.last_layer_idx = prefilled_layer_idx

        except Exception as e:
            logger.info(f"prefill layerwise send cache thread has exception: {e}")