[Feature]CP support data clear (#4214)

* Update serving_chat.py
* Update serving_completion.py
* Update serving_completion.py
* mv connection_manager init
* [BugFix] fix kv cache
* fix format
* [Feature] support clear data

---------

Co-authored-by: Yuanle Liu <yuanlehome@163.com>
Co-authored-by: RAM <gstian5555@outlook.com>
2025-10-30 19:36:42 +08:00 · 2025-09-23 16:53:39 +08:00
parent f38b174a75
commit de4feff147
10 changed files with 65 additions and 0 deletions
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -141,6 +141,9 @@ class EngineClient:
        self.zmq_client = ZmqIpcClient(model, mode)
        self.zmq_client.connect()

+    def check_model_weight_status(self):
+        return self.model_weights_status_signal.value[0] < 0
+
    async def format_and_add_data(self, prompts: dict):
        """
        Format the request data and send the request to the server.
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -480,6 +480,7 @@ def reset_scheduler():

    if llm_engine is None:
        return Response("Engine not loaded", status_code=500)
+    llm_engine.engine.clear_data()
    llm_engine.engine.scheduler.reset()
    return Response("Scheduler Reset Successfully", status_code=200)

--- a/fastdeploy/entrypoints/openai/serving_chat.py
+++ b/fastdeploy/entrypoints/openai/serving_chat.py
@@ -210,6 +210,8 @@ class OpenAIServingChat:
                decoder_base_url=self.tokenizer_base_url,
            )
            while num_choices > 0:
+                if self.engine_client.check_model_weight_status():
+                    raise ValueError("Engine is clearing model weight")
                try:
                    response = await asyncio.wait_for(response_queue.get(), timeout=10)
                    current_waiting_time = 0
@@ -425,6 +427,8 @@ class OpenAIServingChat:
                decoder_base_url=self.tokenizer_base_url,
            )
            while True:
+                if self.engine_client.check_model_weight_status():
+                    raise ValueError("Engine is clearing model weight")
                try:
                    response = await asyncio.wait_for(response_queue.get(), timeout=10)
                    current_waiting_time = 0
--- a/fastdeploy/entrypoints/openai/serving_completion.py
+++ b/fastdeploy/entrypoints/openai/serving_completion.py
@@ -216,6 +216,8 @@ class OpenAIServingCompletion:
            completion_batched_token_ids = [[] for _ in range(num_choices)]
            current_waiting_time = 0
            while num_choices > 0:
+                if self.engine_client.check_model_weight_status():
+                    raise ValueError("Engine is clearing model weight")
                try:
                    response = await asyncio.wait_for(response_queue.get(), timeout=10)
                    current_waiting_time = 0
@@ -333,6 +335,8 @@ class OpenAIServingCompletion:
            )
            current_waiting_time = 0
            while num_choices > 0:
+                if self.engine_client.check_model_weight_status():
+                    raise ValueError("Engine is clearing model weight")
                try:
                    response = await asyncio.wait_for(response_queue.get(), timeout=10)
                    current_waiting_time = 0