From cd62cc2df95525372a1dadbc41abb4363c523808 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 23 Dec 2025 12:13:11 +0000 Subject: [PATCH] Improve error handling in cleanup functions - Add check for already terminated processes before attempting cleanup - Add ProcessLookupError handling for race conditions - Add success logging for better debugging - Check if cache manager processes are alive before terminating Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> --- fastdeploy/entrypoints/api_server.py | 10 ++++++++++ fastdeploy/entrypoints/openai/api_server.py | 16 +++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/fastdeploy/entrypoints/api_server.py b/fastdeploy/entrypoints/api_server.py index a8fe7da9d..1ba5eb8fa 100644 --- a/fastdeploy/entrypoints/api_server.py +++ b/fastdeploy/entrypoints/api_server.py @@ -43,9 +43,19 @@ def cleanup_engine(): try: if hasattr(llm_engine, "worker_proc") and llm_engine.worker_proc is not None: try: + # 检查进程是否已经结束 + if llm_engine.worker_proc.poll() is not None: + api_server_logger.info("Worker process already terminated") + return + pgid = os.getpgid(llm_engine.worker_proc.pid) api_server_logger.info(f"Terminating worker process group {pgid}") os.killpg(pgid, signal.SIGTERM) + # 等待进程结束 + llm_engine.worker_proc.wait(timeout=5) + api_server_logger.info("Worker process terminated successfully") + except ProcessLookupError: + api_server_logger.info("Worker process already terminated") except Exception as e: api_server_logger.error(f"Error terminating worker process: {e}") except Exception as e: diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 35e9c836f..ac395bc51 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -97,26 +97,36 @@ def cleanup_processes(): try: if hasattr(llm_engine, "worker_proc") and llm_engine.worker_proc is not None: try: + # 检查进程是否已经结束 + if llm_engine.worker_proc.poll() is not None: + api_server_logger.info("Worker process already terminated") + return + pgid = os.getpgid(llm_engine.worker_proc.pid) api_server_logger.info(f"Terminating worker process group {pgid}") os.killpg(pgid, signal.SIGTERM) # 等待进程结束,如果超时则强制杀死 try: llm_engine.worker_proc.wait(timeout=5) + api_server_logger.info("Worker process terminated successfully") except subprocess.TimeoutExpired: api_server_logger.warning("Worker process did not terminate in time, sending SIGKILL") os.killpg(pgid, signal.SIGKILL) llm_engine.worker_proc.wait(timeout=2) + except ProcessLookupError: + api_server_logger.info("Worker process already terminated") except Exception as e: api_server_logger.error(f"Error terminating worker process: {e}") if hasattr(llm_engine, "cache_manager_processes") and llm_engine.cache_manager_processes: for proc in llm_engine.cache_manager_processes: try: - proc.terminate() - proc.join(timeout=2) if proc.is_alive(): - proc.kill() + proc.terminate() + proc.join(timeout=2) + if proc.is_alive(): + proc.kill() + proc.join(timeout=1) except Exception as e: api_server_logger.error(f"Error terminating cache manager process: {e}") except Exception as e: