[BugFix] fix workers=1 (#4364)

* [Feature] support prefix cache in DP * fix * Update common_engine.py * Update common_engine.py * Update common_engine.py * Update common_engine.py * [BugFix] fix workers more than 1 * fix * Update api_server.py * fix * Update api_server.py * fix --------- Co-authored-by: ltd0924 <luotingdan@baidu.com>
2025-11-02 20:54:03 +08:00 · 2025-10-15 17:06:25 +08:00
parent bcaa98ff9c
commit d8841b7b40
5 changed files with 47 additions and 18 deletions
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -22,13 +22,13 @@ import time
 import traceback
 from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from multiprocessing import current_process

 import uvicorn
 import zmq
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse, Response, StreamingResponse
+from gunicorn.app.base import BaseApplication
 from opentelemetry import trace
 from prometheus_client import CONTENT_TYPE_LATEST

@@ -87,6 +87,21 @@ if args.tool_parser_plugin:
 llm_engine = None


+class StandaloneApplication(BaseApplication):
+    def __init__(self, app, options=None):
+        self.application = app
+        self.options = options or {}
+        super().__init__()
+
+    def load_config(self):
+        config = {key: value for key, value in self.options.items() if key in self.cfg.settings and value is not None}
+        for key, value in config.items():
+            self.cfg.set(key.lower(), value)
+
+    def load(self):
+        return self.application
+
+
 def load_engine():
    """
    load engine
@@ -95,10 +110,10 @@ def load_engine():
    if llm_engine is not None:
        return llm_engine

-    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}")
+    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}, port: {args.port}")
    engine_args = EngineArgs.from_cli_args(args)
    engine = LLMEngine.from_engine_args(engine_args)
-    if not engine.start(api_server_pid=os.getpid()):
+    if not engine.start(api_server_pid=args.port):
        api_server_logger.error("Failed to initialize FastDeploy LLM engine, service exit now!")
        return None

@@ -119,12 +134,12 @@ def load_data_service():
    global llm_engine
    if llm_engine is not None:
        return llm_engine
-    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}")
+    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}, port: {args.port}")
    engine_args = EngineArgs.from_cli_args(args)
    config = engine_args.create_engine_config()
    api_server_logger.info(f"local_data_parallel_id: {config.parallel_config}")
    expert_service = ExpertService(config, config.parallel_config.local_data_parallel_id)
-    if not expert_service.start(os.getpid(), config.parallel_config.local_data_parallel_id):
+    if not expert_service.start(args.port, config.parallel_config.local_data_parallel_id):
        api_server_logger.error("Failed to initialize FastDeploy LLM expert service, service exit now!")
        return None
    llm_engine = expert_service
@@ -136,13 +151,22 @@ async def lifespan(app: FastAPI):
    """
    async context manager for FastAPI lifespan
    """
+    import logging
+
+    uvicorn_access = logging.getLogger("uvicorn.access")
+    uvicorn_access.handlers.clear()
+
+    # 使用 gunicorn 的格式
+    formatter = logging.Formatter("[%(asctime)s] [%(process)d] [INFO] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(formatter)
+    uvicorn_access.addHandler(handler)
+    uvicorn_access.propagate = False

    if args.tokenizer is None:
        args.tokenizer = args.model
-    if current_process().name != "MainProcess":
-        pid = os.getppid()
-    else:
-        pid = os.getpid()
+    pid = args.port
    api_server_logger.info(f"{pid}")

    if args.served_model_name is not None:
@@ -449,16 +473,17 @@ def launch_api_server() -> None:
    api_server_logger.info(f"args: {args.__dict__}")
    fd_start_span("FD_START")

+    options = {
+        "bind": f"{args.host}:{args.port}",
+        "workers": args.workers,
+        "worker_class": "uvicorn.workers.UvicornWorker",
+        "loglevel": "info",
+        "log_config": UVICORN_CONFIG,
+        "timeout_graceful_shutdown": args.timeout_graceful_shutdown,
+    }
+
    try:
-        uvicorn.run(
-            app="fastdeploy.entrypoints.openai.api_server:app",
-            host=args.host,
-            port=args.port,
-            workers=args.workers,
-            log_config=UVICORN_CONFIG,
-            log_level="info",
-            timeout_graceful_shutdown=args.timeout_graceful_shutdown,
-        )  # set log level to error to avoid log
+        StandaloneApplication(app, options).run()
    except Exception as e:
        api_server_logger.error(f"launch sync http server error, {e}, {str(traceback.format_exc())}")

--- a/requirements.txt
+++ b/requirements.txt
@@ -30,6 +30,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 modelscope
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0
--- a/requirements_dcu.txt
+++ b/requirements_dcu.txt
@@ -28,6 +28,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0
 opentelemetry-instrumentation-redis
--- a/requirements_iluvatar.txt
+++ b/requirements_iluvatar.txt
@@ -29,6 +29,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0
 opentelemetry-instrumentation-redis
--- a/requirements_metaxgpu.txt
+++ b/requirements_metaxgpu.txt
@@ -30,6 +30,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 modelscope
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0