diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py
index 7af00d0aa..310f8ed7f 100644
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -22,13 +22,13 @@ import time
 import traceback
 from collections.abc import AsyncGenerator
 from contextlib import asynccontextmanager
-from multiprocessing import current_process
 
 import uvicorn
 import zmq
 from fastapi import FastAPI, HTTPException, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse, Response, StreamingResponse
+from gunicorn.app.base import BaseApplication
 from opentelemetry import trace
 from prometheus_client import CONTENT_TYPE_LATEST
 
@@ -87,6 +87,21 @@ if args.tool_parser_plugin:
 
 llm_engine = None
 
+
+class StandaloneApplication(BaseApplication):
+    def __init__(self, app, options=None):
+        self.application = app
+        self.options = options or {}
+        super().__init__()
+
+    def load_config(self):
+        config = {key: value for key, value in self.options.items() if key in self.cfg.settings and value is not None}
+        for key, value in config.items():
+            self.cfg.set(key.lower(), value)
+
+    def load(self):
+        return self.application
+
 def load_engine():
     """
     load engine
@@ -95,10 +110,10 @@ def load_engine():
     if llm_engine is not None:
         return llm_engine
 
-    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}")
+    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}, port: {args.port}")
     engine_args = EngineArgs.from_cli_args(args)
     engine = LLMEngine.from_engine_args(engine_args)
 
-    if not engine.start(api_server_pid=os.getpid()):
+    if not engine.start(api_server_pid=args.port):
         api_server_logger.error("Failed to initialize FastDeploy LLM engine, service exit now!")
         return None
@@ -119,12 +134,12 @@ def load_data_service():
     global llm_engine
     if llm_engine is not None:
         return llm_engine
-    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}")
+    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}, port: {args.port}")
     engine_args = EngineArgs.from_cli_args(args)
     config = engine_args.create_engine_config()
     api_server_logger.info(f"local_data_parallel_id: {config.parallel_config}")
     expert_service = ExpertService(config, config.parallel_config.local_data_parallel_id)
-    if not expert_service.start(os.getpid(), config.parallel_config.local_data_parallel_id):
+    if not expert_service.start(args.port, config.parallel_config.local_data_parallel_id):
         api_server_logger.error("Failed to initialize FastDeploy LLM expert service, service exit now!")
         return None
     llm_engine = expert_service
@@ -136,13 +151,22 @@ async def lifespan(app: FastAPI):
     """
     async context manager for FastAPI lifespan
     """
+    import logging
+
+    uvicorn_access = logging.getLogger("uvicorn.access")
+    uvicorn_access.handlers.clear()
+
+    # Use gunicorn's log format
+    formatter = logging.Formatter("[%(asctime)s] [%(process)d] [INFO] %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
+
+    handler = logging.StreamHandler()
+    handler.setFormatter(formatter)
+    uvicorn_access.addHandler(handler)
+    uvicorn_access.propagate = False
 
     if args.tokenizer is None:
         args.tokenizer = args.model
-    if current_process().name != "MainProcess":
-        pid = os.getppid()
-    else:
-        pid = os.getpid()
+    pid = args.port
     api_server_logger.info(f"{pid}")
 
     if args.served_model_name is not None:
@@ -449,16 +473,17 @@ def launch_api_server() -> None:
     api_server_logger.info(f"args: {args.__dict__}")
     fd_start_span("FD_START")
 
+    options = {
+        "bind": f"{args.host}:{args.port}",
+        "workers": args.workers,
+        "worker_class": "uvicorn.workers.UvicornWorker",
+        "loglevel": "info",
+        "log_config": UVICORN_CONFIG,
+        "timeout_graceful_shutdown": args.timeout_graceful_shutdown,
+    }
+
     try:
-        uvicorn.run(
-            app="fastdeploy.entrypoints.openai.api_server:app",
-            host=args.host,
-            port=args.port,
-            workers=args.workers,
-            log_config=UVICORN_CONFIG,
-            log_level="info",
-            timeout_graceful_shutdown=args.timeout_graceful_shutdown,
-        )  # set log level to error to avoid log
+        StandaloneApplication(app, options).run()
     except Exception as e:
         api_server_logger.error(f"launch sync http server error, {e}, {str(traceback.format_exc())}")
 
diff --git a/requirements.txt b/requirements.txt
index 8eb02b628..32acf860d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,6 +30,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 modelscope
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0
diff --git a/requirements_dcu.txt b/requirements_dcu.txt
index 79bac3a62..a622320a9 100644
--- a/requirements_dcu.txt
+++ b/requirements_dcu.txt
@@ -28,6 +28,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0
 opentelemetry-instrumentation-redis
diff --git a/requirements_iluvatar.txt b/requirements_iluvatar.txt
index d481e3feb..7983b3b58 100644
--- a/requirements_iluvatar.txt
+++ b/requirements_iluvatar.txt
@@ -29,6 +29,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0
 opentelemetry-instrumentation-redis
diff --git a/requirements_metaxgpu.txt b/requirements_metaxgpu.txt
index 26f6de095..c17f3b354 100644
--- a/requirements_metaxgpu.txt
+++ b/requirements_metaxgpu.txt
@@ -30,6 +30,7 @@ use-triton-in-paddle
 crcmod
 fastsafetensors==0.1.14
 msgpack
+gunicorn
 modelscope
 opentelemetry-api>=1.24.0
 opentelemetry-sdk>=1.24.0
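
For context on the new `StandaloneApplication` class: it follows gunicorn's documented custom-application pattern, in which a `BaseApplication` subclass carries the app object plus a dict of settings. Below is a minimal, self-contained sketch of that pattern; `BaseApplication`, `cfg.settings`, and `cfg.set` are real gunicorn APIs, while `demo_app`, `DemoApplication`, and the option values are illustrative and not part of this patch.

```python
# Minimal sketch of gunicorn's BaseApplication embedding pattern
# (illustrative; not the patch's code verbatim).
from gunicorn.app.base import BaseApplication


def demo_app(environ, start_response):
    # Trivial WSGI app so the sketch runs without FastAPI/uvicorn.
    start_response("200 OK", [("Content-Type", "text/plain")])
    return [b"ok\n"]


class DemoApplication(BaseApplication):
    def __init__(self, app, options=None):
        # Attributes must be set before super().__init__(), which
        # immediately triggers load_config() via do_load_config().
        self.application = app
        self.options = options or {}
        super().__init__()

    def load_config(self):
        # Only keys gunicorn recognizes survive the cfg.settings filter;
        # anything else is silently dropped rather than applied.
        for key, value in self.options.items():
            if key in self.cfg.settings and value is not None:
                self.cfg.set(key.lower(), value)

    def load(self):
        return self.application


if __name__ == "__main__":
    DemoApplication(demo_app, {"bind": "127.0.0.1:8000", "workers": 2}).run()
```

One consequence of that filter worth noting for this patch: in the `options` dict passed by `launch_api_server()`, `log_config` and `timeout_graceful_shutdown` are uvicorn option names rather than gunicorn settings (gunicorn's closest equivalents are `logconfig_dict` and `graceful_timeout`), so as written they would appear to be filtered out in `load_config()` instead of taking effect.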
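The logging block added to `lifespan()` replaces uvicorn's access-log handler so per-request lines match gunicorn's `[time] [pid] [INFO] message` shape. A standalone sketch of the same technique follows; the explicit `setLevel` call and the sample log record are illustrative additions so the snippet produces output on its own.

```python
# Standalone sketch of the access-log reformatting done in lifespan().
# Note the literal "[INFO]" in the format string: every record prints
# as INFO regardless of its level (use %(levelname)s to reflect it).
import logging

uvicorn_access = logging.getLogger("uvicorn.access")
uvicorn_access.handlers.clear()  # drop any previously attached handler
uvicorn_access.setLevel(logging.INFO)  # demo only; uvicorn sets this in the server

formatter = logging.Formatter(
    "[%(asctime)s] [%(process)d] [INFO] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
uvicorn_access.addHandler(handler)
uvicorn_access.propagate = False  # keep records out of the root logger

# Illustrative record; in the server, uvicorn emits these per request.
uvicorn_access.info('127.0.0.1:54321 - "GET /v1/models HTTP/1.1" 200')
```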