Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 16:48:03 +08:00)
[NewFeature]Support dp multi api server && Fix some bug in mixed ep && merge develop (#3598)
* [Feature] update ep
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix queue ports idx
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* fix ci
* Update engine.py
* fix ci
* fix some bug in mixed ep
* add server fix and op fix
* rm some log
* fix code style
* ltd fix
* fix
* fix
* fix some bug
* fix bug
* fix bug
* fix style
* Update config.py
* Update splitwise_connector.py
* Update cache_messager.py
* Update __init__.py
* merge and fix
* Update engine.py
* Update common_engine.py
* Update run_ci_xpu.sh
* Update ernie_processor.py
* Update ernie_processor.py

---------

Co-authored-by: ltd0924 <ltd0924@sina.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
@@ -31,6 +31,7 @@ from prometheus_client import CONTENT_TYPE_LATEST
 
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.engine import LLMEngine
+from fastdeploy.engine.expert_service import ExpertService
 from fastdeploy.entrypoints.chat_utils import load_chat_template
 from fastdeploy.entrypoints.engine_client import EngineClient
 from fastdeploy.entrypoints.openai.protocol import (
@@ -60,6 +61,7 @@ from fastdeploy.utils import (
     FlexibleArgumentParser,
     StatefulSemaphore,
     api_server_logger,
+    configure_uvicorn_logging,
     console_logger,
     is_port_available,
     retrive_model_from_server,
@@ -98,15 +100,10 @@ def load_engine():
     api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}")
     engine_args = EngineArgs.from_cli_args(args)
     engine = LLMEngine.from_engine_args(engine_args)
 
     if not engine.start(api_server_pid=os.getpid()):
         api_server_logger.error("Failed to initialize FastDeploy LLM engine, service exit now!")
         return None
 
-    api_server_logger.info("FastDeploy LLM engine initialized!\n")
-    console_logger.info(f"Launching metrics service at http://{args.host}:{args.metrics_port}/metrics")
-    console_logger.info(f"Launching chat completion service at http://{args.host}:{args.port}/v1/chat/completions")
-    console_logger.info(f"Launching completion service at http://{args.host}:{args.port}/v1/completions")
-
     llm_engine = engine
     return engine
@@ -117,6 +114,25 @@ MAX_CONCURRENT_CONNECTIONS = (args.max_concurrency + args.workers - 1) // args.workers
 connection_semaphore = StatefulSemaphore(MAX_CONCURRENT_CONNECTIONS)
 
 
+def load_data_service():
+    """
+    load data service
+    """
+    global llm_engine
+    if llm_engine is not None:
+        return llm_engine
+    api_server_logger.info(f"FastDeploy LLM API server starting... {os.getpid()}")
+    engine_args = EngineArgs.from_cli_args(args)
+    config = engine_args.create_engine_config()
+    api_server_logger.info(f"local_data_parallel_id: {config.parallel_config}")
+    expert_service = ExpertService(config, config.parallel_config.local_data_parallel_id)
+    if not expert_service.start(os.getpid(), config.parallel_config.local_data_parallel_id):
+        api_server_logger.error("Failed to initialize FastDeploy LLM expert service, service exit now!")
+        return None
+    llm_engine = expert_service
+    return expert_service
+
+
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     """
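The new load_data_service mirrors load_engine: both cache whatever service they start in the module-level llm_engine, so a repeated call reuses the running instance instead of starting a second one, and the lifespan hook can later find it. Below is a minimal, self-contained sketch of that caching pattern; the stub class is purely illustrative and stands in for FastDeploy's real LLMEngine / ExpertService objects.

# Illustration only: the module-level cache shared by load_engine / load_data_service.
llm_engine = None


class _StubService:
    """Stand-in for LLMEngine / ExpertService (not part of FastDeploy)."""

    def start(self) -> bool:
        # The real services spawn worker processes here; the stub just succeeds.
        return True


def load_service():
    global llm_engine
    if llm_engine is not None:
        return llm_engine          # already started: reuse the cached instance
    service = _StubService()
    if not service.start():
        return None                # startup failed: leave the cache empty
    llm_engine = service           # cache for later callers (e.g. the lifespan hook)
    return service


assert load_service() is load_service()  # the second call hits the cache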
@@ -140,19 +156,20 @@ async def lifespan(app: FastAPI):
     model_paths = [ModelPath(name=served_model_names, model_path=args.model, verification=verification)]
 
     engine_client = EngineClient(
-        args.model,
-        args.tokenizer,
-        args.max_model_len,
-        args.tensor_parallel_size,
-        pid,
-        args.limit_mm_per_prompt,
-        args.mm_processor_kwargs,
+        model_name_or_path=args.model,
+        tokenizer=args.tokenizer,
+        max_model_len=args.max_model_len,
+        tensor_parallel_size=args.tensor_parallel_size,
+        pid=pid,
+        port=int(args.engine_worker_queue_port[args.local_data_parallel_id]),
+        limit_mm_per_prompt=args.limit_mm_per_prompt,
+        mm_processor_kwargs=args.mm_processor_kwargs,
         # args.enable_mm,
-        args.reasoning_parser,
-        args.data_parallel_size,
-        args.enable_logprob,
-        args.workers,
-        args.tool_call_parser,
+        reasoning_parser=args.reasoning_parser,
+        data_parallel_size=args.data_parallel_size,
+        enable_logprob=args.enable_logprob,
+        workers=args.workers,
+        tool_parser=args.tool_call_parser,
     )
     app.state.dynamic_load_weight = args.dynamic_load_weight
     model_handler = OpenAIServingModels(
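Besides switching the EngineClient call to keyword arguments, the notable addition is port=int(args.engine_worker_queue_port[args.local_data_parallel_id]): each API server indexes the list of engine worker queue ports by its own local data-parallel rank, so every server talks to its own engine worker queue. A small standalone illustration of that selection, using made-up example values (in the server the values come from the parsed CLI arguments):

# Example values only; in api_server.py these come from args.engine_worker_queue_port
# and args.local_data_parallel_id.
engine_worker_queue_port = ["8002", "8003", "8004", "8005"]  # one port per local DP rank
local_data_parallel_id = 2                                   # this server's rank

# Same expression as in the diff: pick this rank's entry and coerce it to int.
port = int(engine_worker_queue_port[local_data_parallel_id])
print(port)  # -> 8004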
@@ -176,6 +193,9 @@ async def lifespan(app: FastAPI):
     app.state.engine_client = engine_client
     app.state.chat_handler = chat_handler
     app.state.completion_handler = completion_handler
+    global llm_engine
+    if llm_engine is not None:
+        llm_engine.engine.data_processor = engine_client.data_processor
     yield
     # close zmq
     try:
@@ -510,8 +530,18 @@ def launch_controller_server():
 
 def main():
     """main函数"""
-    if load_engine() is None:
-        return
+    configure_uvicorn_logging()
+    load_model_register_plugins()
+    if args.local_data_parallel_id == 0:
+        if not load_engine():
+            return
+    else:
+        if not load_data_service():
+            return
+    api_server_logger.info("FastDeploy LLM engine initialized!\n")
+    console_logger.info(f"Launching metrics service at http://{args.host}:{args.metrics_port}/metrics")
+    console_logger.info(f"Launching chat completion service at http://{args.host}:{args.port}/v1/chat/completions")
+    console_logger.info(f"Launching completion service at http://{args.host}:{args.port}/v1/completions")
 
     launch_controller_server()
     launch_metrics_server()
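Taken together with the new load_data_service above, the patched main() ("main函数" translates to "main function") now dispatches on the local data-parallel rank: rank 0 boots the full LLMEngine through load_engine(), every other rank attaches an ExpertService through load_data_service(), and the startup banners move from load_engine() into main(). A self-contained sketch of that dispatch, with stub loaders standing in for the real ones in this module:

# Stubs for illustration; in api_server.py the real load_engine / load_data_service
# start an LLMEngine or ExpertService and cache it in the module-level llm_engine.
def load_engine():
    return "LLMEngine"


def load_data_service():
    return "ExpertService"


def start_for_rank(local_data_parallel_id: int):
    # Same shape as the patched main(): rank 0 owns the full engine, other
    # local data-parallel ranks run an expert/data-service-backed API server.
    if local_data_parallel_id == 0:
        service = load_engine()
    else:
        service = load_data_service()
    if not service:
        return None  # main() simply returns and this server never comes up
    return service


assert start_for_rank(0) == "LLMEngine"
assert start_for_rank(1) == "ExpertService"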