mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 08:37:06 +08:00
[Feature] Models api (#3073)
* add v1/models interface related * add model parameters * default model verification * unit test * check model err_msg * unit test * type annotation * model parameter in response * modify document description * modify document description * unit test * verification * verification update * model_name * pre-commit * update test case * update test case * Update tests/entrypoints/openai/test_serving_models.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/entrypoints/openai/test_serving_models.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/entrypoints/openai/test_serving_models.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update tests/entrypoints/openai/test_serving_models.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> * Update fastdeploy/entrypoints/openai/serving_models.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --------- Co-authored-by: LiqinruiG <37392159+LiqinruiG@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
@@ -40,9 +40,11 @@ from fastdeploy.entrypoints.openai.protocol import (
|
||||
CompletionResponse,
|
||||
ControlSchedulerRequest,
|
||||
ErrorResponse,
|
||||
ModelList,
|
||||
)
|
||||
from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat
|
||||
from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion
|
||||
from fastdeploy.entrypoints.openai.serving_models import ModelPath, OpenAIServingModels
|
||||
from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
|
||||
from fastdeploy.metrics.metrics import (
|
||||
EXCLUDE_LABELS,
|
||||
@@ -128,6 +130,15 @@ async def lifespan(app: FastAPI):
|
||||
else:
|
||||
pid = os.getpid()
|
||||
api_server_logger.info(f"{pid}")
|
||||
|
||||
if args.served_model_name is not None:
|
||||
served_model_names = args.served_model_name
|
||||
verification = True
|
||||
else:
|
||||
served_model_names = args.model
|
||||
verification = False
|
||||
model_paths = [ModelPath(name=served_model_names, model_path=args.model, verification=verification)]
|
||||
|
||||
engine_client = EngineClient(
|
||||
args.model,
|
||||
args.tokenizer,
|
||||
@@ -144,8 +155,22 @@ async def lifespan(app: FastAPI):
|
||||
args.tool_call_parser,
|
||||
)
|
||||
app.state.dynamic_load_weight = args.dynamic_load_weight
|
||||
chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time, chat_template)
|
||||
completion_handler = OpenAIServingCompletion(engine_client, pid, args.ips, args.max_waiting_time)
|
||||
model_handler = OpenAIServingModels(
|
||||
model_paths,
|
||||
args.max_model_len,
|
||||
args.ips,
|
||||
)
|
||||
app.state.model_handler = model_handler
|
||||
chat_handler = OpenAIServingChat(
|
||||
engine_client, app.state.model_handler, pid, args.ips, args.max_waiting_time, chat_template
|
||||
)
|
||||
completion_handler = OpenAIServingCompletion(
|
||||
engine_client,
|
||||
app.state.model_handler,
|
||||
pid,
|
||||
args.ips,
|
||||
args.max_waiting_time,
|
||||
)
|
||||
engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
|
||||
engine_client.pid = pid
|
||||
app.state.engine_client = engine_client
|
||||
@@ -309,6 +334,23 @@ async def create_completion(request: CompletionRequest):
|
||||
return JSONResponse(status_code=e.status_code, content={"detail": e.detail})
|
||||
|
||||
|
||||
@app.get("/v1/models")
async def list_models() -> Response:
    """
    List all available models served by this deployment.

    Returns:
        JSONResponse: the ``ModelList`` payload on success, the error body
        (with its own status code) when the model handler reports an
        ``ErrorResponse``, or a 500 error for an unexpected handler result.
    """
    # Health gate used when weights can be hot-reloaded: refuse to serve
    # while the worker processes are down.
    if app.state.dynamic_load_weight:
        status, msg = app.state.engine_client.is_workers_alive()
        if not status:
            # NOTE(review): 304 appears to mirror the sibling endpoints in
            # this file, but "Not Modified" responses should not carry a
            # body per HTTP semantics — confirm whether 503 was intended.
            return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304)

    models = await app.state.model_handler.list_models()
    if isinstance(models, ErrorResponse):
        return JSONResponse(content=models.model_dump(), status_code=models.code)
    if isinstance(models, ModelList):
        return JSONResponse(content=models.model_dump())
    # Defensive fallback: the original fell through here and implicitly
    # returned None (serialized as `null` with a 200 status).
    return JSONResponse(
        content={"error": "Unexpected response type from model handler"},
        status_code=500,
    )
|
||||
|
||||
|
||||
@app.get("/update_model_weight")
|
||||
def update_model_weight(request: Request) -> Response:
|
||||
"""
|
||||
|
Reference in New Issue
Block a user