[Feature] Models API (#3073)

* add the v1/models interface and related code

* add model parameters

* default model verification

* unit test

* check model err_msg

* unit test

* type annotation

* model parameter in response

* modify document description

* modify document description

* unit test

* verification

* verification update

* model_name

* pre-commit

* update test case

* update test case

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update fastdeploy/entrypoints/openai/serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: LiqinruiG <37392159+LiqinruiG@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
This commit is contained in:
Yzc216
2025-08-21 17:02:56 +08:00
committed by GitHub
parent b7eee3aec1
commit 466cbb5a99
13 changed files with 289 additions and 20 deletions

View File

@@ -46,8 +46,9 @@ class OpenAIServingChat:
OpenAI-style chat completions serving
"""
def __init__(self, engine_client, pid, ips, max_waiting_time, chat_template):
def __init__(self, engine_client, models, pid, ips, max_waiting_time, chat_template):
self.engine_client = engine_client
self.models = models
self.pid = pid
self.master_ip = ips
self.max_waiting_time = max_waiting_time
@@ -81,6 +82,14 @@ class OpenAIServingChat:
err_msg = f"Only master node can accept completion request, please send request to master node: {self.pod_ips[0]}"
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
if self.models:
is_supported, request.model = self.models.is_supported_model(request.model)
if not is_supported:
err_msg = f"Unsupported model: {request.model}, support {', '.join([x.name for x in self.models.model_paths])} or default"
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
try:
if self.max_waiting_time < 0:
await self.engine_client.semaphore.acquire()