[Feature] Models API (#3073)

* add v1/models interface and related handling

* add model parameters

* default model verification

* unit test

* check model err_msg

* unit test

* type annotation

* model parameter in response

* modify document description

* modify document description

* unit test

* verification

* verification update

* model_name

* pre-commit

* update test case

* update test case

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/entrypoints/openai/test_serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update fastdeploy/entrypoints/openai/serving_models.py

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

---------

Co-authored-by: LiqinruiG <37392159+LiqinruiG@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Yzc216
2025-08-21 17:02:56 +08:00
committed by GitHub
parent b7eee3aec1
commit 466cbb5a99
13 changed files with 289 additions and 20 deletions

View File

@@ -46,6 +46,8 @@ When using FastDeploy to deploy models (including offline inference and service
| ```dynamic_load_weight``` | `int` | Whether to enable dynamic weight loading, default: 0 |
| ```enable_expert_parallel``` | `bool` | Whether to enable expert parallel |
| ```enable_logprob``` | `bool` | Whether to return log probabilities of the output tokens. If true, the log probabilities of each output token are returned in the message content. If logprob is not used, this parameter can be omitted at startup |
| ```served_model_name``` | `str` | The model name used in the API. If not specified, the model name will be the same as the --model argument |
| ```revision``` | `str` | The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, the default version will be used. |
| ```chat_template``` | `str` | Specify the template used for model prompt concatenation. It supports both string input and file path input. The default value is None; if not specified, the model's default template will be used. |
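
As a quick sanity check for the new parameters, the sketch below lists the models exposed by a running FastDeploy server through the OpenAI SDK. It is a minimal sketch: the host, port, and API key are assumptions for illustration, not part of this change.

```python
# Minimal sketch, assuming a FastDeploy OpenAI-compatible server is already
# running on localhost:8188 and does not require a real API key.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

# /v1/models reports the name passed via --served-model-name,
# or the --model value when --served-model-name is omitted.
for model in client.models.list().data:
    print(model.id)
```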
## 1. Relationship between KVCache allocation, ```num_gpu_blocks_override``` and ```block_size```?

View File

@@ -44,6 +44,8 @@
| ```dynamic_load_weight``` | `int` | Whether to load weights dynamically, default: 0 |
| ```enable_expert_parallel``` | `bool` | Whether to enable expert parallelism |
| ```enable_logprob``` | `bool` | Whether to return log probabilities of the output tokens. If logprob is not used, this parameter can be omitted at startup. |
| ```served_model_name``` | `str` | The model name used in the API. If not specified, the model name will be the same as the --model argument |
| ```revision``` | `str` | The Git version (branch name or tag) of the model to use when it is downloaded automatically |
| ```chat_template``` | `str` | Specify the template used for model prompt concatenation. It supports both string input and file path input. The default value is None; if not specified, the model's default template will be used |
## 1. Relationship between KVCache allocation, ```num_gpu_blocks_override``` and ```block_size```?

View File

@@ -50,6 +50,10 @@ class EngineArgs:
"""
The name or path of the model to be used.
"""
served_model_name: Optional[str] = None
"""
The name of the model being served.
"""
revision: Optional[str] = "master"
"""
The revision for downloading models.
@@ -379,6 +383,12 @@ class EngineArgs:
default=EngineArgs.model,
help="Model name or path to be used.",
)
model_group.add_argument(
"--served-model-name",
type=nullable_str,
default=EngineArgs.served_model_name,
help="Served model name",
)
model_group.add_argument(
"--revision",
type=nullable_str,
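
For illustration, here is a standalone sketch of how the two new flags behave once parsed. The `nullable_str` defined below is a simplified stand-in (an assumption, not FastDeploy's actual helper); the real code wires the flags into `EngineArgs` via `model_group.add_argument` as shown above.

```python
# Self-contained sketch of the new CLI flags with a hypothetical nullable_str helper.
import argparse
from typing import Optional


def nullable_str(value: str) -> Optional[str]:
    # Stand-in helper: treat "" and "None" as None, pass everything else through.
    return None if value in ("", "None") else value


parser = argparse.ArgumentParser()
parser.add_argument("--served-model-name", type=nullable_str, default=None, help="Served model name")
parser.add_argument("--revision", type=nullable_str, default="master", help="The revision for downloading models.")

args = parser.parse_args(["--served-model-name", "my-served-model"])
print(args.served_model_name)  # -> my-served-model
print(args.revision)           # -> master
```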

View File

@@ -40,9 +40,11 @@ from fastdeploy.entrypoints.openai.protocol import (
CompletionResponse,
ControlSchedulerRequest,
ErrorResponse,
ModelList,
)
from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat
from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion
from fastdeploy.entrypoints.openai.serving_models import ModelPath, OpenAIServingModels
from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
from fastdeploy.metrics.metrics import (
EXCLUDE_LABELS,
@@ -128,6 +130,15 @@ async def lifespan(app: FastAPI):
else:
pid = os.getpid()
api_server_logger.info(f"{pid}")
if args.served_model_name is not None:
served_model_names = args.served_model_name
verification = True
else:
served_model_names = args.model
verification = False
model_paths = [ModelPath(name=served_model_names, model_path=args.model, verification=verification)]
engine_client = EngineClient(
args.model,
args.tokenizer,
@@ -144,8 +155,22 @@ async def lifespan(app: FastAPI):
args.tool_call_parser,
)
app.state.dynamic_load_weight = args.dynamic_load_weight
chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time, chat_template)
completion_handler = OpenAIServingCompletion(engine_client, pid, args.ips, args.max_waiting_time)
model_handler = OpenAIServingModels(
model_paths,
args.max_model_len,
args.ips,
)
app.state.model_handler = model_handler
chat_handler = OpenAIServingChat(
engine_client, app.state.model_handler, pid, args.ips, args.max_waiting_time, chat_template
)
completion_handler = OpenAIServingCompletion(
engine_client,
app.state.model_handler,
pid,
args.ips,
args.max_waiting_time,
)
engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
engine_client.pid = pid
app.state.engine_client = engine_client
@@ -309,6 +334,23 @@ async def create_completion(request: CompletionRequest):
return JSONResponse(status_code=e.status_code, content={"detail": e.detail})
@app.get("/v1/models")
async def list_models() -> Response:
"""
List all available models.
"""
if app.state.dynamic_load_weight:
status, msg = app.state.engine_client.is_workers_alive()
if not status:
return JSONResponse(content={"error": "Worker Service Not Healthy"}, status_code=304)
models = await app.state.model_handler.list_models()
if isinstance(models, ErrorResponse):
return JSONResponse(content=models.model_dump(), status_code=models.code)
elif isinstance(models, ModelList):
return JSONResponse(content=models.model_dump())
@app.get("/update_model_weight")
def update_model_weight(request: Request) -> Response:
"""

View File

@@ -18,6 +18,7 @@ from __future__ import annotations
import json
import time
import uuid
from typing import Any, Dict, List, Literal, Optional, Union
from pydantic import BaseModel, Field, model_validator
@@ -55,6 +56,37 @@ class UsageInfo(BaseModel):
prompt_tokens_details: Optional[PromptTokenUsageInfo] = None
class ModelPermission(BaseModel):
id: str = Field(default_factory=lambda: f"modelperm-{str(uuid.uuid4().hex)}")
object: str = "model_permission"
created: int = Field(default_factory=lambda: int(time.time()))
allow_create_engine: bool = False
allow_sampling: bool = True
allow_logprobs: bool = True
allow_search_indices: bool = False
allow_view: bool = True
allow_fine_tuning: bool = False
organization: str = "*"
group: Optional[str] = None
is_blocking: bool = False
class ModelInfo(BaseModel):
id: str
object: str = "model"
created: int = Field(default_factory=lambda: int(time.time()))
owned_by: str = "FastDeploy"
root: Optional[str] = None
parent: Optional[str] = None
max_model_len: Optional[int] = None
permission: list[ModelPermission] = Field(default_factory=list)
class ModelList(BaseModel):
object: str = "list"
data: list[ModelInfo] = Field(default_factory=list)
class FunctionCall(BaseModel):
"""
Function call.
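
To make the response schema concrete, the sketch below builds a one-entry ModelList with the classes added above and serializes it the same way the /v1/models handler does; the id and root values are illustrative placeholders.

```python
# Sketch of the /v1/models response body; names and paths are placeholders.
import json

from fastdeploy.entrypoints.openai.protocol import ModelInfo, ModelList, ModelPermission

model_list = ModelList(
    data=[
        ModelInfo(
            id="my-served-model",        # illustrative served name
            root="/path/to/weights",     # illustrative model path
            max_model_len=2048,
            permission=[ModelPermission()],
        )
    ]
)

print(json.dumps(model_list.model_dump(), indent=2))
```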

View File

@@ -46,8 +46,9 @@ class OpenAIServingChat:
OpenAI-style chat completions serving
"""
def __init__(self, engine_client, pid, ips, max_waiting_time, chat_template):
def __init__(self, engine_client, models, pid, ips, max_waiting_time, chat_template):
self.engine_client = engine_client
self.models = models
self.pid = pid
self.master_ip = ips
self.max_waiting_time = max_waiting_time
@@ -81,6 +82,14 @@ class OpenAIServingChat:
err_msg = f"Only master node can accept completion request, please send request to master node: {self.pod_ips[0]}"
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
if self.models:
is_supported, request.model = self.models.is_supported_model(request.model)
if not is_supported:
err_msg = f"Unsupported model: {request.model}, support {', '.join([x.name for x in self.models.model_paths])} or default"
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
try:
if self.max_waiting_time < 0:
await self.engine_client.semaphore.acquire()
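
From the client side, the check above means that when verification is enabled a chat request may name either the served model or the literal "default", which is resolved to the served name; any other value gets the error response. A minimal sketch via the OpenAI SDK, where the host, port, and served name are assumptions:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

resp = client.chat.completions.create(
    model="default",  # resolved server-side to the name set by --served-model-name
    messages=[{"role": "user", "content": "hello"}],
)
print(resp.model)  # typically echoes the resolved served model name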

View File

@@ -38,8 +38,9 @@ from fastdeploy.worker.output import LogprobsLists
class OpenAIServingCompletion:
def __init__(self, engine_client, pid, ips, max_waiting_time):
def __init__(self, engine_client, models, pid, ips, max_waiting_time):
self.engine_client = engine_client
self.models = models
self.pid = pid
self.master_ip = ips
self.host_ip = get_host_ip()
@@ -71,6 +72,12 @@ class OpenAIServingCompletion:
err_msg = f"Only master node can accept completion request, please send request to master node: {self.pod_ips[0]}"
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
if self.models:
is_supported, request.model = self.models.is_supported_model(request.model)
if not is_supported:
err_msg = f"Unsupported model: {request.model}, support {', '.join([x.name for x in self.models.model_paths])} or default"
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
created_time = int(time.time())
if request.user is not None:
request_id = f"cmpl-{request.user}-{uuid.uuid4()}"
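
The same validation applies to text completions; a minimal sketch, again assuming a local server and a configured served name:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8188/v1", api_key="EMPTY")

resp = client.completions.create(
    model="default",  # or the exact --served-model-name value
    prompt="Hello",
    max_tokens=16,
)
print(resp.choices[0].text)
```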

View File

@@ -0,0 +1,96 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from dataclasses import dataclass
from typing import List, Union
from fastdeploy.entrypoints.openai.protocol import (
ErrorResponse,
ModelInfo,
ModelList,
ModelPermission,
)
from fastdeploy.utils import api_server_logger, get_host_ip
@dataclass
class ModelPath:
name: str
model_path: str
verification: bool = False
class OpenAIServingModels:
"""
OpenAI-style models serving
"""
def __init__(
self,
model_paths: list[ModelPath],
max_model_len: int,
ips: Union[List[str], str],
):
self.model_paths = model_paths
self.max_model_len = max_model_len
self.master_ip = ips
self.host_ip = get_host_ip()
if self.master_ip is not None:
if isinstance(self.master_ip, list):
self.master_ip = self.master_ip[0]
else:
self.master_ip = self.master_ip.split(",")[0]
def _check_master(self):
if self.master_ip is None:
return True
if self.host_ip == self.master_ip:
return True
return False
def is_supported_model(self, model_name) -> tuple[bool, str]:
"""
Check whether the specified model is supported.
"""
if self.model_paths[0].verification is False:
return True, self.model_name()
if model_name == "default":
return True, self.model_name()
return any(model.name == model_name for model in self.model_paths), model_name
def model_name(self) -> str:
"""
Returns the current model name.
"""
return self.model_paths[0].name
async def list_models(self) -> ModelList:
"""
Show available models.
"""
if not self._check_master():
err_msg = (
f"Only master node can accept models request, please send request to master node: {self.master_ip}"
)
api_server_logger.error(err_msg)
return ErrorResponse(message=err_msg, code=400)
model_infos = [
ModelInfo(
id=model.name, max_model_len=self.max_model_len, root=model.model_path, permission=[ModelPermission()]
)
for model in self.model_paths
]
return ModelList(data=model_infos)
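
A minimal usage sketch of the class above; the served name and path are placeholders, and `verification=True` mirrors the case where --served-model-name was set.

```python
import asyncio

from fastdeploy.entrypoints.openai.serving_models import ModelPath, OpenAIServingModels

paths = [ModelPath(name="my-served-model", model_path="/path/to/weights", verification=True)]
serving_models = OpenAIServingModels(model_paths=paths, max_model_len=2048, ips=None)

print(serving_models.is_supported_model("default"))          # (True, 'my-served-model')
print(serving_models.is_supported_model("my-served-model"))  # (True, 'my-served-model')
print(serving_models.is_supported_model("other-model"))      # (False, 'other-model')

print(asyncio.run(serving_models.list_models()).data[0].id)  # my-served-model
```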

View File

@@ -321,7 +321,7 @@ def test_model_invalid():
payload = build_request_payload(TEMPLATE, data)
resp = send_request(URL, payload).json()
assert resp.get("object") == "chat.completion", "A non-existent model should trigger a validation exception"
assert "non-existent-model" in resp.get("model"), "The expected model info was not returned"
# assert "non-existent-model" in resp.get("model"), "The expected model info was not returned"
assert len(resp.get("choices")[0].get("message").get("content")) > 0, "With a non-existent model name, no reply was generated"
@@ -341,7 +341,7 @@ def test_model_with_special_characters():
payload = build_request_payload(TEMPLATE, data)
resp = send_request(URL, payload).json()
assert resp.get("object") == "chat.completion", "A non-existent model should trigger a validation exception"
assert "!@#" in resp.get("model"), "The expected model info was not returned"
# assert "!@#" in resp.get("model"), "The expected model info was not returned"
assert (
len(resp.get("choices")[0].get("message").get("content")) > 0
), "With an illegally formatted model parameter, no reply was generated"

View File

@@ -24,7 +24,9 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
def test_single_prompt_non_streaming(self):
"""测试单prompt非流式响应"""
self.completion_handler = OpenAIServingCompletion(self.mock_engine, pid=123, ips=None, max_waiting_time=30)
self.completion_handler = OpenAIServingCompletion(
self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30
)
request = CompletionRequest(prompt="test prompt", max_tokens=10, echo=True, logprobs=1)
@@ -54,7 +56,9 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
async def test_echo_back_prompt_and_streaming(self):
"""测试_echo_back_prompt方法和流式响应的prompt拼接逻辑"""
self.completion_handler = OpenAIServingCompletion(self.mock_engine, pid=123, ips=None, max_waiting_time=30)
self.completion_handler = OpenAIServingCompletion(
self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30
)
request = CompletionRequest(prompt="test prompt", max_tokens=10, stream=True, echo=True)
@@ -76,7 +80,9 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
def test_multi_prompt_non_streaming(self):
"""测试多prompt非流式响应"""
self.completion_handler = OpenAIServingCompletion(self.mock_engine, pid=123, ips=None, max_waiting_time=30)
self.completion_handler = OpenAIServingCompletion(
self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30
)
request = CompletionRequest(prompt=["prompt1", "prompt2"], max_tokens=10, echo=True)
@@ -108,7 +114,9 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
self.assertEqual(response.choices[1].text, "prompt2 response2")
async def test_multi_prompt_streaming(self):
self.completion_handler = OpenAIServingCompletion(self.mock_engine, pid=123, ips=None, max_waiting_time=30)
self.completion_handler = OpenAIServingCompletion(
self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30
)
request = CompletionRequest(prompt=["prompt1", "prompt2"], max_tokens=10, stream=True, echo=True)
@@ -140,7 +148,7 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
res = {"outputs": {"send_idx": 0, "text": "!"}}
idx = 0
instance = OpenAIServingCompletion(self.mock_engine, pid=123, ips=None, max_waiting_time=30)
instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30)
await instance._echo_back_prompt(request, res, idx)
self.assertEqual(res["outputs"]["text"], "Hello!")
@@ -149,7 +157,7 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
res = {"outputs": {"send_idx": 0, "text": "!"}}
idx = 0
instance = OpenAIServingCompletion(self.mock_engine, pid=123, ips=None, max_waiting_time=30)
instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30)
await instance._echo_back_prompt(request, res, idx)
self.assertEqual(res["outputs"]["text"], "Hello!")
@@ -158,7 +166,7 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
res = {"outputs": {"send_idx": 1, "text": "!"}}
idx = 0
instance = OpenAIServingCompletion(self.mock_engine, pid=123, ips=None, max_waiting_time=30)
instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30)
await instance._echo_back_prompt(request, res, idx)
self.assertEqual(res["outputs"]["text"], "!")
@@ -168,7 +176,7 @@ class TestCompletionEcho(unittest.IsolatedAsyncioTestCase):
res = {"outputs": {"send_idx": 0, "text": "!"}}
idx = 0
instance = OpenAIServingCompletion(self.mock_engine, pid=123, ips=None, max_waiting_time=30)
instance = OpenAIServingCompletion(self.mock_engine, models=None, pid=123, ips=None, max_waiting_time=30)
await instance._echo_back_prompt(request, res, idx)
self.assertEqual(res["outputs"]["text"], "!")

View File

@@ -16,7 +16,7 @@ class TestOpenAIServingCompletion(unittest.TestCase):
engine_client = Mock()
engine_client.reasoning_parser = "ernie_x1"
# Create an OpenAIServingCompletion instance
serving_completion = OpenAIServingCompletion(engine_client, "pid", "ips", 360)
serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
# Create a mock output and set finish_reason to "tool_call"
output = {"tool_call": "tool_call"}
# Call the calc_finish_reason method
@@ -29,7 +29,7 @@ class TestOpenAIServingCompletion(unittest.TestCase):
engine_client = Mock()
engine_client.reasoning_parser = "ernie_x1"
# Create an OpenAIServingCompletion instance
serving_completion = OpenAIServingCompletion(engine_client, "pid", "ips", 360)
serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
# Create a mock output and set finish_reason to another value
output = {"finish_reason": "other_reason"}
# Call the calc_finish_reason method
@@ -41,7 +41,7 @@ class TestOpenAIServingCompletion(unittest.TestCase):
# Create a mock engine_client
engine_client = Mock()
# Create an OpenAIServingCompletion instance
serving_completion = OpenAIServingCompletion(engine_client, "pid", "ips", 360)
serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
# Create a mock output
output = {}
# Call the calc_finish_reason method
@@ -52,7 +52,7 @@ class TestOpenAIServingCompletion(unittest.TestCase):
def test_request_output_to_completion_response(self):
engine_client = Mock()
# Create an OpenAIServingCompletion instance
openai_serving_completion = OpenAIServingCompletion(engine_client, "pid", "ips", 360)
openai_serving_completion = OpenAIServingCompletion(engine_client, None, "pid", "ips", 360)
final_res_batch: List[RequestOutput] = [
{
"outputs": {

View File

@@ -0,0 +1,51 @@
import asyncio
import unittest
from fastdeploy.entrypoints.openai.protocol import ModelInfo, ModelList
from fastdeploy.entrypoints.openai.serving_models import ModelPath, OpenAIServingModels
from fastdeploy.utils import get_host_ip
MODEL_NAME = "baidu/ERNIE-4.5-0.3B-PT"
MODEL_PATHS = [ModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
MAX_MODEL_LEN = 2048
async def _async_serving_models_init() -> OpenAIServingModels:
"""Asynchronously initialize an OpenAIServingModels instance."""
return OpenAIServingModels(
model_paths=MODEL_PATHS,
max_model_len=MAX_MODEL_LEN,
ips=get_host_ip(),
)
class TestOpenAIServingModels(unittest.TestCase):
"""Unit test for OpenAIServingModels"""
def test_serving_model_name(self):
"""Test model name retrieval"""
# Run the async initialization via asyncio.run()
serving_models = asyncio.run(_async_serving_models_init())
self.assertEqual(serving_models.model_name(), MODEL_NAME)
def test_list_models(self):
"""Test the model listing functionality"""
serving_models = asyncio.run(_async_serving_models_init())
# Run the async method via asyncio.run()
result = asyncio.run(serving_models.list_models())
# Verify the return type and contents
self.assertIsInstance(result, ModelList)
self.assertEqual(len(result.data), 1)
model_info = result.data[0]
self.assertIsInstance(model_info, ModelInfo)
self.assertEqual(model_info.id, MODEL_NAME)
self.assertEqual(model_info.max_model_len, MAX_MODEL_LEN)
self.assertEqual(model_info.root, MODEL_PATHS[0].model_path)
self.assertEqual(result.object, "list")
if __name__ == "__main__":
unittest.main()
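
A possible follow-up test, not part of this commit, that also covers is_supported_model; it mirrors the fixtures in the file above, where verification defaults to False so every requested name resolves to the served name.

```python
import unittest

from fastdeploy.entrypoints.openai.serving_models import ModelPath, OpenAIServingModels

MODEL_NAME = "baidu/ERNIE-4.5-0.3B-PT"


class TestIsSupportedModel(unittest.TestCase):
    def test_default_alias_without_verification(self):
        serving_models = OpenAIServingModels(
            model_paths=[ModelPath(name=MODEL_NAME, model_path=MODEL_NAME)],
            max_model_len=2048,
            ips=None,
        )
        # With verification left at its default (False), any requested name is
        # accepted and resolved to the single served model name.
        self.assertEqual(serving_models.is_supported_model("default"), (True, MODEL_NAME))
        self.assertEqual(serving_models.is_supported_model("anything"), (True, MODEL_NAME))


if __name__ == "__main__":
    unittest.main()
```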

View File

@@ -57,7 +57,12 @@ class TestLodChatTemplate(unittest.IsolatedAsyncioTestCase):
async def test_serving_chat(self):
request = ChatCompletionRequest(messages=[{"role": "user", "content": "你好"}])
self.chat_completion_handler = OpenAIServingChat(
self.mock_engine, pid=123, ips=None, max_waiting_time=-1, chat_template=self.input_chat_template
self.mock_engine,
models=None,
pid=123,
ips=None,
max_waiting_time=-1,
chat_template=self.input_chat_template,
)
async def mock_chat_completion_full_generator(
@@ -79,7 +84,12 @@ class TestLodChatTemplate(unittest.IsolatedAsyncioTestCase):
async def test_serving_chat_cus(self):
request = ChatCompletionRequest(messages=[{"role": "user", "content": "hi"}], chat_template="hello")
self.chat_completion_handler = OpenAIServingChat(
self.mock_engine, pid=123, ips=None, max_waiting_time=10, chat_template=self.input_chat_template
self.mock_engine,
models=None,
pid=123,
ips=None,
max_waiting_time=10,
chat_template=self.input_chat_template,
)
async def mock_chat_completion_full_generator(