[Feature] add custom chat template (#3251)

* add custom chat_template

* add custom chat_template

* add unittest

* fix

* add docs

* fix comment

* add offline chat

* fix unit test

* fix unit test

* fix

* fix pre commit

* fix unit test

* add unit test

* add unit test

* add unit test

* fix pre_commit

* fix enable_thinking

* fix pre commit

* fix pre commit

* fix unit test

* add requirements
Author: luukunn
Date: 2025-08-18 16:34:08 +08:00
Committed by: GitHub
Parent: 70ee910cd5
Commit: 9c129813f9
19 changed files with 288 additions and 3 deletions
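
Background for the diff below: a chat template is a small Jinja program that serializes a list of role/content messages into the single prompt string a model expects. The snippet that follows is a minimal, self-contained illustration of the idea using plain jinja2; the template text and the <|role|> tags are invented for illustration and are not FastDeploy defaults.

from jinja2 import Template

# Invented template text, for illustration only: render each message as
# "<|role|>: content" and end with an assistant tag for the model to continue.
CHAT_TEMPLATE = (
    "{% for m in messages %}"
    "<|{{ m['role'] }}|>: {{ m['content'] }}\n"
    "{% endfor %}"
    "<|assistant|>:"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

print(Template(CHAT_TEMPLATE).render(messages=messages))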

@@ -30,6 +30,7 @@ from prometheus_client import CONTENT_TYPE_LATEST
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.engine import LLMEngine
+from fastdeploy.entrypoints.chat_utils import load_chat_template
 from fastdeploy.entrypoints.engine_client import EngineClient
 from fastdeploy.entrypoints.openai.protocol import (
     ChatCompletionRequest,
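
The added import pulls in load_chat_template from fastdeploy.entrypoints.chat_utils. Its implementation is not part of this hunk; the sketch below only illustrates a plausible contract (accept None, a file path, or an inline template string) and is an assumption, not the repo's code.

from pathlib import Path
from typing import Optional

def load_chat_template_sketch(chat_template: Optional[str]) -> Optional[str]:
    """Hypothetical stand-in for fastdeploy.entrypoints.chat_utils.load_chat_template."""
    if chat_template is None:
        # No template configured: fall back to the model's built-in template.
        return None
    path = Path(chat_template)
    if path.is_file():
        # Treat the argument as a path to a Jinja template file.
        return path.read_text(encoding="utf-8")
    # Otherwise treat the argument as an inline template string.
    return chat_template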
@@ -77,6 +78,7 @@ parser.add_argument("--max-concurrency", default=512, type=int, help="max concur
 parser = EngineArgs.add_cli_args(parser)
 args = parser.parse_args()
 args.model = retrive_model_from_server(args.model, args.revision)
+chat_template = load_chat_template(args.chat_template)
 if args.tool_parser_plugin:
     ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 llm_engine = None
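
The call load_chat_template(args.chat_template) runs once at startup, so the template is resolved before any request arrives. The flag itself is presumably registered by EngineArgs.add_cli_args; the argparse wiring below is a hypothetical sketch of that registration, not the actual EngineArgs code.

import argparse

parser = argparse.ArgumentParser()
# Hypothetical flag definition; the real one would live in EngineArgs.add_cli_args.
parser.add_argument(
    "--chat-template",
    default=None,
    help="Path to a Jinja chat template file, or an inline template string.",
)
args = parser.parse_args(["--chat-template", "my_template.jinja"])
assert args.chat_template == "my_template.jinja"  # dashes become underscores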
@@ -141,7 +143,7 @@ async def lifespan(app: FastAPI):
         args.tool_call_parser,
     )
     app.state.dynamic_load_weight = args.dynamic_load_weight
-    chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time)
+    chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time, chat_template)
     completion_handler = OpenAIServingCompletion(engine_client, pid, args.ips, args.max_waiting_time)
     engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
     engine_client.pid = pid
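
Passing chat_template as the extra argument makes the server-level template available to every chat completion. A common design, and a reasonable guess at the intent here (the OpenAIServingChat internals are not shown in this diff), is to let a per-request template override the server-wide default:

class ServingChatSketch:
    """Hypothetical handler showing one way the stored template could be used."""

    def __init__(self, chat_template=None):
        self.chat_template = chat_template  # server-wide default from startup

    def resolve_template(self, request_template=None):
        # Prefer a template supplied on the request; fall back to the default.
        # Returning None would mean "use the tokenizer's built-in template".
        return request_template or self.chat_template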