[Feature] add custom chat template (#3251)
* add custom chat_template
* add custom chat_template
* add unittest
* fix
* add docs
* fix comment
* add offline chat
* fix unit test
* fix unit test
* fix
* fix pre commit
* fix unit test
* add unit test
* add unit test
* add unit test
* fix pre_commit
* fix enable_thinking
* fix pre commit
* fix pre commit
* fix unit test
* add requirements
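The PR threads a user-supplied chat template through both the OpenAI-compatible server and the offline entrypoint. A minimal sketch of the offline side, assuming a `chat_template` keyword on `LLM.chat()` as the "add offline chat" bullet implies (the model name, template path, and keyword are illustrative, not confirmed by this diff):

```python
from fastdeploy import LLM, SamplingParams

# Placeholders: model name/path and template path are illustrative, and the
# chat_template keyword is assumed from this PR's "add offline chat" bullet.
llm = LLM(model="baidu/ERNIE-4.5-0.3B-Paddle")
messages = [{"role": "user", "content": "Hello!"}]
outputs = llm.chat(
    messages,
    SamplingParams(temperature=0.7),
    chat_template="./custom_chat_template.jinja",  # path to a Jinja template
)
print(outputs)
```

On the server side, the first change is a new import in the API server module: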
```diff
@@ -30,6 +30,7 @@ from prometheus_client import CONTENT_TYPE_LATEST
 from fastdeploy.engine.args_utils import EngineArgs
 from fastdeploy.engine.engine import LLMEngine
+from fastdeploy.entrypoints.chat_utils import load_chat_template
 from fastdeploy.entrypoints.engine_client import EngineClient
 from fastdeploy.entrypoints.openai.protocol import (
     ChatCompletionRequest,
```
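`load_chat_template` itself is not shown in this diff. Given that it is handed `args.chat_template` below, a plausible reading is that it resolves either a template file path or an inline Jinja string. A hedged sketch (the real implementation in `fastdeploy/entrypoints/chat_utils.py` may well differ):

```python
from pathlib import Path
from typing import Optional

def load_chat_template(chat_template: Optional[str]) -> Optional[str]:
    """Sketch only: return template text read from a file path, or pass an
    inline Jinja string through unchanged; None disables the override."""
    if chat_template is None:
        return None
    path = Path(chat_template)
    if path.is_file():
        return path.read_text(encoding="utf-8")
    return chat_template
```

With the helper imported, the server resolves the template once at startup, right after argument parsing: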
```diff
@@ -77,6 +78,7 @@ parser.add_argument("--max-concurrency", default=512, type=int, help="max concur
 parser = EngineArgs.add_cli_args(parser)
 args = parser.parse_args()
 args.model = retrive_model_from_server(args.model, args.revision)
+chat_template = load_chat_template(args.chat_template)
 if args.tool_parser_plugin:
     ToolParserManager.import_tool_parser(args.tool_parser_plugin)
 llm_engine = None
```
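Because the template is loaded once at module level, every request handler sees the same value. For illustration, a bare-bones Jinja template of the kind the (assumed) `--chat-template` flag backing `args.chat_template` might point to:

```python
# Illustrative template and launch line; the --chat-template flag name is
# assumed from the args.chat_template attribute used above.
CUSTOM_TEMPLATE = (
    "{% for message in messages %}"
    "{{ message['role'] }}: {{ message['content'] }}\n"
    "{% endfor %}"
    "assistant: "
)

with open("custom_chat_template.jinja", "w", encoding="utf-8") as f:
    f.write(CUSTOM_TEMPLATE)

# python -m fastdeploy.entrypoints.openai.api_server \
#     --model <model_dir> --chat-template ./custom_chat_template.jinja
```

Finally, the loaded template is handed to the chat handler during app startup: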
```diff
@@ -141,7 +143,7 @@ async def lifespan(app: FastAPI):
         args.tool_call_parser,
     )
     app.state.dynamic_load_weight = args.dynamic_load_weight
-    chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time)
+    chat_handler = OpenAIServingChat(engine_client, pid, args.ips, args.max_waiting_time, chat_template)
     completion_handler = OpenAIServingCompletion(engine_client, pid, args.ips, args.max_waiting_time)
     engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
     engine_client.pid = pid
```
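The only behavioral change in this hunk is the extra constructor argument: the server-level template now reaches the chat handler, while the completion handler is untouched (completions take raw prompts and never apply a chat template). A sketch of the plumbing this call site implies inside `OpenAIServingChat` (attribute and method names are assumptions; the real class lives in `fastdeploy/entrypoints/openai/serving_chat.py`):

```python
class OpenAIServingChat:
    # Sketch of the constructor implied by the call site above.
    def __init__(self, engine_client, pid, ips, max_waiting_time, chat_template=None):
        self.engine_client = engine_client
        self.pid = pid
        self.ips = ips
        self.max_waiting_time = max_waiting_time
        self.chat_template = chat_template  # server-wide default template

    def _resolve_chat_template(self, request):
        # Assumed precedence: a per-request template (if the request schema
        # carries one) overrides the server default loaded at startup.
        return getattr(request, "chat_template", None) or self.chat_template
```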