[FDConfig] remove engine client args, use fd_config instead (#5217)

* [refactor] remove engine client args, use fd_config instead

* [chore] update

* [fix] fix

* [fix] fix

* [chore] rename config to fd_config

* [fix] fix run_batch

* [ci] add ci case for engine client

---------

Co-authored-by: Jiaxin Sui <95567040+plusNew001@users.noreply.github.com>
This commit is contained in:
Yonghua Li
2025-11-28 17:20:54 +08:00
committed by GitHub
parent 73886204d4
commit a535050b11
4 changed files with 70 additions and 1561 deletions

View File

@@ -175,25 +175,12 @@ async def lifespan(app: FastAPI):
model_paths = [ModelPath(name=served_model_names, model_path=args.model, verification=verification)]
engine_args = EngineArgs.from_cli_args(args)
config = engine_args.create_engine_config(port_availability_check=False)
fd_config = engine_args.create_engine_config(port_availability_check=False)
engine_client = EngineClient(
model_name_or_path=args.model,
tokenizer=args.tokenizer,
max_model_len=args.max_model_len,
tensor_parallel_size=args.tensor_parallel_size,
pid=pid,
port=int(os.environ.get("INFERENCE_MSG_QUEUE_ID", "0")),
limit_mm_per_prompt=args.limit_mm_per_prompt,
mm_processor_kwargs=args.mm_processor_kwargs,
reasoning_parser=args.reasoning_parser,
data_parallel_size=args.data_parallel_size,
enable_logprob=args.enable_logprob,
fd_config=fd_config,
workers=args.workers,
tool_parser=args.tool_call_parser,
enable_prefix_caching=args.enable_prefix_caching,
splitwise_role=args.splitwise_role,
max_processor_cache=args.max_processor_cache,
config=config,
)
await engine_client.connection_manager.initialize()
app.state.dynamic_load_weight = args.dynamic_load_weight
@@ -224,14 +211,14 @@ async def lifespan(app: FastAPI):
embedding_handler = OpenAIServingEmbedding(
engine_client,
app.state.model_handler,
config,
fd_config,
pid,
args.ips,
args.max_waiting_time,
chat_template,
)
reward_handler = OpenAIServingReward(
engine_client, app.state.model_handler, config, pid, args.ips, args.max_waiting_time, chat_template
engine_client, app.state.model_handler, fd_config, pid, args.ips, args.max_waiting_time, chat_template
)
engine_client.create_zmq_client(model=pid, mode=zmq.PUSH)
engine_client.pid = pid