[Feature] [PD] add simple router and refine splitwise deployment (#4709)

* add simple router and refine splitwise deployment

* fix
This commit is contained in:
Juncai
2025-11-06 14:56:02 +08:00
committed by GitHub
parent 831266da7a
commit 08ca0f6aea
39 changed files with 2397 additions and 171 deletions

View File

@@ -94,10 +94,11 @@ async def async_request_eb_openai_chat_completions(
"stream_options": {
"include_usage": True,
"continuous_usage_stats": True,
}
},
"max_tokens": request_func_input.output_len,
}
if request_func_input.response_format:
payload["response_format"] =request_func_input.response_format
payload["response_format"] = request_func_input.response_format
# Hyperparameters are passed in from the YAML config
payload.update(request_func_input.hyper_parameters)
@@ -132,13 +133,13 @@ async def async_request_eb_openai_chat_completions(
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
if chunk != "[DONE]":
#print("####chunk:", chunk, type(chunk))
# print("####chunk:", chunk, type(chunk))
timestamp = time.perf_counter()
data = json.loads(chunk)
if request_id == "None" and "id" in data:
request_id = data["id"]
if choices := data.get("choices"):
content = choices[0]["delta"].get("content")
reason_content = choices[0]["delta"].get("reasoning_content")
@@ -164,7 +165,6 @@ async def async_request_eb_openai_chat_completions(
elif usage := data.get("usage", {}):
output.output_tokens = usage.get("completion_tokens", 0)
output.prompt_tokens = usage.get("prompt_tokens", 0)
most_recent_timestamp = timestamp

View File

@@ -46,7 +46,7 @@ class SampleRequest:
prompt_len: int
expected_output_len: int
response_format: Optional[dict] = None
class BenchmarkDataset(ABC):
"""BenchmarkDataset"""
@@ -299,7 +299,7 @@ class EBChatDataset(BenchmarkDataset):
prompt = entry["messages"][-1].get("content", "")
history_QA = entry.get("messages", [])
response_format = entry.get("response_format")
new_output_len = int(entry.get("max_tokens", 12288))
new_output_len = int(entry.get("max_tokens", output_len if output_len else 12288))
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(prompt, None)
@@ -311,7 +311,7 @@ class EBChatDataset(BenchmarkDataset):
prompt_len=0,
history_QA=history_QA,
expected_output_len=new_output_len,
response_format=response_format
response_format=response_format,
)
)
cnt += 1

View File

@@ -352,7 +352,7 @@ async def benchmark(
ignore_eos=ignore_eos,
debug=debug,
extra_body=extra_body,
response_format=response_format
response_format=response_format,
)
print("test_input:", test_input)
@@ -384,7 +384,7 @@ async def benchmark(
logprobs=logprobs,
ignore_eos=ignore_eos,
extra_body=extra_body,
response_format=response_format
response_format=response_format,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
@@ -444,7 +444,7 @@ async def benchmark(
debug=debug,
ignore_eos=ignore_eos,
extra_body=extra_body,
response_format=response_format
response_format=response_format,
)
tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -460,7 +460,7 @@ async def benchmark(
api_url=base_url + "/stop_profile",
output_len=test_output_len,
logprobs=logprobs,
response_format=response_format
response_format=response_format,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:

View File

@@ -3,4 +3,4 @@ max_num_seqs: 128
gpu_memory_utilization: 0.85
tensor_parallel_size: 1
limit_mm_per_prompt: '{"image": 100, "video": 100}'
enable_mm: True
enable_mm: True

View File

@@ -5,4 +5,4 @@ metadata:
max_tokens: 32768
repetition_penalty: 1.05
frequency_penalty: 0
presence_penalty: 0
presence_penalty: 0