[Feature] [PD] add simple router and refine splitwise deployment (#4709)

* add simple router and refine splitwise deployment
* fix
2025-12-24 13:28:13 +08:00 · 2025-11-06 14:56:02 +08:00
parent 831266da7a
commit 08ca0f6aea
39 changed files with 2397 additions and 171 deletions
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -94,10 +94,11 @@ async def async_request_eb_openai_chat_completions(
            "stream_options": {
                "include_usage": True,
                "continuous_usage_stats": True,
-            }
+            },
+            "max_tokens": request_func_input.output_len,
        }
        if request_func_input.response_format:
-            payload["response_format"] =request_func_input.response_format
+            payload["response_format"] = request_func_input.response_format

        # 超参由yaml传入
        payload.update(request_func_input.hyper_parameters)
@@ -132,13 +133,13 @@ async def async_request_eb_openai_chat_completions(

                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
                        if chunk != "[DONE]":
-                            #print("####chunk:", chunk, type(chunk))
+                            # print("####chunk:", chunk, type(chunk))
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            if request_id == "None" and "id" in data:
                                request_id = data["id"]
-                            
+
                            if choices := data.get("choices"):
                                content = choices[0]["delta"].get("content")
                                reason_content = choices[0]["delta"].get("reasoning_content")
@@ -164,7 +165,6 @@ async def async_request_eb_openai_chat_completions(
                            elif usage := data.get("usage", {}):
                                output.output_tokens = usage.get("completion_tokens", 0)
                                output.prompt_tokens = usage.get("prompt_tokens", 0)
-                            

                            most_recent_timestamp = timestamp

--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -46,7 +46,7 @@ class SampleRequest:
    prompt_len: int
    expected_output_len: int
    response_format: Optional[dict] = None
-    
+

 class BenchmarkDataset(ABC):
    """BenchmarkDataset"""
@@ -299,7 +299,7 @@ class EBChatDataset(BenchmarkDataset):
            prompt = entry["messages"][-1].get("content", "")
            history_QA = entry.get("messages", [])
            response_format = entry.get("response_format")
-            new_output_len = int(entry.get("max_tokens", 12288))
+            new_output_len = int(entry.get("max_tokens", output_len if output_len else 12288))

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(prompt, None)
@@ -311,7 +311,7 @@ class EBChatDataset(BenchmarkDataset):
                    prompt_len=0,
                    history_QA=history_QA,
                    expected_output_len=new_output_len,
-                    response_format=response_format
+                    response_format=response_format,
                )
            )
            cnt += 1
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -352,7 +352,7 @@ async def benchmark(
        ignore_eos=ignore_eos,
        debug=debug,
        extra_body=extra_body,
-        response_format=response_format
+        response_format=response_format,
    )

    print("test_input:", test_input)
@@ -384,7 +384,7 @@ async def benchmark(
            logprobs=logprobs,
            ignore_eos=ignore_eos,
            extra_body=extra_body,
-            response_format=response_format
+            response_format=response_format,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
@@ -444,7 +444,7 @@ async def benchmark(
            debug=debug,
            ignore_eos=ignore_eos,
            extra_body=extra_body,
-            response_format=response_format
+            response_format=response_format,
        )
        tasks.append(asyncio.create_task(limited_request_func(request_func_input=request_func_input, pbar=pbar)))
    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -460,7 +460,7 @@ async def benchmark(
            api_url=base_url + "/stop_profile",
            output_len=test_output_len,
            logprobs=logprobs,
-            response_format=response_format
+            response_format=response_format,
        )
        profile_output = await request_func(request_func_input=profile_input)
        if profile_output.success:
--- a/benchmarks/yaml/qwen25_7b-vl-32k-bf16.yaml
+++ b/benchmarks/yaml/qwen25_7b-vl-32k-bf16.yaml
@@ -3,4 +3,4 @@ max_num_seqs: 128
 gpu_memory_utilization: 0.85
 tensor_parallel_size: 1
 limit_mm_per_prompt: '{"image": 100, "video": 100}'
-enable_mm: True
+enable_mm: True
--- a/benchmarks/yaml/request_yaml/qwen25-vl-32k.yaml
+++ b/benchmarks/yaml/request_yaml/qwen25-vl-32k.yaml
@@ -5,4 +5,4 @@ metadata:
 max_tokens: 32768
 repetition_penalty: 1.05
 frequency_penalty: 0
-presence_penalty: 0
+presence_penalty: 0