[FD CLI] Add bench cli (#4160)

* add bench cli * Update test_main.py
2025-09-26 20:41:53 +08:00 · 2025-09-22 20:37:30 +08:00
parent 5e1f13bd3b
commit 5532e8a323
11 changed files with 2890 additions and 3 deletions
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -965,7 +965,7 @@ if __name__ == "__main__":
    parser.add_argument(
        "--backend",
        type=str,
-        default="vllm",
+        default="openai-chat",
        choices=list(ASYNC_REQUEST_FUNCS.keys()),
    )
    parser.add_argument(
--- a/fastdeploy/entrypoints/cli/init.py
+++ b/fastdeploy/entrypoints/cli/init.py
@@ -0,0 +1,7 @@
+from fastdeploy.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand
+from fastdeploy.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand
+
+__all__: list[str] = [
+    "BenchmarkLatencySubcommand",
+    "BenchmarkServingSubcommand",
+]
--- a/fastdeploy/entrypoints/cli/benchmark/init.py
+++ b/fastdeploy/entrypoints/cli/benchmark/init.py
--- a/fastdeploy/entrypoints/cli/benchmark/base.py
+++ b/fastdeploy/entrypoints/cli/benchmark/base.py
@@ -0,0 +1,41 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+# This file is modified from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/benchmark/base.py
+
+import argparse
+
+from fastdeploy.entrypoints.cli.types import CLISubcommand
+
+
+class BenchmarkSubcommandBase(CLISubcommand):
+    """The base class of subcommands for vllm bench."""
+
+    help: str
+
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
+        """Add the CLI arguments to the parser."""
+        raise NotImplementedError
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        """Run the benchmark.
+
+        Args:
+            args: The arguments to the command.
+        """
+        raise NotImplementedError
--- a/fastdeploy/entrypoints/cli/benchmark/datasets.py
+++ b/fastdeploy/entrypoints/cli/benchmark/datasets.py
@@ -0,0 +1,593 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_dataset.py
+import argparse
+import base64
+import io
+import json
+import logging
+import random
+from abc import ABC, abstractmethod
+from collections.abc import Mapping
+from contextlib import suppress
+from dataclasses import dataclass
+from io import BytesIO
+from typing import Any, Optional, Union
+
+from fontTools.feaLib import ast
+from PIL import Image
+
+from fastdeploy.utils import FlexibleArgumentParser
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class SampleRequest:
+    """
+    Represents a single inference request for benchmarking.
+    """
+
+    no: int
+    prompt: Union[str, Any]
+    history_QA: Union[str, Any]
+    json_data: Optional[dict]
+    prompt_len: int
+    expected_output_len: int
+
+
+class BenchmarkDataset(ABC):
+    """BenchmarkDataset"""
+
+    DEFAULT_SEED = 0
+    IS_MULTIMODAL = False
+
+    def __init__(
+        self,
+        dataset_path: Optional[str] = None,
+        random_seed: int = DEFAULT_SEED,
+        shuffle: bool = False,
+        hyperparameter_path: Optional[str] = None,
+    ) -> None:
+        """
+        Initialize the BenchmarkDataset with an optional dataset path and random
+        seed.  Args:
+            dataset_path (Optional[str]): Path to the dataset. If None, it
+            indicates that a default or random dataset might be used.
+            random_seed (int): Seed value for reproducible shuffling or
+            sampling. Defaults to DEFAULT_SEED.
+        """
+        self.dataset_path = dataset_path
+        # Set the random seed, ensuring that a None value is replaced with the
+        # default seed.
+        self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
+        self.data = None
+        self.shuffle = shuffle
+        self.hyperparameter_path = hyperparameter_path
+        self.hyperparameters = {}
+
+    def load_data(self) -> None:
+        """
+        Load data from the dataset path into self.data.
+
+        This method must be overridden by subclasses since the method to load
+        data will vary depending on the dataset format and source.
+
+        Raises:
+            NotImplementedError: If a subclass does not implement this method.
+        """
+        # TODO (jenniferzhao): add support for downloading data
+        raise NotImplementedError("load_data must be implemented in subclasses.")
+
+    @abstractmethod
+    def sample(self, num_requests: int) -> list[SampleRequest]:
+        """
+        Abstract method to generate sample requests from the dataset.
+
+        Subclasses must override this method to implement dataset-specific logic
+        for generating a list of SampleRequest objects.
+
+        Args:
+            num_requests (int): The number of sample requests to generate.
+
+        Returns:
+            list[SampleRequest]: A list of sample requests generated from the
+            dataset.
+        """
+        raise NotImplementedError("sample must be implemented in subclasses.")
+
+    def maybe_oversample_requests(self, requests: list[SampleRequest], num_requests: int) -> None:
+        """
+        Oversamples the list of requests if its size is less than the desired
+        number.
+
+        Args:
+            requests (List[SampleRequest]): The current list of sampled
+            requests.  num_requests (int): The target number of requests.
+        """
+        if len(requests) < num_requests:
+            random.seed(self.random_seed)
+            additional = random.choices(requests, k=num_requests - len(requests))
+            requests.extend(additional)
+            logger.info("Oversampled requests to reach %d total samples.", num_requests)
+
+
+def is_valid_sequence(
+    prompt_len: int,
+    output_len: int,
+    min_len: int = 4,
+    max_prompt_len: int = 1024,
+    max_total_len: int = 2048,
+    skip_min_output_len_check: bool = False,
+) -> bool:
+    """
+    Validate a sequence based on prompt and output lengths.
+
+    Default pruning criteria are copied from the original `sample_hf_requests`
+    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
+    from `sample_requests` in benchmark_throughput.py.
+    """
+    # Check for invalid conditions
+    prompt_too_short = prompt_len < min_len
+    output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
+    prompt_too_long = prompt_len > max_prompt_len
+    combined_too_long = (prompt_len + output_len) > max_total_len
+
+    # Return True if none of the invalid conditions are met
+    return not (prompt_too_short or output_too_short or prompt_too_long or combined_too_long)
+
+
+def process_image(image: Any) -> Mapping[str, Any]:
+    """
+    Process a single image input and return a multimedia content dictionary.
+
+    Supports three input types:
+
+    1. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
+       containing raw image data.  - Loads the bytes as a PIL.Image.Image.
+
+    2. PIL.Image.Image input: - Converts the image to RGB.  - Saves the image as
+       a JPEG in memory.  - Encodes the JPEG data as a base64 string.  - Returns
+       a dictionary with the image as a base64 data URL.
+
+    3. String input: - Treats the string as a URL or local file path.  -
+       Prepends "file://" if the string doesn't start with "http://" or
+       "file://".  - Returns a dictionary with the image URL.
+
+    Raises:
+        ValueError: If the input is not a supported type.
+    """
+    if isinstance(image, dict) and "bytes" in image:
+        image = Image.open(BytesIO(image["bytes"]))
+    if isinstance(image, Image.Image):
+        image = image.convert("RGB")
+        with io.BytesIO() as image_data:
+            image.save(image_data, format="JPEG")
+            image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
+        return {
+            "type": "image_url",
+            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
+        }
+
+    if isinstance(image, str):
+        image_url = image if image.startswith(("http://", "file://")) else f"file://{image}"
+        return {"type": "image_url", "image_url": {"url": image_url}}
+
+    raise ValueError(
+        f"Invalid image input {image}. Must be a PIL.Image.Image" " or str or dictionary with raw image bytes."
+    )
+
+
+class EBDataset(BenchmarkDataset):
+    """
+    Implements the ShareGPT dataset.  Loads data from a JSON file and generates
+    sample requests based on conversation turns.
+    """
+
+    temperature: float
+    repetition_penalty: float
+    frequency_penalty: float
+    presence_penalty: float
+    top_p: float
+    prompt_len: int
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        if self.dataset_path is None:
+            raise ValueError("dataset_path must be provided for loading data.")
+
+        with open(self.dataset_path, encoding="utf-8") as f:
+            self.data = [json.loads(i.strip()) for i in f.readlines()]
+
+        if self.shuffle:
+            random.seed(self.random_seed)
+            random.shuffle(self.data)
+
+    def sample(
+        self,
+        num_requests: int,
+        lora_path: Optional[str] = None,
+        max_loras: Optional[int] = None,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
+        samples: list = []
+        cnt = 1
+        for entry in self.data:
+            if len(samples) >= num_requests:
+                break
+            prompt = entry["text"]
+            self.temperature = float(entry["temperature"])
+            self.repetition_penalty = float(entry["penalty_score"])
+            self.frequency_penalty = float(entry["frequency_score"])
+            self.presence_penalty = float(entry["presence_score"])
+            self.top_p = float(entry["topp"])
+            self.prompt_len = int(entry["input_token_num"])
+            new_output_len = int(entry["max_dec_len"])
+
+            if enable_multimodal_chat:
+                prompt = self.apply_multimodal_chat_transformation(prompt, None)
+            samples.append(
+                SampleRequest(
+                    no=cnt,
+                    prompt=prompt,
+                    prompt_len=self.prompt_len,
+                    history_QA=[],
+                    expected_output_len=new_output_len,
+                )
+            )
+            cnt += 1
+
+        self.maybe_oversample_requests(samples, num_requests)
+        return samples
+
+
+class EBChatDataset(BenchmarkDataset):
+    """
+    Implements the ShareGPT dataset.  Loads data from a JSON file and generates
+    sample requests based on conversation turns.
+    """
+
+    prompt_len: int
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        if self.dataset_path is None:
+            raise ValueError("dataset_path must be provided for loading data.")
+
+        with open(self.dataset_path, encoding="utf-8") as f:
+            self.data = [json.loads(i.strip()) for i in f.readlines()]
+
+        if self.shuffle:
+            random.seed(self.random_seed)
+            random.shuffle(self.data)
+
+    def sample(
+        self,
+        num_requests: int,
+        lora_path: Optional[str] = None,
+        max_loras: Optional[int] = None,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
+        samples: list = []
+        cnt = 1
+        for entry in self.data:
+            if len(samples) >= num_requests:
+                break
+            json_data = entry
+            prompt = entry["messages"][-1].get("content", "")
+            history_QA = entry.get("messages", [])
+            new_output_len = int(entry.get("max_tokens", 12288))
+
+            if enable_multimodal_chat:
+                prompt = self.apply_multimodal_chat_transformation(prompt, None)
+            samples.append(
+                SampleRequest(
+                    no=cnt,
+                    json_data=json_data,
+                    prompt=prompt,
+                    prompt_len=0,
+                    history_QA=history_QA,
+                    expected_output_len=new_output_len,
+                )
+            )
+            cnt += 1
+
+        self.maybe_oversample_requests(samples, num_requests)
+        return samples
+
+
+class _ValidateDatasetArgs(argparse.Action):
+    """Argparse action to validate dataset name and path compatibility."""
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, values)
+
+        # Get current values of both dataset_name and dataset_path
+        dataset_name = getattr(namespace, "dataset_name", "random")
+        dataset_path = getattr(namespace, "dataset_path", None)
+
+        # Validate the combination
+        if dataset_name == "random" and dataset_path is not None:
+            parser.error(
+                "Cannot use 'random' dataset with --dataset-path. "
+                "Please specify the appropriate --dataset-name (e.g., "
+                "'sharegpt', 'custom', 'sonnet') for your dataset file: "
+                f"{dataset_path}"
+            )
+
+
+def get_samples(args):
+    """Get the sample requests from the specified dataset."""
+    if not hasattr(args, "request_id_prefix"):
+        args.request_id_prefix = ""
+
+    # For datasets that follow a similar structure, use a mapping.
+    dataset_mapping = {
+        "EB": lambda: EBDataset(random_seed=args.seed, dataset_path=args.dataset_path, shuffle=args.shuffle).sample(
+            num_requests=args.num_prompts,
+            output_len=args.sharegpt_output_len,
+        ),
+        "EBChat": lambda: EBChatDataset(
+            random_seed=args.seed, dataset_path=args.dataset_path, shuffle=args.shuffle
+        ).sample(
+            num_requests=args.num_prompts,
+            output_len=args.sharegpt_output_len,
+        ),
+    }
+
+    try:
+        input_requests = dataset_mapping[args.dataset_name]()
+    except KeyError as err:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
+
+    return input_requests
+
+
+def add_dataset_parser(parser: FlexibleArgumentParser):
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process.",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=[
+            "sharegpt",
+            "burstgpt",
+            "sonnet",
+            "random",
+            "hf",
+            "EB",
+            "EBChat",
+        ],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
+    parser.add_argument(
+        "--dataset-path",
+        type=str,
+        default=None,
+        action=_ValidateDatasetArgs,
+        help="Path to the sharegpt/sonnet dataset. " "Or the huggingface dataset ID if using HF dataset.",
+    )
+    parser.add_argument(
+        "--no-oversample",
+        action="store_true",
+        help="Do not oversample if the dataset has " "fewer samples than num-prompts.",
+    )
+
+    # group for dataset specific arguments
+    custom_group = parser.add_argument_group("custom dataset options")
+    custom_group.add_argument(
+        "--custom-output-len",
+        type=int,
+        default=256,
+        help="Number of output tokens per request, used only for custom dataset.",
+    )
+    custom_group.add_argument(
+        "--custom-skip-chat-template",
+        action="store_true",
+        help="Skip applying chat template to prompt, used only for custom dataset.",
+    )
+
+    spec_bench_group = parser.add_argument_group("spec bench dataset options")
+    spec_bench_group.add_argument(
+        "--spec-bench-output-len",
+        type=int,
+        default=256,
+        help="Num of output tokens per request, used only for spec bench dataset.",
+    )
+    spec_bench_group.add_argument(
+        "--spec-bench-category",
+        type=str,
+        default=None,
+        help="Category for spec bench dataset. If None, use all categories.",
+    )
+
+    sonnet_group = parser.add_argument_group("sonnet dataset options")
+    sonnet_group.add_argument(
+        "--sonnet-input-len",
+        type=int,
+        default=550,
+        help="Number of input tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-output-len",
+        type=int,
+        default=150,
+        help="Number of output tokens per request, used only for sonnet dataset.",
+    )
+    sonnet_group.add_argument(
+        "--sonnet-prefix-len",
+        type=int,
+        default=200,
+        help="Number of prefix tokens per request, used only for sonnet dataset.",
+    )
+
+    sharegpt_group = parser.add_argument_group("sharegpt dataset options")
+    sharegpt_group.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length " "from the ShareGPT dataset.",
+    )
+
+    blazedit_group = parser.add_argument_group("blazedit dataset options")
+    blazedit_group.add_argument(
+        "--blazedit-min-distance",
+        type=float,
+        default=0.0,
+        help="Minimum distance for blazedit dataset. Min: 0, Max: 1.0",
+    )
+    blazedit_group.add_argument(
+        "--blazedit-max-distance",
+        type=float,
+        default=1.0,
+        help="Maximum distance for blazedit dataset. Min: 0, Max: 1.0",
+    )
+
+    random_group = parser.add_argument_group("random dataset options")
+    random_group.add_argument(
+        "--random-input-len",
+        type=int,
+        default=1024,
+        help="Number of input tokens per request, used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-output-len",
+        type=int,
+        default=128,
+        help="Number of output tokens per request, used only for random sampling.",
+    )
+    random_group.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=0.0,
+        help="Range ratio for sampling input/output length, "
+        "used only for random sampling. Must be in the range [0, 1) to define "
+        "a symmetric sampling range"
+        "[length * (1 - range_ratio), length * (1 + range_ratio)].",
+    )
+    random_group.add_argument(
+        "--random-prefix-len",
+        type=int,
+        default=0,
+        help=(
+            "Number of fixed prefix tokens before the random context "
+            "in a request. "
+            "The total input length is the sum of `random-prefix-len` and "
+            "a random "
+            "context length sampled from [input_len * (1 - range_ratio), "
+            "input_len * (1 + range_ratio)]."
+        ),
+    )
+    random_group.add_argument(
+        "--random-batch-size",
+        type=int,
+        default=1,
+        help=("Batch size for random sampling. " "Only used for embeddings benchmark."),
+    )
+
+    def _parse_mm_bucket_config(v: object) -> dict[tuple[int, int, int], float]:
+        # If already a dict (e.g., programmatic call), normalize keys
+        def normalize(d: dict) -> dict[tuple[int, int, int], float]:
+            out: dict[tuple[int, int, int], float] = {}
+            for k, val in d.items():
+                key = k
+                if isinstance(key, str):
+                    with suppress(Exception):
+                        key = ast.literal_eval(key)
+                if not (isinstance(key, tuple) and len(key) == 3 and all(isinstance(x, int) for x in key)):
+                    raise ValueError(f"Invalid bucket key {k!r}. Expected tuple (H, W, T).")
+                out[(int(key[0]), int(key[1]), int(key[2]))] = float(val)
+            return out
+
+        if isinstance(v, dict):
+            return normalize(v)
+        if isinstance(v, str):
+            # Python literal (supports tuple keys)
+            parsed = ast.literal_eval(v)
+            if not isinstance(parsed, dict):
+                raise ValueError("Bucket config must parse to a dict.")
+            return normalize(parsed)
+        raise ValueError("Unsupported value for --random-mm-bucket-config.")
+
+    hf_group = parser.add_argument_group("hf dataset options")
+    hf_group.add_argument("--hf-subset", type=str, default=None, help="Subset of the HF dataset.")
+    hf_group.add_argument("--hf-split", type=str, default=None, help="Split of the HF dataset.")
+    hf_group.add_argument(
+        "--hf-name",
+        type=str,
+        default=None,
+        help=(
+            "Name of the dataset on HuggingFace "
+            "(e.g., 'lmarena-ai/VisionArena-Chat'). "
+            "Specify this if your dataset-path is a local path."
+        ),
+    )
+    hf_group.add_argument(
+        "--hf-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output lengths " "from the sampled HF dataset.",
+    )
+
+    prefix_repetition_group = parser.add_argument_group("prefix repetition dataset options")
+    prefix_repetition_group.add_argument(
+        "--prefix-repetition-prefix-len",
+        type=int,
+        default=256,
+        help="Number of prefix tokens per request, used only for prefix " "repetition dataset.",
+    )
+    prefix_repetition_group.add_argument(
+        "--prefix-repetition-suffix-len",
+        type=int,
+        default=256,
+        help="Number of suffix tokens per request, used only for prefix "
+        "repetition dataset. Total input length is prefix_len + suffix_len.",
+    )
+    prefix_repetition_group.add_argument(
+        "--prefix-repetition-num-prefixes",
+        type=int,
+        default=10,
+        help="Number of prefixes to generate, used only for prefix repetition "
+        "dataset. Prompts per prefix is num_requests // num_prefixes.",
+    )
+    prefix_repetition_group.add_argument(
+        "--prefix-repetition-output-len",
+        type=int,
+        default=128,
+        help="Number of output tokens per request, used only for prefix " "repetition dataset.",
+    )
--- a/fastdeploy/entrypoints/cli/benchmark/endpoint_request_func.py
+++ b/fastdeploy/entrypoints/cli/benchmark/endpoint_request_func.py
@@ -0,0 +1,702 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/backend_request_func.py
+
+
+import io
+import json
+import os
+import sys
+import time
+import traceback
+from dataclasses import dataclass, field
+from typing import Optional
+
+import aiohttp
+from tqdm.asyncio import tqdm
+
+AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+
+
+@dataclass
+class RequestFuncInput:
+    """Input for requesting LLMs via API"""
+
+    no: int
+    prompt: str
+    history_QA: Optional[dict]
+    hyper_parameters: dict
+    api_url: str
+    prompt_len: int
+    output_len: int
+    model: str
+    model_name: Optional[str] = None
+    logprobs: Optional[int] = None
+    extra_body: Optional[dict] = None
+    multi_modal_content: Optional[dict] = None
+    ignore_eos: bool = False
+    language: Optional[str] = None
+    debug: bool = False
+
+
+@dataclass
+class RequestFuncOutput:
+    """Output for requesting LLMs via API"""
+
+    no: int = 0
+    generated_text: str = ""
+    reasoning_content: str = ""
+    success: bool = False
+    latency: float = 0.0
+    output_tokens: int = 0
+    ttft: float = 0.0  # Time to first token
+    arrival_time: list = field(default_factory=list)  # arrival_time
+    itl: list = field(default_factory=list)  # list of inter-token latencies
+    tpot: float = 0.0  # avg next-token latencies
+    prompt_len: int = 0
+    prompt_tokens: int = 0  # 推理侧返回输入token数
+    error: str = ""
+
+
+async def async_request_eb_openai_chat_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Request an LLM using EB OpenAI"""
+    api_url = request_func_input.api_url
+    assert api_url.endswith(("completions", "profile")), "OpenAI Chat Completions API URL must end with 'completions'."
+
+    async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        if request_func_input.multi_modal_content:
+            content.append(request_func_input.multi_modal_content)
+        payload = {
+            "model": request_func_input.model,
+            "messages": request_func_input.history_QA,
+            "stream": True,
+            "stream_options": {
+                "include_usage": True,
+                "continuous_usage_stats": True,
+            },
+        }
+        # 超参由yaml传入
+        payload.update(request_func_input.hyper_parameters)
+
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
+
+        if request_func_input.debug:
+            print(f"payload:{json.dumps(payload, ensure_ascii=False)}")
+
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = 0
+        output.no = request_func_input.no
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload, headers=headers) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+                        if chunk != "[DONE]":
+                            # print("####chunk:", chunk, type(chunk))
+                            timestamp = time.perf_counter()
+                            data = json.loads(chunk)
+
+                            if choices := data.get("choices"):
+                                content = choices[0]["delta"].get("content")
+                                reason_content = choices[0]["delta"].get("reasoning_content")
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = timestamp - st
+                                    output.ttft = ttft
+                                    # cached_tokens
+                                    output.prompt_len = (
+                                        data["usage"].get("prompt_tokens_details", {}).get("cached_tokens", 0)
+                                    )
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                output.generated_text += content or ""
+                                output.reasoning_content += reason_content or ""
+                                output.arrival_time.append(choices[0].get("arrival_time", timestamp))
+                            elif usage := data.get("usage", {}):
+                                output.output_tokens = usage.get("completion_tokens", 0)
+                                output.prompt_tokens = usage.get("prompt_tokens", 0)
+
+                            most_recent_timestamp = timestamp
+
+                    # output.generated_text = generated_text
+                    if output.generated_text.strip() == "":
+                        output.success = False
+                        output.error = "No generated text found!"
+                    else:
+                        output.success = True
+                    output.latency = most_recent_timestamp - st
+                else:
+                    error_text = await response.text()
+                    print(
+                        "####error response:",
+                        error_text,
+                        "####payload:",
+                        payload,
+                    )
+                    output.error = error_text or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        # 保存失败请求结果
+        if not output.success:
+            with open("error_output.txt", "a") as f:
+                f.write(str(output) + "\n")
+    if pbar:
+        pbar.update(1)
+    if request_func_input.debug:
+        print("#####final_output:", output)
+    return output
+
+
+async def async_request_eb_openai_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Request an LLM using EB OpenAI"""
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        ("completions", "profile")
+    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
+
+    async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "model": request_func_input.model,
+            "prompt": request_func_input.prompt,
+            "stream": True,
+            "stream_options": {
+                "include_usage": True,
+                "continuous_usage_stats": True,
+            },
+        }
+        # 超参由yaml传入
+        payload.update(request_func_input.hyper_parameters)
+
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
+
+        if request_func_input.debug:
+            print("payload:", json.dumps(payload, ensure_ascii=False))
+
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+            "Content-Type": "application/json",
+        }
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+        output.no = request_func_input.no
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload, headers=headers) as response:
+                if response.status == 200:
+                    first_chunk_received = False
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+                        if chunk != "[DONE]":
+                            # print("####chunk:", chunk, chunk.usage)
+                            timestamp = time.perf_counter()
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if choices := data.get("choices"):
+                                # Note that text could be empty here
+                                # e.g. for special tokens
+                                text = choices[0].get("text")
+
+                                # First token
+                                if not first_chunk_received:
+                                    first_chunk_received = True
+                                    ttft = timestamp - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                generated_text += text or ""
+
+                                most_recent_timestamp = timestamp
+                                output.arrival_time.append(choices[0].get("arrival_time", timestamp))
+                            elif usage := data.get("usage"):
+                                output.prompt_tokens = usage.get("prompt_tokens")
+                                output.output_tokens = usage.get("completion_tokens")
+                    if first_chunk_received:
+                        output.success = True
+                    else:
+                        output.success = False
+                        output.error = (
+                            "Never received a valid chunk to calculate TTFT." "This response will be marked as failed!"
+                        )
+
+                    output.generated_text = generated_text
+                    output.latency = most_recent_timestamp - st
+
+                    if output.generated_text == "":
+                        output.success = False
+                        output.error = "No generated text found!"
+                    else:
+                        output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if request_func_input.debug:
+            print(f"final_output:{output}")
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_tgi(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Request an LLM using the TGI API"""
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
+        params = {
+            "max_new_tokens": request_func_input.output_len,
+            "do_sample": True,
+            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
+            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
+            "truncate": request_func_input.prompt_len,
+            "ignore_eos_token": request_func_input.ignore_eos,
+        }
+        payload = {
+            "inputs": request_func_input.prompt,
+            "parameters": params,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+        if request_func_input.ignore_eos:
+            output.output_tokens = request_func_input.output_len
+        else:
+            output.output_tokens = None
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")
+
+                        # NOTE: Sometimes TGI returns a ping response without
+                        # any data, we should skip it.
+                        if chunk_bytes.startswith(":"):
+                            continue
+                        chunk = chunk_bytes.removeprefix("data:")
+
+                        data = json.loads(chunk)
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+                        output.arrival_time.append(data["arrival_time"])
+
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+                    output.generated_text = data["generated_text"]
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_trt_llm(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Request an LLM using TRT's llm_server"""
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "accumulate_tokens": True,
+            "text_input": request_func_input.prompt,
+            "temperature": 0.0,
+            "top_p": 1.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+        }
+        if request_func_input.ignore_eos:
+            payload["min_length"] = request_func_input.output_len
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
+
+                        data = json.loads(chunk)
+                        output.generated_text += data["text_output"]
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = timestamp - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_deepspeed_mii(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Request an LLM using Deepspeed MII"""
+    async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
+
+        payload = {
+            "prompt": request_func_input.prompt,
+            "max_tokens": request_func_input.output_len,
+            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
+            "top_p": 1.0,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
+        # will use 0 as placeholder.
+        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
+        output.ttft = 0
+
+        st = time.perf_counter()
+        try:
+            async with session.post(url=request_func_input.api_url, json=payload) as response:
+                if response.status == 200:
+                    parsed_resp = await response.json()
+                    output.latency = time.perf_counter() - st
+                    if "choices" in parsed_resp:
+                        output.generated_text = parsed_resp["choices"][0]["text"]
+                    elif "text" in parsed_resp:
+                        output.generated_text = parsed_resp["text"][0]
+                    else:
+                        output.error = "Unexpected response format: " "neither 'choices' nor 'text' found"
+                        output.success = False
+                    output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+async def async_request_openai_completions(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Request an LLM using OpenAI"""
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        ("completions", "profile")
+    ), "OpenAI Completions API URL must end with 'completions' or 'profile'."
+
+    async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model),
+            "prompt": request_func_input.prompt,
+            # "temperature": 0.0,
+            "max_tokens": request_func_input.output_len,
+            "logprobs": request_func_input.logprobs,
+            "stream": True,
+            # "stream_options": {
+            #    "include_usage": True,
+            # },
+        }
+        if request_func_input.ignore_eos:
+            payload["ignore_eos"] = request_func_input.ignore_eos
+
+        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload, headers=headers) as response:
+                if response.status == 200:
+                    first_chunk_received = False
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+                        if chunk != "[DONE]":
+                            # print("####chunk:", chunk, type(chunk))
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if choices := data.get("choices"):
+                                # Note that text could be empty here
+                                # e.g. for special tokens
+                                text = choices[0].get("text")
+                                timestamp = time.perf_counter()
+                                # First token
+                                if not first_chunk_received:
+                                    first_chunk_received = True
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text += text or ""
+                            elif usage := data.get("usage"):
+                                output.output_tokens = usage.get("completion_tokens")
+                    if first_chunk_received:
+                        output.success = True
+                    else:
+                        output.success = False
+                        output.error = (
+                            "Never received a valid chunk to calculate TTFT." "This response will be marked as failed!"
+                        )
+                    output.generated_text = generated_text
+                    output.latency = most_recent_timestamp - st
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
+async def async_request_openai_audio(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    """Request an LLM using OpenAI"""
+    # Lazy import without PlaceholderModule to avoid vllm dep.
+    import soundfile
+
+    api_url = request_func_input.api_url
+    assert api_url.endswith(
+        ("transcriptions", "translations")
+    ), "OpenAI Chat Completions API URL must end with 'transcriptions' "
+    "or `translations`."
+
+    async with aiohttp.ClientSession(trust_env=True, timeout=AIOHTTP_TIMEOUT) as session:
+        content = [{"type": "text", "text": request_func_input.prompt}]
+        payload = {
+            "model": (request_func_input.model_name if request_func_input.model_name else request_func_input.model),
+            "temperature": 0.0,
+            "max_completion_tokens": request_func_input.output_len,
+            "stream": True,
+            "language": "en",
+            # Flattened due to multipart/form-data
+            "stream_include_usage": True,
+            "stream_continuous_usage_stats": True,
+        }
+        if request_func_input.extra_body:
+            payload.update(request_func_input.extra_body)
+        headers = {
+            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
+        }
+
+        # Send audio file
+        def to_bytes(y, sr):
+            buffer = io.BytesIO()
+            soundfile.write(buffer, y, sr, format="WAV")
+            buffer.seek(0)
+            return buffer
+
+        with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
+            form = aiohttp.FormData()
+            form.add_field("file", f, content_type="audio/wav")
+            for key, value in payload.items():
+                form.add_field(key, str(value))
+
+            output = RequestFuncOutput()
+            output.prompt_len = request_func_input.prompt_len
+
+            generated_text = ""
+            ttft = 0.0
+            st = time.perf_counter()
+            most_recent_timestamp = st
+            try:
+                async with session.post(url=api_url, data=form, headers=headers) as response:
+                    if response.status == 200:
+                        async for chunk_bytes in response.content:
+                            chunk_bytes = chunk_bytes.strip()
+                            if not chunk_bytes:
+                                continue
+
+                            chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
+                            if chunk != "[DONE]":
+                                timestamp = time.perf_counter()
+                                data = json.loads(chunk)
+
+                                if choices := data.get("choices"):
+                                    content = choices[0]["delta"].get("content")
+                                    # First token
+                                    if ttft == 0.0:
+                                        ttft = timestamp - st
+                                        output.ttft = ttft
+
+                                    # Decoding phase
+                                    else:
+                                        output.itl.append(timestamp - most_recent_timestamp)
+
+                                    generated_text += content or ""
+                                elif usage := data.get("usage"):
+                                    output.output_tokens = usage.get("completion_tokens")
+
+                                most_recent_timestamp = timestamp
+
+                        output.generated_text = generated_text
+                        output.success = True
+                        output.latency = most_recent_timestamp - st
+                    else:
+                        output.error = response.reason or ""
+                        output.success = False
+            except Exception:
+                output.success = False
+                exc_info = sys.exc_info()
+                output.error = "".join(traceback.format_exception(*exc_info))
+
+        if pbar:
+            pbar.update(1)
+        return output
+
+
+ASYNC_REQUEST_FUNCS = {
+    "tgi": async_request_tgi,
+    "vllm": async_request_openai_completions,
+    "lmdeploy": async_request_openai_completions,
+    "deepspeed-mii": async_request_deepspeed_mii,
+    "openai": async_request_eb_openai_completions,
+    "openai-chat": async_request_eb_openai_chat_completions,
+    "openai-audio": async_request_openai_audio,
+    "tensorrt-llm": async_request_trt_llm,
+    "scalellm": async_request_openai_completions,
+    "sglang": async_request_openai_completions,
+}
+
+OPENAI_COMPATIBLE_BACKENDS = [
+    k
+    for k, v in ASYNC_REQUEST_FUNCS.items()
+    if v
+    in (
+        async_request_openai_completions,
+        async_request_eb_openai_chat_completions,
+    )
+]
--- a/fastdeploy/entrypoints/cli/benchmark/latency.py
+++ b/fastdeploy/entrypoints/cli/benchmark/latency.py
@@ -0,0 +1,153 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+# This file is modified from https://github.com/vllm-project/vllm/blob/main/vllm/benchmarks/latency.py
+
+import argparse
+import dataclasses
+import json
+import time
+
+import numpy as np
+from tqdm import tqdm
+
+import fastdeploy.envs as envs
+from fastdeploy.engine.args_utils import EngineArgs
+from fastdeploy.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+
+
+def add_cli_args(parser: argparse.ArgumentParser):
+    parser.add_argument("--input-len", type=int, default=32)
+    parser.add_argument("--output-len", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument(
+        "--n",
+        type=int,
+        default=1,
+        help="Number of generated sequences per prompt.",
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument(
+        "--num-iters-warmup",
+        type=int,
+        default=10,
+        help="Number of iterations to run for warmup.",
+    )
+    parser.add_argument("--num-iters", type=int, default=30, help="Number of iterations to run.")
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="profile the generation process of a single batch",
+    )
+    parser.add_argument(
+        "--output-json",
+        type=str,
+        default=None,
+        help="Path to save the latency results in JSON format.",
+    )
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=("Do not detokenize responses (i.e. do not include " "detokenization time in the latency measurement)"),
+    )
+
+    parser = EngineArgs.add_cli_args(parser)
+    # V1 enables prefix caching by default which skews the latency
+    # numbers. We need to disable prefix caching by default.
+    parser.set_defaults(enable_prefix_caching=False)
+
+
+def main(args: argparse.Namespace):
+    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
+        raise OSError(
+            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
+            "Please set it to a valid path to use torch profiler."
+        )
+    engine_args = EngineArgs.from_cli_args(args)
+
+    # Lazy import to avoid importing LLM when the bench command is not selected.
+    from fastdeploy import LLM, SamplingParams
+
+    # NOTE(woosuk): If the request cannot be processed in a single batch,
+    # the engine will automatically process the request in multiple batches.
+    llm = LLM(**dataclasses.asdict(engine_args))
+    assert llm.llm_engine.cfg.max_model_len >= (args.input_len + args.output_len), (
+        "Please ensure that max_model_len is greater than" " the sum of input_len and output_len."
+    )
+
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=1.0,
+        top_p=1.0,
+        max_tokens=args.output_len,
+    )
+    dummy_prompt_token_ids = np.random.randint(10000, size=(args.batch_size, args.input_len))
+    dummy_prompts = [{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()]
+
+    def llm_generate():
+        llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False, stream=True)
+
+    def run_to_completion():
+        start_time = time.perf_counter()
+        llm_generate()
+        end_time = time.perf_counter()
+        latency = end_time - start_time
+        return latency
+
+    print("Warming up...")
+    for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+        run_to_completion()
+
+    if args.profile:
+        print("Profiling...")
+        run_to_completion()
+        return
+
+    # Benchmark.
+    latencies = []
+    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
+        latencies.append(run_to_completion())
+    latencies = np.array(latencies)
+    percentages = [10, 25, 50, 75, 90, 99]
+    percentiles = np.percentile(latencies, percentages)
+    print(f"Avg latency: {np.mean(latencies)} seconds")
+    for percentage, percentile in zip(percentages, percentiles):
+        print(f"{percentage}% percentile latency: {percentile} seconds")
+
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "avg_latency": np.mean(latencies),
+            "latencies": latencies.tolist(),
+            "percentiles": dict(zip(percentages, percentiles.tolist())),
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
+
+class BenchmarkLatencySubcommand(BenchmarkSubcommandBase):
+    """The `latency` subcommand for fastdeploy bench."""
+
+    name = "latency"
+    help = "Benchmark the latency of a single batch of requests."
+
+    @classmethod
+    def add_cli_args(cls, parser: argparse.ArgumentParser) -> None:
+        add_cli_args(parser)
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        main(args)
--- a/fastdeploy/entrypoints/cli/benchmark/main.py
+++ b/fastdeploy/entrypoints/cli/benchmark/main.py
@@ -0,0 +1,160 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+# This file is modified from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/benchmark/main.py
+
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+import typing
+
+from fastdeploy.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase
+from fastdeploy.entrypoints.cli.types import CLISubcommand
+
+if typing.TYPE_CHECKING:
+    from fastdeploy.utils import FlexibleArgumentParser
+
+
+FD_SUBCMD_PARSER_EPILOG = (
+    "Tip: Use `fastdeploy [serve|run-batch|bench <bench_type>] "
+    "--help=<keyword>` to explore arguments from help.\n"
+    "   - To view a argument group:     --help=ModelConfig\n"
+    "   - To view a single argument:    --help=max-num-seqs\n"
+    "   - To search by keyword:         --help=max\n"
+    "   - To list all groups:           --help=listgroup\n"
+    "   - To view help with pager:      --help=page"
+)
+
+
+def _output_with_pager(text: str):
+    """Output text using scrolling view if available and appropriate."""
+
+    pagers = ["less -R", "more"]
+    for pager_cmd in pagers:
+        try:
+            proc = subprocess.Popen(pager_cmd.split(), stdin=subprocess.PIPE, text=True)
+            proc.communicate(input=text)
+            return
+        except (subprocess.SubprocessError, OSError, FileNotFoundError):
+            continue
+
+    # No pager worked, fall back to normal print
+    print(text)
+
+
+def show_filtered_argument_or_group_from_help(parser: argparse.ArgumentParser, subcommand_name: list[str]):
+
+    # Only handle --help=<keyword> for the current subcommand.
+    # Since subparser_init() runs for all subcommands during CLI setup,
+    # we skip processing if the subcommand name is not in sys.argv.
+    # sys.argv[0] is the program name. The subcommand follows.
+    # e.g., for `vllm bench latency`,
+    # sys.argv is `['vllm', 'bench', 'latency', ...]`
+    # and subcommand_name is "bench latency".
+    if len(sys.argv) <= len(subcommand_name) or sys.argv[1 : 1 + len(subcommand_name)] != subcommand_name:
+        return
+
+    for arg in sys.argv:
+        if arg.startswith("--help="):
+            search_keyword = arg.split("=", 1)[1]
+
+            # Enable paged view for full help
+            if search_keyword == "page":
+                help_text = parser.format_help()
+                _output_with_pager(help_text)
+                sys.exit(0)
+
+            # List available groups
+            if search_keyword == "listgroup":
+                output_lines = ["\nAvailable argument groups:"]
+                for group in parser._action_groups:
+                    if group.title and not group.title.startswith("positional arguments"):
+                        output_lines.append(f"  - {group.title}")
+                        if group.description:
+                            output_lines.append("    " + group.description.strip())
+                        output_lines.append("")
+                _output_with_pager("\n".join(output_lines))
+                sys.exit(0)
+
+            # For group search
+            formatter = parser._get_formatter()
+            for group in parser._action_groups:
+                if group.title and group.title.lower() == search_keyword.lower():
+                    formatter.start_section(group.title)
+                    formatter.add_text(group.description)
+                    formatter.add_arguments(group._group_actions)
+                    formatter.end_section()
+                    _output_with_pager(formatter.format_help())
+                    sys.exit(0)
+
+            # For single arg
+            matched_actions = []
+
+            for group in parser._action_groups:
+                for action in group._group_actions:
+                    # search option name
+                    if any(search_keyword.lower() in opt.lower() for opt in action.option_strings):
+                        matched_actions.append(action)
+
+            if matched_actions:
+                header = f"\nParameters matching '{search_keyword}':\n"
+                formatter = parser._get_formatter()
+                formatter.add_arguments(matched_actions)
+                _output_with_pager(header + formatter.format_help())
+                sys.exit(0)
+
+            print(f"\nNo group or parameter matching '{search_keyword}'")
+            print("Tip: use `--help=listgroup` to view all groups.")
+            sys.exit(1)
+
+
+class BenchmarkSubcommand(CLISubcommand):
+    """The `bench` subcommand for the vLLM CLI."""
+
+    name = "bench"
+    help = "fastdeploy bench subcommand."
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        args.dispatch_function(args)
+
+    def validate(self, args: argparse.Namespace) -> None:
+        pass
+
+    def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        bench_parser = subparsers.add_parser(
+            self.name, help=self.help, description=self.help, usage="fastdeploy bench <bench_type> [options]"
+        )
+        bench_subparsers = bench_parser.add_subparsers(required=True, dest="bench_type")
+
+        for cmd_cls in BenchmarkSubcommandBase.__subclasses__():
+            cmd_subparser = bench_subparsers.add_parser(
+                cmd_cls.name,
+                help=cmd_cls.help,
+                description=cmd_cls.help,
+                usage=f"fastdeploy bench {cmd_cls.name} [options]",
+            )
+            cmd_subparser.set_defaults(dispatch_function=cmd_cls.cmd)
+            cmd_cls.add_cli_args(cmd_subparser)
+            show_filtered_argument_or_group_from_help(cmd_subparser, ["bench", cmd_cls.name])
+            cmd_subparser.epilog = FD_SUBCMD_PARSER_EPILOG
+        return bench_parser
+
+
+def cmd_init() -> list[CLISubcommand]:
+    return [BenchmarkSubcommand()]
--- a/fastdeploy/entrypoints/cli/benchmark/serve.py
+++ b/fastdeploy/entrypoints/cli/benchmark/serve.py
--- a/fastdeploy/entrypoints/cli/main.py
+++ b/fastdeploy/entrypoints/cli/main.py
@@ -21,11 +21,13 @@ import importlib.metadata


 def main():
+    import fastdeploy.entrypoints.cli.benchmark.main
    import fastdeploy.entrypoints.cli.openai
    from fastdeploy.utils import FlexibleArgumentParser

    CMD_MODULES = [
        fastdeploy.entrypoints.cli.openai,
+        fastdeploy.entrypoints.cli.benchmark.main,
    ]

    parser = FlexibleArgumentParser(description="FastDeploy CLI")
@@ -33,7 +35,7 @@ def main():
        "-v",
        "--version",
        action="version",
-        version=importlib.metadata.version("fastdeploy"),
+        version=importlib.metadata.version("fastdeploy-gpu"),
    )
    subparsers = parser.add_subparsers(required=False, dest="subparser")
    cmds = {}
--- a/tests/entrypoints/cli/test_main.py
+++ b/tests/entrypoints/cli/test_main.py
@@ -18,7 +18,7 @@ class TestCliMain(unittest.TestCase):
        cli_main()

        # Verify version check
-        mock_metadata.version.assert_called_once_with("fastdeploy")
+        mock_metadata.version.assert_called_once_with("fastdeploy-gpu")
        mock_args.dispatch_function.assert_called_once()