Files
FastDeploy/benchmarks/benchmark_dataset.py
Zhang Yulong 5b49142988 update (#5298)
2025-11-28 18:29:16 +08:00

818 lines
21 KiB
Python

"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_dataset.py
import base64
import io
import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Optional, Union
from PIL import Image
logger = logging.getLogger(__name__)
@dataclass
class SampleRequest:
    """
    Represents a single inference request for benchmarking.

    The first six fields have no default value, so all of them (including
    ``json_data``) must be supplied when constructing an instance.
    """

    # Sequence number of the request within one sampled batch.
    no: int
    # Prompt content: either the raw prompt text or an already structured prompt object.
    prompt: Union[str, Any]
    # Conversation history attached to the request (a list of chat messages, possibly empty).
    history_QA: Union[str, Any]
    # JSON payload associated with the request (e.g. the raw dataset entry); may be None.
    json_data: Optional[dict]
    # Length of the prompt as reported by the dataset that produced the request.
    prompt_len: int
    # Number of output tokens the request is expected to generate.
    expected_output_len: int
    # Optional response-format specification forwarded with the request.
    response_format: Optional[dict] = None
    # True when the request was synthesized randomly instead of read from a dataset file.
    random_flag: bool = False
class BenchmarkDataset(ABC):
    """
    Abstract base class for benchmark datasets.

    Concrete datasets implement :meth:`sample` (and normally override
    :meth:`load_data`); this class stores the common configuration and offers
    helpers shared by all datasets.
    """

    DEFAULT_SEED = 0
    IS_MULTIMODAL = False

    def __init__(
        self,
        dataset_path: Optional[str] = None,
        random_seed: int = DEFAULT_SEED,
        shuffle: bool = False,
        hyperparameter_path: Optional[str] = None,
    ) -> None:
        """
        Initialize the BenchmarkDataset with an optional dataset path and random
        seed.

        Args:
            dataset_path (Optional[str]): Path to the dataset. If None, it
                indicates that a default or random dataset might be used.
            random_seed (int): Seed value for reproducible shuffling or
                sampling. Defaults to DEFAULT_SEED; None is also mapped to
                DEFAULT_SEED.
            shuffle (bool): Stored on the instance; subclasses consult it to
                decide whether loaded data is shuffled.
            hyperparameter_path (Optional[str]): Stored on the instance as-is;
                this base class does not read the file.
        """
        self.dataset_path = dataset_path
        # Set the random seed, ensuring that a None value is replaced with the
        # default seed.
        self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
        self.data = None
        self.shuffle = shuffle
        self.hyperparameter_path = hyperparameter_path
        self.hyperparameters = {}

    def apply_multimodal_chat_transformation(
        self,
        prompt: str,
        mm_content: Optional[Mapping[str, Any]] = None,
    ) -> list[dict]:
        """
        Wrap a prompt into a single-turn chat message list.

        Subclasses call this helper when ``enable_multimodal_chat`` is set, so
        it has to exist on the base class.

        Args:
            prompt (str): Text of the user turn.
            mm_content (Optional[Mapping[str, Any]]): Extra content item (for
                example the dictionary returned by ``process_image``) appended
                after the text item. Omitted when None.

        Returns:
            list[dict]: ``[{"role": "user", "content": [...]}]`` where the
            content list starts with the text item.
        """
        content = [{"text": prompt, "type": "text"}]
        if mm_content is not None:
            content.append(mm_content)
        return [{"role": "user", "content": content}]

    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.

        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.

        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
        # TODO (jenniferzhao): add support for downloading data
        raise NotImplementedError("load_data must be implemented in subclasses.")

    @abstractmethod
    def sample(self, num_requests: int) -> list["SampleRequest"]:
        """
        Abstract method to generate sample requests from the dataset.

        Subclasses must override this method to implement dataset-specific logic
        for generating a list of SampleRequest objects.

        Args:
            num_requests (int): The number of sample requests to generate.

        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

    def maybe_oversample_requests(self, requests: list["SampleRequest"], num_requests: int) -> None:
        """
        Oversamples the list of requests if its size is less than the desired
        number.

        The list is extended in place with randomly chosen references to its
        own elements (no copies are made). An empty list cannot be oversampled;
        in that case a warning is logged and the list is left untouched.

        Args:
            requests (List[SampleRequest]): The current list of sampled
                requests.
            num_requests (int): The target number of requests.
        """
        missing = num_requests - len(requests)
        if missing <= 0:
            return

        if not requests:
            # random.choices() raises IndexError on an empty population.
            logger.warning("Cannot oversample an empty request list to %d samples.", num_requests)
            return

        # Re-seed the module-level RNG so the chosen duplicates are reproducible.
        random.seed(self.random_seed)
        requests.extend(random.choices(requests, k=missing))
        logger.info("Oversampled requests to reach %d total samples.", num_requests)
def is_valid_sequence(
    prompt_len: int,
    output_len: int,
    min_len: int = 4,
    max_prompt_len: int = 1024,
    max_total_len: int = 2048,
    skip_min_output_len_check: bool = False,
) -> bool:
    """
    Decide whether a prompt/output length pair passes the pruning criteria.

    A pair is rejected when the prompt is shorter than ``min_len`` or longer
    than ``max_prompt_len``, when the output is shorter than ``min_len``
    (unless ``skip_min_output_len_check`` is set), or when prompt and output
    together exceed ``max_total_len``.

    The default thresholds are copied from the original `sample_hf_requests`
    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
    from `sample_requests` in benchmark_throughput.py.
    """
    # Prompt must lie inside [min_len, max_prompt_len].
    if prompt_len < min_len or prompt_len > max_prompt_len:
        return False

    # Output has a lower bound only, and the caller may waive it.
    if not skip_min_output_len_check and output_len < min_len:
        return False

    # Finally the combined length has to fit the total budget.
    return not (prompt_len + output_len > max_total_len)
def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.

    Supports three input types:

    1. String input: - Treats the string as a URL or local file path. -
       Prepends "file://" if the string doesn't start with "http://",
       "https://" or "file://". - Returns a dictionary with the image URL.

    2. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
       containing raw image data. - Loads the bytes as a PIL.Image.Image and
       then handles it like case 3.

    3. PIL.Image.Image input: - Converts the image to RGB. - Saves the image as
       a JPEG in memory. - Encodes the JPEG data as a base64 string. - Returns
       a dictionary with the image as a base64 data URL.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(image, str):
        # Remote and explicit file URLs pass through unchanged; anything else is
        # assumed to be a local path. "https://" must be listed explicitly,
        # otherwise secure URLs would be rewritten to "file://https://...".
        image_url = image if image.startswith(("http://", "https://", "file://")) else f"file://{image}"
        return {"type": "image_url", "image_url": {"url": image_url}}

    if isinstance(image, dict) and "bytes" in image:
        image = Image.open(BytesIO(image["bytes"]))

    if isinstance(image, Image.Image):
        # JPEG has no alpha channel, so normalize the mode before encoding.
        image = image.convert("RGB")
        with io.BytesIO() as image_data:
            image.save(image_data, format="JPEG")
            image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
        }

    raise ValueError(
        f"Invalid image input {image}. Must be a PIL.Image.Image" " or str or dictionary with raw image bytes."
    )
class EBDataset(BenchmarkDataset):
    """
    Plain-prompt dataset stored as a JSON Lines file (one JSON object per line).

    ``sample`` reads the following keys from every entry: ``text``,
    ``temperature``, ``penalty_score``, ``frequency_score``,
    ``presence_score``, ``topp``, ``input_token_num`` and ``max_dec_len``.
    """

    # Sampling hyperparameters and prompt length of the most recently processed
    # entry; sample() overwrites them on every iteration.
    temperature: float
    repetition_penalty: float
    frequency_penalty: float
    presence_penalty: float
    top_p: float
    prompt_len: int

    def __init__(self, **kwargs) -> None:
        """Forward all keyword arguments to BenchmarkDataset and load the file immediately."""
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """
        Parse the JSON Lines file at ``self.dataset_path`` into ``self.data``.

        Blank lines are ignored. When ``self.shuffle`` is set the entries are
        shuffled after seeding the module-level RNG with ``self.random_seed``.

        Raises:
            ValueError: If ``dataset_path`` is None.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as f:
            # Skip blank lines (e.g. a trailing newline); json.loads would
            # otherwise fail on them. json.loads tolerates surrounding whitespace.
            self.data = [json.loads(line) for line in f if line.strip()]

        if self.shuffle:
            random.seed(self.random_seed)
            random.shuffle(self.data)

    def sample(
        self,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` SampleRequest objects from the loaded data.

        Entries are consumed in their stored order; if there are fewer entries
        than requested, the result is padded via ``maybe_oversample_requests``.

        Args:
            num_requests (int): Number of requests to return.
            lora_path (Optional[str]): Accepted for interface compatibility; not used here.
            max_loras (Optional[int]): Accepted for interface compatibility; not used here.
            output_len (Optional[int]): Accepted for interface compatibility; the
                expected output length always comes from the entry's ``max_dec_len``.
            enable_multimodal_chat (bool): When True the prompt is passed through
                ``self.apply_multimodal_chat_transformation``.

        Returns:
            list: The sampled SampleRequest objects, numbered from 1.
        """
        samples: list = []
        cnt = 1
        for entry in self.data:
            if len(samples) >= num_requests:
                break
            prompt = entry["text"]
            self.temperature = float(entry["temperature"])
            self.repetition_penalty = float(entry["penalty_score"])
            self.frequency_penalty = float(entry["frequency_score"])
            self.presence_penalty = float(entry["presence_score"])
            self.top_p = float(entry["topp"])
            self.prompt_len = int(entry["input_token_num"])
            new_output_len = int(entry["max_dec_len"])

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(prompt, None)
            samples.append(
                SampleRequest(
                    no=cnt,
                    prompt=prompt,
                    prompt_len=self.prompt_len,
                    history_QA=[],
                    # json_data is a required SampleRequest field; this dataset
                    # has no chat payload, so pass None explicitly.
                    json_data=None,
                    expected_output_len=new_output_len,
                )
            )
            cnt += 1

        self.maybe_oversample_requests(samples, num_requests)
        return samples
class EBChatDataset(BenchmarkDataset):
    """
    Chat dataset stored as a JSON Lines file (one JSON object per line).

    ``sample`` reads the keys ``messages`` (list of chat messages),
    ``max_tokens`` and ``response_format`` from every entry; the whole entry is
    forwarded as the request's ``json_data``.
    """

    # Output length used when neither the entry nor the caller provides one.
    DEFAULT_OUTPUT_LEN = 12288

    prompt_len: int

    def __init__(self, **kwargs) -> None:
        """Forward all keyword arguments to BenchmarkDataset and load the file immediately."""
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """
        Parse the JSON Lines file at ``self.dataset_path`` into ``self.data``.

        Blank lines are ignored. When ``self.shuffle`` is set the entries are
        shuffled after seeding the module-level RNG with ``self.random_seed``.

        Raises:
            ValueError: If ``dataset_path`` is None.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as f:
            # Skip blank lines (e.g. a trailing newline); json.loads would
            # otherwise fail on them. json.loads tolerates surrounding whitespace.
            self.data = [json.loads(line) for line in f if line.strip()]

        if self.shuffle:
            random.seed(self.random_seed)
            random.shuffle(self.data)

    def sample(
        self,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to ``num_requests`` SampleRequest objects from the loaded data.

        Entries are consumed in their stored order; if there are fewer entries
        than requested, the result is padded via ``maybe_oversample_requests``.

        Args:
            num_requests (int): Number of requests to return.
            lora_path (Optional[str]): Accepted for interface compatibility; not used here.
            max_loras (Optional[int]): Accepted for interface compatibility; not used here.
            output_len (Optional[int]): Fallback output length for entries
                without ``max_tokens``; DEFAULT_OUTPUT_LEN is used when it is
                None or 0.
            enable_multimodal_chat (bool): When True the prompt is passed through
                ``self.apply_multimodal_chat_transformation``.

        Returns:
            list: The sampled SampleRequest objects, numbered from 1. Their
            ``prompt_len`` is always 0 because the entries carry no token count.
        """
        fallback_output_len = output_len if output_len else self.DEFAULT_OUTPUT_LEN

        samples: list = []
        cnt = 1
        for entry in self.data:
            if len(samples) >= num_requests:
                break
            json_data = entry
            # Treat a missing, null or empty "messages" uniformly as "no history"
            # instead of failing on the [-1] lookup.
            history_QA = entry.get("messages") or []
            prompt = history_QA[-1].get("content", "") if history_QA else ""
            response_format = entry.get("response_format")
            # A null "max_tokens" falls back just like a missing one.
            max_tokens = entry.get("max_tokens")
            new_output_len = int(fallback_output_len if max_tokens is None else max_tokens)

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(prompt, None)
            samples.append(
                SampleRequest(
                    no=cnt,
                    json_data=json_data,
                    prompt=prompt,
                    prompt_len=0,
                    history_QA=history_QA,
                    expected_output_len=new_output_len,
                    response_format=response_format,
                )
            )
            cnt += 1

        self.maybe_oversample_requests(samples, num_requests)
        return samples
class RandomTextDataset(BenchmarkDataset):
    """
    Generates random English words for pure text benchmarking.

    Prompts are built by joining entries drawn uniformly from COMMON_WORDS with
    single spaces. The reported ``prompt_len`` is the number of drawn entries,
    not a token count; some entries consist of several words.
    """

    # Common English words vocabulary.
    # NOTE: the list contains repeated entries (which are therefore drawn more
    # often) and multi-word phrases. Its content and order determine the text
    # generated for a given seed, so do not reorder or deduplicate it casually.
    COMMON_WORDS = [
        "the",
        "be",
        "to",
        "of",
        "and",
        "a",
        "in",
        "that",
        "have",
        "i",
        "it",
        "for",
        "not",
        "on",
        "with",
        "he",
        "as",
        "you",
        "do",
        "at",
        "this",
        "but",
        "his",
        "by",
        "from",
        "they",
        "we",
        "say",
        "her",
        "she",
        "or",
        "an",
        "will",
        "my",
        "one",
        "all",
        "would",
        "there",
        "their",
        "what",
        "so",
        "up",
        "out",
        "if",
        "about",
        "who",
        "get",
        "which",
        "go",
        "me",
        "when",
        "make",
        "can",
        "like",
        "time",
        "no",
        "just",
        "him",
        "know",
        "take",
        "people",
        "into",
        "year",
        "your",
        "good",
        "some",
        "could",
        "them",
        "see",
        "other",
        "than",
        "then",
        "now",
        "look",
        "only",
        "come",
        "its",
        "over",
        "think",
        "also",
        "back",
        "after",
        "use",
        "two",
        "how",
        "our",
        "work",
        "first",
        "well",
        "way",
        "even",
        "new",
        "want",
        "because",
        "any",
        "these",
        "give",
        "day",
        "most",
        "us",
        "is",
        "are",
        "was",
        "were",
        "been",
        "has",
        "had",
        "did",
        "done",
        "said",
        "told",
        "asked",
        "thought",
        "went",
        "saw",
        "looked",
        "found",
        "took",
        "gave",
        "made",
        "put",
        "set",
        "got",
        "ran",
        "came",
        "walked",
        "stood",
        "sat",
        "lay",
        "felt",
        "heard",
        "saw",
        "knew",
        "thought",
        "understood",
        "believed",
        "wanted",
        "needed",
        "liked",
        "loved",
        "hated",
        "feared",
        "hoped",
        "expected",
        "planned",
        "decided",
        "agreed",
        "disagreed",
        "argued",
        "discussed",
        "explained",
        "described",
        "reported",
        "announced",
        "declared",
        "stated",
        "claimed",
        "suggested",
        "proposed",
        "recommended",
        "advised",
        "warned",
        "threatened",
        "promised",
        "offered",
        "refused",
        "denied",
        "admitted",
        "confessed",
        "apologized",
        "forgave",
        "thanked",
        "congratulated",
        "celebrated",
        "welcomed",
        "greeted",
        "introduced",
        "presented",
        "showed",
        "demonstrated",
        "proved",
        "tested",
        "examined",
        "studied",
        "learned",
        "taught",
        "trained",
        "practiced",
        "performed",
        "played",
        "worked",
        "built",
        "created",
        "designed",
        "developed",
        "improved",
        "changed",
        "fixed",
        "solved",
        "completed",
        "finished",
        "started",
        "began",
        "continued",
        "stopped",
        "ended",
        "left",
        "arrived",
        "departed",
        "traveled",
        "moved",
        "stayed",
        "waited",
        "rested",
        "slept",
        "woke",
        "ate",
        "drank",
        "cooked",
        "cleaned",
        "washed",
        "dressed",
        "undressed",
        "showered",
        "bathed",
        "brushed",
        "combed",
        "shaved",
        "cut",
        "trimmed",
        "painted",
        "drew",
        "wrote",
        "read",
        "spoke",
        "listened",
        "heard",
        "saw",
        "watched",
        "looked",
        "observed",
        "noticed",
        "recognized",
        "remembered",
        "forgot",
        "learned",
        "understood",
        "knew",
        "believed",
        "doubted",
        "wondered",
        "thought",
        "considered",
        "decided",
        "chose",
        "selected",
        "preferred",
        "liked",
        "loved",
        "hated",
        "feared",
        "worried",
        "hoped",
        "expected",
        "planned",
        "prepared",
        "organized",
        "arranged",
        "scheduled",
        "timed",
        "measured",
        "counted",
        "calculated",
        "estimated",
        "valued",
        "priced",
        "cost",
        "paid",
        "bought",
        "sold",
        "traded",
        "exchanged",
        "shared",
        "divided",
        "combined",
        "joined",
        "connected",
        "attached",
        "separated",
        "divided",
        "cut",
        "broke",
        "fixed",
        "repaired",
        "built",
        "created",
        "made",
        "produced",
        "manufactured",
        "assembled",
        "constructed",
        "designed",
        "planned",
        "developed",
        "improved",
        "enhanced",
        "changed",
        "modified",
        "adjusted",
        "adapted",
        "converted",
        "transformed",
        "turned",
        "became",
        "grew",
        "developed",
        "evolved",
        "progressed",
        "advanced",
        "moved",
        "went",
        "came",
        "arrived",
        "departed",
        "left",
        "returned",
        "went back",
        "came back",
        "arrived back",
        "departed again",
        "left again",
        "returned again",
        "went away",
        "came close",
        "moved away",
        "approached",
        "reached",
        "arrived at",
        "departed from",
        "left from",
        "returned to",
        "went to",
        "came from",
        "traveled to",
        "traveled from",
        "moved to",
        "moved from",
        "stayed at",
        "remained at",
        "waited for",
        "rested at",
        "slept at",
        "woke up at",
        "ate at",
        "drank at",
        "cooked at",
        "cleaned at",
        "washed at",
        "dressed at",
        "undressed at",
        "showered at",
        "bathed at",
        "brushed at",
        "combed at",
        "shaved at",
        "cut at",
        "trimmed at",
        "painted at",
        "drew at",
        "wrote at",
        "read at",
        "spoke at",
        "listened at",
        "heard at",
        "saw at",
        "watched at",
        "looked at",
        "observed at",
        "noticed at",
        "recognized at",
        "remembered at",
        "forgot at",
        "learned at",
        "understood at",
        "knew at",
        "believed at",
        "doubted at",
        "wondered at",
        "thought at",
        "considered at",
        "decided at",
        "chose at",
        "selected at",
        "preferred at",
        "liked at",
        "loved at",
        "hated at",
        "feared at",
        "worried at",
        "hoped at",
        "expected at",
        "planned at",
        "prepared at",
        "organized at",
        "arranged at",
        "scheduled at",
        "timed at",
        "measured at",
        "counted at",
        "calculated at",
        "estimated at",
        "valued at",
        "priced at",
        "cost at",
        "paid at",
        "bought at",
        "sold at",
        "traded at",
        "exchanged at",
        "shared at",
        "divided at",
        "combined at",
        "joined at",
        "connected at",
        "attached at",
        "separated at",
        "divided at",
        "cut at",
        "broke at",
        "fixed at",
        "repaired at",
        "built at",
        "created at",
        "made at",
        "produced at",
        "manufactured at",
    ]

    def __init__(self, **kwargs):
        """Forward all keyword arguments to BenchmarkDataset and seed the RNG."""
        super().__init__(**kwargs)
        # sample() draws from the module-level RNG; seed it here so that the
        # configured random_seed actually makes the generated prompts
        # reproducible, as the other datasets do before shuffling.
        random.seed(self.random_seed)

    def sample(
        self,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        random_input_len: Optional[int] = None,
        random_output_len: Optional[int] = None,
        random_range_ratio: Optional[float] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        """
        Generate ``num_requests`` synthetic single-turn chat requests.

        Args:
            num_requests (int): Number of requests to generate.
            lora_path (Optional[str]): Accepted for interface compatibility; not used here.
            max_loras (Optional[int]): Accepted for interface compatibility; not used here.
            random_input_len (Optional[int]): Base number of vocabulary entries
                per prompt. Required.
            random_output_len (Optional[int]): Base expected output length. When
                None, the requests carry ``expected_output_len=None``.
            random_range_ratio (Optional[float]): When positive, each length is
                drawn uniformly from
                ``[length * (1 - ratio), length * (1 + ratio)]`` (lower bound
                clamped to 1); otherwise the base length is used as-is.
            enable_multimodal_chat (bool): Accepted for interface compatibility; not used here.

        Returns:
            list: SampleRequest objects numbered from 1 with ``random_flag=True``.

        Raises:
            ValueError: If ``random_input_len`` is None.
        """
        if random_input_len is None:
            # Without this check the failure would be an opaque TypeError from range(None).
            raise ValueError("random_input_len must be provided for RandomTextDataset.")

        samples = []

        def sample_len(base_len: Optional[int], ratio: Optional[float]) -> Optional[int]:
            """Return base_len, optionally jittered by +/- ratio; None stays None."""
            if base_len is None:
                return None
            if ratio is None or ratio <= 0:
                return base_len
            lo = max(1, int(base_len * (1 - ratio)))
            # Keep the interval non-empty: for tiny base lengths the upper bound
            # can fall below the clamped lower bound, which randint rejects.
            hi = max(lo, int(base_len * (1 + ratio)))
            return random.randint(lo, hi)

        for i in range(1, num_requests + 1):
            # [length * (1 - range_ratio), length * (1 + range_ratio)]
            sampled_input_len = sample_len(random_input_len, random_range_ratio)
            sampled_output_len = sample_len(random_output_len, random_range_ratio)

            # One random.choice() call per entry keeps the RNG consumption (and
            # therefore the text produced for a given seed) unchanged.
            words = [random.choice(self.COMMON_WORDS) for _ in range(sampled_input_len)]
            prompt_text = " ".join(words)

            data = {
                "messages": [{"role": "user", "content": prompt_text}],
            }

            samples.append(
                SampleRequest(
                    no=i,
                    json_data=data,
                    prompt=prompt_text,
                    prompt_len=sampled_input_len,
                    history_QA=data["messages"],
                    expected_output_len=sampled_output_len,
                    random_flag=True,
                )
            )
        return samples