# FastDeploy/benchmarks/benchmark_dataset.py

"""
# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

# This file is modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_dataset.py


import base64
import io
import json
import logging
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Optional, Union

from PIL import Image

logger = logging.getLogger(__name__)


@dataclass
class SampleRequest:
    """
    Represents a single inference request for benchmarking.

    Field order matters: the dataclass-generated constructor accepts the
    fields positionally in the order declared below. The first six fields have
    no default and must always be supplied; only ``response_format`` and
    ``random_flag`` are optional.
    """

    # Sequence number of the request; the dataset classes in this file number
    # their samples starting from 1.
    no: int
    # Prompt content. Usually a string; the Union with Any leaves room for
    # other prompt representations.
    prompt: Union[str, Any]
    # Conversation history. The dataset classes in this file pass either a
    # list of chat message dicts or an empty list.
    history_QA: Union[str, Any]
    # Request payload dict associated with the sample, or None. There is no
    # default, so callers must pass it explicitly (even when it is None).
    json_data: Optional[dict]
    # Prompt length as recorded by the dataset that produced the sample.
    # NOTE(review): the unit (tokens vs. words) depends on the dataset - confirm
    # against the consumer before relying on it.
    prompt_len: int
    # Output length the request is expected to generate.
    expected_output_len: int
    # Optional structured-output specification forwarded with the request.
    response_format: Optional[dict] = None
    # True when the sample was synthesised (RandomTextDataset sets it) rather
    # than read from a dataset file.
    random_flag: bool = False


class BenchmarkDataset(ABC):
    """
    Abstract base class for benchmark datasets.

    Stores the configuration shared by all datasets (dataset path, random
    seed, shuffle flag, hyperparameter path) and offers helpers that concrete
    datasets use while building their list of SampleRequest objects.
    Subclasses must implement `sample` and normally override `load_data`.
    """

    DEFAULT_SEED = 0
    IS_MULTIMODAL = False

    def __init__(
        self,
        dataset_path: Optional[str] = None,
        random_seed: int = DEFAULT_SEED,
        shuffle: bool = False,
        hyperparameter_path: Optional[str] = None,
    ) -> None:
        """
        Initialize the BenchmarkDataset configuration.

        Args:
            dataset_path (Optional[str]): Path to the dataset. If None, it
                indicates that a default or random dataset might be used.
            random_seed (int): Seed value for reproducible shuffling or
                sampling. Defaults to DEFAULT_SEED; None is also replaced by
                DEFAULT_SEED.
            shuffle (bool): Stored on the instance; each subclass decides
                whether and how to shuffle its data.
            hyperparameter_path (Optional[str]): Stored on the instance as-is;
                this base class never reads the file.
        """
        self.dataset_path = dataset_path
        # Set the random seed, ensuring that a None value is replaced with the
        # default seed.
        self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
        self.data = None
        self.shuffle = shuffle
        self.hyperparameter_path = hyperparameter_path
        self.hyperparameters = {}

    def apply_multimodal_chat_transformation(
        self,
        prompt: str,
        mm_content: Optional[Mapping[str, Any]] = None,
    ) -> list[dict]:
        """
        Wrap a prompt (and optional multimodal content) into a chat message list.

        Subclasses in this file call this helper when `enable_multimodal_chat`
        is set; without it that code path fails with AttributeError.

        Args:
            prompt (str): The text prompt.
            mm_content (Optional[Mapping[str, Any]]): One extra content part,
                e.g. the dictionary returned by `process_image`. Skipped when
                None.

        Returns:
            list[dict]: A single-element list holding one "user" message whose
            content is a list of parts (the text part first).
        """
        # NOTE(review): the layout follows the OpenAI-style chat message format;
        # confirm it matches what the request builder expects.
        content = [{"text": prompt, "type": "text"}]
        if mm_content is not None:
            content.append(mm_content)
        return [{"role": "user", "content": content}]

    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.

        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.

        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
        # TODO (jenniferzhao): add support for downloading data
        raise NotImplementedError("load_data must be implemented in subclasses.")

    @abstractmethod
    def sample(self, num_requests: int) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.

        Subclasses must override this method to implement dataset-specific logic
        for generating a list of SampleRequest objects.

        Args:
            num_requests (int): The number of sample requests to generate.

        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

    def maybe_oversample_requests(self, requests: list[SampleRequest], num_requests: int) -> None:
        """
        Oversamples the list of requests in place if its size is less than the
        desired number.

        Extra entries are drawn with replacement from the existing ones, so
        the appended items are references to request objects already in the
        list. The global `random` generator is re-seeded with
        `self.random_seed` before drawing.

        Args:
            requests (List[SampleRequest]): The current list of sampled
                requests. Extended in place.
            num_requests (int): The target number of requests.
        """
        missing = num_requests - len(requests)
        if missing <= 0:
            return
        if not requests:
            # random.choices() raises IndexError on an empty population; there
            # is nothing to duplicate, so leave the list empty.
            logger.warning("Cannot oversample an empty request list to %d samples.", num_requests)
            return
        random.seed(self.random_seed)
        requests.extend(random.choices(requests, k=missing))
        logger.info("Oversampled requests to reach %d total samples.", num_requests)


def is_valid_sequence(
    prompt_len: int,
    output_len: int,
    min_len: int = 4,
    max_prompt_len: int = 1024,
    max_total_len: int = 2048,
    skip_min_output_len_check: bool = False,
) -> bool:
    """
    Decide whether a prompt/output length pair passes the pruning criteria.

    A sequence is rejected when any of the following holds:

    - the prompt is shorter than `min_len`;
    - the prompt is longer than `max_prompt_len`;
    - the output is shorter than `min_len` (unless
      `skip_min_output_len_check` is True);
    - prompt and output together exceed `max_total_len`.

    Default pruning criteria are copied from the original `sample_hf_requests`
    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
    from `sample_requests` in benchmark_throughput.py.

    Returns:
        bool: True when none of the rejection conditions apply.
    """
    # Prompt length must lie inside [min_len, max_prompt_len].
    if prompt_len < min_len or prompt_len > max_prompt_len:
        return False

    # The minimum-output check is optional.
    if not skip_min_output_len_check and output_len < min_len:
        return False

    # Finally, the combined length must fit into the total budget.
    return not (prompt_len + output_len > max_total_len)


def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.

    Supports three input types:

    1. String input: - Treats the string as a URL or local file path.  -
       Prepends "file://" if the string doesn't start with "http://",
       "https://" or "file://".  - Returns a dictionary with the image URL.

    2. Dictionary with raw image bytes: - Expects a dict with a 'bytes' key
       containing raw image data.  - Loads the bytes as a PIL.Image.Image and
       continues as in case 3.

    3. PIL.Image.Image input: - Converts the image to RGB.  - Saves the image as
       a JPEG in memory.  - Encodes the JPEG data as a base64 string.  - Returns
       a dictionary with the image as a base64 data URL.

    Returns:
        Mapping[str, Any]: ``{"type": "image_url", "image_url": {"url": ...}}``.

    Raises:
        ValueError: If the input is not a supported type.
    """
    # Strings are resolved first: they need no image decoding at all.
    if isinstance(image, str):
        # "https://" must be listed explicitly; otherwise secure URLs would be
        # mistaken for local paths and turned into "file://https://...".
        if image.startswith(("http://", "https://", "file://")):
            image_url = image
        else:
            image_url = f"file://{image}"
        return {"type": "image_url", "image_url": {"url": image_url}}

    # Raw bytes are decoded into a PIL image and then share the PIL branch.
    if isinstance(image, dict) and "bytes" in image:
        image = Image.open(io.BytesIO(image["bytes"]))

    if isinstance(image, Image.Image):
        # JPEG has no alpha channel, so normalise to RGB before encoding.
        image = image.convert("RGB")
        with io.BytesIO() as image_data:
            image.save(image_data, format="JPEG")
            image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
        }

    raise ValueError(
        f"Invalid image input {image}. Must be a PIL.Image.Image" " or str or dictionary with raw image bytes."
    )


class EBDataset(BenchmarkDataset):
    """
    Plain-prompt dataset backed by a JSON Lines file.

    Every non-blank line of `dataset_path` is parsed as one JSON object.
    `sample` reads the keys "text", "temperature", "penalty_score",
    "frequency_score", "presence_score", "topp", "input_token_num" and
    "max_dec_len" from each entry it consumes; a missing key raises KeyError.
    """

    # Sampling parameters and prompt length of the entry most recently
    # processed by `sample`. They are overwritten on every iteration, so after
    # `sample` returns they describe the last consumed entry only.
    temperature: float
    repetition_penalty: float
    frequency_penalty: float
    presence_penalty: float
    top_p: float
    prompt_len: int

    def __init__(self, **kwargs) -> None:
        """Forward all keyword arguments to BenchmarkDataset and load the file."""
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """
        Read the JSON Lines file at `self.dataset_path` into `self.data`.

        Blank lines are skipped. When `self.shuffle` is set, the global
        `random` generator is seeded with `self.random_seed` and the entries
        are shuffled in place.

        Raises:
            ValueError: If `dataset_path` is None.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as f:
            # Skip blank lines (e.g. trailing empty lines) instead of letting
            # json.loads fail on an empty string.
            self.data = [json.loads(line.strip()) for line in f if line.strip()]

        if self.shuffle:
            random.seed(self.random_seed)
            random.shuffle(self.data)

    def sample(
        self,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to `num_requests` SampleRequest objects from the loaded data.

        Entries are consumed in order; if the dataset holds fewer entries than
        requested, the result is padded via `maybe_oversample_requests`.

        Args:
            num_requests (int): Number of requests to return.
            lora_path, max_loras, output_len: Accepted for signature
                compatibility; not used by this dataset (the output length
                always comes from the entry's "max_dec_len").
            enable_multimodal_chat (bool): When True, the prompt is passed
                through `self.apply_multimodal_chat_transformation`, which
                must be available on the instance.
            **kwargs: Ignored.

        Returns:
            list: The sampled SampleRequest objects.
        """
        samples: list = []
        for no, entry in enumerate(self.data, start=1):
            if len(samples) >= num_requests:
                break
            prompt = entry["text"]
            self.temperature = float(entry["temperature"])
            self.repetition_penalty = float(entry["penalty_score"])
            self.frequency_penalty = float(entry["frequency_score"])
            self.presence_penalty = float(entry["presence_score"])
            self.top_p = float(entry["topp"])
            self.prompt_len = int(entry["input_token_num"])
            expected_output_len = int(entry["max_dec_len"])

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(prompt, None)
            samples.append(
                SampleRequest(
                    no=no,
                    prompt=prompt,
                    history_QA=[],
                    # json_data is a required SampleRequest field; omitting it
                    # raises TypeError. This dataset has no chat payload.
                    # NOTE(review): confirm the consumer accepts None here.
                    json_data=None,
                    prompt_len=self.prompt_len,
                    expected_output_len=expected_output_len,
                )
            )

        self.maybe_oversample_requests(samples, num_requests)
        return samples


class EBChatDataset(BenchmarkDataset):
    """
    Chat dataset backed by a JSON Lines file.

    Every non-blank line of `dataset_path` is parsed as one JSON object.
    `sample` reads the optional keys "messages", "response_format" and
    "max_tokens" from each entry and keeps the whole entry as the request's
    `json_data`.
    """

    # Output length used when neither the entry nor the caller specifies one.
    DEFAULT_MAX_TOKENS = 12288

    prompt_len: int

    def __init__(self, **kwargs) -> None:
        """Forward all keyword arguments to BenchmarkDataset and load the file."""
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        """
        Read the JSON Lines file at `self.dataset_path` into `self.data`.

        Blank lines are skipped. When `self.shuffle` is set, the global
        `random` generator is seeded with `self.random_seed` and the entries
        are shuffled in place.

        Raises:
            ValueError: If `dataset_path` is None.
        """
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as f:
            # Skip blank lines (e.g. trailing empty lines) instead of letting
            # json.loads fail on an empty string.
            self.data = [json.loads(line.strip()) for line in f if line.strip()]

        if self.shuffle:
            random.seed(self.random_seed)
            random.shuffle(self.data)

    def sample(
        self,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        """
        Build up to `num_requests` SampleRequest objects from the loaded data.

        Entries are consumed in order; if the dataset holds fewer entries than
        requested, the result is padded via `maybe_oversample_requests`.

        Args:
            num_requests (int): Number of requests to return.
            lora_path, max_loras: Accepted for signature compatibility; unused.
            output_len (Optional[int]): Fallback output length for entries
                without "max_tokens". When falsy, DEFAULT_MAX_TOKENS is used.
            enable_multimodal_chat (bool): When True, the prompt is passed
                through `self.apply_multimodal_chat_transformation`, which
                must be available on the instance.
            **kwargs: Ignored.

        Returns:
            list: The sampled SampleRequest objects. `prompt_len` is always 0
            because this dataset does not record prompt lengths.
        """
        samples: list = []
        for no, entry in enumerate(self.data, start=1):
            if len(samples) >= num_requests:
                break

            history_QA = entry.get("messages", [])
            # The prompt is the content of the last message; an entry with no
            # messages yields an empty prompt instead of KeyError/IndexError.
            prompt = history_QA[-1].get("content", "") if history_QA else ""
            response_format = entry.get("response_format")

            max_tokens = entry.get("max_tokens")
            if max_tokens is None:
                # Covers both a missing key and an explicit null value.
                max_tokens = output_len if output_len else self.DEFAULT_MAX_TOKENS
            expected_output_len = int(max_tokens)

            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(prompt, None)
            samples.append(
                SampleRequest(
                    no=no,
                    json_data=entry,
                    prompt=prompt,
                    prompt_len=0,
                    history_QA=history_QA,
                    expected_output_len=expected_output_len,
                    response_format=response_format,
                )
            )

        self.maybe_oversample_requests(samples, num_requests)
        return samples


class RandomTextDataset(BenchmarkDataset):
    """
    Generates random English text for pure text benchmarking.

    Prompts are built by joining randomly chosen COMMON_WORDS entries with
    spaces. Some entries are multi-word phrases and several entries appear
    more than once, so the reported `prompt_len` counts vocabulary entries,
    not words or tokens, and the draw is not uniform over distinct words.
    """

    # Common English words vocabulary. Order and duplicates are significant:
    # changing either changes the text produced under a fixed random seed.
    COMMON_WORDS = [
        "the",
        "be",
        "to",
        "of",
        "and",
        "a",
        "in",
        "that",
        "have",
        "i",
        "it",
        "for",
        "not",
        "on",
        "with",
        "he",
        "as",
        "you",
        "do",
        "at",
        "this",
        "but",
        "his",
        "by",
        "from",
        "they",
        "we",
        "say",
        "her",
        "she",
        "or",
        "an",
        "will",
        "my",
        "one",
        "all",
        "would",
        "there",
        "their",
        "what",
        "so",
        "up",
        "out",
        "if",
        "about",
        "who",
        "get",
        "which",
        "go",
        "me",
        "when",
        "make",
        "can",
        "like",
        "time",
        "no",
        "just",
        "him",
        "know",
        "take",
        "people",
        "into",
        "year",
        "your",
        "good",
        "some",
        "could",
        "them",
        "see",
        "other",
        "than",
        "then",
        "now",
        "look",
        "only",
        "come",
        "its",
        "over",
        "think",
        "also",
        "back",
        "after",
        "use",
        "two",
        "how",
        "our",
        "work",
        "first",
        "well",
        "way",
        "even",
        "new",
        "want",
        "because",
        "any",
        "these",
        "give",
        "day",
        "most",
        "us",
        "is",
        "are",
        "was",
        "were",
        "been",
        "has",
        "had",
        "did",
        "done",
        "said",
        "told",
        "asked",
        "thought",
        "went",
        "saw",
        "looked",
        "found",
        "took",
        "gave",
        "made",
        "put",
        "set",
        "got",
        "ran",
        "came",
        "walked",
        "stood",
        "sat",
        "lay",
        "felt",
        "heard",
        "saw",
        "knew",
        "thought",
        "understood",
        "believed",
        "wanted",
        "needed",
        "liked",
        "loved",
        "hated",
        "feared",
        "hoped",
        "expected",
        "planned",
        "decided",
        "agreed",
        "disagreed",
        "argued",
        "discussed",
        "explained",
        "described",
        "reported",
        "announced",
        "declared",
        "stated",
        "claimed",
        "suggested",
        "proposed",
        "recommended",
        "advised",
        "warned",
        "threatened",
        "promised",
        "offered",
        "refused",
        "denied",
        "admitted",
        "confessed",
        "apologized",
        "forgave",
        "thanked",
        "congratulated",
        "celebrated",
        "welcomed",
        "greeted",
        "introduced",
        "presented",
        "showed",
        "demonstrated",
        "proved",
        "tested",
        "examined",
        "studied",
        "learned",
        "taught",
        "trained",
        "practiced",
        "performed",
        "played",
        "worked",
        "built",
        "created",
        "designed",
        "developed",
        "improved",
        "changed",
        "fixed",
        "solved",
        "completed",
        "finished",
        "started",
        "began",
        "continued",
        "stopped",
        "ended",
        "left",
        "arrived",
        "departed",
        "traveled",
        "moved",
        "stayed",
        "waited",
        "rested",
        "slept",
        "woke",
        "ate",
        "drank",
        "cooked",
        "cleaned",
        "washed",
        "dressed",
        "undressed",
        "showered",
        "bathed",
        "brushed",
        "combed",
        "shaved",
        "cut",
        "trimmed",
        "painted",
        "drew",
        "wrote",
        "read",
        "spoke",
        "listened",
        "heard",
        "saw",
        "watched",
        "looked",
        "observed",
        "noticed",
        "recognized",
        "remembered",
        "forgot",
        "learned",
        "understood",
        "knew",
        "believed",
        "doubted",
        "wondered",
        "thought",
        "considered",
        "decided",
        "chose",
        "selected",
        "preferred",
        "liked",
        "loved",
        "hated",
        "feared",
        "worried",
        "hoped",
        "expected",
        "planned",
        "prepared",
        "organized",
        "arranged",
        "scheduled",
        "timed",
        "measured",
        "counted",
        "calculated",
        "estimated",
        "valued",
        "priced",
        "cost",
        "paid",
        "bought",
        "sold",
        "traded",
        "exchanged",
        "shared",
        "divided",
        "combined",
        "joined",
        "connected",
        "attached",
        "separated",
        "divided",
        "cut",
        "broke",
        "fixed",
        "repaired",
        "built",
        "created",
        "made",
        "produced",
        "manufactured",
        "assembled",
        "constructed",
        "designed",
        "planned",
        "developed",
        "improved",
        "enhanced",
        "changed",
        "modified",
        "adjusted",
        "adapted",
        "converted",
        "transformed",
        "turned",
        "became",
        "grew",
        "developed",
        "evolved",
        "progressed",
        "advanced",
        "moved",
        "went",
        "came",
        "arrived",
        "departed",
        "left",
        "returned",
        "went back",
        "came back",
        "arrived back",
        "departed again",
        "left again",
        "returned again",
        "went away",
        "came close",
        "moved away",
        "approached",
        "reached",
        "arrived at",
        "departed from",
        "left from",
        "returned to",
        "went to",
        "came from",
        "traveled to",
        "traveled from",
        "moved to",
        "moved from",
        "stayed at",
        "remained at",
        "waited for",
        "rested at",
        "slept at",
        "woke up at",
        "ate at",
        "drank at",
        "cooked at",
        "cleaned at",
        "washed at",
        "dressed at",
        "undressed at",
        "showered at",
        "bathed at",
        "brushed at",
        "combed at",
        "shaved at",
        "cut at",
        "trimmed at",
        "painted at",
        "drew at",
        "wrote at",
        "read at",
        "spoke at",
        "listened at",
        "heard at",
        "saw at",
        "watched at",
        "looked at",
        "observed at",
        "noticed at",
        "recognized at",
        "remembered at",
        "forgot at",
        "learned at",
        "understood at",
        "knew at",
        "believed at",
        "doubted at",
        "wondered at",
        "thought at",
        "considered at",
        "decided at",
        "chose at",
        "selected at",
        "preferred at",
        "liked at",
        "loved at",
        "hated at",
        "feared at",
        "worried at",
        "hoped at",
        "expected at",
        "planned at",
        "prepared at",
        "organized at",
        "arranged at",
        "scheduled at",
        "timed at",
        "measured at",
        "counted at",
        "calculated at",
        "estimated at",
        "valued at",
        "priced at",
        "cost at",
        "paid at",
        "bought at",
        "sold at",
        "traded at",
        "exchanged at",
        "shared at",
        "divided at",
        "combined at",
        "joined at",
        "connected at",
        "attached at",
        "separated at",
        "divided at",
        "cut at",
        "broke at",
        "fixed at",
        "repaired at",
        "built at",
        "created at",
        "made at",
        "produced at",
        "manufactured at",
    ]

    def __init__(self, **kwargs):
        """Forward all keyword arguments to BenchmarkDataset; no file is loaded."""
        super().__init__(**kwargs)

    @staticmethod
    def _sample_len(base_len: Optional[int], ratio: Optional[float]) -> Optional[int]:
        """
        Draw a length from [base_len * (1 - ratio), base_len * (1 + ratio)].

        Returns `base_len` unchanged when `ratio` is None or not positive, and
        None when `base_len` is None. The lower bound is never below 1.
        """
        if base_len is None:
            return None
        if ratio is None or ratio <= 0:
            return base_len
        lo = max(1, int(base_len * (1 - ratio)))
        # Keep the range non-empty; random.randint raises ValueError if hi < lo
        # (possible when base_len is smaller than 1).
        hi = max(lo, int(base_len * (1 + ratio)))
        return random.randint(lo, hi)

    def sample(
        self,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        random_input_len: Optional[int] = None,
        random_output_len: Optional[int] = None,
        random_range_ratio: Optional[float] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        """
        Generate `num_requests` random-text SampleRequest objects.

        Args:
            num_requests (int): Number of requests to generate.
            lora_path, max_loras, enable_multimodal_chat: Accepted for
                signature compatibility; unused.
            random_input_len (Optional[int]): Base number of vocabulary entries
                per prompt. Required.
            random_output_len (Optional[int]): Base expected output length;
                None is passed through to the SampleRequest unchanged.
            random_range_ratio (Optional[float]): When positive, input and
                output lengths are drawn independently per request from
                [length * (1 - ratio), length * (1 + ratio)].
            **kwargs: Ignored.

        Returns:
            list: SampleRequest objects numbered from 1, each flagged with
            `random_flag=True`.

        Raises:
            ValueError: If `random_input_len` is None.
        """
        if random_input_len is None:
            # Previously this surfaced as an opaque TypeError from range(None).
            raise ValueError("random_input_len must be provided for RandomTextDataset.")

        samples = []
        for i in range(1, num_requests + 1):
            # Lengths are drawn before the words so that the sequence of
            # random calls (and thus seeded output) stays stable.
            sampled_input_len = self._sample_len(random_input_len, random_range_ratio)
            sampled_output_len = self._sample_len(random_output_len, random_range_ratio)

            words = [random.choice(self.COMMON_WORDS) for _ in range(sampled_input_len)]
            prompt_text = " ".join(words)

            data = {
                "messages": [{"role": "user", "content": prompt_text}],
            }

            samples.append(
                SampleRequest(
                    no=i,
                    json_data=data,
                    prompt=prompt_text,
                    prompt_len=sampled_input_len,
                    # Same list object as json_data["messages"].
                    history_QA=data["messages"],
                    expected_output_len=sampled_output_len,
                    random_flag=True,
                )
            )
        return samples