This commit is contained in:
Zhang Yulong
2025-11-28 18:29:16 +08:00
committed by GitHub
parent a535050b11
commit 5b49142988
4 changed files with 561 additions and 29 deletions

View File

@@ -46,6 +46,7 @@ class SampleRequest:
prompt_len: int
expected_output_len: int
response_format: Optional[dict] = None
random_flag: bool = False
class BenchmarkDataset(ABC):
@@ -318,3 +319,499 @@ class EBChatDataset(BenchmarkDataset):
self.maybe_oversample_requests(samples, num_requests)
return samples
class RandomTextDataset(BenchmarkDataset):
    """
    Generates random English words for pure text benchmarking.

    Each prompt is built by drawing entries from ``COMMON_WORDS`` with
    ``random.choice`` and joining them with single spaces. Every produced
    ``SampleRequest`` is tagged with ``random_flag=True``.
    """

    # Common English words vocabulary.
    # NOTE: the list contains repeated entries (e.g. "saw", "thought") and
    # multi-word phrases (e.g. "went back", "woke up at"). Repeats are drawn
    # proportionally more often, and ``prompt_len`` reported by ``sample``
    # counts vocabulary draws, not whitespace-separated words or tokens.
    COMMON_WORDS = [
        "the",
        "be",
        "to",
        "of",
        "and",
        "a",
        "in",
        "that",
        "have",
        "i",
        "it",
        "for",
        "not",
        "on",
        "with",
        "he",
        "as",
        "you",
        "do",
        "at",
        "this",
        "but",
        "his",
        "by",
        "from",
        "they",
        "we",
        "say",
        "her",
        "she",
        "or",
        "an",
        "will",
        "my",
        "one",
        "all",
        "would",
        "there",
        "their",
        "what",
        "so",
        "up",
        "out",
        "if",
        "about",
        "who",
        "get",
        "which",
        "go",
        "me",
        "when",
        "make",
        "can",
        "like",
        "time",
        "no",
        "just",
        "him",
        "know",
        "take",
        "people",
        "into",
        "year",
        "your",
        "good",
        "some",
        "could",
        "them",
        "see",
        "other",
        "than",
        "then",
        "now",
        "look",
        "only",
        "come",
        "its",
        "over",
        "think",
        "also",
        "back",
        "after",
        "use",
        "two",
        "how",
        "our",
        "work",
        "first",
        "well",
        "way",
        "even",
        "new",
        "want",
        "because",
        "any",
        "these",
        "give",
        "day",
        "most",
        "us",
        "is",
        "are",
        "was",
        "were",
        "been",
        "has",
        "had",
        "did",
        "done",
        "said",
        "told",
        "asked",
        "thought",
        "went",
        "saw",
        "looked",
        "found",
        "took",
        "gave",
        "made",
        "put",
        "set",
        "got",
        "ran",
        "came",
        "walked",
        "stood",
        "sat",
        "lay",
        "felt",
        "heard",
        "saw",
        "knew",
        "thought",
        "understood",
        "believed",
        "wanted",
        "needed",
        "liked",
        "loved",
        "hated",
        "feared",
        "hoped",
        "expected",
        "planned",
        "decided",
        "agreed",
        "disagreed",
        "argued",
        "discussed",
        "explained",
        "described",
        "reported",
        "announced",
        "declared",
        "stated",
        "claimed",
        "suggested",
        "proposed",
        "recommended",
        "advised",
        "warned",
        "threatened",
        "promised",
        "offered",
        "refused",
        "denied",
        "admitted",
        "confessed",
        "apologized",
        "forgave",
        "thanked",
        "congratulated",
        "celebrated",
        "welcomed",
        "greeted",
        "introduced",
        "presented",
        "showed",
        "demonstrated",
        "proved",
        "tested",
        "examined",
        "studied",
        "learned",
        "taught",
        "trained",
        "practiced",
        "performed",
        "played",
        "worked",
        "built",
        "created",
        "designed",
        "developed",
        "improved",
        "changed",
        "fixed",
        "solved",
        "completed",
        "finished",
        "started",
        "began",
        "continued",
        "stopped",
        "ended",
        "left",
        "arrived",
        "departed",
        "traveled",
        "moved",
        "stayed",
        "waited",
        "rested",
        "slept",
        "woke",
        "ate",
        "drank",
        "cooked",
        "cleaned",
        "washed",
        "dressed",
        "undressed",
        "showered",
        "bathed",
        "brushed",
        "combed",
        "shaved",
        "cut",
        "trimmed",
        "painted",
        "drew",
        "wrote",
        "read",
        "spoke",
        "listened",
        "heard",
        "saw",
        "watched",
        "looked",
        "observed",
        "noticed",
        "recognized",
        "remembered",
        "forgot",
        "learned",
        "understood",
        "knew",
        "believed",
        "doubted",
        "wondered",
        "thought",
        "considered",
        "decided",
        "chose",
        "selected",
        "preferred",
        "liked",
        "loved",
        "hated",
        "feared",
        "worried",
        "hoped",
        "expected",
        "planned",
        "prepared",
        "organized",
        "arranged",
        "scheduled",
        "timed",
        "measured",
        "counted",
        "calculated",
        "estimated",
        "valued",
        "priced",
        "cost",
        "paid",
        "bought",
        "sold",
        "traded",
        "exchanged",
        "shared",
        "divided",
        "combined",
        "joined",
        "connected",
        "attached",
        "separated",
        "divided",
        "cut",
        "broke",
        "fixed",
        "repaired",
        "built",
        "created",
        "made",
        "produced",
        "manufactured",
        "assembled",
        "constructed",
        "designed",
        "planned",
        "developed",
        "improved",
        "enhanced",
        "changed",
        "modified",
        "adjusted",
        "adapted",
        "converted",
        "transformed",
        "turned",
        "became",
        "grew",
        "developed",
        "evolved",
        "progressed",
        "advanced",
        "moved",
        "went",
        "came",
        "arrived",
        "departed",
        "left",
        "returned",
        "went back",
        "came back",
        "arrived back",
        "departed again",
        "left again",
        "returned again",
        "went away",
        "came close",
        "moved away",
        "approached",
        "reached",
        "arrived at",
        "departed from",
        "left from",
        "returned to",
        "went to",
        "came from",
        "traveled to",
        "traveled from",
        "moved to",
        "moved from",
        "stayed at",
        "remained at",
        "waited for",
        "rested at",
        "slept at",
        "woke up at",
        "ate at",
        "drank at",
        "cooked at",
        "cleaned at",
        "washed at",
        "dressed at",
        "undressed at",
        "showered at",
        "bathed at",
        "brushed at",
        "combed at",
        "shaved at",
        "cut at",
        "trimmed at",
        "painted at",
        "drew at",
        "wrote at",
        "read at",
        "spoke at",
        "listened at",
        "heard at",
        "saw at",
        "watched at",
        "looked at",
        "observed at",
        "noticed at",
        "recognized at",
        "remembered at",
        "forgot at",
        "learned at",
        "understood at",
        "knew at",
        "believed at",
        "doubted at",
        "wondered at",
        "thought at",
        "considered at",
        "decided at",
        "chose at",
        "selected at",
        "preferred at",
        "liked at",
        "loved at",
        "hated at",
        "feared at",
        "worried at",
        "hoped at",
        "expected at",
        "planned at",
        "prepared at",
        "organized at",
        "arranged at",
        "scheduled at",
        "timed at",
        "measured at",
        "counted at",
        "calculated at",
        "estimated at",
        "valued at",
        "priced at",
        "cost at",
        "paid at",
        "bought at",
        "sold at",
        "traded at",
        "exchanged at",
        "shared at",
        "divided at",
        "combined at",
        "joined at",
        "connected at",
        "attached at",
        "separated at",
        "divided at",
        "cut at",
        "broke at",
        "fixed at",
        "repaired at",
        "built at",
        "created at",
        "made at",
        "produced at",
        "manufactured at",
    ]

    def __init__(self, **kwargs):
        """Forward all keyword arguments unchanged to the base dataset."""
        super().__init__(**kwargs)

    @staticmethod
    def _sample_len(base_len: Optional[int], ratio: Optional[float]) -> Optional[int]:
        """Return ``base_len`` jittered uniformly by ``ratio``.

        Args:
            base_len: Nominal length, or ``None`` when no length was requested.
            ratio: Relative half-width of the sampling window. ``None`` or a
                non-positive value disables jitter.

        Returns:
            ``None`` if ``base_len`` is ``None``; ``base_len`` itself when
            jitter is disabled; otherwise an integer drawn uniformly from
            ``[base_len * (1 - ratio), base_len * (1 + ratio)]`` with the
            lower bound clamped to at least 1.
        """
        if base_len is None:
            return None
        if ratio is None or ratio <= 0:
            return base_len
        lo = max(1, int(base_len * (1 - ratio)))
        # Clamp the upper bound so randint never receives an empty range
        # (e.g. base_len == 0 would otherwise give lo=1, hi=0).
        hi = max(lo, int(base_len * (1 + ratio)))
        return random.randint(lo, hi)

    def sample(
        self,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        random_input_len: Optional[int] = None,
        random_output_len: Optional[int] = None,
        random_range_ratio: Optional[float] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        """Build ``num_requests`` random-text chat requests.

        Args:
            num_requests: Number of requests to generate.
            lora_path: Accepted for signature compatibility; not used here.
            max_loras: Accepted for signature compatibility; not used here.
            random_input_len: Nominal number of vocabulary entries per prompt.
                Required whenever at least one request is generated.
            random_output_len: Nominal expected output length; passed through
                (after optional jitter) as ``expected_output_len``. May be
                ``None``.
            random_range_ratio: Relative jitter applied independently to the
                input and output lengths of every request.
            enable_multimodal_chat: Accepted for signature compatibility; not
                used here.
            **kwargs: Ignored.

        Returns:
            A list of ``SampleRequest`` objects numbered from 1, each with
            ``random_flag=True``.

        Raises:
            ValueError: If requests are to be generated but
                ``random_input_len`` is ``None``.
        """
        # Without an input length there is nothing to size the prompt with;
        # fail with a clear message instead of a TypeError from range(None).
        if num_requests > 0 and random_input_len is None:
            raise ValueError("random_input_len must be provided for RandomTextDataset")

        samples = []
        for i in range(1, num_requests + 1):
            # [length * (1 - range_ratio), length * (1 + range_ratio)]
            sampled_input_len = self._sample_len(random_input_len, random_range_ratio)
            sampled_output_len = self._sample_len(random_output_len, random_range_ratio)

            words = [random.choice(self.COMMON_WORDS) for _ in range(sampled_input_len)]
            prompt_text = " ".join(words)

            data = {
                "messages": [{"role": "user", "content": prompt_text}],
            }

            samples.append(
                SampleRequest(
                    no=i,
                    json_data=data,
                    prompt=prompt_text,
                    # Number of vocabulary draws, not a tokenizer count.
                    prompt_len=sampled_input_len,
                    history_QA=data["messages"],
                    expected_output_len=sampled_output_len,
                    random_flag=True,
                )
            )
        return samples