FastDeploy/tests/benchmarks/test_throughput_benchmarks.py
qwes5s5 6fd3e72da1 [FastDeploy Cli] Bench Command eval and throughput (#4239)
* bench command

---------

Co-authored-by: K11OntheBoat <your_email@example.com>
2025-10-10 16:17:44 +08:00


# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import unittest
from unittest.mock import MagicMock, patch
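
# torch is optional: the Hugging Face backend tests below are skipped when it
# cannot be imported on this platform.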
try:
    import torch

    TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
    TORCH_AVAILABLE = False

from fastdeploy.benchmarks.datasets import SampleRequest
from fastdeploy.benchmarks.throughput import (
    EngineArgs,
    add_cli_args,
    get_requests,
    main,
    run_fd,
    run_fd_chat,
    run_hf,
    validate_args,
)

class TestThroughput(unittest.TestCase):
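    """Unit tests for the FastDeploy throughput benchmark CLI (bench command)."""
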
@patch("fastdeploy.LLM")
def test_run_fd(self, mock_llm):
mock_llm_instance = MagicMock()
mock_llm.return_value = mock_llm_instance
mock_llm_instance.generate.return_value = ["output1", "output2"]
# Mock cfg.max_model_len
mock_cfg = MagicMock()
mock_cfg.max_model_len = 2048
mock_llm_instance.llm_engine.cfg = mock_cfg
requests = [
SampleRequest(
no=1, prompt="test prompt", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None
)
]
engine_args = EngineArgs(model="test_model")
elapsed_time, outputs = run_fd(requests, n=1, engine_args=engine_args)
self.assertIsInstance(elapsed_time, float)
self.assertEqual(len(outputs), 2)
@patch("fastdeploy.LLM")
def test_run_fd_chat(self, mock_llm):
mock_llm_instance = MagicMock()
mock_llm.return_value = mock_llm_instance
mock_llm_instance.chat.return_value = ["chat output1", "chat output2"]
# Mock cfg.max_model_len
mock_cfg = MagicMock()
mock_cfg.max_model_len = 2048
mock_llm_instance.llm_engine.cfg = mock_cfg
requests = [
SampleRequest(
no=1, prompt="test chat prompt", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None
)
]
engine_args = EngineArgs(model="test_model")
elapsed_time, outputs = run_fd_chat(requests, n=1, engine_args=engine_args)
self.assertIsInstance(elapsed_time, float)
self.assertEqual(len(outputs), 2)
    @unittest.skipIf(not TORCH_AVAILABLE, "PyTorch is not available")
    @patch("transformers.AutoModelForCausalLM.from_pretrained")
    @patch("transformers.AutoTokenizer.from_pretrained")
    def test_run_hf(self, mock_tokenizer, mock_model):
        mock_model_instance = MagicMock()
        mock_model.return_value = mock_model_instance
        mock_model_instance.generate.return_value = torch.tensor([[1, 2, 3]]) if TORCH_AVAILABLE else None
        mock_tokenizer_instance = MagicMock()
        mock_tokenizer.return_value = mock_tokenizer_instance
        mock_tokenizer_instance.pad_token = "pad"
        requests = [
            SampleRequest(
                no=1, prompt="test hf prompt", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None
            )
        ]
        elapsed_time = run_hf(
            requests,
            model="test_model",
            tokenizer=mock_tokenizer_instance,
            n=1,
            max_batch_size=4,
            trust_remote_code=True,
        )
        self.assertIsInstance(elapsed_time, float)

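    # Request generation: get_requests should delegate to the dataset sampler
    # and return one SampleRequest per sampled prompt.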
@patch("fastdeploy.benchmarks.datasets.RandomDataset")
def test_get_requests(self, mock_dataset):
mock_dataset_instance = MagicMock()
mock_dataset.return_value = mock_dataset_instance
mock_dataset_instance.sample.return_value = [
SampleRequest(no=1, prompt="test1", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None),
SampleRequest(no=2, prompt="test2", prompt_len=15, expected_output_len=25, history_QA=[], json_data=None),
]
args = argparse.Namespace(
dataset_name="random",
dataset_path=None,
seed=42,
input_len=10,
output_len=20,
num_prompts=2,
hf_max_batch_size=4,
lora_path=None,
random_range_ratio=0.0,
prefix_len=0,
)
tokenizer = MagicMock()
tokenizer.vocab_size = 10000 # 设置合理的词汇表大小
tokenizer.num_special_tokens_to_add.return_value = 0 # 设置特殊token数量
requests = get_requests(args, tokenizer)
self.assertEqual(len(requests), 2)
    def test_validate_args(self):
        # Test basic validation
        args = argparse.Namespace(
            backend="fastdeploy",
            dataset_name="random",
            dataset=None,
            dataset_path=None,
            input_len=10,
            output_len=20,
            tokenizer=None,
            model="test_model",
            hf_max_batch_size=None,
            trust_remote_code=False,
            quantization=None,
        )
        validate_args(args)
        self.assertEqual(args.tokenizer, "test_model")

    def test_add_cli_args(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        self.assertEqual(args.backend, "fastdeploy")
        self.assertEqual(args.dataset_name, "random")

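    # End-to-end main() tests: each backend path is exercised with mocked
    # request generation and mocked runners, so no real model is loaded.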
@patch("fastdeploy.benchmarks.throughput.run_fd")
@patch("fastdeploy.benchmarks.throughput.get_requests")
@patch("transformers.AutoTokenizer.from_pretrained")
def test_main_fastdeploy(self, mock_tokenizer, mock_get_requests, mock_run_fd):
mock_get_requests.return_value = [
SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None)
]
mock_run_fd.return_value = (1.0, ["output1", "output2"])
parser = argparse.ArgumentParser()
add_cli_args(parser)
args = parser.parse_args([])
args.backend = "fastdeploy"
args.dataset_name = "random"
args.dataset_path = None
args.seed = 42
args.input_len = 10
args.output_len = 20
args.num_prompts = 1
args.tokenizer = "test_tokenizer"
args.model = "test_model"
args.n = 1
args.hf_max_batch_size = None
args.trust_remote_code = False
args.output_json = None
args.disable_detokenize = False
args.tensor_parallel_size = 1
with patch("builtins.print") as mock_print:
main(args)
mock_print.assert_called()
    @unittest.skipIf(not TORCH_AVAILABLE, "PyTorch is not available")
    @patch("fastdeploy.benchmarks.throughput.run_hf")
    @patch("fastdeploy.benchmarks.throughput.get_requests")
    @patch("transformers.AutoTokenizer.from_pretrained")
    @patch("transformers.AutoModelForCausalLM.from_pretrained")
    def test_main_hf(self, mock_model, mock_tokenizer, mock_get_requests, mock_run_hf):
        mock_get_requests.return_value = [
            SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None)
        ]
        mock_run_hf.return_value = 1.0
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "hf"
        args.dataset_name = "random"
        args.dataset_path = None
        args.seed = 42
        args.input_len = 10
        args.output_len = 20
        args.num_prompts = 1
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        args.n = 1
        args.hf_max_batch_size = 4
        args.trust_remote_code = True
        args.output_json = None
        args.disable_detokenize = False
        args.tensor_parallel_size = 1
        with patch("builtins.print") as mock_print:
            main(args)
            mock_print.assert_called()

@patch("fastdeploy.benchmarks.throughput.run_fd_chat")
@patch("fastdeploy.benchmarks.throughput.get_requests")
@patch("transformers.AutoTokenizer.from_pretrained")
def test_main_fastdeploy_chat(self, mock_tokenizer, mock_get_requests, mock_run_fd_chat):
mock_get_requests.return_value = [
SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None)
]
mock_run_fd_chat.return_value = (1.0, ["output1", "output2"])
parser = argparse.ArgumentParser()
add_cli_args(parser)
args = parser.parse_args([])
args.backend = "fastdeploy-chat"
args.dataset_name = "random"
args.dataset_path = None
args.seed = 42
args.input_len = 10
args.output_len = 20
args.num_prompts = 1
args.tokenizer = "test_tokenizer"
args.model = "test_model"
args.n = 1
args.hf_max_batch_size = None
args.trust_remote_code = False
args.output_json = None
args.disable_detokenize = False
args.tensor_parallel_size = 1
with patch("builtins.print") as mock_print:
main(args)
mock_print.assert_called()
@patch("builtins.open")
@patch("json.dump")
@patch("fastdeploy.benchmarks.throughput.run_fd")
@patch("fastdeploy.benchmarks.throughput.get_requests")
def test_main_with_output_json(self, mock_get_requests, mock_run_fd, mock_json_dump, mock_open):
mock_get_requests.return_value = [
SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None)
]
mock_run_fd.return_value = (1.0, ["output1", "output2"])
parser = argparse.ArgumentParser()
add_cli_args(parser)
args = parser.parse_args([])
args.backend = "fastdeploy"
args.dataset_name = "random"
args.dataset_path = None
args.seed = 42
args.input_len = 10
args.output_len = 20
args.num_prompts = 1
args.tokenizer = "test_tokenizer"
args.model = "test_model"
args.n = 1
args.hf_max_batch_size = None
args.trust_remote_code = False
args.output_json = "output.json"
args.disable_detokenize = False
args.tensor_parallel_size = 1
main(args)
mock_json_dump.assert_called()
    # Additional test cases covering previously untested lines.
    def test_validate_args_with_lora(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"  # LoRA is only supported by the vLLM backend
        args.dataset_name = "random"
        args.enable_lora = True
        args.lora_path = "/path/to/lora"
        args.input_len = 10
        args.output_len = 20
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_validate_args_with_hf_backend(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "hf"
        args.dataset_name = "random"
        args.hf_max_batch_size = 4
        args.input_len = 10
        args.output_len = 20
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_validate_args_with_quantization(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"
        args.dataset_name = "random"
        args.quantization = "w4a8"
        args.input_len = 10
        args.output_len = 20
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

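    # Serialization of benchmark results into the PyTorch benchmark JSON format.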
@patch("fastdeploy.benchmarks.throughput.write_to_json")
@patch("fastdeploy.benchmarks.throughput.convert_to_pytorch_benchmark_format")
def test_save_to_pytorch_benchmark_format(self, mock_convert, mock_write):
args = argparse.Namespace(
output_json="test.json",
model="test_model",
input_len=10,
output_len=20,
backend="fastdeploy",
)
results = {
"elapsed_time": 1.0,
"num_requests": 10,
"total_num_tokens": 100,
"requests_per_second": 10.0,
"tokens_per_second": 100.0,
}
mock_convert.return_value = [{"metrics": {"requests_per_second": 10.0}}]
from fastdeploy.benchmarks.throughput import save_to_pytorch_benchmark_format
save_to_pytorch_benchmark_format(args, results)
mock_write.assert_called()
@patch("fastdeploy.benchmarks.throughput.run_fd")
@patch("fastdeploy.benchmarks.throughput.get_requests")
def test_main_with_disable_detokenize(self, mock_get_requests, mock_run_fd):
mock_get_requests.return_value = [
SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None)
]
mock_run_fd.return_value = (1.0, ["output1", "output2"])
parser = argparse.ArgumentParser()
add_cli_args(parser)
args = parser.parse_args([])
args.backend = "fastdeploy"
args.dataset_name = "random"
args.dataset_path = None
args.seed = 42
args.input_len = 10
args.output_len = 20
args.num_prompts = 1
args.tokenizer = "test_tokenizer"
args.model = "test_model"
args.n = 1
args.hf_max_batch_size = None
args.trust_remote_code = False
args.output_json = None
args.disable_detokenize = True
args.tensor_parallel_size = 1
with patch("builtins.print") as mock_print:
main(args)
mock_print.assert_called()
    def test_validate_args_with_random_range_ratio(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"
        args.dataset_name = "random"
        args.random_range_ratio = 0.5
        args.input_len = 10
        args.output_len = 20
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_validate_args_with_prefix_len(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"
        args.dataset_name = "random"
        args.prefix_len = 5
        args.input_len = 10
        args.output_len = 20
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_validate_args_with_eb_dataset(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"
        args.dataset_name = "EB"
        args.dataset_path = "/path/to/eb"
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_validate_args_with_ebchat_dataset(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy-chat"
        args.dataset_name = "EBChat"
        args.dataset_path = "/path/to/ebchat"
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_add_cli_args_with_all_options(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        # Use parse_known_args to avoid a SystemExit on unrecognized arguments.
        args, _ = parser.parse_known_args(
            [
                "--backend",
                "fastdeploy-chat",
                "--dataset-name",
                "EBChat",
                "--dataset-path",
                "/path/to/dataset",
                "--input-len",
                "10",
                "--output-len",
                "20",
                "--n",
                "2",
                "--num-prompts",
                "50",
                "--hf-max-batch-size",
                "4",
                "--output-json",
                "output.json",
                "--disable-detokenize",
                "--lora-path",
                "/path/to/lora",
                "--prefix-len",
                "5",
                "--random-range-ratio",
                "0.5",
            ]
        )
        self.assertEqual(args.backend, "fastdeploy-chat")
        self.assertEqual(args.dataset_name, "EBChat")
        self.assertEqual(args.dataset_path, "/path/to/dataset")
        self.assertEqual(args.input_len, 10)
        self.assertEqual(args.output_len, 20)
        self.assertEqual(args.n, 2)
        self.assertEqual(args.num_prompts, 50)
        self.assertEqual(args.hf_max_batch_size, 4)
        self.assertEqual(args.output_json, "output.json")
        self.assertTrue(args.disable_detokenize)
        self.assertEqual(args.lora_path, "/path/to/lora")
        self.assertEqual(args.prefix_len, 5)
        self.assertEqual(args.random_range_ratio, 0.5)

if __name__ == "__main__":
    unittest.main()