# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
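
"""Unit tests for the fastdeploy.benchmarks.throughput benchmark entry points."""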

import argparse
import unittest
from unittest.mock import MagicMock, patch
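
# torch is an optional dependency here; the HF-backend tests below are skipped when it is missing.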
try:
    import torch

    TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
    TORCH_AVAILABLE = False

from fastdeploy.benchmarks.datasets import SampleRequest
from fastdeploy.benchmarks.throughput import (
    EngineArgs,
    add_cli_args,
    get_requests,
    main,
    run_fd,
    run_fd_chat,
    run_hf,
    validate_args,
)


class TestThroughput(unittest.TestCase):
    @patch("fastdeploy.LLM")
    def test_run_fd(self, mock_llm):
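        """run_fd should return a float elapsed time and the mocked generate outputs."""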
        mock_llm_instance = MagicMock()
        mock_llm.return_value = mock_llm_instance
        mock_llm_instance.generate.return_value = ["output1", "output2"]
        # Mock cfg.max_model_len
        mock_cfg = MagicMock()
        mock_cfg.max_model_len = 2048
        mock_llm_instance.llm_engine.cfg = mock_cfg

        requests = [
            SampleRequest(
                no=1, prompt="test prompt", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None
            )
        ]
        engine_args = EngineArgs(model="test_model")

        elapsed_time, outputs = run_fd(requests, n=1, engine_args=engine_args)
        self.assertIsInstance(elapsed_time, float)
        self.assertEqual(len(outputs), 2)
@patch("fastdeploy.LLM")
|
|
def test_run_fd_chat(self, mock_llm):
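        """run_fd_chat should mirror run_fd but exercise the chat entry point."""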
        mock_llm_instance = MagicMock()
        mock_llm.return_value = mock_llm_instance
        mock_llm_instance.chat.return_value = ["chat output1", "chat output2"]
        # Mock cfg.max_model_len
        mock_cfg = MagicMock()
        mock_cfg.max_model_len = 2048
        mock_llm_instance.llm_engine.cfg = mock_cfg

        requests = [
            SampleRequest(
                no=1, prompt="test chat prompt", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None
            )
        ]
        engine_args = EngineArgs(model="test_model")

        elapsed_time, outputs = run_fd_chat(requests, n=1, engine_args=engine_args)
        self.assertIsInstance(elapsed_time, float)
        self.assertEqual(len(outputs), 2)

    @unittest.skipIf(not TORCH_AVAILABLE, "PyTorch is not available")
    @patch("transformers.AutoModelForCausalLM.from_pretrained")
    @patch("transformers.AutoTokenizer.from_pretrained")
    def test_run_hf(self, mock_tokenizer, mock_model):
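        """run_hf should return a float elapsed time with a mocked HF model and tokenizer."""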
        mock_model_instance = MagicMock()
        mock_model.return_value = mock_model_instance
        mock_model_instance.generate.return_value = torch.tensor([[1, 2, 3]]) if TORCH_AVAILABLE else None

        mock_tokenizer_instance = MagicMock()
        mock_tokenizer.return_value = mock_tokenizer_instance
        mock_tokenizer_instance.pad_token = "pad"

        requests = [
            SampleRequest(
                no=1, prompt="test hf prompt", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None
            )
        ]

        elapsed_time = run_hf(
            requests,
            model="test_model",
            tokenizer=mock_tokenizer_instance,
            n=1,
            max_batch_size=4,
            trust_remote_code=True,
        )
        self.assertIsInstance(elapsed_time, float)
@patch("fastdeploy.benchmarks.datasets.RandomDataset")
|
|
def test_get_requests(self, mock_dataset):
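        """get_requests should return the samples produced by the dataset's sample()."""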
        mock_dataset_instance = MagicMock()
        mock_dataset.return_value = mock_dataset_instance
        mock_dataset_instance.sample.return_value = [
            SampleRequest(no=1, prompt="test1", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None),
            SampleRequest(no=2, prompt="test2", prompt_len=15, expected_output_len=25, history_QA=[], json_data=None),
        ]

        args = argparse.Namespace(
            dataset_name="random",
            dataset_path=None,
            seed=42,
            input_len=10,
            output_len=20,
            num_prompts=2,
            hf_max_batch_size=4,
            lora_path=None,
            random_range_ratio=0.0,
            prefix_len=0,
        )
        tokenizer = MagicMock()
        tokenizer.vocab_size = 10000  # use a realistic vocabulary size
        tokenizer.num_special_tokens_to_add.return_value = 0  # no special tokens added

        requests = get_requests(args, tokenizer)
        self.assertEqual(len(requests), 2)

    def test_validate_args(self):
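        """validate_args should default the tokenizer to the model path."""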
        # Test basic validation
        args = argparse.Namespace(
            backend="fastdeploy",
            dataset_name="random",
            dataset=None,
            dataset_path=None,
            input_len=10,
            output_len=20,
            tokenizer=None,
            model="test_model",
            hf_max_batch_size=None,
            trust_remote_code=False,
            quantization=None,
        )
        validate_args(args)
        self.assertEqual(args.tokenizer, "test_model")

    def test_add_cli_args(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        self.assertEqual(args.backend, "fastdeploy")
        self.assertEqual(args.dataset_name, "random")
@patch("fastdeploy.benchmarks.throughput.run_fd")
|
|
@patch("fastdeploy.benchmarks.throughput.get_requests")
|
|
@patch("transformers.AutoTokenizer.from_pretrained")
|
|
def test_main_fastdeploy(self, mock_tokenizer, mock_get_requests, mock_run_fd):
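        """main() with the fastdeploy backend should invoke run_fd and print a report."""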
        mock_get_requests.return_value = [
            SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None)
        ]
        mock_run_fd.return_value = (1.0, ["output1", "output2"])

        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"
        args.dataset_name = "random"
        args.dataset_path = None
        args.seed = 42
        args.input_len = 10
        args.output_len = 20
        args.num_prompts = 1
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        args.n = 1
        args.hf_max_batch_size = None
        args.trust_remote_code = False
        args.output_json = None
        args.disable_detokenize = False
        args.tensor_parallel_size = 1

        with patch("builtins.print") as mock_print:
            main(args)
            mock_print.assert_called()

    @unittest.skipIf(not TORCH_AVAILABLE, "PyTorch is not available")
    @patch("fastdeploy.benchmarks.throughput.run_hf")
    @patch("fastdeploy.benchmarks.throughput.get_requests")
    @patch("transformers.AutoTokenizer.from_pretrained")
    @patch("transformers.AutoModelForCausalLM.from_pretrained")
    def test_main_hf(self, mock_model, mock_tokenizer, mock_get_requests, mock_run_hf):
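        """main() with the hf backend should invoke run_hf and print a report."""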
        mock_get_requests.return_value = [
            SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None)
        ]
        mock_run_hf.return_value = 1.0

        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "hf"
        args.dataset_name = "random"
        args.dataset_path = None
        args.seed = 42
        args.input_len = 10
        args.output_len = 20
        args.num_prompts = 1
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        args.n = 1
        args.hf_max_batch_size = 4
        args.trust_remote_code = True
        args.output_json = None
        args.disable_detokenize = False
        args.tensor_parallel_size = 1

        with patch("builtins.print") as mock_print:
            main(args)
            mock_print.assert_called()
@patch("fastdeploy.benchmarks.throughput.run_fd_chat")
|
|
@patch("fastdeploy.benchmarks.throughput.get_requests")
|
|
@patch("transformers.AutoTokenizer.from_pretrained")
|
|
def test_main_fastdeploy_chat(self, mock_tokenizer, mock_get_requests, mock_run_fd_chat):
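        """main() with the fastdeploy-chat backend should invoke run_fd_chat and print a report."""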
        mock_get_requests.return_value = [
            SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None)
        ]
        mock_run_fd_chat.return_value = (1.0, ["output1", "output2"])

        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy-chat"
        args.dataset_name = "random"
        args.dataset_path = None
        args.seed = 42
        args.input_len = 10
        args.output_len = 20
        args.num_prompts = 1
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        args.n = 1
        args.hf_max_batch_size = None
        args.trust_remote_code = False
        args.output_json = None
        args.disable_detokenize = False
        args.tensor_parallel_size = 1

        with patch("builtins.print") as mock_print:
            main(args)
            mock_print.assert_called()
@patch("builtins.open")
|
|
@patch("json.dump")
|
|
@patch("fastdeploy.benchmarks.throughput.run_fd")
|
|
@patch("fastdeploy.benchmarks.throughput.get_requests")
|
|
def test_main_with_output_json(self, mock_get_requests, mock_run_fd, mock_json_dump, mock_open):
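        """main() should dump a JSON results file when output_json is set."""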
        mock_get_requests.return_value = [
            SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None)
        ]
        mock_run_fd.return_value = (1.0, ["output1", "output2"])

        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"
        args.dataset_name = "random"
        args.dataset_path = None
        args.seed = 42
        args.input_len = 10
        args.output_len = 20
        args.num_prompts = 1
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        args.n = 1
        args.hf_max_batch_size = None
        args.trust_remote_code = False
        args.output_json = "output.json"
        args.disable_detokenize = False
        args.tensor_parallel_size = 1

        main(args)
        mock_json_dump.assert_called()

    # Additional test cases covering previously untested lines
    def test_validate_args_with_lora(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"  # LoRA is only supported by the vLLM backend
        args.dataset_name = "random"
        args.enable_lora = True
        args.lora_path = "/path/to/lora"
        args.input_len = 10
        args.output_len = 20
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_validate_args_with_hf_backend(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "hf"
        args.dataset_name = "random"
        args.hf_max_batch_size = 4
        args.input_len = 10
        args.output_len = 20
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_validate_args_with_quantization(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"
        args.dataset_name = "random"
        args.quantization = "w4a8"
        args.input_len = 10
        args.output_len = 20
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)
@patch("fastdeploy.benchmarks.throughput.write_to_json")
|
|
@patch("fastdeploy.benchmarks.throughput.convert_to_pytorch_benchmark_format")
|
|
def test_save_to_pytorch_benchmark_format(self, mock_convert, mock_write):
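        """save_to_pytorch_benchmark_format should convert the results and write them out."""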
        args = argparse.Namespace(
            output_json="test.json",
            model="test_model",
            input_len=10,
            output_len=20,
            backend="fastdeploy",
        )
        results = {
            "elapsed_time": 1.0,
            "num_requests": 10,
            "total_num_tokens": 100,
            "requests_per_second": 10.0,
            "tokens_per_second": 100.0,
        }
        mock_convert.return_value = [{"metrics": {"requests_per_second": 10.0}}]
        from fastdeploy.benchmarks.throughput import save_to_pytorch_benchmark_format

        save_to_pytorch_benchmark_format(args, results)
        mock_write.assert_called()
@patch("fastdeploy.benchmarks.throughput.run_fd")
|
|
@patch("fastdeploy.benchmarks.throughput.get_requests")
|
|
def test_main_with_disable_detokenize(self, mock_get_requests, mock_run_fd):
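        """main() should still print a report when detokenization is disabled."""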
        mock_get_requests.return_value = [
            SampleRequest(no=1, prompt="test", prompt_len=10, expected_output_len=20, history_QA=[], json_data=None)
        ]
        mock_run_fd.return_value = (1.0, ["output1", "output2"])

        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"
        args.dataset_name = "random"
        args.dataset_path = None
        args.seed = 42
        args.input_len = 10
        args.output_len = 20
        args.num_prompts = 1
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        args.n = 1
        args.hf_max_batch_size = None
        args.trust_remote_code = False
        args.output_json = None
        args.disable_detokenize = True
        args.tensor_parallel_size = 1

        with patch("builtins.print") as mock_print:
            main(args)
            mock_print.assert_called()

    def test_validate_args_with_random_range_ratio(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"
        args.dataset_name = "random"
        args.random_range_ratio = 0.5
        args.input_len = 10
        args.output_len = 20
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_validate_args_with_prefix_len(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"
        args.dataset_name = "random"
        args.prefix_len = 5
        args.input_len = 10
        args.output_len = 20
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_validate_args_with_eb_dataset(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy"
        args.dataset_name = "EB"
        args.dataset_path = "/path/to/eb"
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_validate_args_with_ebchat_dataset(self):
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        args = parser.parse_args([])
        args.backend = "fastdeploy-chat"
        args.dataset_name = "EBChat"
        args.dataset_path = "/path/to/ebchat"
        args.tokenizer = "test_tokenizer"
        args.model = "test_model"
        validate_args(args)

    def test_add_cli_args_with_all_options(self):
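        """Every CLI option should parse to the expected attribute value."""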
        parser = argparse.ArgumentParser()
        add_cli_args(parser)
        # Use parse_known_args to avoid a SystemExit on unrecognized arguments
        args, _ = parser.parse_known_args(
            [
                "--backend",
                "fastdeploy-chat",
                "--dataset-name",
                "EBChat",
                "--dataset-path",
                "/path/to/dataset",
                "--input-len",
                "10",
                "--output-len",
                "20",
                "--n",
                "2",
                "--num-prompts",
                "50",
                "--hf-max-batch-size",
                "4",
                "--output-json",
                "output.json",
                "--disable-detokenize",
                "--lora-path",
                "/path/to/lora",
                "--prefix-len",
                "5",
                "--random-range-ratio",
                "0.5",
            ]
        )
        self.assertEqual(args.backend, "fastdeploy-chat")
        self.assertEqual(args.dataset_name, "EBChat")
        self.assertEqual(args.dataset_path, "/path/to/dataset")
        self.assertEqual(args.input_len, 10)
        self.assertEqual(args.output_len, 20)
        self.assertEqual(args.n, 2)
        self.assertEqual(args.num_prompts, 50)
        self.assertEqual(args.hf_max_batch_size, 4)
        self.assertEqual(args.output_json, "output.json")
        self.assertTrue(args.disable_detokenize)
        self.assertEqual(args.lora_path, "/path/to/lora")
        self.assertEqual(args.prefix_len, 5)
        self.assertEqual(args.random_range_ratio, 0.5)


if __name__ == "__main__":
    unittest.main()