mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[unitest]clean code (#5094)
This commit is contained in:
@@ -50,7 +50,9 @@ from fastdeploy.model_executor.layers.rotary_embedding import get_rope
|
||||
from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_Attention
|
||||
from fastdeploy.model_executor.ops.gpu import get_padding_offset
|
||||
|
||||
os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")
|
||||
if "nvidia graphics device" in paddle.device.cuda.get_device_name().lower():
|
||||
# (ZKK): CI machine.
|
||||
os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")
|
||||
|
||||
|
||||
class TestAttentionPerformance(unittest.TestCase):
|
||||
@@ -119,10 +121,10 @@ class TestAttentionPerformance(unittest.TestCase):
|
||||
"dtype": "bfloat16",
|
||||
"hidden_size": 4096,
|
||||
"max_position_embeddings": 131072,
|
||||
"max_model_len": 2 * (9000 + 128),
|
||||
"max_model_len": 5500,
|
||||
"num_attention_heads": 32,
|
||||
"num_key_value_heads": 4,
|
||||
"num_hidden_layers": 39,
|
||||
"num_hidden_layers": 5,
|
||||
}
|
||||
model_dir = tempfile.mkdtemp(prefix="tmp_model_config_")
|
||||
config_path = os.path.join(model_dir, "config.json")
|
||||
@@ -293,7 +295,6 @@ class TestAttentionPerformance(unittest.TestCase):
|
||||
# Test parameters
|
||||
test_steps = 100
|
||||
prefill_batch_size = 1
|
||||
decode_batch_size = 100 # This can be configured as needed
|
||||
prefill_seq_len = 4096
|
||||
use_dynamic_quant = True
|
||||
act_tensor_dtype = paddle.bfloat16
|
||||
@@ -317,8 +318,7 @@ class TestAttentionPerformance(unittest.TestCase):
|
||||
|
||||
paddle.device.synchronize()
|
||||
|
||||
import paddle.profiler as profiler
|
||||
|
||||
# import paddle.profiler as profiler
|
||||
# p = profiler.Profiler(
|
||||
# targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
|
||||
# on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
|
||||
@@ -341,56 +341,57 @@ class TestAttentionPerformance(unittest.TestCase):
|
||||
|
||||
# p.stop()
|
||||
|
||||
decode_hidden_states = paddle.randn(
|
||||
[decode_batch_size, self.fd_config.model_config.hidden_size], dtype=act_tensor_dtype
|
||||
)
|
||||
# p = profiler.Profiler(
|
||||
# targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
|
||||
# on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
|
||||
# )
|
||||
|
||||
forward_meta = self.create_forward_meta(
|
||||
batch_size=decode_batch_size,
|
||||
seq_len=5000,
|
||||
mode=ForwardMode.DECODE,
|
||||
fd_config=self.fd_config,
|
||||
attn_backend=self.attn_backend,
|
||||
use_dynamic_quant=use_dynamic_quant,
|
||||
)
|
||||
# p.start()
|
||||
# p.step()
|
||||
|
||||
self.attn_backend.init_attention_metadata(forward_meta)
|
||||
for decode_batch_size in [10, 20, 40, 60, 80, 100, 128]:
|
||||
decode_hidden_states = paddle.randn(
|
||||
[decode_batch_size, self.fd_config.model_config.hidden_size], dtype=act_tensor_dtype
|
||||
)
|
||||
|
||||
p = profiler.Profiler(
|
||||
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
|
||||
on_trace_ready=profiler.export_chrome_tracing("./profile_log"),
|
||||
)
|
||||
forward_meta = self.create_forward_meta(
|
||||
batch_size=decode_batch_size,
|
||||
seq_len=5000,
|
||||
mode=ForwardMode.DECODE,
|
||||
fd_config=self.fd_config,
|
||||
attn_backend=self.attn_backend,
|
||||
use_dynamic_quant=use_dynamic_quant,
|
||||
)
|
||||
|
||||
p.start()
|
||||
p.step()
|
||||
self.attn_backend.init_attention_metadata(forward_meta)
|
||||
|
||||
paddle.device.synchronize()
|
||||
paddle.device.synchronize()
|
||||
|
||||
# A warm-up run is required first, because the preprocessing has been moved into the first layer!
|
||||
self.attn_forward(forward_meta, decode_hidden_states)
|
||||
# A warm-up run is required first, because the preprocessing has been moved into the first layer!
|
||||
self.attn_forward(forward_meta, decode_hidden_states)
|
||||
|
||||
attn_cuda_graphs = graphs.CUDAGraph()
|
||||
attn_cuda_graphs.capture_begin()
|
||||
attn_cuda_graphs = graphs.CUDAGraph()
|
||||
attn_cuda_graphs.capture_begin()
|
||||
|
||||
self.attn_forward(forward_meta, decode_hidden_states)
|
||||
self.attn_forward(forward_meta, decode_hidden_states)
|
||||
|
||||
attn_cuda_graphs.capture_end()
|
||||
attn_cuda_graphs.capture_end()
|
||||
|
||||
start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
|
||||
end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
|
||||
for i in range(test_steps):
|
||||
start_events[i].record()
|
||||
start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
|
||||
end_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(test_steps)]
|
||||
for i in range(test_steps):
|
||||
start_events[i].record()
|
||||
|
||||
attn_cuda_graphs.replay()
|
||||
# self.attn_forward(forward_meta, decode_hidden_states)
|
||||
attn_cuda_graphs.replay()
|
||||
# self.attn_forward(forward_meta, decode_hidden_states)
|
||||
|
||||
end_events[i].record()
|
||||
paddle.device.synchronize()
|
||||
end_events[i].record()
|
||||
paddle.device.synchronize()
|
||||
|
||||
times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
|
||||
print(times[-5:])
|
||||
times = np.array([round(s.elapsed_time(e), 1) for s, e in zip(start_events, end_events)])[1:]
|
||||
print(times[-5:])
|
||||
|
||||
p.stop()
|
||||
# p.stop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -39,6 +39,9 @@ from fastdeploy.scheduler import SchedulerConfig
|
||||
from fastdeploy.worker.worker_process import init_distributed_environment
|
||||
|
||||
paddle.set_default_dtype("bfloat16")
|
||||
if "nvidia graphics device" in paddle.device.cuda.get_device_name().lower():
|
||||
# (ZKK): CI machine.
|
||||
os.environ.setdefault("DG_NVCC_OVERRIDE_CPP_STANDARD", "17")
|
||||
|
||||
|
||||
class FFNWrapper(paddle.nn.Layer):
|
||||
@@ -46,7 +49,7 @@ class FFNWrapper(paddle.nn.Layer):
|
||||
super().__init__()
|
||||
self.model_config = model_config
|
||||
|
||||
self.intermediate_size = 3584
|
||||
self.intermediate_size = self.model_config.intermediate_size
|
||||
self.hidden_size = self.model_config.hidden_size
|
||||
self.prefix = "hahahha"
|
||||
self.fd_config = FDConfig(
|
||||
@@ -94,10 +97,9 @@ class FFNWrapper(paddle.nn.Layer):
|
||||
class TestFusedMoE(unittest.TestCase):
|
||||
def setUp(self) -> None:
|
||||
self.architectures = ["Ernie4_5_MoeForCausalLM"]
|
||||
self.hidden_size = 7168
|
||||
self.moe_intermediate_size = 1
|
||||
self.moe_num_experts = 1
|
||||
self.moe_k = 1
|
||||
self.hidden_size = 4096
|
||||
self.intermediate_size = 2048
|
||||
self.num_layers = 1
|
||||
self.hidden_act = "silu"
|
||||
self.num_attention_heads = 64
|
||||
self.model_config = self.build_model_config()
|
||||
@@ -115,9 +117,7 @@ class TestFusedMoE(unittest.TestCase):
|
||||
config_dict = {
|
||||
"architectures": self.architectures,
|
||||
"hidden_size": self.hidden_size,
|
||||
"moe_intermediate_size": self.moe_intermediate_size,
|
||||
"moe_num_experts": self.moe_num_experts,
|
||||
"moe_k": self.moe_k,
|
||||
"intermediate_size": self.intermediate_size,
|
||||
"hidden_act": self.hidden_act,
|
||||
"num_attention_heads": self.num_attention_heads,
|
||||
"dtype": "bfloat16",
|
||||
@@ -135,20 +135,17 @@ class TestFusedMoE(unittest.TestCase):
|
||||
|
||||
ffn = FFNWrapper(self.model_config)
|
||||
|
||||
# (ZKK): disable this test,
|
||||
# because the CI machine does not support deepgemm blockwise_fp8 (it fails with a compilation error).
|
||||
return
|
||||
|
||||
moe_cuda_graphs = [None] * 100
|
||||
cache_hidden_states = [None] * 100
|
||||
for idx, num_tokens in enumerate([10, 20, 40, 60, 80, 100, 128, 160, 192, 256, 512, 1024, 2048, 4096]):
|
||||
test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256, 4096, 4096 * 4]
|
||||
for idx, num_tokens in enumerate(test_token_nums):
|
||||
|
||||
cache_hidden_states[idx] = paddle.rand((num_tokens, self.model_config.hidden_size), dtype=paddle.bfloat16)
|
||||
|
||||
moe_cuda_graphs[idx] = graphs.CUDAGraph()
|
||||
moe_cuda_graphs[idx].capture_begin()
|
||||
|
||||
num_layers = 80
|
||||
num_layers = self.num_layers
|
||||
for _ in range(num_layers):
|
||||
out = ffn.ffn(cache_hidden_states[idx])
|
||||
|
||||
|
||||
@@ -535,12 +535,12 @@ class FuseMoEWrapper(paddle.nn.Layer):
|
||||
class TestFusedMoE(unittest.TestCase):
|
||||
def setUp(self) -> None:
|
||||
self.architectures = ["Ernie4_5_MoeForCausalLM"]
|
||||
self.hidden_size = 7168
|
||||
self.moe_intermediate_size = 3584
|
||||
self.moe_num_experts = 64
|
||||
self.hidden_size = 4096
|
||||
self.moe_intermediate_size = 2048
|
||||
self.moe_num_experts = 160
|
||||
self.moe_k = 8
|
||||
self.hidden_act = "silu"
|
||||
self.num_attention_heads = 64
|
||||
self.num_layers = 2
|
||||
self.num_attention_heads = -1
|
||||
self.model_config = self.build_model_config()
|
||||
|
||||
def build_model_config(self) -> ModelConfig:
|
||||
@@ -559,7 +559,6 @@ class TestFusedMoE(unittest.TestCase):
|
||||
"moe_intermediate_size": self.moe_intermediate_size,
|
||||
"moe_num_experts": self.moe_num_experts,
|
||||
"moe_k": self.moe_k,
|
||||
"hidden_act": self.hidden_act,
|
||||
"num_attention_heads": self.num_attention_heads,
|
||||
"dtype": "bfloat16",
|
||||
}
|
||||
@@ -590,16 +589,18 @@ class TestFusedMoE(unittest.TestCase):
|
||||
# This line must be kept; otherwise uniformity is affected!
|
||||
paddle.seed(ep_rank + 100)
|
||||
|
||||
num_layers = 80
|
||||
real_weight_layers = 20
|
||||
num_layers = self.num_layers
|
||||
real_weight_layers = num_layers // 2
|
||||
fused_moe = [None] * real_weight_layers
|
||||
for i in range(real_weight_layers):
|
||||
fused_moe[i] = FuseMoEWrapper(self.model_config, tp_size, tp_rank, ep_size, ep_rank, nnodes=nnodes)
|
||||
|
||||
moe_cuda_graphs = [None] * 100
|
||||
cache_hidden_states = [None] * 100
|
||||
test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256]
|
||||
# test_token_nums = [1024 * i for i in [1,2,4,8,16,32]]
|
||||
is_decoder = fused_moe[0].fd_config.model_config.moe_phase.phase == "decode"
|
||||
test_token_nums = [4096 * i for i in [1, 2, 4, 8]]
|
||||
if is_decoder:
|
||||
test_token_nums = [10, 20, 40, 60, 80, 100, 128, 160, 192, 256]
|
||||
for idx, num_tokens in enumerate(test_token_nums):
|
||||
|
||||
cache_hidden_states[idx] = paddle.rand((num_tokens, self.model_config.hidden_size), dtype=paddle.bfloat16)
|
||||
@@ -610,12 +611,14 @@ class TestFusedMoE(unittest.TestCase):
|
||||
|
||||
return out
|
||||
|
||||
moe_cuda_graphs[idx] = graphs.CUDAGraph()
|
||||
moe_cuda_graphs[idx].capture_begin()
|
||||
if is_decoder:
|
||||
moe_cuda_graphs[idx] = graphs.CUDAGraph()
|
||||
moe_cuda_graphs[idx].capture_begin()
|
||||
|
||||
fake_model_run()
|
||||
|
||||
moe_cuda_graphs[idx].capture_end()
|
||||
if is_decoder:
|
||||
moe_cuda_graphs[idx].capture_end()
|
||||
|
||||
num_tests = 20
|
||||
start_events = [paddle.device.cuda.Event(enable_timing=True) for _ in range(num_tests)]
|
||||
@@ -623,7 +626,10 @@ class TestFusedMoE(unittest.TestCase):
|
||||
for i in range(num_tests):
|
||||
start_events[i].record()
|
||||
|
||||
moe_cuda_graphs[idx].replay()
|
||||
if is_decoder:
|
||||
moe_cuda_graphs[idx].replay()
|
||||
else:
|
||||
fake_model_run()
|
||||
|
||||
end_events[i].record()
|
||||
paddle.device.cuda.synchronize()
|
||||
|
||||
Reference in New Issue
Block a user