import csv
import datetime
import os
import re
import sys

import pytz

# Filename patterns for the two kinds of benchmark logs.
# NOTE: the tail of the first pattern and the whole second pattern were garbled in the
# original text; the "(?<!_profile)\.log$" suffix and the literal tokens in the second
# pattern are reconstructed assumptions based on how the match groups are used below.
log_patterns = [
    # Fixed-length dataset: model_name, input_len, output_len, batch_size, num_prompts
    re.compile(
        r"benchmarkdata_(.+?)_inputlength_(\d+)_outputlength_(\d+)_batchsize_(\d+)_numprompts_(\d+)_.*(?<!_profile)\.log$"
    ),
    # ShareGPT dataset: model_name, num_prompts, max_concurrency
    re.compile(
        r"benchmarkdata_(.+?)_sharegpt_numprompts_(\d+)_maxconcurrency_(\d+)_.*(?<!_profile)\.log$"
    ),
]

# parse_benchmark_log_file, parse_profile_log_file, calculate_times and the `metrics`
# list of (column_name, pattern) pairs are expected to be defined earlier in this
# script; they are not shown in this section.


def main():
    # Log directory is the first CLI argument, defaulting to the current directory.
    if len(sys.argv) > 1:
        log_dir = sys.argv[1]
    else:
        log_dir = "."

    # Prefer natsort for natural ("human") ordering of filenames; fall back to a
    # simple digit-aware sort key if it is not installed.
    try:
        from natsort import natsorted

        natsort_available = True
    except ImportError:
        natsort_available = False

    all_files = set(os.listdir(log_dir))

    # Collect every file that matches one of the benchmark log patterns.
    files = []
    for f in all_files:
        for pat in log_patterns:
            if pat.match(f):
                files.append(f)
                break

    if natsort_available:
        files = natsorted(files)
    else:

        def natural_key(s):
            return [
                int(text) if text.isdigit() else text.lower()
                for text in re.split("([0-9]+)", s)
            ]

        files.sort(key=natural_key)

    rows = []
    for file in files:
        # Find the first pattern that matches this file.
        m = None
        matched_idx = -1
        for idx, pat in enumerate(log_patterns):
            m = pat.match(file)
            if m:
                matched_idx = idx
                break
        if not m:
            continue

        if matched_idx == 0:
            # model_name, input_len, output_len, batch_size, num_prompts
            model_name, input_len, output_len, batch_size, num_prompts = m.groups()
        elif matched_idx == 1:
            # model_name, num_prompts, max_concurrency
            model_name, num_prompts, max_concurrency = m.groups()
            input_len = "-"
            output_len = "-"

        # The per-stage profile log, if present, sits next to the benchmark log.
        if file.endswith(".log"):
            profile_file = file[:-4] + "_profile.log"
        else:
            profile_file = ""

        # Initialize all profile-derived columns so they stay empty when no profile
        # log exists (the original left prepare_input_average unset in that case).
        prepare_input_average = model_first = model_average = postprocessing_average = steppaddle_average = ""
        if profile_file in all_files:
            prepare_input_times, model_times, postprocessing_times, steppaddle_times = parse_profile_log_file(
                os.path.join(log_dir, profile_file)
            )
            _, pia = calculate_times(prepare_input_times, False)
            mf, ma = calculate_times(model_times, True)
            _, pa = calculate_times(postprocessing_times, False)
            _, sa = calculate_times(steppaddle_times, False)
            prepare_input_average = pia if pia is not None else ""
            model_first = mf if mf is not None else ""
            model_average = ma if ma is not None else ""
            postprocessing_average = pa if pa is not None else ""
            steppaddle_average = sa if sa is not None else ""

        data = parse_benchmark_log_file(os.path.join(log_dir, file))
        data["dataset"] = "Fixed-Length" if matched_idx == 0 else "ShareGPT"
        data["model_name"] = model_name
        data["input_length"] = input_len
        data["output_length"] = output_len
        data["batch_size"] = batch_size if matched_idx == 0 else max_concurrency
        data["num_prompts"] = num_prompts
        data["prepare_input_average"] = prepare_input_average
        data["model_execute_first"] = model_first
        data["model_execute_average"] = model_average
        data["postprocessing_execute_average"] = postprocessing_average
        data["steppaddle_execute_average"] = steppaddle_average
        rows.append(data)

    # Timestamp the output file in Asia/Shanghai time and include the log directory
    # name when it is meaningful.
    shanghai_tz = pytz.timezone("Asia/Shanghai")
    now = datetime.datetime.now(shanghai_tz)
    ts = now.strftime("%Y%m%d_%H%M%S")
    log_dir_name = os.path.basename(os.path.abspath(log_dir))
    if log_dir_name == "" or log_dir == "." or log_dir == "/":
        csv_filename = f"benchmark_summary_{ts}.csv"
    else:
        csv_filename = f"benchmark_summary_{log_dir_name}_{ts}.csv"

    fieldnames = (
        [
            "model_name",
            "dataset",
            "input_length",
            "output_length",
            "batch_size",
            "num_prompts",
        ]
        + [name for name, _ in metrics]
        + [
            "prepare_input_average",
            "model_execute_first",
            "model_execute_average",
            "postprocessing_execute_average",
            "steppaddle_execute_average",
        ]
    )

    with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)

    print(f"CSV saved as: {csv_filename}")


if __name__ == "__main__":
    print("Starting to parse logs...")
    main()
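# Usage sketch (the script filename below is illustrative, not part of the original):
#
#   python summarize_benchmark_logs.py /path/to/log/dir
#   python summarize_benchmark_logs.py              # scans the current directory
#
# The script matches benchmark .log files in the given directory, pairs each with its
# optional *_profile.log, and writes a benchmark_summary_*.csv into the working
# directory. It requires pytz and, optionally, natsort for nicer filename ordering.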