diff --git a/benchmarks/yaml/GLM45-air-32k-bf16.yaml b/benchmarks/yaml/GLM45-air-32k-bf16.yaml new file mode 100644 index 000000000..b14dce761 --- /dev/null +++ b/benchmarks/yaml/GLM45-air-32k-bf16.yaml @@ -0,0 +1,5 @@ +max_model_len: 32768 +max_num_seqs: 128 +tensor_parallel_size: 4 +use_cudagraph: True +load_choices: "default_v1" diff --git a/benchmarks/yaml/GLM45-air-32k-wfp8afp8.yaml b/benchmarks/yaml/GLM45-air-32k-wfp8afp8.yaml new file mode 100644 index 000000000..5e4afe79e --- /dev/null +++ b/benchmarks/yaml/GLM45-air-32k-wfp8afp8.yaml @@ -0,0 +1,6 @@ +max_model_len: 32768 +max_num_seqs: 128 +tensor_parallel_size: 4 +use_cudagraph: True +load_choices: "default_v1" +quantization: wfp8afp8 diff --git a/benchmarks/yaml/request_yaml/GLM-32k.yaml b/benchmarks/yaml/request_yaml/GLM-32k.yaml new file mode 100644 index 000000000..c70bb5af6 --- /dev/null +++ b/benchmarks/yaml/request_yaml/GLM-32k.yaml @@ -0,0 +1,8 @@ +top_p: 0.95 +temperature: 0.6 +metadata: + min_tokens: 1 +max_tokens: 12288 +repetition_penalty: 1.0 +frequency_penalty: 0 +presence_penalty: 0