diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml index d621249d6..851c0648b 100644 --- a/.github/workflows/_base_test.yml +++ b/.github/workflows/_base_test.yml @@ -206,6 +206,13 @@ jobs: check_service 90 python -m pytest -sv test_max_waiting_time.py || TEST_EXIT_CODE=1 + curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \ + -H "Content-Type: application/json" \ + -d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"ernie45t_21b_sot.yaml\", \"--enable-logprob\": \"False\"}" + check_service 360 + export TEMPLATE=TOKEN_NORMAL + python -m pytest -sv test_seed_usage.py -k "not test_seed_stream" || TEST_EXIT_CODE=1 + curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \ -H "Content-Type: application/json" \ -d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"ernie45t_21b_cinn.yaml\", \"--enable-logprob\": \"False\"}" diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py index 0be9564d2..334191da3 100644 --- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py @@ -244,8 +244,6 @@ class AppendAttentionBackend(AttentionBackend): # 128 is qwen3 # 32 is glm assert forward_meta.rotary_embs.shape[4] in [128, 32] - else: - assert forward_meta.rotary_embs.shape == [2, 1, self.max_seq_len, 1, 64] if self.pd_disaggregation_mode == "per_query": metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise( diff --git a/tests/ce/deploy/ernie45t_21b_sot.yaml b/tests/ce/deploy/ernie45t_21b_sot.yaml new file mode 100644 index 000000000..46142bf61 --- /dev/null +++ b/tests/ce/deploy/ernie45t_21b_sot.yaml @@ -0,0 +1,8 @@ +max_model_len: 32768 +max_num_seqs: 128 +tensor_parallel_size: 1 +quantization: wint4 +graph_optimization_config: + graph_opt_level: 1 + sot_warmup_sizes: [2,16,32,64] + use_cudagraph: True