From e58fed36656e3860db27eb2141be234be6656c7d Mon Sep 17 00:00:00 2001 From: Ryan Date: Thu, 11 Dec 2025 16:25:26 +0800 Subject: [PATCH] [Graph Optimization][BugFix][CI] Fix 0size bug && add unitest (#5495) --- .github/workflows/_base_test.yml | 7 +++++++ fastdeploy/model_executor/forward_meta.py | 3 +++ fastdeploy/model_executor/layers/embeddings.py | 4 ++-- fastdeploy/model_executor/models/ernie4_5_moe.py | 2 +- fastdeploy/worker/gpu_model_runner.py | 3 +++ tests/ce/deploy/ernie45t_21b_cinn.yaml | 8 ++++++++ tests/e2e/test_EB_VL_Lite_sot_serving.py | 2 +- 7 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 tests/ce/deploy/ernie45t_21b_cinn.yaml diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml index d5dffb02d..d621249d6 100644 --- a/.github/workflows/_base_test.yml +++ b/.github/workflows/_base_test.yml @@ -206,6 +206,13 @@ jobs: check_service 90 python -m pytest -sv test_max_waiting_time.py || TEST_EXIT_CODE=1 + curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \ + -H "Content-Type: application/json" \ + -d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"ernie45t_21b_cinn.yaml\", \"--enable-logprob\": \"False\"}" + check_service 360 + export TEMPLATE=TOKEN_NORMAL + python -m pytest -sv test_seed_usage.py -k "not test_seed_stream" || TEST_EXIT_CODE=1 + export TEMPLATE=TOKEN_NORMAL curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \ -H "Content-Type: application/json" \ diff --git a/fastdeploy/model_executor/forward_meta.py b/fastdeploy/model_executor/forward_meta.py index 787ec77c0..8534441f8 100644 --- a/fastdeploy/model_executor/forward_meta.py +++ b/fastdeploy/model_executor/forward_meta.py @@ -149,6 +149,9 @@ class ForwardMeta: moe_num_chunk: int = 1 max_moe_num_chunk: int = 1 + # for zero size + is_zero_size: bool = False + def clear_caches(self): """Safely clean up the caches""" if self.caches: diff --git a/fastdeploy/model_executor/layers/embeddings.py b/fastdeploy/model_executor/layers/embeddings.py index 52d7dadee..7d9bc2c6c 100644 --- a/fastdeploy/model_executor/layers/embeddings.py +++ b/fastdeploy/model_executor/layers/embeddings.py @@ -288,7 +288,7 @@ class VocabParallelEmbedding(nn.Layer): h2d_copy(param[:, : shard_weight.shape[1]], shard_weight) param[:, shard_weight.shape[1] :].fill_(0) - def forward(self, ids_remove_padding=None) -> paddle.Tensor: + def forward(self, ids_remove_padding, forward_meta=None) -> paddle.Tensor: """ Defines the forward computation of the layer. @@ -299,7 +299,7 @@ class VocabParallelEmbedding(nn.Layer): Returns: Tensor: Embedded tensor representation of the input IDs. """ - if ids_remove_padding.shape[0] == 0: + if forward_meta is not None and forward_meta.is_zero_size: return paddle.empty([0, self.embedding_dim], dtype=self.embeddings.weight.dtype) if self.column_cut: input_embedings = self.embeddings(ids_remove_padding) diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index 3c042e17f..87678a692 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -448,7 +448,7 @@ class Ernie4_5_Model(nn.Layer): ids_remove_padding: paddle.Tensor, forward_meta: ForwardMeta, ): - hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding) + hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta) if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed: hidden_states = forward_meta.attn_backend.transpose(hidden_states) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index ee64fb247..564893e8c 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1541,6 +1541,9 @@ class GPUModelRunner(ModelRunnerBase): for attn_backend in self.attn_backends: attn_backend.init_attention_metadata(self.forward_meta) + # for zero size + self.forward_meta.is_zero_size = self.forward_meta.ids_remove_padding.shape[0] == 0 + def initialize_kv_cache(self, profile: bool = False) -> None: """ Initialize kv cache diff --git a/tests/ce/deploy/ernie45t_21b_cinn.yaml b/tests/ce/deploy/ernie45t_21b_cinn.yaml new file mode 100644 index 000000000..50276f1b3 --- /dev/null +++ b/tests/ce/deploy/ernie45t_21b_cinn.yaml @@ -0,0 +1,8 @@ +max_model_len: 32768 +max_num_seqs: 128 +tensor_parallel_size: 1 +quantization: wint4 +graph_optimization_config: + graph_opt_level: 2 + sot_warmup_sizes: [2,16,32,64] + use_cudagraph: True diff --git a/tests/e2e/test_EB_VL_Lite_sot_serving.py b/tests/e2e/test_EB_VL_Lite_sot_serving.py index b21c99329..ece97aef0 100644 --- a/tests/e2e/test_EB_VL_Lite_sot_serving.py +++ b/tests/e2e/test_EB_VL_Lite_sot_serving.py @@ -91,7 +91,7 @@ def setup_and_run_server(): "--reasoning-parser", "ernie-45-vl", "--graph-optimization-config", - '{"graph_opt_level": 1, "use_cudagraph": true, "full_cuda_graph": false}', + '{"graph_opt_level": 2, "use_cudagraph": true, "full_cuda_graph": false}', ] # Start subprocess in new process group