mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Graph Optimization][BugFix][CI] Fix 0size bug && add unit test (#5495)
This commit is contained in:
7
.github/workflows/_base_test.yml
vendored
7
.github/workflows/_base_test.yml
vendored
@@ -206,6 +206,13 @@ jobs:
|
||||
check_service 90
|
||||
python -m pytest -sv test_max_waiting_time.py || TEST_EXIT_CODE=1
|
||||
|
||||
curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
|
||||
-H "Content-Type: application/json" \
|
||||
-d "{\"--model\": \"/MODELDATA/ernie-4_5-21b-a3b-bf16-paddle\", \"--config\": \"ernie45t_21b_cinn.yaml\", \"--enable-logprob\": \"False\"}"
|
||||
check_service 360
|
||||
export TEMPLATE=TOKEN_NORMAL
|
||||
python -m pytest -sv test_seed_usage.py -k "not test_seed_stream" || TEST_EXIT_CODE=1
|
||||
|
||||
export TEMPLATE=TOKEN_NORMAL
|
||||
curl -X POST http://0.0.0.0:${FLASK_PORT}/switch \
|
||||
-H "Content-Type: application/json" \
|
||||
|
||||
@@ -149,6 +149,9 @@ class ForwardMeta:
|
||||
moe_num_chunk: int = 1
|
||||
max_moe_num_chunk: int = 1
|
||||
|
||||
# for zero size
|
||||
is_zero_size: bool = False
|
||||
|
||||
def clear_caches(self):
|
||||
"""Safely clean up the caches"""
|
||||
if self.caches:
|
||||
|
||||
@@ -288,7 +288,7 @@ class VocabParallelEmbedding(nn.Layer):
|
||||
h2d_copy(param[:, : shard_weight.shape[1]], shard_weight)
|
||||
param[:, shard_weight.shape[1] :].fill_(0)
|
||||
|
||||
def forward(self, ids_remove_padding=None) -> paddle.Tensor:
|
||||
def forward(self, ids_remove_padding, forward_meta=None) -> paddle.Tensor:
|
||||
"""
|
||||
Defines the forward computation of the layer.
|
||||
|
||||
@@ -299,7 +299,7 @@ class VocabParallelEmbedding(nn.Layer):
|
||||
Returns:
|
||||
Tensor: Embedded tensor representation of the input IDs.
|
||||
"""
|
||||
if ids_remove_padding.shape[0] == 0:
|
||||
if forward_meta is not None and forward_meta.is_zero_size:
|
||||
return paddle.empty([0, self.embedding_dim], dtype=self.embeddings.weight.dtype)
|
||||
if self.column_cut:
|
||||
input_embedings = self.embeddings(ids_remove_padding)
|
||||
|
||||
@@ -448,7 +448,7 @@ class Ernie4_5_Model(nn.Layer):
|
||||
ids_remove_padding: paddle.Tensor,
|
||||
forward_meta: ForwardMeta,
|
||||
):
|
||||
hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding)
|
||||
hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta)
|
||||
|
||||
if current_platform.is_iluvatar() and forward_meta.attn_backend.mixed:
|
||||
hidden_states = forward_meta.attn_backend.transpose(hidden_states)
|
||||
|
||||
@@ -1541,6 +1541,9 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
for attn_backend in self.attn_backends:
|
||||
attn_backend.init_attention_metadata(self.forward_meta)
|
||||
|
||||
# for zero size
|
||||
self.forward_meta.is_zero_size = self.forward_meta.ids_remove_padding.shape[0] == 0
|
||||
|
||||
def initialize_kv_cache(self, profile: bool = False) -> None:
|
||||
"""
|
||||
Initialize kv cache
|
||||
|
||||
8
tests/ce/deploy/ernie45t_21b_cinn.yaml
Normal file
8
tests/ce/deploy/ernie45t_21b_cinn.yaml
Normal file
@@ -0,0 +1,8 @@
|
||||
max_model_len: 32768
|
||||
max_num_seqs: 128
|
||||
tensor_parallel_size: 1
|
||||
quantization: wint4
|
||||
graph_optimization_config:
|
||||
graph_opt_level: 2
|
||||
sot_warmup_sizes: [2,16,32,64]
|
||||
use_cudagraph: True
|
||||
@@ -91,7 +91,7 @@ def setup_and_run_server():
|
||||
"--reasoning-parser",
|
||||
"ernie-45-vl",
|
||||
"--graph-optimization-config",
|
||||
'{"graph_opt_level": 1, "use_cudagraph": true, "full_cuda_graph": false}',
|
||||
'{"graph_opt_level": 2, "use_cudagraph": true, "full_cuda_graph": false}',
|
||||
]
|
||||
|
||||
# Start subprocess in new process group
|
||||
|
||||
Reference in New Issue
Block a user