diff --git a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py
index 98057c9e2..9e0f4b93e 100644
--- a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py
+++ b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py
@@ -130,7 +130,6 @@ class GraphOptBackend:
         )

     def __call__(self, **kwargs):
-        self._debug_count_total_step += 1
        if not self.fd_config.graph_opt_config.use_cudagraph:
            return self.runnable(**kwargs)
        if self.cudagraph_piecewise_backend is None:
@@ -140,6 +139,9 @@ class GraphOptBackend:
        assert kwargs["forward_meta"].ids_remove_padding is not None
        real_shape = kwargs["forward_meta"].ids_remove_padding.shape[0]

+        if real_shape > 0:
+            # Only count steps that carry an actual load.
+            self._debug_count_total_step += 1
        if (not kwargs["forward_meta"].step_use_cudagraph) or (real_shape > self.cudagraph_switch_threshold):
            return self.dy_runnable(**kwargs)

diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
index 78dddc93f..0be9564d2 100644
--- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
@@ -233,9 +233,20 @@ class AppendAttentionBackend(AttentionBackend):
            forward_mixed
        """
        metadata = self.attention_metadata
-        sliding_window = layer.sliding_window
+        if self.rope_3d:
+            assert len(forward_meta.rotary_embs.shape) == 6
+        else:
+            assert len(forward_meta.rotary_embs.shape) == 5
+            if layer.use_neox_rotary_style:
+                assert forward_meta.rotary_embs.shape[0:4] == [2, 1, self.max_seq_len, 1]
+                # 128 is Qwen3
+                # 32 is GLM
+                assert forward_meta.rotary_embs.shape[4] in [128, 32]
+            else:
+                assert forward_meta.rotary_embs.shape == [2, 1, self.max_seq_len, 1, 64]
+
        if self.pd_disaggregation_mode == "per_query":
            metadata.kv_signal_data_list[layer.layer_id] = init_signal_layerwise(
                metadata.kv_signal_metadata,
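
Note (illustration only): below is a minimal, self-contained Python sketch of the dispatch behaviour the first two hunks produce in GraphOptBackend.__call__. All names in the sketch (GraphOptBackendSketch, ForwardMetaStub, the string return values) are simplified stand-ins invented for this example, not the real FastDeploy classes.

# Hedged sketch: recreates only the control flow of the patched __call__,
# with stubs in place of the actual config, forward_meta, and runnables.
from dataclasses import dataclass


@dataclass
class ForwardMetaStub:
    ids_remove_padding_len: int  # stands in for ids_remove_padding.shape[0]
    step_use_cudagraph: bool


class GraphOptBackendSketch:
    def __init__(self, use_cudagraph: bool, switch_threshold: int):
        self.use_cudagraph = use_cudagraph
        self.cudagraph_switch_threshold = switch_threshold
        self._debug_count_total_step = 0

    def __call__(self, forward_meta: ForwardMetaStub) -> str:
        if not self.use_cudagraph:
            return "runnable"  # eager path, CUDA graphs disabled
        real_shape = forward_meta.ids_remove_padding_len
        if real_shape > 0:
            # Only count steps that carry an actual load (the change in hunk 2).
            self._debug_count_total_step += 1
        if (not forward_meta.step_use_cudagraph) or (real_shape > self.cudagraph_switch_threshold):
            return "dy_runnable"  # dynamic-shape fallback
        return "cudagraph_runnable"  # captured/replayed CUDA graph path


backend = GraphOptBackendSketch(use_cudagraph=True, switch_threshold=256)
print(backend(ForwardMetaStub(ids_remove_padding_len=0, step_use_cudagraph=True)))    # empty step: not counted
print(backend(ForwardMetaStub(ids_remove_padding_len=512, step_use_cudagraph=True)))  # over threshold: counted, falls back
print(backend._debug_count_total_step)  # 1

The effect of moving the increment is visible in the sketch: steps with real_shape == 0 no longer inflate _debug_count_total_step, and because the counter now sits after the early returns, it is only reached on the CUDA-graph code path.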