diff --git a/fastdeploy/worker/forward_meta.py b/fastdeploy/model_executor/forward_meta.py
similarity index 96%
rename from fastdeploy/worker/forward_meta.py
rename to fastdeploy/model_executor/forward_meta.py
index 4948821e6..ae3f092e4 100644
--- a/fastdeploy/worker/forward_meta.py
+++ b/fastdeploy/model_executor/forward_meta.py
@@ -18,11 +18,10 @@ import logging
 from dataclasses import dataclass
 from enum import IntEnum, auto
 from typing import TYPE_CHECKING, Optional
+from fastdeploy.model_executor.layers.attention import AttentionBackend
 
 import paddle
-
-if TYPE_CHECKING:
-    from fastdeploy.model_executor.layers.attention import AttentionBackend
+
 
 logger = logging.getLogger(__name__)
@@ -69,7 +68,7 @@ class ForwardMeta():
     is_decode_batch: bool = False
 
     # Attention backend object
-    attn_backend: 'AttentionBackend' = None
+    attn_backend: AttentionBackend = None
     # Forward mode used during attention
     forward_mode: ForwardMode = ForwardMode.MIXED
     # Attention mask
@@ -100,7 +99,7 @@ class ForwardMeta():
     # Block tables
     block_tables: Optional[paddle.Tensor] = None
     # KV caches
-    caches: Optional[paddle.Tensor] = None
+    caches: Optional[list[paddle.Tensor]] = None
 
     def clear_caches(self):
         """ Safely clean up the caches """
diff --git a/fastdeploy/model_executor/layers/attention/append_attn_backend.py b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
index 9210ff241..7821f95ac 100644
--- a/fastdeploy/model_executor/layers/attention/append_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/append_attn_backend.py
@@ -27,14 +27,13 @@ from fastdeploy.model_executor.layers.attention.ops import (
     init_signal_layerwise, open_shm_and_get_meta_signal)
 
 if TYPE_CHECKING:
-    from paddle._typing.dtype_like import _DTypeLiteral
+    from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend, AttentionMetadata)
 from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
-from fastdeploy.worker.forward_meta import ForwardMeta
 
 
 @dataclass
@@ -54,7 +53,7 @@ class AppendAttentionMetadata(AttentionMetadata):
     decoder_tile_ids_per_batch: paddle.Tensor = None
     decoder_num_blocks: paddle.Tensor = None
 
-    _dtype: _DTypeLiteral = paddle.bfloat16
+    _dtype: paddle.dtype = paddle.bfloat16
     encoder_max_partition_size: int = 32768
     max_partition_size: int = 32768
     block_tables: Optional[paddle.Tensor] = None
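The only substantive edits in the rename are in the hunks above: `ForwardMeta` now imports `AttentionBackend` at runtime, so the quoted forward reference can be dropped, and the `caches` annotation is corrected from a single tensor to the list of cache tensors that is actually stored. A condensed sketch of the resulting dataclass shape, limited to the fields visible above (wrapping `attn_backend` in `Optional` is an editorial clarification, not part of the diff):

```python
from dataclasses import dataclass
from typing import Optional

import paddle

from fastdeploy.model_executor.layers.attention import AttentionBackend


@dataclass
class ForwardMetaSketch:
    # A real runtime import makes the quoted 'AttentionBackend' string
    # annotation unnecessary.
    attn_backend: Optional[AttentionBackend] = None
    # KV caches travel as a list of tensors, so Optional[paddle.Tensor]
    # described the wrong shape.
    caches: Optional[list[paddle.Tensor]] = None
```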
""" -from typing import Dict, Optional +from __future__ import annotations + +from typing import TYPE_CHECKING, Dict, Optional import numpy as np import paddle @@ -24,7 +26,8 @@ from paddleformers.utils.log import logger from fastdeploy.config import FDConfig from fastdeploy.model_executor.layers.quantization.quant_base import \ QuantMethodBase -from fastdeploy.worker.forward_meta import ForwardMeta +if TYPE_CHECKING: + from fastdeploy.model_executor.forward_meta import ForwardMeta class Attention(nn.Layer): diff --git a/fastdeploy/model_executor/layers/attention/base_attention_backend.py b/fastdeploy/model_executor/layers/attention/base_attention_backend.py index 02d1d65db..4a442e5c3 100644 --- a/fastdeploy/model_executor/layers/attention/base_attention_backend.py +++ b/fastdeploy/model_executor/layers/attention/base_attention_backend.py @@ -21,10 +21,11 @@ from __future__ import annotations from abc import ABC, abstractmethod from dataclasses import dataclass +from typing import TYPE_CHECKING import paddle - -from fastdeploy.worker.forward_meta import ForwardMeta +if TYPE_CHECKING: + from fastdeploy.model_executor.forward_meta import ForwardMeta @dataclass diff --git a/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py b/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py index 0c4c05579..5d48f5477 100644 --- a/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/block_multihead_attn_backend.py @@ -23,13 +23,13 @@ from typing import TYPE_CHECKING, List, Optional import paddle if TYPE_CHECKING: - from paddle._typing.dtype_like import _DTypeLiteral + from fastdeploy.model_executor.forward_meta import ForwardMeta from fastdeploy.config import FDConfig from fastdeploy.model_executor.layers.attention.attention import Attention from fastdeploy.model_executor.layers.attention.base_attention_backend import ( AttentionBackend, AttentionMetadata) -from fastdeploy.worker.forward_meta import ForwardMeta + @dataclass class BlockAttentionMetadata(AttentionMetadata): @@ -48,7 +48,7 @@ class BlockAttentionMetadata(AttentionMetadata): decoder_tile_ids_per_batch: paddle.Tensor = None decoder_num_blocks: paddle.Tensor = None - _dtype: _DTypeLiteral = paddle.bfloat16 + _dtype: paddle.dtype = paddle.bfloat16 encoder_max_partition_size: int = 32768 max_partition_size: int = 32768 block_tables: Optional[paddle.Tensor] = None diff --git a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py index b68d6cab4..d5ff65b8f 100644 --- a/fastdeploy/model_executor/layers/attention/flash_attn_backend.py +++ b/fastdeploy/model_executor/layers/attention/flash_attn_backend.py @@ -18,7 +18,7 @@ from __future__ import annotations import os from dataclasses import dataclass, field -from typing import List, Optional +from typing import List, Optional, TYPE_CHECKING import paddle @@ -35,7 +35,8 @@ from fastdeploy.model_executor.layers.attention.ops import ( get_block_shape_and_split_kv_block, gqa_rope_write_cache, init_signal_layerwise, open_shm_and_get_meta_signal, pre_cache_len_concat) from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id -from fastdeploy.worker.forward_meta import ForwardMeta +if TYPE_CHECKING: + from fastdeploy.model_executor.forward_meta import ForwardMeta @dataclass diff --git a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py 
diff --git a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py
index 43e034194..339342313 100644
--- a/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/iluvatar_attn_backend.py
@@ -20,7 +20,7 @@ import os
 
 import paddle
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 from math import sqrt
 
 from paddle.nn.functional.flash_attention import flash_attn_unpadded
@@ -30,7 +30,8 @@ from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend, AttentionMetadata)
-from fastdeploy.worker.forward_meta import ForwardMeta
+if TYPE_CHECKING:
+    from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 
 @dataclass
diff --git a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
index a72518566..b88d98756 100644
--- a/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
+++ b/fastdeploy/model_executor/layers/attention/mla_attention_backend.py
@@ -35,15 +35,13 @@ if current_platform.is_cuda() and not current_platform.is_dcu():
         prefill_mla_write_cache)
 
 if TYPE_CHECKING:
-    from paddle._typing.dtype_like import _DTypeLiteral
+    from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend, AttentionMetadata)
-from fastdeploy.model_executor.layers.attention.utils import \
-    init_rank_and_device_id
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
 
 
 def yarn_get_mscale(scale=1, mscale=1):
@@ -71,7 +69,7 @@ class MLAAttentionMetadata(AttentionMetadata):
     decoder_tile_ids_per_batch: paddle.Tensor = None
     decoder_num_blocks: paddle.Tensor = None
 
-    _dtype: _DTypeLiteral = paddle.bfloat16
+    _dtype: paddle.dtype = paddle.bfloat16
     encoder_max_partition_size: int = 32768
     max_partition_size: int = 32768
     block_tables: Optional[paddle.Tensor] = None
diff --git a/fastdeploy/model_executor/layers/attention/native_paddle_backend.py b/fastdeploy/model_executor/layers/attention/native_paddle_backend.py
index 8e8b9ce77..b8f5db6a1 100644
--- a/fastdeploy/model_executor/layers/attention/native_paddle_backend.py
+++ b/fastdeploy/model_executor/layers/attention/native_paddle_backend.py
@@ -17,12 +17,14 @@
 
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import paddle
 from paddle.nn.functional import scaled_dot_product_attention
 
 from fastdeploy.model_executor.layers.attention.base_attention_backend import \
     AttentionBackend
-from fastdeploy.worker.forward_meta import ForwardMeta
+if TYPE_CHECKING:
+    from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 
 class PaddleNativeAttnBackend(AttentionBackend):
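A second change repeated across the metadata dataclasses: the private `paddle._typing.dtype_like._DTypeLiteral` alias is replaced by the public `paddle.dtype`, which exists at runtime and therefore needs no `TYPE_CHECKING` guard at all. A minimal sketch (the class name is illustrative):

```python
from dataclasses import dataclass

import paddle


@dataclass
class MetadataSketch:
    # paddle.bfloat16 is a paddle.dtype value, so the public type annotates
    # the field without reaching into paddle's private _typing module.
    _dtype: paddle.dtype = paddle.bfloat16
```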
diff --git a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
index 9ecc01fb8..c95bfd671 100644
--- a/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
+++ b/fastdeploy/model_executor/layers/attention/xpu_attn_backend.py
@@ -26,13 +26,12 @@ from fastdeploy.model_executor.layers.attention.ops import (
     init_signal_layerwise, open_shm_and_get_meta_signal)
 
 if TYPE_CHECKING:
-    from paddle._typing.dtype_like import _DTypeLiteral
+    from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend, AttentionMetadata)
-from fastdeploy.worker.forward_meta import ForwardMeta
 
 
 @dataclass
@@ -52,7 +51,7 @@ class XPUAttentionMetadata(AttentionMetadata):
     decoder_tile_ids_per_batch: paddle.Tensor = None
     decoder_num_blocks: paddle.Tensor = None
 
-    _dtype: _DTypeLiteral = paddle.bfloat16
+    _dtype: paddle.dtype = paddle.bfloat16
     encoder_max_partition_size: int = 32768
     max_partition_size: int = 32768
     block_tables: Optional[paddle.Tensor] = None
diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py
index 56870de82..b1a883c41 100644
--- a/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py
+++ b/fastdeploy/model_executor/layers/backends/gcu/attention/flash_attn_backend.py
@@ -24,15 +24,12 @@
 import paddle
 import numpy as np
 
-
-if TYPE_CHECKING:
-    from paddle._typing.dtype_like import _DTypeLiteral
-
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend, AttentionMetadata)
-from fastdeploy.worker.forward_meta import ForwardMeta, ForwardMode
+if TYPE_CHECKING:
+    from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
 
 from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
                                                mem_efficient_attention,
@@ -47,7 +44,7 @@ class GCUFlashAttnMetadata(AttentionMetadata):
     """
 
     forward_mode: ForwardMode = ForwardMode.MIXED
-    _dtype: _DTypeLiteral = paddle.bfloat16
+    _dtype: paddle.dtype = paddle.bfloat16
 
     seq_lens_encoder: Optional[paddle.Tensor] = None
     seq_lens_decoder: Optional[paddle.Tensor] = None
diff --git a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py
index bc5d8f151..29174ddda 100644
--- a/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py
+++ b/fastdeploy/model_executor/layers/backends/gcu/attention/mem_efficient_attn_backend.py
@@ -25,28 +25,26 @@
 import paddle
 import numpy as np
 import math
 
-
-if TYPE_CHECKING:
-    from paddle._typing.dtype_like import _DTypeLiteral
-
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend, AttentionMetadata)
-from fastdeploy.worker.forward_meta import ForwardMeta, ForwardMode
 from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
                                                mem_efficient_attention,
                                                flash_attn_var_len)
 from paddleformers.utils.log import logger
 
+if TYPE_CHECKING:
+    from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
+
 
 @dataclass
 class GCUMemEfficientAttnMetadata(AttentionMetadata):
     """
     GCUMemEfficientAttnMetadata
     """
     forward_mode: ForwardMode = ForwardMode.MIXED
-    _dtype: _DTypeLiteral = paddle.bfloat16
+    _dtype: paddle.dtype = paddle.bfloat16
 
     seq_lens_encoder: Optional[paddle.Tensor] = None
     seq_lens_decoder: Optional[paddle.Tensor] = None
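One caveat with the two GCU backends above, which move `ForwardMeta, ForwardMode` under `TYPE_CHECKING`: a guarded import only backs annotations. A default value such as `ForwardMode.MIXED` is evaluated when the class body executes, so any name used as a value still needs a runtime import. An illustrative sketch of the distinction:

```python
from __future__ import annotations

from typing import TYPE_CHECKING

# Runtime import: needed because ForwardMode.MIXED below is evaluated
# when the class body runs.
from fastdeploy.model_executor.forward_meta import ForwardMode

if TYPE_CHECKING:
    # Annotation-only import: the guard is enough for this one.
    from fastdeploy.model_executor.forward_meta import ForwardMeta


class BackendSketch:
    forward_mode = ForwardMode.MIXED  # value: requires the runtime import

    def forward(self, meta: ForwardMeta) -> None:  # annotation: guard suffices
        ...
```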
diff --git a/fastdeploy/model_executor/models/deepseek_v3.py b/fastdeploy/model_executor/models/deepseek_v3.py
index eac6ec9ec..dc1d56f9c 100644
--- a/fastdeploy/model_executor/models/deepseek_v3.py
+++ b/fastdeploy/model_executor/models/deepseek_v3.py
@@ -40,7 +40,7 @@ from fastdeploy.model_executor.layers.rotary_embedding import \
     DeepseekScalingRotaryEmbedding
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
 from fastdeploy.platforms import current_platform
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 if current_platform.is_cuda():
     from fastdeploy.model_executor.ops.gpu import \
diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py
index a6d064043..f36adc20e 100644
--- a/fastdeploy/model_executor/models/ernie4_5_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_moe.py
@@ -41,7 +41,7 @@ from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm
 from fastdeploy.model_executor.models.utils import \
     LayerIdPlaceholder as layerid
 from fastdeploy.model_executor.models.utils import WeightMeta
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 
 class Ernie4_5_MLP(nn.Layer):
diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py
index 7920155ec..6b3b6ff15 100644
--- a/fastdeploy/model_executor/models/ernie4_5_mtp.py
+++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py
@@ -30,7 +30,7 @@ from fastdeploy.model_executor.layers.mtp_linear import ParallelEHProjection
 from fastdeploy.model_executor.layers.normalization import RMSNorm
 from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 
 class Ernie4_5_MTPPretrainedModel(PretrainedModel):
diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
index b6de4a2f8..2c5786570 100644
--- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -44,7 +44,7 @@ if current_platform.is_cuda() and not current_platform.is_dcu():
         text_image_gather_scatter,
         text_image_index_out)
 
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 
 class Ernie4_5_VLMLP(Ernie4_5_MLP):
diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py
index 4fab1e30b..6695f3854 100644
--- a/fastdeploy/model_executor/models/qwen2.py
+++ b/fastdeploy/model_executor/models/qwen2.py
@@ -34,7 +34,7 @@ from fastdeploy.model_executor.layers.linear import (
 from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
 from fastdeploy.model_executor.layers.normalization import RMSNorm
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 
 class Qwen2MLP(nn.Layer):
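Every remaining hunk is the same mechanical one-line rewrite of the import path. For comparison, a project that had to keep external callers working could instead leave a re-export stub at the old path; this is a hypothetical alternative, not something this diff does (it renames the module and updates every caller):

```python
# fastdeploy/worker/forward_meta.py -- hypothetical compatibility stub only
from fastdeploy.model_executor.forward_meta import (  # noqa: F401
    ForwardMeta, ForwardMode, XPUForwardMeta)
```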
diff --git a/fastdeploy/model_executor/models/qwen3.py b/fastdeploy/model_executor/models/qwen3.py
index 8c734e422..1b62f0ec4 100644
--- a/fastdeploy/model_executor/models/qwen3.py
+++ b/fastdeploy/model_executor/models/qwen3.py
@@ -34,7 +34,7 @@ from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
 from fastdeploy.model_executor.layers.normalization import RMSNorm
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
 from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 
 class Qwen3MLP(Qwen2MLP):
diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py
index 9962fa1ee..8da5ed744 100644
--- a/fastdeploy/model_executor/models/qwen3moe.py
+++ b/fastdeploy/model_executor/models/qwen3moe.py
@@ -35,7 +35,7 @@ from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
 from fastdeploy.model_executor.layers.moe.moe import FusedMoE
 from fastdeploy.model_executor.layers.normalization import RMSNorm
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 
 class Qwen3MLP(nn.Layer):
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index 6de3ce633..cf24a7e57 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -36,7 +36,7 @@ from fastdeploy.model_executor.ops.gpu import (draft_model_postprocess,
                                                share_external_data)
 from fastdeploy.model_executor.pre_and_post_process import (pre_process,
                                                             rebuild_padding)
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 from .base import Proposer
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index 18b89c248..45bebd0fb 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -39,7 +39,7 @@ from fastdeploy.model_executor.ops.gcu import set_value_by_flags_and_idx
 from fastdeploy.model_executor.pre_and_post_process import (post_process,
                                                             pre_process,
                                                             rebuild_padding)
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.worker.model_runner_base import ModelRunnerBase
 from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 71b954784..4aa4b1847 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -46,7 +46,7 @@ from fastdeploy.platforms import current_platform
 if not current_platform.is_dcu():
     from fastdeploy.spec_decode import MTPProposer, NgramProposer
 
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.worker.model_runner_base import ModelRunnerBase
 from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
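From a model runner's point of view only the import line moves; construction and use of `ForwardMeta` are untouched. A sketch of a call site after the rename, abridged to the fields visible in this diff and assuming the remaining fields carry defaults as the visible ones do:

```python
from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode

meta = ForwardMeta(
    forward_mode=ForwardMode.MIXED,
    is_decode_batch=False,
)
meta.clear_caches()  # per its docstring, "Safely clean up the caches"
```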
diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py
index 77f92676c..c1eb240b7 100644
--- a/fastdeploy/worker/iluvatar_model_runner.py
+++ b/fastdeploy/worker/iluvatar_model_runner.py
@@ -37,7 +37,7 @@ from fastdeploy.model_executor.pre_and_post_process import (post_process,
                                                             pre_process,
                                                             rebuild_padding,
                                                             step_cuda)
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.worker.model_runner_base import ModelRunnerBase
 from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
diff --git a/fastdeploy/worker/vl_gpu_model_runner.py b/fastdeploy/worker/vl_gpu_model_runner.py
index 681702392..5ad5c0f72 100644
--- a/fastdeploy/worker/vl_gpu_model_runner.py
+++ b/fastdeploy/worker/vl_gpu_model_runner.py
@@ -46,7 +46,7 @@ from fastdeploy.model_executor.models.ernie4_5_vl.dfnrope.modeling import \
 from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import (
     ScatterOp, VariableResolutionResamplerModel)
 from fastdeploy.platforms import current_platform
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.worker.output import SamplerOutput
 from fastdeploy.worker.utils import check_safetensors_model
 from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index 8be2d9d47..adfddf1ba 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -31,7 +31,7 @@ from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
 from fastdeploy.model_executor.layers.sample.sampler import Sampler
 from fastdeploy.model_executor.model_loader import get_model_from_loader
 from fastdeploy.utils import get_logger
-from fastdeploy.worker.forward_meta import ForwardMeta, XPUForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta
 from fastdeploy.worker.model_runner_base import ModelRunnerBase
 from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
diff --git a/test/layers/test_attention.py b/test/layers/test_attention.py
index b499ee1c2..989ecd4e2 100644
--- a/test/layers/test_attention.py
+++ b/test/layers/test_attention.py
@@ -21,7 +21,7 @@ import paddle
 
 from fastdeploy.model_executor.layers.attention import (
     Attention, PaddleNativeAttnBackend)
-from fastdeploy.worker.forward_meta import ForwardMeta, ForwardMode
+from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
 
 
 class MockModelRunner:
diff --git a/test/worker/test_cuda_graph.py b/test/worker/test_cuda_graph.py
index f00b129c5..30c0dca1e 100644
--- a/test/worker/test_cuda_graph.py
+++ b/test/worker/test_cuda_graph.py
@@ -18,7 +18,7 @@ import paddle
 
 from fastdeploy.config import FDConfig, GraphOptimizationConfig
 from fastdeploy.model_executor.graph_optimization.decorator import \
     support_graph_optimization
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 
 
 @support_graph_optimization
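A quick smoke check once the patch is applied (assumes FastDeploy is importable in the current environment):

```python
# The public names should resolve from the new module...
from fastdeploy.model_executor.forward_meta import (ForwardMeta, ForwardMode,
                                                    XPUForwardMeta)

# ...and the old module is gone, since the file was renamed rather than copied.
try:
    import fastdeploy.worker.forward_meta  # noqa: F401
except ImportError:
    print("fastdeploy.worker.forward_meta removed, as expected")
```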