[Executor] Move forward_meta.py to fastdeploy/model_executor (#2774)

* Use PEP 563 in attention.py and fix conflict

* merge commit

* Change what was left out last time
Authored by littledgg on 2025-07-10 20:36:51 +08:00; committed by GitHub
parent 8c660a0dfb
commit 59071268b6
27 changed files with 53 additions and 55 deletions

View File

@@ -18,11 +18,10 @@ import logging
 from dataclasses import dataclass
 from enum import IntEnum, auto
 from typing import TYPE_CHECKING, Optional
-from fastdeploy.model_executor.layers.attention import AttentionBackend

 import paddle

+if TYPE_CHECKING:
+    from fastdeploy.model_executor.layers.attention import AttentionBackend

 logger = logging.getLogger(__name__)
@@ -69,7 +68,7 @@ class ForwardMeta():
     is_decode_batch: bool = False
     # Attention backend object
-    attn_backend: 'AttentionBackend' = None
+    attn_backend: AttentionBackend = None
     # Forward mode used during attention
     forward_mode: ForwardMode = ForwardMode.MIXED
     # Attention mask
@@ -100,7 +99,7 @@ class ForwardMeta():
     # Block tables
     block_tables: Optional[paddle.Tensor] = None
     # KV caches
-    caches: Optional[paddle.Tensor] = None
+    caches: Optional[list[paddle.Tensor]] = None

     def clear_caches(self):
         """ Safely clean up the caches """

View File

@@ -27,14 +27,13 @@ from fastdeploy.model_executor.layers.attention.ops import (
     init_signal_layerwise, open_shm_and_get_meta_signal)

 if TYPE_CHECKING:
-    from paddle._typing.dtype_like import _DTypeLiteral
+    from fastdeploy.model_executor.forward_meta import ForwardMeta
     from fastdeploy.config import FDConfig
     from fastdeploy.model_executor.layers.attention.attention import Attention
     from fastdeploy.model_executor.layers.attention.base_attention_backend import (
         AttentionBackend, AttentionMetadata)
     from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
-from fastdeploy.worker.forward_meta import ForwardMeta

 @dataclass
@@ -54,7 +53,7 @@ class AppendAttentionMetadata(AttentionMetadata):
     decoder_tile_ids_per_batch: paddle.Tensor = None
     decoder_num_blocks: paddle.Tensor = None
-    _dtype: _DTypeLiteral = paddle.bfloat16
+    _dtype: paddle.dtype = paddle.bfloat16
     encoder_max_partition_size: int = 32768
     max_partition_size: int = 32768
     block_tables: Optional[paddle.Tensor] = None
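The _dtype change here recurs throughout the commit: _DTypeLiteral comes from paddle._typing, a private module with no stability guarantee, while paddle.dtype is the public runtime type of values such as paddle.bfloat16, so the annotation no longer needs any TYPE_CHECKING import. A reduced sketch of the changed fields; the AttentionMetadata base class and the other members are omitted, and the class name is suffixed to mark it as illustrative:

from dataclasses import dataclass
from typing import Optional

import paddle

@dataclass
class AppendAttentionMetadataSketch:
    # paddle.dtype is public and importable at runtime, unlike the private
    # paddle._typing.dtype_like._DTypeLiteral alias it replaces.
    _dtype: paddle.dtype = paddle.bfloat16
    encoder_max_partition_size: int = 32768
    max_partition_size: int = 32768
    block_tables: Optional[paddle.Tensor] = None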

View File

@@ -14,7 +14,9 @@
 # limitations under the License.
 """
-from typing import Dict, Optional
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Dict, Optional

 import numpy as np
 import paddle
@@ -24,7 +26,8 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.quantization.quant_base import \
     QuantMethodBase
-from fastdeploy.worker.forward_meta import ForwardMeta
+if TYPE_CHECKING:
+    from fastdeploy.model_executor.forward_meta import ForwardMeta

 class Attention(nn.Layer):
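With the future import in place, every annotation in attention.py is stored as a string and resolved only by type checkers, so ForwardMeta can live behind if TYPE_CHECKING: even when method signatures reference it. A hypothetical sketch of the pattern; the actual forward parameters of Attention are not part of this diff:

from __future__ import annotations  # PEP 563: annotations become strings

from typing import TYPE_CHECKING

import paddle
from paddle import nn

if TYPE_CHECKING:
    from fastdeploy.model_executor.forward_meta import ForwardMeta

class Attention(nn.Layer):
    def forward(self, hidden_states: paddle.Tensor,
                forward_meta: ForwardMeta) -> paddle.Tensor:
        # The ForwardMeta annotation is never evaluated at runtime, so the
        # TYPE_CHECKING-only import above is all this module needs.
        ...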

View File

@@ -21,10 +21,11 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
+from typing import TYPE_CHECKING

 import paddle

-from fastdeploy.worker.forward_meta import ForwardMeta
+if TYPE_CHECKING:
+    from fastdeploy.model_executor.forward_meta import ForwardMeta

 @dataclass

View File

@@ -23,13 +23,13 @@ from typing import TYPE_CHECKING, List, Optional
 import paddle

 if TYPE_CHECKING:
-    from paddle._typing.dtype_like import _DTypeLiteral
+    from fastdeploy.model_executor.forward_meta import ForwardMeta
     from fastdeploy.config import FDConfig
     from fastdeploy.model_executor.layers.attention.attention import Attention
     from fastdeploy.model_executor.layers.attention.base_attention_backend import (
         AttentionBackend, AttentionMetadata)
-from fastdeploy.worker.forward_meta import ForwardMeta

 @dataclass
 class BlockAttentionMetadata(AttentionMetadata):
@@ -48,7 +48,7 @@ class BlockAttentionMetadata(AttentionMetadata):
     decoder_tile_ids_per_batch: paddle.Tensor = None
     decoder_num_blocks: paddle.Tensor = None
-    _dtype: _DTypeLiteral = paddle.bfloat16
+    _dtype: paddle.dtype = paddle.bfloat16
     encoder_max_partition_size: int = 32768
     max_partition_size: int = 32768
     block_tables: Optional[paddle.Tensor] = None

View File

@@ -18,7 +18,7 @@ from __future__ import annotations
 import os
 from dataclasses import dataclass, field
-from typing import List, Optional
+from typing import List, Optional, TYPE_CHECKING

 import paddle
@@ -35,7 +35,8 @@ from fastdeploy.model_executor.layers.attention.ops import (
     get_block_shape_and_split_kv_block, gqa_rope_write_cache,
     init_signal_layerwise, open_shm_and_get_meta_signal, pre_cache_len_concat)
 from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
-from fastdeploy.worker.forward_meta import ForwardMeta
+if TYPE_CHECKING:
+    from fastdeploy.model_executor.forward_meta import ForwardMeta

 @dataclass

View File

@@ -20,7 +20,7 @@ import os
 import paddle
 from dataclasses import dataclass
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 from math import sqrt
 from paddle.nn.functional.flash_attention import flash_attn_unpadded
@@ -30,7 +30,8 @@ from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend, AttentionMetadata)
-from fastdeploy.worker.forward_meta import ForwardMeta
+if TYPE_CHECKING:
+    from fastdeploy.model_executor.forward_meta import ForwardMeta

 @dataclass

View File

@@ -35,15 +35,13 @@ if current_platform.is_cuda() and not current_platform.is_dcu():
         prefill_mla_write_cache)

 if TYPE_CHECKING:
-    from paddle._typing.dtype_like import _DTypeLiteral
+    from fastdeploy.model_executor.forward_meta import ForwardMeta
     from fastdeploy.config import FDConfig
     from fastdeploy.model_executor.layers.attention.attention import Attention
     from fastdeploy.model_executor.layers.attention.base_attention_backend import (
         AttentionBackend, AttentionMetadata)
-    from fastdeploy.model_executor.layers.attention.utils import \
-        init_rank_and_device_id
-from fastdeploy.worker.forward_meta import ForwardMeta
+    from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id

 def yarn_get_mscale(scale=1, mscale=1):
@@ -71,7 +69,7 @@ class MLAAttentionMetadata(AttentionMetadata):
     decoder_tile_ids_per_batch: paddle.Tensor = None
     decoder_num_blocks: paddle.Tensor = None
-    _dtype: _DTypeLiteral = paddle.bfloat16
+    _dtype: paddle.dtype = paddle.bfloat16
     encoder_max_partition_size: int = 32768
     max_partition_size: int = 32768
     block_tables: Optional[paddle.Tensor] = None

View File

@@ -17,12 +17,14 @@
 from __future__ import annotations

+from typing import TYPE_CHECKING

 import paddle
 from paddle.nn.functional import scaled_dot_product_attention

 from fastdeploy.model_executor.layers.attention.base_attention_backend import \
     AttentionBackend
-from fastdeploy.worker.forward_meta import ForwardMeta
+if TYPE_CHECKING:
+    from fastdeploy.model_executor.forward_meta import ForwardMeta

 class PaddleNativeAttnBackend(AttentionBackend):

View File

@@ -26,13 +26,12 @@ from fastdeploy.model_executor.layers.attention.ops import (
     init_signal_layerwise, open_shm_and_get_meta_signal)

 if TYPE_CHECKING:
-    from paddle._typing.dtype_like import _DTypeLiteral
+    from fastdeploy.model_executor.forward_meta import ForwardMeta
     from fastdeploy.config import FDConfig
     from fastdeploy.model_executor.layers.attention.attention import Attention
     from fastdeploy.model_executor.layers.attention.base_attention_backend import (
         AttentionBackend, AttentionMetadata)
-from fastdeploy.worker.forward_meta import ForwardMeta

 @dataclass
@@ -52,7 +51,7 @@ class XPUAttentionMetadata(AttentionMetadata):
     decoder_tile_ids_per_batch: paddle.Tensor = None
     decoder_num_blocks: paddle.Tensor = None
-    _dtype: _DTypeLiteral = paddle.bfloat16
+    _dtype: paddle.dtype = paddle.bfloat16
     encoder_max_partition_size: int = 32768
     max_partition_size: int = 32768
     block_tables: Optional[paddle.Tensor] = None

View File

@@ -24,15 +24,12 @@ import paddle
 import numpy as np

-if TYPE_CHECKING:
-    from paddle._typing.dtype_like import _DTypeLiteral
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend, AttentionMetadata)
-from fastdeploy.worker.forward_meta import ForwardMeta, ForwardMode
+if TYPE_CHECKING:
+    from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
 from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
                                                mem_efficient_attention,
@@ -47,7 +44,7 @@ class GCUFlashAttnMetadata(AttentionMetadata):
     """
     forward_mode: ForwardMode = ForwardMode.MIXED
-    _dtype: _DTypeLiteral = paddle.bfloat16
+    _dtype: paddle.dtype = paddle.bfloat16
     seq_lens_encoder: Optional[paddle.Tensor] = None
     seq_lens_decoder: Optional[paddle.Tensor] = None

View File

@@ -25,28 +25,26 @@ import paddle
 import numpy as np
 import math

-if TYPE_CHECKING:
-    from paddle._typing.dtype_like import _DTypeLiteral
 from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend, AttentionMetadata)
-from fastdeploy.worker.forward_meta import ForwardMeta, ForwardMode
 from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
                                                mem_efficient_attention,
                                                flash_attn_var_len)
 from paddleformers.utils.log import logger

+if TYPE_CHECKING:
+    from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode

 @dataclass
 class GCUMemEfficientAttnMetadata(AttentionMetadata):
     """
     GCUMemEfficientAttnMetadata
     """
     forward_mode: ForwardMode = ForwardMode.MIXED
-    _dtype: _DTypeLiteral = paddle.bfloat16
+    _dtype: paddle.dtype = paddle.bfloat16
     seq_lens_encoder: Optional[paddle.Tensor] = None
     seq_lens_decoder: Optional[paddle.Tensor] = None

View File

@@ -40,7 +40,7 @@ from fastdeploy.model_executor.layers.rotary_embedding import \
     DeepseekScalingRotaryEmbedding
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
 from fastdeploy.platforms import current_platform
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta

 if current_platform.is_cuda():
     from fastdeploy.model_executor.ops.gpu import \
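From this file onward the hunks are mechanical: each runtime import moves from fastdeploy.worker.forward_meta to fastdeploy.model_executor.forward_meta. Since the module is moved rather than aliased, any out-of-tree code still importing the old path will raise ImportError. If backward compatibility were desired, the usual recipe is a thin re-export shim at the old location. The following is a hypothetical sketch only; this commit updates all in-tree callers instead of keeping a shim:

# fastdeploy/worker/forward_meta.py -- hypothetical compatibility shim,
# NOT part of this commit.
from fastdeploy.model_executor.forward_meta import (  # noqa: F401
    ForwardMeta,
    ForwardMode,
    XPUForwardMeta,
)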

View File

@@ -41,7 +41,7 @@ from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm
 from fastdeploy.model_executor.models.utils import \
     LayerIdPlaceholder as layerid
 from fastdeploy.model_executor.models.utils import WeightMeta
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta

 class Ernie4_5_MLP(nn.Layer):

View File

@@ -30,7 +30,7 @@ from fastdeploy.model_executor.layers.mtp_linear import ParallelEHProjection
 from fastdeploy.model_executor.layers.normalization import RMSNorm
 from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta

 class Ernie4_5_MTPPretrainedModel(PretrainedModel):

View File

@@ -44,7 +44,7 @@ if current_platform.is_cuda() and not current_platform.is_dcu():
         text_image_gather_scatter,
         text_image_index_out)

-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta

 class Ernie4_5_VLMLP(Ernie4_5_MLP):

View File

@@ -34,7 +34,7 @@ from fastdeploy.model_executor.layers.linear import (
 from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
 from fastdeploy.model_executor.layers.normalization import RMSNorm
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta

 class Qwen2MLP(nn.Layer):

View File

@@ -34,7 +34,7 @@ from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
 from fastdeploy.model_executor.layers.normalization import RMSNorm
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
 from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta

 class Qwen3MLP(Qwen2MLP):

View File

@@ -35,7 +35,7 @@ from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
 from fastdeploy.model_executor.layers.moe.moe import FusedMoE
 from fastdeploy.model_executor.layers.normalization import RMSNorm
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta

 class Qwen3MLP(nn.Layer):

View File

@@ -36,7 +36,7 @@ from fastdeploy.model_executor.ops.gpu import (draft_model_postprocess,
                                                share_external_data)
 from fastdeploy.model_executor.pre_and_post_process import (pre_process,
                                                             rebuild_padding)
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta

 from .base import Proposer

View File

@@ -39,7 +39,7 @@ from fastdeploy.model_executor.ops.gcu import set_value_by_flags_and_idx
 from fastdeploy.model_executor.pre_and_post_process import (post_process,
                                                             pre_process,
                                                             rebuild_padding)
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.worker.model_runner_base import ModelRunnerBase
 from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput

View File

@@ -46,7 +46,7 @@ from fastdeploy.platforms import current_platform
 if not current_platform.is_dcu():
     from fastdeploy.spec_decode import MTPProposer, NgramProposer

-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.worker.model_runner_base import ModelRunnerBase
 from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput

View File

@@ -37,7 +37,7 @@ from fastdeploy.model_executor.pre_and_post_process import (post_process,
                                                             pre_process,
                                                             rebuild_padding,
                                                             step_cuda)
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.worker.model_runner_base import ModelRunnerBase
 from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput

View File

@@ -46,7 +46,7 @@ from fastdeploy.model_executor.models.ernie4_5_vl.dfnrope.modeling import \
 from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import (
     ScatterOp, VariableResolutionResamplerModel)
 from fastdeploy.platforms import current_platform
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.worker.output import SamplerOutput
 from fastdeploy.worker.utils import check_safetensors_model
 from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase

View File

@@ -31,7 +31,7 @@ from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
 from fastdeploy.model_executor.layers.sample.sampler import Sampler
 from fastdeploy.model_executor.model_loader import get_model_from_loader
 from fastdeploy.utils import get_logger
-from fastdeploy.worker.forward_meta import ForwardMeta, XPUForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta
 from fastdeploy.worker.model_runner_base import ModelRunnerBase
 from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput

View File

@@ -21,7 +21,7 @@ import paddle
 from fastdeploy.model_executor.layers.attention import (
     Attention, PaddleNativeAttnBackend)
-from fastdeploy.worker.forward_meta import ForwardMeta, ForwardMode
+from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode

 class MockModelRunner:

View File

@@ -18,7 +18,7 @@ import paddle
 from fastdeploy.config import FDConfig, GraphOptimizationConfig
 from fastdeploy.model_executor.graph_optimization.decorator import \
     support_graph_optimization
-from fastdeploy.worker.forward_meta import ForwardMeta
+from fastdeploy.model_executor.forward_meta import ForwardMeta

 @support_graph_optimization