mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-09-27 21:02:24 +08:00
[Executor] Move forward_meta.py to fastdeploy/model_executor (#2774)
* Use PEP 563 in attention.py and fix conflict
* Merge commit
* Change what was left out last time
This commit is contained in:
@@ -18,11 +18,10 @@ import logging
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from enum import IntEnum, auto
|
from enum import IntEnum, auto
|
||||||
from typing import TYPE_CHECKING, Optional
|
from typing import TYPE_CHECKING, Optional
|
||||||
|
from fastdeploy.model_executor.layers.attention import AttentionBackend
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from fastdeploy.model_executor.layers.attention import AttentionBackend
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -69,7 +68,7 @@ class ForwardMeta():
|
|||||||
is_decode_batch: bool = False
|
is_decode_batch: bool = False
|
||||||
|
|
||||||
# Attention backend object
|
# Attention backend object
|
||||||
attn_backend: 'AttentionBackend' = None
|
attn_backend: AttentionBackend = None
|
||||||
# Forward mode used during attention
|
# Forward mode used during attention
|
||||||
forward_mode: ForwardMode = ForwardMode.MIXED
|
forward_mode: ForwardMode = ForwardMode.MIXED
|
||||||
# Attention mask
|
# Attention mask
|
||||||
@@ -100,7 +99,7 @@ class ForwardMeta():
|
|||||||
# Block tables
|
# Block tables
|
||||||
block_tables: Optional[paddle.Tensor] = None
|
block_tables: Optional[paddle.Tensor] = None
|
||||||
# KV caches
|
# KV caches
|
||||||
caches: Optional[paddle.Tensor] = None
|
caches: Optional[list[paddle.Tensor]] = None
|
||||||
|
|
||||||
def clear_caches(self):
|
def clear_caches(self):
|
||||||
""" Safely clean up the caches """
|
""" Safely clean up the caches """
|
@@ -27,14 +27,13 @@ from fastdeploy.model_executor.layers.attention.ops import (
|
|||||||
init_signal_layerwise, open_shm_and_get_meta_signal)
|
init_signal_layerwise, open_shm_and_get_meta_signal)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from paddle._typing.dtype_like import _DTypeLiteral
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
from fastdeploy.model_executor.layers.attention.attention import Attention
|
from fastdeploy.model_executor.layers.attention.attention import Attention
|
||||||
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
||||||
AttentionBackend, AttentionMetadata)
|
AttentionBackend, AttentionMetadata)
|
||||||
from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
|
from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -54,7 +53,7 @@ class AppendAttentionMetadata(AttentionMetadata):
|
|||||||
decoder_tile_ids_per_batch: paddle.Tensor = None
|
decoder_tile_ids_per_batch: paddle.Tensor = None
|
||||||
decoder_num_blocks: paddle.Tensor = None
|
decoder_num_blocks: paddle.Tensor = None
|
||||||
|
|
||||||
_dtype: _DTypeLiteral = paddle.bfloat16
|
_dtype: paddle.dtype = paddle.bfloat16
|
||||||
encoder_max_partition_size: int = 32768
|
encoder_max_partition_size: int = 32768
|
||||||
max_partition_size: int = 32768
|
max_partition_size: int = 32768
|
||||||
block_tables: Optional[paddle.Tensor] = None
|
block_tables: Optional[paddle.Tensor] = None
|
||||||
|
@@ -14,7 +14,9 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import Dict, Optional
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING, Dict, Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import paddle
|
import paddle
|
||||||
@@ -24,7 +26,8 @@ from paddleformers.utils.log import logger
|
|||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
from fastdeploy.model_executor.layers.quantization.quant_base import \
|
from fastdeploy.model_executor.layers.quantization.quant_base import \
|
||||||
QuantMethodBase
|
QuantMethodBase
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
if TYPE_CHECKING:
|
||||||
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
class Attention(nn.Layer):
|
class Attention(nn.Layer):
|
||||||
|
@@ -21,10 +21,11 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
|
if TYPE_CHECKING:
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@@ -23,13 +23,13 @@ from typing import TYPE_CHECKING, List, Optional
|
|||||||
import paddle
|
import paddle
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from paddle._typing.dtype_like import _DTypeLiteral
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
from fastdeploy.model_executor.layers.attention.attention import Attention
|
from fastdeploy.model_executor.layers.attention.attention import Attention
|
||||||
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
||||||
AttentionBackend, AttentionMetadata)
|
AttentionBackend, AttentionMetadata)
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class BlockAttentionMetadata(AttentionMetadata):
|
class BlockAttentionMetadata(AttentionMetadata):
|
||||||
@@ -48,7 +48,7 @@ class BlockAttentionMetadata(AttentionMetadata):
|
|||||||
decoder_tile_ids_per_batch: paddle.Tensor = None
|
decoder_tile_ids_per_batch: paddle.Tensor = None
|
||||||
decoder_num_blocks: paddle.Tensor = None
|
decoder_num_blocks: paddle.Tensor = None
|
||||||
|
|
||||||
_dtype: _DTypeLiteral = paddle.bfloat16
|
_dtype: paddle.dtype = paddle.bfloat16
|
||||||
encoder_max_partition_size: int = 32768
|
encoder_max_partition_size: int = 32768
|
||||||
max_partition_size: int = 32768
|
max_partition_size: int = 32768
|
||||||
block_tables: Optional[paddle.Tensor] = None
|
block_tables: Optional[paddle.Tensor] = None
|
||||||
|
@@ -18,7 +18,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import List, Optional
|
from typing import List, Optional, TYPE_CHECKING
|
||||||
|
|
||||||
import paddle
|
import paddle
|
||||||
|
|
||||||
@@ -35,7 +35,8 @@ from fastdeploy.model_executor.layers.attention.ops import (
|
|||||||
get_block_shape_and_split_kv_block, gqa_rope_write_cache,
|
get_block_shape_and_split_kv_block, gqa_rope_write_cache,
|
||||||
init_signal_layerwise, open_shm_and_get_meta_signal, pre_cache_len_concat)
|
init_signal_layerwise, open_shm_and_get_meta_signal, pre_cache_len_concat)
|
||||||
from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
|
from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
if TYPE_CHECKING:
|
||||||
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@@ -20,7 +20,7 @@ import os
|
|||||||
import paddle
|
import paddle
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Optional
|
from typing import Optional, TYPE_CHECKING
|
||||||
from math import sqrt
|
from math import sqrt
|
||||||
|
|
||||||
from paddle.nn.functional.flash_attention import flash_attn_unpadded
|
from paddle.nn.functional.flash_attention import flash_attn_unpadded
|
||||||
@@ -30,7 +30,8 @@ from fastdeploy.config import FDConfig
|
|||||||
from fastdeploy.model_executor.layers.attention.attention import Attention
|
from fastdeploy.model_executor.layers.attention.attention import Attention
|
||||||
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
||||||
AttentionBackend, AttentionMetadata)
|
AttentionBackend, AttentionMetadata)
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
if TYPE_CHECKING:
|
||||||
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
@@ -35,15 +35,13 @@ if current_platform.is_cuda() and not current_platform.is_dcu():
|
|||||||
prefill_mla_write_cache)
|
prefill_mla_write_cache)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from paddle._typing.dtype_like import _DTypeLiteral
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
from fastdeploy.model_executor.layers.attention.attention import Attention
|
from fastdeploy.model_executor.layers.attention.attention import Attention
|
||||||
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
||||||
AttentionBackend, AttentionMetadata)
|
AttentionBackend, AttentionMetadata)
|
||||||
from fastdeploy.model_executor.layers.attention.utils import \
|
from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
|
||||||
init_rank_and_device_id
|
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
|
||||||
|
|
||||||
|
|
||||||
def yarn_get_mscale(scale=1, mscale=1):
|
def yarn_get_mscale(scale=1, mscale=1):
|
||||||
@@ -71,7 +69,7 @@ class MLAAttentionMetadata(AttentionMetadata):
|
|||||||
decoder_tile_ids_per_batch: paddle.Tensor = None
|
decoder_tile_ids_per_batch: paddle.Tensor = None
|
||||||
decoder_num_blocks: paddle.Tensor = None
|
decoder_num_blocks: paddle.Tensor = None
|
||||||
|
|
||||||
_dtype: _DTypeLiteral = paddle.bfloat16
|
_dtype: paddle.dtype = paddle.bfloat16
|
||||||
encoder_max_partition_size: int = 32768
|
encoder_max_partition_size: int = 32768
|
||||||
max_partition_size: int = 32768
|
max_partition_size: int = 32768
|
||||||
block_tables: Optional[paddle.Tensor] = None
|
block_tables: Optional[paddle.Tensor] = None
|
||||||
|
@@ -17,12 +17,14 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
import paddle
|
import paddle
|
||||||
from paddle.nn.functional import scaled_dot_product_attention
|
from paddle.nn.functional import scaled_dot_product_attention
|
||||||
|
|
||||||
from fastdeploy.model_executor.layers.attention.base_attention_backend import \
|
from fastdeploy.model_executor.layers.attention.base_attention_backend import \
|
||||||
AttentionBackend
|
AttentionBackend
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
if TYPE_CHECKING:
|
||||||
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
class PaddleNativeAttnBackend(AttentionBackend):
|
class PaddleNativeAttnBackend(AttentionBackend):
|
||||||
|
@@ -26,13 +26,12 @@ from fastdeploy.model_executor.layers.attention.ops import (
|
|||||||
init_signal_layerwise, open_shm_and_get_meta_signal)
|
init_signal_layerwise, open_shm_and_get_meta_signal)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from paddle._typing.dtype_like import _DTypeLiteral
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
from fastdeploy.model_executor.layers.attention.attention import Attention
|
from fastdeploy.model_executor.layers.attention.attention import Attention
|
||||||
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
||||||
AttentionBackend, AttentionMetadata)
|
AttentionBackend, AttentionMetadata)
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@@ -52,7 +51,7 @@ class XPUAttentionMetadata(AttentionMetadata):
|
|||||||
decoder_tile_ids_per_batch: paddle.Tensor = None
|
decoder_tile_ids_per_batch: paddle.Tensor = None
|
||||||
decoder_num_blocks: paddle.Tensor = None
|
decoder_num_blocks: paddle.Tensor = None
|
||||||
|
|
||||||
_dtype: _DTypeLiteral = paddle.bfloat16
|
_dtype: paddle.dtype = paddle.bfloat16
|
||||||
encoder_max_partition_size: int = 32768
|
encoder_max_partition_size: int = 32768
|
||||||
max_partition_size: int = 32768
|
max_partition_size: int = 32768
|
||||||
block_tables: Optional[paddle.Tensor] = None
|
block_tables: Optional[paddle.Tensor] = None
|
||||||
|
@@ -24,15 +24,12 @@ import paddle
|
|||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from paddle._typing.dtype_like import _DTypeLiteral
|
|
||||||
|
|
||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
from fastdeploy.model_executor.layers.attention.attention import Attention
|
from fastdeploy.model_executor.layers.attention.attention import Attention
|
||||||
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
||||||
AttentionBackend, AttentionMetadata)
|
AttentionBackend, AttentionMetadata)
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta, ForwardMode
|
if TYPE_CHECKING:
|
||||||
|
from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
|
||||||
|
|
||||||
from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
|
from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
|
||||||
mem_efficient_attention,
|
mem_efficient_attention,
|
||||||
@@ -47,7 +44,7 @@ class GCUFlashAttnMetadata(AttentionMetadata):
|
|||||||
"""
|
"""
|
||||||
forward_mode: ForwardMode = ForwardMode.MIXED
|
forward_mode: ForwardMode = ForwardMode.MIXED
|
||||||
|
|
||||||
_dtype: _DTypeLiteral = paddle.bfloat16
|
_dtype: paddle.dtype = paddle.bfloat16
|
||||||
|
|
||||||
seq_lens_encoder: Optional[paddle.Tensor] = None
|
seq_lens_encoder: Optional[paddle.Tensor] = None
|
||||||
seq_lens_decoder: Optional[paddle.Tensor] = None
|
seq_lens_decoder: Optional[paddle.Tensor] = None
|
||||||
|
@@ -25,28 +25,26 @@ import paddle
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import math
|
import math
|
||||||
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from paddle._typing.dtype_like import _DTypeLiteral
|
|
||||||
|
|
||||||
from fastdeploy.config import FDConfig
|
from fastdeploy.config import FDConfig
|
||||||
from fastdeploy.model_executor.layers.attention.attention import Attention
|
from fastdeploy.model_executor.layers.attention.attention import Attention
|
||||||
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
from fastdeploy.model_executor.layers.attention.base_attention_backend import (
|
||||||
AttentionBackend, AttentionMetadata)
|
AttentionBackend, AttentionMetadata)
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta, ForwardMode
|
|
||||||
|
|
||||||
from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
|
from fastdeploy.model_executor.ops.gcu import (fused_rotary_embedding,
|
||||||
mem_efficient_attention,
|
mem_efficient_attention,
|
||||||
flash_attn_var_len)
|
flash_attn_var_len)
|
||||||
from paddleformers.utils.log import logger
|
from paddleformers.utils.log import logger
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class GCUMemEfficientAttnMetadata(AttentionMetadata):
|
class GCUMemEfficientAttnMetadata(AttentionMetadata):
|
||||||
"""
|
"""
|
||||||
GCUMemEfficientAttnMetadata
|
GCUMemEfficientAttnMetadata
|
||||||
"""
|
"""
|
||||||
forward_mode: ForwardMode = ForwardMode.MIXED
|
forward_mode: ForwardMode = ForwardMode.MIXED
|
||||||
_dtype: _DTypeLiteral = paddle.bfloat16
|
_dtype: paddle.dtype = paddle.bfloat16
|
||||||
|
|
||||||
seq_lens_encoder: Optional[paddle.Tensor] = None
|
seq_lens_encoder: Optional[paddle.Tensor] = None
|
||||||
seq_lens_decoder: Optional[paddle.Tensor] = None
|
seq_lens_decoder: Optional[paddle.Tensor] = None
|
||||||
|
@@ -40,7 +40,7 @@ from fastdeploy.model_executor.layers.rotary_embedding import \
|
|||||||
DeepseekScalingRotaryEmbedding
|
DeepseekScalingRotaryEmbedding
|
||||||
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
|
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
|
||||||
from fastdeploy.platforms import current_platform
|
from fastdeploy.platforms import current_platform
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
if current_platform.is_cuda():
|
if current_platform.is_cuda():
|
||||||
from fastdeploy.model_executor.ops.gpu import \
|
from fastdeploy.model_executor.ops.gpu import \
|
||||||
|
@@ -41,7 +41,7 @@ from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm
|
|||||||
from fastdeploy.model_executor.models.utils import \
|
from fastdeploy.model_executor.models.utils import \
|
||||||
LayerIdPlaceholder as layerid
|
LayerIdPlaceholder as layerid
|
||||||
from fastdeploy.model_executor.models.utils import WeightMeta
|
from fastdeploy.model_executor.models.utils import WeightMeta
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
class Ernie4_5_MLP(nn.Layer):
|
class Ernie4_5_MLP(nn.Layer):
|
||||||
|
@@ -30,7 +30,7 @@ from fastdeploy.model_executor.layers.mtp_linear import ParallelEHProjection
|
|||||||
from fastdeploy.model_executor.layers.normalization import RMSNorm
|
from fastdeploy.model_executor.layers.normalization import RMSNorm
|
||||||
from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer
|
from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer
|
||||||
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
|
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
class Ernie4_5_MTPPretrainedModel(PretrainedModel):
|
class Ernie4_5_MTPPretrainedModel(PretrainedModel):
|
||||||
|
@@ -44,7 +44,7 @@ if current_platform.is_cuda() and not current_platform.is_dcu():
|
|||||||
text_image_gather_scatter,
|
text_image_gather_scatter,
|
||||||
text_image_index_out)
|
text_image_index_out)
|
||||||
|
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
class Ernie4_5_VLMLP(Ernie4_5_MLP):
|
class Ernie4_5_VLMLP(Ernie4_5_MLP):
|
||||||
|
@@ -34,7 +34,7 @@ from fastdeploy.model_executor.layers.linear import (
|
|||||||
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
|
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
|
||||||
from fastdeploy.model_executor.layers.normalization import RMSNorm
|
from fastdeploy.model_executor.layers.normalization import RMSNorm
|
||||||
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
|
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
class Qwen2MLP(nn.Layer):
|
class Qwen2MLP(nn.Layer):
|
||||||
|
@@ -34,7 +34,7 @@ from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
|
|||||||
from fastdeploy.model_executor.layers.normalization import RMSNorm
|
from fastdeploy.model_executor.layers.normalization import RMSNorm
|
||||||
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
|
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
|
||||||
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP
|
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
class Qwen3MLP(Qwen2MLP):
|
class Qwen3MLP(Qwen2MLP):
|
||||||
|
@@ -35,7 +35,7 @@ from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
|
|||||||
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
|
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
|
||||||
from fastdeploy.model_executor.layers.normalization import RMSNorm
|
from fastdeploy.model_executor.layers.normalization import RMSNorm
|
||||||
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
|
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
class Qwen3MLP(nn.Layer):
|
class Qwen3MLP(nn.Layer):
|
||||||
|
@@ -36,7 +36,7 @@ from fastdeploy.model_executor.ops.gpu import (draft_model_postprocess,
|
|||||||
share_external_data)
|
share_external_data)
|
||||||
from fastdeploy.model_executor.pre_and_post_process import (pre_process,
|
from fastdeploy.model_executor.pre_and_post_process import (pre_process,
|
||||||
rebuild_padding)
|
rebuild_padding)
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
from .base import Proposer
|
from .base import Proposer
|
||||||
|
|
||||||
|
@@ -39,7 +39,7 @@ from fastdeploy.model_executor.ops.gcu import set_value_by_flags_and_idx
|
|||||||
from fastdeploy.model_executor.pre_and_post_process import (post_process,
|
from fastdeploy.model_executor.pre_and_post_process import (post_process,
|
||||||
pre_process,
|
pre_process,
|
||||||
rebuild_padding)
|
rebuild_padding)
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
from fastdeploy.worker.model_runner_base import ModelRunnerBase
|
from fastdeploy.worker.model_runner_base import ModelRunnerBase
|
||||||
from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
|
from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
|
||||||
|
|
||||||
|
@@ -46,7 +46,7 @@ from fastdeploy.platforms import current_platform
|
|||||||
if not current_platform.is_dcu():
|
if not current_platform.is_dcu():
|
||||||
from fastdeploy.spec_decode import MTPProposer, NgramProposer
|
from fastdeploy.spec_decode import MTPProposer, NgramProposer
|
||||||
|
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
from fastdeploy.worker.model_runner_base import ModelRunnerBase
|
from fastdeploy.worker.model_runner_base import ModelRunnerBase
|
||||||
from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
|
from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
|
||||||
|
|
||||||
|
@@ -37,7 +37,7 @@ from fastdeploy.model_executor.pre_and_post_process import (post_process,
|
|||||||
pre_process,
|
pre_process,
|
||||||
rebuild_padding,
|
rebuild_padding,
|
||||||
step_cuda)
|
step_cuda)
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
from fastdeploy.worker.model_runner_base import ModelRunnerBase
|
from fastdeploy.worker.model_runner_base import ModelRunnerBase
|
||||||
from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
|
from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
|
||||||
|
|
||||||
|
@@ -46,7 +46,7 @@ from fastdeploy.model_executor.models.ernie4_5_vl.dfnrope.modeling import \
|
|||||||
from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import (
|
from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import (
|
||||||
ScatterOp, VariableResolutionResamplerModel)
|
ScatterOp, VariableResolutionResamplerModel)
|
||||||
from fastdeploy.platforms import current_platform
|
from fastdeploy.platforms import current_platform
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
from fastdeploy.worker.output import SamplerOutput
|
from fastdeploy.worker.output import SamplerOutput
|
||||||
from fastdeploy.worker.utils import check_safetensors_model
|
from fastdeploy.worker.utils import check_safetensors_model
|
||||||
from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase
|
from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase
|
||||||
|
@@ -31,7 +31,7 @@ from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
|
|||||||
from fastdeploy.model_executor.layers.sample.sampler import Sampler
|
from fastdeploy.model_executor.layers.sample.sampler import Sampler
|
||||||
from fastdeploy.model_executor.model_loader import get_model_from_loader
|
from fastdeploy.model_executor.model_loader import get_model_from_loader
|
||||||
from fastdeploy.utils import get_logger
|
from fastdeploy.utils import get_logger
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta, XPUForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta
|
||||||
from fastdeploy.worker.model_runner_base import ModelRunnerBase
|
from fastdeploy.worker.model_runner_base import ModelRunnerBase
|
||||||
from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
|
from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput
|
||||||
|
|
||||||
|
@@ -21,7 +21,7 @@ import paddle
|
|||||||
|
|
||||||
from fastdeploy.model_executor.layers.attention import (
|
from fastdeploy.model_executor.layers.attention import (
|
||||||
Attention, PaddleNativeAttnBackend)
|
Attention, PaddleNativeAttnBackend)
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta, ForwardMode
|
from fastdeploy.model_executor.forward_meta import ForwardMeta, ForwardMode
|
||||||
|
|
||||||
|
|
||||||
class MockModelRunner:
|
class MockModelRunner:
|
||||||
|
@@ -18,7 +18,7 @@ import paddle
|
|||||||
from fastdeploy.config import FDConfig, GraphOptimizationConfig
|
from fastdeploy.config import FDConfig, GraphOptimizationConfig
|
||||||
from fastdeploy.model_executor.graph_optimization.decorator import \
|
from fastdeploy.model_executor.graph_optimization.decorator import \
|
||||||
support_graph_optimization
|
support_graph_optimization
|
||||||
from fastdeploy.worker.forward_meta import ForwardMeta
|
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||||
|
|
||||||
|
|
||||||
@support_graph_optimization
|
@support_graph_optimization
|
||||||
|
Reference in New Issue
Block a user