[Feature] support pool (#3827)

* support pool

* update pooling

* add pooler_config and check

* update

* support AutoWeightsLoader load weight

* fix

* update

* delete print

* update pre-commit

* fix

* fix xpu

* fix ModelRegistry->model_registry

* fix Copilot review

* fix pooler.py

* delete StepPooler

* fix abstract

* fix default_loader_v1

* fix Pre Commit

* support torch qwen3 dense

* add test and fix torch-qwen

* fix

* fix

* adapt CI

* fix review

* fix pooling_params.py

* fix

* fix tasks.py 2025

* fix print and logger

* Modify ModelRegistry and delete AutoWeightsLoader

* fix logger

* fix test_embedding

* fix ci bug

* ernie4_5 model_registry

* fix test

* support Qwen3-Embedding-0.6B tp=1 load

* fix extra code

* fix

* delete fix vocab_size

* delete prepare_params_dict

* fix:
This commit is contained in:
lizexu123
2025-09-22 14:09:09 +08:00
committed by GitHub
parent da74a5f0b3
commit c86945ef49
36 changed files with 2371 additions and 51 deletions

View File

@@ -18,7 +18,7 @@ Assuming you have a custom model class `MyModelForCasualLM` and a pretrained cla
```python
# File: fd_add_dummy_model/__init__.py or fd_add_dummy_model/register.py
from fastdeploy.model_registry import ModelRegistry
from fastdeploy.model_executor.models.model_base import ModelRegistry
from my_custom_model import MyModelForCasualLM, MyPretrainedModel
from fastdeploy.config import ErnieArchitectures
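def register():
    # Hypothetical completion of the entry point this doc describes; the exact
    # registration calls are assumptions based on how ModelRegistry is used
    # elsewhere in this PR, not a documented API.
    ModelRegistry.register_model_class(MyModelForCasualLM)
    ModelRegistry.register_pretrained_model(MyPretrainedModel)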

View File

@@ -18,7 +18,7 @@ FastDeploy uses Python's `entry_points` mechanism to discover and load plugins.
```python
# File: fd_add_dummy_model/__init__.py
from fastdeploy.model_registry import ModelRegistry
from fastdeploy.model_executor.models.model_base import ModelRegistry
from my_custom_model import MyModelForCasualLM, MyPretrainedModel
def register():

View File

@@ -18,12 +18,14 @@ from __future__ import annotations
import json
import os
from dataclasses import field
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union
import paddle
import paddle.distributed as dist
from paddleformers.transformers.configuration_utils import PretrainedConfig
from typing_extensions import assert_never
import fastdeploy
from fastdeploy import envs
@@ -31,11 +33,68 @@ from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfig
from fastdeploy.multimodal.registry import MultimodalRegistry
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.transformer_utils.config import get_pooling_config
from fastdeploy.utils import ceil_div, check_unified_ckpt, get_host_ip, get_logger
logger = get_logger("config", "config.log")
TaskOption = Literal["generate"]
TaskOption = Literal["auto", "generate", "embedding", "embed"]
RunnerType = Literal["generate", "pooling"]
RunnerOption = Literal["auto", "generate", "pooling"]
ConvertOption = Literal["auto", "none", "embed"]
ConvertType = Literal["none", "embed"]
_ResolvedTask = Literal["generate", "encode", "embed"]
_RUNNER_CONVERTS: dict[RunnerType, list[ConvertType]] = {
"generate": [],
"pooling": ["embed"],
}
# Some model suffixes are based on auto classes from Transformers:
# https://huggingface.co/docs/transformers/en/model_doc/auto
# NOTE: Items higher on this list take priority over lower ones
_SUFFIX_TO_DEFAULTS: list[tuple[str, tuple[RunnerType, ConvertType]]] = [
("ForCausalLM", ("generate", "none")),
("ForConditionalGeneration", ("generate", "none")),
("ChatModel", ("generate", "none")),
("LMHeadModel", ("generate", "none")),
("ForTextEncoding", ("pooling", "embed")),
("EmbeddingModel", ("pooling", "embed")),
("ForSequenceClassification", ("pooling", "classify")),
("ForAudioClassification", ("pooling", "classify")),
("ForImageClassification", ("pooling", "classify")),
("ForVideoClassification", ("pooling", "classify")),
("ClassificationModel", ("pooling", "classify")),
("ForRewardModeling", ("pooling", "reward")),
("RewardModel", ("pooling", "reward")),
# Let other `*Model`s take priority
("Model", ("pooling", "embed")),
]
def iter_architecture_defaults():
yield from _SUFFIX_TO_DEFAULTS
def try_match_architecture_defaults(
architecture: str,
*,
runner_type: Optional[RunnerType] = None,
convert_type: Optional[ConvertType] = None,
):
for suffix, (default_runner_type, default_convert_type) in iter_architecture_defaults():
if (
(runner_type is None or runner_type == default_runner_type)
and (convert_type is None or convert_type == default_convert_type)
and architecture.endswith(suffix)
):
return suffix, (default_runner_type, default_convert_type)
return None
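As a quick illustration, the suffix table above resolves architecture names as follows (the model names here are only examples):

```python
# Illustrative only: suffix matching against _SUFFIX_TO_DEFAULTS.
match = try_match_architecture_defaults("Qwen3ForCausalLM")
# -> ("ForCausalLM", ("generate", "none"))

match = try_match_architecture_defaults("Qwen3Model", runner_type="pooling")
# "Model" is the lowest-priority suffix -> ("Model", ("pooling", "embed"))
```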
class MoEPhase:
@@ -133,6 +192,12 @@ class ModelConfig:
self.eos_tokens_lens: int = 2
self.lm_head_fp32: bool = False
self.model_format = "auto"
self.runner = "auto"
self.convert = "auto"
self.pooler_config: Optional["PoolerConfig"] = field(init=False)
self.override_pooler_config: Optional[Union[dict, "PoolerConfig"]] = None
self.revision = None
self.partial_rotary_factor: float = 1.0
self.num_nextn_predict_layers = 0
for key, value in args.items():
@@ -161,6 +226,7 @@ class ModelConfig:
self.ori_vocab_size = args.get("ori_vocab_size", self.vocab_size)
architectures = self.architectures[0]
if MultimodalRegistry.contains_model(architectures):
self.enable_mm = True
else:
@@ -171,6 +237,43 @@ class ModelConfig:
self.override_name_from_config()
self.read_from_env()
self.read_model_config()
self.runner_type = self._get_runner_type(self.architectures, self.runner)
self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert)
registry = self.registry
is_generative_model = registry.is_text_generation_model(self.architectures, self)
is_pooling_model = registry.is_pooling_model(self.architectures, self)
is_multimodal_model = registry.is_multimodal_model(self.architectures, self)
if self.runner_type == "generate" and not is_generative_model:
if is_multimodal_model:
pass
else:
generate_converts = _RUNNER_CONVERTS["generate"]
if self.convert_type not in generate_converts:
raise ValueError("This model does not support '--runner generate.")
if self.runner_type == "pooling" and not is_pooling_model:
pooling_converts = _RUNNER_CONVERTS["pooling"]
if self.convert_type not in pooling_converts:
convert_option = "<" + "|".join(pooling_converts) + ">"
raise ValueError(
"This model does not support `--runner pooling`. "
f"You can pass `--convert {convert_option} to adapt "
"it into a pooling model."
)
self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type)
model_info, arch = registry.inspect_model_cls(self.architectures, self)
self._model_info = model_info
self._architecture = arch
self.pooler_config = self._init_pooler_config()
@property
def registry(self):
from fastdeploy.model_executor.models.model_base import ModelRegistry
return ModelRegistry()
def override_name_from_config(self):
"""
@@ -194,7 +297,6 @@ class ModelConfig:
def read_from_env(self):
"""
Read configuration information from environment variables and update the object's attributes.
If an attribute is not present or is an empty string in the environment variables, use the default value.
"""
self.max_stop_seqs_num = int(envs.FD_MAX_STOP_SEQS_NUM)
@@ -235,6 +337,165 @@ class ModelConfig:
f"Config file path: {config_path}"
)
def _get_default_runner_type(
self,
architectures: list[str],
) -> RunnerType:
registry = self.registry
if get_pooling_config(self.model, self.revision):
return "pooling"
for arch in architectures:
if arch in registry.get_supported_archs():
if registry.is_pooling_model(architectures, self):
return "pooling"
if registry.is_text_generation_model(architectures, self):
return "generate"
match = try_match_architecture_defaults(arch)
if match:
_, (runner_type, _) = match
return runner_type
return "generate"
def _get_default_convert_type(
self,
architectures: list[str],
runner_type: RunnerType,
) -> ConvertType:
registry = self.registry
for arch in architectures:
if arch in registry.get_supported_archs():
if runner_type == "generate" and registry.is_text_generation_model(architectures, self):
return "none"
if runner_type == "pooling" and registry.is_pooling_model(architectures, self):
return "none"
match = try_match_architecture_defaults(arch, runner_type=runner_type)
if match:
_, (_, convert_type) = match
return convert_type
# This is to handle Sentence Transformers models that use *ForCausalLM
# and also multi-modal pooling models which are not defined as
# Sentence Transformers models
if runner_type == "pooling":
return "embed"
return "none"
def _get_runner_type(
self,
architectures: list[str],
runner: RunnerOption,
) -> RunnerType:
if runner != "auto":
return runner
runner_type = self._get_default_runner_type(architectures)
if runner_type != "generate":
logger.info(
"Resolved `--runner auto` to `--runner %s`. " "Pass the value explicitly to silence this message.",
runner_type,
)
return runner_type
def _get_convert_type(
self,
architectures: list[str],
runner_type: RunnerType,
convert: ConvertOption,
) -> ConvertType:
if convert != "auto":
return convert
convert_type = self._get_default_convert_type(architectures, runner_type)
if convert_type != "none":
logger.info(
"Resolved `--convert auto` to `--convert %s`. " "Pass the value explicitly to silence this message.",
convert_type,
)
return convert_type
def _get_supported_generation_tasks(
self,
architectures: list[str],
convert_type: ConvertType,
) -> list[_ResolvedTask]:
registry = self.registry
supported_tasks = list[_ResolvedTask]()
if registry.is_text_generation_model(architectures, self) or convert_type in _RUNNER_CONVERTS["generate"]:
supported_tasks.append("generate")
# TODO: Temporarily does not support transcription.
return supported_tasks
def _get_default_pooling_task(
self,
architectures: list[str],
) -> Literal["embed"]:
# Temporarily does not support classification and reward.
for arch in architectures:
match = try_match_architecture_defaults(arch, runner_type="pooling")
if match:
_, (_, convert_type) = match
assert convert_type != "none"
return convert_type
return "embed"
def _get_supported_pooling_tasks(
self,
architectures: list[str],
convert_type: ConvertType,
) -> list[_ResolvedTask]:
registry = self.registry
supported_tasks = list[_ResolvedTask]()
if registry.is_pooling_model(architectures, self) or convert_type in _RUNNER_CONVERTS["pooling"]:
supported_tasks.append("encode")
extra_task = self._get_default_pooling_task(architectures) if convert_type == "none" else convert_type
supported_tasks.append(extra_task)
return supported_tasks
def _get_supported_tasks(
self,
architectures: list[str],
runner_type: RunnerType,
convert_type: ConvertType,
) -> list[_ResolvedTask]:
if runner_type == "generate":
return self._get_supported_generation_tasks(architectures, convert_type)
if runner_type == "pooling":
return self._get_supported_pooling_tasks(architectures, convert_type)
assert_never(runner_type)
def _init_pooler_config(self) -> Optional["PoolerConfig"]:
if self.runner_type == "pooling":
if isinstance(self.override_pooler_config, dict):
self.override_pooler_config = PoolerConfig(**self.override_pooler_config)
pooler_config = self.override_pooler_config or PoolerConfig()
base_config = get_pooling_config(self.model, self.revision)
if base_config is not None:
for k, v in base_config.items():
if getattr(pooler_config, k) is None:
setattr(pooler_config, k, v)
default_pooling_type = self._model_info.default_pooling_type
if pooler_config.pooling_type is None:
pooler_config.pooling_type = default_pooling_type
return pooler_config
return None
def _get_download_model(self, model_name, model_type="default"):
# TODO: Provide dynamic graph for self-downloading and save to the specified download directory.
pass
@@ -846,6 +1107,41 @@ class LoadConfig:
setattr(self, key, value)
class PoolerConfig:
"""Controls the behavior of output pooling in pooling models."""
pooling_type: Optional[str] = None
"""
The pooling method of the pooling model.
"""
# for embeddings models
normalize: Optional[bool] = None
"""
Whether to normalize the embedding outputs. Defaults to True.
"""
dimensions: Optional[int] = None
"""
Reduce the dimensions of embeddings if the model
supports matryoshka representation. Defaults to None.
"""
enable_chunked_processing: Optional[bool] = None
"""
Whether to enable chunked processing for long inputs that exceed the model's
maximum position embeddings. When enabled, long inputs will be split into
chunks, processed separately, and then aggregated using weighted averaging.
This allows embedding models to handle arbitrarily long text without CUDA
errors. Defaults to False.
"""
max_embed_len: Optional[int] = None
"""
Maximum input length allowed for embedding generation. When set, allows
inputs longer than max_embed_len to be accepted for embedding models.
When an input exceeds max_embed_len, it will be handled according to
the original max_model_len validation logic.
Defaults to None (i.e. set to max_model_len).
"""
class LoRAConfig:
"""LoRA Config"""

View File

@@ -18,13 +18,14 @@ import argparse
import json
from dataclasses import asdict, dataclass
from dataclasses import fields as dataclass_fields
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Union
import paddle
from fastdeploy import envs
from fastdeploy.config import (
CacheConfig,
ConvertOption,
EarlyStopConfig,
FDConfig,
GraphOptimizationConfig,
@@ -32,6 +33,8 @@ from fastdeploy.config import (
MobaAttentionConfig,
ModelConfig,
ParallelConfig,
PoolerConfig,
RunnerOption,
SpeculativeConfig,
TaskOption,
)
@@ -95,6 +98,20 @@ class EngineArgs:
"""
The task to be executed by the model.
"""
runner: RunnerOption = "auto"
"""
The type of model runner to use. Each FD instance only supports one model runner,
even if the same model can be used for multiple types.
"""
convert: ConvertOption = "auto"
"""
Convert the model using adapters. The most common use case is to
adapt a text generation model to be used for pooling tasks.
"""
override_pooler_config: Optional[Union[dict, PoolerConfig]] = None
"""
Override configuration for the pooler.
"""
max_num_seqs: int = 8
"""
Maximum number of sequences per iteration.
@@ -473,6 +490,21 @@ class EngineArgs:
default=EngineArgs.task,
help="Task to be executed by the model.",
)
model_group.add_argument(
"--runner",
type=str,
default=EngineArgs.runner,
help="The type of model runner to use",
)
model_group.add_argument(
"--convert", type=str, default=EngineArgs.convert, help="Convert the model using adapters"
)
model_group.add_argument(
"--override-pooler-config",
type=json.loads,
default=EngineArgs.override_pooler_config,
help="Override the pooler configuration with a JSON string.",
)
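A hedged usage sketch for the three new options (the model name and values are illustrative; the equivalent CLI flags are shown in the comment):

```python
# Equivalent CLI flags:
#   --runner pooling --convert embed \
#   --override-pooler-config '{"pooling_type": "MEAN", "normalize": true}'
args = EngineArgs(
    model="Qwen/Qwen3-Embedding-0.6B",   # example embedding model from this PR's tests
    runner="pooling",                     # force the pooling runner
    convert="embed",                      # adapt a generative architecture for embeddings
    override_pooler_config={"pooling_type": "MEAN", "normalize": True},
)
```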
model_group.add_argument(
"--use-warmup",
type=int,

View File

@@ -498,6 +498,9 @@ class LLMEngine:
f" --load_choices {self.cfg.load_config.load_choices}"
f" --moba_attention_config '{self.cfg.moba_attention_config.to_json_string()}'"
f" --ips {ips}"
f" --runner {self.cfg.model_config.runner}"
f" --convert {self.cfg.model_config.convert}"
f" --override-pooler-config {self.cfg.model_config.override_pooler_config}"
)
worker_append_flag = {

View File

@@ -0,0 +1,170 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from copy import deepcopy
from typing import TYPE_CHECKING, Annotated, Any, Optional
import msgspec
from fastdeploy.engine.sampling_params import RequestOutputKind
from fastdeploy.engine.tasks import PoolingTask
if TYPE_CHECKING:
from fastdeploy.config import ModelConfig
class PoolingParams:
"""API parameters for pooling models.
Attributes:
normalize: Whether to normalize the embedding outputs.
dimensions: Reduce the dimensions of embeddings
if the model supports matryoshka representation.
activation: Whether to apply activation function to
the classification outputs.
softmax: Whether to apply softmax to the reward outputs.
step_tag_id: Step tag ID for process reward models to identify
specific steps in multi-step reasoning tasks.
returned_token_ids: List of token IDs to return rewards for,
used for fine-grained reward calculation.
task: Internal use only. Specifies the pooling task type
("embed" for embeddings, "encode" for reward models).
requires_token_ids: Internal use only. Whether token ID information
is required for processing.
extra_kwargs: Internal use only. Dictionary for storing additional
custom parameters for extended functionality.
output_kind: Output type specification, fixed to FINAL_ONLY
(only final outputs are returned).
"""
truncate_prompt_tokens: Optional[Annotated[int, msgspec.Meta(ge=-1)]] = None
"""If set to -1, will use the truncation size supported by the model. If
set to an integer k, will use only the last k tokens from the prompt
(i.e., left truncation). If set to `None`, truncation is disabled."""
# for embeddings models
dimensions: Optional[int] = None
normalize: Optional[bool] = None
# for reward models
softmax: Optional[bool] = None
step_tag_id: Optional[int] = None
returned_token_ids: Optional[list[int]] = None
task: Optional[PoolingTask] = None
"""Internal use only."""
requires_token_ids: bool = False
"""Internal use only."""
extra_kwargs: Optional[dict[str, Any]] = None
"""Internal use only."""
output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY
@property
def _all_parameters(self) -> list[str]:
return ["dimensions", "normalize", "softmax", "step_tag_id", "returned_token_ids"]
@property
def valid_parameters(self):
return {
"embed": ["dimensions", "normalize"],
"encode": ["softmax", "step_tag_id", "returned_token_ids"],
}
def clone(self) -> "PoolingParams":
"""Returns a deep copy of the PoolingParams instance."""
return deepcopy(self)
def verify(self, task: PoolingTask, model_config: Optional["ModelConfig"] = None) -> None:
if self.task is None:
self.task = task
elif self.task != task:
msg = f"You cannot overwrite {self.task=!r} with {task=!r}!"
raise ValueError(msg)
# NOTE: Task validation needs to be done against the model instance,
# which is not available in model config. So, it's not included
# in this method
self._merge_default_parameters(model_config)
self._set_default_parameters(model_config)
self._verify_valid_parameters()
def _merge_default_parameters(self, model_config: Optional["ModelConfig"] = None) -> None:
if model_config is None:
return
pooler_config = model_config.pooler_config
if pooler_config is None:
return
assert self.task is not None, "task must be set"
valid_parameters = self.valid_parameters[self.task]
for k in valid_parameters:
if getattr(pooler_config, k, None) is None:
continue
if getattr(self, k, None) is None:
setattr(self, k, getattr(pooler_config, k))
def _set_default_parameters(self, model_config: Optional["ModelConfig"]):
if self.task == "embed":
if self.normalize is None:
self.normalize = True
elif self.task == "encode":
if self.softmax is None:
self.softmax = True
else:
raise ValueError(f"Unknown pooling task: {self.task}")
def _verify_valid_parameters(self):
assert self.task is not None, "task must be set"
valid_parameters = self.valid_parameters[self.task]
invalid_parameters = []
for k in self._all_parameters:
if k in valid_parameters:
continue
if getattr(self, k, None) is not None:
invalid_parameters.append(k)
if invalid_parameters:
raise ValueError(
f"Task {self.task} only supports {valid_parameters} "
f"parameters, does not support "
f"{invalid_parameters} parameters"
)
def __repr__(self) -> str:
return (
f"PoolingParams("
f"task={self.task}, "
f"normalize={self.normalize}, "
f"dimensions={self.dimensions}, "
f"softmax={self.softmax}, "
f"step_tag_id={self.step_tag_id}, "
f"returned_token_ids={self.returned_token_ids}, "
f"requires_token_ids={self.requires_token_ids}, "
f"extra_kwargs={self.extra_kwargs})"
)
def __post_init__(self) -> None:
assert self.output_kind == RequestOutputKind.FINAL_ONLY, "For pooling output_kind has to be FINAL_ONLY"
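A minimal sketch of the verification flow above, assuming `PoolingParams` is constructed with keyword arguments and no `ModelConfig` is available (so only the per-task defaults apply):

```python
# An embedding request's pooling parameters being verified.
params = PoolingParams(dimensions=256)   # matryoshka truncation to 256 dims
params.verify(task="embed")              # fixes the task and defaults normalize to True

# Reward-model-only fields are rejected for the "embed" task, e.g.
# PoolingParams(step_tag_id=5).verify(task="embed") raises:
#   ValueError: Task embed only supports ['dimensions', 'normalize'] parameters, ...
```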

View File

@@ -18,6 +18,7 @@ from __future__ import annotations
import random
from dataclasses import dataclass, fields
from enum import Enum
from typing import Any, List, Optional, Union
@@ -268,3 +269,12 @@ class GuidedDecodingParams:
"You can only use one kind of guided decoding "
"('json', 'json_object', 'regex', 'choice', 'grammar', 'structural_tag')."
)
class RequestOutputKind(Enum):
# Return entire output so far in every RequestOutput
CUMULATIVE = 0
# Return only deltas in each RequestOutput
DELTA = 1
# Do not return intermediate RequestOutput
FINAL_ONLY = 2

View File

@@ -0,0 +1,25 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Literal, get_args
GenerationTask = Literal["generate"]
GENERATION_TASKS = get_args(GenerationTask)
PoolingTask = Literal["encode", "embed"]
POOLING_TASKS = get_args(PoolingTask)
SupportedTask = Literal[GenerationTask, PoolingTask]

View File

@@ -146,3 +146,26 @@ class SiluAndMul(nn.Layer):
if self.bias is not None:
out = out + self.bias
return out
def get_act_fn(act_fn_name: str) -> nn.Layer:
"""Get an activation function by name."""
act_fn_name = act_fn_name.lower()
if act_fn_name.startswith("paddle.nn.layer"):
activation_name = act_fn_name.split(".")[-1]
if activation_name == "identity":
return nn.Identity()
act_fn_name = activation_name
activation_map = {
"gelu": nn.GELU(),
"relu": nn.ReLU(),
"silu": nn.Silu(),
"tanh": nn.Tanh(),
"sigmoid": nn.Sigmoid(),
}
if act_fn_name in activation_map:
return activation_map[act_fn_name]
else:
raise ValueError(f"Activation function {act_fn_name!r} is not supported.")

View File

@@ -0,0 +1,85 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from dataclasses import dataclass
from typing import Optional
import paddle
from fastdeploy.engine.pooling_params import PoolingParams
@dataclass
class PoolingCursor:
index: list[int]
first_token_indices_gpu: paddle.Tensor
last_token_indices_gpu: paddle.Tensor
prompt_lens_cpu: paddle.Tensor
num_scheduled_tokens_cpu: paddle.Tensor
def __getitem__(self, indices: slice):
return PoolingCursor(
index=self.index[indices],
first_token_indices_gpu=self.first_token_indices_gpu[indices],
last_token_indices_gpu=self.last_token_indices_gpu[indices],
prompt_lens_cpu=self.prompt_lens_cpu[indices],
num_scheduled_tokens_cpu=self.num_scheduled_tokens_cpu[indices],
)
def is_partial_prefill(self):
return not paddle.all(self.prompt_lens_cpu == self.num_scheduled_tokens_cpu).item()
@dataclass
class PoolingMetadata:
"""Tensors for pooling."""
prompt_lens: paddle.Tensor # CPU Tensor
prompt_token_ids: Optional[paddle.Tensor]
pooling_params: list[PoolingParams]
pooling_cursor: Optional[PoolingCursor] = None
def __getitem__(self, indices: slice):
return PoolingMetadata(
prompt_lens=self.prompt_lens[indices],
prompt_token_ids=None if self.prompt_token_ids is None else self.prompt_token_ids[indices],
pooling_params=self.pooling_params[indices],
pooling_cursor=None if self.pooling_cursor is None else self.pooling_cursor[indices],
)
def build_pooling_cursor(self, num_scheduled_tokens: list[int], device: str):
self.pooling_cursor = build_pooling_cursor(num_scheduled_tokens, self.prompt_lens, device)
def build_pooling_cursor(num_scheduled_tokens: list[int], prompt_lens: paddle.Tensor, device: str):
assert len(prompt_lens) == len(num_scheduled_tokens)
n_seq = len(num_scheduled_tokens)
index = list(range(n_seq))
num_scheduled_tokens = paddle.to_tensor(num_scheduled_tokens, device="cpu")
cumsum = paddle.zeros([n_seq + 1], dtype="int64", place=paddle.CPUPlace())
paddle.cumsum(num_scheduled_tokens, axis=0, out=cumsum[1:])
if device == "gpu":
cumsum_device = cumsum.cuda()
else:
cumsum_device = cumsum
return PoolingCursor(
index=index,
first_token_indices_gpu=cumsum_device[:n_seq],
last_token_indices_gpu=cumsum_device[1:] - 1,
prompt_lens_cpu=prompt_lens,
num_scheduled_tokens_cpu=num_scheduled_tokens,
)
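The index math behind `build_pooling_cursor`, shown on illustrative numbers (a minimal sketch, not the production call path):

```python
import paddle

# Three prompts of 4, 2 and 3 tokens, all fully scheduled in one step.
num_scheduled_tokens = paddle.to_tensor([4, 2, 3], dtype="int64")
cumsum = paddle.concat([paddle.zeros([1], dtype="int64"), paddle.cumsum(num_scheduled_tokens)])
first_token_indices = cumsum[:-1]      # [0, 4, 6]
last_token_indices = cumsum[1:] - 1    # [3, 5, 8]
# LastPool gathers hidden_states rows [3, 5, 8]; CLSPool gathers rows [0, 4, 6].
```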

View File

@@ -0,0 +1,550 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from abc import ABC, abstractmethod
from collections.abc import Mapping, Set
from dataclasses import dataclass
from enum import IntEnum
from itertools import groupby
from typing import Callable, Optional, TypeVar, Union, cast
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from fastdeploy.config import FDConfig, ModelConfig, PoolerConfig
from fastdeploy.engine.tasks import PoolingTask
from fastdeploy.model_executor.layers.pool.metadata import (
PoolingCursor,
PoolingMetadata,
PoolingParams,
)
from fastdeploy.model_executor.models.adapters import _load_st_projector
from fastdeploy.output.pooler import PoolerOutput, PoolingSequenceGroupOutput
from fastdeploy.utils import get_logger
logger = get_logger("pooler", "pooler.log")
PoolingFn = Callable[
[Union[paddle.Tensor, list[paddle.Tensor]], PoolingMetadata], Union[paddle.Tensor, list[paddle.Tensor]]
]
ClassifierFn = Callable[[paddle.Tensor], paddle.Tensor]
class PoolingType(IntEnum):
"""Enumeration for different types of pooling methods."""
LAST = 0
ALL = 1
CLS = 2
STEP = 3
MEAN = 4
_T = TypeVar("_T", paddle.Tensor, list[paddle.Tensor])
@dataclass(frozen=True)
class ResolvedPoolingConfig:
pooling_type: PoolingType
task: PoolingTask
@classmethod
def from_config(
cls,
task: PoolingTask,
pooler_config: PoolerConfig,
) -> "ResolvedPoolingConfig":
assert pooler_config.pooling_type is not None
return cls(task=task, pooling_type=PoolingType[pooler_config.pooling_type])
def get_pooling_params(pooling_metadata: PoolingMetadata) -> list[PoolingParams]:
pooling_params = pooling_metadata.pooling_params
return pooling_params
def get_tasks(pooling_metadata: PoolingMetadata) -> list[PoolingTask]:
pooling_params = get_pooling_params(pooling_metadata)
tasks: list[PoolingTask] = [task for pooling_param in pooling_params if (task := pooling_param.task) is not None]
assert len(pooling_params) == len(tasks)
return tasks
def get_prompt_token_ids(pooling_metadata: PoolingMetadata) -> list[paddle.Tensor]:
assert (
pooling_metadata.prompt_token_ids is not None
), "Please set `requires_token_ids=True` in `get_pooling_updates`"
return [pooling_metadata.prompt_token_ids[i, :num] for i, num in enumerate(pooling_metadata.prompt_lens)]
@dataclass(frozen=True)
class PoolingParamsUpdate:
requires_token_ids: bool = False
"""Set this flag to enable `get_prompt_token_ids` for your pooler."""
def apply(self, params: PoolingParams) -> None:
params.requires_token_ids = self.requires_token_ids
class Pooler(nn.Layer, ABC):
"""The interface required for all poolers used in pooling models in FastDeploy."""
@staticmethod
def for_encode(pooler_config: PoolerConfig, model_config: Optional["ModelConfig"] = None):
if pooler_config.pooling_type == "STEP":
return StepPooler()
resolved_config = ResolvedPoolingConfig(task="encode", pooling_type=PoolingType.ALL)
return SimplePooler.from_config(resolved_config, model_config)
@staticmethod
def for_embed(pooler_config: PoolerConfig, model_config: Optional["ModelConfig"] = None):
resolved_config = ResolvedPoolingConfig.from_config(
task="embed",
pooler_config=pooler_config,
)
return SimplePooler.from_config(resolved_config, model_config)
@staticmethod
def for_classify(
pooler_config: PoolerConfig,
classify: Optional[ClassifierFn],
):
pass
@abstractmethod
def get_supported_tasks(self) -> Set[PoolingTask]:
"""Determine which pooling tasks are supported."""
raise NotImplementedError
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
"""
Construct the updated pooling parameters to use for a supported task.
"""
return PoolingParamsUpdate()
@abstractmethod
def forward(
self,
hidden_states: Union[list[paddle.Tensor], paddle.Tensor],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
raise NotImplementedError
class BasePoolerActivation(nn.Layer, ABC):
@abstractmethod
def forward(self, pooled_data: _T) -> _T:
# shape:
# classify (& score) -> (batch_size, num_classes)
# embed -> (batch_size, embedding_dim) or list(embedding_dim)
# (batch_size, dimensions) or list(dimensions) if using MRL
raise NotImplementedError
class PoolerActivation(BasePoolerActivation):
@staticmethod
def wraps(module: nn.Layer):
if isinstance(module, nn.Identity):
return PoolerIdentity()
if isinstance(module, (nn.Sigmoid, nn.Softmax)):
return PoolerClassify()
return LambdaPoolerActivation(module)
@abstractmethod
def forward_chunk(self, pooled_data: paddle.Tensor) -> paddle.Tensor:
raise NotImplementedError
def forward(self, pooled_data: _T) -> _T:
if isinstance(pooled_data, list):
return [self.forward_chunk(data) for data in pooled_data]
return self.forward_chunk(pooled_data)
class PoolerIdentity(PoolerActivation):
def forward_chunk(self, pooled_data: paddle.Tensor) -> paddle.Tensor:
return pooled_data
class PoolerClassify(PoolerActivation):
def __init__(self, *, static_num_labels: bool = True) -> None:
super().__init__()
if static_num_labels:
fd_config = FDConfig()
self.num_labels = getattr(fd_config.model_config, "num_labels", 0)
if self.num_labels == 0:
logger.warning(
"num_labels should be > 0 for classification"
"models, falling back to softmax. "
"Please check if the configuration is correct."
)
else:
self.num_labels = None
def forward_chunk(self, pooled_data: paddle.Tensor) -> paddle.Tensor:
num_labels = self.num_labels if self.num_labels is not None else pooled_data.shape[-1]
if num_labels < 2:
return F.sigmoid(pooled_data.astype("float32")).astype(pooled_data.dtype)
return F.softmax(pooled_data.astype("float32"), axis=-1).astype(pooled_data.dtype)
class LambdaPoolerActivation(PoolerActivation):
def __init__(self, fn: Callable[[paddle.Tensor], paddle.Tensor]):
super().__init__()
self.fn = fn
def forward_chunk(self, pooled_data: paddle.Tensor) -> paddle.Tensor:
return self.fn(pooled_data)
class PoolerHead(nn.Layer):
def __init__(self, activation: PoolerActivation) -> None:
super().__init__()
self.activation = activation
def forward(self, pooled_data: Union[list[paddle.Tensor], paddle.Tensor], pooling_metadata: PoolingMetadata):
return self.activation(pooled_data)
class EmbeddingPoolerHead(PoolerHead):
def __init__(self, model_config: Optional["ModelConfig"] = None) -> None:
super().__init__(activation=PoolerNormalize())
self.projector = _load_st_projector(model_config)
def forward(self, pooled_data: Union[list[paddle.Tensor], paddle.Tensor], pooling_metadata: PoolingMetadata):
if isinstance(pooled_data, list):
pooled_data = paddle.stack(pooled_data)
# pooled_data shape: [batchsize, hidden_dimension]
# Apply ST projector
if self.projector is not None:
projector = cast(nn.Layer, self.projector)
def _proj(x: paddle.Tensor) -> paddle.Tensor:
orig_dtype = x.dtype
y = projector(x.astype("float32"))
return y.astype(orig_dtype)
pooled_data = _proj(pooled_data)
# pooled_data shape: [batchsize, embedding_dimension]
pooling_params = get_pooling_params(pooling_metadata)
# for matryoshka representation
dimensions_list = [pooling_param.dimensions for pooling_param in pooling_params]
if any(d is not None for d in dimensions_list):
# change the output dimension
assert len(pooled_data) == len(dimensions_list)
if len(set(dimensions_list)) == 1 and not isinstance(pooled_data, list):
# if all dimensions are the same
d = dimensions_list[0]
pooled_data = pooled_data[..., :d]
else:
pooled_data = [vecs if d is None else vecs[..., :d] for vecs, d in zip(pooled_data, dimensions_list)]
# for normalize
flags = [p.normalize for p in pooling_params]
if len(set(flags)) == 1:
if flags[0]:
pooled_data = self.activation(pooled_data)
else:
pooled_data = [self.activation(vecs) if f else vecs for vecs, f in zip(pooled_data, flags)]
# pooled_data shape: [batchsize, embedding_dimension]
return pooled_data
class RewardPoolerHead(PoolerHead):
def __init__(self, model_config: Optional["ModelConfig"] = None) -> None:
super().__init__(activation=PoolerClassify(static_num_labels=False))
self.model_config = model_config
def forward(self, pooled_data: Union[list[paddle.Tensor], paddle.Tensor], pooling_metadata: PoolingMetadata):
pooling_params = get_pooling_params(pooling_metadata)
# for softmax
flags = [p.softmax for p in pooling_params]
if len(set(flags)) == 1:
if flags[0]:
pooled_data = self.activation(pooled_data)
else:
pooled_data = [self.activation(vecs) if f else vecs for vecs, f in zip(pooled_data, flags)]
return pooled_data
def build_output(
all_data: Union[paddle.Tensor, list[paddle.Tensor]],
) -> PoolerOutput:
# Pooling models D2H & synchronize occurs here
if isinstance(all_data, list):
all_data = [d.cpu() for d in all_data]
else:
all_data = all_data.cpu()
all_outputs = [PoolingSequenceGroupOutput(data) for data in all_data]
return PoolerOutput(outputs=all_outputs)
class PoolingMethod(nn.Layer, ABC):
@staticmethod
def from_pooling_type(pooling_type: PoolingType) -> "PoolingMethod":
if pooling_type == PoolingType.LAST:
return LastPool()
if pooling_type == PoolingType.ALL:
return AllPool()
if pooling_type == PoolingType.CLS:
return CLSPool()
if pooling_type == PoolingType.MEAN:
return MeanPool()
raise NotImplementedError(f"Unsupported method: {pooling_type}")
class LastPool(PoolingMethod):
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"encode", "embed", "classify", "score"}
def forward_all(
self,
hidden_states: paddle.Tensor,
pooling_cursor: PoolingCursor,
) -> Union[list[paddle.Tensor], paddle.Tensor]:
return hidden_states[pooling_cursor.last_token_indices_gpu]
class AllPool(PoolingMethod):
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"encode"}
def forward_all(
self,
hidden_states: paddle.Tensor,
pooling_cursor: PoolingCursor,
) -> Union[list[paddle.Tensor], paddle.Tensor]:
assert not pooling_cursor.is_partial_prefill(), "partial prefill not supported with ALL pooling"
hidden_states_lst = list(hidden_states.split(pooling_cursor.num_scheduled_tokens_cpu.tolist()))
return [hidden_states_lst[i] for i in pooling_cursor.index]
class MeanPool(PoolingMethod):
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"encode", "embed", "classify", "score"}
def forward_all(
self,
hidden_states: paddle.Tensor,
pooling_cursor: PoolingCursor,
) -> Union[list[paddle.Tensor], paddle.Tensor]:
assert not pooling_cursor.is_partial_prefill(), "partial prefill not supported with MEAN pooling"
if hidden_states.place.is_gpu_place():
prompt_lens = pooling_cursor.prompt_lens_cpu.cuda()
else:
prompt_lens = pooling_cursor.prompt_lens_cpu
# Use float32 for paddle.cumsum in MeanPool,
# otherwise precision will be lost significantly.
cumsum = paddle.cumsum(hidden_states.astype("float32"), axis=0)
start_indices = pooling_cursor.first_token_indices_gpu
end_indices = pooling_cursor.last_token_indices_gpu
return (cumsum[end_indices] - cumsum[start_indices] + hidden_states[start_indices]) / prompt_lens.unsqueeze(1)
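A tiny numeric check of the cumsum trick used by `MeanPool` (illustrative shapes only):

```python
import paddle

# mean over rows [first, last] == (cumsum[last] - cumsum[first] + h[first]) / length
h = paddle.arange(12, dtype="float32").reshape([6, 2])   # 6 tokens, hidden size 2
cumsum = paddle.cumsum(h, axis=0)
first, last, length = 2, 4, 3                             # one prompt spanning rows 2..4
mean_trick = (cumsum[last] - cumsum[first] + h[first]) / length
assert paddle.allclose(mean_trick, h[2:5].mean(axis=0))
```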
class CLSPool(PoolingMethod):
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"encode", "embed", "classify", "score"}
def forward_all(
self,
hidden_states: paddle.Tensor,
pooling_cursor: PoolingCursor,
) -> Union[list[paddle.Tensor], paddle.Tensor]:
assert not pooling_cursor.is_partial_prefill(), "partial prefill not supported with CLS pooling"
return hidden_states[pooling_cursor.first_token_indices_gpu]
class StepPooler(Pooler):
def __init__(
self,
) -> None:
super().__init__()
self.pooling = AllPool()
self.head = RewardPoolerHead()
def extract_states(
self,
hidden_states: Union[paddle.Tensor, list[paddle.Tensor]],
pooling_metadata: PoolingMetadata,
) -> Union[list[paddle.Tensor], paddle.Tensor]:
pooled_data_lst = self.pooling(hidden_states, pooling_metadata)
prompt_token_ids = get_prompt_token_ids(pooling_metadata)
pooled_data = list[paddle.Tensor]()
pooling_params = get_pooling_params(pooling_metadata)
for data, token_id, pooling_param in zip(pooled_data_lst, prompt_token_ids, pooling_params):
step_tag_id = pooling_param.step_tag_id
returned_token_ids = pooling_param.returned_token_ids
if returned_token_ids is not None and len(returned_token_ids) > 0:
data = data[:, returned_token_ids]
if step_tag_id is not None:
data = data[token_id == step_tag_id]
pooled_data.append(data)
return pooled_data
def get_supported_tasks(self) -> Set[PoolingTask]:
return {"encode"}
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
return PoolingParamsUpdate(requires_token_ids=True)
def forward(
self,
hidden_states: Union[paddle.Tensor, list[paddle.Tensor]],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
pooled_data = self.extract_states(hidden_states, pooling_metadata)
pooled_data = self.head(pooled_data, pooling_metadata)
return build_output(pooled_data)
class SimplePooler(Pooler):
"""A layer that pools specific information from hidden states.
This layer does the following:
1. Extracts specific tokens or aggregates data based on pooling method.
2. Normalizes output if specified.
3. Returns structured results as `PoolerOutput`.
"""
@classmethod
def from_config(
cls,
pooler_config: ResolvedPoolingConfig,
model_config: Optional["ModelConfig"] = None,
) -> "SimplePooler":
pooling = PoolingMethod.from_pooling_type(pooler_config.pooling_type)
if pooler_config.task == "embed":
head = EmbeddingPoolerHead(model_config)
elif pooler_config.task == "encode":
head = RewardPoolerHead(model_config)
else:
raise NotImplementedError(f"Unknown task: {pooler_config.task}")
return cls(pooling, head)
def __init__(self, pooling: PoolingMethod, head: PoolerHead) -> None:
super().__init__()
self.pooling = pooling
self.head = head
def get_supported_tasks(self) -> Set[PoolingTask]:
return self.pooling.get_supported_tasks()
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
return self.pooling.get_pooling_updates(task)
def forward(
self,
hidden_states: Union[paddle.Tensor, list[paddle.Tensor]],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
pooled_data = self.pooling(hidden_states, pooling_metadata)
pooled_data = self.head(pooled_data, pooling_metadata)
return build_output(pooled_data)
class PoolerNormalize(PoolerActivation):
def forward_chunk(self, pooled_data: paddle.Tensor) -> paddle.Tensor:
x = F.normalize(pooled_data.astype("float32"), p=2, axis=-1)
return x.astype(pooled_data.dtype)
class DispatchPooler(Pooler):
"""Dispatches calls to a sub-pooler based on the pooling task."""
def __init__(self, poolers_by_task: Mapping[PoolingTask, Pooler]) -> None:
super().__init__()
for task, pooler in poolers_by_task.items():
if task not in pooler.get_supported_tasks():
raise ValueError(
f"{pooler=} does not support {task=}. " f"Supported tasks: {pooler.get_supported_tasks()}"
)
self.poolers_by_task = poolers_by_task
def get_supported_tasks(self) -> Set[PoolingTask]:
return set(self.poolers_by_task)
def get_pooling_updates(self, task: PoolingTask) -> PoolingParamsUpdate:
return self.poolers_by_task[task].get_pooling_updates(task)
def forward(
self,
hidden_states: Union[paddle.Tensor, list[paddle.Tensor]],
pooling_metadata: PoolingMetadata,
) -> PoolerOutput:
poolers_by_task = self.poolers_by_task
outputs = list[PoolingSequenceGroupOutput]()
offset = 0
for task, group in groupby(get_tasks(pooling_metadata)):
if not (pooler := poolers_by_task.get(task)):
raise ValueError(f"Unsupported task: {task} " f"Supported tasks: {self.get_supported_tasks()}")
num_items = len(list(group))
group_output: PoolerOutput = pooler(
hidden_states,
pooling_metadata[offset : offset + num_items],
)
outputs.extend(group_output.outputs)
offset += num_items
return PoolerOutput(outputs)
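A hedged sketch of how the pieces above are wired together for an embedding model; `pooler_config` and `model_config` are assumed to come from an `FDConfig`, as in the adapter code later in this PR:

```python
# Mirrors ModelForEmbedding._init_pooler in the adapters module below.
pooler = DispatchPooler(
    {
        "encode": Pooler.for_encode(pooler_config, model_config),
        "embed": Pooler.for_embed(pooler_config, model_config),
    }
)
# With pooler_config.pooling_type == "LAST", Pooler.for_embed resolves to
# SimplePooler(LastPool(), EmbeddingPoolerHead(model_config)), and forward()
# returns a PoolerOutput of normalized sentence embeddings.
```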

View File

@@ -61,6 +61,7 @@ class DefaultModelLoader(BaseModelLoader):
fd_config,
return_numpy=True,
)
model.set_state_dict(state_dict)
self.clean_memory_fragments(state_dict)

View File

@@ -16,7 +16,7 @@
import paddle
from paddle import nn
from paddleformers.utils.log import logger
from typing_extensions import assert_never
from fastdeploy.config import FDConfig, LoadConfig, ModelConfig
from fastdeploy.model_executor.load_weight_utils import (
@@ -27,6 +27,7 @@ from fastdeploy.model_executor.load_weight_utils import (
save_model,
)
from fastdeploy.model_executor.model_loader.base_loader import BaseModelLoader
from fastdeploy.model_executor.models.adapters import as_embedding_model
from fastdeploy.model_executor.models.model_base import ModelRegistry
from fastdeploy.platforms import current_platform
@@ -54,11 +55,11 @@ class DefaultModelLoaderV1(BaseModelLoader):
load_weights_form_cache(model, weights_iterator)
else:
model.load_weights(weights_iterator)
self.clean_memory_fragments()
def load_model(self, fd_config: FDConfig) -> nn.Layer:
architectures = fd_config.model_config.architectures[0]
logger.info(f"Starting to load model {architectures}")
context = paddle.LazyGuard()
if fd_config.load_config.dynamic_load_weight:
# register rl model
@@ -70,6 +71,14 @@ class DefaultModelLoaderV1(BaseModelLoader):
with weight_cache_context:
with context:
model_cls = ModelRegistry.get_class(architectures)
convert_type = fd_config.model_config.convert_type
if convert_type == "none":
pass
elif convert_type == "embed":
model_cls = as_embedding_model(model_cls)
else:
assert_never(convert_type)
model = model_cls(fd_config)
model.eval()

View File

@@ -47,8 +47,10 @@ def auto_models_registry(dir_path, register_path="fastdeploy.model_executor.mode
module = importlib.import_module(f"{register_path}.{module_file}")
for attr_name in dir(module):
attr = getattr(module, attr_name)
if inspect.isclass(attr) and issubclass(attr, ModelForCasualLM) and attr is not ModelForCasualLM:
ModelRegistry.register_model_class(attr)
if (
inspect.isclass(attr)
and issubclass(attr, PretrainedModel)
@@ -56,6 +58,7 @@ def auto_models_registry(dir_path, register_path="fastdeploy.model_executor.mode
and hasattr(attr, "arch_name")
):
ModelRegistry.register_pretrained_model(attr)
except ImportError:
raise ImportError(f"{module_file=} import error")

View File

@@ -0,0 +1,214 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from collections.abc import Iterable
from typing import Optional, TypeVar
import paddle
import paddle.nn as nn
from fastdeploy.config import ModelConfig
from fastdeploy.model_executor.layers.activation import get_act_fn
from fastdeploy.model_executor.models.interfaces_base import is_pooling_model
from fastdeploy.transformer_utils.config import get_hf_file_to_dict
_T = TypeVar("_T", bound=type[nn.Layer])
_GENERATE_SUFFIXES = [
"ForCausalLM",
"ForConditionalGeneration",
"ChatModel",
"LMHeadModel",
]
def _load_dense_weights(linear: nn.Linear, folder: str, model_config: "ModelConfig") -> bool:
"""Load weights using vLLM's weight_loader pattern."""
from fastdeploy.model_executor.utils import default_weight_loader
filename = "model.safetensors"
file_path = f"{folder}/{filename}" if folder else filename
try:
file_bytes = get_hf_file_to_dict(file_path, model_config.model, model_config.revision)
if not file_bytes:
return False
state_dict = {}
if filename.endswith(".safetensors"):
import io
from safetensors.numpy import load as load_safetensors
numpy_tensors = load_safetensors(io.BytesIO(file_bytes))
for key, numpy_array in numpy_tensors.items():
state_dict[key] = paddle.to_tensor(numpy_array)
else:
import io
state_dict = paddle.load(io.BytesIO(file_bytes))
weight_keys = ["weight", "linear.weight", "dense.weight"]
for weight_key in weight_keys:
if weight_key in state_dict:
weight_loader = getattr(linear.weight, "weight_loader", default_weight_loader)
weight_loader(linear.weight, state_dict[weight_key].astype(paddle.float32))
bias_key = weight_key.replace("weight", "bias")
if linear.bias is not None and bias_key in state_dict:
bias_loader = getattr(linear.bias, "weight_loader", default_weight_loader)
bias_loader(linear.bias, state_dict[bias_key].astype(paddle.float32))
return True
except Exception as e:
print(f"Failed to load :{e}")
return False
return False
def _load_st_projector(model_config: "ModelConfig") -> Optional[nn.Layer]:
try:
modules = get_hf_file_to_dict("modules.json", model_config.model, model_config.revision)
if not modules:
return None
if isinstance(modules, dict):
modules = modules.get("modules", [])
dense_modules = [m for m in modules if m.get("type") == "sentence_transformers.models.Dense"]
if not dense_modules:
return None
layers = []
for module in dense_modules:
folder = module.get("path", "")
config_path = f"{folder}/config.json" if folder else "config.json"
layer_config = get_hf_file_to_dict(config_path, model_config.model, model_config.revision)
if not layer_config:
continue
linear = nn.Linear(
layer_config.get("in_features", 768),
layer_config.get("out_features", 768),
bias=layer_config.get("bias", True),
)
linear = linear.astype(paddle.float32)
if not _load_dense_weights(linear, folder, model_config):
continue
layers.append(linear)
if act_name := layer_config.get("activation_function"):
layers.append(get_act_fn(act_name))
return nn.Sequential(*layers).astype(paddle.float32)
except Exception as e:
print(f"ST projector loading failed:{e}")
return None
def _create_pooling_model_cls(orig_cls: _T) -> _T:
class ModelForPooling(orig_cls):
def __init__(self, fd_config, *args, **kwargs):
super().__init__(fd_config, *args, **kwargs)
self.fd_config = fd_config
self.is_pooling_model = True
# These are not used in pooling models
for attr in ("lm_head", "logits_processor"):
if hasattr(self, attr):
delattr(self, attr)
# If the model already defines a pooler instance, don't overwrite it
if not getattr(self, "pooler", None):
self._init_pooler(fd_config)
def _init_pooler(self, fd_config):
raise NotImplementedError
def load_weights(self, weights: Iterable[tuple[str, paddle.Tensor]]):
# TODO: Support uninitialized params tracking
# We have deleted this attribute, so don't load it
weights = ((name, data) for name, data in weights if not name.startswith("lm_head."))
# If `*ForCausalLM` defines `load_weights` on the inner model
# and there are no other inner modules with parameters,
# we support loading from both `*Model` and `*ForCausalLM`
if hasattr(self, "model") and hasattr(self.model, "load_weights"):
# Whether only `self.model` contains parameters
model_is_only_param = all(
name == "model" or not any(child.parameters()) for name, child in self.named_children()
)
if model_is_only_param:
weights = ((name[6:], data) for name, data in weights if name.startswith("model."))
loaded_params = self.model.load_weights(weights)
loaded_params = {f"model.{name}" for name in loaded_params}
return loaded_params
# For most other models
if hasattr(orig_cls, "load_weights"):
return orig_cls.load_weights(self, weights) # type: ignore
# Fallback
else:
raise ValueError("No load_weights method found in the model.")
return ModelForPooling
def _get_pooling_model_name(orig_model_name: str, pooling_suffix: str) -> str:
model_name = orig_model_name
for generate_suffix in _GENERATE_SUFFIXES:
model_name = model_name.removesuffix(generate_suffix)
return model_name + pooling_suffix
def as_embedding_model(cls: _T) -> _T:
"""
Subclass an existing FastDeploy model to support embeddings.
By default, the embeddings of the whole prompt are extracted from the
normalized hidden state corresponding to the last token.
Note:
We assume that no extra layers are added to the original model;
please implement your own model if this is not the case.
"""
# Avoid modifying existing embedding models
if is_pooling_model(cls):
return cls
from fastdeploy.model_executor.layers.pooler import DispatchPooler, Pooler
class ModelForEmbedding(_create_pooling_model_cls(cls)):
def _init_pooler(self, fd_config, prefix: str = ""):
pooler_config = fd_config.model_config.pooler_config
assert pooler_config is not None
self.pooler = DispatchPooler(
{
"encode": Pooler.for_encode(pooler_config, fd_config.model_config),
"embed": Pooler.for_embed(pooler_config, fd_config.model_config),
},
)
ModelForEmbedding.__name__ = _get_pooling_model_name(cls.__name__, "ForEmbedding")
return ModelForEmbedding
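A hypothetical usage of `as_embedding_model`, mirroring what `DefaultModelLoaderV1` does when `convert_type == "embed"` (the Qwen3 class name and `fd_config` object here are assumptions for illustration):

```python
from fastdeploy.model_executor.models.adapters import as_embedding_model

Qwen3ForEmbedding = as_embedding_model(Qwen3ForCausalLM)   # name: "Qwen3" + "ForEmbedding"
model = Qwen3ForEmbedding(fd_config)   # pooler built from fd_config.model_config.pooler_config
```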

View File

@@ -48,7 +48,11 @@ from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.layers.rotary_embedding import (
DeepseekScalingRotaryEmbedding,
)
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.model_base import (
ModelCategory,
ModelForCasualLM,
ModelRegistry,
)
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
@@ -588,6 +592,12 @@ class DeepSeekV3Model(nn.Layer):
return out
@ModelRegistry.register_model_class(
architecture="DeepseekV3ForCausalLM",
module_path="deepseek_v3",
category=ModelCategory.TEXT_GENERATION,
primary_use=ModelCategory.TEXT_GENERATION,
)
class DeepseekV3ForCausalLM(ModelForCasualLM):
"""
DeepseekV3ForCausalLM

View File

@@ -45,7 +45,11 @@ from fastdeploy.model_executor.layers.linear import (
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.model_base import (
ModelCategory,
ModelForCasualLM,
ModelRegistry,
)
from fastdeploy.model_executor.models.tp_utils import TensorSplitMode as tsm
from fastdeploy.model_executor.models.utils import LayerIdPlaceholder as layerid
from fastdeploy.model_executor.models.utils import WeightMeta
@@ -471,6 +475,12 @@ class Ernie4_5_Model(nn.Layer):
return out
@ModelRegistry.register_model_class(
architecture="Ernie4_5_MoeForCausalLM",
module_path="ernie4_5_moe",
category=ModelCategory.TEXT_GENERATION,
primary_use=ModelCategory.TEXT_GENERATION,
)
class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
"""
Ernie4_5_MoeForCausalLM
@@ -646,6 +656,12 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config)
@ModelRegistry.register_model_class(
architecture="Ernie4_5_ForCausalLM",
module_path="ernie4_5_moe",
category=ModelCategory.TEXT_GENERATION,
primary_use=ModelCategory.TEXT_GENERATION,
)
class Ernie4_5_ForCausalLM(Ernie4_5_MoeForCausalLM):
"""
Ernie4_5_ForCausalLM
@@ -659,6 +675,12 @@ class Ernie4_5_ForCausalLM(Ernie4_5_MoeForCausalLM):
return "Ernie4_5_ForCausalLM"
@ModelRegistry.register_model_class(
architecture="Ernie4_5ForCausalLM",
module_path="ernie4_5_moe",
category=ModelCategory.TEXT_GENERATION,
primary_use=ModelCategory.TEXT_GENERATION,
)
class Ernie4_5ForCausalLM(Ernie4_5_ForCausalLM):
"""
Ernie4_5ForCausalLM 0.3B-PT

View File

@@ -31,7 +31,11 @@ from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.layers.mtp_linear import ParallelEHProjection
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.model_base import (
ModelCategory,
ModelForCasualLM,
ModelRegistry,
)
class Ernie4_5_MTPPretrainedModel(PretrainedModel):
@@ -325,6 +329,12 @@ class Ernie4_5_MTPModel(nn.Layer):
return hidden_states
@ModelRegistry.register_model_class(
architecture="Ernie4_5_MTPForCausalLM",
module_path="ernie4_5_mtp",
category=ModelCategory.TEXT_GENERATION,
primary_use=ModelCategory.TEXT_GENERATION,
)
class Ernie4_5_MTPForCausalLM(ModelForCasualLM):
"""
Ernie4_5_MTPForCausalLM

View File

@@ -44,7 +44,11 @@ from fastdeploy.model_executor.models.ernie4_5_moe import (
Ernie4_5_Attention,
Ernie4_5_MLP,
)
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.model_base import (
ModelCategory,
ModelForCasualLM,
ModelRegistry,
)
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
@@ -792,6 +796,12 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config)
@ModelRegistry.register_model_class(
architecture="Ernie4_5_VLMoeForConditionalGeneration",
module_path="ernie4_5_vl.ernie4_5_vl_moe",
category=ModelCategory.MULTIMODAL,
primary_use=ModelCategory.MULTIMODAL,
)
class Ernie4_5_VLPretrainedModel(PretrainedModel):
"""
Ernie4_5_MoePretrainedModel

View File

@@ -39,7 +39,11 @@ from fastdeploy.model_executor.layers.linear import (
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.model_base import (
ModelCategory,
ModelForCasualLM,
ModelRegistry,
)
class Glm4MoeMLP(nn.Layer):
@@ -363,6 +367,12 @@ class Glm4MoeModel(nn.Layer):
return out
@ModelRegistry.register_model_class(
architecture="Glm4MoeForCausalLM",
module_path="glm4_moe",
category=ModelCategory.TEXT_GENERATION,
primary_use=ModelCategory.TEXT_GENERATION,
)
class Glm4MoeForCausalLM(ModelForCasualLM):
"""
Glm4MoeForCausalLM

View File

@@ -0,0 +1,54 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Type
from paddle import nn
def is_text_generation_model(model_cls: Type[nn.Layer]) -> bool:
from .model_base import ModelForCasualLM
return issubclass(model_cls, ModelForCasualLM)
def is_pooling_model(model_cls: Type[nn.Layer]) -> bool:
class_name = model_cls.__name__
pooling_indicators = ["Embedding", "ForSequenceClassification"]
return (
any(indicator in class_name for indicator in pooling_indicators)
or hasattr(model_cls, "is_embedding_model")
and model_cls.is_embedding_model
)
def is_multimodal_model(class_name: str) -> bool:
multimodal_indicators = ["VL", "Vision", "ConditionalGeneration"]
return any(indicator in class_name for indicator in multimodal_indicators)
def determine_model_category(class_name: str):
from fastdeploy.model_executor.models.model_base import ModelCategory
if any(pattern in class_name for pattern in ["VL", "Vision", "ConditionalGeneration"]):
return ModelCategory.MULTIMODAL
elif any(pattern in class_name for pattern in ["Embedding", "ForSequenceClassification"]):
return ModelCategory.EMBEDDING
return ModelCategory.TEXT_GENERATION
def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str:
if model_cls is not None:
return getattr(model_cls, "default_pooling_type", "LAST")
return "LAST"

View File

@@ -3,40 +3,269 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import importlib
from abc import ABC, abstractmethod
from typing import Dict, Union
from dataclasses import dataclass
from enum import Enum
from functools import lru_cache
from typing import Dict, List, Optional, Tuple, Type, Union
import numpy as np
import paddle
from paddle import nn
from paddleformers.transformers import PretrainedModel
from fastdeploy.config import (
ModelConfig,
iter_architecture_defaults,
try_match_architecture_defaults,
)
from fastdeploy.model_executor.models.interfaces_base import (
determine_model_category,
get_default_pooling_type,
is_multimodal_model,
is_pooling_model,
is_text_generation_model,
)
class ModelCategory(Enum):
TEXT_GENERATION = "text_generation"
MULTIMODAL = "multimodal"
EMBEDDING = "embedding"
@dataclass(frozen=True)
class ModelInfo:
architecture: str
category: ModelCategory
is_text_generation: bool
is_multimodal: bool
is_pooling: bool
module_path: str
default_pooling_type: str
@staticmethod
def from_model_cls(model_cls: Type[nn.Layer], module_path: str = "") -> "ModelInfo":
return ModelInfo(
architecture=model_cls.__name__,
category=determine_model_category(model_cls.__name__),
is_text_generation=is_text_generation_model(model_cls),
is_multimodal=is_multimodal_model(model_cls.__name__),
is_pooling=is_pooling_model(model_cls),
default_pooling_type=get_default_pooling_type(model_cls),
module_path=module_path,
)
class BaseRegisteredModel(ABC):
"""Base class for registered models"""
@abstractmethod
def load_model_cls(self) -> Type[nn.Layer]:
raise NotImplementedError
@abstractmethod
def inspect_model_cls(self) -> ModelInfo:
raise NotImplementedError
@dataclass(frozen=True)
class LazyRegisteredModel(BaseRegisteredModel):
"""Lazy loaded model"""
module_name: str
class_name: str
def load_model_cls(self) -> Type[nn.Layer]:
try:
full_module = f"fastdeploy.model_executor.models.{self.module_name}"
module = importlib.import_module(full_module)
return getattr(module, self.class_name)
except (ImportError, AttributeError) as e:
raise ImportError(f"Failed to load {self.class_name}: {e}")
def inspect_model_cls(self) -> ModelInfo:
model_cls = self.load_model_cls()
return ModelInfo.from_model_cls(model_cls, self.module_name)
@dataclass(frozen=True)
class RegisteredModel(BaseRegisteredModel):
model_cls: Type[nn.Layer]
def load_model_cls(self) -> Type[nn.Layer]:
return self.model_cls
def inspect_model_cls(self) -> ModelInfo:
return ModelInfo.from_model_cls(self.model_cls)
@lru_cache(maxsize=128)
def _try_inspect_model_cls(
model_arch: str,
model: BaseRegisteredModel,
) -> Optional[ModelInfo]:
try:
return model.inspect_model_cls()
except Exception:
print("Error in inspecting model architecture '%s'", model_arch)
return None
class ModelRegistry:
"""
Used to register and retrieve model classes.
"""
_arch_to_model_cls = {}
_arch_to_pretrained_model_cls = {}
_enhanced_models: Dict[str, Dict] = {}
def __init__(self):
self.models: Dict[str, BaseRegisteredModel] = {}
self.pretrained_models: Dict[str, Type[PretrainedModel]] = {}
self._registered_models: Dict[str, BaseRegisteredModel] = {}
self._register_enhanced_models()
def _register_enhanced_models(self):
for arch, model_info in self._enhanced_models.items():
model = LazyRegisteredModel(module_name=model_info["module_path"], class_name=model_info["class_name"])
self.models[arch] = model
self._registered_models[arch] = model
@lru_cache(maxsize=128)
def _try_load_model_cls(self, architecture: str) -> Optional[Type[nn.Layer]]:
if architecture not in self.models:
return None
try:
return self.models[architecture].load_model_cls()
except Exception as e:
print(f"Failed to load model {architecture}: {e}")
return None
@lru_cache(maxsize=128)
def _try_inspect_model_cls(self, model_arch: str) -> Optional[ModelInfo]:
if model_arch not in self.models:
return None
try:
return self.models[model_arch].inspect_model_cls()
except Exception as e:
print(f"Failed to inspect model {model_arch}: {e}")
return None
def _normalize_arch(self, architecture: str, model_config: ModelConfig) -> str:
if architecture in self.models:
return architecture
match = try_match_architecture_defaults(
architecture,
runner_type=getattr(model_config, "runner_type", None),
convert_type=getattr(model_config, "convert_type", None),
)
if match:
suffix, _ = match
for repl_suffix, _ in iter_architecture_defaults():
base_arch = architecture.replace(suffix, repl_suffix)
if base_arch in self.models:
return base_arch
return architecture
def _raise_for_unsupported(self, architectures: list[str]):
all_supported_archs = self.get_supported_archs()
if any(arch in all_supported_archs for arch in architectures):
raise ValueError(
f"Model architectures {architectures} failed to be inspected. "
"Please check the logs for more details."
)
raise ValueError(
f"Model architectures {architectures} are not supported for now. "
f"Supported architectures: {all_supported_archs}"
)
def inspect_model_cls(
self, architectures: Union[str, List[str]], model_config: ModelConfig = None
) -> Tuple[ModelInfo, str]:
if isinstance(architectures, str):
architectures = [architectures]
if not architectures:
raise ValueError("No model architectures are specified")
for arch in architectures:
normalized_arch = self._normalize_arch(arch, model_config)
model_info = self._try_inspect_model_cls(normalized_arch)
if model_info is not None:
return (model_info, arch)
return self._raise_for_unsupported(architectures)
@classmethod
def register_model_class(cls, model_class):
"""register model class"""
if issubclass(model_class, ModelForCasualLM) and model_class is not ModelForCasualLM:
cls._arch_to_model_cls[model_class.name()] = model_class
return model_class
def register_model_class(
cls,
model_class=None,
*,
architecture: str = None,
module_path: str = None,
category: Union[ModelCategory, List[ModelCategory]] = ModelCategory.TEXT_GENERATION,
primary_use: ModelCategory = None,
):
"""
Enhanced model class registration supporting both traditional and decorator-style registration.
Can be used as:
1. Traditional decorator: @ModelRegistry.register_model_class
2. Enhanced decorator with metadata: @ModelRegistry.register_model_class(architecture="...", module_path="...")
Args:
model_class: The model class (when used as simple decorator)
architecture (str): Unique identifier for the model architecture
module_path (str): Relative path to the module containing the model
category: Model category or list of categories
primary_use: Primary category for multi-category models
"""
def _register(model_cls):
# Traditional registration for ModelForCasualLM subclasses
if issubclass(model_cls, ModelForCasualLM) and model_cls is not ModelForCasualLM:
cls._arch_to_model_cls[model_cls.name()] = model_cls
# Enhanced decorator-style registration
if architecture and module_path:
categories = category if isinstance(category, list) else [category]
# Register main entry
arch_key = architecture
cls._enhanced_models[arch_key] = {
"class_name": model_cls.__name__,
"module_path": module_path,
"category": primary_use or categories[0],
"class": model_cls,
}
# Register category-specific entries for multi-category models
if len(categories) > 1:
for cat in categories:
key = f"{arch_key}_{cat.value}"
cls._enhanced_models[key] = {
"class_name": model_cls.__name__,
"module_path": module_path,
"category": cat,
"primary_use": primary_use or categories[0],
"class": model_cls,
}
return model_cls
if model_class is not None:
return _register(model_class)
else:
return _register
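
As the docstring above notes, the classmethod supports both calling conventions; a hedged sketch with invented classes (the `module_path` below is hypothetical and only meaningful if such a module actually exists under fastdeploy/model_executor/models/):

```python
# Sketch only: MyForCausalLM / MyEmbedForCausalLM and the module_path are invented.
from fastdeploy.model_executor.models.model_base import (
    ModelCategory,
    ModelForCasualLM,
    ModelRegistry,
)


# 1. Traditional decorator: registers a concrete ModelForCasualLM subclass by its name().
@ModelRegistry.register_model_class
class MyForCausalLM(ModelForCasualLM):
    @classmethod
    def name(cls):
        return "MyForCausalLM"


# 2. Enhanced decorator with metadata: additionally records a lazy-import entry
#    (and one extra entry per category when several categories are given).
@ModelRegistry.register_model_class(
    architecture="MyEmbedForCausalLM",
    module_path="my_embed_model",  # hypothetical module path
    category=[ModelCategory.TEXT_GENERATION, ModelCategory.EMBEDDING],
    primary_use=ModelCategory.TEXT_GENERATION,
)
class MyEmbedForCausalLM(ModelForCasualLM):
    @classmethod
    def name(cls):
        return "MyEmbedForCausalLM"
```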
@classmethod
def register_pretrained_model(cls, pretrained_model):
@@ -50,11 +279,6 @@ class ModelRegistry:
return pretrained_model
@classmethod
def get_pretrain_cls(cls, architectures: str):
"""get_pretrain_cls"""
return cls._arch_to_pretrained_model_cls[architectures]
@classmethod
def get_class(cls, name):
"""get model class"""
@@ -62,12 +286,61 @@ class ModelRegistry:
raise ValueError(f"Model '{name}' is not registered!")
return cls._arch_to_model_cls[name]
@classmethod
def get_pretrain_cls(cls, architectures: str):
"""get_pretrain_cls"""
return cls._arch_to_pretrained_model_cls[architectures]
@classmethod
def get_supported_archs(cls):
assert len(cls._arch_to_model_cls) >= len(
cls._arch_to_pretrained_model_cls
), "model class num is more than pretrained model registry num"
return [key for key in cls._arch_to_model_cls.keys()]
traditional_archs = list(cls._arch_to_model_cls.keys())
enhanced_archs = list(cls._enhanced_models.keys())
return traditional_archs + enhanced_archs
def resolve_model_cls(self, architectures: Union[str, List[str]]) -> Tuple[Type[nn.Layer], str]:
"""Resolve model class"""
if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures:
model_cls = self._try_load_model_cls(arch)
if model_cls is not None:
return model_cls, arch
raise ValueError(f"Cannot find supported model: {architectures}")
def is_multimodal_model(self, architectures: Union[str, List[str]], model_config: ModelConfig = None) -> bool:
"""Check if it's a multimodal model"""
if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
return model_info.is_multimodal
return False
def is_text_generation_model(self, architectures: Union[str, List[str]], model_config: ModelConfig = None) -> bool:
"""Check if it's a text generation model"""
if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
return model_info.is_text_generation
return False
def is_pooling_model(self, architectures: Union[str, List[str]], model_config: ModelConfig = None) -> bool:
"""Check if it's a pooling model"""
if isinstance(architectures, str):
architectures = [architectures]
for arch in architectures:
model_info = self._try_inspect_model_cls(arch)
if model_info is not None:
return model_info.is_pooling
return False
class ModelForCasualLM(nn.Layer, ABC):
@@ -88,7 +361,6 @@ class ModelForCasualLM(nn.Layer, ABC):
def set_state_dict(self, state_dict: Dict[str, Union[np.ndarray, paddle.Tensor]]):
"""
Load model parameters from a given state dictionary.
Args:
state_dict (dict[str, np.ndarray | paddle.Tensor]):
A dictionary containing model parameters, where keys are parameter names
@@ -105,12 +377,10 @@ class ModelForCasualLM(nn.Layer, ABC):
):
"""
Defines the forward pass of the model for generating text.
Args:
input_ids (Tensor, optional): The input token ids to the model.
pos_emb (Tensor, optional): position Embeddings for model.
**model_kwargs: Additional keyword arguments for the model.
Returns:
Tensor or list of Tensors: Generated tokens or decoded outputs.
"""

View File

@@ -39,7 +39,11 @@ from fastdeploy.model_executor.layers.linear import (
)
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.model_base import (
ModelCategory,
ModelForCasualLM,
ModelRegistry,
)
class Qwen2MLP(nn.Layer):
@@ -282,6 +286,12 @@ class Qwen2Model(nn.Layer):
return out
@ModelRegistry.register_model_class(
architecture="Qwen2ForCausalLM",
module_path="qwen2",
category=[ModelCategory.TEXT_GENERATION, ModelCategory.EMBEDDING],
primary_use=ModelCategory.TEXT_GENERATION,
)
class Qwen2ForCausalLM(ModelForCasualLM):
"""
Qwen2ForCausalLM

View File

@@ -33,7 +33,11 @@ from fastdeploy.model_executor.graph_optimization.decorator import (
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.model_base import (
ModelCategory,
ModelForCasualLM,
ModelRegistry,
)
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer
from fastdeploy.platforms import current_platform
@@ -157,6 +161,12 @@ class Qwen2_5_VLModel(nn.Layer):
return out
@ModelRegistry.register_model_class(
architecture="Qwen2_5_VLForConditionalGeneration",
module_path="qwen2_5_vl.qwen2_5_vl",
category=ModelCategory.MULTIMODAL,
primary_use=ModelCategory.MULTIMODAL,
)
class Qwen2_5_VLForConditionalGeneration(ModelForCasualLM):
"""
Qwen2_5_VLForConditionalGeneration

View File

@@ -34,8 +34,13 @@ from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.linear import QKVParallelLinear, RowParallelLinear
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.model_base import (
ModelCategory,
ModelForCasualLM,
ModelRegistry,
)
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP
from fastdeploy.transformer_utils.config import get_pooling_config
class Qwen3MLP(Qwen2MLP):
@@ -218,6 +223,12 @@ class Qwen3Model(nn.Layer):
return out
@ModelRegistry.register_model_class(
architecture="Qwen3ForCausalLM",
module_path="qwen3",
category=[ModelCategory.TEXT_GENERATION],
primary_use=ModelCategory.TEXT_GENERATION,
)
class Qwen3ForCausalLM(ModelForCasualLM):
"""
Qwen3ForCausalLM
@@ -260,6 +271,8 @@ class Qwen3ForCausalLM(ModelForCasualLM):
process_weights_after_loading,
)
is_pooling_model = hasattr(self, "is_pooling_model") and self.is_pooling_model
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -270,8 +283,18 @@ class Qwen3ForCausalLM(ModelForCasualLM):
("embed_tokens.embeddings", "embed_tokens", None),
("lm_head.linear", "lm_head", None),
]
params_dict = dict(self.named_parameters())
model_path = self.fd_config.model_config.model
revision = self.fd_config.model_config.revision
if is_pooling_model and get_pooling_config(model_path, revision):
params_dict = {
param_name[6:] if param_name.startswith("model.") else param_name: param
for param_name, param in params_dict.items()
}
process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()))
for loaded_weight_name, loaded_weight in weights_iterator:
for param_name, weight_name, shard_id in stacked_params_mapping:
if weight_name not in loaded_weight_name:
@@ -282,6 +305,7 @@ class Qwen3ForCausalLM(ModelForCasualLM):
param = params_dict[model_param_name]
weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config))
weight_loader(param, loaded_weight, shard_id)
break
else:
model_param_name = loaded_weight_name
@@ -290,10 +314,11 @@ class Qwen3ForCausalLM(ModelForCasualLM):
param = params_dict[model_param_name]
weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config))
weight_loader(param, loaded_weight)
model_sublayer_name = re.sub(r"\.(weight)$", "", model_param_name)
process_weights_after_loading_fn(model_sublayer_name, param)
if self.tie_word_embeddings:
if self.tie_word_embeddings and not is_pooling_model:
self.lm_head.load_state_dict({self.lm_head.weight_key: self.model.embed_tokens.embeddings.weight})
@paddle.no_grad()
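
The prefix stripping above lets pooling checkpoints whose parameters are saved without the top-level `model.` wrapper still match FastDeploy's parameter names; a quick illustration with invented keys:

```python
# Invented parameter names, only to show the renaming applied for pooling models above.
params_dict = {
    "model.layers.0.self_attn.qkv_proj.weight": object(),
    "lm_head.linear.weight": object(),
}
params_dict = {
    name[6:] if name.startswith("model.") else name: param
    for name, param in params_dict.items()
}
print(list(params_dict))
# ['layers.0.self_attn.qkv_proj.weight', 'lm_head.linear.weight']
```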

View File

@@ -39,7 +39,11 @@ from fastdeploy.model_executor.layers.linear import (
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.model_base import (
ModelCategory,
ModelForCasualLM,
ModelRegistry,
)
from fastdeploy.model_executor.models.qwen3 import Qwen3Attention
@@ -316,6 +320,12 @@ class Qwen3MoeModel(nn.Layer):
return out
@ModelRegistry.register_model_class(
architecture="Qwen3MoeForCausalLM",
module_path="qwen3moe",
category=ModelCategory.TEXT_GENERATION,
primary_use=ModelCategory.TEXT_GENERATION,
)
class Qwen3MoeForCausalLM(ModelForCasualLM):
"""
Qwen3MoeForCausalLM

View File

@@ -158,6 +158,7 @@ def default_weight_loader(fd_config: FDConfig) -> None:
def fn(param, loaded_weight, shard_id: Optional[Union[int, str]] = None):
"""fn"""
output_dim = getattr(param, "output_dim", None)
weight_need_transpose = getattr(param, "weight_need_transpose", False)
if weight_need_transpose:

View File

@@ -0,0 +1,69 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from typing import Any
import msgspec
import paddle
class PoolingSequenceGroupOutput(
msgspec.Struct,
omit_defaults=True,
array_like=True,
):
"""The model output associated with a pooling sequence group."""
# Annotated as Any to be compatible with msgspec
# The actual type is in SequenceGroup.pooled_data
data: Any
def get_data_nbytes(self) -> int:
if isinstance(self.data, paddle.Tensor):
return self.data.numel() * self.data.element_size()
elif hasattr(self.data, "nbytes"):
return self.data.nbytes
else:
return 0
def __repr__(self) -> str:
return f"PoolingSequenceGroupOutput(data={self.data}"
def __eq__(self, other: object) -> bool:
if not isinstance(other, PoolingSequenceGroupOutput):
raise NotImplementedError()
return self.data == other.data
class PoolerOutput(msgspec.Struct, omit_defaults=True, array_like=True):
"""The output from a pooling operation in the pooling model."""
outputs: list[PoolingSequenceGroupOutput]
def get_data_nbytes(self) -> int:
return sum(o.get_data_nbytes() for o in self.outputs)
def __getitem__(self, idx: int) -> PoolingSequenceGroupOutput:
return self.outputs[idx]
def __setitem__(self, idx: int, value: PoolingSequenceGroupOutput):
self.outputs[idx] = value
def __len__(self):
return len(self.outputs)
def __eq__(self, other: object):
return isinstance(other, self.__class__) and self.outputs == other.outputs
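
A minimal usage sketch for these structs, assuming they are imported from wherever this new module lives in the tree; the embedding size is arbitrary.

```python
# Illustration only: wrap two pooled embeddings and measure their payload size.
import paddle

outputs = [
    PoolingSequenceGroupOutput(data=paddle.randn([1024], dtype="float32")),
    PoolingSequenceGroupOutput(data=paddle.randn([1024], dtype="float32")),
]
pooler_output = PoolerOutput(outputs=outputs)

print(len(pooler_output))                  # 2
print(pooler_output[0].get_data_nbytes())  # 4096 bytes (1024 float32 values)
print(pooler_output.get_data_nbytes())     # 8192 bytes in total
```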

View File

@@ -0,0 +1,15 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

View File

@@ -0,0 +1,139 @@
import json
from pathlib import Path
from typing import Optional, Union
import huggingface_hub
from huggingface_hub import hf_hub_download, try_to_load_from_cache
from huggingface_hub.utils import (
EntryNotFoundError,
HfHubHTTPError,
LocalEntryNotFoundError,
RepositoryNotFoundError,
RevisionNotFoundError,
)
from fastdeploy.utils import get_logger
logger = get_logger("transformer_config", "transformer_config.log")
def file_or_path_exists(model, config_name):
if (local_path := Path(model)).exists():
return (local_path / config_name).is_file()
return False
def get_pooling_config_name(pooling_name: str):
if "pooling_mode_" in pooling_name:
pooling_name = pooling_name.replace("pooling_mode_", "")
if "_" in pooling_name:
pooling_name = pooling_name.split("_")[0]
if "lasttoken" in pooling_name:
pooling_name = "last"
supported_pooling_types = ["LAST", "ALL", "CLS", "STEP", "MEAN"]
pooling_type_name = pooling_name.upper()
if pooling_type_name in supported_pooling_types:
return pooling_type_name
raise NotImplementedError(f"Pooling type {pooling_type_name} not supported")
def try_get_local_file(model: Union[str, Path], file_name: str, revision: Optional[str] = "main") -> Optional[Path]:
file_path = Path(model) / file_name
if file_path.is_file():
return file_path
else:
try:
cached_filepath = try_to_load_from_cache(repo_id=model, filename=file_name, revision=revision)
if isinstance(cached_filepath, str):
return Path(cached_filepath)
except ValueError:
...
return None
def get_hf_file_to_dict(file_name: str, model: Union[str, Path], revision: Optional[str] = "main"):
"""
Downloads a file from the Hugging Face Hub and returns
its contents as a dictionary.
Parameters:
- file_name (str): The name of the file to download.
- model (str): The name of the model on the Hugging Face Hub.
- revision (str): The specific version of the model.
Returns:
- config_dict (dict): A dictionary containing
the contents of the downloaded file.
"""
file_path = try_get_local_file(model=model, file_name=file_name, revision=revision)
if file_path is None:
try:
hf_hub_file = hf_hub_download(model, file_name, revision=revision)
except huggingface_hub.errors.OfflineModeIsEnabled:
return None
except (RepositoryNotFoundError, RevisionNotFoundError, EntryNotFoundError, LocalEntryNotFoundError) as e:
logger.debug("File or repository not found in hf_hub_download", e)
return None
except HfHubHTTPError as e:
logger.warning(
"Cannot connect to Hugging Face Hub. Skipping file " "download for '%s':", file_name, exc_info=e
)
return None
file_path = Path(hf_hub_file)
if file_path is not None and file_path.is_file():
with open(file_path) as file:
return json.load(file)
return None
def get_pooling_config(model: str, revision: Optional[str] = "main"):
"""
This function gets the pooling and normalize
config from the model - only applies to
sentence-transformers models.
Args:
model (str): The name of the Hugging Face model.
revision (str, optional): The specific version
of the model to use. Defaults to 'main'.
Returns:
dict: A dictionary containing the pooling
type and whether normalization is used.
"""
modules_file_name = "modules.json"
modules_dict = None
if file_or_path_exists(model, config_name=modules_file_name):
modules_dict = get_hf_file_to_dict(modules_file_name, model)
if modules_dict is None:
return None
pooling = next((item for item in modules_dict if item["type"] == "sentence_transformers.models.Pooling"), None)
normalize = bool(
next((item for item in modules_dict if item["type"] == "sentence_transformers.models.Normalize"), False)
)
if pooling:
pooling_file_name = "{}/config.json".format(pooling["path"])
pooling_dict = get_hf_file_to_dict(pooling_file_name, model)
pooling_type_name = next((item for item, val in pooling_dict.items() if val is True), None)
if pooling_type_name is not None:
pooling_type_name = get_pooling_config_name(pooling_type_name)
return {"pooling_type": pooling_type_name, "normalize": normalize}
return None
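
For a sentence-transformers style checkpoint laid out like the hypothetical example below, the function would report the pooling mode and whether embeddings are normalized:

```python
# Hypothetical local checkpoint (sentence-transformers layout); paths and contents are invented.
#   my-embedding-model/
#     modules.json          -> [..., {"type": "sentence_transformers.models.Pooling", "path": "1_Pooling"},
#                                     {"type": "sentence_transformers.models.Normalize"}]
#     1_Pooling/config.json -> {"pooling_mode_mean_tokens": true, "pooling_mode_cls_token": false}
from fastdeploy.transformer_utils.config import get_pooling_config

print(get_pooling_config("my-embedding-model"))
# {'pooling_type': 'MEAN', 'normalize': True}
```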

View File

@@ -51,6 +51,7 @@ from fastdeploy.entrypoints.openai.protocol import ErrorInfo, ErrorResponse
from fastdeploy.logger.logger import FastDeployLogger
T = TypeVar("T")
from typing import Callable, Optional
# [N,2] -> every line is [config_name, enable_xxx_name]
# Make sure enable_xxx equals config.enable_xxx
@@ -852,3 +853,24 @@ api_server_logger = get_logger("api_server", "api_server.log")
console_logger = get_logger("console", "console.log", print_to_console=True)
spec_logger = get_logger("speculate", "speculate.log")
zmq_client_logger = get_logger("zmq_client", "zmq_client.log")
def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
def _parse_type(val: str) -> T:
try:
return return_type(val)
except ValueError as e:
raise argparse.ArgumentTypeError(f"Value {val} cannot be converted to {return_type}.") from e
return _parse_type
def optional_type(return_type: Callable[[str], T]) -> Callable[[str], Optional[T]]:
def _optional_type(val: str) -> Optional[T]:
if val == "" or val == "None":
return None
return parse_type(return_type)(val)
return _optional_type
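
These helpers back the new `--override-pooler-config` argument below; a small sketch of how `optional_type(json.loads)` behaves with argparse:

```python
import argparse
import json

from fastdeploy.utils import optional_type

parser = argparse.ArgumentParser()
parser.add_argument("--override-pooler-config", type=optional_type(json.loads), default=None)

args = parser.parse_args(["--override-pooler-config", '{"pooling_type": "MEAN", "normalize": true}'])
print(args.override_pooler_config)  # {'pooling_type': 'MEAN', 'normalize': True}

args = parser.parse_args(["--override-pooler-config", "None"])
print(args.override_pooler_config)  # None ("" and "None" are treated as unset)
```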

View File

@@ -1319,8 +1319,12 @@ class GPUModelRunner(ModelRunnerBase):
self.parallel_config.max_model_len,
)
# 4. Execute spec decode
logits = self.model.compute_logits(hidden_states)
logits = None
if hasattr(self.model, "is_pooling_model") and self.model.is_pooling_model:
pass
else:
# 4. Execute spec decode
logits = self.model.compute_logits(hidden_states)
if not self.speculative_decoding:
set_value_by_flags_and_idx(
@@ -1625,8 +1629,13 @@ class GPUModelRunner(ModelRunnerBase):
self.parallel_config.max_model_len,
)
logits = None
# 4. Compute logits, Sample
logits = self.model.compute_logits(hidden_states)
if hasattr(self.model, "is_pooling_model") and self.model.is_pooling_model:
pass
else:
# 4. Compute logits, Sample
logits = self.model.compute_logits(hidden_states)
if not self.speculative_decoding:
set_value_by_flags_and_idx(

View File

@@ -45,7 +45,7 @@ from fastdeploy.inter_communicator import IPCSignal
from fastdeploy.model_executor.layers.quantization import parse_quant_config
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.utils import get_logger
from fastdeploy.utils import get_logger, optional_type
from fastdeploy.worker.worker_base import WorkerBase
logger = get_logger("worker_process", "worker_process.log")
@@ -643,6 +643,27 @@ def parse_args():
help="Flag to specify dtype of lm_head as FP32",
)
parser.add_argument(
"--runner",
type=str,
default="auto",
help="The type of model runner to use.Each FD instance only supports one model runner.even if the same model can be used for multiple types.",
)
parser.add_argument(
"--convert",
type=str,
default="auto",
help="Convert the model using adapters. The most common use case is to adapt a text generation model to be used for pooling tasks.",
)
parser.add_argument(
"--override-pooler-config",
type=optional_type(json.loads),
default=None,
help="Override configuration for the pooler.",
)
args = parser.parse_args()
return args

View File

@@ -39,3 +39,4 @@ opentelemetry-distro 
opentelemetry-exporter-otlp
opentelemetry-instrumentation-fastapi
partial_json_parser
msgspec

View File

@@ -14,9 +14,8 @@
from paddleformers.transformers import PretrainedModel
from fastdeploy import ModelRegistry
from fastdeploy.config import ErnieArchitectures
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.model_base import ModelForCasualLM, ModelRegistry
class MyPretrainedModel(PretrainedModel):

View File

@@ -0,0 +1,182 @@
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import sys
import paddle
import pytest
from fastdeploy.config import (
CacheConfig,
FDConfig,
GraphOptimizationConfig,
LoadConfig,
ModelConfig,
ParallelConfig,
)
from fastdeploy.model_executor.models.model_base import ModelRegistry
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(os.path.join(current_dir, ".."))
if project_root not in sys.path:
sys.path.insert(0, project_root)
from tests.model_loader.utils import get_torch_model_path
class TestModelLoader:
@pytest.fixture(scope="session", autouse=True)
def setup_paddle(self):
if not paddle.is_compiled_with_cuda():
print("CUDA not available, using CPU")
paddle.set_device("cpu")
else:
print("Using CUDA device")
paddle.set_device("gpu")
yield
@pytest.fixture(scope="session")
def model_path(self):
try:
torch_model_path = get_torch_model_path("Qwen3-0.6B")
if os.path.exists(torch_model_path):
return torch_model_path
except Exception as e:
print(f"Could not get torch model path: {e}")
@pytest.fixture
def model_config(self, model_path):
model_args = {
"model": model_path,
"dtype": "bfloat16",
"max_model_len": 8192,
"tensor_parallel_size": 1,
"runner": "auto",
"convert": "auto",
}
try:
return ModelConfig(model_args)
except Exception as e:
print(f"Could not create ModelConfig: {e}")
@pytest.fixture
def fd_config(self, model_config):
try:
cache_args = {
"block_size": 64,
"gpu_memory_utilization": 0.9,
"cache_dtype": "bfloat16",
"model_cfg": model_config,
"tensor_parallel_size": 1,
}
cache_config = CacheConfig(cache_args)
parallel_args = {
"tensor_parallel_size": 1,
"data_parallel_size": 1,
}
parallel_config = ParallelConfig(parallel_args)
load_args = {}
load_config = LoadConfig(load_args)
graph_opt_args = {
"enable_cudagraph": False,
"cudagraph_capture_sizes": None,
}
graph_opt_config = GraphOptimizationConfig(graph_opt_args)
return FDConfig(
model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
load_config=load_config,
graph_opt_config=graph_opt_config,
test_mode=True,
)
except Exception as e:
print(f"Could not create FDConfig: {e}")
@pytest.fixture
def model_json_config(self, model_path):
config_path = os.path.join(model_path, "config.json")
if os.path.exists(config_path):
with open(config_path, "r", encoding="utf-8") as f:
return json.load(f)
return None
def test_embedding_with_none_convert_type(self, fd_config, model_json_config):
if model_json_config is None:
pytest.skip("Model config not available")
if fd_config is None:
pytest.skip("FDConfig not available")
print("=" * 60)
print("Testing initialize_model with convert_type='none'")
print("=" * 60)
architectures = model_json_config.get("architectures", [])
if not architectures:
pytest.skip("No architectures found in model config")
fd_config.model_config.convert_type = "none"
try:
model_cls = ModelRegistry.get_class(architectures[0])
if hasattr(model_cls, "__name__"):
assert (
"ForEmbedding" not in model_cls.__name__
), f"Standard model should not have 'ForEmbedding' in name, but got: {model_cls.__name__}"
print(f"Confirmed standard model type (no ForEmbedding): {model_cls.__name__}")
standard_methods = set(dir(model_cls))
assert "_init_pooler" not in standard_methods, "Standard model should not have _init_pooler method"
except Exception as e:
print(f"Error in none: {e}")
def test_embedding_with_embed_convert_type(self, fd_config, model_json_config):
if model_json_config is None:
pytest.skip("Model config not available")
if fd_config is None:
pytest.skip("FDConfig not available")
print("=" * 60)
print("Testing embedding with convert_type='embed'")
print("=" * 60)
architectures = model_json_config.get("architectures", [])
if not architectures:
pytest.skip("No architectures found in model config")
fd_config.model_config.convert_type = "embed"
try:
model_cls = ModelRegistry.get_class(architectures[0])
if hasattr(model_cls, "__name__"):
assert "ForEmbedding" in model_cls.__name__, "Embedding model should have 'ForEmbedding' in name"
print(f"Confirmed embedding model type: {model_cls.__name__}")
embedding_methods = set(dir(model_cls))
assert "_init_pooler" in embedding_methods, "Embedding model should have _init_pooler method"
except Exception as e:
print(f"Error in convert embed: {e}")