diff --git a/fastdeploy/__init__.py b/fastdeploy/__init__.py
index edb2aa43a..ebf2eea5b 100644
--- a/fastdeploy/__init__.py
+++ b/fastdeploy/__init__.py
@@ -28,7 +28,7 @@ from paddleformers.utils.log import logger as pf_logger
 
 from fastdeploy.engine.sampling_params import SamplingParams
 from fastdeploy.entrypoints.llm import LLM
-from fastdeploy.utils import envs
+from fastdeploy.utils import current_package_version, envs
 
 if envs.FD_DEBUG != "1":
     import logging
@@ -43,6 +43,8 @@ except ImportError:
     pass
 # TODO(tangbinhan): remove this code
 
+__version__ = current_package_version()
+
 
 def _patch_fastsafetensors():
     try:
diff --git a/fastdeploy/entrypoints/cli/main.py b/fastdeploy/entrypoints/cli/main.py
index 0686e1e16..b770dc604 100644
--- a/fastdeploy/entrypoints/cli/main.py
+++ b/fastdeploy/entrypoints/cli/main.py
@@ -17,17 +17,19 @@
 # This file is modified from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py
 from __future__ import annotations
 
-import importlib.metadata
+from fastdeploy import __version__
 
 
 def main():
     import fastdeploy.entrypoints.cli.benchmark.main
     import fastdeploy.entrypoints.cli.openai
+    import fastdeploy.entrypoints.cli.serve
     from fastdeploy.utils import FlexibleArgumentParser
 
     CMD_MODULES = [
         fastdeploy.entrypoints.cli.openai,
         fastdeploy.entrypoints.cli.benchmark.main,
+        fastdeploy.entrypoints.cli.serve,
     ]
 
     parser = FlexibleArgumentParser(description="FastDeploy CLI")
@@ -35,7 +37,7 @@ def main():
         "-v",
         "--version",
         action="version",
-        version=importlib.metadata.version("fastdeploy-gpu"),
+        version=__version__,
     )
     subparsers = parser.add_subparsers(required=False, dest="subparser")
     cmds = {}
diff --git a/fastdeploy/entrypoints/cli/openai.py b/fastdeploy/entrypoints/cli/openai.py
index 0ab4c9ae0..7a92925ee 100644
--- a/fastdeploy/entrypoints/cli/openai.py
+++ b/fastdeploy/entrypoints/cli/openai.py
@@ -86,7 +86,7 @@ def _add_query_options(parser: FlexibleArgumentParser) -> FlexibleArgumentParser
     parser.add_argument(
         "--url",
         type=str,
-        default="http://localhost:9904/v1",
+        default="http://localhost:8000/v1",
         help="url of the running OpenAI-Compatible RESTful API server",
     )
     parser.add_argument(
diff --git a/fastdeploy/entrypoints/cli/serve.py b/fastdeploy/entrypoints/cli/serve.py
new file mode 100644
index 000000000..ead694695
--- /dev/null
+++ b/fastdeploy/entrypoints/cli/serve.py
@@ -0,0 +1,84 @@
+"""
+# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+# This file is modified from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py
+
+import argparse
+import atexit
+import os
+import signal
+import subprocess
+import sys
+
+from fastdeploy.entrypoints.cli.types import CLISubcommand
+from fastdeploy.entrypoints.openai.utils import make_arg_parser
+from fastdeploy.utils import FlexibleArgumentParser
+
+
+class ServeSubcommand(CLISubcommand):
+    """The `serve` subcommand for the fastdeploy CLI."""
+
+    name = "serve"
+
+    @staticmethod
+    def cmd(args: argparse.Namespace) -> None:
+        env = os.environ.copy()
+        cmd = [
+            sys.executable,
+            "-m",
+            "fastdeploy.entrypoints.openai.api_server",
+            *sys.argv[2:],
+        ]
+
+        # Start the child process; the CLI tail (sys.argv[2:]) is forwarded to it unparsed.
+        proc = subprocess.Popen(cmd, env=env)
+        print(f"Starting server (PID: {proc.pid})")
+
+        # Cleanup hook shared by the atexit callback and the signal handlers below.
+        def cleanup():
+            """Ask the child process to terminate if it is still running."""
+            if proc.poll() is None:  # poll() returns None while the child is still alive
+                print(f"\nTerminating child process (PID: {proc.pid})...")
+                proc.terminate()  # sends the termination request; the child is not waited on here
+
+        # Run the cleanup when the interpreter exits.
+        atexit.register(cleanup)
+        # Install the signal handling.
+
+        def signal_handler(signum, frame):
+            cleanup()
+            sys.exit(0)
+
+        # Handle SIGINT (Ctrl+C) and SIGTERM.
+        signal.signal(signal.SIGINT, signal_handler)
+        signal.signal(signal.SIGTERM, signal_handler)
+        # Block until the child exits. NOTE(review): its exit status is discarded; confirm that is intended.
+        proc.wait()
+
+    def subparser_init(self, subparsers: argparse._SubParsersAction) -> FlexibleArgumentParser:
+        serve_parser = subparsers.add_parser(
+            name=self.name,
+            help="Start the FastDeploy OpenAI Compatible API server.",
+            description="Start the FastDeploy OpenAI Compatible API server.",
+            usage="fastdeploy serve [model_tag] [options]",
+        )
+        serve_parser = make_arg_parser(serve_parser)
+        serve_parser.add_argument("--config", help="Read CLI options from a config file. Must be a YAML file")
+        return serve_parser
+
+
+def cmd_init() -> list[CLISubcommand]:
+    return [ServeSubcommand()]  # a single subcommand instance: `serve`
diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py
index 9f90fbf10..25a38b89d 100644
--- a/fastdeploy/entrypoints/openai/api_server.py
+++ b/fastdeploy/entrypoints/openai/api_server.py
@@ -49,7 +49,7 @@ from fastdeploy.entrypoints.openai.serving_chat import OpenAIServingChat
 from fastdeploy.entrypoints.openai.serving_completion import OpenAIServingCompletion
 from fastdeploy.entrypoints.openai.serving_models import ModelPath, OpenAIServingModels
 from fastdeploy.entrypoints.openai.tool_parsers import ToolParserManager
-from fastdeploy.entrypoints.openai.utils import UVICORN_CONFIG
+from fastdeploy.entrypoints.openai.utils import UVICORN_CONFIG, make_arg_parser
 from fastdeploy.metrics.metrics import (
     EXCLUDE_LABELS,
     cleanup_prometheus_files,
@@ -67,31 +67,7 @@ from fastdeploy.utils import (
     retrive_model_from_server,
 )
 
-parser = FlexibleArgumentParser()
-parser.add_argument("--port", default=8000, type=int, help="port to the http server")
-parser.add_argument("--host", default="0.0.0.0", type=str, help="host to the http server")
-parser.add_argument("--workers", default=1, type=int, help="number of workers")
-parser.add_argument("--metrics-port", default=8001, type=int, help="port for metrics server")
-parser.add_argument("--controller-port", default=-1, type=int, help="port for controller server")
-parser.add_argument(
-    "--max-waiting-time",
-    default=-1,
-    type=int,
-    help="max waiting time for connection, if set value -1 means no waiting time limit",
-)
-parser.add_argument("--max-concurrency", default=512, type=int, help="max concurrency")
-
-parser.add_argument(
-    "--enable-mm-output", action="store_true", help="Enable 'multimodal_content' field in response output. "
-)
-parser.add_argument(
-    "--timeout-graceful-shutdown",
-    default=0,
-    type=int,
-    help="timeout for graceful shutdown in seconds (used by uvicorn)",
-)
-
-parser = EngineArgs.add_cli_args(parser)
+parser = make_arg_parser(FlexibleArgumentParser())
 args = parser.parse_args()
 
 console_logger.info(f"Number of api-server workers: {args.workers}.")
diff --git a/fastdeploy/entrypoints/openai/utils.py b/fastdeploy/entrypoints/openai/utils.py
index 58855f91e..99212e0ee 100644
--- a/fastdeploy/entrypoints/openai/utils.py
+++ b/fastdeploy/entrypoints/openai/utils.py
@@ -22,7 +22,8 @@ import aiozmq
 import msgpack
 import zmq
 
-from fastdeploy.utils import api_server_logger
+from fastdeploy.engine.args_utils import EngineArgs
+from fastdeploy.utils import FlexibleArgumentParser, api_server_logger
 
 UVICORN_CONFIG = {
     "version": 1,
@@ -201,3 +202,31 @@ class DealerConnectionManager:
             self.request_map.clear()
 
         api_server_logger.info("All connections and tasks closed")
+
+
+def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
+    """Add the api-server options, then the EngineArgs options, to ``parser`` and return the result."""
+    max_waiting_help = "max waiting time for connection, if set value -1 means no waiting time limit"
+    # (flag, argparse keyword arguments), listed in registration order.
+    option_specs = (
+        ("--port", {"default": 8000, "type": int, "help": "port to the http server"}),
+        ("--host", {"default": "0.0.0.0", "type": str, "help": "host to the http server"}),
+        ("--workers", {"default": 1, "type": int, "help": "number of workers"}),
+        ("--metrics-port", {"default": 8001, "type": int, "help": "port for metrics server"}),
+        ("--controller-port", {"default": -1, "type": int, "help": "port for controller server"}),
+        ("--max-waiting-time", {"default": -1, "type": int, "help": max_waiting_help}),
+        ("--max-concurrency", {"default": 512, "type": int, "help": "max concurrency"}),
+        (
+            "--enable-mm-output",
+            {"action": "store_true", "help": "Enable 'multimodal_content' field in response output. "},
+        ),
+        (
+            "--timeout-graceful-shutdown",
+            {"default": 0, "type": int, "help": "timeout for graceful shutdown in seconds (used by uvicorn)"},
+        ),
+    )
+    for flag, options in option_specs:
+        parser.add_argument(flag, **options)
+
+    # Callers receive whatever EngineArgs.add_cli_args hands back for this parser.
+    return EngineArgs.add_cli_args(parser)
diff --git a/fastdeploy/plugins/utils.py b/fastdeploy/plugins/utils.py
index e457223ac..572b1a157 100644
--- a/fastdeploy/plugins/utils.py
+++ b/fastdeploy/plugins/utils.py
@@ -32,7 +32,7 @@ def load_plugins_by_group(group: str) -> dict[str, Callable[[], Any]]:
 
     discovered_plugins = entry_points(group=group)
     if len(discovered_plugins) == 0:
-        logger.info("No plugins for group %s found.", group)
+        logger.debug("No plugins for group %s found.", group)
         return {}
 
     logger.info("Available plugins for group %s:", group)
diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py
index 924d283c3..5975f1a5c 100644
--- a/fastdeploy/utils.py
+++ b/fastdeploy/utils.py
@@ -757,6 +757,36 @@ def version():
     return content
 
 
+def current_package_version():
+    """
+    Extract the FastDeploy version number from the text returned by version().
+
+    Lines of the form "fastdeploy version: <version>" are looked up. When several
+    such lines are present (for example because the file was appended to by
+    repeated builds), the last one is used since it is the most recent entry.
+
+    Returns:
+        str: the FastDeploy version, or "Unknown" when it cannot be determined.
+    """
+    prefix = "fastdeploy version:"
+    fd_version = "Unknown"
+    try:
+        content = version()
+        if content == "Unknown":
+            return fd_version
+
+        # Scan bottom-up so the most recently appended version line is found first.
+        for line in reversed(content.splitlines()):
+            if line.startswith(prefix):
+                return line[len(prefix) :].strip()
+        llm_logger.warning("fastdeploy version not found in version.txt")
+        return fd_version
+    except Exception as e:
+        # Best effort: a malformed version file is logged and reported as "Unknown", never raised.
+        llm_logger.error(f"Failed to parse fastdeploy version from version.txt: {e}")
+        return fd_version
+
+
 class DeprecatedOptionWarning(argparse.Action):
     def __init__(self, option_strings, dest, **kwargs):
         super().__init__(option_strings, dest, nargs=0, **kwargs)
diff --git a/setup.py b/setup.py
index 6c79b6826..41cf71e26 100644
--- a/setup.py
+++ b/setup.py
@@ -190,6 +190,30 @@ cmdclass_dict["build_ext"] = CMakeBuild
 FASTDEPLOY_VERSION = os.environ.get("FASTDEPLOY_VERSION", "2.3.0-dev")
 cmdclass_dict["build_optl"] = PostInstallCommand
 
+
+def write_version_to_file():
+    """
+    Record FASTDEPLOY_VERSION in fastdeploy/version.txt.
+
+    Other lines already in the file are kept. A "fastdeploy version:" line left by an
+    earlier run is replaced rather than duplicated, because setup.py may be executed
+    several times against the same source tree.
+    """
+    prefix = "fastdeploy version:"
+    current_dir = os.path.dirname(os.path.abspath(__file__))
+    version_file_path = os.path.join(current_dir, "fastdeploy/version.txt")
+
+    kept_lines = []
+    if os.path.exists(version_file_path):
+        with open(version_file_path) as f:
+            kept_lines = [line for line in f.read().splitlines() if not line.startswith(prefix)]
+
+    with open(version_file_path, "w") as f:
+        f.write("\n".join([*kept_lines, f"{prefix} {FASTDEPLOY_VERSION}"]) + "\n")
+
+
+write_version_to_file()
+
 setup(
     name=get_name(),
     version=FASTDEPLOY_VERSION,
diff --git a/tests/entrypoints/cli/test_main.py b/tests/entrypoints/cli/test_main.py
index 4b82ecba6..787d3d035 100644
--- a/tests/entrypoints/cli/test_main.py
+++ b/tests/entrypoints/cli/test_main.py
@@ -6,10 +6,8 @@ from fastdeploy.entrypoints.cli.main import main as cli_main
 
 class TestCliMain(unittest.TestCase):
     @patch("fastdeploy.utils.FlexibleArgumentParser")
-    @patch("fastdeploy.entrypoints.cli.main.importlib.metadata")
-    def test_main_basic(self, mock_metadata, mock_parser):
+    def test_main_basic(self, mock_parser):
         # Setup mocks
-        mock_metadata.version.return_value = "1.0.0"
         mock_args = MagicMock()
         mock_args.subparser = None
         mock_parser.return_value.parse_args.return_value = mock_args
@@ -18,7 +16,6 @@ class TestCliMain(unittest.TestCase):
         cli_main()
 
         # Verify version check
-        mock_metadata.version.assert_called_once_with("fastdeploy-gpu")
         mock_args.dispatch_function.assert_called_once()
 
 
diff --git a/tests/entrypoints/cli/test_serve.py b/tests/entrypoints/cli/test_serve.py
new file mode 100644
index 000000000..9c3235183
--- /dev/null
+++ b/tests/entrypoints/cli/test_serve.py
@@ -0,0 +1,46 @@
+import argparse
+import unittest
+from unittest.mock import MagicMock, patch
+
+from fastdeploy.entrypoints.cli.serve import ServeSubcommand, cmd_init
+
+
+class TestServeSubcommand(unittest.TestCase):
+    """Tests for ServeSubcommand class."""
+
+    def test_name_property(self):
+        """Test the name property is correctly set."""
+        self.assertEqual(ServeSubcommand.name, "serve")
+
+    # Patch signal.signal/atexit.register so cmd() cannot install process-wide hooks in the test runner.
+    @patch("signal.signal")
+    @patch("atexit.register")
+    @patch("subprocess.Popen", return_value=MagicMock())
+    def test_cmd_method(self, mock_subprocess, mock_atexit_register, mock_signal):
+        """Test the cmd method launches the API server module in a child process."""
+        mock_subprocess.return_value.pid = 1
+        ServeSubcommand.cmd(argparse.Namespace(port=8000))
+        mock_subprocess.assert_called_once()
+        launched_cmd = mock_subprocess.call_args[0][0]
+        self.assertEqual(launched_cmd[1:3], ["-m", "fastdeploy.entrypoints.openai.api_server"])
+        mock_atexit_register.assert_called_once()
+
+    def test_validate_method(self):
+        """Test the validate method does nothing (no-op)."""
+        ServeSubcommand().validate(argparse.Namespace())  # Should not raise any exceptions
+
+    def test_subparser_init(self):
+        """Test the subparser initialization."""
+        result = ServeSubcommand().subparser_init(MagicMock())
+        self.assertIsNotNone(result)
+
+    def test_cmd_init_returns_list(self):
+        """Test cmd_init returns a list of subcommands."""
+        result = cmd_init()
+        self.assertIsInstance(result, list)
+        self.assertEqual(len(result), 1)
+        self.assertIsInstance(result[0], ServeSubcommand)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/utils/test_version.py b/tests/utils/test_version.py
index b5ea2f4a7..ddbd28992 100644
--- a/tests/utils/test_version.py
+++ b/tests/utils/test_version.py
@@ -1,6 +1,8 @@
 import unittest
+from unittest.mock import patch
 
 import fastdeploy
+from fastdeploy.utils import current_package_version
 
 
 class TestVersion(unittest.TestCase):
@@ -8,6 +10,24 @@ class TestVersion(unittest.TestCase):
         ver = fastdeploy.version()
         assert ver.count("COMMIT") > 0
 
+    @patch("fastdeploy.utils.version")
+    def test_normal_version(self, mock_version):
+        """Test parsing a well-formed version line."""
+        mock_version.return_value = "fastdeploy version: 1.0.0\nother info"
+        self.assertEqual(current_package_version(), "1.0.0")
+
+    @patch("fastdeploy.utils.version")
+    def test_unknown_version(self, mock_version):
+        """Test the case where version() returns Unknown."""
+        mock_version.return_value = "Unknown"
+        self.assertEqual(current_package_version(), "Unknown")
+
+    @patch("fastdeploy.utils.version")
+    def test_no_version_line(self, mock_version):
+        """Test the case where no version line is found."""
+        mock_version.return_value = "some other content"
+        self.assertEqual(current_package_version(), "Unknown")
+
 
 if __name__ == "__main__":
     unittest.main()