Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
[Feature] [Benchmark]: add ZMQ-based FMQ implementation and benchmark tools (#5418)
* feat(fmq): add ZMQ-based FMQ implementation and benchmark tools
* move FMQ_CONFIG_JSON to envs
* fix top_p_candidates (#5400)
* [RL] Support Rollout Routing Replay (#5321)
  - add routing indices cache
  - fix config bug and moe forward bug
  - R3 support GLM and eb4.5
  - fix merge bug
  - apply suggestions from @Copilot
  - add routing replay ci
  - support glm topk and other top_k
  - fix ci bug, pre-commit
  - only support chatcmpl
* [Bug fix] Fix the multi-input accuracy issue in the pooling model. (#5374)
* [BugFix] remove _execute_empty_input (#5396)
* Revert "[RL] Support Rollout Routing Replay (#5321)" (#5402)
  This reverts commit 96d2d4877b.
* [New][RL] Support Rollout Routing Replay (#5405)
  Re-lands the routing replay change (reverts commit c45e064f3d) and fixes XPU and NPU bugs.
* bf16 deepseek (#5379)
* fix deepseek (#5410)
* Apply review suggestions from @Copilot to tests/inter_communicator/test_fmq_factory.py, benchmarks/benchmark_fmq.py, and fastdeploy/inter_communicator/fmq.py

Co-authored-by: GoldPancake <56388518+Deleter-D@users.noreply.github.com>
Co-authored-by: freeliuzc <lzc842650834@gmail.com>
Co-authored-by: RAM <gstian5555@outlook.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yuanle Liu <yuanlehome@163.com>
Co-authored-by: lizexu123 <39205361+lizexu123@users.noreply.github.com>
Co-authored-by: 周周周 <39978853+zhoutianzi666@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: bukejiyu <52310069+bukejiyu@users.noreply.github.com>
New file: benchmarks/benchmark_fmq.py (233 lines)
@@ -0,0 +1,233 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import asyncio
import multiprocessing as mp
import os
import statistics
import time

from tqdm import tqdm

from fastdeploy.inter_communicator.fmq import FMQ


# ============================================================
# Producer Task
# ============================================================
async def producer_task(proc_id, msg_count, payload_size, shm_threshold, result_q):
    fmq = FMQ()
    q = fmq.queue("mp_bench_latency", role="producer")
    payload = b"x" * payload_size

    # tqdm progress bar
    pbar = tqdm(total=msg_count, desc=f"Producer-{proc_id}", position=proc_id, leave=True, disable=False)

    t0 = time.perf_counter()
    for i in range(msg_count):
        send_ts = time.perf_counter()
        await q.put(data={"pid": proc_id, "i": i, "send_ts": send_ts, "payload": payload}, shm_threshold=shm_threshold)
        pbar.update(1)
        # pbar.write(f"send {i}")
    t1 = time.perf_counter()
    result_q.put({"producer_id": proc_id, "count": msg_count, "time": t1 - t0})

    pbar.close()

    # wait a few seconds before exiting so in-flight messages can still be delivered
    await asyncio.sleep(5)


def producer_process(proc_id, msg_count, payload_size, shm_threshold, result_q):
    async def run():
        await producer_task(proc_id, msg_count, payload_size, shm_threshold, result_q)

    asyncio.run(run())


# ============================================================
# Consumer Task
# ============================================================
async def consumer_task(consumer_id, total_msgs, result_q, consumer_event):
    fmq = FMQ()
    q = fmq.queue("mp_bench_latency", role="consumer")
    consumer_event.set()

    latencies = []
    recv = 0

    # tqdm progress bar
    pbar = tqdm(total=total_msgs, desc=f"Consumer-{consumer_id}", position=consumer_id + 1, leave=True, disable=False)

    first_recv = None
    last_recv = None

    while recv < total_msgs:
        msg = await q.get()
        recv_ts = time.perf_counter()
        if msg is None:
            pbar.write("recv None")
            continue
        if first_recv is None:
            first_recv = recv_ts
        last_recv = recv_ts
        send_ts = msg.payload["send_ts"]
        latencies.append((recv_ts - send_ts) * 1000)  # ms
        pbar.update(1)
        recv += 1

    pbar.close()

    result_q.put(
        {"consumer_id": consumer_id, "latencies": latencies, "first_recv": first_recv, "last_recv": last_recv}
    )


def consumer_process(consumer_id, total_msgs, result_q, consumer_event):
    async def run():
        await consumer_task(consumer_id, total_msgs, result_q, consumer_event)

    asyncio.run(run())


# ============================================================
# MAIN benchmark
# ============================================================
def run_benchmark(
    NUM_PRODUCERS=1,
    NUM_CONSUMERS=1,
    NUM_MESSAGES_PER_PRODUCER=1000,
    PAYLOAD_SIZE=1 * 1024 * 1024,
    SHM_THRESHOLD=1 * 1024 * 1024,
):
    total_messages = NUM_PRODUCERS * NUM_MESSAGES_PER_PRODUCER
    total_bytes = total_messages * PAYLOAD_SIZE

    print(f"\nFastDeploy Message Queue Benchmark, pid:{os.getpid()}")
    print(f"Producers: {NUM_PRODUCERS}")
    print(f"Consumers: {NUM_CONSUMERS}")
    print(f"Messages per producer: {NUM_MESSAGES_PER_PRODUCER}")
    print(f"Total bytes: {total_bytes / 1024 / 1024 / 1024:.2f} GB")
    print(f"Total messages: {total_messages:,}")
    print(f"Payload per message: {PAYLOAD_SIZE / 1024 / 1024:.2f} MB")

    mp.set_start_method("fork")
    manager = mp.Manager()
    result_q = manager.Queue()

    # Signal event: set by consumers once their sockets are ready
    consumer_event = manager.Event()

    procs = []

    # Start Consumers
    msgs_per_consumer = total_messages // NUM_CONSUMERS
    for i in range(NUM_CONSUMERS):
        p = mp.Process(target=consumer_process, args=(i, msgs_per_consumer, result_q, consumer_event))
        procs.append(p)
        p.start()

    consumer_event.wait()

    # Start Producers
    for i in range(NUM_PRODUCERS):
        p = mp.Process(
            target=producer_process, args=(i, NUM_MESSAGES_PER_PRODUCER, PAYLOAD_SIZE, SHM_THRESHOLD, result_q)
        )
        procs.append(p)
        p.start()

    # Join
    for p in procs:
        p.join()

    # Collect results
    producer_stats = []
    consumer_stats = {}

    while not result_q.empty():
        item = result_q.get()
        if "producer_id" in item:
            producer_stats.append(item)
        if "consumer_id" in item:
            consumer_stats[item["consumer_id"]] = item

    # Producer stats
    print("\nProducer Stats:")
    for p in producer_stats:
        throughput = p["count"] / p["time"]
        bandwidth = (p["count"] * PAYLOAD_SIZE) / (1024**2 * p["time"])
        print(
            f"[Producer-{p['producer_id']}] Sent {p['count']:,} msgs "
            f"in {p['time']:.3f} s | Throughput: {throughput:,.0f} msg/s | Bandwidth: {bandwidth:.2f} MB/s"
        )

    # Consumer latency stats
    print("\nConsumer Latency Stats:")
    all_latencies = []
    first_recv_times = []
    last_recv_times = []

    for cid, data in consumer_stats.items():
        lats = data["latencies"]
        if len(lats) == 0:
            continue
        all_latencies.extend(lats)
        first_recv_times.append(data["first_recv"])
        last_recv_times.append(data["last_recv"])

        avg = statistics.mean(lats)
        p50 = statistics.median(lats)
        p95 = statistics.quantiles(lats, n=20)[18]
        p99 = statistics.quantiles(lats, n=100)[98]

        print(
            f"[Consumer-{cid}] msgs={len(lats):5d} | avg={avg:.3f} ms | "
            f"P50={p50:.3f} ms | P95={p95:.3f} ms | P99={p99:.3f} ms"
        )

    # Global summary
    if first_recv_times and last_recv_times:
        total_time = max(last_recv_times) - min(first_recv_times)
        global_throughput = total_messages / total_time
        global_bandwidth = total_bytes / (1024**2 * total_time)

        if all_latencies:
            avg_latency = statistics.mean(all_latencies)
            min_latency = min(all_latencies)
            max_latency = max(all_latencies)
            p50_latency = statistics.median(all_latencies)
            p95_latency = statistics.quantiles(all_latencies, n=20)[18]
            p99_latency = statistics.quantiles(all_latencies, n=100)[98]
        else:
            avg_latency = min_latency = max_latency = p50_latency = p95_latency = p99_latency = 0.0

        print("\nGlobal Summary:")
        print(f"Total messages : {total_messages:,}")
        print(f"Total data : {total_bytes / 1024**2:.2f} MB")
        print(f"Total time : {total_time:.3f} s")
        print(f"Global throughput: {global_throughput:,.0f} msg/s")
        print(f"Global bandwidth : {global_bandwidth:.2f} MB/s")
        print(
            f"Latency (ms) : avg={avg_latency:.3f} "
            f"| min={min_latency:.3f} | max={max_latency:.3f} "
            f"| P50={p50_latency:.3f} | P95={p95_latency:.3f} | P99={p99_latency:.3f}\n"
        )


# Entry
if __name__ == "__main__":
    run_benchmark()
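The benchmark runs with its defaults via `python benchmarks/benchmark_fmq.py`. As a minimal sketch only (it assumes benchmarks/ is importable as a package from the repository root; the parameter values are illustrative, not recommendations), run_benchmark can also be driven with custom settings:

# Sketch: drive the benchmark above with non-default, illustrative settings.
from benchmarks.benchmark_fmq import run_benchmark

if __name__ == "__main__":
    run_benchmark(
        NUM_PRODUCERS=2,  # producer processes
        NUM_CONSUMERS=2,  # consumer processes
        NUM_MESSAGES_PER_PRODUCER=500,  # messages sent by each producer
        PAYLOAD_SIZE=256 * 1024,  # filler bytes attached to each message
        SHM_THRESHOLD=1 * 1024 * 1024,  # put() uses shared memory for bytes payloads at or above this size
    )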
@@ -151,6 +151,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # "Number of tokens in the group for Mixture of Experts (MoE) computation processing on HPU"
    "FD_HPU_CHUNK_SIZE": lambda: int(os.getenv("FD_HPU_CHUNK_SIZE", "64")),
    "FD_PREFILL_WAIT_DECODE_RESOURCE_SECONDS": lambda: int(os.getenv("FD_PREFILL_WAIT_DECODE_RESOURCE_SECONDS", "30")),
    "FMQ_CONFIG_JSON": lambda: os.getenv("FMQ_CONFIG_JSON", None),
}
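FMQ_CONFIG_JSON lets a deployment override the default endpoint map without a config file. As a hedged illustration (the JSON shape mirrors the config used in tests/inter_communicator/test_fmq.py; the endpoint name "my_queue" is made up), the variable can be set before the first FMQ() instance is created:

import json
import os

# Illustrative only: "my_queue" is a made-up endpoint name; keys follow the
# config used in tests/inter_communicator/test_fmq.py.
os.environ["FMQ_CONFIG_JSON"] = json.dumps(
    {
        "ipc_root": "/dev/shm",
        "io_threads": 1,
        "copy": False,
        "endpoints": {
            "my_queue": {"protocol": "ipc", "address": "/dev/shm/fmq_my_queue.ipc", "io_threads": 1, "copy": False},
        },
    }
)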
New file: fastdeploy/inter_communicator/fmq.py (347 lines)
@@ -0,0 +1,347 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import asyncio
import json
import time
import uuid
from dataclasses import dataclass, field
from enum import Enum
from multiprocessing import shared_memory
from multiprocessing.reduction import ForkingPickler
from typing import Any, Callable, Dict, Optional

import zmq
import zmq.asyncio

from fastdeploy import envs
from fastdeploy.utils import fmq_logger

# ==========================
# Config & Enum Definitions
# ==========================


class EndpointType(Enum):
    QUEUE = "queue"
    TOPIC = "topic"


class Role(Enum):
    PRODUCER = "producer"
    CONSUMER = "consumer"


@dataclass
class SocketOptions:
    sndhwm: int = 0
    rcvhwm: int = 0
    linger: int = -1
    sndbuf: int = 32 * 1024 * 1024
    rcvbuf: int = 32 * 1024 * 1024
    immediate: int = 1

    def apply(self, socket: zmq.Socket, is_producer: bool):
        # Apply socket-level configurations
        socket.setsockopt(zmq.LINGER, self.linger)
        socket.setsockopt(zmq.IMMEDIATE, self.immediate)

        if is_producer:
            socket.setsockopt(zmq.SNDHWM, self.sndhwm)
            socket.setsockopt(zmq.SNDBUF, self.sndbuf)
        else:
            socket.setsockopt(zmq.RCVHWM, self.rcvhwm)
            socket.setsockopt(zmq.RCVBUF, self.rcvbuf)


@dataclass
class Endpoint:
    # Represents a single endpoint with protocol, address, io_threads, and copy behavior
    protocol: str
    address: str
    io_threads: int = 1
    copy: bool = False


@dataclass
class Config:
    ipc_root: str = "/dev/shm"
    io_threads: int = 1
    copy: bool = False
    endpoints: Dict[str, Endpoint] = field(default_factory=dict)
    socket_config: SocketOptions = field(default_factory=SocketOptions)


# ==========================
# Endpoint Manager
# ==========================


class EndpointManager:
    config: Config = Config()

    @classmethod
    def load_config(cls, _ignored_file_path: str = None):
        cfg_str = envs.FMQ_CONFIG_JSON
        if cfg_str:
            try:
                custom_cfg = json.loads(cfg_str)
                for key, value in custom_cfg.items():
                    if value is None:
                        continue
                    if key == "endpoints":
                        # Convert plain dicts from JSON into Endpoint objects
                        value = {name: Endpoint(**ep) for name, ep in value.items()}
                    setattr(cls.config, key, value)
            except Exception as e:
                fmq_logger.error(f"Failed to load FMQ config: {e}")
        fmq_logger.info(f"Loaded FMQ config: {cls.config}")

    @classmethod
    def get_endpoint(cls, name: str) -> Endpoint:
        # Retrieve endpoint object
        if name in cls.config.endpoints:
            return cls.config.endpoints[name]

        # Fallback: auto-generate endpoint
        address = f"{cls.config.ipc_root}/fmq_{name}.ipc"
        return Endpoint(protocol="ipc", address=address)


# ==========================
# Shared Memory Descriptor
# ==========================


@dataclass
class Descriptor:
    shm_name: str
    size: int

    @staticmethod
    def create(data_bytes: bytes) -> "Descriptor":
        # Create shared memory buffer and store payload
        name = f"fmq_shm_{uuid.uuid4().hex}"
        shm = shared_memory.SharedMemory(create=True, size=len(data_bytes), name=name)
        shm.buf[: len(data_bytes)] = data_bytes
        shm.close()
        return Descriptor(shm_name=name, size=len(data_bytes))

    def read_and_unlink(self) -> bytes:
        # Read and cleanup shared memory
        try:
            shm = shared_memory.SharedMemory(name=self.shm_name)
            data = bytes(shm.buf[: self.size])
            shm.close()
            shm.unlink()
            return data
        except FileNotFoundError:
            return b""


# ==========================
# Message Wrapper
# ==========================


@dataclass
class Message:
    payload: Any
    msg_id: int = None
    timestamp: float = field(default_factory=time.time)
    descriptor: Optional[Descriptor] = None

    def serialize(self) -> bytes:
        # Serialize message
        return ForkingPickler.dumps(self)

    @staticmethod
    def deserialize(data: bytes) -> "Message":
        # Deserialize message
        return ForkingPickler.loads(data)


# ==========================
# Base Component
# ==========================


class BaseComponent:
    def __init__(self, context: zmq.asyncio.Context, endpoint: Endpoint):
        self.context = context
        self.endpoint = endpoint
        self.socket = None
        self.lock = asyncio.Lock()

    async def close(self):
        # Close socket
        if self.socket:
            self.socket.close()


# ==========================
# FIFO Queue
# ==========================


class Queue(BaseComponent):
    def __init__(self, context, name: str, role: str = "producer"):
        endpoint = EndpointManager.get_endpoint(name)
        super().__init__(context, endpoint)

        self.name = name
        self.role = Role(role)
        self.copy = endpoint.copy
        self.socket_conf = EndpointManager.config.socket_config
        self._msg_id = 0

        full_ep = f"{endpoint.protocol}://{endpoint.address}"

        self.socket = self.context.socket(zmq.PUSH if self.role == Role.PRODUCER else zmq.PULL)
        self.socket_conf.apply(self.socket, self.role == Role.PRODUCER)

        if self.role == Role.PRODUCER:
            self.socket.connect(full_ep)
        else:
            self.socket.bind(full_ep)

        fmq_logger.info(f"Queue {name} initialized on {full_ep}")

    async def put(self, data: Any, shm_threshold: int = 1024 * 1024):
        """
        Send data to the queue.

        Args:
            data: The data to send. Can be any serializable object or bytes.
            shm_threshold: Size threshold in bytes. If the data is of type bytes and its size is
                greater than or equal to this threshold, shared memory will be used to send the message.
                Default is 1MB (1024 * 1024 bytes).

        Raises:
            PermissionError: If called by a non-producer role.
        """
        if self.role != Role.PRODUCER:
            raise PermissionError("Only producers can send messages.")

        desc = None
        payload = data

        if isinstance(data, bytes) and len(data) >= shm_threshold:
            desc = Descriptor.create(data)
            payload = None

        msg = Message(msg_id=self._msg_id, payload=payload, descriptor=desc)
        raw = msg.serialize()

        async with self.lock:
            await self.socket.send(raw, copy=self.copy)
            self._msg_id += 1

    async def get(self, timeout: int = None) -> Optional[Message]:
        # Receive data from queue
        if self.role != Role.CONSUMER:
            raise PermissionError("Only consumers can get messages.")

        try:
            if timeout:
                raw = await asyncio.wait_for(self.socket.recv(), timeout / 1000)
            else:
                raw = await self.socket.recv(copy=self.copy)
        except asyncio.TimeoutError:
            fmq_logger.error(f"Timeout receiving message on {self.name}")
            return None

        msg = Message.deserialize(raw)
        if msg.descriptor:
            msg.payload = msg.descriptor.read_and_unlink()

        self._msg_id += 1
        return msg


# ==========================
# Pub/Sub Topic
# ==========================


class Topic(BaseComponent):
    def __init__(self, context, name: str):
        endpoint = EndpointManager.get_endpoint(name)
        super().__init__(context, endpoint)
        self.name = name
        self._pub_socket = None
        self._sub_socket = None
        self._task = None

    async def pub(self, data: Any):
        # Publish a message
        if not self._pub_socket:
            ep = f"{self.endpoint.protocol}://{self.endpoint.address}"
            self._pub_socket = self.context.socket(zmq.PUB)
            self._pub_socket.bind(ep)
            await asyncio.sleep(0.05)

        msg = Message(payload=data)
        async with self.lock:
            await self._pub_socket.send(msg.serialize())

    async def sub(self, callback: Callable[[Message], Any]):
        # Subscribe and handle messages
        if not self._sub_socket:
            ep = f"{self.endpoint.protocol}://{self.endpoint.address}"
            self._sub_socket = self.context.socket(zmq.SUB)
            self._sub_socket.connect(ep)
            self._sub_socket.setsockopt_string(zmq.SUBSCRIBE, "")

        async def loop():
            while True:
                raw = await self._sub_socket.recv()
                msg = Message.deserialize(raw)
                result = callback(msg)
                if asyncio.iscoroutine(result):
                    await result

        self._task = asyncio.create_task(loop())


# ==========================
# FMQ Main Interface
# ==========================


class FMQ:
    _instance = None
    _context = None

    def __new__(cls, config_path="fmq_config.json"):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            EndpointManager.load_config()

            # Determine IO threads based on global defaults
            io_threads = 1
            if EndpointManager.config.endpoints:
                # Use max io_threads among all endpoints
                io_threads = max(ep.io_threads for ep in EndpointManager.config.endpoints.values())

            cls._context = zmq.asyncio.Context(io_threads=io_threads)
        return cls._instance

    def queue(self, name: str, role="producer") -> Queue:
        return Queue(self._context, name, role)

    def topic(self, name: str) -> Topic:
        return Topic(self._context, name)

    async def destroy(self):
        # Destroy ZeroMQ context
        self._context.term()
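To make the queue API above concrete, here is a minimal single-process sketch (the queue name "demo_queue" is made up; without an FMQ_CONFIG_JSON entry it falls back to an auto-generated ipc endpoint under /dev/shm):

# Sketch: minimal single-process FMQ queue usage, based on the API above.
import asyncio

from fastdeploy.inter_communicator.fmq import FMQ


async def main():
    fmq = FMQ()
    consumer = fmq.queue("demo_queue", role="consumer")  # consumer binds the endpoint
    producer = fmq.queue("demo_queue", role="producer")  # producer connects to it

    await producer.put({"hello": "world"})          # small objects are pickled inline
    await producer.put(b"x" * (2 * 1024 * 1024))    # bytes >= shm_threshold travel via shared memory

    for _ in range(2):
        msg = await consumer.get(timeout=1000)
        print(msg.msg_id, type(msg.payload))


if __name__ == "__main__":
    asyncio.run(main())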
New file: fastdeploy/inter_communicator/fmq_factory.py (83 lines)
@@ -0,0 +1,83 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

from fastdeploy.inter_communicator.fmq import FMQ


class FMQFactory:
    """
    Static factory for creating the four standard FMQ queues:
        1. q_a2e: api server --> engine
        2. q_e2w: engine --> worker
        3. q_w2e: worker --> engine
        4. q_e2a: engine --> api server

    API Server: q_a2e producer / q_e2a consumer
    Engine:     q_a2e consumer / q_e2w producer / q_w2e consumer / q_e2a producer
    Worker:     q_e2w consumer / q_w2e producer
    """

    _fmq = FMQ()

    # ------------------------------
    # API → Engine
    # ------------------------------
    @classmethod
    def q_a2e_producer(cls):
        return cls._fmq.queue("q_a2e", role="producer")

    @classmethod
    def q_a2e_consumer(cls):
        return cls._fmq.queue("q_a2e", role="consumer")

    # ------------------------------
    # Engine → Worker
    # ------------------------------
    @classmethod
    def q_e2w_producer(cls):
        return cls._fmq.queue("q_e2w", role="producer")

    @classmethod
    def q_e2w_consumer(cls):
        return cls._fmq.queue("q_e2w", role="consumer")

    # ------------------------------
    # Worker → Engine
    # ------------------------------
    @classmethod
    def q_w2e_producer(cls):
        return cls._fmq.queue("q_w2e", role="producer")

    @classmethod
    def q_w2e_consumer(cls):
        return cls._fmq.queue("q_w2e", role="consumer")

    # ------------------------------
    # Engine → API
    # ------------------------------
    @classmethod
    def q_e2a_producer(cls):
        return cls._fmq.queue("q_e2a", role="producer")

    @classmethod
    def q_e2a_consumer(cls):
        return cls._fmq.queue("q_e2a", role="consumer")

    # ------------------------------
    # Destroy context
    # ------------------------------
    @classmethod
    async def destroy(cls):
        await cls._fmq.destroy()
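As a hedged illustration of the role mapping in the docstring above, each component would acquire its queues roughly as follows (the setup function names are invented for this sketch):

# Sketch: which FMQFactory queues each component would hold, per the docstring.
from fastdeploy.inter_communicator.fmq_factory import FMQFactory


def api_server_setup():
    to_engine = FMQFactory.q_a2e_producer()    # api server --> engine
    from_engine = FMQFactory.q_e2a_consumer()  # engine --> api server
    return to_engine, from_engine


def engine_setup():
    from_api = FMQFactory.q_a2e_consumer()
    to_worker = FMQFactory.q_e2w_producer()
    from_worker = FMQFactory.q_w2e_consumer()
    to_api = FMQFactory.q_e2a_producer()
    return from_api, to_worker, from_worker, to_api


def worker_setup():
    from_engine = FMQFactory.q_e2w_consumer()  # engine --> worker
    to_engine = FMQFactory.q_w2e_producer()    # worker --> engine
    return from_engine, to_engine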
@@ -1051,6 +1051,7 @@ spec_logger = get_logger("speculate", "speculate.log")
zmq_client_logger = get_logger("zmq_client", "zmq_client.log")
trace_logger = FastDeployLogger().get_trace_logger("trace_logger", "trace_logger.log")
router_logger = get_logger("router", "router.log")
fmq_logger = get_logger("fmq", "fmq.log")


def parse_type(return_type: Callable[[str], T]) -> Callable[[str], T]:
New file: tests/inter_communicator/test_fmq.py (92 lines)
@@ -0,0 +1,92 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import asyncio
import json
import os
import unittest

from fastdeploy.inter_communicator.fmq import FMQ, Message

# Prepare environment config for testing
cfg = {
    "ipc_root": "/dev/shm",
    "io_threads": 1,
    "copy": False,
    "endpoints": {
        "test_queue": {"protocol": "ipc", "address": "/dev/shm/fmq_test_queue.ipc", "io_threads": 1, "copy": False},
        "test_topic": {"protocol": "ipc", "address": "/dev/shm/fmq_test_topic.ipc", "io_threads": 1, "copy": False},
    },
}
os.environ["FMQ_CONFIG_JSON"] = json.dumps(cfg)


class TestFMQ(unittest.TestCase):

    def setUp(self):
        self.fmq = FMQ()

    def test_queue_send_receive(self):
        async def run_test():
            producer = self.fmq.queue("test_queue", role="producer")
            consumer = self.fmq.queue("test_queue", role="consumer")

            test_data = b"hello world"
            await producer.put(test_data)
            msg = await consumer.get(timeout=1000)

            self.assertIsNotNone(msg)
            self.assertEqual(msg.payload, test_data)

        asyncio.run(run_test())

    def test_queue_large_shm_transfer(self):
        async def run_test():
            producer = self.fmq.queue("test_queue", role="producer")
            consumer = self.fmq.queue("test_queue", role="consumer")

            large_data = b"x" * (2 * 1024 * 1024)  # > 1MB
            await producer.put(large_data)
            msg = await consumer.get(timeout=1000)

            self.assertIsNotNone(msg)
            self.assertEqual(msg.payload, large_data)
            self.assertIsNotNone(msg.descriptor)

        asyncio.run(run_test())

    def test_topic_pub_sub(self):
        received = []

        async def run_test():
            topic = self.fmq.topic("test_topic")

            async def callback(msg: Message):
                received.append(msg.payload)

            await topic.sub(callback)
            await asyncio.sleep(0.1)  # allow SUB to connect

            await topic.pub("hello")
            await asyncio.sleep(0.2)

            self.assertIn("hello", received)

        asyncio.run(run_test())


if __name__ == "__main__":
    unittest.main()
New file: tests/inter_communicator/test_fmq_factory.py (91 lines)
@@ -0,0 +1,91 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import unittest

from fastdeploy.inter_communicator.fmq import Message
from fastdeploy.inter_communicator.fmq_factory import FMQFactory as factory


class TestFMQFactory(unittest.IsolatedAsyncioTestCase):

    async def test_create_queues(self):
        """Test whether all producer/consumer queues can be created."""
        q1 = factory.q_a2e_producer()
        q2 = factory.q_a2e_consumer()
        q3 = factory.q_e2w_producer()
        q4 = factory.q_e2w_consumer()
        q5 = factory.q_w2e_producer()
        q6 = factory.q_w2e_consumer()
        q7 = factory.q_e2a_producer()
        q8 = factory.q_e2a_consumer()

        self.assertEqual(q1.name, "q_a2e")
        self.assertEqual(q2.name, "q_a2e")
        self.assertEqual(q3.name, "q_e2w")
        self.assertEqual(q4.name, "q_e2w")
        self.assertEqual(q5.name, "q_w2e")
        self.assertEqual(q6.name, "q_w2e")
        self.assertEqual(q7.name, "q_e2a")
        self.assertEqual(q8.name, "q_e2a")

        # Queues created within the same process should share one context
        self.assertIs(q1.context, q2.context)
        self.assertIs(q1.context, q3.context)

    async def test_message_roundtrip(self):
        """Test the producer → consumer message round trip."""
        producer = factory.q_a2e_producer()
        consumer = factory.q_a2e_consumer()

        payload = {"k": "v"}

        await producer.put(payload)
        msg = await consumer.get(timeout=1500)

        self.assertIsInstance(msg, Message)
        self.assertEqual(msg.payload, payload)

    async def test_multi_queue_independence(self):
        """Test that multiple queues do not interfere with each other."""
        prod_a2e = factory.q_a2e_producer()
        cons_a2e = factory.q_a2e_consumer()

        prod_e2w = factory.q_e2w_producer()
        cons_e2w = factory.q_e2w_consumer()

        await prod_a2e.put("msg_api")
        await prod_e2w.put("msg_worker")

        msg1 = await cons_a2e.get(timeout=1500)
        msg2 = await cons_e2w.get(timeout=1500)

        self.assertEqual(msg1.payload, "msg_api")
        self.assertEqual(msg2.payload, "msg_worker")

    async def test_shared_context(self):
        """Verify that FMQFactory always returns the same context (single process)."""
        q1 = factory.q_a2e_producer()
        q2 = factory.q_e2w_consumer()
        q3 = factory.q_e2a_producer()

        self.assertIs(q1.context, q2.context)
        self.assertIs(q1.context, q3.context)


if __name__ == "__main__":
    unittest.main()