"""
# Copyright (c) 2025  PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import asyncio
import multiprocessing as mp
import os
import statistics
import time

from tqdm import tqdm

from fastdeploy.inter_communicator.fmq import FMQ


# ============================================================
# Producer Task
# ============================================================
async def producer_task(proc_id, msg_count, payload_size, shm_threshold, result_q):
    """Send ``msg_count`` timestamped messages to the ``mp_bench_latency`` queue.

    Every message is a dict with the producer id (``pid``), a sequence number
    (``i``), the ``time.perf_counter()`` reading taken right before the send
    (``send_ts``) and a ``payload_size``-byte ``payload``. Once all messages
    are sent, a summary dict with the keys ``producer_id``, ``count`` and
    ``time`` (elapsed seconds for the send loop) is put on ``result_q``.

    Args:
        proc_id: Index of this producer; also used as the tqdm bar position.
        msg_count: Number of messages to send.
        payload_size: Size in bytes of the payload attached to each message.
        shm_threshold: Forwarded unchanged to ``q.put`` (presumably the size
            from which FMQ switches to shared memory; confirm against FMQ).
        result_q: Object with a ``put`` method that receives the summary dict.
    """
    fmq = FMQ()
    q = fmq.queue("mp_bench_latency", role="producer")
    payload = b"x" * payload_size

    # The context manager closes the progress bar even when a send fails.
    with tqdm(total=msg_count, desc=f"Producer-{proc_id}", position=proc_id, leave=True, disable=False) as pbar:
        t0 = time.perf_counter()
        for i in range(msg_count):
            send_ts = time.perf_counter()
            await q.put(
                data={"pid": proc_id, "i": i, "send_ts": send_ts, "payload": payload},
                shm_threshold=shm_threshold,
            )
            pbar.update(1)
        t1 = time.perf_counter()
        result_q.put({"producer_id": proc_id, "count": msg_count, "time": t1 - t0})

    # Linger for 5 seconds before returning, presumably so that messages still
    # in flight can be picked up before this process exits.
    # NOTE(review): the previous comment said 2 seconds while the code slept 5;
    # the code's value is kept. Confirm which duration is intended.
    await asyncio.sleep(5)


def producer_process(proc_id, msg_count, payload_size, shm_threshold, result_q):
    """Process entry point: drive ``producer_task`` to completion via ``asyncio.run``.

    All arguments are passed through to ``producer_task`` unchanged.
    """
    coro = producer_task(proc_id, msg_count, payload_size, shm_threshold, result_q)
    asyncio.run(coro)


# ============================================================
# Consumer Task
# ============================================================
async def consumer_task(consumer_id, total_msgs, result_q, consumer_event):
    """Receive ``total_msgs`` messages from ``mp_bench_latency`` and record latencies.

    For every non-``None`` message the latency is the local
    ``time.perf_counter()`` reading minus ``msg.payload["send_ts"]``, stored in
    milliseconds. When the quota is reached, a dict with the keys
    ``consumer_id``, ``latencies``, ``first_recv`` and ``last_recv`` is put on
    ``result_q``.

    NOTE(review): ``send_ts`` is presumably a ``perf_counter`` reading taken in
    another process, so the latency is only meaningful if both processes share
    the same clock reference. Confirm on the target platform.

    Args:
        consumer_id: Index of this consumer; the tqdm bar uses position
            ``consumer_id + 1``.
        total_msgs: Number of messages to receive before returning.
        result_q: Object with a ``put`` method that receives the result dict.
        consumer_event: Event-like object; ``set()`` is called once the
            consumer side of the queue has been created.
    """
    fmq = FMQ()
    q = fmq.queue("mp_bench_latency", role="consumer")
    # Signal readiness only after the consumer endpoint exists.
    consumer_event.set()

    latencies = []
    first_recv = None
    last_recv = None

    # The context manager closes the progress bar even when receiving fails.
    with tqdm(
        total=total_msgs, desc=f"Consumer-{consumer_id}", position=consumer_id + 1, leave=True, disable=False
    ) as pbar:
        while len(latencies) < total_msgs:
            msg = await q.get()
            recv_ts = time.perf_counter()
            if msg is None:
                # A None result does not count towards the quota; report and retry.
                pbar.write("recv None")
                continue
            if first_recv is None:
                first_recv = recv_ts
            last_recv = recv_ts
            send_ts = msg.payload["send_ts"]
            latencies.append((recv_ts - send_ts) * 1000)  # seconds -> ms
            pbar.update(1)

    result_q.put(
        {"consumer_id": consumer_id, "latencies": latencies, "first_recv": first_recv, "last_recv": last_recv}
    )


def consumer_process(consumer_id, total_msgs, result_q, consumer_event):
    """Process entry point: drive ``consumer_task`` to completion via ``asyncio.run``.

    All arguments are passed through to ``consumer_task`` unchanged.
    """
    coro = consumer_task(consumer_id, total_msgs, result_q, consumer_event)
    asyncio.run(coro)


# ============================================================
# MAIN benchmark
# ============================================================
def _safe_rate(amount, seconds):
    """Return ``amount / seconds``, or NaN when no measurable time elapsed."""
    return amount / seconds if seconds > 0 else float("nan")


def _latency_percentiles(latencies):
    """Return ``(p50, p95, p99)`` for a non-empty list of latencies."""
    if len(latencies) < 2:
        # statistics.quantiles() rejects a single data point before Python 3.13.
        only = latencies[0]
        return only, only, only
    p50 = statistics.median(latencies)
    p95 = statistics.quantiles(latencies, n=20)[18]
    p99 = statistics.quantiles(latencies, n=100)[98]
    return p50, p95, p99


def run_benchmark(
    NUM_PRODUCERS=1,
    NUM_CONSUMERS=1,
    NUM_MESSAGES_PER_PRODUCER=1000,
    PAYLOAD_SIZE=1 * 1024 * 1024,
    SHM_THRESHOLD=1 * 1024 * 1024,
):
    """Run the multi-process FMQ benchmark and print throughput and latency stats.

    Consumer processes are started first. Producers are launched only after
    every consumer has signalled that its queue endpoint exists. After all
    processes have been joined, the per-process results are read from a
    manager queue and printed, followed by a global summary.

    Args:
        NUM_PRODUCERS: Number of producer processes.
        NUM_CONSUMERS: Number of consumer processes (at least 1).
        NUM_MESSAGES_PER_PRODUCER: Messages sent by each producer.
        PAYLOAD_SIZE: Payload size in bytes of every message.
        SHM_THRESHOLD: Forwarded to the producers, which pass it to ``q.put``.

    Raises:
        ValueError: If ``NUM_CONSUMERS`` is smaller than 1.
    """
    if NUM_CONSUMERS < 1:
        raise ValueError(f"NUM_CONSUMERS must be at least 1, got {NUM_CONSUMERS}")

    total_messages = NUM_PRODUCERS * NUM_MESSAGES_PER_PRODUCER
    total_bytes = total_messages * PAYLOAD_SIZE

    print(f"\nFastDeploy Message Queue Benchmark, pid:{os.getpid()}")
    print(f"Producers: {NUM_PRODUCERS}")
    print(f"Consumers: {NUM_CONSUMERS}")
    print(f"Messages per producer: {NUM_MESSAGES_PER_PRODUCER}")
    print(f"Total bytes: {total_bytes / 1024 / 1024 / 1024:.2f} GB")
    print(f"Total messages: {total_messages:,}")
    print(f"Payload per message: {PAYLOAD_SIZE / 1024 / 1024:.2f} MB")

    # set_start_method() raises RuntimeError once a method is fixed, so only
    # call it when "fork" is not already selected. This keeps repeated
    # run_benchmark() calls in one interpreter working.
    if mp.get_start_method(allow_none=True) != "fork":
        mp.set_start_method("fork")

    # Each consumer waits for exactly this many messages; any remainder of the
    # integer division is never awaited.
    # NOTE(review): this presumes the queue shares messages evenly among the
    # consumers. Confirm against FMQ.
    msgs_per_consumer = total_messages // NUM_CONSUMERS

    producer_stats = []
    consumer_stats = {}

    # The context manager shuts the manager process down when the run is over.
    with mp.Manager() as manager:
        result_q = manager.Queue()
        procs = []

        # Start consumers, each with its own readiness event, so that the
        # producers start only after *all* consumers are ready.
        ready_events = []
        for i in range(NUM_CONSUMERS):
            ready = manager.Event()
            p = mp.Process(target=consumer_process, args=(i, msgs_per_consumer, result_q, ready))
            ready_events.append(ready)
            procs.append(p)
            p.start()

        for ready in ready_events:
            ready.wait()

        # Start producers
        for i in range(NUM_PRODUCERS):
            p = mp.Process(
                target=producer_process, args=(i, NUM_MESSAGES_PER_PRODUCER, PAYLOAD_SIZE, SHM_THRESHOLD, result_q)
            )
            procs.append(p)
            p.start()

        for p in procs:
            p.join()

        # All writers have exited, so the queue content is final; drain it
        # before the manager goes away.
        while not result_q.empty():
            item = result_q.get()
            if "producer_id" in item:
                producer_stats.append(item)
            if "consumer_id" in item:
                consumer_stats[item["consumer_id"]] = item

    # Producer stats
    print("\nProducer Stats:")
    for stat in sorted(producer_stats, key=lambda s: s["producer_id"]):
        throughput = _safe_rate(stat["count"], stat["time"])
        bandwidth = _safe_rate(stat["count"] * PAYLOAD_SIZE, 1024**2 * stat["time"])
        print(
            f"[Producer-{stat['producer_id']}] Sent {stat['count']:,} msgs "
            f"in {stat['time']:.3f} s | Throughput: {throughput:,.0f} msg/s | Bandwidth: {bandwidth:.2f} MB/s"
        )

    # Consumer latency stats
    print("\nConsumer Latency Stats:")
    all_latencies = []
    first_recv_times = []
    last_recv_times = []

    for cid in sorted(consumer_stats):
        data = consumer_stats[cid]
        lats = data["latencies"]
        if not lats:
            continue
        all_latencies.extend(lats)
        first_recv_times.append(data["first_recv"])
        last_recv_times.append(data["last_recv"])

        avg = statistics.mean(lats)
        p50, p95, p99 = _latency_percentiles(lats)

        print(
            f"[Consumer-{cid}] msgs={len(lats):5d} | avg={avg:.3f} ms | "
            f"P50={p50:.3f} ms | P95={p95:.3f} ms | P99={p99:.3f} ms"
        )

    # Global summary (only when at least one message was received)
    if all_latencies:
        received = len(all_latencies)
        total_time = max(last_recv_times) - min(first_recv_times)
        # Rates are based on what the consumers actually received, which can
        # be less than what was sent (division remainder, missing results).
        global_throughput = _safe_rate(received, total_time)
        global_bandwidth = _safe_rate(received * PAYLOAD_SIZE, 1024**2 * total_time)

        avg_latency = statistics.mean(all_latencies)
        min_latency = min(all_latencies)
        max_latency = max(all_latencies)
        p50_latency, p95_latency, p99_latency = _latency_percentiles(all_latencies)

        print("\nGlobal Summary:")
        if received != total_messages:
            print(
                f"WARNING: consumers reported {received:,} of {total_messages:,} messages; "
                "rates below use the received count"
            )
        print(f"Total messages   : {total_messages:,}")
        print(f"Total data       : {total_bytes / 1024**2:.2f} MB")
        print(f"Total time       : {total_time:.3f} s")
        print(f"Global throughput: {global_throughput:,.0f} msg/s")
        print(f"Global bandwidth : {global_bandwidth:.2f} MB/s")
        print(
            f"Latency (ms)     : avg={avg_latency:.3f} "
            f"| min={min_latency:.3f} | max={max_latency:.3f} "
            f"| P50={p50_latency:.3f} | P95={p95_latency:.3f} | P99={p99_latency:.3f}\n"
        )


# Entry point: run the benchmark with its default settings when this file is
# executed as a script (not when it is imported).
if __name__ == "__main__":
    run_benchmark()