Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 08:37:06 +08:00)
polish code with new pre-commit rule (#2923)
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import argparse
import json
import time
@@ -21,17 +22,23 @@ from typing import List
import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
from paddle.distributed import fleet

from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig,
GraphOptimizationConfig, LoadConfig,
ModelConfig, ParallelConfig, SpeculativeConfig,
ErnieArchitectures)
from fastdeploy.config import (
    DecodingConfig,
    DeviceConfig,
    ErnieArchitectures,
    FDConfig,
    GraphOptimizationConfig,
    LoadConfig,
    ModelConfig,
    ParallelConfig,
    SpeculativeConfig,
)
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
from fastdeploy.inter_communicator import IPCSignal
from fastdeploy.model_executor.layers.quantization import \
    get_quantization_config
from fastdeploy.model_executor.layers.quantization import get_quantization_config
from fastdeploy.platforms import current_platform
from fastdeploy.utils import get_logger, none_or_str
from fastdeploy.worker.worker_base import WorkerBase
@@ -47,24 +54,28 @@ def get_worker(fd_config: FDConfig, local_rank: int, rank: int) -> WorkerBase:
raise NotImplementedError("Only CUDA platform supports logprob.")
if current_platform.is_dcu():
from fastdeploy.worker.dcu_worker import DcuWorker

return DcuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank)
if current_platform.is_cuda():
from fastdeploy.worker.gpu_worker import GpuWorker

return GpuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank)
if current_platform.is_xpu():
from fastdeploy.worker.xpu_worker import XpuWorker

return XpuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank)
if current_platform.is_iluvatar():
from fastdeploy.worker.iluvatar_worker import IluvatarWorker
return IluvatarWorker(fd_config=fd_config,
                      local_rank=local_rank,
                      rank=rank)

return IluvatarWorker(fd_config=fd_config, local_rank=local_rank, rank=rank)
if current_platform.is_gcu():
from fastdeploy.worker.gcu_worker import GcuWorker

return GcuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank)

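The get_worker dispatch above picks a backend-specific Worker class from current_platform checks. The same selection can be expressed as a lookup table; the registry dict and helper below are illustrative assumptions, only the module paths and the (fd_config, local_rank, rank) keywords come from the hunk itself.

import importlib

# Sketch only: a table-driven equivalent of the if-chain in get_worker().
# The registry and helper are hypothetical; module paths mirror the diff above.
_WORKER_REGISTRY = {
    "cuda": ("fastdeploy.worker.gpu_worker", "GpuWorker"),
    "xpu": ("fastdeploy.worker.xpu_worker", "XpuWorker"),
    "dcu": ("fastdeploy.worker.dcu_worker", "DcuWorker"),
    "iluvatar": ("fastdeploy.worker.iluvatar_worker", "IluvatarWorker"),
    "gcu": ("fastdeploy.worker.gcu_worker", "GcuWorker"),
}

def get_worker_sketch(fd_config, local_rank: int, rank: int, platform: str):
    module_path, class_name = _WORKER_REGISTRY[platform]
    worker_cls = getattr(importlib.import_module(module_path), class_name)
    return worker_cls(fd_config=fd_config, local_rank=local_rank, rank=rank)
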
def init_distributed_environment(seed: int = 20) -> List[int]:
""" Initialize Paddle Fleet and get rank of worker """
"""Initialize Paddle Fleet and get rank of worker"""
# Global rank
ranks = dist.get_world_size()
dist_strategy = fleet.DistributedStrategy()
@@ -85,6 +96,7 @@ def init_distributed_environment(seed: int = 20) -> List[int]:

return ranks, local_rank

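Only fragments of init_distributed_environment appear in this hunk (the world-size query and the DistributedStrategy construction). For orientation, a sketch of a conventional Paddle Fleet collective init that would yield the (ranks, local_rank) return value; the fleet.init call, worker_index lookup, and seed handling are assumptions about the elided body, not code from this commit.

import paddle
import paddle.distributed as dist
from paddle.distributed import fleet

# Sketch only: typical Fleet collective init; the elided parts are assumed.
def init_distributed_environment_sketch(seed: int = 20):
    ranks = dist.get_world_size()
    dist_strategy = fleet.DistributedStrategy()
    fleet.init(is_collective=True, strategy=dist_strategy)  # assumed
    local_rank = fleet.worker_index()                        # assumed
    paddle.seed(seed)                                        # assumed use of `seed`
    return ranks, local_rank
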
def update_fd_config_for_mm(fd_config: FDConfig) -> None:
if fd_config.model_config.enable_mm:
tokenizer = ErnieBotTokenizer.from_pretrained(
@@ -103,13 +115,12 @@ def update_fd_config_for_mm(fd_config: FDConfig) -> None:
vision_config.dtype = fd_config.model_config.dtype
# vision_config.tensor_parallel_degree = fd_config.parallel_config.tensor_parallel_size
# vision_config.tensor_parallel_rank = fd_config.parallel_config.tensor_parallel_rank
fd_config.model_config.im_patch_id = tokenizer.get_vocab()[
    "<|IMAGE_PLACEHOLDER|>"
]
fd_config.model_config.im_patch_id = tokenizer.get_vocab()["<|IMAGE_PLACEHOLDER|>"]
fd_config.model_config.think_end_id = tokenizer.get_vocab()["</think>"]
fd_config.model_config.sequence_parallel = fd_config.parallel_config.sequence_parallel

class PaddleDisWorkerProc():

class PaddleDisWorkerProc:
"""
Paddle Distrubuted wrapper for fastdeploy.worker.Worker,
for handling single-node multi-GPU tensor parallel.
@@ -117,12 +128,7 @@ class PaddleDisWorkerProc():
in the task queue. Control flow is transmitted by IPC.
"""

def __init__(
    self,
    fd_config: FDConfig,
    ranks: int = 1,
    local_rank: int = 0
) -> None:
def __init__(self, fd_config: FDConfig, ranks: int = 1, local_rank: int = 0) -> None:
"""
Initialize a distributed worker and task queue for single-node multi-GPU setup.
Args:
@@ -136,20 +142,21 @@ class PaddleDisWorkerProc():
self.parallel_config = fd_config.parallel_config

# TODO(gongshaotian): Use worker factory to get worker
self.worker = get_worker(fd_config=fd_config,
                         local_rank=self.local_rank,
                         rank=self.ranks)
self.worker = get_worker(fd_config=fd_config, local_rank=self.local_rank, rank=self.ranks)

# Initialize task queue
task_address = (self.parallel_config.pod_ip,
                self.parallel_config.engine_worker_queue_port)
task_address = (
    self.parallel_config.pod_ip,
    self.parallel_config.engine_worker_queue_port,
)

self.task_queue = TaskQueue(
    address=task_address,
    is_server=False,
    num_client=self.parallel_config.tensor_parallel_size,
    client_id=self.parallel_config.tensor_parallel_rank,
    local_data_parallel_id=self.parallel_config.expert_parallel_rank)
    local_data_parallel_id=self.parallel_config.expert_parallel_rank,
)

def init_health_status(self) -> None:
"""
@@ -164,17 +171,18 @@ class PaddleDisWorkerProc():
# init worker_ready_signal
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
array_size = min(
    self.max_chips_per_node, self.parallel_config.tensor_parallel_size *
    self.parallel_config.expert_parallel_size)
    self.max_chips_per_node,
    self.parallel_config.tensor_parallel_size * self.parallel_config.expert_parallel_size,
)
workers_ready = np.zeros(shape=[array_size], dtype=np.int32)
self.worker_ready_signal = IPCSignal(
    name="worker_ready_signal",
    array=workers_ready,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
self.worker_ready_signal.value[self.local_rank %
                               self.max_chips_per_node] = 1
    create=False,
)
self.worker_ready_signal.value[self.local_rank % self.max_chips_per_node] = 1

# init worker_healthy_live_signal
workers_alive = np.zeros(shape=[array_size], dtype=np.int32)
@@ -183,9 +191,9 @@ class PaddleDisWorkerProc():
    array=workers_alive,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
self.worker_healthy_live_signal.value[self.local_rank % 8] = int(
    time.time())
    create=False,
)
self.worker_healthy_live_signal.value[self.local_rank % 8] = int(time.time())

# init model_weights_status
workers_model_weights = np.zeros(shape=[1], dtype=np.int32)
@@ -194,28 +202,28 @@ class PaddleDisWorkerProc():
    array=workers_model_weights,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
    create=False,
)

# init exist_task_signal
workers_exist_task = np.zeros(
    [self.parallel_config.expert_parallel_size], dtype=np.int32)
workers_exist_task = np.zeros([self.parallel_config.expert_parallel_size], dtype=np.int32)
self.exist_task_signal = IPCSignal(
    name="exist_task_signal",
    array=workers_exist_task,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
    create=False,
)

# init exist_swapped_task_signal
workers_swapped_task = np.zeros(
    shape=[self.parallel_config.expert_parallel_size],
    dtype=np.int32)
workers_swapped_task = np.zeros(shape=[self.parallel_config.expert_parallel_size], dtype=np.int32)
self.exist_swapped_task_signal = IPCSignal(
    name="exist_swapped_task_signal",
    array=workers_swapped_task,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
    create=False,
)

# init exist_prefill_task_signal
exist_prefill_task_signal_data = np.zeros([1], dtype=np.int32)
@@ -224,26 +232,27 @@ class PaddleDisWorkerProc():
    array=exist_prefill_task_signal_data,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
    create=False,
)

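Every signal initialised above follows the same shape: attach (create=False) to an int32 array that the engine process created, keyed by a name plus the engine_pid suffix, then write this worker's slot. A hypothetical wrapper for that repetition, using only the IPCSignal keywords shown in the hunks above; FastDeploy itself spells each signal out explicitly.

import numpy as np

from fastdeploy.inter_communicator import IPCSignal

# Sketch only: hypothetical helper around the repeated IPCSignal pattern above.
def attach_signal(name: str, size: int, engine_pid) -> IPCSignal:
    return IPCSignal(
        name=name,
        array=np.zeros(shape=[size], dtype=np.int32),
        dtype=np.int32,
        suffix=engine_pid,
        create=False,
    )

# e.g. worker_ready = attach_signal("worker_ready_signal", array_size, engine_pid)
#      worker_ready.value[local_rank % max_chips_per_node] = 1
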
def event_loop_ep(self) -> None:
"""
Tmp loop function for ep utill DP is supported
"""
while True:
self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(
    time.time())
self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(time.time())

if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks(
) > 0:
if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks() > 0:
tasks, read_finish = self.task_queue.get_tasks()

req_dicts = []
for req_dict, bsz in tasks:
num_running_requests = int(bsz)
req_dicts.extend(req_dict)
logger.info(f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, " \
            f"num_insert_requests: {len(req_dicts)}")
logger.info(
    f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, "
    f"num_insert_requests: {len(req_dicts)}"
)
# Process prefill inputs
self.worker.preprocess_new_task(req_dicts)

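The loop above stamps int(time.time()) into worker_healthy_live_signal on every iteration, so a supervising process can treat a stale timestamp as a dead or stalled worker. A small sketch of such a liveness check; the timeout constant and helper are illustrative assumptions, not part of FastDeploy.

import time

# Sketch only: consuming a heartbeat written as int(time.time()).
# The 30 s threshold is an assumed example, not a FastDeploy constant.
HEALTHY_TIMEOUT_S = 30

def worker_is_stalled(last_heartbeat: int, now=None) -> bool:
    now = time.time() if now is None else now
    return (now - last_heartbeat) > HEALTHY_TIMEOUT_S

print(worker_is_stalled(int(time.time()) - 60))  # True: last heartbeat 60 s ago
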
@@ -252,7 +261,7 @@ class PaddleDisWorkerProc():
self.worker.execute_model()

def event_loop_normal(self) -> None:
""" Main event loop for Paddle Distrubuted Workers.
"""Main event loop for Paddle Distrubuted Workers.
TODO(gongshaotian): support remote calling of functions that control worker.
"""
# Currently, only support single node
@@ -271,18 +280,15 @@ class PaddleDisWorkerProc():
paddle.distributed.barrier()

self.insert_step = False
self.worker_healthy_live_signal.value[self.local_rank] = int(
    time.time())
self.worker_healthy_live_signal.value[self.local_rank] = int(time.time())

# The first worker detects whether there are tasks in the task queue
if self.local_rank % mp_num_per_node == 0:
if self.local_rank % mp_num_per_node == 0:
if self.task_queue.num_tasks() > 0:
if self.nnode > 1:
self.task_queue.read_finish_flag.set(1)
else:
self.exist_task_signal.value[
    self.fd_config.parallel_config.
    expert_parallel_rank] = 1
self.exist_task_signal.value[self.fd_config.parallel_config.expert_parallel_rank] = 1

if self.parallel_config.tensor_parallel_size > 1:
# Synchronize the signal for other workers
@@ -291,23 +297,27 @@ class PaddleDisWorkerProc():

if self.fd_config.load_config.dynamic_load_weight:
if self.exist_task_signal.value[0] == 2:
from fastdeploy.rl.dynamic_weight_manager import \
    DynamicWeightManager
DynamicWeightManager.check_model_weights_status(
    self.model_weights_status, self.worker.model_runner,
    self.parallel_config.engine_pid)
from fastdeploy.rl.dynamic_weight_manager import (
    DynamicWeightManager,
)

if self.exist_task_signal.value[
    self.fd_config.parallel_config.expert_parallel_rank] == 1 or \
    self.task_queue.read_finish_flag.get() == 1:
DynamicWeightManager.check_model_weights_status(
    self.model_weights_status,
    self.worker.model_runner,
    self.parallel_config.engine_pid,
)

if (
    self.exist_task_signal.value[self.fd_config.parallel_config.expert_parallel_rank] == 1
    or self.task_queue.read_finish_flag.get() == 1
):
logger.info(f"Rank: {self.local_rank} Detected new requests.")
self.insert_step = True

tasks, read_finish = self.task_queue.get_tasks()
if read_finish:
# Ensure that every worker get the task
self.exist_task_signal.value[self.fd_config.parallel_config
                             .expert_parallel_rank] = 0
self.exist_task_signal.value[self.fd_config.parallel_config.expert_parallel_rank] = 0
self.task_queue.read_finish_flag.set(0)

req_dicts = []
@@ -316,8 +326,10 @@ class PaddleDisWorkerProc():
req_dicts.extend(req_dict)

req_ids = [req.request_id for req in req_dicts]
logger.info(f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, " \
            f"num_insert_requests: {len(req_dicts)}, req_ids: {req_ids}")
logger.info(
    f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, "
    f"num_insert_requests: {len(req_dicts)}, req_ids: {req_ids}"
)

# Process prefill inputs
self.worker.preprocess_new_task(req_dicts)
@@ -333,9 +345,7 @@ class PaddleDisWorkerProc():
# These generated tokens can be obtained through get_output op.
self.worker.execute_model(req_dicts)

self.exist_prefill_task_signal.value[
    0] = self.worker.prefill_finished()

self.exist_prefill_task_signal.value[0] = self.worker.prefill_finished()

def determine_num_available_blocks(self) -> None:
"""Profiles the peak memory usage of the model to determine how many
@@ -351,64 +361,51 @@ class PaddleDisWorkerProc():
"""
if self.fd_config.parallel_config.do_profile:
# 1. Get available memory(bytes)
available_kv_cache_memory = self.worker.determine_available_memory(
)
logger.info(
    f"------- available_kv_cache_memory:{available_kv_cache_memory / 1024**3} GB --------"
)
available_kv_cache_memory = self.worker.determine_available_memory()
logger.info(f"------- available_kv_cache_memory:{available_kv_cache_memory / 1024**3} GB --------")

# 2. Calculate the appropriate number of blocks
model_block_memory_used = self.worker.cal_theortical_kvcache()
num_blocks_local = int(available_kv_cache_memory //
                       model_block_memory_used)
num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)
# NOTE(liuzichang): Too many block will lead to illegal memory access
# We will develop dynamic limits in future.
if num_blocks_local > 40000:
logger.info(
    f"------- Reset num_blocks_local {num_blocks_local} to 40000"
)
logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000")
num_blocks_local = min(40000, num_blocks_local)
logger.info(
    f"------- model_block_memory_used:{model_block_memory_used} --------"
)
logger.info(
    f"------- num_blocks_local:{num_blocks_local} --------")
logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------")
logger.info(f"------- num_blocks_local:{num_blocks_local} --------")

logger.info(
    f"self.fd_config.parallel_config.do_profile:{self.fd_config.parallel_config.do_profile}"
)
logger.info(f"self.fd_config.parallel_config.do_profile:{self.fd_config.parallel_config.do_profile}")

# 3. Send IPCSignal
get_profile_block_num = np.zeros(shape=[self.ranks],
                                 dtype=np.int32)
get_profile_block_num = np.zeros(shape=[self.ranks], dtype=np.int32)
self.get_profile_block_num_signal = IPCSignal(
    name="get_profile_block_num",
    array=get_profile_block_num,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
self.get_profile_block_num_signal.value[
    self.local_rank] = num_blocks_local
    create=False,
)
self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_local

# Wait all worker send the signal
while np.any(self.get_profile_block_num_signal.value <= 0):
time.sleep(0.01)
num_blocks_global = self.get_profile_block_num_signal.value.min(
).item()
num_blocks_global = self.get_profile_block_num_signal.value.min().item()

if num_blocks_global < 0:
logger.error(
    "The total number of blocks cannot be less than zero."
    "Please increase gpu_memory_utilization"
    "Or decrease max_num_batched_tokens(max model length) ")
    "Or decrease max_num_batched_tokens(max model length) "
)
raise ValueError(
    "The total number of blocks cannot be less than zero."
    "Please increase gpu_memory_utilization"
    "Or decrease max_num_batched_tokens(max model length) ")
    "Or decrease max_num_batched_tokens(max model length) "
)


self.get_profile_block_num_signal.value[
    self.local_rank] = num_blocks_global
self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_global
else:
num_blocks_global = self.fd_config.parallel_config.total_block_num
# NOTE(liuzichang): Too big num_blocks_global will lead to error 700
@@ -416,11 +413,11 @@ class PaddleDisWorkerProc():
self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)

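The profiling branch above sizes the KV cache in three steps: divide the measured free memory by the per-block cost, clamp the result to 40000 blocks, and take the minimum of all ranks' published counts so every rank can actually allocate the final number. A worked sketch with illustrative byte counts in place of profiler output (not measured values):

import numpy as np

# Sketch only: the block-count arithmetic from determine_num_available_blocks,
# with made-up numbers.
available_kv_cache_memory = 30 * 1024**3   # assume 30 GB left for KV cache
model_block_memory_used = 2 * 1024**2      # assume 2 MB of KV cache per block

num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)  # 15360
num_blocks_local = min(40000, num_blocks_local)  # cap against illegal memory access

# Each rank publishes its local count; the shared value is the minimum.
per_rank_counts = np.array([num_blocks_local, 14000, 15360], dtype=np.int32)
num_blocks_global = per_rank_counts.min().item()  # 14000
print(num_blocks_local, num_blocks_global)
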
def init_device(self) -> None:
""" Initialize device and Construct model runner """
"""Initialize device and Construct model runner"""
self.worker.init_device()

def load_model(self) -> None:
""" Load weights and create model """
"""Load weights and create model"""
self.worker.load_model()

@@ -429,66 +426,44 @@ def parse_args():
Parse args from command line
"""
parser = argparse.ArgumentParser("FastDeploy LLM Inference")
parser.add_argument("-m",
                    "--model_name_or_path",
                    type=str,
                    default="./output",
                    help="model dir")
parser.add_argument("-mbs",
                    "--max_num_seqs",
                    type=int,
                    default=34,
                    help="max batch size")
parser.add_argument(
    "-m",
    "--model_name_or_path",
    type=str,
    default="./output",
    help="model dir",
)
parser.add_argument("-mbs", "--max_num_seqs", type=int, default=34, help="max batch size")
parser.add_argument("--total_block_num", type=int, default=2000)
parser.add_argument("--block_size", type=int, default=64)
parser.add_argument("--pod_ip", type=str, default="127.0.0.1")
parser.add_argument("--engine_worker_queue_port", type=int, default=9923)
parser.add_argument("--max_model_len",
                    type=int,
                    default=3072,
                    help="max model len")
parser.add_argument("--device_ids",
                    type=str,
                    default="0",
                    help="cuda visible devices")
parser.add_argument("--dtype",
                    type=str,
                    default="bfloat16",
                    help="input dtype")
parser.add_argument("--enc_dec_block_num",
                    type=int,
                    default=1,
                    help="encoder's decoder num")
parser.add_argument("--kv_cache_ratio",
                    type=float,
                    default=0.7,
                    help="kv cache ratio for input")
parser.add_argument("--first_token_id",
                    type=int,
                    default=1,
                    help="first token id")
parser.add_argument("--gpu_memory_utilization",
                    type=float,
                    default=0.9,
                    help="gpu memory utilization")
parser.add_argument("--engine_pid",
                    type=int,
                    default=None,
                    help="Process ID of engine")
parser.add_argument("--do_profile",
                    action='store_true',
                    help="do profile or not")
parser.add_argument("--pad_token_id",
                    type=int,
                    default=-1,
                    help="pad token id")
parser.add_argument("--eos_tokens_lens",
                    type=int,
                    default=2,
                    help="eos token lens")
parser.add_argument("--enable_chunked_prefill",
                    action='store_true',
                    help="enable chunked prefill")
parser.add_argument("--max_model_len", type=int, default=3072, help="max model len")
parser.add_argument("--device_ids", type=str, default="0", help="cuda visible devices")
parser.add_argument("--dtype", type=str, default="bfloat16", help="input dtype")
parser.add_argument("--enc_dec_block_num", type=int, default=1, help="encoder's decoder num")
parser.add_argument(
    "--kv_cache_ratio",
    type=float,
    default=0.7,
    help="kv cache ratio for input",
)
parser.add_argument("--first_token_id", type=int, default=1, help="first token id")
parser.add_argument(
    "--gpu_memory_utilization",
    type=float,
    default=0.9,
    help="gpu memory utilization",
)
parser.add_argument("--engine_pid", type=int, default=None, help="Process ID of engine")
parser.add_argument("--do_profile", action="store_true", help="do profile or not")
parser.add_argument("--pad_token_id", type=int, default=-1, help="pad token id")
parser.add_argument("--eos_tokens_lens", type=int, default=2, help="eos token lens")
parser.add_argument(
    "--enable_chunked_prefill",
    action="store_true",
    help="enable chunked prefill",
)
parser.add_argument(
    "--speculative_method",
    default=None,
@@ -519,71 +494,90 @@ def parse_args():
    default=False,
    type=bool,
)
parser.add_argument("--max_num_batched_tokens",
                    type=int,
                    default=2048,
                    help="max num batched tokens")
parser.add_argument(
    "--max_num_batched_tokens",
    type=int,
    default=2048,
    help="max num batched tokens",
)

parser.add_argument("--enable_prefix_caching",
                    action='store_true',
                    help="enable prefix cache")
parser.add_argument("--enable-custom-all-reduce",
                    action='store_true',
                    help="enable custom all-reduce")
parser.add_argument("--splitwise_role",
                    type=str,
                    default="mixed",
                    help="splitwise role")
parser.add_argument("--tensor_parallel_size",
                    type=int,
                    default=1,
                    help="tensor parallel size")
parser.add_argument("--expert_parallel_size",
                    type=int,
                    default=1,
                    help="expert parallel size")
parser.add_argument("--enable_expert_parallel",
                    action='store_true',
                    help="enable expert parallel")
parser.add_argument(
    "--enable_prefix_caching",
    action="store_true",
    help="enable prefix cache",
)
parser.add_argument(
    "--enable-custom-all-reduce",
    action="store_true",
    help="enable custom all-reduce",
)
parser.add_argument("--splitwise_role", type=str, default="mixed", help="splitwise role")
parser.add_argument(
    "--tensor_parallel_size",
    type=int,
    default=1,
    help="tensor parallel size",
)
parser.add_argument(
    "--expert_parallel_size",
    type=int,
    default=1,
    help="expert parallel size",
)
parser.add_argument(
    "--enable_expert_parallel",
    action="store_true",
    help="enable expert parallel",
)
parser.add_argument("--ori_vocab_size", type=int, default=None)

parser.add_argument("--quantization",
                    type=str,
                    default="None",
                    help="Quantization name for the model, currentlly support " \
                    "'wint4', 'wint8'," \
                    "default is None. The priority of this configuration "\
                    "is lower than that of the config file. " \
                    "More complex quantization methods need to be configured via the config file.")
parser.add_argument("--graph_optimization_config",
                    type=json.loads,
                    default=None,
                    help=" Configation of Graph optimization backend. "
parser.add_argument(
    "--quantization",
    type=str,
    default="None",
    help="Quantization name for the model, currentlly support "
    "'wint4', 'wint8',"
    "default is None. The priority of this configuration "
    "is lower than that of the config file. "
    "More complex quantization methods need to be configured via the config file.",
)
parser.add_argument(
    "--graph_optimization_config",
    type=json.loads,
    default=None,
    help=" Configation of Graph optimization backend. ",
)
parser.add_argument(
    "--guided_decoding_backend",
    type=str,
    default="off",
    help="guided decoding backend",
)
parser.add_argument(
    "--disable_any_whitespace",
    action="store_false",
    help="Disable any whitespace for guided decoding.",
)
parser.add_argument(
    "--dynamic_load_weight",
    action="store_true",
    help="Enable dynamic weight loading strategy",
)
parser.add_argument("--guided_decoding_backend",
                    type=str,
                    default="off",
                    help="guided decoding backend")
parser.add_argument("--disable_any_whitespace",
                    action='store_false',
                    help="Disable any whitespace for guided decoding.")
parser.add_argument("--dynamic_load_weight",
                    action='store_true',
                    help="Enable dynamic weight loading strategy")
parser.add_argument(
    "--load_strategy",
    type=str,
    choices=['ipc', 'ipc_snapshot'],
    choices=["ipc", "ipc_snapshot"],
    default="ipc_snapshot",
    help="Weight loading method when dynamic loading is enabled: "
    "'ipc': real-time IPC streaming with automatic resharding, "
    "'ipc_snapshot': load from disk snapshot of IPC weights.")
parser.add_argument("--enable_mm",
                    action='store_true',
                    help="Whether to enable vl model")
parser.add_argument("--enable_logprob",
                    action='store_true',
                    help="Enable output of token-level log probabilities.")
    "'ipc_snapshot': load from disk snapshot of IPC weights.",
)
parser.add_argument("--enable_mm", action="store_true", help="Whether to enable vl model")
parser.add_argument(
    "--enable_logprob",
    action="store_true",
    help="Enable output of token-level log probabilities.",
)

args = parser.parse_args()
return args
@@ -615,7 +609,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
else:
num_experts = model_config.moe_num_experts

num_experts_per_rank = num_experts // args.expert_parallel_size
num_experts_per_rank = num_experts // args.expert_parallel_size
num_experts_start_offset = expert_parallel_rank * num_experts_per_rank

parallel_config.expert_parallel_rank = expert_parallel_rank
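The expert-parallel split above is an even partition: each rank owns num_experts // expert_parallel_size experts and starts at rank * per_rank. A worked sketch with example sizes:

# Sketch only: the expert partition arithmetic above, with example numbers.
num_experts = 64
expert_parallel_size = 8
expert_parallel_rank = 3

num_experts_per_rank = num_experts // expert_parallel_size              # 8
num_experts_start_offset = expert_parallel_rank * num_experts_per_rank  # 24
print(num_experts_per_rank, num_experts_start_offset)
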
@@ -629,7 +623,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
graph_opt_config = GraphOptimizationConfig(
    use_cudagraph=args.graph_optimization_config["use_cudagraph"],
    graph_opt_level=args.graph_optimization_config["graph_opt_level"],
    cudagraph_capture_sizes=args.graph_optimization_config["cudagraph_capture_sizes"]
    cudagraph_capture_sizes=args.graph_optimization_config["cudagraph_capture_sizes"],
)

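GraphOptimizationConfig above is built from args.graph_optimization_config, which parse_args declares with type=json.loads, so the flag accepts a JSON object on the command line. A minimal sketch of that round trip; the payload values are illustrative, not recommended settings.

import argparse
import json

# Sketch only: argparse with type=json.loads, matching the
# --graph_optimization_config flag above.
parser = argparse.ArgumentParser()
parser.add_argument("--graph_optimization_config", type=json.loads, default=None)

args = parser.parse_args(
    [
        "--graph_optimization_config",
        '{"use_cudagraph": true, "graph_opt_level": 1, "cudagraph_capture_sizes": [1, 2, 4]}',
    ]
)
print(args.graph_optimization_config["use_cudagraph"])  # True
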
# Note(tangbinhan): used for load_checkpoint
@@ -639,14 +633,10 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
model_config.pretrained_config.head_dim = model_config.head_dim

logger.info(f"parallel_config.use_ep {parallel_config.use_ep}")
logger.info(
    f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}"
)
logger.info(
    f"parallel_config.tensor_parallel_rank {parallel_config.tensor_parallel_rank}"
)
logger.info(f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}")
logger.info(f"parallel_config.tensor_parallel_rank {parallel_config.tensor_parallel_rank}")

if getattr(model_config, 'num_hidden_layers', None) is None:
if getattr(model_config, "num_hidden_layers", None) is None:
raise ValueError("num_hidden_layers is None")

quantization_config = model_config.quantization_config
@@ -656,11 +646,8 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
model_config.is_quantized = True

quant_config_name = None
if quantization_config is not None and quantization_config.get(
    "quantization", None) is None:
raise ValueError(
    "quantization_config should have a key named 'quantization' for specify quant config."
)
if quantization_config is not None and quantization_config.get("quantization", None) is None:
raise ValueError("quantization_config should have a key named 'quantization' for specify quant config.")

if quantization_config is not None:
quant_config_name = quantization_config["quantization"]
@@ -688,32 +675,29 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
logger.info("===========quantization_config==============")
if quant_config is not None:
if model_config.is_quantized:
logger.info(
    "Model Status: Offline Quantized (pre-quantized weights loaded)"
)
logger.info("Model Status: Offline Quantized (pre-quantized weights loaded)")
else:
logger.info(
    "Model Status: Original (will apply online quantization)")
logger.info("Model Status: Original (will apply online quantization)")

logger.info(f"{quantization_config}")
else:
logger.info(
    "No quantization config found and use original weight and act dtype."
)
logger.info("No quantization config found and use original weight and act dtype.")

# Set VL tag
model_config.enable_mm = args.enable_mm
logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}")
logger.info(f"- Load strategy: {load_config.load_strategy}")

fd_config = FDConfig(model_config=model_config,
                     parallel_config=parallel_config,
                     speculative_config=speculative_config,
                     device_config=device_config,
                     load_config=load_config,
                     decoding_config=decoding_config,
                     quant_config=quant_config,
                     graph_opt_config=graph_opt_config)
fd_config = FDConfig(
    model_config=model_config,
    parallel_config=parallel_config,
    speculative_config=speculative_config,
    device_config=device_config,
    load_config=load_config,
    decoding_config=decoding_config,
    quant_config=quant_config,
    graph_opt_config=graph_opt_config,
)
update_fd_config_for_mm(fd_config)

return fd_config