Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-05 16:48:03 +08:00
polish code with new pre-commit rule (#2923)
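The change is mechanical reformatting driven by the new pre-commit rule: hanging-indent imports and call sites are rewrapped one element per line with trailing commas, backslash continuations and wrapped asserts are collapsed or parenthesized, and single quotes become double quotes, with no behavior change in the hunks shown. As a quick orientation, here is the import pattern from the first hunk below, shown as a before/after sketch (the formatter itself is not named on this page; the output is consistent with black-style formatting):

# Old layout (removed below): hanging indent
from fastdeploy.utils import (ceil_div, check_unified_ckpt, get_host_ip,
                              is_port_available, get_random_port, llm_logger)

# New layout (added below): one name per line, sorted, trailing comma
from fastdeploy.utils import (
    ceil_div,
    check_unified_ckpt,
    get_host_ip,
    get_random_port,
    is_port_available,
    llm_logger,
)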
@@ -23,8 +23,14 @@ from typing import Any, Dict, List, Literal, Optional
from fastdeploy import envs
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.utils import (ceil_div, check_unified_ckpt, get_host_ip,
is_port_available, get_random_port, llm_logger)
from fastdeploy.utils import (
ceil_div,
check_unified_ckpt,
get_host_ip,
get_random_port,
is_port_available,
llm_logger,
)

TaskOption = Literal["generate"]

@@ -39,13 +45,15 @@ class ModelConfig:
model_name_or_path (str): Name or path of the model.
"""

def __init__(self,
model_name_or_path: str,
config_json_file: str = "config.json",
dynamic_load_weight: bool = False,
load_strategy: str = "ipc_snapshot",
quantization: str = None,
download_dir: Optional[str] = None):
def __init__(
self,
model_name_or_path: str,
config_json_file: str = "config.json",
dynamic_load_weight: bool = False,
load_strategy: str = "ipc_snapshot",
quantization: str = None,
download_dir: Optional[str] = None,
):
"""
Initialize the ModelConfig class.

@@ -64,11 +72,9 @@ class ModelConfig:
if os.path.isfile(model_name_or_path):
try:
from paddleformers.transformers import AutoConfig

config = AutoConfig.from_pretrained(model_name_or_path)
config_dict = {
k: v
for k, v in vars(config).items() if not k.startswith('_')
}
config_dict = {k: v for k, v in vars(config).items() if not k.startswith("_")}
for key, value in config_dict.items():
setattr(self, key, value)
except Exception:
@@ -115,8 +121,7 @@ class ModelConfig:
if not hasattr(self, "mla_use_absorb"):
self.mla_use_absorb = False
if not hasattr(self, "head_dim"):
assert hasattr(self, "hidden_size") and hasattr(
self, "num_attention_heads")
assert hasattr(self, "hidden_size") and hasattr(self, "num_attention_heads")
self.head_dim = self.hidden_size // self.num_attention_heads

def read_from_env(self):
@@ -132,11 +137,9 @@ class ModelConfig:
if not hasattr(self, key.lower()):
if os.getenv(key, None):
value = eval(os.getenv(key))
llm_logger.info(
f"Get parameter `{key}` = {value} from environment.")
llm_logger.info(f"Get parameter `{key}` = {value} from environment.")
else:
llm_logger.info(
f"Parameter `{key}` will use default value {value}.")
llm_logger.info(f"Parameter `{key}` will use default value {value}.")
setattr(self, key.lower(), value)

reset_config_value("COMPRESSION_RATIO", 1.0)
@@ -153,8 +156,7 @@ class ModelConfig:
llm_logger.info("Model Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")


class CacheConfig:
@@ -211,8 +213,7 @@ class CacheConfig:
self.enc_dec_block_num = enc_dec_block_num
self.cache_dtype = cache_dtype
if hasattr(model_cfg, "quantization_config"):
self.cache_dtype = model_cfg.quantization_config.get(
"kv_cache_quant_type", cache_dtype)
self.cache_dtype = model_cfg.quantization_config.get("kv_cache_quant_type", cache_dtype)

self.enable_chunked_prefill = enable_chunked_prefill
self.rdma_comm_ports = rdma_comm_ports
@@ -220,7 +221,7 @@ class CacheConfig:
self.pd_comm_port = pd_comm_port

if rdma_comm_ports is not None and isinstance(rdma_comm_ports, str):
self.rdma_comm_ports = rdma_comm_ports.split(',')
self.rdma_comm_ports = rdma_comm_ports.split(",")

if pd_comm_port is not None and isinstance(pd_comm_port, str):
self.pd_comm_port = [int(port) for port in pd_comm_port.split(",")]
@@ -236,41 +237,39 @@ class CacheConfig:
self.cache_queue_port = cache_queue_port
self.swap_space = swap_space

if (hasattr(self.model_cfg, "num_key_value_heads")
and hasattr(self.model_cfg, "num_key_value_heads")
and self.model_cfg.num_key_value_heads is not None
and int(self.model_cfg.num_key_value_heads) > 0):
if (
hasattr(self.model_cfg, "num_key_value_heads")
and hasattr(self.model_cfg, "num_key_value_heads")
and self.model_cfg.num_key_value_heads is not None
and int(self.model_cfg.num_key_value_heads) > 0
):
kv_num_head = int(self.model_cfg.num_key_value_heads)
else:
kv_num_head = self.model_cfg.num_attention_heads
self.model_cfg.kv_num_head = kv_num_head

# TODO check name
if "int4" in self.cache_dtype.lower(
) or "float4" in self.cache_dtype.lower():
if "int4" in self.cache_dtype.lower() or "float4" in self.cache_dtype.lower():
byte_size = 0.5
self.cache_dtype = "uint8"
elif "int8" in self.cache_dtype.lower(
) or "float8" in self.cache_dtype.lower():
elif "int8" in self.cache_dtype.lower() or "float8" in self.cache_dtype.lower():
self.cache_dtype = "uint8"
byte_size = 1
else:
byte_size = 2

self.each_token_cache_space = int(
self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim *
byte_size)
self.bytes_per_block = int(self.each_token_cache_space *
self.block_size)
self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * byte_size
)
self.bytes_per_block = int(self.each_token_cache_space * self.block_size)
self.bytes_per_layer_per_block = int(
self.block_size * self.model_cfg.kv_num_head *
self.model_cfg.head_dim // tensor_parallel_size * byte_size)
self.block_size * self.model_cfg.kv_num_head * self.model_cfg.head_dim // tensor_parallel_size * byte_size
)

if self.swap_space is None:
self.num_cpu_blocks = 0
else:
self.num_cpu_blocks = int(self.swap_space * 1024**3 /
self.bytes_per_block)
self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block)
self._verify_args()

def metrics_info(self):
@@ -279,12 +278,9 @@ class CacheConfig:

def _verify_args(self):
if self.gpu_memory_utilization > 1.0:
raise ValueError(
"GPU memory utilization must be less than 1.0. Got "
f"{self.gpu_memory_utilization}.")
raise ValueError("GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.")
if self.kv_cache_ratio > 1.0:
raise ValueError("KV cache ratio must be less than 1.0. Got "
f"{self.kv_cache_ratio}.")
raise ValueError("KV cache ratio must be less than 1.0. Got " f"{self.kv_cache_ratio}.")

def postprocess(self, num_total_tokens, number_of_tasks):
"""
@@ -293,27 +289,24 @@ class CacheConfig:
self.dec_token_num = self.enc_dec_block_num * self.block_size
if self.num_gpu_blocks_override is not None:
self.total_block_num = self.num_gpu_blocks_override
self.prefill_kvcache_block_num = int(self.total_block_num *
self.kv_cache_ratio)
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
else:
length = num_total_tokens // number_of_tasks
block_num = (length + self.block_size - 1 +
self.dec_token_num) // self.block_size
block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
self.total_block_num = block_num * number_of_tasks
self.prefill_kvcache_block_num = self.total_block_num
llm_logger.info(
f"Doing profile, the total_block_num:{self.total_block_num}")
llm_logger.info(f"Doing profile, the total_block_num:{self.total_block_num}")

def reset(self, num_gpu_blocks):
"""
reset gpu block number
"""
self.total_block_num = num_gpu_blocks
self.prefill_kvcache_block_num = int(self.total_block_num *
self.kv_cache_ratio)
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
llm_logger.info(
(f"Reset block num, the total_block_num:{self.total_block_num},"
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"))
f"Reset block num, the total_block_num:{self.total_block_num},"
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
)

def print(self):
"""
@@ -323,8 +316,7 @@ class CacheConfig:
llm_logger.info("Cache Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")


class SpeculativeConfig:
@@ -340,14 +332,16 @@ class SpeculativeConfig:
benchmark_mode (bool): Whether to use benchmark mode.
"""

def __init__(self,
method: Optional[str] = None,
num_speculative_tokens: Optional[int] = 1,
model: Optional[str] = None,
quantization: Optional[str] = "WINT8",
max_model_len: Optional[int] = None,
benchmark_mode: bool = False,
**kwargs):
def __init__(
self,
method: Optional[str] = None,
num_speculative_tokens: Optional[int] = 1,
model: Optional[str] = None,
quantization: Optional[str] = "WINT8",
max_model_len: Optional[int] = None,
benchmark_mode: bool = False,
**kwargs,
):
self.model_name_or_path = model
self.method = method
self.num_speculative_tokens = num_speculative_tokens
@@ -381,8 +375,7 @@ class SpeculativeConfig:

self.config_path = os.path.join(self.model_name_or_path, "config.json")
if os.path.exists(self.config_path):
self.model_config = json.load(
open(self.config_path, 'r', encoding='utf-8'))
self.model_config = json.load(open(self.config_path, "r", encoding="utf-8"))

def reset(self):
"""
@@ -414,10 +407,7 @@ class SpeculativeConfig:
"""
Convert speculative_config to json string.
"""
return json.dumps({
key: value
for key, value in self.__dict__.items() if value is not None
})
return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})

def print(self):
"""
@@ -427,8 +417,7 @@ class SpeculativeConfig:
llm_logger.info("Speculative Decoding Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")

def __str__(self) -> str:
return self.to_json_string()
@@ -440,7 +429,7 @@ class GraphOptimizationConfig:
graph_opt_level: Optional[int] = 0,
use_cudagraph: Optional[bool] = None,
cudagraph_capture_sizes: Optional[List[int]] = None,
**kwargs
**kwargs,
):
"""
Graph Optimization Configuration class.
@@ -460,10 +449,7 @@ class GraphOptimizationConfig:
"""
Convert speculative_config to json string.
"""
return json.dumps({
key: value
for key, value in self.__dict__.items()
})
return json.dumps({key: value for key, value in self.__dict__.items()})

def __str__(self) -> str:
return self.to_json_string()
@@ -473,22 +459,30 @@ class GraphOptimizationConfig:
graph_opt_level: Optional[int] = None,
use_cudagraph: Optional[bool] = None,
cudagraph_capture_sizes: Optional[List[int]] = None,
**kwargs
**kwargs,
) -> None:
""" Check the legality of parameters passed in from the command line """
"""Check the legality of parameters passed in from the command line"""

if graph_opt_level is not None:
assert graph_opt_level in [0, 1, 2], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2."
assert graph_opt_level in [
0,
1,
2,
], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2."
if use_cudagraph is not None:
assert type(use_cudagraph) is bool, "In graph optimization config, type of use_cudagraph must is bool."
if cudagraph_capture_sizes is not None:
assert type(cudagraph_capture_sizes) is list, "In graph optimization config, type of cudagraph_capture_sizes must is list."
assert len(cudagraph_capture_sizes) > 0, "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."
assert (
type(cudagraph_capture_sizes) is list
), "In graph optimization config, type of cudagraph_capture_sizes must is list."
assert (
len(cudagraph_capture_sizes) > 0
), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."

for key, value in kwargs.items():
raise ValueError(f"Invalid --graph-optimization-config parameter {key}")

def update_use_cudagraph(self, argument:bool):
def update_use_cudagraph(self, argument: bool):
"""
Unified user specifies the use_cudagraph parameter through two methods,
'--use-cudagraph' and '--graph-optimization-config'
@@ -499,9 +493,12 @@ class GraphOptimizationConfig:
else:
# User both set '--use-cudagraph' and '--graph-optimization-config'
if self.use_cudagraph is False and argument is True:
raise ValueError("Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously.")
raise ValueError(
"Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously."
)
argument = self.use_cudagraph


class ParallelConfig:
"""
Configuration for parallelism.
@@ -544,8 +541,7 @@ class ParallelConfig:
llm_logger.info("Parallel Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")


@dataclass
@@ -560,6 +556,7 @@ class CommitConfig:
cuda_version: CUDA version string
compiler_version: CXX compiler version string
"""

fastdeploy_commit: str = ""
paddle_version: str = ""
paddle_commit: str = ""
@@ -573,7 +570,7 @@ class CommitConfig:
def _load_from_version_file(self, file_path: str = "fastdeploy/version.txt"):
"""Internal method to load version info from file"""
try:
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
for line in f:
line = line.strip()
if line.startswith("fastdeploy GIT COMMIT ID:"):
@@ -589,7 +586,7 @@ class CommitConfig:
except FileNotFoundError:
llm_logger.info(f"Warning: Version file not found at {file_path}")
except Exception as e:
llm_logger.info(f"Warning: Could not read version file - {str(e)}")
llm_logger.info(f"Warning: Could not read version file - {e!s}")

def print(self):
"""
@@ -599,8 +596,7 @@ class CommitConfig:
llm_logger.info("Fasedeploy Commit Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")


class Config:
@@ -701,7 +697,7 @@ class Config:
self.max_num_batched_tokens = max_num_batched_tokens
self.tensor_parallel_size = tensor_parallel_size
self.dist_init_ip = dist_init_ip


self.nnode = nnodes
self.node_rank = node_rank
if self.dist_init_ip is None:
@@ -728,7 +724,6 @@ class Config:
self.disable_any_whitespace = disable_any_whitespace
self._str_to_list("innode_prefill_ports", int)


assert self.splitwise_role in ["mixed", "prefill", "decode"]

# TODO
@@ -739,19 +734,16 @@ class Config:
self.max_prefill_batch = 1 # TODO: currently the multimodal prefill stage only supports a parallelism degree of 1; to be optimized

# TODO(@wufeisheng): TP and EP need to be supported simultaneously.
assert (self.tensor_parallel_size == 1
and self.parallel_config.expert_parallel_size
>= 1) or (self.tensor_parallel_size >= 1
and self.parallel_config.expert_parallel_size
== 1), "TP and EP cannot be enabled at the same time"
assert (self.tensor_parallel_size == 1 and self.parallel_config.expert_parallel_size >= 1) or (
self.tensor_parallel_size >= 1 and self.parallel_config.expert_parallel_size == 1
), "TP and EP cannot be enabled at the same time"

num_ranks = self.tensor_parallel_size * self.parallel_config.expert_parallel_size
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
if num_ranks > self.max_chips_per_node:
self.worker_num_per_node = self.max_chips_per_node
nnode = ceil_div(num_ranks, self.worker_num_per_node)
assert nnode == self.nnode, \
f"nnode: {nnode}, but got {self.nnode}"
assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}"
else:
self.worker_num_per_node = num_ranks

@@ -772,13 +764,14 @@ class Config:
"""
calculate some parameters
"""
assert self.device_ids.split(',').__len__() == self.worker_num_per_node, \
f"invalid CUDA_VISIBLE_DEVICES, should be equal to {self.worker_num_per_node}"
assert (
self.device_ids.split(",").__len__() == self.worker_num_per_node
), f"invalid CUDA_VISIBLE_DEVICES, should be equal to {self.worker_num_per_node}"

assert self.worker_num_per_node % self.tensor_parallel_size == 0, \
f"tensor_parallel_size: {self.tensor_parallel_size} should be divisible by worker_num_per_node: {self.worker_num_per_node}"
self.local_device_ids = self.device_ids.split(
',')[:self.tensor_parallel_size]
assert (
self.worker_num_per_node % self.tensor_parallel_size == 0
), f"tensor_parallel_size: {self.tensor_parallel_size} should be divisible by worker_num_per_node: {self.worker_num_per_node}"
self.local_device_ids = self.device_ids.split(",")[: self.tensor_parallel_size]

self.host_ip = get_host_ip()

@@ -788,6 +781,7 @@ class Config:
self.is_master = False

import paddle

self.paddle_commit_id = paddle.version.commit

if self.max_num_batched_tokens is None:
@@ -799,10 +793,8 @@ class Config:
if self.long_prefill_token_threshold == 0:
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)

self.cache_config.postprocess(self.max_num_batched_tokens,
self.max_num_seqs)
self.cache_config.max_block_num_per_seq = int(
self.max_model_len // self.cache_config.block_size)
self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs)
self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)

if self.guided_decoding_backend == "auto":
if self.enable_mm:
@@ -814,30 +806,26 @@ class Config:
"""
check the legality of config
"""
assert (
self.max_num_seqs <= 256
), "The parameter `max_num_seqs` is not allowed to exceed 256, " "but now it's {}.".format(
self.max_num_seqs)
assert (
is_port_available('0.0.0.0', self.engine_worker_queue_port)
assert self.max_num_seqs <= 256, (
"The parameter `max_num_seqs` is not allowed to exceed 256, " f"but now it's {self.max_num_seqs}."
)
assert is_port_available(
"0.0.0.0", self.engine_worker_queue_port
), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use."
assert (
self.max_chips_per_node >= self.tensor_parallel_size > 0
), f"tensor_parallel_size: {self.tensor_parallel_size} should be between 1 and {self.max_chips_per_node}"
assert (self.nnode >= 1), f"nnode: {self.nnode} should no less than 1"
assert (
self.max_model_len >= 16
), f"max_model_len: {self.max_model_len} should be larger than 16"
assert (
self.max_num_seqs
>= 1), f"max_num_seqs: {self.max_num_seqs} should be larger than 1"
assert (
self.max_num_batched_tokens >= self.max_num_seqs
), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \
assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1"
assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16"
assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be larger than 1"
assert self.max_num_batched_tokens >= self.max_num_seqs, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}"
assert (self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs), \
f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger" \
f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}"
)
assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger"
f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}"
)
assert (
self.max_num_partial_prefills >= 1
), f"max_num_partial_prefills: {self.max_num_partial_prefills} should be larger than or equal to 1"
@@ -845,31 +833,38 @@ class Config:
assert (
self.max_long_partial_prefills >= 1
), f"max_long_partial_prefills: {self.max_long_partial_prefills} should be larger than or equal to 1"
assert (self.max_long_partial_prefills <= self.max_num_partial_prefills), \
f"max_long_partial_prefills: {self.max_long_partial_prefills} should " \
f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}"
assert self.max_long_partial_prefills <= self.max_num_partial_prefills, (
f"max_long_partial_prefills: {self.max_long_partial_prefills} should "
f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}"
)

if not self.cache_config.enable_chunked_prefill:
assert (
self.max_num_batched_tokens >= self.max_model_len
), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \
assert self.max_num_batched_tokens >= self.max_model_len, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_model_len: {self.max_model_len}"
)
else:
assert (
self.max_num_batched_tokens >= self.cache_config.block_size
), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \
assert self.max_num_batched_tokens >= self.cache_config.block_size, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to block_size: {self.cache_config.block_size}"
)

if self.max_num_partial_prefills > 1:
assert (self.cache_config.enable_chunked_prefill is True), \
"Chunked prefill must be enabled to set max_num_partial_prefills > 1"
assert (self.long_prefill_token_threshold < self.max_model_len), \
f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than"\
f" max_model_len: {self.max_model_len}"
assert (
self.cache_config.enable_chunked_prefill is True
), "Chunked prefill must be enabled to set max_num_partial_prefills > 1"
assert self.long_prefill_token_threshold < self.max_model_len, (
f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than"
f" max_model_len: {self.max_model_len}"
)

if self.guided_decoding_backend is not None:
assert self.guided_decoding_backend in ["xgrammar", "XGrammar", "auto", "off"], \
f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."
assert self.guided_decoding_backend in [
"xgrammar",
"XGrammar",
"auto",
"off",
], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."

if self.guided_decoding_backend != "off":
# TODO: mm support guided_decoding
@@ -878,8 +873,7 @@ class Config:
# TODO: speculative decoding support guided_decoding

# TODO: xpu support guided_decoding
assert not current_platform.is_xpu(
), "XPU currently do not support guided_decoding"
assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding"

try:
import xgrammar # noqa
@@ -897,22 +891,22 @@ class Config:
Args:
file (str): the path of file to save config
"""
llm_logger.info(
"=================== Configuration Information ===============")
llm_logger.info("=================== Configuration Information ===============")
for k, v in self.__dict__.items():
if k == "generation_config" and v is not None:
for gck, gcv in v.to_dict().items():
llm_logger.info("{:<20}:{:<6}{}".format(gck, "", gcv))
elif (k == "cache_config" or
k == "model_config" or
k == "scheduler_config" or
k == "parallel_config" or
k == "commit_config"):
elif (
k == "cache_config"
or k == "model_config"
or k == "scheduler_config"
or k == "parallel_config"
or k == "commit_config"
):
v.print()
else:
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")
if file is not None:
f = open(file, "a")
now_time = datetime.now()
@@ -929,15 +923,14 @@ class Config:
if self.splitwise_role != "mixed":
disaggregate_info["role"] = self.splitwise_role
disaggregate_info["cache_info"] = dict()
current_protocol = self.cache_config.cache_transfer_protocol.split(
",")
current_protocol = self.cache_config.cache_transfer_protocol.split(",")
disaggregate_info["transfer_protocol"] = current_protocol
for protocol in current_protocol:
if protocol == "ipc":
disaggregate_info["cache_info"][protocol] = {
"ip": self.host_ip,
"port": self.engine_worker_queue_port,
"device_ids": self.local_device_ids
"device_ids": self.local_device_ids,
}
elif protocol == "rdma":
disaggregate_info["cache_info"][protocol] = {
@@ -957,13 +950,14 @@ class Config:
if hasattr(cls, key):
value = getattr(cls, key)
setattr(cls, value_name, value)
llm_logger.info(
f"Reset parameter {value_name} = {value} from configuration."
)
llm_logger.info(f"Reset parameter {value_name} = {value} from configuration.")

reset_value(self.cache_config, "block_size", "infer_model_block_size")
reset_value(self.model_config, "return_full_hidden_states",
"return_full_hidden_states")
reset_value(
self.model_config,
"return_full_hidden_states",
"return_full_hidden_states",
)
reset_value(self.cache_config, "cache_dtype", "infer_model_dtype")

def _check_master(self):