polish code with new pre-commit rule (#2923)

Author: Zero Rains
Committed by: GitHub
Date: 2025-07-19 23:19:27 +08:00
Parent: b8676d71a8
Commit: 25698d56d1
424 changed files with 14307 additions and 13518 deletions
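
The hunks below are the output of an auto-formatter enforced by the new pre-commit hook: most of them pair the old hanging-indent, single-quote style with a reformatted replacement (one argument per line, trailing commas, double quotes, sorted imports). The exact tool and settings are not part of this excerpt; the toy snippet below only illustrates the kind of rewrite involved and uses made-up names.

# Illustration only: the style of change the new pre-commit hook applies in the
# diff below (assumed Black/isort-style settings; not the project's actual config).

# Before: continuation aligned to the opening parenthesis, single quotes.
def load_model(model_name_or_path, config_json_file='config.json',
               dynamic_load_weight=False, quantization=None):
    return model_name_or_path, config_json_file, dynamic_load_weight, quantization


# After: one parameter per line, trailing comma, dedented closing parenthesis,
# double quotes.
def load_model(
    model_name_or_path,
    config_json_file="config.json",
    dynamic_load_weight=False,
    quantization=None,
):
    return model_name_or_path, config_json_file, dynamic_load_weight, quantization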


@@ -23,8 +23,14 @@ from typing import Any, Dict, List, Literal, Optional
from fastdeploy import envs
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.utils import (ceil_div, check_unified_ckpt, get_host_ip,
is_port_available, get_random_port, llm_logger)
from fastdeploy.utils import (
ceil_div,
check_unified_ckpt,
get_host_ip,
get_random_port,
is_port_available,
llm_logger,
)
TaskOption = Literal["generate"]
@@ -39,13 +45,15 @@ class ModelConfig:
model_name_or_path (str): Name or path of the model.
"""
def __init__(self,
model_name_or_path: str,
config_json_file: str = "config.json",
dynamic_load_weight: bool = False,
load_strategy: str = "ipc_snapshot",
quantization: str = None,
download_dir: Optional[str] = None):
def __init__(
self,
model_name_or_path: str,
config_json_file: str = "config.json",
dynamic_load_weight: bool = False,
load_strategy: str = "ipc_snapshot",
quantization: str = None,
download_dir: Optional[str] = None,
):
"""
Initialize the ModelConfig class.
@@ -64,11 +72,9 @@ class ModelConfig:
if os.path.isfile(model_name_or_path):
try:
from paddleformers.transformers import AutoConfig
config = AutoConfig.from_pretrained(model_name_or_path)
config_dict = {
k: v
for k, v in vars(config).items() if not k.startswith('_')
}
config_dict = {k: v for k, v in vars(config).items() if not k.startswith("_")}
for key, value in config_dict.items():
setattr(self, key, value)
except Exception:
@@ -115,8 +121,7 @@ class ModelConfig:
if not hasattr(self, "mla_use_absorb"):
self.mla_use_absorb = False
if not hasattr(self, "head_dim"):
assert hasattr(self, "hidden_size") and hasattr(
self, "num_attention_heads")
assert hasattr(self, "hidden_size") and hasattr(self, "num_attention_heads")
self.head_dim = self.hidden_size // self.num_attention_heads
def read_from_env(self):
@@ -132,11 +137,9 @@ class ModelConfig:
if not hasattr(self, key.lower()):
if os.getenv(key, None):
value = eval(os.getenv(key))
llm_logger.info(
f"Get parameter `{key}` = {value} from environment.")
llm_logger.info(f"Get parameter `{key}` = {value} from environment.")
else:
llm_logger.info(
f"Parameter `{key}` will use default value {value}.")
llm_logger.info(f"Parameter `{key}` will use default value {value}.")
setattr(self, key.lower(), value)
reset_config_value("COMPRESSION_RATIO", 1.0)
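
The environment-override pattern above (read an upper-case variable, eval() the raw string, fall back to a default) can be written as a standalone helper; this is a minimal sketch that mirrors the reset_config_value closure in read_from_env, with illustrative names rather than FastDeploy API.

import os

def reset_config_value(cfg, key, default_value):
    # Skip if the attribute was already set explicitly on the config object.
    if hasattr(cfg, key.lower()):
        return
    raw = os.getenv(key, None)
    # eval() mirrors the source: "1.0" becomes a float, "[1, 2]" becomes a list.
    value = eval(raw) if raw else default_value
    setattr(cfg, key.lower(), value)

class _Cfg:
    pass

cfg = _Cfg()
reset_config_value(cfg, "COMPRESSION_RATIO", 1.0)  # 0.8 if COMPRESSION_RATIO=0.8 is exported, else 1.0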
@@ -153,8 +156,7 @@ class ModelConfig:
llm_logger.info("Model Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")
class CacheConfig:
@@ -211,8 +213,7 @@ class CacheConfig:
self.enc_dec_block_num = enc_dec_block_num
self.cache_dtype = cache_dtype
if hasattr(model_cfg, "quantization_config"):
self.cache_dtype = model_cfg.quantization_config.get(
"kv_cache_quant_type", cache_dtype)
self.cache_dtype = model_cfg.quantization_config.get("kv_cache_quant_type", cache_dtype)
self.enable_chunked_prefill = enable_chunked_prefill
self.rdma_comm_ports = rdma_comm_ports
@@ -220,7 +221,7 @@ class CacheConfig:
self.pd_comm_port = pd_comm_port
if rdma_comm_ports is not None and isinstance(rdma_comm_ports, str):
self.rdma_comm_ports = rdma_comm_ports.split(',')
self.rdma_comm_ports = rdma_comm_ports.split(",")
if pd_comm_port is not None and isinstance(pd_comm_port, str):
self.pd_comm_port = [int(port) for port in pd_comm_port.split(",")]
@@ -236,41 +237,39 @@ class CacheConfig:
self.cache_queue_port = cache_queue_port
self.swap_space = swap_space
if (hasattr(self.model_cfg, "num_key_value_heads")
and hasattr(self.model_cfg, "num_key_value_heads")
and self.model_cfg.num_key_value_heads is not None
and int(self.model_cfg.num_key_value_heads) > 0):
if (
hasattr(self.model_cfg, "num_key_value_heads")
and hasattr(self.model_cfg, "num_key_value_heads")
and self.model_cfg.num_key_value_heads is not None
and int(self.model_cfg.num_key_value_heads) > 0
):
kv_num_head = int(self.model_cfg.num_key_value_heads)
else:
kv_num_head = self.model_cfg.num_attention_heads
self.model_cfg.kv_num_head = kv_num_head
# TODO check name
if "int4" in self.cache_dtype.lower(
) or "float4" in self.cache_dtype.lower():
if "int4" in self.cache_dtype.lower() or "float4" in self.cache_dtype.lower():
byte_size = 0.5
self.cache_dtype = "uint8"
elif "int8" in self.cache_dtype.lower(
) or "float8" in self.cache_dtype.lower():
elif "int8" in self.cache_dtype.lower() or "float8" in self.cache_dtype.lower():
self.cache_dtype = "uint8"
byte_size = 1
else:
byte_size = 2
self.each_token_cache_space = int(
self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim *
byte_size)
self.bytes_per_block = int(self.each_token_cache_space *
self.block_size)
self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * byte_size
)
self.bytes_per_block = int(self.each_token_cache_space * self.block_size)
self.bytes_per_layer_per_block = int(
self.block_size * self.model_cfg.kv_num_head *
self.model_cfg.head_dim // tensor_parallel_size * byte_size)
self.block_size * self.model_cfg.kv_num_head * self.model_cfg.head_dim // tensor_parallel_size * byte_size
)
if self.swap_space is None:
self.num_cpu_blocks = 0
else:
self.num_cpu_blocks = int(self.swap_space * 1024**3 /
self.bytes_per_block)
self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block)
self._verify_args()
def metrics_info(self):
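
To make the cache-sizing arithmetic above concrete, here is the same computation with made-up model dimensions; the numbers are illustrative only and not taken from this commit.

# Worked example of the KV-cache sizing in CacheConfig above (assumed toy sizes).
num_layers = 32
kv_num_head = 8          # num_key_value_heads, falling back to num_attention_heads
head_dim = 128
byte_size = 2            # bf16/fp16 cache; 1 for int8/float8, 0.5 for int4/float4
block_size = 64
tensor_parallel_size = 2
swap_space = 16          # GiB of CPU swap

each_token_cache_space = int(num_layers * kv_num_head * head_dim * byte_size)  # 65_536 bytes per token
bytes_per_block = int(each_token_cache_space * block_size)                     # 4 MiB per block
bytes_per_layer_per_block = int(block_size * kv_num_head * head_dim // tensor_parallel_size * byte_size)  # 65_536
num_cpu_blocks = int(swap_space * 1024**3 / bytes_per_block)                   # 4096 CPU blocks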
@@ -279,12 +278,9 @@ class CacheConfig:
def _verify_args(self):
if self.gpu_memory_utilization > 1.0:
raise ValueError(
"GPU memory utilization must be less than 1.0. Got "
f"{self.gpu_memory_utilization}.")
raise ValueError("GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.")
if self.kv_cache_ratio > 1.0:
raise ValueError("KV cache ratio must be less than 1.0. Got "
f"{self.kv_cache_ratio}.")
raise ValueError("KV cache ratio must be less than 1.0. Got " f"{self.kv_cache_ratio}.")
def postprocess(self, num_total_tokens, number_of_tasks):
"""
@@ -293,27 +289,24 @@ class CacheConfig:
self.dec_token_num = self.enc_dec_block_num * self.block_size
if self.num_gpu_blocks_override is not None:
self.total_block_num = self.num_gpu_blocks_override
self.prefill_kvcache_block_num = int(self.total_block_num *
self.kv_cache_ratio)
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
else:
length = num_total_tokens // number_of_tasks
block_num = (length + self.block_size - 1 +
self.dec_token_num) // self.block_size
block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
self.total_block_num = block_num * number_of_tasks
self.prefill_kvcache_block_num = self.total_block_num
llm_logger.info(
f"Doing profile, the total_block_num:{self.total_block_num}")
llm_logger.info(f"Doing profile, the total_block_num:{self.total_block_num}")
def reset(self, num_gpu_blocks):
"""
reset gpu block number
"""
self.total_block_num = num_gpu_blocks
self.prefill_kvcache_block_num = int(self.total_block_num *
self.kv_cache_ratio)
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
llm_logger.info(
(f"Reset block num, the total_block_num:{self.total_block_num},"
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"))
f"Reset block num, the total_block_num:{self.total_block_num},"
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
)
def print(self):
"""
@@ -323,8 +316,7 @@ class CacheConfig:
llm_logger.info("Cache Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")
class SpeculativeConfig:
@@ -340,14 +332,16 @@ class SpeculativeConfig:
benchmark_mode (bool): Whether to use benchmark mode.
"""
def __init__(self,
method: Optional[str] = None,
num_speculative_tokens: Optional[int] = 1,
model: Optional[str] = None,
quantization: Optional[str] = "WINT8",
max_model_len: Optional[int] = None,
benchmark_mode: bool = False,
**kwargs):
def __init__(
self,
method: Optional[str] = None,
num_speculative_tokens: Optional[int] = 1,
model: Optional[str] = None,
quantization: Optional[str] = "WINT8",
max_model_len: Optional[int] = None,
benchmark_mode: bool = False,
**kwargs,
):
self.model_name_or_path = model
self.method = method
self.num_speculative_tokens = num_speculative_tokens
@@ -381,8 +375,7 @@ class SpeculativeConfig:
self.config_path = os.path.join(self.model_name_or_path, "config.json")
if os.path.exists(self.config_path):
self.model_config = json.load(
open(self.config_path, 'r', encoding='utf-8'))
self.model_config = json.load(open(self.config_path, "r", encoding="utf-8"))
def reset(self):
"""
@@ -414,10 +407,7 @@ class SpeculativeConfig:
"""
Convert speculative_config to json string.
"""
return json.dumps({
key: value
for key, value in self.__dict__.items() if value is not None
})
return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})
def print(self):
"""
@@ -427,8 +417,7 @@ class SpeculativeConfig:
llm_logger.info("Speculative Decoding Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")
def __str__(self) -> str:
return self.to_json_string()
@@ -440,7 +429,7 @@ class GraphOptimizationConfig:
graph_opt_level: Optional[int] = 0,
use_cudagraph: Optional[bool] = None,
cudagraph_capture_sizes: Optional[List[int]] = None,
**kwargs
**kwargs,
):
"""
Graph Optimization Configuration class.
@@ -460,10 +449,7 @@ class GraphOptimizationConfig:
"""
Convert speculative_config to json string.
"""
return json.dumps({
key: value
for key, value in self.__dict__.items()
})
return json.dumps({key: value for key, value in self.__dict__.items()})
def __str__(self) -> str:
return self.to_json_string()
@@ -473,22 +459,30 @@ class GraphOptimizationConfig:
graph_opt_level: Optional[int] = None,
use_cudagraph: Optional[bool] = None,
cudagraph_capture_sizes: Optional[List[int]] = None,
**kwargs
**kwargs,
) -> None:
""" Check the legality of parameters passed in from the command line """
"""Check the legality of parameters passed in from the command line"""
if graph_opt_level is not None:
assert graph_opt_level in [0, 1, 2], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2."
assert graph_opt_level in [
0,
1,
2,
], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2."
if use_cudagraph is not None:
assert type(use_cudagraph) is bool, "In graph optimization config, type of use_cudagraph must is bool."
if cudagraph_capture_sizes is not None:
assert type(cudagraph_capture_sizes) is list, "In graph optimization config, type of cudagraph_capture_sizes must is list."
assert len(cudagraph_capture_sizes) > 0, "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."
assert (
type(cudagraph_capture_sizes) is list
), "In graph optimization config, type of cudagraph_capture_sizes must is list."
assert (
len(cudagraph_capture_sizes) > 0
), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."
for key, value in kwargs.items():
raise ValueError(f"Invalid --graph-optimization-config parameter {key}")
def update_use_cudagraph(self, argument:bool):
def update_use_cudagraph(self, argument: bool):
"""
Unified user specifies the use_cudagraph parameter through two methods,
'--use-cudagraph' and '--graph-optimization-config'
@@ -499,9 +493,12 @@ class GraphOptimizationConfig:
else:
# User both set '--use-cudagraph' and '--graph-optimization-config'
if self.use_cudagraph is False and argument is True:
raise ValueError("Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously.")
raise ValueError(
"Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously."
)
argument = self.use_cudagraph
class ParallelConfig:
"""
Configuration for parallelism.
@@ -544,8 +541,7 @@ class ParallelConfig:
llm_logger.info("Parallel Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")
@dataclass
@@ -560,6 +556,7 @@ class CommitConfig:
cuda_version: CUDA version string
compiler_version: CXX compiler version string
"""
fastdeploy_commit: str = ""
paddle_version: str = ""
paddle_commit: str = ""
@@ -573,7 +570,7 @@ class CommitConfig:
def _load_from_version_file(self, file_path: str = "fastdeploy/version.txt"):
"""Internal method to load version info from file"""
try:
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
for line in f:
line = line.strip()
if line.startswith("fastdeploy GIT COMMIT ID:"):
@@ -589,7 +586,7 @@ class CommitConfig:
except FileNotFoundError:
llm_logger.info(f"Warning: Version file not found at {file_path}")
except Exception as e:
llm_logger.info(f"Warning: Could not read version file - {str(e)}")
llm_logger.info(f"Warning: Could not read version file - {e!s}")
def print(self):
"""
@@ -599,8 +596,7 @@ class CommitConfig:
llm_logger.info("Fasedeploy Commit Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")
class Config:
@@ -701,7 +697,7 @@ class Config:
self.max_num_batched_tokens = max_num_batched_tokens
self.tensor_parallel_size = tensor_parallel_size
self.dist_init_ip = dist_init_ip
self.nnode = nnodes
self.node_rank = node_rank
if self.dist_init_ip is None:
@@ -728,7 +724,6 @@ class Config:
self.disable_any_whitespace = disable_any_whitespace
self._str_to_list("innode_prefill_ports", int)
assert self.splitwise_role in ["mixed", "prefill", "decode"]
# TODO
@@ -739,19 +734,16 @@ class Config:
self.max_prefill_batch = 1  # TODO: multimodal prefill currently only supports parallelism degree 1; to be optimized
# TODO(@wufeisheng): TP and EP need to be supported simultaneously.
assert (self.tensor_parallel_size == 1
and self.parallel_config.expert_parallel_size
>= 1) or (self.tensor_parallel_size >= 1
and self.parallel_config.expert_parallel_size
== 1), "TP and EP cannot be enabled at the same time"
assert (self.tensor_parallel_size == 1 and self.parallel_config.expert_parallel_size >= 1) or (
self.tensor_parallel_size >= 1 and self.parallel_config.expert_parallel_size == 1
), "TP and EP cannot be enabled at the same time"
num_ranks = self.tensor_parallel_size * self.parallel_config.expert_parallel_size
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
if num_ranks > self.max_chips_per_node:
self.worker_num_per_node = self.max_chips_per_node
nnode = ceil_div(num_ranks, self.worker_num_per_node)
assert nnode == self.nnode, \
f"nnode: {nnode}, but got {self.nnode}"
assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}"
else:
self.worker_num_per_node = num_ranks
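
The parallel-layout logic above enforces that only one of tensor parallelism and expert parallelism exceeds 1, then derives how many worker ranks sit on each node; a small sketch with assumed sizes (math.ceil stands in for the ceil_div helper imported at the top of the file):

import math

tensor_parallel_size = 1
expert_parallel_size = 16      # EP-only deployment in this sketch
max_chips_per_node = 8         # 16 on Iluvatar hardware, 8 otherwise
nnodes = 2

# Exactly one of TP and EP may be greater than 1.
assert (tensor_parallel_size == 1 and expert_parallel_size >= 1) or (
    tensor_parallel_size >= 1 and expert_parallel_size == 1
), "TP and EP cannot be enabled at the same time"

num_ranks = tensor_parallel_size * expert_parallel_size   # 16 ranks in total
if num_ranks > max_chips_per_node:
    worker_num_per_node = max_chips_per_node               # 8 ranks per node
    assert math.ceil(num_ranks / worker_num_per_node) == nnodes  # needs 2 nodes
else:
    worker_num_per_node = num_ranks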
@@ -772,13 +764,14 @@ class Config:
"""
calculate some parameters
"""
assert self.device_ids.split(',').__len__() == self.worker_num_per_node, \
f"invalid CUDA_VISIBLE_DEVICES, should be equal to {self.worker_num_per_node}"
assert (
self.device_ids.split(",").__len__() == self.worker_num_per_node
), f"invalid CUDA_VISIBLE_DEVICES, should be equal to {self.worker_num_per_node}"
assert self.worker_num_per_node % self.tensor_parallel_size == 0, \
f"tensor_parallel_size: {self.tensor_parallel_size} should be divisible by worker_num_per_node: {self.worker_num_per_node}"
self.local_device_ids = self.device_ids.split(
',')[:self.tensor_parallel_size]
assert (
self.worker_num_per_node % self.tensor_parallel_size == 0
), f"tensor_parallel_size: {self.tensor_parallel_size} should be divisible by worker_num_per_node: {self.worker_num_per_node}"
self.local_device_ids = self.device_ids.split(",")[: self.tensor_parallel_size]
self.host_ip = get_host_ip()
@@ -788,6 +781,7 @@ class Config:
self.is_master = False
import paddle
self.paddle_commit_id = paddle.version.commit
if self.max_num_batched_tokens is None:
@@ -799,10 +793,8 @@ class Config:
if self.long_prefill_token_threshold == 0:
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
self.cache_config.postprocess(self.max_num_batched_tokens,
self.max_num_seqs)
self.cache_config.max_block_num_per_seq = int(
self.max_model_len // self.cache_config.block_size)
self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs)
self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)
if self.guided_decoding_backend == "auto":
if self.enable_mm:
@@ -814,30 +806,26 @@ class Config:
"""
check the legality of config
"""
assert (
self.max_num_seqs <= 256
), "The parameter `max_num_seqs` is not allowed to exceed 256, " "but now it's {}.".format(
self.max_num_seqs)
assert (
is_port_available('0.0.0.0', self.engine_worker_queue_port)
assert self.max_num_seqs <= 256, (
"The parameter `max_num_seqs` is not allowed to exceed 256, " f"but now it's {self.max_num_seqs}."
)
assert is_port_available(
"0.0.0.0", self.engine_worker_queue_port
), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use."
assert (
self.max_chips_per_node >= self.tensor_parallel_size > 0
), f"tensor_parallel_size: {self.tensor_parallel_size} should be between 1 and {self.max_chips_per_node}"
assert (self.nnode >= 1), f"nnode: {self.nnode} should no less than 1"
assert (
self.max_model_len >= 16
), f"max_model_len: {self.max_model_len} should be larger than 16"
assert (
self.max_num_seqs
>= 1), f"max_num_seqs: {self.max_num_seqs} should be larger than 1"
assert (
self.max_num_batched_tokens >= self.max_num_seqs
), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \
assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1"
assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16"
assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be larger than 1"
assert self.max_num_batched_tokens >= self.max_num_seqs, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}"
assert (self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs), \
f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger" \
f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}"
)
assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger"
f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}"
)
assert (
self.max_num_partial_prefills >= 1
), f"max_num_partial_prefills: {self.max_num_partial_prefills} should be larger than or equal to 1"
@@ -845,31 +833,38 @@ class Config:
assert (
self.max_long_partial_prefills >= 1
), f"max_long_partial_prefills: {self.max_long_partial_prefills} should be larger than or equal to 1"
assert (self.max_long_partial_prefills <= self.max_num_partial_prefills), \
f"max_long_partial_prefills: {self.max_long_partial_prefills} should " \
f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}"
assert self.max_long_partial_prefills <= self.max_num_partial_prefills, (
f"max_long_partial_prefills: {self.max_long_partial_prefills} should "
f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}"
)
if not self.cache_config.enable_chunked_prefill:
assert (
self.max_num_batched_tokens >= self.max_model_len
), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \
assert self.max_num_batched_tokens >= self.max_model_len, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_model_len: {self.max_model_len}"
)
else:
assert (
self.max_num_batched_tokens >= self.cache_config.block_size
), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \
assert self.max_num_batched_tokens >= self.cache_config.block_size, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to block_size: {self.cache_config.block_size}"
)
if self.max_num_partial_prefills > 1:
assert (self.cache_config.enable_chunked_prefill is True), \
"Chunked prefill must be enabled to set max_num_partial_prefills > 1"
assert (self.long_prefill_token_threshold < self.max_model_len), \
f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than"\
f" max_model_len: {self.max_model_len}"
assert (
self.cache_config.enable_chunked_prefill is True
), "Chunked prefill must be enabled to set max_num_partial_prefills > 1"
assert self.long_prefill_token_threshold < self.max_model_len, (
f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than"
f" max_model_len: {self.max_model_len}"
)
if self.guided_decoding_backend is not None:
assert self.guided_decoding_backend in ["xgrammar", "XGrammar", "auto", "off"], \
f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."
assert self.guided_decoding_backend in [
"xgrammar",
"XGrammar",
"auto",
"off",
], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."
if self.guided_decoding_backend != "off":
# TODO: mm support guided_decoding
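
Stripped of their messages, the batching constraints in check() chain together as below; the values are illustrative and chosen to satisfy every assertion (the port, TP-range, and guided-decoding checks are omitted here).

# Illustrative restatement of the batching invariants asserted above (toy values).
max_num_seqs = 64                    # must lie in [1, 256]
max_model_len = 8192                 # must be >= 16
max_num_batched_tokens = 16384
block_size = 64
enable_chunked_prefill = True
max_num_partial_prefills = 4
max_long_partial_prefills = 2
long_prefill_token_threshold = int(max_model_len * 0.04)   # 327 by default

assert 1 <= max_num_seqs <= 256
assert max_num_seqs <= max_num_batched_tokens <= max_model_len * max_num_seqs
assert max_num_partial_prefills >= 1 and max_long_partial_prefills >= 1
assert max_long_partial_prefills <= max_num_partial_prefills
if enable_chunked_prefill:
    assert max_num_batched_tokens >= block_size
else:
    assert max_num_batched_tokens >= max_model_len
if max_num_partial_prefills > 1:
    assert enable_chunked_prefill, "chunked prefill must be enabled for multiple partial prefills"
    assert long_prefill_token_threshold < max_model_len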
@@ -878,8 +873,7 @@ class Config:
# TODO: speculative decoding support guided_decoding
# TODO: xpu support guided_decoding
assert not current_platform.is_xpu(
), "XPU currently do not support guided_decoding"
assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding"
try:
import xgrammar # noqa
@@ -897,22 +891,22 @@ class Config:
Args:
file (str): the path of file to save config
"""
llm_logger.info(
"=================== Configuration Information ===============")
llm_logger.info("=================== Configuration Information ===============")
for k, v in self.__dict__.items():
if k == "generation_config" and v is not None:
for gck, gcv in v.to_dict().items():
llm_logger.info("{:<20}:{:<6}{}".format(gck, "", gcv))
elif (k == "cache_config" or
k == "model_config" or
k == "scheduler_config" or
k == "parallel_config" or
k == "commit_config"):
elif (
k == "cache_config"
or k == "model_config"
or k == "scheduler_config"
or k == "parallel_config"
or k == "commit_config"
):
v.print()
else:
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")
if file is not None:
f = open(file, "a")
now_time = datetime.now()
@@ -929,15 +923,14 @@ class Config:
if self.splitwise_role != "mixed":
disaggregate_info["role"] = self.splitwise_role
disaggregate_info["cache_info"] = dict()
current_protocol = self.cache_config.cache_transfer_protocol.split(
",")
current_protocol = self.cache_config.cache_transfer_protocol.split(",")
disaggregate_info["transfer_protocol"] = current_protocol
for protocol in current_protocol:
if protocol == "ipc":
disaggregate_info["cache_info"][protocol] = {
"ip": self.host_ip,
"port": self.engine_worker_queue_port,
"device_ids": self.local_device_ids
"device_ids": self.local_device_ids,
}
elif protocol == "rdma":
disaggregate_info["cache_info"][protocol] = {
@@ -957,13 +950,14 @@ class Config:
if hasattr(cls, key):
value = getattr(cls, key)
setattr(cls, value_name, value)
llm_logger.info(
f"Reset parameter {value_name} = {value} from configuration."
)
llm_logger.info(f"Reset parameter {value_name} = {value} from configuration.")
reset_value(self.cache_config, "block_size", "infer_model_block_size")
reset_value(self.model_config, "return_full_hidden_states",
"return_full_hidden_states")
reset_value(
self.model_config,
"return_full_hidden_states",
"return_full_hidden_states",
)
reset_value(self.cache_config, "cache_dtype", "infer_model_dtype")
def _check_master(self):