Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-05 16:48:03 +08:00
polish code with new pre-commit rule (#2923)
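The change is mechanical reformatting driven by the new pre-commit rule: hanging-indent imports and call sites are rewrapped one element per line with trailing commas, backslash continuations and wrapped asserts are collapsed or parenthesized, and single quotes become double quotes, with no behavior change in the hunks shown. As a quick orientation, here is the import pattern from the first hunk below, shown as a before/after sketch (the formatter itself is not named on this page; the output is consistent with black-style formatting):

# Old layout (removed below): hanging indent
from fastdeploy.utils import (ceil_div, check_unified_ckpt, get_host_ip,
                              is_port_available, get_random_port, llm_logger)

# New layout (added below): one name per line, sorted, trailing comma
from fastdeploy.utils import (
    ceil_div,
    check_unified_ckpt,
    get_host_ip,
    get_random_port,
    is_port_available,
    llm_logger,
)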
@@ -23,8 +23,14 @@ from typing import Any, Dict, List, Literal, Optional
from fastdeploy import envs
from fastdeploy.platforms import current_platform
from fastdeploy.scheduler import SchedulerConfig
from fastdeploy.utils import (ceil_div, check_unified_ckpt, get_host_ip,
is_port_available, get_random_port, llm_logger)
from fastdeploy.utils import (
ceil_div,
check_unified_ckpt,
get_host_ip,
get_random_port,
is_port_available,
llm_logger,
)

TaskOption = Literal["generate"]

@@ -39,13 +45,15 @@ class ModelConfig:
model_name_or_path (str): Name or path of the model.
"""

def __init__(self,
model_name_or_path: str,
config_json_file: str = "config.json",
dynamic_load_weight: bool = False,
load_strategy: str = "ipc_snapshot",
quantization: str = None,
download_dir: Optional[str] = None):
def __init__(
self,
model_name_or_path: str,
config_json_file: str = "config.json",
dynamic_load_weight: bool = False,
load_strategy: str = "ipc_snapshot",
quantization: str = None,
download_dir: Optional[str] = None,
):
"""
Initialize the ModelConfig class.

@@ -64,11 +72,9 @@ class ModelConfig:
if os.path.isfile(model_name_or_path):
try:
from paddleformers.transformers import AutoConfig

config = AutoConfig.from_pretrained(model_name_or_path)
config_dict = {
k: v
for k, v in vars(config).items() if not k.startswith('_')
}
config_dict = {k: v for k, v in vars(config).items() if not k.startswith("_")}
for key, value in config_dict.items():
setattr(self, key, value)
except Exception:
@@ -115,8 +121,7 @@ class ModelConfig:
if not hasattr(self, "mla_use_absorb"):
self.mla_use_absorb = False
if not hasattr(self, "head_dim"):
assert hasattr(self, "hidden_size") and hasattr(
self, "num_attention_heads")
assert hasattr(self, "hidden_size") and hasattr(self, "num_attention_heads")
self.head_dim = self.hidden_size // self.num_attention_heads

def read_from_env(self):
@@ -132,11 +137,9 @@ class ModelConfig:
if not hasattr(self, key.lower()):
if os.getenv(key, None):
value = eval(os.getenv(key))
llm_logger.info(
f"Get parameter `{key}` = {value} from environment.")
llm_logger.info(f"Get parameter `{key}` = {value} from environment.")
else:
llm_logger.info(
f"Parameter `{key}` will use default value {value}.")
llm_logger.info(f"Parameter `{key}` will use default value {value}.")
setattr(self, key.lower(), value)

reset_config_value("COMPRESSION_RATIO", 1.0)
@@ -153,8 +156,7 @@ class ModelConfig:
llm_logger.info("Model Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")


class CacheConfig:
@@ -211,8 +213,7 @@ class CacheConfig:
self.enc_dec_block_num = enc_dec_block_num
self.cache_dtype = cache_dtype
if hasattr(model_cfg, "quantization_config"):
self.cache_dtype = model_cfg.quantization_config.get(
"kv_cache_quant_type", cache_dtype)
self.cache_dtype = model_cfg.quantization_config.get("kv_cache_quant_type", cache_dtype)

self.enable_chunked_prefill = enable_chunked_prefill
self.rdma_comm_ports = rdma_comm_ports
@@ -220,7 +221,7 @@ class CacheConfig:
self.pd_comm_port = pd_comm_port

if rdma_comm_ports is not None and isinstance(rdma_comm_ports, str):
self.rdma_comm_ports = rdma_comm_ports.split(',')
self.rdma_comm_ports = rdma_comm_ports.split(",")

if pd_comm_port is not None and isinstance(pd_comm_port, str):
self.pd_comm_port = [int(port) for port in pd_comm_port.split(",")]
@@ -236,41 +237,39 @@ class CacheConfig:
self.cache_queue_port = cache_queue_port
self.swap_space = swap_space

if (hasattr(self.model_cfg, "num_key_value_heads")
and hasattr(self.model_cfg, "num_key_value_heads")
and self.model_cfg.num_key_value_heads is not None
and int(self.model_cfg.num_key_value_heads) > 0):
if (
hasattr(self.model_cfg, "num_key_value_heads")
and hasattr(self.model_cfg, "num_key_value_heads")
and self.model_cfg.num_key_value_heads is not None
and int(self.model_cfg.num_key_value_heads) > 0
):
kv_num_head = int(self.model_cfg.num_key_value_heads)
else:
kv_num_head = self.model_cfg.num_attention_heads
self.model_cfg.kv_num_head = kv_num_head

# TODO check name
if "int4" in self.cache_dtype.lower(
) or "float4" in self.cache_dtype.lower():
if "int4" in self.cache_dtype.lower() or "float4" in self.cache_dtype.lower():
byte_size = 0.5
self.cache_dtype = "uint8"
elif "int8" in self.cache_dtype.lower(
) or "float8" in self.cache_dtype.lower():
elif "int8" in self.cache_dtype.lower() or "float8" in self.cache_dtype.lower():
self.cache_dtype = "uint8"
byte_size = 1
else:
byte_size = 2

self.each_token_cache_space = int(
self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim *
byte_size)
self.bytes_per_block = int(self.each_token_cache_space *
self.block_size)
self.model_cfg.num_layers * kv_num_head * self.model_cfg.head_dim * byte_size
)
self.bytes_per_block = int(self.each_token_cache_space * self.block_size)
self.bytes_per_layer_per_block = int(
self.block_size * self.model_cfg.kv_num_head *
self.model_cfg.head_dim // tensor_parallel_size * byte_size)
self.block_size * self.model_cfg.kv_num_head * self.model_cfg.head_dim // tensor_parallel_size * byte_size
)

if self.swap_space is None:
self.num_cpu_blocks = 0
else:
self.num_cpu_blocks = int(self.swap_space * 1024**3 /
self.bytes_per_block)
self.num_cpu_blocks = int(self.swap_space * 1024**3 / self.bytes_per_block)
self._verify_args()

def metrics_info(self):
@@ -279,12 +278,9 @@ class CacheConfig:

def _verify_args(self):
if self.gpu_memory_utilization > 1.0:
raise ValueError(
"GPU memory utilization must be less than 1.0. Got "
f"{self.gpu_memory_utilization}.")
raise ValueError("GPU memory utilization must be less than 1.0. Got " f"{self.gpu_memory_utilization}.")
if self.kv_cache_ratio > 1.0:
raise ValueError("KV cache ratio must be less than 1.0. Got "
f"{self.kv_cache_ratio}.")
raise ValueError("KV cache ratio must be less than 1.0. Got " f"{self.kv_cache_ratio}.")

def postprocess(self, num_total_tokens, number_of_tasks):
"""
@@ -293,27 +289,24 @@ class CacheConfig:
self.dec_token_num = self.enc_dec_block_num * self.block_size
if self.num_gpu_blocks_override is not None:
self.total_block_num = self.num_gpu_blocks_override
self.prefill_kvcache_block_num = int(self.total_block_num *
self.kv_cache_ratio)
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
else:
length = num_total_tokens // number_of_tasks
block_num = (length + self.block_size - 1 +
self.dec_token_num) // self.block_size
block_num = (length + self.block_size - 1 + self.dec_token_num) // self.block_size
self.total_block_num = block_num * number_of_tasks
self.prefill_kvcache_block_num = self.total_block_num
llm_logger.info(
f"Doing profile, the total_block_num:{self.total_block_num}")
llm_logger.info(f"Doing profile, the total_block_num:{self.total_block_num}")

def reset(self, num_gpu_blocks):
"""
reset gpu block number
"""
self.total_block_num = num_gpu_blocks
self.prefill_kvcache_block_num = int(self.total_block_num *
self.kv_cache_ratio)
self.prefill_kvcache_block_num = int(self.total_block_num * self.kv_cache_ratio)
llm_logger.info(
(f"Reset block num, the total_block_num:{self.total_block_num},"
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"))
f"Reset block num, the total_block_num:{self.total_block_num},"
f" prefill_kvcache_block_num:{self.prefill_kvcache_block_num}"
)

def print(self):
"""
@@ -323,8 +316,7 @@ class CacheConfig:
llm_logger.info("Cache Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")


class SpeculativeConfig:
@@ -340,14 +332,16 @@ class SpeculativeConfig:
benchmark_mode (bool): Whether to use benchmark mode.
"""

def __init__(self,
method: Optional[str] = None,
num_speculative_tokens: Optional[int] = 1,
model: Optional[str] = None,
quantization: Optional[str] = "WINT8",
max_model_len: Optional[int] = None,
benchmark_mode: bool = False,
**kwargs):
def __init__(
self,
method: Optional[str] = None,
num_speculative_tokens: Optional[int] = 1,
model: Optional[str] = None,
quantization: Optional[str] = "WINT8",
max_model_len: Optional[int] = None,
benchmark_mode: bool = False,
**kwargs,
):
self.model_name_or_path = model
self.method = method
self.num_speculative_tokens = num_speculative_tokens
@@ -381,8 +375,7 @@ class SpeculativeConfig:

self.config_path = os.path.join(self.model_name_or_path, "config.json")
if os.path.exists(self.config_path):
self.model_config = json.load(
open(self.config_path, 'r', encoding='utf-8'))
self.model_config = json.load(open(self.config_path, "r", encoding="utf-8"))

def reset(self):
"""
@@ -414,10 +407,7 @@ class SpeculativeConfig:
"""
Convert speculative_config to json string.
"""
return json.dumps({
key: value
for key, value in self.__dict__.items() if value is not None
})
return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})

def print(self):
"""
@@ -427,8 +417,7 @@ class SpeculativeConfig:
llm_logger.info("Speculative Decoding Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")

def __str__(self) -> str:
return self.to_json_string()
@@ -440,7 +429,7 @@ class GraphOptimizationConfig:
graph_opt_level: Optional[int] = 0,
use_cudagraph: Optional[bool] = None,
cudagraph_capture_sizes: Optional[List[int]] = None,
**kwargs
**kwargs,
):
"""
Graph Optimization Configuration class.
@@ -460,10 +449,7 @@ class GraphOptimizationConfig:
"""
Convert speculative_config to json string.
"""
return json.dumps({
key: value
for key, value in self.__dict__.items()
})
return json.dumps({key: value for key, value in self.__dict__.items()})

def __str__(self) -> str:
return self.to_json_string()
@@ -473,22 +459,30 @@ class GraphOptimizationConfig:
graph_opt_level: Optional[int] = None,
use_cudagraph: Optional[bool] = None,
cudagraph_capture_sizes: Optional[List[int]] = None,
**kwargs
**kwargs,
) -> None:
""" Check the legality of parameters passed in from the command line """
"""Check the legality of parameters passed in from the command line"""

if graph_opt_level is not None:
assert graph_opt_level in [0, 1, 2], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2."
assert graph_opt_level in [
0,
1,
2,
], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2."
if use_cudagraph is not None:
assert type(use_cudagraph) is bool, "In graph optimization config, type of use_cudagraph must is bool."
if cudagraph_capture_sizes is not None:
assert type(cudagraph_capture_sizes) is list, "In graph optimization config, type of cudagraph_capture_sizes must is list."
assert len(cudagraph_capture_sizes) > 0, "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."
assert (
type(cudagraph_capture_sizes) is list
), "In graph optimization config, type of cudagraph_capture_sizes must is list."
assert (
len(cudagraph_capture_sizes) > 0
), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."

for key, value in kwargs.items():
raise ValueError(f"Invalid --graph-optimization-config parameter {key}")

def update_use_cudagraph(self, argument:bool):
def update_use_cudagraph(self, argument: bool):
"""
Unified user specifies the use_cudagraph parameter through two methods,
'--use-cudagraph' and '--graph-optimization-config'
@@ -499,9 +493,12 @@ class GraphOptimizationConfig:
else:
# User both set '--use-cudagraph' and '--graph-optimization-config'
if self.use_cudagraph is False and argument is True:
raise ValueError("Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously.")
raise ValueError(
"Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously."
)
argument = self.use_cudagraph


class ParallelConfig:
"""
Configuration for parallelism.
@@ -544,8 +541,7 @@ class ParallelConfig:
llm_logger.info("Parallel Configuration Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")


@dataclass
@@ -560,6 +556,7 @@ class CommitConfig:
cuda_version: CUDA version string
compiler_version: CXX compiler version string
"""

fastdeploy_commit: str = ""
paddle_version: str = ""
paddle_commit: str = ""
@@ -573,7 +570,7 @@ class CommitConfig:
def _load_from_version_file(self, file_path: str = "fastdeploy/version.txt"):
"""Internal method to load version info from file"""
try:
with open(file_path, 'r') as f:
with open(file_path, "r") as f:
for line in f:
line = line.strip()
if line.startswith("fastdeploy GIT COMMIT ID:"):
@@ -589,7 +586,7 @@ class CommitConfig:
except FileNotFoundError:
llm_logger.info(f"Warning: Version file not found at {file_path}")
except Exception as e:
llm_logger.info(f"Warning: Could not read version file - {str(e)}")
llm_logger.info(f"Warning: Could not read version file - {e!s}")

def print(self):
"""
@@ -599,8 +596,7 @@ class CommitConfig:
llm_logger.info("Fasedeploy Commit Information :")
for k, v in self.__dict__.items():
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")


class Config:
@@ -701,7 +697,7 @@ class Config:
self.max_num_batched_tokens = max_num_batched_tokens
self.tensor_parallel_size = tensor_parallel_size
self.dist_init_ip = dist_init_ip


self.nnode = nnodes
self.node_rank = node_rank
if self.dist_init_ip is None:
@@ -728,7 +724,6 @@ class Config:
self.disable_any_whitespace = disable_any_whitespace
self._str_to_list("innode_prefill_ports", int)


assert self.splitwise_role in ["mixed", "prefill", "decode"]

# TODO
@@ -739,19 +734,16 @@ class Config:
self.max_prefill_batch = 1 # TODO: currently the multimodal prefill stage only supports a parallelism degree of 1; to be optimized

# TODO(@wufeisheng): TP and EP need to be supported simultaneously.
assert (self.tensor_parallel_size == 1
and self.parallel_config.expert_parallel_size
>= 1) or (self.tensor_parallel_size >= 1
and self.parallel_config.expert_parallel_size
== 1), "TP and EP cannot be enabled at the same time"
assert (self.tensor_parallel_size == 1 and self.parallel_config.expert_parallel_size >= 1) or (
self.tensor_parallel_size >= 1 and self.parallel_config.expert_parallel_size == 1
), "TP and EP cannot be enabled at the same time"

num_ranks = self.tensor_parallel_size * self.parallel_config.expert_parallel_size
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
if num_ranks > self.max_chips_per_node:
self.worker_num_per_node = self.max_chips_per_node
nnode = ceil_div(num_ranks, self.worker_num_per_node)
assert nnode == self.nnode, \
f"nnode: {nnode}, but got {self.nnode}"
assert nnode == self.nnode, f"nnode: {nnode}, but got {self.nnode}"
else:
self.worker_num_per_node = num_ranks

@@ -772,13 +764,14 @@ class Config:
"""
calculate some parameters
"""
assert self.device_ids.split(',').__len__() == self.worker_num_per_node, \
f"invalid CUDA_VISIBLE_DEVICES, should be equal to {self.worker_num_per_node}"
assert (
self.device_ids.split(",").__len__() == self.worker_num_per_node
), f"invalid CUDA_VISIBLE_DEVICES, should be equal to {self.worker_num_per_node}"

assert self.worker_num_per_node % self.tensor_parallel_size == 0, \
f"tensor_parallel_size: {self.tensor_parallel_size} should be divisible by worker_num_per_node: {self.worker_num_per_node}"
self.local_device_ids = self.device_ids.split(
',')[:self.tensor_parallel_size]
assert (
self.worker_num_per_node % self.tensor_parallel_size == 0
), f"tensor_parallel_size: {self.tensor_parallel_size} should be divisible by worker_num_per_node: {self.worker_num_per_node}"
self.local_device_ids = self.device_ids.split(",")[: self.tensor_parallel_size]

self.host_ip = get_host_ip()

@@ -788,6 +781,7 @@ class Config:
self.is_master = False

import paddle

self.paddle_commit_id = paddle.version.commit

if self.max_num_batched_tokens is None:
@@ -799,10 +793,8 @@ class Config:
if self.long_prefill_token_threshold == 0:
self.long_prefill_token_threshold = int(self.max_model_len * 0.04)

self.cache_config.postprocess(self.max_num_batched_tokens,
self.max_num_seqs)
self.cache_config.max_block_num_per_seq = int(
self.max_model_len // self.cache_config.block_size)
self.cache_config.postprocess(self.max_num_batched_tokens, self.max_num_seqs)
self.cache_config.max_block_num_per_seq = int(self.max_model_len // self.cache_config.block_size)

if self.guided_decoding_backend == "auto":
if self.enable_mm:
@@ -814,30 +806,26 @@ class Config:
"""
check the legality of config
"""
assert (
self.max_num_seqs <= 256
), "The parameter `max_num_seqs` is not allowed to exceed 256, " "but now it's {}.".format(
self.max_num_seqs)
assert (
is_port_available('0.0.0.0', self.engine_worker_queue_port)
assert self.max_num_seqs <= 256, (
"The parameter `max_num_seqs` is not allowed to exceed 256, " f"but now it's {self.max_num_seqs}."
)
assert is_port_available(
"0.0.0.0", self.engine_worker_queue_port
), f"The parameter `engine_worker_queue_port`:{self.engine_worker_queue_port} is already in use."
assert (
self.max_chips_per_node >= self.tensor_parallel_size > 0
), f"tensor_parallel_size: {self.tensor_parallel_size} should be between 1 and {self.max_chips_per_node}"
assert (self.nnode >= 1), f"nnode: {self.nnode} should no less than 1"
assert (
self.max_model_len >= 16
), f"max_model_len: {self.max_model_len} should be larger than 16"
assert (
self.max_num_seqs
>= 1), f"max_num_seqs: {self.max_num_seqs} should be larger than 1"
assert (
self.max_num_batched_tokens >= self.max_num_seqs
), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \
assert self.nnode >= 1, f"nnode: {self.nnode} should no less than 1"
assert self.max_model_len >= 16, f"max_model_len: {self.max_model_len} should be larger than 16"
assert self.max_num_seqs >= 1, f"max_num_seqs: {self.max_num_seqs} should be larger than 1"
assert self.max_num_batched_tokens >= self.max_num_seqs, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_num_seqs: {self.max_num_seqs}"
assert (self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs), \
f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger" \
f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}"
)
assert self.max_num_batched_tokens <= self.max_model_len * self.max_num_seqs, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} should be larger"
f"than or equal to max_num_seqs: {self.max_num_seqs} * max_model_len: {self.max_model_len}"
)
assert (
self.max_num_partial_prefills >= 1
), f"max_num_partial_prefills: {self.max_num_partial_prefills} should be larger than or equal to 1"
@@ -845,31 +833,38 @@ class Config:
assert (
self.max_long_partial_prefills >= 1
), f"max_long_partial_prefills: {self.max_long_partial_prefills} should be larger than or equal to 1"
assert (self.max_long_partial_prefills <= self.max_num_partial_prefills), \
f"max_long_partial_prefills: {self.max_long_partial_prefills} should " \
f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}"
assert self.max_long_partial_prefills <= self.max_num_partial_prefills, (
f"max_long_partial_prefills: {self.max_long_partial_prefills} should "
f"be less than or equal to max_num_partial_prefills: {self.max_num_partial_prefills}"
)

if not self.cache_config.enable_chunked_prefill:
assert (
self.max_num_batched_tokens >= self.max_model_len
), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \
assert self.max_num_batched_tokens >= self.max_model_len, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to max_model_len: {self.max_model_len}"
)
else:
assert (
self.max_num_batched_tokens >= self.cache_config.block_size
), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \
assert self.max_num_batched_tokens >= self.cache_config.block_size, (
f"max_num_batched_tokens: {self.max_num_batched_tokens} "
f"should be larger than or equal to block_size: {self.cache_config.block_size}"
)

if self.max_num_partial_prefills > 1:
assert (self.cache_config.enable_chunked_prefill is True), \
"Chunked prefill must be enabled to set max_num_partial_prefills > 1"
assert (self.long_prefill_token_threshold < self.max_model_len), \
f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than"\
f" max_model_len: {self.max_model_len}"
assert (
self.cache_config.enable_chunked_prefill is True
), "Chunked prefill must be enabled to set max_num_partial_prefills > 1"
assert self.long_prefill_token_threshold < self.max_model_len, (
f"long_prefill_token_threshold: {self.long_prefill_token_threshold} should be less than"
f" max_model_len: {self.max_model_len}"
)

if self.guided_decoding_backend is not None:
assert self.guided_decoding_backend in ["xgrammar", "XGrammar", "auto", "off"], \
f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."
assert self.guided_decoding_backend in [
"xgrammar",
"XGrammar",
"auto",
"off",
], f"Only support xgrammar、auto guided decoding backend, but got {self.guided_decoding_backend}."

if self.guided_decoding_backend != "off":
# TODO: mm support guided_decoding
@@ -878,8 +873,7 @@ class Config:
# TODO: speculative decoding support guided_decoding

# TODO: xpu support guided_decoding
assert not current_platform.is_xpu(
), "XPU currently do not support guided_decoding"
assert not current_platform.is_xpu(), "XPU currently do not support guided_decoding"

try:
import xgrammar # noqa
@@ -897,22 +891,22 @@ class Config:
Args:
file (str): the path of file to save config
"""
llm_logger.info(
"=================== Configuration Information ===============")
llm_logger.info("=================== Configuration Information ===============")
for k, v in self.__dict__.items():
if k == "generation_config" and v is not None:
for gck, gcv in v.to_dict().items():
llm_logger.info("{:<20}:{:<6}{}".format(gck, "", gcv))
elif (k == "cache_config" or
k == "model_config" or
k == "scheduler_config" or
k == "parallel_config" or
k == "commit_config"):
elif (
k == "cache_config"
or k == "model_config"
or k == "scheduler_config"
or k == "parallel_config"
or k == "commit_config"
):
v.print()
else:
llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
llm_logger.info(
"=============================================================")
llm_logger.info("=============================================================")
if file is not None:
f = open(file, "a")
now_time = datetime.now()
@@ -929,15 +923,14 @@ class Config:
if self.splitwise_role != "mixed":
disaggregate_info["role"] = self.splitwise_role
disaggregate_info["cache_info"] = dict()
current_protocol = self.cache_config.cache_transfer_protocol.split(
",")
current_protocol = self.cache_config.cache_transfer_protocol.split(",")
disaggregate_info["transfer_protocol"] = current_protocol
for protocol in current_protocol:
if protocol == "ipc":
disaggregate_info["cache_info"][protocol] = {
"ip": self.host_ip,
"port": self.engine_worker_queue_port,
"device_ids": self.local_device_ids
"device_ids": self.local_device_ids,
}
elif protocol == "rdma":
disaggregate_info["cache_info"][protocol] = {
@@ -957,13 +950,14 @@ class Config:
if hasattr(cls, key):
value = getattr(cls, key)
setattr(cls, value_name, value)
llm_logger.info(
f"Reset parameter {value_name} = {value} from configuration."
)
llm_logger.info(f"Reset parameter {value_name} = {value} from configuration.")

reset_value(self.cache_config, "block_size", "infer_model_block_size")
reset_value(self.model_config, "return_full_hidden_states",
"return_full_hidden_states")
reset_value(
self.model_config,
"return_full_hidden_states",
"return_full_hidden_states",
)
reset_value(self.cache_config, "cache_dtype", "infer_model_dtype")

def _check_master(self):