Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-06 00:57:33 +08:00)
Unify server-side and model-side Config (Part2) (#3035)
* merge speculative and graph opt config
* add attr
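The recurring change across the hunks below is that SpeculativeConfig and GraphOptimizationConfig stop being built from keyword arguments and instead take a single dict of server args, applying only the keys they recognize. A minimal, self-contained sketch of that construction pattern (the class and field names here are illustrative, not FastDeploy code):

    from typing import Optional

    class ExampleConfig:
        """Toy config mirroring the args-dict constructor pattern used in this diff."""

        def __init__(self, args: Optional[dict]):
            # 1) declare defaults as instance attributes
            self.graph_opt_level: int = 0
            self.use_cudagraph: bool = False
            # 2) overwrite only the attributes the caller actually passed
            if args is not None:
                for key, value in args.items():
                    if hasattr(self, key):
                        setattr(self, key, value)

    cfg = ExampleConfig({"use_cudagraph": True, "unknown_key": 1})
    assert cfg.use_cudagraph is True        # known key applied
    assert not hasattr(cfg, "unknown_key")  # unrecognized keys are ignored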
@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import json
 import os
 from dataclasses import dataclass, field
 from typing import Literal, Optional
@@ -24,10 +25,12 @@ from paddleformers.transformers.configuration_utils import PretrainedConfig
 
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfigBase
-from fastdeploy.utils import get_logger
+from fastdeploy.utils import check_unified_ckpt, get_logger
 
 logger = get_logger("config", "config.log")
 
+TaskOption = Literal["generate"]
+
 
 class MoEPhase:
     """
@@ -269,6 +272,7 @@ class SpeculativeConfig:
         # This ensures that the specified simulation acceptance rate is not affected.
         self.benchmark_mode: bool = False
 
+        self.num_extra_cache_layer = 0
         # TODO(YuanRisheng): The name of the server args is different from the name of the SpeculativeConfig.
         # We temperately add the name map here and will delete it in future.
         name_map = {
@@ -284,6 +288,69 @@ class SpeculativeConfig:
                 if key == "speculative_benchmark_mode":
                     value = True if value.lower() == "true" else False
                 setattr(self, name_map[key], value)
+        self.read_model_config()
+        self.reset()
+
+    def read_model_config(self):
+        """
+        Read configuration from file.
+        """
+        self.model_config = {}
+        if not self.enabled_speculative_decoding():
+            return
+
+        self.is_unified_ckpt = check_unified_ckpt(self.model_name_or_path)
+        if self.model_name_or_path is None:
+            return
+
+        self.config_path = os.path.join(self.model_name_or_path, "config.json")
+        if os.path.exists(self.config_path):
+            self.model_config = json.load(open(self.config_path, "r", encoding="utf-8"))
+
+    def reset(self):
+        """
+        Reset configuration.
+        """
+
+        def reset_value(cls, value_name, key=None, default=None):
+            if key is not None and key in cls.model_config:
+                setattr(cls, value_name, cls.model_config[key])
+            elif getattr(cls, value_name, None) is None:
+                setattr(cls, value_name, default)
+
+        if not self.enabled_speculative_decoding():
+            return
+
+        # NOTE(liuzichang): We will support multi-layer in future
+        if self.method in ["mtp"]:
+            self.num_extra_cache_layer = 1
+
+    def enabled_speculative_decoding(self):
+        """
+        Check if speculative decoding is enabled.
+        """
+        if self.method is None:
+            return False
+        return True
+
+    def to_json_string(self):
+        """
+        Convert speculative_config to json string.
+        """
+        return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})
+
+    def print(self):
+        """
+        print all config
+
+        """
+        logger.info("Speculative Decoding Configuration Information :")
+        for k, v in self.__dict__.items():
+            logger.info("{:<20}:{:<6}{}".format(k, "", v))
+        logger.info("=============================================================")
+
+    def __str__(self) -> str:
+        return self.to_json_string()
+
 
 class DeviceConfig:
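The reset() helper added above only fills in attributes that are still unset, preferring values found in the draft model's config.json. A self-contained illustration of that fallback order (the class and field names here are illustrative, not the actual SpeculativeConfig fields):

    class ToyConfig:
        """Toy stand-in for the reset_value() fallback logic shown above."""

        def __init__(self):
            self.model_config = {"num_hidden_layers": 4}  # parsed draft-model config.json
            self.num_hidden_layers = None
            self.quantization = None

        def reset_value(self, value_name, key=None, default=None):
            if key is not None and key in self.model_config:
                setattr(self, value_name, self.model_config[key])  # config.json wins
            elif getattr(self, value_name, None) is None:
                setattr(self, value_name, default)                 # otherwise fall back

    cfg = ToyConfig()
    cfg.reset_value("num_hidden_layers", key="num_hidden_layers")
    cfg.reset_value("quantization", default="WINT8")
    assert cfg.num_hidden_layers == 4 and cfg.quantization == "WINT8"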
@@ -301,18 +368,21 @@ class DeviceConfig:
             setattr(self, key, value)
 
 
-@dataclass
 class GraphOptimizationConfig:
     """
     Configuration for compute graph level optimization.
     """
 
+    def __init__(
+        self,
+        args,
+    ):
         """The Top-level graph optimization contral corresponds to different backends.
         - 0: dyncmic graph
         - 1: static graph
         - 2: static graph + cinn compilation backend
         """
-    graph_opt_level: int = 0
+        self.graph_opt_level: int = 0
 
         # CUDA Graph Config
         """ Whether to use cudagraph.
@@ -323,20 +393,20 @@ class GraphOptimizationConfig:
         - With dyncmic graph backend: ...
         - With static grpah backend: WIP
         """
-    sot_warmup_sizes: Optional[list[int]] = field(default_factory=list)
+        self.sot_warmup_sizes: Optional[list[int]] = []
         """ Number of warmup runs for SOT warmup. """
-    use_cudagraph: bool = False
+        self.use_cudagraph: bool = False
         """Sizes to capture cudagraph.
         - None (default): capture sizes are inferred from llm config.
         - list[int]: capture sizes are specified as given."""
-    cudagraph_capture_sizes: Optional[list[int]] = None
+        self.cudagraph_capture_sizes: Optional[list[int]] = None
         """ Number of warmup runs for cudagraph. """
-    cudagraph_num_of_warmups: int = 2
+        self.cudagraph_num_of_warmups: int = 2
         """Whether to copy input tensors for cudagraph.
         If the caller can guarantee that the same input buffers
         are always used, it can set this to False. Otherwise, it should
         set this to True."""
-    cudagraph_copy_inputs: bool = False
+        self.cudagraph_copy_inputs: bool = False
         """ In static graph, this is an operation list that does not need to be captured by the CUDA graph.
         CudaGraphBackend will split these operations from the static graph.
         Example usage:
@@ -346,15 +416,21 @@ class GraphOptimizationConfig:
         can manually split the model into multiple layers and apply the @support_graph_optimization decorator
         only to the layer where CUDA graph functionality is required.
         """
-    cudagraph_splitting_ops: list[str] = field(default_factory=list)
+        self.cudagraph_splitting_ops: list[str] = []
         """ Whether to use a full cuda graph for the entire forward pass rather than
         splitting certain operations such as attention into subgraphs.
         Thus this flag cannot be used together with splitting_ops."""
-    full_cuda_graph: bool = True
+        self.full_cuda_graph: bool = True
 
-    max_capture_size: int = field(default=None, init=False)  # type: ignore
-    batch_size_to_captured_size: dict[int, int] = field(default=None, init=False)  # type: ignore
+        self.max_capture_size: int = None
+        self.batch_size_to_captured_size: dict[int, int] = None
         # CINN Config ...
+        if args is not None:
+            for key, value in args.items():
+                if hasattr(self, key):
+                    setattr(self, key, value)
+
+        self.check_legality_parameters()
 
     def init_with_cudagrpah_size(self, max_num_seqs: int = 0) -> None:
         """
@@ -401,6 +477,54 @@ class GraphOptimizationConfig:
             draft_capture_sizes.append(max_num_seqs)
         self.cudagraph_capture_sizes = sorted(draft_capture_sizes)
+
+    def to_json_string(self):
+        """
+        Convert speculative_config to json string.
+        """
+        return json.dumps({key: value for key, value in self.__dict__.items()})
+
+    def __str__(self) -> str:
+        return self.to_json_string()
+
+    def check_legality_parameters(
+        self,
+    ) -> None:
+        """Check the legality of parameters passed in from the command line"""
+
+        if self.graph_opt_level is not None:
+            assert self.graph_opt_level in [
+                0,
+                1,
+                2,
+            ], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2."
+        if self.use_cudagraph is not None:
+            assert (
+                type(self.use_cudagraph) is bool
+            ), "In graph optimization config, type of use_cudagraph must is bool."
+        if self.cudagraph_capture_sizes is not None:
+            assert (
+                type(self.cudagraph_capture_sizes) is list
+            ), "In graph optimization config, type of cudagraph_capture_sizes must is list."
+            assert (
+                len(self.cudagraph_capture_sizes) > 0
+            ), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."
+
+    def update_use_cudagraph(self, argument: bool):
+        """
+        Unified user specifies the use_cudagraph parameter through two methods,
+        '--use-cudagraph' and '--graph-optimization-config'
+        """
+        if self.use_cudagraph is None:
+            # User only set '--use-cudagraph'
+            self.use_cudagraph = argument
+        else:
+            # User both set '--use-cudagraph' and '--graph-optimization-config'
+            if self.use_cudagraph is False and argument is True:
+                raise ValueError(
+                    "Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously."
+                )
+            argument = self.use_cudagraph
 
 
 class LoadConfig:
     """
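update_use_cudagraph() reconciles the standalone --use-cudagraph flag with the value carried inside --graph-optimization-config. A short usage sketch, assuming the dict-based constructor above and the fastdeploy.config import path introduced later in this diff:

    from fastdeploy.config import GraphOptimizationConfig

    # Only --use-cudagraph was given: the argument takes effect.
    cfg = GraphOptimizationConfig({"use_cudagraph": None})
    cfg.update_use_cudagraph(True)
    assert cfg.use_cudagraph is True

    # Conflicting settings are rejected.
    cfg = GraphOptimizationConfig({"use_cudagraph": False})
    try:
        cfg.update_use_cudagraph(True)
    except ValueError as err:
        print(err)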
@@ -19,15 +19,13 @@ from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
 
-from fastdeploy.config import CacheConfig
-from fastdeploy.engine.config import (
-    Config,
+from fastdeploy.config import (
+    CacheConfig,
     GraphOptimizationConfig,
-    ModelConfig,
-    ParallelConfig,
     SpeculativeConfig,
     TaskOption,
 )
+from fastdeploy.engine.config import Config, ModelConfig, ParallelConfig
 from fastdeploy.scheduler.config import SchedulerConfig
 from fastdeploy.utils import FlexibleArgumentParser
 
@@ -772,10 +770,12 @@ class EngineArgs:
 
     def create_speculative_config(self) -> SpeculativeConfig:
         """ """
+        speculative_args = asdict(self)
         if self.speculative_config is not None:
-            return SpeculativeConfig(**self.speculative_config)
-        else:
-            return SpeculativeConfig()
+            for k, v in self.speculative_config.items():
+                speculative_args[k] = v
+
+        return SpeculativeConfig(speculative_args)
 
     def create_scheduler_config(self) -> SchedulerConfig:
         """
@@ -816,10 +816,11 @@ class EngineArgs:
         """
         Create and retuan a GraphOptimizationConfig object based on the current settings.
         """
+        graph_optimization_args = asdict(self)
         if self.graph_optimization_config is not None:
-            return GraphOptimizationConfig(**self.graph_optimization_config)
-        else:
-            return GraphOptimizationConfig()
+            for k, v in self.graph_optimization_config.items():
+                graph_optimization_args[k] = v
+        return GraphOptimizationConfig(graph_optimization_args)
 
     def create_engine_config(self) -> Config:
         """
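Both factory methods above now build their config the same way: dump every EngineArgs field with asdict(), let the user-supplied --speculative-config or --graph-optimization-config dict override matching keys, and hand the merged dict to the args-based constructor. A self-contained sketch of that merge (the dataclass here is a toy, not the real EngineArgs):

    from dataclasses import asdict, dataclass
    from typing import Optional

    @dataclass
    class ToyEngineArgs:
        model: str = "./base-model"
        speculative_config: Optional[dict] = None

    args = ToyEngineArgs(speculative_config={"method": "mtp", "num_speculative_tokens": 3})

    merged = asdict(args)                      # every server-side field
    if args.speculative_config is not None:
        for k, v in args.speculative_config.items():
            merged[k] = v                      # user-supplied overrides win
    # in the real code path, merged would then feed SpeculativeConfig(merged)
    print(merged["method"], merged["num_speculative_tokens"])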
@@ -17,7 +17,7 @@ import json
 import os
 from dataclasses import dataclass
 from datetime import datetime
-from typing import Any, Dict, List, Literal, Optional
+from typing import Any, Dict, List, Optional
 
 from fastdeploy import envs
 from fastdeploy.config import CacheConfig
@@ -31,8 +31,6 @@ from fastdeploy.utils import (
     llm_logger,
 )
 
-TaskOption = Literal["generate"]
-
 
 class ModelConfig:
     """
@@ -158,188 +156,6 @@ class ModelConfig:
         llm_logger.info("=============================================================")
 
 
-class SpeculativeConfig:
-    """
-    Speculative Decoding Configuration class.
-
-    Attributes:
-        method (Optional[str]): Method used for speculative decoding.
-        num_speculative_tokens (int): Maximum draft tokens, default is 1.
-        model_name_or_path (Optional[str]): Path of the model.
-        quantization (str): Quantization method for draft model, default is WINT8.
-        max_model_len: Optional[int]: Maximum model length for draft model.
-        benchmark_mode (bool): Whether to use benchmark mode.
-    """
-
-    def __init__(
-        self,
-        method: Optional[str] = None,
-        num_speculative_tokens: Optional[int] = 1,
-        model: Optional[str] = None,
-        quantization: Optional[str] = "WINT8",
-        max_model_len: Optional[int] = None,
-        benchmark_mode: bool = False,
-        **kwargs,
-    ):
-        self.model_name_or_path = model
-        self.method = method
-        self.num_speculative_tokens = num_speculative_tokens
-        self.quantization = quantization
-        self.max_model_len = max_model_len
-        self.benchmark_mode = benchmark_mode
-        # Fixed now
-        self.num_gpu_block_expand_ratio = 1
-        self.num_extra_cache_layer = 0
-
-        for key, value in kwargs.items():
-            try:
-                setattr(self, key, value)
-            except Exception:
-                continue
-
-        self.read_model_config()
-        self.reset()
-
-    def read_model_config(self):
-        """
-        Read configuration from file.
-        """
-        self.model_config = {}
-        if not self.enabled_speculative_decoding():
-            return
-
-        self.is_unified_ckpt = check_unified_ckpt(self.model_name_or_path)
-        if self.model_name_or_path is None:
-            return
-
-        self.config_path = os.path.join(self.model_name_or_path, "config.json")
-        if os.path.exists(self.config_path):
-            self.model_config = json.load(open(self.config_path, "r", encoding="utf-8"))
-
-    def reset(self):
-        """
-        Reset configuration.
-        """
-
-        def reset_value(cls, value_name, key=None, default=None):
-            if key is not None and key in cls.model_config:
-                setattr(cls, value_name, cls.model_config[key])
-            elif getattr(cls, value_name, None) is None:
-                setattr(cls, value_name, default)
-
-        if not self.enabled_speculative_decoding():
-            return
-
-        # NOTE(liuzichang): We will support multi-layer in future
-        if self.method in ["mtp"]:
-            self.num_extra_cache_layer = 1
-
-    def enabled_speculative_decoding(self):
-        """
-        Check if speculative decoding is enabled.
-        """
-        if self.method is None:
-            return False
-        return True
-
-    def to_json_string(self):
-        """
-        Convert speculative_config to json string.
-        """
-        return json.dumps({key: value for key, value in self.__dict__.items() if value is not None})
-
-    def print(self):
-        """
-        print all config
-
-        """
-        llm_logger.info("Speculative Decoding Configuration Information :")
-        for k, v in self.__dict__.items():
-            llm_logger.info("{:<20}:{:<6}{}".format(k, "", v))
-        llm_logger.info("=============================================================")
-
-    def __str__(self) -> str:
-        return self.to_json_string()
-
-
-class GraphOptimizationConfig:
-    def __init__(
-        self,
-        graph_opt_level: Optional[int] = 0,
-        use_cudagraph: Optional[bool] = None,
-        cudagraph_capture_sizes: Optional[List[int]] = None,
-        sot_warmup_sizes: Optional[List[int]] = None,
-        **kwargs,
-    ):
-        """
-        Graph Optimization Configuration class.
-
-        Attributes:
-            graph_opt_level: Compute graph optimization level
-            use_cudagraph: Use CUDA Graph or not
-            cudagraph_capture_sizes: Batch size list will be captured by CUDA Graph
-        """
-        self.check_legality_parameters(graph_opt_level, use_cudagraph, cudagraph_capture_sizes, **kwargs)
-
-        self.graph_opt_level = graph_opt_level
-        self.use_cudagraph = use_cudagraph
-        self.cudagraph_capture_sizes = cudagraph_capture_sizes
-        self.sot_warmup_sizes = [] if sot_warmup_sizes is None else sot_warmup_sizes
-
-    def to_json_string(self):
-        """
-        Convert speculative_config to json string.
-        """
-        return json.dumps({key: value for key, value in self.__dict__.items()})
-
-    def __str__(self) -> str:
-        return self.to_json_string()
-
-    def check_legality_parameters(
-        self,
-        graph_opt_level: Optional[int] = None,
-        use_cudagraph: Optional[bool] = None,
-        cudagraph_capture_sizes: Optional[List[int]] = None,
-        **kwargs,
-    ) -> None:
-        """Check the legality of parameters passed in from the command line"""
-
-        if graph_opt_level is not None:
-            assert graph_opt_level in [
-                0,
-                1,
-                2,
-            ], "In graph optimization config, graph_opt_level can only take the values of 0, 1 and 2."
-        if use_cudagraph is not None:
-            assert type(use_cudagraph) is bool, "In graph optimization config, type of use_cudagraph must is bool."
-        if cudagraph_capture_sizes is not None:
-            assert (
-                type(cudagraph_capture_sizes) is list
-            ), "In graph optimization config, type of cudagraph_capture_sizes must is list."
-            assert (
-                len(cudagraph_capture_sizes) > 0
-            ), "In graph optimization config, When opening the CUDA graph, it is forbidden to set the capture sizes to an empty list."
-
-        for key, value in kwargs.items():
-            raise ValueError(f"Invalid --graph-optimization-config parameter {key}")
-
-    def update_use_cudagraph(self, argument: bool):
-        """
-        Unified user specifies the use_cudagraph parameter through two methods,
-        '--use-cudagraph' and '--graph-optimization-config'
-        """
-        if self.use_cudagraph is None:
-            # User only set '--use-cudagraph'
-            self.use_cudagraph = argument
-        else:
-            # User both set '--use-cudagraph' and '--graph-optimization-config'
-            if self.use_cudagraph is False and argument is True:
-                raise ValueError(
-                    "Invalid parameter: Cannot set --use-cudagraph and --graph-optimization-config '{\"use_cudagraph\":false}' simultaneously."
-                )
-            argument = self.use_cudagraph
-
-
 class ParallelConfig:
     """
     Configuration for parallelism.
@@ -19,7 +19,7 @@ from typing import Dict, Optional
 import paddle
 
 from fastdeploy import envs
-from fastdeploy.engine.config import SpeculativeConfig
+from fastdeploy.config import SpeculativeConfig
 from fastdeploy.platforms import current_platform
 
 if current_platform.is_iluvatar():
@@ -630,14 +630,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
 
     load_config = LoadConfig(vars(args))
 
-    graph_opt_config = GraphOptimizationConfig()
-    if args.graph_optimization_config is not None:
-        graph_opt_config = GraphOptimizationConfig(
-            use_cudagraph=args.graph_optimization_config["use_cudagraph"],
-            graph_opt_level=args.graph_optimization_config["graph_opt_level"],
-            cudagraph_capture_sizes=args.graph_optimization_config["cudagraph_capture_sizes"],
-            sot_warmup_sizes=args.graph_optimization_config["sot_warmup_sizes"],
-        )
+    graph_opt_config = GraphOptimizationConfig(args.graph_optimization_config)
 
     # Note(tangbinhan): used for load_checkpoint
     model_config.pretrained_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank
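Because the new constructor tolerates None and filters unknown keys itself, the call site no longer needs the None-check or the field-by-field unpacking. Illustrative only, assuming the GraphOptimizationConfig defined earlier in this diff:

    from fastdeploy.config import GraphOptimizationConfig

    graph_opt_config = GraphOptimizationConfig(None)                      # all defaults
    graph_opt_config = GraphOptimizationConfig({"graph_opt_level": 1,
                                                "use_cudagraph": True})   # CLI overrides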