Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-06 00:57:33 +08:00)
polish code with new pre-commit rule (#2923)
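The diff below is purely mechanical: the new pre-commit formatter rewrites argparse call sites and imports into one-argument-per-line form with double quotes and trailing commas, and folds hand-wrapped help= strings onto a single line. A minimal sketch of the style it enforces, assuming a hypothetical --example-flag option that is not part of this commit:

    import argparse

    parser = argparse.ArgumentParser()
    model_group = parser.add_argument_group("Model Configuration")

    # Before the hook, calls were hand-wrapped with a hanging indent:
    #     model_group.add_argument("--example-flag",
    #                              type=int,
    #                              default=0,
    #                              help="An illustrative option.")
    # After the hook, each argument sits on its own line with a trailing comma
    # and the closing parenthesis moves to its own line:
    model_group.add_argument(
        "--example-flag",
        type=int,
        default=0,
        help="An illustrative option.",
    )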
@@ -13,15 +13,21 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import json
from dataclasses import asdict, dataclass
from dataclasses import fields as dataclass_fields
from typing import Any, Dict, List, Optional

from fastdeploy.engine.config import (CacheConfig, Config,
GraphOptimizationConfig, ModelConfig,
ParallelConfig, SpeculativeConfig,
TaskOption)
from fastdeploy.engine.config import (
CacheConfig,
Config,
GraphOptimizationConfig,
ModelConfig,
ParallelConfig,
SpeculativeConfig,
TaskOption,
)
from fastdeploy.scheduler.config import SchedulerConfig
from fastdeploy.utils import FlexibleArgumentParser

@@ -323,365 +329,429 @@ class EngineArgs:
"""
# Model parameters group
model_group = parser.add_argument_group("Model Configuration")
model_group.add_argument("--model",
type=str,
default=EngineArgs.model,
help="Model name or path to be used.")
model_group.add_argument("--model-config-name",
type=nullable_str,
default=EngineArgs.model_config_name,
help="The model configuration file name.")
model_group.add_argument(
"--model",
type=str,
default=EngineArgs.model,
help="Model name or path to be used.",
)
model_group.add_argument(
"--model-config-name",
type=nullable_str,
default=EngineArgs.model_config_name,
help="The model configuration file name.",
)
model_group.add_argument(
"--tokenizer",
type=nullable_str,
default=EngineArgs.tokenizer,
help=
"Tokenizer name or path (defaults to model path if not specified)."
help="Tokenizer name or path (defaults to model path if not specified).",
)
model_group.add_argument(
"--max-model-len",
type=int,
default=EngineArgs.max_model_len,
help="Maximum context length supported by the model.")
help="Maximum context length supported by the model.",
)
model_group.add_argument(
"--block-size",
type=int,
default=EngineArgs.block_size,
help="Number of tokens processed in one block.")
model_group.add_argument("--task",
type=str,
default=EngineArgs.task,
help="Task to be executed by the model.")
help="Number of tokens processed in one block.",
)
model_group.add_argument(
"--task",
type=str,
default=EngineArgs.task,
help="Task to be executed by the model.",
)
model_group.add_argument(
"--use-warmup",
type=int,
default=EngineArgs.use_warmup,
help="Flag to indicate whether to use warm-up before inference.")
help="Flag to indicate whether to use warm-up before inference.",
)
model_group.add_argument(
"--limit-mm-per-prompt",
default=EngineArgs.limit_mm_per_prompt,
type=json.loads,
help="Limitation of numbers of multi-modal data.")
help="Limitation of numbers of multi-modal data.",
)
model_group.add_argument(
"--mm-processor-kwargs",
default=EngineArgs.mm_processor_kwargs,
type=json.loads,
help="Additional keyword arguments for the multi-modal processor.")
model_group.add_argument("--enable-mm",
action='store_true',
default=EngineArgs.enable_mm,
help="Flag to enable multi-modal model.")
model_group.add_argument("--reasoning-parser",
type=str,
default=EngineArgs.reasoning_parser,
help="Flag specifies the reasoning parser to use for extracting "\
"reasoning content from the model output")
help="Additional keyword arguments for the multi-modal processor.",
)
model_group.add_argument(
"--enable-mm",
action="store_true",
default=EngineArgs.enable_mm,
help="Flag to enable multi-modal model.",
)
model_group.add_argument(
"--reasoning-parser",
type=str,
default=EngineArgs.reasoning_parser,
help="Flag specifies the reasoning parser to use for extracting "
"reasoning content from the model output",
)
model_group.add_argument(
"--speculative-config",
type=json.loads,
default=EngineArgs.speculative_config,
help="Configuration for speculative execution.")
help="Configuration for speculative execution.",
)
model_group.add_argument(
"--dynamic-load-weight",
action='store_true',
action="store_true",
default=EngineArgs.dynamic_load_weight,
help="Flag to indicate whether to load weight dynamically.")
help="Flag to indicate whether to load weight dynamically.",
)
model_group.add_argument(
"--load-strategy",
type=str,
default=EngineArgs.load_strategy,
help="Flag to dynamic load strategy.")
model_group.add_argument("--engine-worker-queue-port",
type=int,
default=EngineArgs.engine_worker_queue_port,
help="port for engine worker queue")
model_group.add_argument("--quantization",
type=str,
default=EngineArgs.quantization,
help="Quantization name for the model, currentlly support " \
"'wint8', 'wint4'," \
"default is None. The priority of this configuration "\
"is lower than that of the config file. " \
"More complex quantization methods need to be configured via the config file.")
model_group.add_argument("--use-cudagraph",
action='store_true',
default=EngineArgs.use_cudagraph,
help="Flags to enable cuda graph.")
model_group.add_argument("--graph-optimization-config",
type=json.loads,
default=EngineArgs.graph_optimization_config,
help="")
model_group.add_argument("--guided-decoding-backend",
type=str,
default=EngineArgs.guided_decoding_backend,
help="Guided Decoding Backend")
help="Flag to dynamic load strategy.",
)
model_group.add_argument(
"--engine-worker-queue-port",
type=int,
default=EngineArgs.engine_worker_queue_port,
help="port for engine worker queue",
)
model_group.add_argument(
"--quantization",
type=str,
default=EngineArgs.quantization,
help="Quantization name for the model, currentlly support "
"'wint8', 'wint4',"
"default is None. The priority of this configuration "
"is lower than that of the config file. "
"More complex quantization methods need to be configured via the config file.",
)
model_group.add_argument(
"--use-cudagraph",
action="store_true",
default=EngineArgs.use_cudagraph,
help="Flags to enable cuda graph.",
)
model_group.add_argument(
"--graph-optimization-config",
type=json.loads,
default=EngineArgs.graph_optimization_config,
help="",
)
model_group.add_argument(
"--guided-decoding-backend",
type=str,
default=EngineArgs.guided_decoding_backend,
help="Guided Decoding Backend",
)
model_group.add_argument(
"--guided-decoding-disable-any-whitespace",
type=str,
default=EngineArgs.guided_decoding_disable_any_whitespace,
help=
"Disabled any whitespaces when using guided decoding backend XGrammar."
help="Disabled any whitespaces when using guided decoding backend XGrammar.",
)
model_group.add_argument(
"--enable-logprob",
action="store_true",
default=EngineArgs.enable_logprob,
help="Enable output of token-level log probabilities.",
)
model_group.add_argument("--enable-logprob",
action="store_true",
default=EngineArgs.enable_logprob,
help="Enable output of token-level log probabilities."
)

# Parallel processing parameters group
parallel_group = parser.add_argument_group("Parallel Configuration")
parallel_group.add_argument("--tensor-parallel-size",
"-tp",
type=int,
default=EngineArgs.tensor_parallel_size,
help="Degree of tensor parallelism.")
parallel_group.add_argument("--enable-custom-all-reduce",
action='store_true',
default=EngineArgs.enable_custom_all_reduce,
help="Flag to enable custom all-reduce.")
parallel_group.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=EngineArgs.tensor_parallel_size,
help="Degree of tensor parallelism.",
)
parallel_group.add_argument(
"--enable-custom-all-reduce",
action="store_true",
default=EngineArgs.enable_custom_all_reduce,
help="Flag to enable custom all-reduce.",
)
parallel_group.add_argument(
"--max-num-seqs",
type=int,
default=EngineArgs.max_num_seqs,
help="Maximum number of sequences per iteration.")
help="Maximum number of sequences per iteration.",
)
parallel_group.add_argument(
"--num-gpu-blocks-override",
type=int,
default=EngineArgs.num_gpu_blocks_override,
help="Override for the number of GPU blocks.")
help="Override for the number of GPU blocks.",
)
parallel_group.add_argument(
"--max-num-batched-tokens",
type=int,
default=EngineArgs.max_num_batched_tokens,
help="Maximum number of tokens to batch together.")
help="Maximum number of tokens to batch together.",
)
parallel_group.add_argument(
"--gpu-memory-utilization",
type=float,
default=EngineArgs.gpu_memory_utilization,
help="Fraction of GPU memory to be utilized.")
help="Fraction of GPU memory to be utilized.",
)

parallel_group.add_argument("--data-parallel-size",
type=int,
default=EngineArgs.data_parallel_size,
help="Degree of data parallelism.")
parallel_group.add_argument("--enable-expert-parallel",
action='store_true',
default=EngineArgs.enable_expert_parallel,
help="Enable expert parallelism.")
parallel_group.add_argument(
"--data-parallel-size",
type=int,
default=EngineArgs.data_parallel_size,
help="Degree of data parallelism.",
)
parallel_group.add_argument(
"--enable-expert-parallel",
action="store_true",
default=EngineArgs.enable_expert_parallel,
help="Enable expert parallelism.",
)

# CacheConfig parameters group
cache_group = parser.add_argument_group("Cache Configuration")

cache_group.add_argument("--kv-cache-ratio",
type=float,
default=EngineArgs.kv_cache_ratio,
help="Ratio of tokens to process in a block.")
cache_group.add_argument(
"--kv-cache-ratio",
type=float,
default=EngineArgs.kv_cache_ratio,
help="Ratio of tokens to process in a block.",
)

cache_group.add_argument(
"--swap-space",
type=float,
default=EngineArgs.swap_space,
help="The amount of CPU memory to offload to.")
help="The amount of CPU memory to offload to.",
)

cache_group.add_argument("--cache-queue-port",
type=int,
default=EngineArgs.cache_queue_port,
help="port for cache queue")
cache_group.add_argument("--static-decode-blocks",
type=int,
default=EngineArgs.static_decode_blocks,
help="Static decoding blocks num.")
cache_group.add_argument(
"--cache-queue-port",
type=int,
default=EngineArgs.cache_queue_port,
help="port for cache queue",
)
cache_group.add_argument(
"--static-decode-blocks",
type=int,
default=EngineArgs.static_decode_blocks,
help="Static decoding blocks num.",
)

# Cluster system parameters group
system_group = parser.add_argument_group("System Configuration")
system_group.add_argument(
"--dist-init-ip",
default=EngineArgs.dist_init_ip,
help=
"IP addresses of master node.")
help="IP addresses of master node.",
)

system_group.add_argument(
"--nnodes",
type=int,
default=EngineArgs.nnodes,
help=
"The number of all nodes.")
help="The number of all nodes.",
)

system_group.add_argument(
"--node-rank",
type=int,
default=EngineArgs.node_rank,
help=
"node rank id (range [0, nnodes)).")


help="node rank id (range [0, nnodes)).",
)

# Performance tuning parameters group
perf_group = parser.add_argument_group("Performance Tuning")
perf_group.add_argument("--enable-prefix-caching",
action='store_true',
default=EngineArgs.enable_prefix_caching,
help="Flag to enable prefix caching.")
perf_group.add_argument(
"--enable-prefix-caching",
action="store_true",
default=EngineArgs.enable_prefix_caching,
help="Flag to enable prefix caching.",
)

perf_group.add_argument("--splitwise-role",
type=str,
default=EngineArgs.splitwise_role,
help="Role of splitwise. Default is \
'mixed'. (prefill, decode, mixed)")
perf_group.add_argument(
"--splitwise-role",
type=str,
default=EngineArgs.splitwise_role,
help="Role of splitwise. Default is \
'mixed'. (prefill, decode, mixed)",
)

perf_group.add_argument("--innode-prefill-ports",
type=lambda s: s.split(",") if s else None,
default=EngineArgs.innode_prefill_ports,
help="port for innode prefill")
perf_group.add_argument(
"--innode-prefill-ports",
type=lambda s: s.split(",") if s else None,
default=EngineArgs.innode_prefill_ports,
help="port for innode prefill",
)

perf_group.add_argument("--enable-chunked-prefill",
action='store_true',
default=EngineArgs.enable_chunked_prefill,
help="Flag to enable chunked prefill.")
perf_group.add_argument("--max-num-partial-prefills",
type=int,
default=EngineArgs.max_num_partial_prefills,
help="For chunked prefill, Maximum number \
of concurrent partial prefill requests.")
perf_group.add_argument(
"--enable-chunked-prefill",
action="store_true",
default=EngineArgs.enable_chunked_prefill,
help="Flag to enable chunked prefill.",
)
perf_group.add_argument(
"--max-num-partial-prefills",
type=int,
default=EngineArgs.max_num_partial_prefills,
help="For chunked prefill, Maximum number \
of concurrent partial prefill requests.",
)
perf_group.add_argument(
"--max-long-partial-prefills",
type=int,
default=EngineArgs.max_long_partial_prefills,
help=
("For chunked prefill, the maximum number of prompts longer than long-prefill-token-threshold"
"that will be prefilled concurrently."))
help=(
"For chunked prefill, the maximum number of prompts longer than long-prefill-token-threshold"
"that will be prefilled concurrently."
),
)
perf_group.add_argument(
"--long-prefill-token-threshold",
type=int,
default=EngineArgs.long_prefill_token_threshold,
help=("For chunked prefill, the threshold number of"
" tokens for a prompt to be considered long."))
help=("For chunked prefill, the threshold number of" " tokens for a prompt to be considered long."),
)

perf_group.add_argument(
"--cache-transfer-protocol",
type=str,
default=EngineArgs.cache_transfer_protocol,
help="support protocol list, comma separated, default is ipc")
help="support protocol list, comma separated, default is ipc",
)

perf_group.add_argument("--pd-comm-port",
type=lambda s: s.split(",") if s else None,
default=EngineArgs.pd_comm_port,
help="port for splitwise communication.")
perf_group.add_argument(
"--pd-comm-port",
type=lambda s: s.split(",") if s else None,
default=EngineArgs.pd_comm_port,
help="port for splitwise communication.",
)

perf_group.add_argument("--rdma-comm-ports",
type=lambda s: s.split(",") if s else None,
default=EngineArgs.rdma_comm_ports,
help="ports for rdma communication.")
perf_group.add_argument(
"--rdma-comm-ports",
type=lambda s: s.split(",") if s else None,
default=EngineArgs.rdma_comm_ports,
help="ports for rdma communication.",
)

# Scheduler parameters group
scheduler_group = parser.add_argument_group("Scheduler")
scheduler_group.add_argument(
"--scheduler-name",
default=EngineArgs.scheduler_name,
help=
f"Scheduler name to be used. Default is {EngineArgs.scheduler_name}. (local,global)"
help=f"Scheduler name to be used. Default is {EngineArgs.scheduler_name}. (local,global)",
)
scheduler_group.add_argument(
"--scheduler-max-size",
type=int,
default=EngineArgs.scheduler_max_size,
help=
f"Size of scheduler. Default is {EngineArgs.scheduler_max_size}. (Local)"
help=f"Size of scheduler. Default is {EngineArgs.scheduler_max_size}. (Local)",
)
scheduler_group.add_argument(
"--scheduler-ttl",
type=int,
default=EngineArgs.scheduler_ttl,
help=
f"TTL of request. Default is {EngineArgs.scheduler_ttl} seconds. (local,global)"
help=f"TTL of request. Default is {EngineArgs.scheduler_ttl} seconds. (local,global)",
)
scheduler_group.add_argument(
"--scheduler-host",
default=EngineArgs.scheduler_host,
help=
f"Host address of redis. Default is {EngineArgs.scheduler_host}. (global)"
help=f"Host address of redis. Default is {EngineArgs.scheduler_host}. (global)",
)
scheduler_group.add_argument(
"--scheduler-port",
type=int,
default=EngineArgs.scheduler_port,
help=
f"Port of redis. Default is {EngineArgs.scheduler_port}. (global)")
help=f"Port of redis. Default is {EngineArgs.scheduler_port}. (global)",
)
scheduler_group.add_argument(
"--scheduler-db",
type=int,
default=EngineArgs.scheduler_db,
help=f"DB of redis. Default is {EngineArgs.scheduler_db}. (global)"
help=f"DB of redis. Default is {EngineArgs.scheduler_db}. (global)",
)
scheduler_group.add_argument(
"--scheduler-password",
default=EngineArgs.scheduler_password,
help=
f"Password of redis. Default is {EngineArgs.scheduler_password}. (global)"
help=f"Password of redis. Default is {EngineArgs.scheduler_password}. (global)",
)
scheduler_group.add_argument(
"--scheduler-topic",
default=EngineArgs.scheduler_topic,
help=
f"Topic of scheduler. Defaule is {EngineArgs.scheduler_topic}. (global)"
help=f"Topic of scheduler. Defaule is {EngineArgs.scheduler_topic}. (global)",
)
scheduler_group.add_argument(
"--scheduler-min-load-score",
type=float,
default=EngineArgs.scheduler_min_load_score,
help=
f"Minimum load score for task assignment. Default is {EngineArgs.scheduler_min_load_score} (global)"
help=f"Minimum load score for task assignment. Default is {EngineArgs.scheduler_min_load_score} (global)",
)
scheduler_group.add_argument(
"--scheduler-load-shards-num",
type=int,
default=EngineArgs.scheduler_load_shards_num,
help=("Number of shards for load balancing table. Default is "
f"{EngineArgs.scheduler_load_shards_num} (global)"))
help=(
"Number of shards for load balancing table. Default is "
f"{EngineArgs.scheduler_load_shards_num} (global)"
),
)
scheduler_group.add_argument(
"--scheduler-sync-period",
type=int,
default=EngineArgs.scheduler_sync_period,
help=f"SplitWise Use, node load sync period, "
f"Default is {EngineArgs.scheduler_sync_period}ms. (global)")
f"Default is {EngineArgs.scheduler_sync_period}ms. (global)",
)
scheduler_group.add_argument(
"--scheduler-expire-period",
type=int,
default=EngineArgs.scheduler_expire_period,
help=f"SplitWise Use, node will not be scheduled after "
f"expire-period ms not sync load, Default is "
f"{EngineArgs.scheduler_expire_period}ms. (global)")
f"{EngineArgs.scheduler_expire_period}ms. (global)",
)
scheduler_group.add_argument(
"--scheduler-release-load-expire-period",
type=int,
default=EngineArgs.scheduler_release_load_expire_period,
help=f"SplitWise Use, scheduler will release req load after "
f"expire period(s). Default is "
f"{EngineArgs.scheduler_release_load_expire_period}. (global)")
f"{EngineArgs.scheduler_release_load_expire_period}. (global)",
)
scheduler_group.add_argument(
"--scheduler-reader-parallel",
type=int,
default=EngineArgs.scheduler_reader_parallel,
help=f"SplitWise Use, Results Reader Sync Parallel, "
f"Default is {EngineArgs.scheduler_reader_parallel}. (global)")
f"Default is {EngineArgs.scheduler_reader_parallel}. (global)",
)
scheduler_group.add_argument(
"--scheduler-writer-parallel",
type=int,
default=EngineArgs.scheduler_writer_parallel,
help=f"SplitWise Use, Results Writer Sync Parallel, "
f"Default is {EngineArgs.scheduler_writer_parallel}. (global)")
f"Default is {EngineArgs.scheduler_writer_parallel}. (global)",
)
scheduler_group.add_argument(
"--scheduler-reader-batch-size",
type=int,
default=EngineArgs.scheduler_reader_batch_size,
help=f"SplitWise Use, Results Reader Batch Size, "
f"Default is {EngineArgs.scheduler_reader_batch_size}. (global)")
f"Default is {EngineArgs.scheduler_reader_batch_size}. (global)",
)
scheduler_group.add_argument(
"--scheduler-writer-batch-size",
type=int,
default=EngineArgs.scheduler_writer_batch_size,
help=f"SplitWise Use, Results Writer Batch Size, "
f"Default is {EngineArgs.scheduler_writer_batch_size}. (global)")
f"Default is {EngineArgs.scheduler_writer_batch_size}. (global)",
)

return parser

@@ -690,21 +760,19 @@ class EngineArgs:
"""
Create an instance of EngineArgs from command line arguments.
"""
return cls(
**{
field.name: getattr(args, field.name)
for field in dataclass_fields(cls)
})
return cls(**{field.name: getattr(args, field.name) for field in dataclass_fields(cls)})

def create_model_config(self) -> ModelConfig:
"""
Create and return a ModelConfig object based on the current settings.
"""
return ModelConfig(model_name_or_path=self.model,
config_json_file=self.model_config_name,
quantization=self.quantization,
dynamic_load_weight=self.dynamic_load_weight,
load_strategy=self.load_strategy)
return ModelConfig(
model_name_or_path=self.model,
config_json_file=self.model_config_name,
quantization=self.quantization,
dynamic_load_weight=self.dynamic_load_weight,
load_strategy=self.load_strategy,
)

def create_cache_config(self, model_cfg) -> CacheConfig:
"""
@@ -728,8 +796,7 @@ class EngineArgs:
)

def create_speculative_config(self) -> SpeculativeConfig:
"""
"""
""" """
if self.speculative_config is not None:
return SpeculativeConfig(**self.speculative_config)
else:
@@ -742,9 +809,11 @@ class EngineArgs:
prefix = "scheduler_"
prefix_len = len(prefix)
extra_params = [
"max_model_len", "enable_chunked_prefill",
"max_num_partial_prefills", "max_long_partial_prefills",
"long_prefill_token_threshold"
"max_model_len",
"enable_chunked_prefill",
"max_num_partial_prefills",
"max_long_partial_prefills",
"long_prefill_token_threshold",
]

all = asdict(self)
@@ -765,7 +834,7 @@ class EngineArgs:
tensor_parallel_size=self.tensor_parallel_size,
enable_expert_parallel=self.enable_expert_parallel,
data_parallel_size=self.data_parallel_size,
enable_custom_all_reduce=self.enable_custom_all_reduce
enable_custom_all_reduce=self.enable_custom_all_reduce,
)

def create_graph_optimization_config(self) -> GraphOptimizationConfig:
@@ -782,8 +851,7 @@ class EngineArgs:
Create and return a Config object based on the current settings.
"""
model_cfg = self.create_model_config()
if not model_cfg.is_unified_ckpt and hasattr(model_cfg,
'tensor_parallel_size'):
if not model_cfg.is_unified_ckpt and hasattr(model_cfg, "tensor_parallel_size"):
self.tensor_parallel_size = model_cfg.tensor_parallel_size
if self.max_num_batched_tokens is None:
if self.enable_chunked_prefill:
@@ -795,11 +863,11 @@ class EngineArgs:
graph_opt_cfg = self.create_graph_optimization_config()
graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)

assert not (self.use_cudagraph and self.enable_prefix_caching), \
"Prefix caching cannot be used with CUDA graph"
assert not (self.use_cudagraph and self.enable_prefix_caching), "Prefix caching cannot be used with CUDA graph"

assert not (self.tensor_parallel_size<=1 and self.enable_custom_all_reduce), \
"enable_custom_all_reduce must be used with tensor_parallel_size>1"
assert not (
self.tensor_parallel_size <= 1 and self.enable_custom_all_reduce
), "enable_custom_all_reduce must be used with tensor_parallel_size>1"

return Config(
model_name_or_path=self.model,
@@ -831,5 +899,5 @@ class EngineArgs:
guided_decoding_backend=self.guided_decoding_backend,
disable_any_whitespace=self.guided_decoding_disable_any_whitespace,
enable_custom_all_reduce=self.enable_custom_all_reduce,
enable_logprob = self.enable_logprob,
enable_logprob=self.enable_logprob,
)