Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 08:37:06 +08:00)
polish code with new pre-commit rule (#2923)
@@ -13,6 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""

import argparse
import json
import time
@@ -21,17 +22,23 @@ from typing import List
import numpy as np
import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
from paddle.distributed import fleet

from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig,
GraphOptimizationConfig, LoadConfig,
ModelConfig, ParallelConfig, SpeculativeConfig,
ErnieArchitectures)
from fastdeploy.config import (
    DecodingConfig,
    DeviceConfig,
    ErnieArchitectures,
    FDConfig,
    GraphOptimizationConfig,
    LoadConfig,
    ModelConfig,
    ParallelConfig,
    SpeculativeConfig,
)
from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
from fastdeploy.inter_communicator import IPCSignal
from fastdeploy.model_executor.layers.quantization import \
    get_quantization_config
from fastdeploy.model_executor.layers.quantization import get_quantization_config
from fastdeploy.platforms import current_platform
from fastdeploy.utils import get_logger, none_or_str
from fastdeploy.worker.worker_base import WorkerBase
@@ -47,24 +54,28 @@ def get_worker(fd_config: FDConfig, local_rank: int, rank: int) -> WorkerBase:
raise NotImplementedError("Only CUDA platform supports logprob.")
if current_platform.is_dcu():
from fastdeploy.worker.dcu_worker import DcuWorker

return DcuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank)
if current_platform.is_cuda():
from fastdeploy.worker.gpu_worker import GpuWorker

return GpuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank)
if current_platform.is_xpu():
from fastdeploy.worker.xpu_worker import XpuWorker

return XpuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank)
if current_platform.is_iluvatar():
from fastdeploy.worker.iluvatar_worker import IluvatarWorker
return IluvatarWorker(fd_config=fd_config,
                      local_rank=local_rank,
                      rank=rank)

return IluvatarWorker(fd_config=fd_config, local_rank=local_rank, rank=rank)
if current_platform.is_gcu():
from fastdeploy.worker.gcu_worker import GcuWorker

return GcuWorker(fd_config=fd_config, local_rank=local_rank, rank=rank)

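The get_worker dispatch above picks a backend-specific Worker class from current_platform checks. The same selection can be expressed as a lookup table; the registry dict and helper below are illustrative assumptions, only the module paths and the (fd_config, local_rank, rank) keywords come from the hunk itself.

import importlib

# Sketch only: a table-driven equivalent of the if-chain in get_worker().
# The registry and helper are hypothetical; module paths mirror the diff above.
_WORKER_REGISTRY = {
    "cuda": ("fastdeploy.worker.gpu_worker", "GpuWorker"),
    "xpu": ("fastdeploy.worker.xpu_worker", "XpuWorker"),
    "dcu": ("fastdeploy.worker.dcu_worker", "DcuWorker"),
    "iluvatar": ("fastdeploy.worker.iluvatar_worker", "IluvatarWorker"),
    "gcu": ("fastdeploy.worker.gcu_worker", "GcuWorker"),
}

def get_worker_sketch(fd_config, local_rank: int, rank: int, platform: str):
    module_path, class_name = _WORKER_REGISTRY[platform]
    worker_cls = getattr(importlib.import_module(module_path), class_name)
    return worker_cls(fd_config=fd_config, local_rank=local_rank, rank=rank)
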
def init_distributed_environment(seed: int = 20) -> List[int]:
""" Initialize Paddle Fleet and get rank of worker """
"""Initialize Paddle Fleet and get rank of worker"""
# Global rank
ranks = dist.get_world_size()
dist_strategy = fleet.DistributedStrategy()
@@ -85,6 +96,7 @@ def init_distributed_environment(seed: int = 20) -> List[int]:

return ranks, local_rank

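Only fragments of init_distributed_environment appear in this hunk (the world-size query and the DistributedStrategy construction). For orientation, a sketch of a conventional Paddle Fleet collective init that would yield the (ranks, local_rank) return value; the fleet.init call, worker_index lookup, and seed handling are assumptions about the elided body, not code from this commit.

import paddle
import paddle.distributed as dist
from paddle.distributed import fleet

# Sketch only: typical Fleet collective init; the elided parts are assumed.
def init_distributed_environment_sketch(seed: int = 20):
    ranks = dist.get_world_size()
    dist_strategy = fleet.DistributedStrategy()
    fleet.init(is_collective=True, strategy=dist_strategy)  # assumed
    local_rank = fleet.worker_index()                        # assumed
    paddle.seed(seed)                                        # assumed use of `seed`
    return ranks, local_rank
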
def update_fd_config_for_mm(fd_config: FDConfig) -> None:
if fd_config.model_config.enable_mm:
tokenizer = ErnieBotTokenizer.from_pretrained(
@@ -103,13 +115,12 @@ def update_fd_config_for_mm(fd_config: FDConfig) -> None:
vision_config.dtype = fd_config.model_config.dtype
# vision_config.tensor_parallel_degree = fd_config.parallel_config.tensor_parallel_size
# vision_config.tensor_parallel_rank = fd_config.parallel_config.tensor_parallel_rank
fd_config.model_config.im_patch_id = tokenizer.get_vocab()[
    "<|IMAGE_PLACEHOLDER|>"
]
fd_config.model_config.im_patch_id = tokenizer.get_vocab()["<|IMAGE_PLACEHOLDER|>"]
fd_config.model_config.think_end_id = tokenizer.get_vocab()["</think>"]
fd_config.model_config.sequence_parallel = fd_config.parallel_config.sequence_parallel

class PaddleDisWorkerProc():

class PaddleDisWorkerProc:
"""
Paddle Distrubuted wrapper for fastdeploy.worker.Worker,
for handling single-node multi-GPU tensor parallel.
@@ -117,12 +128,7 @@ class PaddleDisWorkerProc():
in the task queue. Control flow is transmitted by IPC.
"""

def __init__(
    self,
    fd_config: FDConfig,
    ranks: int = 1,
    local_rank: int = 0
) -> None:
def __init__(self, fd_config: FDConfig, ranks: int = 1, local_rank: int = 0) -> None:
"""
Initialize a distributed worker and task queue for single-node multi-GPU setup.
Args:
@@ -136,20 +142,21 @@ class PaddleDisWorkerProc():
self.parallel_config = fd_config.parallel_config

# TODO(gongshaotian): Use worker factory to get worker
self.worker = get_worker(fd_config=fd_config,
                         local_rank=self.local_rank,
                         rank=self.ranks)
self.worker = get_worker(fd_config=fd_config, local_rank=self.local_rank, rank=self.ranks)

# Initialize task queue
task_address = (self.parallel_config.pod_ip,
                self.parallel_config.engine_worker_queue_port)
task_address = (
    self.parallel_config.pod_ip,
    self.parallel_config.engine_worker_queue_port,
)

self.task_queue = TaskQueue(
    address=task_address,
    is_server=False,
    num_client=self.parallel_config.tensor_parallel_size,
    client_id=self.parallel_config.tensor_parallel_rank,
    local_data_parallel_id=self.parallel_config.expert_parallel_rank)
    local_data_parallel_id=self.parallel_config.expert_parallel_rank,
)

def init_health_status(self) -> None:
"""
@@ -164,17 +171,18 @@ class PaddleDisWorkerProc():
# init worker_ready_signal
self.max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
array_size = min(
    self.max_chips_per_node, self.parallel_config.tensor_parallel_size *
    self.parallel_config.expert_parallel_size)
    self.max_chips_per_node,
    self.parallel_config.tensor_parallel_size * self.parallel_config.expert_parallel_size,
)
workers_ready = np.zeros(shape=[array_size], dtype=np.int32)
self.worker_ready_signal = IPCSignal(
    name="worker_ready_signal",
    array=workers_ready,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
self.worker_ready_signal.value[self.local_rank %
                               self.max_chips_per_node] = 1
    create=False,
)
self.worker_ready_signal.value[self.local_rank % self.max_chips_per_node] = 1

# init worker_healthy_live_signal
workers_alive = np.zeros(shape=[array_size], dtype=np.int32)
@@ -183,9 +191,9 @@ class PaddleDisWorkerProc():
    array=workers_alive,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
self.worker_healthy_live_signal.value[self.local_rank % 8] = int(
    time.time())
    create=False,
)
self.worker_healthy_live_signal.value[self.local_rank % 8] = int(time.time())

# init model_weights_status
workers_model_weights = np.zeros(shape=[1], dtype=np.int32)
@@ -194,28 +202,28 @@ class PaddleDisWorkerProc():
    array=workers_model_weights,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
    create=False,
)

# init exist_task_signal
workers_exist_task = np.zeros(
    [self.parallel_config.expert_parallel_size], dtype=np.int32)
workers_exist_task = np.zeros([self.parallel_config.expert_parallel_size], dtype=np.int32)
self.exist_task_signal = IPCSignal(
    name="exist_task_signal",
    array=workers_exist_task,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
    create=False,
)

# init exist_swapped_task_signal
workers_swapped_task = np.zeros(
    shape=[self.parallel_config.expert_parallel_size],
    dtype=np.int32)
workers_swapped_task = np.zeros(shape=[self.parallel_config.expert_parallel_size], dtype=np.int32)
self.exist_swapped_task_signal = IPCSignal(
    name="exist_swapped_task_signal",
    array=workers_swapped_task,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
    create=False,
)

# init exist_prefill_task_signal
exist_prefill_task_signal_data = np.zeros([1], dtype=np.int32)
@@ -224,26 +232,27 @@ class PaddleDisWorkerProc():
    array=exist_prefill_task_signal_data,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
    create=False,
)

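Every signal initialised above follows the same shape: attach (create=False) to an int32 array that the engine process created, keyed by a name plus the engine_pid suffix, then write this worker's slot. A hypothetical wrapper for that repetition, using only the IPCSignal keywords shown in the hunks above; FastDeploy itself spells each signal out explicitly.

import numpy as np

from fastdeploy.inter_communicator import IPCSignal

# Sketch only: hypothetical helper around the repeated IPCSignal pattern above.
def attach_signal(name: str, size: int, engine_pid) -> IPCSignal:
    return IPCSignal(
        name=name,
        array=np.zeros(shape=[size], dtype=np.int32),
        dtype=np.int32,
        suffix=engine_pid,
        create=False,
    )

# e.g. worker_ready = attach_signal("worker_ready_signal", array_size, engine_pid)
#      worker_ready.value[local_rank % max_chips_per_node] = 1
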
def event_loop_ep(self) -> None:
"""
Tmp loop function for ep utill DP is supported
"""
while True:
self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(
    time.time())
self.worker_healthy_live_signal.value[self.local_rank % self.max_chips_per_node] = int(time.time())

if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks(
) > 0:
if self.fd_config.parallel_config.tensor_parallel_rank == 0 and self.task_queue.num_tasks() > 0:
tasks, read_finish = self.task_queue.get_tasks()

req_dicts = []
for req_dict, bsz in tasks:
num_running_requests = int(bsz)
req_dicts.extend(req_dict)
logger.info(f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, " \
            f"num_insert_requests: {len(req_dicts)}")
logger.info(
    f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, "
    f"num_insert_requests: {len(req_dicts)}"
)
# Process prefill inputs
self.worker.preprocess_new_task(req_dicts)

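The loop above stamps int(time.time()) into worker_healthy_live_signal on every iteration, so a supervising process can treat a stale timestamp as a dead or stalled worker. A small sketch of such a liveness check; the timeout constant and helper are illustrative assumptions, not part of FastDeploy.

import time

# Sketch only: consuming a heartbeat written as int(time.time()).
# The 30 s threshold is an assumed example, not a FastDeploy constant.
HEALTHY_TIMEOUT_S = 30

def worker_is_stalled(last_heartbeat: int, now=None) -> bool:
    now = time.time() if now is None else now
    return (now - last_heartbeat) > HEALTHY_TIMEOUT_S

print(worker_is_stalled(int(time.time()) - 60))  # True: last heartbeat 60 s ago
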
@@ -252,7 +261,7 @@ class PaddleDisWorkerProc():
self.worker.execute_model()

def event_loop_normal(self) -> None:
""" Main event loop for Paddle Distrubuted Workers.
"""Main event loop for Paddle Distrubuted Workers.
TODO(gongshaotian): support remote calling of functions that control worker.
"""
# Currently, only support single node
@@ -271,18 +280,15 @@ class PaddleDisWorkerProc():
paddle.distributed.barrier()

self.insert_step = False
self.worker_healthy_live_signal.value[self.local_rank] = int(
    time.time())
self.worker_healthy_live_signal.value[self.local_rank] = int(time.time())

# The first worker detects whether there are tasks in the task queue
if self.local_rank % mp_num_per_node == 0:
if self.local_rank % mp_num_per_node == 0:
if self.task_queue.num_tasks() > 0:
if self.nnode > 1:
self.task_queue.read_finish_flag.set(1)
else:
self.exist_task_signal.value[
    self.fd_config.parallel_config.
    expert_parallel_rank] = 1
self.exist_task_signal.value[self.fd_config.parallel_config.expert_parallel_rank] = 1

if self.parallel_config.tensor_parallel_size > 1:
# Synchronize the signal for other workers
@@ -291,23 +297,27 @@ class PaddleDisWorkerProc():

if self.fd_config.load_config.dynamic_load_weight:
if self.exist_task_signal.value[0] == 2:
from fastdeploy.rl.dynamic_weight_manager import \
    DynamicWeightManager
DynamicWeightManager.check_model_weights_status(
    self.model_weights_status, self.worker.model_runner,
    self.parallel_config.engine_pid)
from fastdeploy.rl.dynamic_weight_manager import (
    DynamicWeightManager,
)

if self.exist_task_signal.value[
    self.fd_config.parallel_config.expert_parallel_rank] == 1 or \
    self.task_queue.read_finish_flag.get() == 1:
DynamicWeightManager.check_model_weights_status(
    self.model_weights_status,
    self.worker.model_runner,
    self.parallel_config.engine_pid,
)

if (
    self.exist_task_signal.value[self.fd_config.parallel_config.expert_parallel_rank] == 1
    or self.task_queue.read_finish_flag.get() == 1
):
logger.info(f"Rank: {self.local_rank} Detected new requests.")
self.insert_step = True

tasks, read_finish = self.task_queue.get_tasks()
if read_finish:
# Ensure that every worker get the task
self.exist_task_signal.value[self.fd_config.parallel_config
                             .expert_parallel_rank] = 0
self.exist_task_signal.value[self.fd_config.parallel_config.expert_parallel_rank] = 0
self.task_queue.read_finish_flag.set(0)

req_dicts = []
@@ -316,8 +326,10 @@ class PaddleDisWorkerProc():
req_dicts.extend(req_dict)

req_ids = [req.request_id for req in req_dicts]
logger.info(f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, " \
            f"num_insert_requests: {len(req_dicts)}, req_ids: {req_ids}")
logger.info(
    f"Rank: {self.local_rank}, num_running_requests: {num_running_requests}, "
    f"num_insert_requests: {len(req_dicts)}, req_ids: {req_ids}"
)

# Process prefill inputs
self.worker.preprocess_new_task(req_dicts)
@@ -333,9 +345,7 @@ class PaddleDisWorkerProc():
# These generated tokens can be obtained through get_output op.
self.worker.execute_model(req_dicts)

self.exist_prefill_task_signal.value[
    0] = self.worker.prefill_finished()

self.exist_prefill_task_signal.value[0] = self.worker.prefill_finished()

def determine_num_available_blocks(self) -> None:
"""Profiles the peak memory usage of the model to determine how many
@@ -351,64 +361,51 @@ class PaddleDisWorkerProc():
"""
if self.fd_config.parallel_config.do_profile:
# 1. Get available memory(bytes)
available_kv_cache_memory = self.worker.determine_available_memory(
)
logger.info(
    f"------- available_kv_cache_memory:{available_kv_cache_memory / 1024**3} GB --------"
)
available_kv_cache_memory = self.worker.determine_available_memory()
logger.info(f"------- available_kv_cache_memory:{available_kv_cache_memory / 1024**3} GB --------")

# 2. Calculate the appropriate number of blocks
model_block_memory_used = self.worker.cal_theortical_kvcache()
num_blocks_local = int(available_kv_cache_memory //
                       model_block_memory_used)
num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)
# NOTE(liuzichang): Too many block will lead to illegal memory access
# We will develop dynamic limits in future.
if num_blocks_local > 40000:
logger.info(
    f"------- Reset num_blocks_local {num_blocks_local} to 40000"
)
logger.info(f"------- Reset num_blocks_local {num_blocks_local} to 40000")
num_blocks_local = min(40000, num_blocks_local)
logger.info(
    f"------- model_block_memory_used:{model_block_memory_used} --------"
)
logger.info(
    f"------- num_blocks_local:{num_blocks_local} --------")
logger.info(f"------- model_block_memory_used:{model_block_memory_used} --------")
logger.info(f"------- num_blocks_local:{num_blocks_local} --------")

logger.info(
    f"self.fd_config.parallel_config.do_profile:{self.fd_config.parallel_config.do_profile}"
)
logger.info(f"self.fd_config.parallel_config.do_profile:{self.fd_config.parallel_config.do_profile}")

# 3. Send IPCSignal
get_profile_block_num = np.zeros(shape=[self.ranks],
                                 dtype=np.int32)
get_profile_block_num = np.zeros(shape=[self.ranks], dtype=np.int32)
self.get_profile_block_num_signal = IPCSignal(
    name="get_profile_block_num",
    array=get_profile_block_num,
    dtype=np.int32,
    suffix=self.parallel_config.engine_pid,
    create=False)
self.get_profile_block_num_signal.value[
    self.local_rank] = num_blocks_local
    create=False,
)
self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_local

# Wait all worker send the signal
while np.any(self.get_profile_block_num_signal.value <= 0):
time.sleep(0.01)
num_blocks_global = self.get_profile_block_num_signal.value.min(
).item()
num_blocks_global = self.get_profile_block_num_signal.value.min().item()

if num_blocks_global < 0:
logger.error(
    "The total number of blocks cannot be less than zero."
    "Please increase gpu_memory_utilization"
    "Or decrease max_num_batched_tokens(max model length) ")
    "Or decrease max_num_batched_tokens(max model length) "
)
raise ValueError(
    "The total number of blocks cannot be less than zero."
    "Please increase gpu_memory_utilization"
    "Or decrease max_num_batched_tokens(max model length) ")
    "Or decrease max_num_batched_tokens(max model length) "
)


self.get_profile_block_num_signal.value[
    self.local_rank] = num_blocks_global
self.get_profile_block_num_signal.value[self.local_rank] = num_blocks_global
else:
num_blocks_global = self.fd_config.parallel_config.total_block_num
# NOTE(liuzichang): Too big num_blocks_global will lead to error 700
@@ -416,11 +413,11 @@ class PaddleDisWorkerProc():
self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)

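The profiling branch above sizes the KV cache in three steps: divide the measured free memory by the per-block cost, clamp the result to 40000 blocks, and take the minimum of all ranks' published counts so every rank can actually allocate the final number. A worked sketch with illustrative byte counts in place of profiler output (not measured values):

import numpy as np

# Sketch only: the block-count arithmetic from determine_num_available_blocks,
# with made-up numbers.
available_kv_cache_memory = 30 * 1024**3   # assume 30 GB left for KV cache
model_block_memory_used = 2 * 1024**2      # assume 2 MB of KV cache per block

num_blocks_local = int(available_kv_cache_memory // model_block_memory_used)  # 15360
num_blocks_local = min(40000, num_blocks_local)  # cap against illegal memory access

# Each rank publishes its local count; the shared value is the minimum.
per_rank_counts = np.array([num_blocks_local, 14000, 15360], dtype=np.int32)
num_blocks_global = per_rank_counts.min().item()  # 14000
print(num_blocks_local, num_blocks_global)
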
def init_device(self) -> None:
""" Initialize device and Construct model runner """
"""Initialize device and Construct model runner"""
self.worker.init_device()

def load_model(self) -> None:
""" Load weights and create model """
"""Load weights and create model"""
self.worker.load_model()

@@ -429,66 +426,44 @@ def parse_args():
Parse args from command line
"""
parser = argparse.ArgumentParser("FastDeploy LLM Inference")
parser.add_argument("-m",
                    "--model_name_or_path",
                    type=str,
                    default="./output",
                    help="model dir")
parser.add_argument("-mbs",
                    "--max_num_seqs",
                    type=int,
                    default=34,
                    help="max batch size")
parser.add_argument(
    "-m",
    "--model_name_or_path",
    type=str,
    default="./output",
    help="model dir",
)
parser.add_argument("-mbs", "--max_num_seqs", type=int, default=34, help="max batch size")
parser.add_argument("--total_block_num", type=int, default=2000)
parser.add_argument("--block_size", type=int, default=64)
parser.add_argument("--pod_ip", type=str, default="127.0.0.1")
parser.add_argument("--engine_worker_queue_port", type=int, default=9923)
parser.add_argument("--max_model_len",
                    type=int,
                    default=3072,
                    help="max model len")
parser.add_argument("--device_ids",
                    type=str,
                    default="0",
                    help="cuda visible devices")
parser.add_argument("--dtype",
                    type=str,
                    default="bfloat16",
                    help="input dtype")
parser.add_argument("--enc_dec_block_num",
                    type=int,
                    default=1,
                    help="encoder's decoder num")
parser.add_argument("--kv_cache_ratio",
                    type=float,
                    default=0.7,
                    help="kv cache ratio for input")
parser.add_argument("--first_token_id",
                    type=int,
                    default=1,
                    help="first token id")
parser.add_argument("--gpu_memory_utilization",
                    type=float,
                    default=0.9,
                    help="gpu memory utilization")
parser.add_argument("--engine_pid",
                    type=int,
                    default=None,
                    help="Process ID of engine")
parser.add_argument("--do_profile",
                    action='store_true',
                    help="do profile or not")
parser.add_argument("--pad_token_id",
                    type=int,
                    default=-1,
                    help="pad token id")
parser.add_argument("--eos_tokens_lens",
                    type=int,
                    default=2,
                    help="eos token lens")
parser.add_argument("--enable_chunked_prefill",
                    action='store_true',
                    help="enable chunked prefill")
parser.add_argument("--max_model_len", type=int, default=3072, help="max model len")
parser.add_argument("--device_ids", type=str, default="0", help="cuda visible devices")
parser.add_argument("--dtype", type=str, default="bfloat16", help="input dtype")
parser.add_argument("--enc_dec_block_num", type=int, default=1, help="encoder's decoder num")
parser.add_argument(
    "--kv_cache_ratio",
    type=float,
    default=0.7,
    help="kv cache ratio for input",
)
parser.add_argument("--first_token_id", type=int, default=1, help="first token id")
parser.add_argument(
    "--gpu_memory_utilization",
    type=float,
    default=0.9,
    help="gpu memory utilization",
)
parser.add_argument("--engine_pid", type=int, default=None, help="Process ID of engine")
parser.add_argument("--do_profile", action="store_true", help="do profile or not")
parser.add_argument("--pad_token_id", type=int, default=-1, help="pad token id")
parser.add_argument("--eos_tokens_lens", type=int, default=2, help="eos token lens")
parser.add_argument(
    "--enable_chunked_prefill",
    action="store_true",
    help="enable chunked prefill",
)
parser.add_argument(
    "--speculative_method",
    default=None,
@@ -519,71 +494,90 @@ def parse_args():
    default=False,
    type=bool,
)
parser.add_argument("--max_num_batched_tokens",
                    type=int,
                    default=2048,
                    help="max num batched tokens")
parser.add_argument(
    "--max_num_batched_tokens",
    type=int,
    default=2048,
    help="max num batched tokens",
)

parser.add_argument("--enable_prefix_caching",
                    action='store_true',
                    help="enable prefix cache")
parser.add_argument("--enable-custom-all-reduce",
                    action='store_true',
                    help="enable custom all-reduce")
parser.add_argument("--splitwise_role",
                    type=str,
                    default="mixed",
                    help="splitwise role")
parser.add_argument("--tensor_parallel_size",
                    type=int,
                    default=1,
                    help="tensor parallel size")
parser.add_argument("--expert_parallel_size",
                    type=int,
                    default=1,
                    help="expert parallel size")
parser.add_argument("--enable_expert_parallel",
                    action='store_true',
                    help="enable expert parallel")
parser.add_argument(
    "--enable_prefix_caching",
    action="store_true",
    help="enable prefix cache",
)
parser.add_argument(
    "--enable-custom-all-reduce",
    action="store_true",
    help="enable custom all-reduce",
)
parser.add_argument("--splitwise_role", type=str, default="mixed", help="splitwise role")
parser.add_argument(
    "--tensor_parallel_size",
    type=int,
    default=1,
    help="tensor parallel size",
)
parser.add_argument(
    "--expert_parallel_size",
    type=int,
    default=1,
    help="expert parallel size",
)
parser.add_argument(
    "--enable_expert_parallel",
    action="store_true",
    help="enable expert parallel",
)
parser.add_argument("--ori_vocab_size", type=int, default=None)

parser.add_argument("--quantization",
                    type=str,
                    default="None",
                    help="Quantization name for the model, currentlly support " \
                    "'wint4', 'wint8'," \
                    "default is None. The priority of this configuration "\
                    "is lower than that of the config file. " \
                    "More complex quantization methods need to be configured via the config file.")
parser.add_argument("--graph_optimization_config",
                    type=json.loads,
                    default=None,
                    help=" Configation of Graph optimization backend. "
parser.add_argument(
    "--quantization",
    type=str,
    default="None",
    help="Quantization name for the model, currentlly support "
    "'wint4', 'wint8',"
    "default is None. The priority of this configuration "
    "is lower than that of the config file. "
    "More complex quantization methods need to be configured via the config file.",
)
parser.add_argument(
    "--graph_optimization_config",
    type=json.loads,
    default=None,
    help=" Configation of Graph optimization backend. ",
)
parser.add_argument(
    "--guided_decoding_backend",
    type=str,
    default="off",
    help="guided decoding backend",
)
parser.add_argument(
    "--disable_any_whitespace",
    action="store_false",
    help="Disable any whitespace for guided decoding.",
)
parser.add_argument(
    "--dynamic_load_weight",
    action="store_true",
    help="Enable dynamic weight loading strategy",
)
parser.add_argument("--guided_decoding_backend",
                    type=str,
                    default="off",
                    help="guided decoding backend")
parser.add_argument("--disable_any_whitespace",
                    action='store_false',
                    help="Disable any whitespace for guided decoding.")
parser.add_argument("--dynamic_load_weight",
                    action='store_true',
                    help="Enable dynamic weight loading strategy")
parser.add_argument(
    "--load_strategy",
    type=str,
    choices=['ipc', 'ipc_snapshot'],
    choices=["ipc", "ipc_snapshot"],
    default="ipc_snapshot",
    help="Weight loading method when dynamic loading is enabled: "
    "'ipc': real-time IPC streaming with automatic resharding, "
    "'ipc_snapshot': load from disk snapshot of IPC weights.")
parser.add_argument("--enable_mm",
                    action='store_true',
                    help="Whether to enable vl model")
parser.add_argument("--enable_logprob",
                    action='store_true',
                    help="Enable output of token-level log probabilities.")
    "'ipc_snapshot': load from disk snapshot of IPC weights.",
)
parser.add_argument("--enable_mm", action="store_true", help="Whether to enable vl model")
parser.add_argument(
    "--enable_logprob",
    action="store_true",
    help="Enable output of token-level log probabilities.",
)

args = parser.parse_args()
return args
@@ -615,7 +609,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
else:
num_experts = model_config.moe_num_experts

num_experts_per_rank = num_experts // args.expert_parallel_size
num_experts_per_rank = num_experts // args.expert_parallel_size
num_experts_start_offset = expert_parallel_rank * num_experts_per_rank

parallel_config.expert_parallel_rank = expert_parallel_rank
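The expert-parallel split above is an even partition: each rank owns num_experts // expert_parallel_size experts and starts at rank * per_rank. A worked sketch with example sizes:

# Sketch only: the expert partition arithmetic above, with example numbers.
num_experts = 64
expert_parallel_size = 8
expert_parallel_rank = 3

num_experts_per_rank = num_experts // expert_parallel_size              # 8
num_experts_start_offset = expert_parallel_rank * num_experts_per_rank  # 24
print(num_experts_per_rank, num_experts_start_offset)
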
@@ -629,7 +623,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
graph_opt_config = GraphOptimizationConfig(
    use_cudagraph=args.graph_optimization_config["use_cudagraph"],
    graph_opt_level=args.graph_optimization_config["graph_opt_level"],
    cudagraph_capture_sizes=args.graph_optimization_config["cudagraph_capture_sizes"]
    cudagraph_capture_sizes=args.graph_optimization_config["cudagraph_capture_sizes"],
)

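GraphOptimizationConfig above is built from args.graph_optimization_config, which parse_args declares with type=json.loads, so the flag accepts a JSON object on the command line. A minimal sketch of that round trip; the payload values are illustrative, not recommended settings.

import argparse
import json

# Sketch only: argparse with type=json.loads, matching the
# --graph_optimization_config flag above.
parser = argparse.ArgumentParser()
parser.add_argument("--graph_optimization_config", type=json.loads, default=None)

args = parser.parse_args(
    [
        "--graph_optimization_config",
        '{"use_cudagraph": true, "graph_opt_level": 1, "cudagraph_capture_sizes": [1, 2, 4]}',
    ]
)
print(args.graph_optimization_config["use_cudagraph"])  # True
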
# Note(tangbinhan): used for load_checkpoint
@@ -639,14 +633,10 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
model_config.pretrained_config.head_dim = model_config.head_dim

logger.info(f"parallel_config.use_ep {parallel_config.use_ep}")
logger.info(
    f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}"
)
logger.info(
    f"parallel_config.tensor_parallel_rank {parallel_config.tensor_parallel_rank}"
)
logger.info(f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}")
logger.info(f"parallel_config.tensor_parallel_rank {parallel_config.tensor_parallel_rank}")

if getattr(model_config, 'num_hidden_layers', None) is None:
if getattr(model_config, "num_hidden_layers", None) is None:
raise ValueError("num_hidden_layers is None")

quantization_config = model_config.quantization_config
@@ -656,11 +646,8 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
model_config.is_quantized = True

quant_config_name = None
if quantization_config is not None and quantization_config.get(
    "quantization", None) is None:
raise ValueError(
    "quantization_config should have a key named 'quantization' for specify quant config."
)
if quantization_config is not None and quantization_config.get("quantization", None) is None:
raise ValueError("quantization_config should have a key named 'quantization' for specify quant config.")

if quantization_config is not None:
quant_config_name = quantization_config["quantization"]
@@ -688,32 +675,29 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
logger.info("===========quantization_config==============")
if quant_config is not None:
if model_config.is_quantized:
logger.info(
    "Model Status: Offline Quantized (pre-quantized weights loaded)"
)
logger.info("Model Status: Offline Quantized (pre-quantized weights loaded)")
else:
logger.info(
    "Model Status: Original (will apply online quantization)")
logger.info("Model Status: Original (will apply online quantization)")

logger.info(f"{quantization_config}")
else:
logger.info(
    "No quantization config found and use original weight and act dtype."
)
logger.info("No quantization config found and use original weight and act dtype.")

# Set VL tag
model_config.enable_mm = args.enable_mm
logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}")
logger.info(f"- Load strategy: {load_config.load_strategy}")

fd_config = FDConfig(model_config=model_config,
                     parallel_config=parallel_config,
                     speculative_config=speculative_config,
                     device_config=device_config,
                     load_config=load_config,
                     decoding_config=decoding_config,
                     quant_config=quant_config,
                     graph_opt_config=graph_opt_config)
fd_config = FDConfig(
    model_config=model_config,
    parallel_config=parallel_config,
    speculative_config=speculative_config,
    device_config=device_config,
    load_config=load_config,
    decoding_config=decoding_config,
    quant_config=quant_config,
    graph_opt_config=graph_opt_config,
)
update_fd_config_for_mm(fd_config)

return fd_config