diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 92fa483ba..f8451786c 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -58,6 +58,7 @@ PRETRAINED_INIT_CONFIGURATION = {
     "freq_allocation":20,
     "tie_word_embeddings":False,
     "rms_norm_eps":1e-5,
+    "moe_num_experts": None,
 }
 
 
@@ -143,7 +144,7 @@ class ParallelConfig:
         self.model_name_or_path: str = "./output"
         self.max_num_seqs: int = 34
         # Set default block num for profile run
-        self.max_block_num: int = 2000
+        self.total_block_num: int = 2000
         # block size
         self.block_size: int = 64
         # Engine worker queue port
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index e842d9da8..a5e7f600f 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -93,7 +93,7 @@ class MTPProposer(Proposer):
                                  expected_decode_len: int):
         """Set dummy prefill inputs to model_inputs"""
         max_dec_len = expected_decode_len + 1
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
         full_length = min(num_tokens // batch_size,
                           self.parallel_config.max_model_len - max_dec_len)
@@ -327,8 +327,8 @@ class MTPProposer(Proposer):
 
         self.free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1,
                 -1,
             ))
diff --git a/fastdeploy/worker/dcu_worker.py b/fastdeploy/worker/dcu_worker.py
index 7997e9569..cf2c078d1 100644
--- a/fastdeploy/worker/dcu_worker.py
+++ b/fastdeploy/worker/dcu_worker.py
@@ -13,18 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-import gc
 import time
-from typing import List, Optional
 
 import paddle
-import paddle.nn as nn
 
 from fastdeploy.config import FDConfig
-from fastdeploy.engine.request import Request
 from fastdeploy.utils import get_logger
-from fastdeploy.worker.gpu_model_runner import GPUModelRunner
-from fastdeploy.worker.output import ModelRunnerOutput
 from fastdeploy.worker.gpu_worker import GpuWorker
 
 logger = get_logger("dcu_worker", "dcu_worker.log")
@@ -97,7 +91,7 @@ class DcuWorker(GpuWorker):
         paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run
         available_kv_cache_memory = total_gpu_memory * \
             self.parallel_config.gpu_memory_utilization - after_used_gpu_memory - paddle_peak_increase
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
 
         end_time = time.perf_counter()
         logger.info(
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index 29c6f189c..42cc19706 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -480,8 +480,8 @@ class GCUModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1114,7 +1114,7 @@ class GCUModelRunner(ModelRunnerBase):
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""
 
         # Initialize kv cache for profile run. After profile run kv cache will be reset.
-        self.num_gcu_blocks = self.parallel_config.max_block_num
+        self.num_gcu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
         # 1. Profile with multimodal encoder & encoder cache
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 22336f28d..5bf4ce9c9 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -591,8 +591,8 @@ class GPUModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1295,7 +1295,7 @@ class GPUModelRunner(ModelRunnerBase):
 
         # Initialize kv cache for profile run. After profile run kv cache will be reset.
         # TODO(gongshaotian): Optimize the management logic of kvcache
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
         # 1. Profile with multimodal encoder & encoder cache
diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py
index b9f08c6b2..70e596359 100644
--- a/fastdeploy/worker/gpu_worker.py
+++ b/fastdeploy/worker/gpu_worker.py
@@ -61,7 +61,8 @@ class GpuWorker(WorkerBase):
             gc.collect()
             paddle.device.cuda.empty_cache()
             if self.parallel_config.enable_custom_all_reduce:
-                from fastdeploy.distributed.communication_op import use_custom_allreduce
+                from fastdeploy.distributed.communication_op import \
+                    use_custom_allreduce
                 use_custom_allreduce()
         else:
             raise RuntimeError(
@@ -137,7 +138,7 @@ class GpuWorker(WorkerBase):
 
         available_kv_cache_memory = after_run_meminfo.total * \
             self.parallel_config.gpu_memory_utilization - after_run_meminfo.used - paddle_peak_increase
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
 
         end_time = time.perf_counter()
         logger.info((
diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py
index cd31e65ad..efd1862dc 100644
--- a/fastdeploy/worker/iluvatar_model_runner.py
+++ b/fastdeploy/worker/iluvatar_model_runner.py
@@ -468,8 +468,8 @@ class IluvatarModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1069,7 +1069,7 @@ class IluvatarModelRunner(ModelRunnerBase):
 
         # Initialize kv cache for profile run. After profile run kv cache will be reset.
         # TODO(gongshaotian): Optimize the management logic of kvcache
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
         # 1. Profile with multimodal encoder & encoder cache
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index d180b1133..a160138f3 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -372,7 +372,7 @@ class PaddleDisWorkerProc():
             self.get_profile_block_num_signal.value[
                 self.local_rank] = num_blocks_global
         else:
-            num_blocks_global = self.fd_config.parallel_config.max_block_num
+            num_blocks_global = self.fd_config.parallel_config.total_block_num
         # NOTE(liuzichang): Too big num_blocks_global will lead to error 700
         # 4. Updata share inputs
         self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index 12d89c4e7..968a959a7 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -479,8 +479,8 @@ class XPUModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -757,7 +757,7 @@ class XPUModelRunner(ModelRunnerBase):
     def prepare_profile(self) -> None:
         """Prepare the profile run by setting the block number and initializing the KV cache."""
         paddle.device.xpu.empty_cache()
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
     def profile_run(self) -> None:
diff --git a/fastdeploy/worker/xpu_worker.py b/fastdeploy/worker/xpu_worker.py
index 8b3fc5b16..bf85762c1 100644
--- a/fastdeploy/worker/xpu_worker.py
+++ b/fastdeploy/worker/xpu_worker.py
@@ -66,7 +66,7 @@ class XpuWorker(WorkerBase):
                               device=self.device,
                               rank=self.rank,
                               local_rank=self.local_rank)
-    
+
     def graph_optimize_and_warm_up_model(self) -> None:
         """
         Optimizes the inference graph using the specified optimization options.
@@ -86,9 +86,10 @@ class XpuWorker(WorkerBase):
            You may limit the usage of GPU memory by adjusting the `gpu_memory_utilization` parameter.
        """
-        from fastdeploy.model_executor.ops.xpu import \
-            xpu_get_free_global_memory, xpu_get_total_global_memory, xpu_get_used_global_memory
-        
+        from fastdeploy.model_executor.ops.xpu import (
+            xpu_get_free_global_memory, xpu_get_total_global_memory,
+            xpu_get_used_global_memory)
+
         total_memory = xpu_get_total_global_memory(self.local_rank)
         used_memory = xpu_get_used_global_memory(self.local_rank)
         free_memory = xpu_get_free_global_memory(self.local_rank)
@@ -98,12 +99,12 @@ class XpuWorker(WorkerBase):
 
         self.model_runner.prepare_profile()
         self.model_runner.profile_run()
-        
+
         total_available_memory = int(total_memory *
                                      self.parallel_config.gpu_memory_utilization)
         used_memory = xpu_get_used_global_memory(self.local_rank)
         available_kv_cache_memory = total_available_memory - used_memory
         model_block_memory_used = self.cal_theortical_kvcache()
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
 
         self.model_runner.clear_block_table()
 
@@ -111,7 +112,7 @@ class XpuWorker(WorkerBase):
             used_memory: {used_memory}, available_kv_cache_memory: {available_kv_cache_memory}")
         paddle.device.xpu.empty_cache()
         return available_kv_cache_memory  # approximate value
-    
+
     def cal_theortical_kvcache(self) -> int:
         """ """
         return self.model_runner.cal_theortical_kvcache()
@@ -154,10 +155,6 @@ class XpuWorker(WorkerBase):
         """ """
         return True
 
-    def cal_theortical_kvcache(self) -> int:
-        """ """
-        return self.model_runner.cal_theortical_kvcache()
-
     def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
         """ """
         self.model_runner.update_share_input_block_num(