Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-04 16:22:57 +08:00

@@ -58,6 +58,7 @@ PRETRAINED_INIT_CONFIGURATION = {
     "freq_allocation":20,
     "tie_word_embeddings":False,
     "rms_norm_eps":1e-5,
+    "moe_num_experts": None,
 }

@@ -143,7 +144,7 @@ class ParallelConfig:
         self.model_name_or_path: str = "./output"
         self.max_num_seqs: int = 34
         # Set default block num for profile run
-        self.max_block_num: int = 2000
+        self.total_block_num: int = 2000
         # block size
         self.block_size: int = 64
         # Engine worker queue port

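The hunks that follow all flow from this one field rename: `ParallelConfig.max_block_num` becomes `total_block_num`, which better matches its meaning as the total size of the default KV-cache block pool rather than an upper bound. A reduced, illustrative sketch of the config class, keeping only the fields visible in the hunk above:

```python
# Reduced sketch of ParallelConfig after this diff; only the fields shown
# in the hunk above are kept, with their defaults taken from the diff.
class ParallelConfig:
    def __init__(self) -> None:
        self.model_name_or_path: str = "./output"
        self.max_num_seqs: int = 34
        # Default block num for the profile run (renamed from max_block_num)
        self.total_block_num: int = 2000
        # block size
        self.block_size: int = 64
```
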
@@ -93,7 +93,7 @@ class MTPProposer(Proposer):
                              expected_decode_len: int):
         """Set dummy prefill inputs to model_inputs"""
         max_dec_len = expected_decode_len + 1
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
         full_length = min(num_tokens // batch_size,
                           self.parallel_config.max_model_len - max_dec_len)

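For intuition, a toy recomputation of the dummy-prefill sizing in this hunk; the two formulas come from the diff, while every input value is an invented example:

```python
# Invented example inputs; only max_dec_len and full_length mirror the hunk.
num_tokens = 32768           # token budget for the profile batch
batch_size = 8
expected_decode_len = 31
max_model_len = 8192         # ParallelConfig.max_model_len

# Reserve one extra step beyond the expected decode length.
max_dec_len = expected_decode_len + 1
# Split the token budget evenly across requests, clipped so that the
# prompt plus worst-case decode still fits in the model context.
full_length = min(num_tokens // batch_size,
                  max_model_len - max_dec_len)
print(max_dec_len, full_length)  # 32 4096
```
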
@@ -327,8 +327,8 @@ class MTPProposer(Proposer):

         self.free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1,
                 -1,
             ))

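This free-list construction recurs in every model runner below. A self-contained sketch of what it computes; `total_block_num` mirrors the config default, while the `kv_cache_ratio` value is an assumed example:

```python
total_block_num = 2000   # ParallelConfig.total_block_num default
kv_cache_ratio = 0.75    # assumed example value for ParallelConfig.kv_cache_ratio

# Block ids [int(total * ratio), total) form the free list, highest id
# first; ids below the ratio boundary are left to the KV cache proper.
free_list = list(
    range(
        total_block_num - 1,
        int(total_block_num * kv_cache_ratio) - 1,
        -1,
    ))

assert free_list[0] == 1999
assert free_list[-1] == 1500   # int(2000 * 0.75)
print(len(free_list))          # 500 free blocks
```
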
@@ -13,18 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-import gc
 import time
-from typing import List, Optional

 import paddle
-import paddle.nn as nn

 from fastdeploy.config import FDConfig
-from fastdeploy.engine.request import Request
 from fastdeploy.utils import get_logger
-from fastdeploy.worker.gpu_model_runner import GPUModelRunner
-from fastdeploy.worker.output import ModelRunnerOutput
 from fastdeploy.worker.gpu_worker import GpuWorker

 logger = get_logger("dcu_worker", "dcu_worker.log")

@@ -97,7 +91,7 @@ class DcuWorker(GpuWorker):
         paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run
         available_kv_cache_memory = total_gpu_memory * \
             self.parallel_config.gpu_memory_utilization - after_used_gpu_memory - paddle_peak_increase
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num

         end_time = time.perf_counter()
         logger.info(

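The available-memory formula in this hunk (and the analogous GpuWorker and XpuWorker hunks below) can be checked in isolation. A sketch with invented byte counts; only the arithmetic mirrors the diff:

```python
# All inputs are made-up example values, in bytes.
total_gpu_memory = 64 * 1024**3         # device total
gpu_memory_utilization = 0.9            # fraction FastDeploy may claim
after_used_gpu_memory = 30 * 1024**3    # in use after the profile run
paddle_peak_increase = 2 * 1024**3      # extra memory Paddle reserved during the run
model_block_memory_used = 2 * 1024**2   # bytes per KV-cache block
total_block_num = 2000                  # profile-run pool size

# Budgeted memory minus what the profile run left occupied ...
available_kv_cache_memory = (total_gpu_memory * gpu_memory_utilization
                             - after_used_gpu_memory - paddle_peak_increase)
# ... plus the profile-run KV cache itself, since it is torn down and
# rebuilt once the real block count is chosen.
available_kv_cache_memory += model_block_memory_used * total_block_num

print(int(available_kv_cache_memory // model_block_memory_used))  # blocks that fit
```
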
@@ -480,8 +480,8 @@ class GCUModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,

@@ -1114,7 +1114,7 @@ class GCUModelRunner(ModelRunnerBase):
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""

         # Initialize kv cache for profile run. After profile run kv cache will be reset.
-        self.num_gcu_blocks = self.parallel_config.max_block_num
+        self.num_gcu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()

         # 1. Profile with multimodal encoder & encoder cache

@@ -591,8 +591,8 @@ class GPUModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,

@@ -1295,7 +1295,7 @@ class GPUModelRunner(ModelRunnerBase):

         # Initialize kv cache for profile run. After profile run kv cache will be reset.
         # TODO(gongshaotian): Optimize the management logic of kvcache
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()

         # 1. Profile with multimodal encoder & encoder cache

@@ -61,7 +61,8 @@ class GpuWorker(WorkerBase):
         gc.collect()
         paddle.device.cuda.empty_cache()
         if self.parallel_config.enable_custom_all_reduce:
-            from fastdeploy.distributed.communication_op import use_custom_allreduce
+            from fastdeploy.distributed.communication_op import \
+                use_custom_allreduce
             use_custom_allreduce()
         else:
             raise RuntimeError(

@@ -137,7 +138,7 @@ class GpuWorker(WorkerBase):

         available_kv_cache_memory = after_run_meminfo.total * \
             self.parallel_config.gpu_memory_utilization - after_run_meminfo.used - paddle_peak_increase
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num

         end_time = time.perf_counter()
         logger.info((

@@ -468,8 +468,8 @@ class IluvatarModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,

@@ -1069,7 +1069,7 @@ class IluvatarModelRunner(ModelRunnerBase):

         # Initialize kv cache for profile run. After profile run kv cache will be reset.
         # TODO(gongshaotian): Optimize the management logic of kvcache
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()

         # 1. Profile with multimodal encoder & encoder cache

@@ -372,7 +372,7 @@ class PaddleDisWorkerProc():
             self.get_profile_block_num_signal.value[
                 self.local_rank] = num_blocks_global
         else:
-            num_blocks_global = self.fd_config.parallel_config.max_block_num
+            num_blocks_global = self.fd_config.parallel_config.total_block_num
         # NOTE(liuzichang): Too big num_blocks_global will lead to error 700
         # 4. Update share inputs
         self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)

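Putting this worker-process hunk together with the profile-run hunks above: the measured block count is used when profiling ran, and the configured `total_block_num` is the fallback. A hedged sketch of that decision; the function and variable names here are illustrative stand-ins, not FastDeploy's API:

```python
def pick_num_blocks(profiled: bool, measured_blocks: int,
                    total_block_num: int) -> int:
    """Illustrative stand-in for the branch in PaddleDisWorkerProc."""
    if profiled:
        # Derived from measured free memory during the profile run.
        return measured_blocks
    # No profiling: fall back to the configured default pool size.
    return total_block_num

num_blocks_global = pick_num_blocks(profiled=False, measured_blocks=3500,
                                    total_block_num=2000)
# The worker then rebuilds its KV cache at the chosen size, e.g.
#   self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)
print(num_blocks_global)  # 2000
```
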
@@ -479,8 +479,8 @@ class XPUModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,

@@ -757,7 +757,7 @@ class XPUModelRunner(ModelRunnerBase):
     def prepare_profile(self) -> None:
         """Prepare the profile run by setting the block number and initializing the KV cache."""
         paddle.device.xpu.empty_cache()
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()

     def profile_run(self) -> None:

@@ -86,8 +86,9 @@ class XpuWorker(WorkerBase):
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
-        from fastdeploy.model_executor.ops.xpu import \
-            xpu_get_free_global_memory, xpu_get_total_global_memory, xpu_get_used_global_memory
+        from fastdeploy.model_executor.ops.xpu import (
+            xpu_get_free_global_memory, xpu_get_total_global_memory,
+            xpu_get_used_global_memory)

         total_memory = xpu_get_total_global_memory(self.local_rank)
         used_memory = xpu_get_used_global_memory(self.local_rank)

@@ -103,7 +104,7 @@ class XpuWorker(WorkerBase):
         used_memory = xpu_get_used_global_memory(self.local_rank)
         available_kv_cache_memory = total_available_memory - used_memory
         model_block_memory_used = self.cal_theortical_kvcache()
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num

         self.model_runner.clear_block_table()

@@ -154,10 +155,6 @@ class XpuWorker(WorkerBase):
         """ """
         return True

-    def cal_theortical_kvcache(self) -> int:
-        """ """
-        return self.model_runner.cal_theortical_kvcache()
-
     def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
         """ """
         self.model_runner.update_share_input_block_num(