Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-04 16:22:57 +08:00.

The hunks below rename `ParallelConfig.max_block_num` to `total_block_num` across the GPU, DCU, GCU, Iluvatar, and XPU backends, add a `moe_num_experts` default to PRETRAINED_INIT_CONFIGURATION, drop unused imports in the DCU worker, reformat two multi-line imports, and remove a duplicated `cal_theortical_kvcache` in XpuWorker.
@@ -58,6 +58,7 @@ PRETRAINED_INIT_CONFIGURATION = {
     "freq_allocation":20,
     "tie_word_embeddings":False,
     "rms_norm_eps":1e-5,
+    "moe_num_experts": None,
 }
 
 
@@ -143,7 +144,7 @@ class ParallelConfig:
         self.model_name_or_path: str = "./output"
         self.max_num_seqs: int = 34
         # Set default block num for profile run
-        self.max_block_num: int = 2000
+        self.total_block_num: int = 2000
         # block size
         self.block_size: int = 64
         # Engine worker queue port
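Note on the renamed default: `total_block_num` only seeds the profile run; together with `block_size` it bounds how many tokens the KV cache can hold until profiling replaces it with a measured count. A quick sanity check of that arithmetic (illustrative, using the defaults above):

    # Each KV-cache block holds `block_size` tokens, so the profile-run
    # cache can hold at most total_block_num * block_size tokens.
    total_block_num = 2000
    block_size = 64
    print(total_block_num * block_size)  # 128000 tokens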
@@ -93,7 +93,7 @@ class MTPProposer(Proposer):
                              expected_decode_len: int):
         """Set dummy prefill inputs to model_inputs"""
         max_dec_len = expected_decode_len + 1
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
         full_length = min(num_tokens // batch_size,
                           self.parallel_config.max_model_len - max_dec_len)
@@ -327,8 +327,8 @@ class MTPProposer(Proposer):
 
         self.free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1,
                 -1,
             ))
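The free list above hands the proposer every block ID above the `kv_cache_ratio` watermark, in descending order so the highest-numbered blocks are popped first. A standalone sketch of the same expression, assuming an illustrative `kv_cache_ratio` of 0.75:

    total_block_num = 2000
    kv_cache_ratio = 0.75  # assumed here; the real value comes from config

    # Identical to the diff: IDs from total_block_num - 1 down to
    # int(total_block_num * kv_cache_ratio), inclusive.
    free_list = list(range(total_block_num - 1,
                           int(total_block_num * kv_cache_ratio) - 1,
                           -1))
    assert free_list[0] == 1999 and free_list[-1] == 1500
    assert len(free_list) == 500  # the top quarter of blocks is free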
@@ -13,18 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-import gc
 import time
-from typing import List, Optional
 
 import paddle
-import paddle.nn as nn
 
 from fastdeploy.config import FDConfig
-from fastdeploy.engine.request import Request
 from fastdeploy.utils import get_logger
-from fastdeploy.worker.gpu_model_runner import GPUModelRunner
-from fastdeploy.worker.output import ModelRunnerOutput
 from fastdeploy.worker.gpu_worker import GpuWorker
 
 logger = get_logger("dcu_worker", "dcu_worker.log")
@@ -97,7 +91,7 @@ class DcuWorker(GpuWorker):
         paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run
         available_kv_cache_memory = total_gpu_memory * \
             self.parallel_config.gpu_memory_utilization - after_used_gpu_memory - paddle_peak_increase
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
 
         end_time = time.perf_counter()
         logger.info(
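The DCU memory estimate is: the share of device memory the config allows, minus what is actually occupied after the profile run, minus Paddle's reserved-memory growth, plus the memory the profile-run KV cache itself held (added back because that cache is torn down and reallocated). A hedged sketch of the formula with made-up numbers:

    # Illustrative values only; the real ones come from device queries.
    total_gpu_memory = 64 * 1024**3        # 64 GiB device
    gpu_memory_utilization = 0.9           # budget fraction from config
    after_used_gpu_memory = 30 * 1024**3   # occupied after the profile run
    paddle_peak_increase = 2 * 1024**3     # growth in Paddle's reserved pool
    model_block_memory_used = 2 * 1024**2  # bytes per KV-cache block
    total_block_num = 2000

    available_kv_cache_memory = (total_gpu_memory * gpu_memory_utilization
                                 - after_used_gpu_memory - paddle_peak_increase)
    # Add back the profile-run cache; it is freed before the real allocation.
    available_kv_cache_memory += model_block_memory_used * total_block_num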
@@ -480,8 +480,8 @@ class GCUModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1114,7 +1114,7 @@ class GCUModelRunner(ModelRunnerBase):
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""
 
         # Initialize kv cache for profile run. After profile run kv cache will be reset.
-        self.num_gcu_blocks = self.parallel_config.max_block_num
+        self.num_gcu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
         # 1. Profile with multimodal encoder & encoder cache
@@ -591,8 +591,8 @@ class GPUModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1295,7 +1295,7 @@ class GPUModelRunner(ModelRunnerBase):
 
         # Initialize kv cache for profile run. After profile run kv cache will be reset.
         # TODO(gongshaotian): Optimize the management logic of kvcache
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
         # 1. Profile with multimodal encoder & encoder cache
@@ -61,7 +61,8 @@ class GpuWorker(WorkerBase):
             gc.collect()
             paddle.device.cuda.empty_cache()
             if self.parallel_config.enable_custom_all_reduce:
-                from fastdeploy.distributed.communication_op import use_custom_allreduce
+                from fastdeploy.distributed.communication_op import \
+                    use_custom_allreduce
                 use_custom_allreduce()
         else:
             raise RuntimeError(
@@ -137,7 +138,7 @@ class GpuWorker(WorkerBase):
 
         available_kv_cache_memory = after_run_meminfo.total * \
             self.parallel_config.gpu_memory_utilization - after_run_meminfo.used - paddle_peak_increase
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
 
         end_time = time.perf_counter()
         logger.info((
@@ -468,8 +468,8 @@ class IluvatarModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1069,7 +1069,7 @@ class IluvatarModelRunner(ModelRunnerBase):
 
         # Initialize kv cache for profile run. After profile run kv cache will be reset.
         # TODO(gongshaotian): Optimize the management logic of kvcache
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
         # 1. Profile with multimodal encoder & encoder cache
@@ -372,7 +372,7 @@ class PaddleDisWorkerProc():
             self.get_profile_block_num_signal.value[
                 self.local_rank] = num_blocks_global
         else:
-            num_blocks_global = self.fd_config.parallel_config.max_block_num
+            num_blocks_global = self.fd_config.parallel_config.total_block_num
         # NOTE(liuzichang): Too big num_blocks_global will lead to error 700
         # 4. Updata share inputs
         self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)
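When profiling runs, the measured `num_blocks_global` is published through the signal array; otherwise the worker falls back to the configured `total_block_num`. A simplified sketch of that decision (the helper name is made up for illustration):

    def resolve_num_blocks(do_profile: bool, profiled_blocks: int,
                           total_block_num: int) -> int:
        """Pick the KV-cache block count used to reinitialize the cache."""
        if do_profile:
            return profiled_blocks  # measured during the profile run
        return total_block_num      # no profiling: trust the config default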
@@ -479,8 +479,8 @@ class XPUModelRunner(ModelRunnerBase):
         # Initialize free list
         free_list = list(
             range(
-                self.parallel_config.max_block_num - 1,
-                int(self.parallel_config.max_block_num *
+                self.parallel_config.total_block_num - 1,
+                int(self.parallel_config.total_block_num *
                     self.parallel_config.kv_cache_ratio) - 1, -1))
         self.free_list_len = len(free_list)
         self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -757,7 +757,7 @@ class XPUModelRunner(ModelRunnerBase):
     def prepare_profile(self) -> None:
         """Prepare the profile run by setting the block number and initializing the KV cache."""
         paddle.device.xpu.empty_cache()
-        self.num_gpu_blocks = self.parallel_config.max_block_num
+        self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
     def profile_run(self) -> None:
@@ -66,7 +66,7 @@ class XpuWorker(WorkerBase):
             device=self.device,
             rank=self.rank,
             local_rank=self.local_rank)
 
     def graph_optimize_and_warm_up_model(self) -> None:
         """
         Optimizes the inference graph using the specified optimization options.
@@ -86,9 +86,10 @@ class XpuWorker(WorkerBase):
         You may limit the usage of GPU memory
         by adjusting the `gpu_memory_utilization` parameter.
         """
-        from fastdeploy.model_executor.ops.xpu import \
-            xpu_get_free_global_memory, xpu_get_total_global_memory, xpu_get_used_global_memory
+        from fastdeploy.model_executor.ops.xpu import (
+            xpu_get_free_global_memory, xpu_get_total_global_memory,
+            xpu_get_used_global_memory)
 
         total_memory = xpu_get_total_global_memory(self.local_rank)
         used_memory = xpu_get_used_global_memory(self.local_rank)
         free_memory = xpu_get_free_global_memory(self.local_rank)
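Both import hunks in this diff are line-length fixes; the parenthesized form used here is the one PEP 8 recommends over backslash continuation, since a stray space after the backslash is a syntax error while parentheses are whitespace-proof. The two equivalent spellings, shown on a stdlib module as a stand-in:

    # Backslash continuation: breaks if anything follows the backslash.
    from os.path import \
        join, split

    # Parenthesized form: preferred for multi-line imports.
    from os.path import (
        join, split,
        basename)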
@@ -98,12 +99,12 @@ class XpuWorker(WorkerBase):
 
         self.model_runner.prepare_profile()
         self.model_runner.profile_run()
 
         total_available_memory = int(total_memory * self.parallel_config.gpu_memory_utilization)
         used_memory = xpu_get_used_global_memory(self.local_rank)
         available_kv_cache_memory = total_available_memory - used_memory
         model_block_memory_used = self.cal_theortical_kvcache()
-        available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
+        available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
 
         self.model_runner.clear_block_table()
 
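The XPU path estimates the same quantity differently: it re-queries used memory after the profile run instead of tracking Paddle's peak, then adds back the profile-run cache that `clear_block_table()` is about to release. With illustrative numbers:

    # Illustrative values; the real ones come from the xpu_get_* queries.
    total_memory = 32 * 1024**3            # 32 GiB device
    gpu_memory_utilization = 0.9
    used_memory = 20 * 1024**3             # re-queried after profile_run()
    model_block_memory_used = 2 * 1024**2  # bytes per KV-cache block
    total_block_num = 2000

    total_available_memory = int(total_memory * gpu_memory_utilization)
    available_kv_cache_memory = total_available_memory - used_memory
    available_kv_cache_memory += model_block_memory_used * total_block_num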
@@ -111,7 +112,7 @@ class XpuWorker(WorkerBase):
             used_memory: {used_memory}, available_kv_cache_memory: {available_kv_cache_memory}")
         paddle.device.xpu.empty_cache()
         return available_kv_cache_memory # approximate value
 
     def cal_theortical_kvcache(self) -> int:
         """ """
         return self.model_runner.cal_theortical_kvcache()
@@ -154,10 +155,6 @@ class XpuWorker(WorkerBase):
         """ """
         return True
 
-    def cal_theortical_kvcache(self) -> int:
-        """ """
-        return self.model_runner.cal_theortical_kvcache()
-
     def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
         """ """
         self.model_runner.update_share_input_block_num(