[BugFix] Fix Configs (#2849)

* fix config

* fix config
This commit is contained in:
YuanRisheng
2025-07-16 10:50:36 +08:00
committed by GitHub
parent 0fad10b35a
commit 101ad33332
10 changed files with 30 additions and 37 deletions

View File

@@ -58,6 +58,7 @@ PRETRAINED_INIT_CONFIGURATION = {
"freq_allocation":20, "freq_allocation":20,
"tie_word_embeddings":False, "tie_word_embeddings":False,
"rms_norm_eps":1e-5, "rms_norm_eps":1e-5,
"moe_num_experts": None,
} }
@@ -143,7 +144,7 @@ class ParallelConfig:
self.model_name_or_path: str = "./output" self.model_name_or_path: str = "./output"
self.max_num_seqs: int = 34 self.max_num_seqs: int = 34
# Set default block num for profile run # Set default block num for profile run
self.max_block_num: int = 2000 self.total_block_num: int = 2000
# block size # block size
self.block_size: int = 64 self.block_size: int = 64
# Engine worker queue port # Engine worker queue port

View File

@@ -93,7 +93,7 @@ class MTPProposer(Proposer):
expected_decode_len: int): expected_decode_len: int):
"""Set dummy prefill inputs to model_inputs""" """Set dummy prefill inputs to model_inputs"""
max_dec_len = expected_decode_len + 1 max_dec_len = expected_decode_len + 1
self.num_gpu_blocks = self.parallel_config.max_block_num self.num_gpu_blocks = self.parallel_config.total_block_num
self.initialize_kv_cache() self.initialize_kv_cache()
full_length = min(num_tokens // batch_size, full_length = min(num_tokens // batch_size,
self.parallel_config.max_model_len - max_dec_len) self.parallel_config.max_model_len - max_dec_len)
@@ -327,8 +327,8 @@ class MTPProposer(Proposer):
self.free_list = list( self.free_list = list(
range( range(
self.parallel_config.max_block_num - 1, self.parallel_config.total_block_num - 1,
int(self.parallel_config.max_block_num * int(self.parallel_config.total_block_num *
self.parallel_config.kv_cache_ratio) - 1, self.parallel_config.kv_cache_ratio) - 1,
-1, -1,
)) ))

View File

@@ -13,18 +13,12 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
""" """
import gc
import time import time
from typing import List, Optional
import paddle import paddle
import paddle.nn as nn
from fastdeploy.config import FDConfig from fastdeploy.config import FDConfig
from fastdeploy.engine.request import Request
from fastdeploy.utils import get_logger from fastdeploy.utils import get_logger
from fastdeploy.worker.gpu_model_runner import GPUModelRunner
from fastdeploy.worker.output import ModelRunnerOutput
from fastdeploy.worker.gpu_worker import GpuWorker from fastdeploy.worker.gpu_worker import GpuWorker
logger = get_logger("dcu_worker", "dcu_worker.log") logger = get_logger("dcu_worker", "dcu_worker.log")
@@ -97,7 +91,7 @@ class DcuWorker(GpuWorker):
paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run
available_kv_cache_memory = total_gpu_memory * \ available_kv_cache_memory = total_gpu_memory * \
self.parallel_config.gpu_memory_utilization - after_used_gpu_memory - paddle_peak_increase self.parallel_config.gpu_memory_utilization - after_used_gpu_memory - paddle_peak_increase
available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
end_time = time.perf_counter() end_time = time.perf_counter()
logger.info( logger.info(

View File

@@ -480,8 +480,8 @@ class GCUModelRunner(ModelRunnerBase):
# Initialize free list # Initialize free list
free_list = list( free_list = list(
range( range(
self.parallel_config.max_block_num - 1, self.parallel_config.total_block_num - 1,
int(self.parallel_config.max_block_num * int(self.parallel_config.total_block_num *
self.parallel_config.kv_cache_ratio) - 1, -1)) self.parallel_config.kv_cache_ratio) - 1, -1))
self.free_list_len = len(free_list) self.free_list_len = len(free_list)
self.share_inputs["free_list"] = paddle.to_tensor(free_list, self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1114,7 +1114,7 @@ class GCUModelRunner(ModelRunnerBase):
"""Execute a forward pass with dummy inputs to profile the memory usage of the model.""" """Execute a forward pass with dummy inputs to profile the memory usage of the model."""
# Initialize kv cache for profile run. After profile run kv cache will be reset. # Initialize kv cache for profile run. After profile run kv cache will be reset.
self.num_gcu_blocks = self.parallel_config.max_block_num self.num_gcu_blocks = self.parallel_config.total_block_num
self.initialize_kv_cache() self.initialize_kv_cache()
# 1. Profile with multimodal encoder & encoder cache # 1. Profile with multimodal encoder & encoder cache

View File

@@ -591,8 +591,8 @@ class GPUModelRunner(ModelRunnerBase):
# Initialize free list # Initialize free list
free_list = list( free_list = list(
range( range(
self.parallel_config.max_block_num - 1, self.parallel_config.total_block_num - 1,
int(self.parallel_config.max_block_num * int(self.parallel_config.total_block_num *
self.parallel_config.kv_cache_ratio) - 1, -1)) self.parallel_config.kv_cache_ratio) - 1, -1))
self.free_list_len = len(free_list) self.free_list_len = len(free_list)
self.share_inputs["free_list"] = paddle.to_tensor(free_list, self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1295,7 +1295,7 @@ class GPUModelRunner(ModelRunnerBase):
# Initialize kv cache for profile run. After profile run kv cache will be reset. # Initialize kv cache for profile run. After profile run kv cache will be reset.
# TODO(gongshaotian): Optimize the management logic of kvcache # TODO(gongshaotian): Optimize the management logic of kvcache
self.num_gpu_blocks = self.parallel_config.max_block_num self.num_gpu_blocks = self.parallel_config.total_block_num
self.initialize_kv_cache() self.initialize_kv_cache()
# 1. Profile with multimodal encoder & encoder cache # 1. Profile with multimodal encoder & encoder cache

View File

@@ -61,7 +61,8 @@ class GpuWorker(WorkerBase):
gc.collect() gc.collect()
paddle.device.cuda.empty_cache() paddle.device.cuda.empty_cache()
if self.parallel_config.enable_custom_all_reduce: if self.parallel_config.enable_custom_all_reduce:
from fastdeploy.distributed.communication_op import use_custom_allreduce from fastdeploy.distributed.communication_op import \
use_custom_allreduce
use_custom_allreduce() use_custom_allreduce()
else: else:
raise RuntimeError( raise RuntimeError(
@@ -137,7 +138,7 @@ class GpuWorker(WorkerBase):
available_kv_cache_memory = after_run_meminfo.total * \ available_kv_cache_memory = after_run_meminfo.total * \
self.parallel_config.gpu_memory_utilization - after_run_meminfo.used - paddle_peak_increase self.parallel_config.gpu_memory_utilization - after_run_meminfo.used - paddle_peak_increase
available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
end_time = time.perf_counter() end_time = time.perf_counter()
logger.info(( logger.info((

View File

@@ -468,8 +468,8 @@ class IluvatarModelRunner(ModelRunnerBase):
# Initialize free list # Initialize free list
free_list = list( free_list = list(
range( range(
self.parallel_config.max_block_num - 1, self.parallel_config.total_block_num - 1,
int(self.parallel_config.max_block_num * int(self.parallel_config.total_block_num *
self.parallel_config.kv_cache_ratio) - 1, -1)) self.parallel_config.kv_cache_ratio) - 1, -1))
self.free_list_len = len(free_list) self.free_list_len = len(free_list)
self.share_inputs["free_list"] = paddle.to_tensor(free_list, self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1069,7 +1069,7 @@ class IluvatarModelRunner(ModelRunnerBase):
# Initialize kv cache for profile run. After profile run kv cache will be reset. # Initialize kv cache for profile run. After profile run kv cache will be reset.
# TODO(gongshaotian): Optimize the management logic of kvcache # TODO(gongshaotian): Optimize the management logic of kvcache
self.num_gpu_blocks = self.parallel_config.max_block_num self.num_gpu_blocks = self.parallel_config.total_block_num
self.initialize_kv_cache() self.initialize_kv_cache()
# 1. Profile with multimodal encoder & encoder cache # 1. Profile with multimodal encoder & encoder cache

View File

@@ -372,7 +372,7 @@ class PaddleDisWorkerProc():
self.get_profile_block_num_signal.value[ self.get_profile_block_num_signal.value[
self.local_rank] = num_blocks_global self.local_rank] = num_blocks_global
else: else:
num_blocks_global = self.fd_config.parallel_config.max_block_num num_blocks_global = self.fd_config.parallel_config.total_block_num
# NOTE(liuzichang): Too big num_blocks_global will lead to error 700 # NOTE(liuzichang): Too big num_blocks_global will lead to error 700
# 4. Updata share inputs # 4. Updata share inputs
self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global) self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)

View File

@@ -479,8 +479,8 @@ class XPUModelRunner(ModelRunnerBase):
# Initialize free list # Initialize free list
free_list = list( free_list = list(
range( range(
self.parallel_config.max_block_num - 1, self.parallel_config.total_block_num - 1,
int(self.parallel_config.max_block_num * int(self.parallel_config.total_block_num *
self.parallel_config.kv_cache_ratio) - 1, -1)) self.parallel_config.kv_cache_ratio) - 1, -1))
self.free_list_len = len(free_list) self.free_list_len = len(free_list)
self.share_inputs["free_list"] = paddle.to_tensor(free_list, self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -757,7 +757,7 @@ class XPUModelRunner(ModelRunnerBase):
def prepare_profile(self) -> None: def prepare_profile(self) -> None:
"""Prepare the profile run by setting the block number and initializing the KV cache.""" """Prepare the profile run by setting the block number and initializing the KV cache."""
paddle.device.xpu.empty_cache() paddle.device.xpu.empty_cache()
self.num_gpu_blocks = self.parallel_config.max_block_num self.num_gpu_blocks = self.parallel_config.total_block_num
self.initialize_kv_cache() self.initialize_kv_cache()
def profile_run(self) -> None: def profile_run(self) -> None:

View File

@@ -66,7 +66,7 @@ class XpuWorker(WorkerBase):
device=self.device, device=self.device,
rank=self.rank, rank=self.rank,
local_rank=self.local_rank) local_rank=self.local_rank)
def graph_optimize_and_warm_up_model(self) -> None: def graph_optimize_and_warm_up_model(self) -> None:
""" """
Optimizes the inference graph using the specified optimization options. Optimizes the inference graph using the specified optimization options.
@@ -86,9 +86,10 @@ class XpuWorker(WorkerBase):
You may limit the usage of GPU memory You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter. by adjusting the `gpu_memory_utilization` parameter.
""" """
from fastdeploy.model_executor.ops.xpu import \ from fastdeploy.model_executor.ops.xpu import (
xpu_get_free_global_memory, xpu_get_total_global_memory, xpu_get_used_global_memory xpu_get_free_global_memory, xpu_get_total_global_memory,
xpu_get_used_global_memory)
total_memory = xpu_get_total_global_memory(self.local_rank) total_memory = xpu_get_total_global_memory(self.local_rank)
used_memory = xpu_get_used_global_memory(self.local_rank) used_memory = xpu_get_used_global_memory(self.local_rank)
free_memory = xpu_get_free_global_memory(self.local_rank) free_memory = xpu_get_free_global_memory(self.local_rank)
@@ -98,12 +99,12 @@ class XpuWorker(WorkerBase):
self.model_runner.prepare_profile() self.model_runner.prepare_profile()
self.model_runner.profile_run() self.model_runner.profile_run()
total_available_memory = int(total_memory * self.parallel_config.gpu_memory_utilization) total_available_memory = int(total_memory * self.parallel_config.gpu_memory_utilization)
used_memory = xpu_get_used_global_memory(self.local_rank) used_memory = xpu_get_used_global_memory(self.local_rank)
available_kv_cache_memory = total_available_memory - used_memory available_kv_cache_memory = total_available_memory - used_memory
model_block_memory_used = self.cal_theortical_kvcache() model_block_memory_used = self.cal_theortical_kvcache()
available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
self.model_runner.clear_block_table() self.model_runner.clear_block_table()
@@ -111,7 +112,7 @@ class XpuWorker(WorkerBase):
used_memory: {used_memory}, available_kv_cache_memory: {available_kv_cache_memory}") used_memory: {used_memory}, available_kv_cache_memory: {available_kv_cache_memory}")
paddle.device.xpu.empty_cache() paddle.device.xpu.empty_cache()
return available_kv_cache_memory # approximate value return available_kv_cache_memory # approximate value
def cal_theortical_kvcache(self) -> int: def cal_theortical_kvcache(self) -> int:
""" """ """ """
return self.model_runner.cal_theortical_kvcache() return self.model_runner.cal_theortical_kvcache()
@@ -154,10 +155,6 @@ class XpuWorker(WorkerBase):
""" """ """ """
return True return True
def cal_theortical_kvcache(self) -> int:
""" """
return self.model_runner.cal_theortical_kvcache()
def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None: def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
""" """ """ """
self.model_runner.update_share_input_block_num( self.model_runner.update_share_input_block_num(