[BugFix] Fix Configs (#2849)

* fix config

* fix config
This commit is contained in:
YuanRisheng
2025-07-16 10:50:36 +08:00
committed by GitHub
parent 0fad10b35a
commit 101ad33332
10 changed files with 30 additions and 37 deletions

View File

@@ -58,6 +58,7 @@ PRETRAINED_INIT_CONFIGURATION = {
"freq_allocation":20,
"tie_word_embeddings":False,
"rms_norm_eps":1e-5,
"moe_num_experts": None,
}
@@ -143,7 +144,7 @@ class ParallelConfig:
self.model_name_or_path: str = "./output"
self.max_num_seqs: int = 34
# Set default block num for profile run
self.max_block_num: int = 2000
self.total_block_num: int = 2000
# block size
self.block_size: int = 64
# Engine worker queue port

View File

@@ -93,7 +93,7 @@ class MTPProposer(Proposer):
expected_decode_len: int):
"""Set dummy prefill inputs to model_inputs"""
max_dec_len = expected_decode_len + 1
self.num_gpu_blocks = self.parallel_config.max_block_num
self.num_gpu_blocks = self.parallel_config.total_block_num
self.initialize_kv_cache()
full_length = min(num_tokens // batch_size,
self.parallel_config.max_model_len - max_dec_len)
@@ -327,8 +327,8 @@ class MTPProposer(Proposer):
self.free_list = list(
range(
self.parallel_config.max_block_num - 1,
int(self.parallel_config.max_block_num *
self.parallel_config.total_block_num - 1,
int(self.parallel_config.total_block_num *
self.parallel_config.kv_cache_ratio) - 1,
-1,
))

View File

@@ -13,18 +13,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import gc
import time
from typing import List, Optional
import paddle
import paddle.nn as nn
from fastdeploy.config import FDConfig
from fastdeploy.engine.request import Request
from fastdeploy.utils import get_logger
from fastdeploy.worker.gpu_model_runner import GPUModelRunner
from fastdeploy.worker.output import ModelRunnerOutput
from fastdeploy.worker.gpu_worker import GpuWorker
logger = get_logger("dcu_worker", "dcu_worker.log")
@@ -97,7 +91,7 @@ class DcuWorker(GpuWorker):
paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run
available_kv_cache_memory = total_gpu_memory * \
self.parallel_config.gpu_memory_utilization - after_used_gpu_memory - paddle_peak_increase
available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
end_time = time.perf_counter()
logger.info(

View File

@@ -480,8 +480,8 @@ class GCUModelRunner(ModelRunnerBase):
# Initialize free list
free_list = list(
range(
self.parallel_config.max_block_num - 1,
int(self.parallel_config.max_block_num *
self.parallel_config.total_block_num - 1,
int(self.parallel_config.total_block_num *
self.parallel_config.kv_cache_ratio) - 1, -1))
self.free_list_len = len(free_list)
self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1114,7 +1114,7 @@ class GCUModelRunner(ModelRunnerBase):
"""Execute a forward pass with dummy inputs to profile the memory usage of the model."""
# Initialize kv cache for profile run. After profile run kv cache will be reset.
self.num_gcu_blocks = self.parallel_config.max_block_num
self.num_gcu_blocks = self.parallel_config.total_block_num
self.initialize_kv_cache()
# 1. Profile with multimodal encoder & encoder cache

View File

@@ -591,8 +591,8 @@ class GPUModelRunner(ModelRunnerBase):
# Initialize free list
free_list = list(
range(
self.parallel_config.max_block_num - 1,
int(self.parallel_config.max_block_num *
self.parallel_config.total_block_num - 1,
int(self.parallel_config.total_block_num *
self.parallel_config.kv_cache_ratio) - 1, -1))
self.free_list_len = len(free_list)
self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1295,7 +1295,7 @@ class GPUModelRunner(ModelRunnerBase):
# Initialize kv cache for profile run. After profile run kv cache will be reset.
# TODO(gongshaotian): Optimize the management logic of kvcache
self.num_gpu_blocks = self.parallel_config.max_block_num
self.num_gpu_blocks = self.parallel_config.total_block_num
self.initialize_kv_cache()
# 1. Profile with multimodal encoder & encoder cache

View File

@@ -61,7 +61,8 @@ class GpuWorker(WorkerBase):
gc.collect()
paddle.device.cuda.empty_cache()
if self.parallel_config.enable_custom_all_reduce:
from fastdeploy.distributed.communication_op import use_custom_allreduce
from fastdeploy.distributed.communication_op import \
use_custom_allreduce
use_custom_allreduce()
else:
raise RuntimeError(
@@ -137,7 +138,7 @@ class GpuWorker(WorkerBase):
available_kv_cache_memory = after_run_meminfo.total * \
self.parallel_config.gpu_memory_utilization - after_run_meminfo.used - paddle_peak_increase
available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
end_time = time.perf_counter()
logger.info((

View File

@@ -468,8 +468,8 @@ class IluvatarModelRunner(ModelRunnerBase):
# Initialize free list
free_list = list(
range(
self.parallel_config.max_block_num - 1,
int(self.parallel_config.max_block_num *
self.parallel_config.total_block_num - 1,
int(self.parallel_config.total_block_num *
self.parallel_config.kv_cache_ratio) - 1, -1))
self.free_list_len = len(free_list)
self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -1069,7 +1069,7 @@ class IluvatarModelRunner(ModelRunnerBase):
# Initialize kv cache for profile run. After profile run kv cache will be reset.
# TODO(gongshaotian): Optimize the management logic of kvcache
self.num_gpu_blocks = self.parallel_config.max_block_num
self.num_gpu_blocks = self.parallel_config.total_block_num
self.initialize_kv_cache()
# 1. Profile with multimodal encoder & encoder cache

View File

@@ -372,7 +372,7 @@ class PaddleDisWorkerProc():
self.get_profile_block_num_signal.value[
self.local_rank] = num_blocks_global
else:
num_blocks_global = self.fd_config.parallel_config.max_block_num
num_blocks_global = self.fd_config.parallel_config.total_block_num
# NOTE(liuzichang): Too big num_blocks_global will lead to error 700
# 4. Update share inputs
self.worker.reinitialize_kv_cache(num_gpu_blocks=num_blocks_global)

View File

@@ -479,8 +479,8 @@ class XPUModelRunner(ModelRunnerBase):
# Initialize free list
free_list = list(
range(
self.parallel_config.max_block_num - 1,
int(self.parallel_config.max_block_num *
self.parallel_config.total_block_num - 1,
int(self.parallel_config.total_block_num *
self.parallel_config.kv_cache_ratio) - 1, -1))
self.free_list_len = len(free_list)
self.share_inputs["free_list"] = paddle.to_tensor(free_list,
@@ -757,7 +757,7 @@ class XPUModelRunner(ModelRunnerBase):
def prepare_profile(self) -> None:
"""Prepare the profile run by setting the block number and initializing the KV cache."""
paddle.device.xpu.empty_cache()
self.num_gpu_blocks = self.parallel_config.max_block_num
self.num_gpu_blocks = self.parallel_config.total_block_num
self.initialize_kv_cache()
def profile_run(self) -> None:

View File

@@ -86,8 +86,9 @@ class XpuWorker(WorkerBase):
You may limit the usage of GPU memory
by adjusting the `gpu_memory_utilization` parameter.
"""
from fastdeploy.model_executor.ops.xpu import \
xpu_get_free_global_memory, xpu_get_total_global_memory, xpu_get_used_global_memory
from fastdeploy.model_executor.ops.xpu import (
xpu_get_free_global_memory, xpu_get_total_global_memory,
xpu_get_used_global_memory)
total_memory = xpu_get_total_global_memory(self.local_rank)
used_memory = xpu_get_used_global_memory(self.local_rank)
@@ -103,7 +104,7 @@ class XpuWorker(WorkerBase):
used_memory = xpu_get_used_global_memory(self.local_rank)
available_kv_cache_memory = total_available_memory - used_memory
model_block_memory_used = self.cal_theortical_kvcache()
available_kv_cache_memory += model_block_memory_used * self.parallel_config.max_block_num
available_kv_cache_memory += model_block_memory_used * self.parallel_config.total_block_num
self.model_runner.clear_block_table()
@@ -154,10 +155,6 @@ class XpuWorker(WorkerBase):
""" """
return True
def cal_theortical_kvcache(self) -> int:
""" """
return self.model_runner.cal_theortical_kvcache()
def reinitialize_kv_cache(self, num_gpu_blocks: int) -> None:
""" """
self.model_runner.update_share_input_block_num(