Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[LLM] fix several bugs (#2878)
@@ -854,6 +854,11 @@ class Config:
                 self.max_num_batched_tokens >= self.max_model_len
             ), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \
                 f"should be larger than or equal to max_model_len: {self.max_model_len}"
+        else:
+            assert (
+                self.max_num_batched_tokens >= self.cache_config.block_size
+            ), f"max_num_batched_tokens: {self.max_num_batched_tokens} " \
+                f"should be larger than or equal to block_size: {self.cache_config.block_size}"
 
         if self.max_num_partial_prefills > 1:
             assert (self.cache_config.enable_chunked_prefill is True), \
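Review note on the Config hunk: when chunked prefill is disabled, a single prefill pass must cover the whole prompt, so max_num_batched_tokens has to reach max_model_len; once chunked prefill is enabled, a long prompt is split across passes and the budget only needs to cover one KV-cache block. A minimal, self-contained sketch of that branching check (the CacheConfig class and check_batched_tokens helper below are illustrative stand-ins, not FastDeploy APIs):

    # Hypothetical stand-ins; only the branching logic mirrors the diff above.
    class CacheConfig:
        def __init__(self, block_size=64, enable_chunked_prefill=False):
            self.block_size = block_size
            self.enable_chunked_prefill = enable_chunked_prefill

    def check_batched_tokens(max_num_batched_tokens, max_model_len, cache_config):
        if not cache_config.enable_chunked_prefill:
            # Without chunking, one prefill pass must fit the longest prompt.
            assert max_num_batched_tokens >= max_model_len, (
                f"max_num_batched_tokens: {max_num_batched_tokens} "
                f"should be larger than or equal to max_model_len: {max_model_len}")
        else:
            # With chunking, each pass only needs to fill one cache block.
            assert max_num_batched_tokens >= cache_config.block_size, (
                f"max_num_batched_tokens: {max_num_batched_tokens} "
                f"should be larger than or equal to block_size: {cache_config.block_size}")

For example, check_batched_tokens(64, 2048, CacheConfig(enable_chunked_prefill=True)) passes, while the same call with chunked prefill disabled trips the first assertion.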
@@ -134,6 +134,7 @@ class LLMEngine(object):
         for idx in range(1, self.cfg.max_num_partial_prefills + 1):
             self.partial_chunked_tokens[idx] = (self.cfg.max_num_batched_tokens // idx) \
                 // self.cfg.cache_config.block_size * self.cfg.cache_config.block_size
+            self.partial_chunked_tokens[idx] = max(1, self.partial_chunked_tokens[idx])
 
         self._finalizer = weakref.finalize(self, self._exit_sub_services)
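Review note on the LLMEngine hunk: partial_chunked_tokens[idx] is the per-request token budget when idx partial prefills share one batch, rounded down to a multiple of block_size. For large idx that floor can reach zero, which would leave prefill unable to make progress, so the new line clamps it to at least one token. A worked example with made-up numbers:

    # Illustrative values only: a 384-token batch budget and 64-token blocks.
    max_num_batched_tokens, block_size = 384, 64
    for idx in range(1, 8):
        chunk = (max_num_batched_tokens // idx) // block_size * block_size
        print(idx, max(1, chunk))
    # idx 1..6 yield 384, 192, 128, 64, 64, 64; at idx=7, 384 // 7 = 54
    # floors to 0 blocks, so without the max(1, ...) clamp the chunk size
    # would be 0 tokens.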
@@ -394,6 +394,18 @@ class PaddleDisWorkerProc():
                 time.sleep(0.01)
             num_blocks_global = self.get_profile_block_num_signal.value.min(
             ).item()
+
+            if num_blocks_global < 0:
+                logger.error(
+                    "The total number of blocks cannot be less than zero. "
+                    "Please increase gpu_memory_utilization "
+                    "or decrease max_num_batched_tokens (max model length).")
+                raise ValueError(
+                    "The total number of blocks cannot be less than zero. "
+                    "Please increase gpu_memory_utilization "
+                    "or decrease max_num_batched_tokens (max model length).")
+
             self.get_profile_block_num_signal.value[
                 self.local_rank] = num_blocks_global
         else:
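Review note on the worker hunk: get_profile_block_num_signal.value holds each rank's profiled KV-cache block count, and taking .min() sizes every rank's cache to the most constrained GPU; the new guard turns a negative profile result into an immediate, explained failure instead of a later allocation crash. A numpy-based sketch of the reduction and guard (the array values are invented, and the real signal lives in cross-process shared memory, not a local array):

    import numpy as np

    # Hypothetical per-rank profile results; rank 2 ran out of memory.
    profiled = np.array([8124, 7980, -1, 8210])
    num_blocks_global = profiled.min().item()  # every rank must use the minimum
    if num_blocks_global < 0:
        raise ValueError(
            "The total number of blocks cannot be less than zero. "
            "Please increase gpu_memory_utilization "
            "or decrease max_num_batched_tokens (max model length).")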