mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-25 01:20:43 +08:00
[Feature] Set prefix caching as default (#3814)
* Set prefix caching as default
* Set prefix caching as default
* Set prefix caching as default
* skip dynamic load scene
* fix kill bug
* fix kill bug
* fix kill bug
* fix
* fix
* fix ci
This commit is contained in:
@@ -29,6 +29,7 @@ FD_ENGINE_QUEUE_PORTS = [
|
||||
[9991, 9992],
|
||||
]
|
||||
FD_CACHE_QUEUE_PORT = int(os.getenv("FD_CACHE_QUEUE_PORT", 8333))
|
||||
+ FD_CACHE_QUEUE_PORTS = [FD_CACHE_QUEUE_PORT, FD_CACHE_QUEUE_PORT + 1, FD_CACHE_QUEUE_PORT + 2, FD_CACHE_QUEUE_PORT + 3]
|
||||
|
||||
|
||||
models = [
|
||||
@@ -54,7 +55,7 @@ def llm(request):
|
||||
max_model_len=8192,
|
||||
num_gpu_blocks_override=1024,
|
||||
engine_worker_queue_port=FD_ENGINE_QUEUE_PORTS[port_index],
|
||||
- cache_queue_port=FD_CACHE_QUEUE_PORT,
|
||||
+ cache_queue_port=FD_CACHE_QUEUE_PORTS[port_index],
|
||||
load_choices="default",
|
||||
enable_expert_parallel=True,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user