Setup: Update Ollama service examples in compose.alldbms.yaml file

2025-09-27 05:08:13 +08:00 · 2025-09-02 11:15:57 +10:00
parent ed62352090
commit 7856e0ceb8
1 changed files with 16 additions and 15 deletions
--- a/compose.alldbms.yaml
+++ b/compose.alldbms.yaml
@@ -273,21 +273,22 @@ services:
    environment:
      ## Ollama Configuration Options:
      OLLAMA_HOST: "0.0.0.0:11434"
-      OLLAMA_MODELS: "/root/.ollama" # model storage path (see volumes section below)
-      OLLAMA_MAX_QUEUE: "100"        # maximum number of queued requests
-      OLLAMA_NUM_PARALLEL: "1"       # maximum number of parallel requests
-      OLLAMA_MAX_LOADED_MODELS: "1"  # maximum number of loaded models per GPU
-      OLLAMA_LOAD_TIMEOUT: "5m"      # maximum time for loading models (default "5m")
-      OLLAMA_KEEP_ALIVE: "10m"       # duration that models stay loaded in memory (default "5m")
-      OLLAMA_CONTEXT_LENGTH: "4096"  # maximum input context length
-      OLLAMA_MULTIUSER_CACHE: "1"    # optimize prompt caching for multi-user scenarios
-      # OLLAMA_DEBUG: "1"              # shows additional debug information
-      # OLLAMA_NOPRUNE: "1"            # disables pruning of model blobs at startup
-      # OLLAMA_NOHISTORY: "1"          # disables readline history
-      # OLLAMA_FLASH_ATTENTION: "1"    # enables the experimental flash attention feature
-      # OLLAMA_SCHED_SPREAD: "1"       # allows scheduling models across all GPUs.
-      # OLLAMA_GPU_OVERHEAD: "0"       # reserves a portion of VRAM per GPU (bytes)
-      # OLLAMA_INTEL_GPU: "1"          # enables experimental Intel GPU detection
+      OLLAMA_MODELS: "/root/.ollama"   # model storage path (see volumes section below)
+      OLLAMA_MAX_QUEUE: "100"          # maximum number of queued requests
+      OLLAMA_NUM_PARALLEL: "1"         # maximum number of parallel requests
+      OLLAMA_MAX_LOADED_MODELS: "1"    # maximum number of loaded models per GPU
+      OLLAMA_LOAD_TIMEOUT: "5m"        # maximum time for loading models (default "5m")
+      OLLAMA_KEEP_ALIVE: "5m"          # duration that models stay loaded in memory (default "5m")
+      OLLAMA_CONTEXT_LENGTH: "4096"    # context length (in tokens) used unless a request specifies otherwise
+      OLLAMA_MULTIUSER_CACHE: "false"  # optimize prompt caching for multi-user scenarios
+      OLLAMA_NOPRUNE: "false"          # disables pruning of model blobs at startup
+      OLLAMA_NOHISTORY: "true"         # disables readline history
+      OLLAMA_FLASH_ATTENTION: "false"  # enables the experimental flash attention feature
+      OLLAMA_KV_CACHE_TYPE: "f16"      # K/V cache quantization type (f16, q8_0, or q4_0)
+      OLLAMA_SCHED_SPREAD: "false"     # allows scheduling models across all GPUs
+      OLLAMA_NEW_ENGINE: "true"        # enables the new Ollama engine
+      # OLLAMA_DEBUG: "true"             # shows additional debug information
+      # OLLAMA_INTEL_GPU: "true"         # enables experimental Intel GPU detection
      ## NVIDIA GPU Hardware Acceleration (optional):
      # NVIDIA_VISIBLE_DEVICES: "all"
      # NVIDIA_DRIVER_CAPABILITIES: "compute,utility"