Added drop of pending updates on bot start, reset command, AnswerChat method, GPU offload, limit on response length, context reduced to 2048, flash attention, 4 parallel decode queues, and --keep of the original 810 tokens (the starting prompt)

2024-12-26 03:24:56 +01:00
parent 296d150282
commit 4167c75279
5 changed files with 49 additions and 20 deletions
--- a/compose.yaml
+++ b/compose.yaml
@@ -14,7 +14,7 @@
      - ${MODEL_PATH}:/models
    ports:
      - "80:80"
-    command: -m /models/${MODEL_NAME} --port 80 --host 0.0.0.0 -n 512
+    command: -m /models/${MODEL_NAME} --port 80 --host 0.0.0.0 -n 128 -c 2048 --no-mmap -ngl 50 -fa -np 4 --keep 810
    deploy:
      resources:
        reservations: