llm: add FastAPI shim, gateway LLM endpoints, tests, and docs
This commit is contained in:
14
.env.example
14
.env.example
@@ -7,6 +7,13 @@ CLIP_URL=http://clip:8000
BLIP_URL=http://blip:8000
YOLO_URL=http://yolo:8000
QDRANT_SVC_URL=http://qdrant-svc:8000
LLM_URL=http://llm:8080
LLM_ENABLED=false
LLM_TIMEOUT=120
LLM_DEFAULT_MODEL=qwen3-1.7b-instruct-q4_k_m
LLM_MAX_TOKENS_DEFAULT=256
LLM_MAX_TOKENS_HARD_LIMIT=1024
LLM_MAX_REQUEST_BYTES=65536

# HuggingFace token for private/gated models (optional). Leave empty if unused.
# Never commit a real token to this file.
@@ -21,3 +28,10 @@ VECTOR_DIM=512

# Gateway runtime
VISION_TIMEOUT=300
MAX_IMAGE_BYTES=52428800

# Local llama.cpp LLM service (only needed when you run the llm profile locally)
MODEL_PATH=/models/Qwen3-1.7B-Instruct-Q4_K_M.gguf
LLM_CONTEXT_SIZE=4096
LLM_THREADS=4
LLM_GPU_LAYERS=0
LLM_EXTRA_ARGS=
Reference in New Issue
Block a user