llm: add FastAPI shim, gateway LLM endpoints, tests, and docs
This commit is contained in:
14
.env.example
14
.env.example
@@ -7,6 +7,13 @@ CLIP_URL=http://clip:8000
BLIP_URL=http://blip:8000
YOLO_URL=http://yolo:8000
QDRANT_SVC_URL=http://qdrant-svc:8000
LLM_URL=http://llm:8080
LLM_ENABLED=false
LLM_TIMEOUT=120
LLM_DEFAULT_MODEL=qwen3-1.7b-instruct-q4_k_m
LLM_MAX_TOKENS_DEFAULT=256
LLM_MAX_TOKENS_HARD_LIMIT=1024
LLM_MAX_REQUEST_BYTES=65536

# HuggingFace token for private/gated models (optional). Leave empty if unused.
# Never commit a real token to this file.
@@ -21,3 +28,10 @@ VECTOR_DIM=512

# Gateway runtime
VISION_TIMEOUT=300
MAX_IMAGE_BYTES=52428800

# Local llama.cpp LLM service (only needed when you run the llm profile locally)
MODEL_PATH=/models/Qwen3-1.7B-Instruct-Q4_K_M.gguf
LLM_CONTEXT_SIZE=4096
LLM_THREADS=4
LLM_GPU_LAYERS=0
LLM_EXTRA_ARGS=
Reference in New Issue
Block a user