llm: add FastAPI shim, gateway LLM endpoints, tests, and docs

This commit is contained in:
2026-04-12 09:41:21 +02:00
parent baf497b015
commit 59c9584250
15 changed files with 1779 additions and 11 deletions

View File

@@ -14,6 +14,13 @@ services:
# URLs of sibling services this container talks to.
- QDRANT_SVC_URL=http://qdrant-svc:8000
- CARD_RENDERER_URL=http://card-renderer:8000
- MATURITY_URL=http://maturity:8000
# LLM settings. Each ${VAR:-default} is substituted by Compose from the host
# environment (or .env file); the text after ":-" is used when VAR is unset
# or empty.
# The default URL targets the `llm` service in this file on port 8080.
- LLM_URL=${LLM_URL:-http://llm:8080}
# Disabled by default. The `llm` service in this file sits behind the `llm`
# profile, so it is not started unless that profile is activated.
- LLM_ENABLED=${LLM_ENABLED:-false}
# NOTE(review): unit is presumably seconds — confirm against the consumer.
- LLM_TIMEOUT=${LLM_TIMEOUT:-120}
# The `llm` service reads the same LLM_DEFAULT_MODEL host variable (same
# default) for its LLM_MODEL_NAME, so the two stay in sync.
- LLM_DEFAULT_MODEL=${LLM_DEFAULT_MODEL:-qwen3-1.7b-instruct-q4_k_m}
# NOTE(review): presumably the per-request token default and the upper bound
# enforced on callers — verify in the code that reads these.
- LLM_MAX_TOKENS_DEFAULT=${LLM_MAX_TOKENS_DEFAULT:-256}
- LLM_MAX_TOKENS_HARD_LIMIT=${LLM_MAX_TOKENS_HARD_LIMIT:-1024}
# NOTE(review): presumably a request-body size cap in bytes (65536 = 64 KiB).
- LLM_MAX_REQUEST_BYTES=${LLM_MAX_REQUEST_BYTES:-65536}
- MATURITY_ENABLED=true
# No default here: API_KEY must come from the host environment or .env file,
# otherwise Compose substitutes an empty string.
- API_KEY=${API_KEY}
- VISION_TIMEOUT=300
@@ -151,3 +158,26 @@ services:
retries: 5
start_period: 90s
# Optional LLM backend service.
# Only started when the `llm` profile is activated (for example with
# `docker compose --profile llm up`).
llm:
  profiles:
    - llm
  # Image is built from llm/Dockerfile, with the compose file's directory as
  # the build context.
  build:
    context: .
    dockerfile: llm/Dockerfile
  # Model files come from the host directory, mounted read-only at /models.
  volumes:
    - ./models/qwen3:/models:ro
  # Values are quoted so they always reach the container as strings.
  # ${VAR:-default} falls back to the default when VAR is unset or empty.
  environment:
    MODEL_PATH: '${MODEL_PATH:-/models/Qwen3-1.7B-Instruct-Q4_K_M.gguf}'
    # Reuses the LLM_DEFAULT_MODEL host variable under a different name.
    LLM_MODEL_NAME: '${LLM_DEFAULT_MODEL:-qwen3-1.7b-instruct-q4_k_m}'
    LLM_CONTEXT_SIZE: '${LLM_CONTEXT_SIZE:-4096}'
    LLM_THREADS: '${LLM_THREADS:-4}'
    LLM_GPU_LAYERS: '${LLM_GPU_LAYERS:-0}'
    # Fixed port; the healthcheck below probes the same port.
    LLM_PORT: '8080'
    # Empty string unless LLM_EXTRA_ARGS is set on the host.
    LLM_EXTRA_ARGS: '${LLM_EXTRA_ARGS:-}'
  healthcheck:
    # Exec form: runs curl inside the container against /health.
    # NOTE(review): assumes curl is installed in the image — confirm in
    # llm/Dockerfile.
    test:
      - 'CMD'
      - 'curl'
      - '-fsS'
      - 'http://127.0.0.1:8080/health'
    interval: 30s
    timeout: 10s
    retries: 5
    # Failed probes during the first 120s do not count toward `retries`.
    start_period: 120s