llm: add FastAPI shim, gateway LLM endpoints, tests, and docs
This commit is contained in:
New file: tests/test_llm_service.py (79 lines added)
@@ -0,0 +1,79 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import os
|
||||
import unittest
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import patch
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
# Environment shared by every test: points the service at a local GGUF model
# and pins ports/tuning knobs so the /health assertions are deterministic.
BASE_ENV = dict(
    MODEL_PATH="D:/Sites/vision/models/qwen3/Qwen3-1.7B-Instruct-Q4_K_M.gguf",
    LLM_MODEL_NAME="qwen3-1.7b-instruct-q4_k_m",
    LLM_CONTEXT_SIZE="4096",
    LLM_THREADS="4",
    LLM_GPU_LAYERS="0",
    LLM_PORT="8080",
    LLAMA_SERVER_PORT="8081",
)
|
||||
|
||||
|
||||
def load_llm_module():
    """Import (or re-import) ``llm.main`` with the test environment applied.

    ``llm.main`` reads its configuration from ``os.environ`` at import time,
    so both the initial import *and* the ``importlib.reload`` must execute
    while ``BASE_ENV`` is patched in.  The reload is therefore kept inside
    the ``with`` block — ``patch.dict`` restores the environment on exit,
    and a reload performed after that would re-read the unpatched values.

    Returns:
        The freshly reloaded ``llm.main`` module object.
    """
    with patch.dict(os.environ, BASE_ENV, clear=False):
        import llm.main as llm_main

        return importlib.reload(llm_main)
|
||||
|
||||
|
||||
class StubHTTPClient:
    """Minimal async stand-in for an HTTP client.

    Every call to :meth:`get` resolves to the single canned response supplied
    at construction time, regardless of URL or keyword arguments.
    """

    def __init__(self, response: httpx.Response):
        # The one response that every request will receive.
        self.response = response

    async def get(self, *_ignored_args, **_ignored_kwargs):
        """Mirror ``httpx.AsyncClient.get`` but ignore all arguments."""
        return self.response
|
||||
|
||||
|
||||
class LLMServiceTests(unittest.IsolatedAsyncioTestCase):
    """Exercise the repo-owned /health contract of the LLM gateway app."""

    @staticmethod
    def _models_stub() -> StubHTTPClient:
        # Fake upstream llama server answering /v1/models with an empty list.
        request = httpx.Request("GET", "http://127.0.0.1:8081/v1/models")
        payload = {"object": "list", "data": []}
        return StubHTTPClient(httpx.Response(200, json=payload, request=request))

    @staticmethod
    async def _get_health(module) -> httpx.Response:
        # Drive the FastAPI app in-process over ASGI — no real sockets.
        transport = httpx.ASGITransport(app=module.app)
        async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client:
            return await client.get("/health")

    async def test_health_returns_repo_owned_contract(self):
        module = load_llm_module()
        # poll() -> None signals the llama subprocess is still running.
        module._llama_process = SimpleNamespace(poll=lambda: None)
        module._http_client = self._models_stub()

        response = await self._get_health(module)

        self.assertEqual(response.status_code, 200)
        self.assertEqual(
            response.json(),
            {
                "status": "ok",
                "model": "Qwen3-1.7B-Instruct-Q4_K_M.gguf",
                "model_alias": "qwen3-1.7b-instruct-q4_k_m",
                "context_size": 4096,
                "threads": 4,
                "gpu_layers": 0,
            },
        )

    async def test_health_reports_unavailable_when_process_is_down(self):
        module = load_llm_module()
        # poll() -> 1 signals the llama subprocess has exited.
        module._llama_process = SimpleNamespace(poll=lambda: 1)
        module._http_client = self._models_stub()

        response = await self._get_health(module)

        self.assertEqual(response.status_code, 503)
        self.assertEqual(response.json()["status"], "unavailable")
|
||||
|
||||
|
||||
# Allow running this test module directly: python tests/test_llm_service.py
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user