# ---- Build stage: compile llama-server from source ----
FROM debian:bookworm-slim AS builder

# Upstream repo and optional ref (branch, tag, or commit) to build from.
ARG LLAMA_CPP_REPO=https://github.com/ggml-org/llama.cpp.git
ARG LLAMA_CPP_REF=

RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /src

# Shallow-clone the repo; if a ref is given, fetch just that ref and check out
# FETCH_HEAD (a bare `git checkout "${LLAMA_CPP_REF}"` can fail on a shallow
# clone because the fetched ref is not created as a local branch or tag).
RUN git clone --depth 1 ${LLAMA_CPP_REPO} llama.cpp \
    && if [ -n "${LLAMA_CPP_REF}" ]; then \
           cd llama.cpp \
           && git fetch --depth 1 origin "${LLAMA_CPP_REF}" \
           && git checkout FETCH_HEAD; \
       fi

WORKDIR /src/llama.cpp

# GGML_NATIVE=OFF builds a portable binary rather than tuning for the build host.
RUN cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON \
    && cmake --build build --config Release --target llama-server -j"$(nproc)"

# ---- Runtime stage: Python app wrapping llama-server ----
FROM python:3.11-slim

# libgomp1 is the OpenMP runtime that llama-server links against.
RUN apt-get update && apt-get install -y --no-install-recommends \
        bash \
        ca-certificates \
        curl \
        libgomp1 \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

COPY llm/requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt

COPY --from=builder /src/llama.cpp/build/bin/llama-server /usr/local/bin/llama-server
COPY llm/main.py /app/main.py
COPY llm/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh /usr/local/bin/llama-server

# Defaults consumed by entrypoint.sh; override at `docker run` time as needed.
# LLM_PORT is the public app port; LLAMA_SERVER_PORT is the internal port the
# bundled llama-server listens on.
ENV MODEL_PATH=/models/Qwen3-1.7B-Instruct-Q4_K_M.gguf \
    LLM_MODEL_NAME=qwen3-1.7b-instruct-q4_k_m \
    LLM_CONTEXT_SIZE=4096 \
    LLM_THREADS=4 \
    LLM_GPU_LAYERS=0 \
    LLM_PORT=8080 \
    LLAMA_SERVER_PORT=8081 \
    LLM_STARTUP_TIMEOUT=120 \
    LLM_EXTRA_ARGS=

EXPOSE 8080

ENTRYPOINT ["/entrypoint.sh"]
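
# A minimal usage sketch, assuming a hypothetical image tag (llm-server) and a
# local ./models directory containing the GGUF file; both names are
# illustrative, not defined by this Dockerfile:
#
#   docker build -t llm-server .
#   docker run --rm -p 8080:8080 \
#       -v "$(pwd)/models:/models:ro" \
#       llm-server
#
# To pin a specific llama.cpp revision at build time:
#
#   docker build --build-arg LLAMA_CPP_REF=<tag-or-commit> -t llm-server .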