local-llm-agent/docker-compose.vllm.yml
Adrian Flores Cortes 3def230d58 Initial commit: local-llm-agent infrastructure project
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 16:42:45 -06:00

202 lines
6.1 KiB
YAML

# Local LLM Agent - Docker Compose for vLLM (Production GPU)
# ===========================================================================
#
# This compose uses vLLM for high-performance GPU inference with:
# - Continuous batching
# - Multi-LoRA support
# - Prometheus metrics
#
# Prerequisites:
# - NVIDIA GPU with CUDA support
# - nvidia-container-toolkit installed
# - Run ./scripts/setup-wsl-gpu.sh first
#
# Usage:
# docker-compose -f docker-compose.vllm.yml up -d
#
# ===========================================================================
services:
  # vLLM - High Performance LLM Inference
  vllm:
    image: vllm/vllm-openai:latest
    container_name: local-llm-vllm
    ports:
      - "8000:8000"
    volumes:
      # Model cache (persisted in the named volume below)
      - vllm-cache:/root/.cache/huggingface
      # LoRA adapters, mounted read-only
      - ./lora-adapters:/lora-adapters:ro
    environment:
      - HF_HOME=/root/.cache/huggingface
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
    # Folded scalar: all flags collapse into a single argument string
    # appended to the image's OpenAI-compatible server entrypoint.
    command: >
      --model ${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
      --host 0.0.0.0
      --port 8000
      --gpu-memory-utilization ${VLLM_GPU_MEMORY_UTILIZATION:-0.9}
      --max-model-len ${VLLM_MAX_MODEL_LEN:-8192}
      --tensor-parallel-size ${VLLM_TENSOR_PARALLEL_SIZE:-1}
      --enable-lora
      --max-lora-rank ${VLLM_MAX_LORA_RANK:-64}
      --max-loras ${VLLM_MAX_LORAS:-4}
      --lora-extra-vocab-size ${VLLM_LORA_EXTRA_VOCAB_SIZE:-256}
      --max-num-batched-tokens ${VLLM_MAX_NUM_BATCHED_TOKENS:-4096}
      --max-num-seqs ${VLLM_MAX_NUM_SEQS:-8}
      --enable-prefix-caching
      --disable-log-requests
    # GPU reservation; requires nvidia-container-toolkit on the host.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s  # vLLM needs time to load model
# Inference Engine (Python FastAPI) - vLLM Mode
inference-engine:
build:
context: ./apps/inference-engine
dockerfile: Dockerfile
container_name: local-llm-inference
ports:
- "3161:3161"
environment:
- INFERENCE_PORT=3161
- INFERENCE_HOST=0.0.0.0
- INFERENCE_BACKEND=vllm
- VLLM_HOST=http://vllm:8000
- VLLM_MODEL=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
- VLLM_ENABLE_LORA=true
- VLLM_MAX_LORAS=${VLLM_MAX_LORAS:-4}
- LOG_LEVEL=info
depends_on:
vllm:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3161/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
# API Gateway (NestJS)
gateway:
build:
context: ./apps/gateway
dockerfile: Dockerfile
container_name: local-llm-gateway
ports:
- "3160:3160"
environment:
- NODE_ENV=production
- GATEWAY_PORT=3160
- INFERENCE_HOST=inference-engine
- INFERENCE_PORT=3161
- MODEL_NAME=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
- TIER_SMALL_LATENCY_TARGET_MS=2000
- TIER_MAIN_LATENCY_TARGET_MS=10000
depends_on:
inference-engine:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3160/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
networks:
  llm-network:
    driver: bridge

volumes:
  # Persists downloaded Hugging Face model weights across restarts.
  vllm-cache:
    name: local-llm-vllm-cache
# ============================================================================
# Usage:
# ============================================================================
#
# 1. First, setup WSL GPU (if not done):
# ./scripts/setup-wsl-gpu.sh
#
# 2. Configure model (optional, default is Mistral-7B):
# export VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2
#
# 3. Start the stack:
# docker-compose -f docker-compose.vllm.yml up -d
#
# 4. Monitor vLLM startup (first time downloads model ~15GB):
# docker-compose -f docker-compose.vllm.yml logs -f vllm
#
# 5. Verify health:
# curl http://localhost:8000/health # vLLM direct
# curl http://localhost:3161/health # Inference Engine
# curl http://localhost:3160/health # Gateway
#
# 6. Test inference:
# curl -X POST http://localhost:3160/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","messages":[{"role":"user","content":"Hello!"}]}'
#
# 7. Check GPU usage:
# nvidia-smi
#
# 8. Stop:
# docker-compose -f docker-compose.vllm.yml down
#
# ============================================================================
# LoRA Adapters:
# ============================================================================
#
# Place LoRA adapters in ./lora-adapters/ directory:
#
# lora-adapters/
# ├── erp-core/
# │ ├── adapter_config.json
# │ └── adapter_model.safetensors
# └── trading/
# ├── adapter_config.json
# └── adapter_model.safetensors
#
# Use adapters in requests:
# curl -X POST http://localhost:3160/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","messages":[...],"lora_adapter":"erp-core"}'
#
# ============================================================================
# Recommended Models by VRAM:
# ============================================================================
#
# 8GB VRAM:
# - mistralai/Mistral-7B-Instruct-v0.2 (quantized)
# - TheBloke/Mistral-7B-Instruct-v0.2-GPTQ
#
# 12-16GB VRAM:
# - mistralai/Mistral-7B-Instruct-v0.2
# - codellama/CodeLlama-7b-Instruct-hf
#
# 24GB+ VRAM:
# - mistralai/Mistral-7B-Instruct-v0.2 (multiple LoRA)
# - codellama/CodeLlama-13b-Instruct-hf
# - TheBloke/Llama-2-13B-chat-GPTQ
#
# ============================================================================