# Source: local-llm-agent/docker-compose.prod.yml
# Commit: 3def230d58 — Initial commit: local-llm-agent infrastructure project
# Author: Adrian Flores Cortes (Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>)
# Date: 2026-02-02 16:42:45 -06:00
# Local LLM Agent - Production Docker Compose
# ===========================================================================
#
# Complete production stack with:
# - vLLM GPU inference
# - NestJS Gateway
# - Python Inference Engine
# - Prometheus + Grafana monitoring
#
# Prerequisites:
# - NVIDIA GPU with CUDA support
# - nvidia-container-toolkit installed
# - Run ./scripts/setup-wsl-gpu.sh first
#
# Usage:
# docker-compose -f docker-compose.prod.yml up -d
#
# ===========================================================================
services:
  # ==========================================================================
  # LLM Backend - vLLM
  #
  # OpenAI-compatible GPU inference server. Serves the base model plus
  # dynamically loadable LoRA adapters mounted from ./lora-adapters.
  # ==========================================================================
  vllm:
    # NOTE(review): `latest` is an unpinned tag — pin a specific vLLM release
    # (e.g. vllm/vllm-openai:v0.x.y) for reproducible production deploys.
    image: vllm/vllm-openai:latest
    container_name: local-llm-vllm
    ports:
      - "8000:8000"
    volumes:
      # Persist downloaded model weights across container restarts.
      - vllm-cache:/root/.cache/huggingface
      # LoRA adapters, mounted read-only.
      - ./lora-adapters:/lora-adapters:ro
    environment:
      - HF_HOME=/root/.cache/huggingface
      # Needed only for gated models; defaults to empty if unset.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
    # All tunables are overridable via the environment (see the variable
    # reference at the bottom of this file).
    command: >
      --model ${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
      --host 0.0.0.0
      --port 8000
      --gpu-memory-utilization ${VLLM_GPU_MEMORY_UTILIZATION:-0.9}
      --max-model-len ${VLLM_MAX_MODEL_LEN:-8192}
      --tensor-parallel-size ${VLLM_TENSOR_PARALLEL_SIZE:-1}
      --enable-lora
      --max-lora-rank ${VLLM_MAX_LORA_RANK:-64}
      --max-loras ${VLLM_MAX_LORAS:-4}
      --lora-extra-vocab-size ${VLLM_LORA_EXTRA_VOCAB_SIZE:-256}
      --max-num-batched-tokens ${VLLM_MAX_NUM_BATCHED_TOKENS:-4096}
      --max-num-seqs ${VLLM_MAX_NUM_SEQS:-8}
      --enable-prefix-caching
      --disable-log-requests
    deploy:
      resources:
        reservations:
          devices:
            # Requires nvidia-container-toolkit on the host.
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 180s  # vLLM needs time to download/load the model
# ==========================================================================
# Inference Engine - Python FastAPI
# ==========================================================================
inference-engine:
build:
context: ./apps/inference-engine
dockerfile: Dockerfile
container_name: local-llm-inference
ports:
- "3161:3161"
environment:
- INFERENCE_PORT=3161
- INFERENCE_HOST=0.0.0.0
- INFERENCE_BACKEND=vllm
- VLLM_HOST=http://vllm:8000
- VLLM_MODEL=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
- VLLM_ENABLE_LORA=true
- VLLM_MAX_LORAS=${VLLM_MAX_LORAS:-4}
- VLLM_MAX_NUM_BATCHED_TOKENS=${VLLM_MAX_NUM_BATCHED_TOKENS:-4096}
- VLLM_MAX_NUM_SEQS=${VLLM_MAX_NUM_SEQS:-8}
- LOG_LEVEL=info
- SERVICE_VERSION=0.5.0
volumes:
- ./lora-adapters:/lora-adapters:ro
depends_on:
vllm:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3161/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
# ==========================================================================
# API Gateway - NestJS
# ==========================================================================
gateway:
build:
context: ./apps/gateway
dockerfile: Dockerfile
container_name: local-llm-gateway
ports:
- "3160:3160"
environment:
- NODE_ENV=production
- GATEWAY_PORT=3160
- INFERENCE_HOST=inference-engine
- INFERENCE_PORT=3161
- MODEL_NAME=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
- TIER_SMALL_MAX_TOKENS=512
- TIER_SMALL_MAX_CONTEXT=4096
- TIER_SMALL_LATENCY_TARGET_MS=2000
- TIER_MAIN_MAX_TOKENS=2048
- TIER_MAIN_MAX_CONTEXT=16384
- TIER_MAIN_LATENCY_TARGET_MS=10000
- TIER_LARGE_MAX_TOKENS=4096
- TIER_LARGE_MAX_CONTEXT=32768
- TIER_LARGE_LATENCY_TARGET_MS=30000
volumes:
- ./config:/app/config:ro
depends_on:
inference-engine:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3160/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
# ==========================================================================
# Monitoring - Prometheus
# ==========================================================================
prometheus:
image: prom/prometheus:v2.48.0
container_name: local-llm-prometheus
ports:
- "9090:9090"
volumes:
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus-data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=30d'
- '--web.enable-lifecycle'
- '--web.enable-admin-api'
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 10s
retries: 3
# ==========================================================================
# Monitoring - Grafana
# ==========================================================================
grafana:
image: grafana/grafana:10.2.0
container_name: local-llm-grafana
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
- GF_USERS_ALLOW_SIGN_UP=false
- GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3000}
- GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/local-llm.json
volumes:
- grafana-data:/var/lib/grafana
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
- ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
depends_on:
prometheus:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
interval: 30s
timeout: 10s
retries: 3
# ==========================================================================
# System Metrics - Node Exporter
# ==========================================================================
node-exporter:
image: prom/node-exporter:v1.7.0
container_name: local-llm-node-exporter
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
networks:
- llm-network
restart: unless-stopped
# Shared bridge network for inter-service DNS (vllm, inference-engine, ...).
networks:
  llm-network:
    driver: bridge

# Named volumes; explicit `name:` keeps them stable across project renames.
volumes:
  # Hugging Face model cache — avoids re-downloading weights on restart.
  vllm-cache:
    name: local-llm-vllm-cache
  prometheus-data:
    name: local-llm-prometheus-data
  grafana-data:
    name: local-llm-grafana-data
# ============================================================================
# Usage Guide
# ============================================================================
#
# 1. Prerequisites:
# ./scripts/setup-wsl-gpu.sh
#
# 2. Configure environment (.env.production):
# cp .env.example .env.production
# # Edit .env.production with your settings
#
# 3. Start production stack:
# docker-compose -f docker-compose.prod.yml --env-file .env.production up -d
#
# 4. Check status:
# docker-compose -f docker-compose.prod.yml ps
#
# 5. View logs:
# docker-compose -f docker-compose.prod.yml logs -f
#
# 6. Monitor vLLM model loading (first time):
# docker-compose -f docker-compose.prod.yml logs -f vllm
#
# 7. Access services:
# - Gateway API: http://localhost:3160
# - Inference Engine: http://localhost:3161
# - vLLM Direct: http://localhost:8000
# - Grafana: http://localhost:3000 (admin/admin)
# - Prometheus: http://localhost:9090
#
# 8. Health checks:
# curl http://localhost:3160/health
# curl http://localhost:3161/health
# curl http://localhost:8000/health
#
# 9. Test inference:
# curl -X POST http://localhost:3160/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","messages":[{"role":"user","content":"Hello!"}]}'
#
# 10. Stop stack:
# docker-compose -f docker-compose.prod.yml down
#
# ============================================================================
# Environment Variables
# ============================================================================
#
# Required:
# HUGGING_FACE_HUB_TOKEN - For gated model access
#
# Optional (with defaults):
# VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2
# VLLM_GPU_MEMORY_UTILIZATION=0.9
# VLLM_MAX_MODEL_LEN=8192
# VLLM_TENSOR_PARALLEL_SIZE=1
# VLLM_MAX_LORA_RANK=64
# VLLM_MAX_LORAS=4
# VLLM_MAX_NUM_BATCHED_TOKENS=4096
# VLLM_MAX_NUM_SEQS=8
# GRAFANA_ADMIN_USER=admin
# GRAFANA_ADMIN_PASSWORD=admin
#
# ============================================================================