# Local LLM Agent - Production Docker Compose
# ===========================================================================
#
# Complete production stack with:
# - vLLM GPU inference
# - NestJS Gateway
# - Python Inference Engine
# - Prometheus + Grafana monitoring
#
# Prerequisites:
# - NVIDIA GPU with CUDA support
# - nvidia-container-toolkit installed
# - Run ./scripts/setup-wsl-gpu.sh first
#
# Usage:
#   docker-compose -f docker-compose.prod.yml up -d
#
# ===========================================================================

services:
  # ==========================================================================
  # LLM Backend - vLLM
  # ==========================================================================
  vllm:
    image: vllm/vllm-openai:latest
    container_name: local-llm-vllm
    ports:
      - "8000:8000"
    volumes:
      - vllm-cache:/root/.cache/huggingface
      - ./lora-adapters:/lora-adapters:ro
    environment:
      - HF_HOME=/root/.cache/huggingface
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
    # Folded block scalar: the whole command is one logical line of CLI args.
    command: >
      --model ${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
      --host 0.0.0.0
      --port 8000
      --gpu-memory-utilization ${VLLM_GPU_MEMORY_UTILIZATION:-0.9}
      --max-model-len ${VLLM_MAX_MODEL_LEN:-8192}
      --tensor-parallel-size ${VLLM_TENSOR_PARALLEL_SIZE:-1}
      --enable-lora
      --max-lora-rank ${VLLM_MAX_LORA_RANK:-64}
      --max-loras ${VLLM_MAX_LORAS:-4}
      --lora-extra-vocab-size ${VLLM_LORA_EXTRA_VOCAB_SIZE:-256}
      --max-num-batched-tokens ${VLLM_MAX_NUM_BATCHED_TOKENS:-4096}
      --max-num-seqs ${VLLM_MAX_NUM_SEQS:-8}
      --enable-prefix-caching
      --disable-log-requests
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 180s  # vLLM needs time to load model

  # ==========================================================================
  # Inference Engine - Python FastAPI
  # ==========================================================================
  inference-engine:
    build:
      context: ./apps/inference-engine
      dockerfile: Dockerfile
    container_name: local-llm-inference
    ports:
      - "3161:3161"
    environment:
      - INFERENCE_PORT=3161
      - INFERENCE_HOST=0.0.0.0
      - INFERENCE_BACKEND=vllm
      - VLLM_HOST=http://vllm:8000
      - VLLM_MODEL=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
      - VLLM_ENABLE_LORA=true
      - VLLM_MAX_LORAS=${VLLM_MAX_LORAS:-4}
      - VLLM_MAX_NUM_BATCHED_TOKENS=${VLLM_MAX_NUM_BATCHED_TOKENS:-4096}
      - VLLM_MAX_NUM_SEQS=${VLLM_MAX_NUM_SEQS:-8}
      - LOG_LEVEL=info
      - SERVICE_VERSION=0.5.0
    volumes:
      - ./lora-adapters:/lora-adapters:ro
    depends_on:
      vllm:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3161/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 15s

  # ==========================================================================
  # API Gateway - NestJS
  # ==========================================================================
  gateway:
    build:
      context: ./apps/gateway
      dockerfile: Dockerfile
    container_name: local-llm-gateway
    ports:
      - "3160:3160"
    environment:
      - NODE_ENV=production
      - GATEWAY_PORT=3160
      - INFERENCE_HOST=inference-engine
      - INFERENCE_PORT=3161
      - MODEL_NAME=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
      - TIER_SMALL_MAX_TOKENS=512
      - TIER_SMALL_MAX_CONTEXT=4096
      - TIER_SMALL_LATENCY_TARGET_MS=2000
      - TIER_MAIN_MAX_TOKENS=2048
      - TIER_MAIN_MAX_CONTEXT=16384
      - TIER_MAIN_LATENCY_TARGET_MS=10000
      - TIER_LARGE_MAX_TOKENS=4096
      - TIER_LARGE_MAX_CONTEXT=32768
      - TIER_LARGE_LATENCY_TARGET_MS=30000
    volumes:
      - ./config:/app/config:ro
    depends_on:
      inference-engine:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3160/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 15s

  # ==========================================================================
  # Monitoring - Prometheus
  # ==========================================================================
  prometheus:
    image: prom/prometheus:v2.48.0
    container_name: local-llm-prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      - '--web.enable-lifecycle'
      - '--web.enable-admin-api'
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ==========================================================================
  # Monitoring - Grafana
  # ==========================================================================
  grafana:
    image: grafana/grafana:10.2.0
    container_name: local-llm-grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3000}
      - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/local-llm.json
    volumes:
      - grafana-data:/var/lib/grafana
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    depends_on:
      prometheus:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # ==========================================================================
  # System Metrics - Node Exporter
  # ==========================================================================
  node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: local-llm-node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # Without --path.rootfs the filesystem collector would report the
      # container's own rootfs instead of the host mount bound at /rootfs.
      - '--path.rootfs=/rootfs'
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    networks:
      - llm-network
    restart: unless-stopped

networks:
  llm-network:
    driver: bridge

volumes:
  vllm-cache:
    name: local-llm-vllm-cache
  prometheus-data:
    name: local-llm-prometheus-data
  grafana-data:
    name: local-llm-grafana-data

# ============================================================================
# Usage Guide
# ============================================================================
#
# 1. Prerequisites:
#    ./scripts/setup-wsl-gpu.sh
#
# 2. Configure environment (.env.production):
#    cp .env.example .env.production
#    # Edit .env.production with your settings
#
# 3. Start production stack:
#    docker-compose -f docker-compose.prod.yml --env-file .env.production up -d
#
# 4. Check status:
#    docker-compose -f docker-compose.prod.yml ps
#
# 5. View logs:
#    docker-compose -f docker-compose.prod.yml logs -f
#
# 6. Monitor vLLM model loading (first time):
#    docker-compose -f docker-compose.prod.yml logs -f vllm
#
# 7. Access services:
#    - Gateway API:       http://localhost:3160
#    - Inference Engine:  http://localhost:3161
#    - vLLM Direct:       http://localhost:8000
#    - Grafana:           http://localhost:3000 (admin/admin)
#    - Prometheus:        http://localhost:9090
#
# 8. Health checks:
#    curl http://localhost:3160/health
#    curl http://localhost:3161/health
#    curl http://localhost:8000/health
#
# 9. Test inference:
#    curl -X POST http://localhost:3160/v1/chat/completions \
#      -H "Content-Type: application/json" \
#      -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","messages":[{"role":"user","content":"Hello!"}]}'
#
# 10. Stop stack:
#     docker-compose -f docker-compose.prod.yml down
#
# ============================================================================
# Environment Variables
# ============================================================================
#
# Required:
#   HUGGING_FACE_HUB_TOKEN - For gated model access
#
# Optional (with defaults):
#   VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2
#   VLLM_GPU_MEMORY_UTILIZATION=0.9
#   VLLM_MAX_MODEL_LEN=8192
#   VLLM_TENSOR_PARALLEL_SIZE=1
#   VLLM_MAX_LORA_RANK=64
#   VLLM_MAX_LORAS=4
#   VLLM_MAX_NUM_BATCHED_TOKENS=4096
#   VLLM_MAX_NUM_SEQS=8
#   GRAFANA_ADMIN_USER=admin
#   GRAFANA_ADMIN_PASSWORD=admin
#
# ============================================================================