# Source: local-llm-agent/docker-compose.prod.yml
# Commit: 3def230d58 — Initial commit: local-llm-agent infrastructure project
# Author: Adrian Flores Cortes (Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>)
# Date: 2026-02-02 16:42:45 -06:00
# Local LLM Agent - Production Docker Compose
# ===========================================================================
#
# Complete production stack with:
# - vLLM GPU inference
# - NestJS Gateway
# - Python Inference Engine
# - Prometheus + Grafana monitoring
#
# Prerequisites:
# - NVIDIA GPU with CUDA support
# - nvidia-container-toolkit installed
# - Run ./scripts/setup-wsl-gpu.sh first
#
# Usage:
# docker-compose -f docker-compose.prod.yml up -d
#
# ===========================================================================
services:
  # ==========================================================================
  # LLM Backend - vLLM
  #
  # OpenAI-compatible GPU inference server. Serves the base model plus
  # dynamically loadable LoRA adapters mounted from ./lora-adapters.
  # ==========================================================================
  vllm:
    # NOTE(review): `latest` is an unpinned tag — pin a specific vLLM release
    # (e.g. vllm/vllm-openai:v0.x.y) for reproducible production deploys.
    image: vllm/vllm-openai:latest
    container_name: local-llm-vllm
    ports:
      - "8000:8000"
    volumes:
      # Persist downloaded model weights across container restarts.
      - vllm-cache:/root/.cache/huggingface
      # LoRA adapters, mounted read-only.
      - ./lora-adapters:/lora-adapters:ro
    environment:
      - HF_HOME=/root/.cache/huggingface
      # Needed only for gated models; defaults to empty if unset.
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
    # All tunables are overridable via the environment (see the variable
    # reference at the bottom of this file).
    command: >
      --model ${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
      --host 0.0.0.0
      --port 8000
      --gpu-memory-utilization ${VLLM_GPU_MEMORY_UTILIZATION:-0.9}
      --max-model-len ${VLLM_MAX_MODEL_LEN:-8192}
      --tensor-parallel-size ${VLLM_TENSOR_PARALLEL_SIZE:-1}
      --enable-lora
      --max-lora-rank ${VLLM_MAX_LORA_RANK:-64}
      --max-loras ${VLLM_MAX_LORAS:-4}
      --lora-extra-vocab-size ${VLLM_LORA_EXTRA_VOCAB_SIZE:-256}
      --max-num-batched-tokens ${VLLM_MAX_NUM_BATCHED_TOKENS:-4096}
      --max-num-seqs ${VLLM_MAX_NUM_SEQS:-8}
      --enable-prefix-caching
      --disable-log-requests
    deploy:
      resources:
        reservations:
          devices:
            # Requires nvidia-container-toolkit on the host.
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 180s  # vLLM needs time to download/load the model
# ==========================================================================
# Inference Engine - Python FastAPI
# ==========================================================================
inference-engine:
build:
context: ./apps/inference-engine
dockerfile: Dockerfile
container_name: local-llm-inference
ports:
- "3161:3161"
environment:
- INFERENCE_PORT=3161
- INFERENCE_HOST=0.0.0.0
- INFERENCE_BACKEND=vllm
- VLLM_HOST=http://vllm:8000
- VLLM_MODEL=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
- VLLM_ENABLE_LORA=true
- VLLM_MAX_LORAS=${VLLM_MAX_LORAS:-4}
- VLLM_MAX_NUM_BATCHED_TOKENS=${VLLM_MAX_NUM_BATCHED_TOKENS:-4096}
- VLLM_MAX_NUM_SEQS=${VLLM_MAX_NUM_SEQS:-8}
- LOG_LEVEL=info
- SERVICE_VERSION=0.5.0
volumes:
- ./lora-adapters:/lora-adapters:ro
depends_on:
vllm:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3161/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
# ==========================================================================
# API Gateway - NestJS
# ==========================================================================
gateway:
build:
context: ./apps/gateway
dockerfile: Dockerfile
container_name: local-llm-gateway
ports:
- "3160:3160"
environment:
- NODE_ENV=production
- GATEWAY_PORT=3160
- INFERENCE_HOST=inference-engine
- INFERENCE_PORT=3161
- MODEL_NAME=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
- TIER_SMALL_MAX_TOKENS=512
- TIER_SMALL_MAX_CONTEXT=4096
- TIER_SMALL_LATENCY_TARGET_MS=2000
- TIER_MAIN_MAX_TOKENS=2048
- TIER_MAIN_MAX_CONTEXT=16384
- TIER_MAIN_LATENCY_TARGET_MS=10000
- TIER_LARGE_MAX_TOKENS=4096
- TIER_LARGE_MAX_CONTEXT=32768
- TIER_LARGE_LATENCY_TARGET_MS=30000
volumes:
- ./config:/app/config:ro
depends_on:
inference-engine:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3160/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
# ==========================================================================
# Monitoring - Prometheus
# ==========================================================================
prometheus:
image: prom/prometheus:v2.48.0
container_name: local-llm-prometheus
ports:
- "9090:9090"
volumes:
- ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus-data:/prometheus
command:
- '--config.file=/etc/prometheus/prometheus.yml'
- '--storage.tsdb.path=/prometheus'
- '--storage.tsdb.retention.time=30d'
- '--web.enable-lifecycle'
- '--web.enable-admin-api'
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
interval: 30s
timeout: 10s
retries: 3
# ==========================================================================
# Monitoring - Grafana
# ==========================================================================
grafana:
image: grafana/grafana:10.2.0
container_name: local-llm-grafana
ports:
- "3000:3000"
environment:
- GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER:-admin}
- GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD:-admin}
- GF_USERS_ALLOW_SIGN_UP=false
- GF_SERVER_ROOT_URL=${GRAFANA_ROOT_URL:-http://localhost:3000}
- GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/local-llm.json
volumes:
- grafana-data:/var/lib/grafana
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
- ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
depends_on:
prometheus:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
interval: 30s
timeout: 10s
retries: 3
# ==========================================================================
# System Metrics - Node Exporter
# ==========================================================================
node-exporter:
image: prom/node-exporter:v1.7.0
container_name: local-llm-node-exporter
ports:
- "9100:9100"
volumes:
- /proc:/host/proc:ro
- /sys:/host/sys:ro
- /:/rootfs:ro
command:
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
networks:
- llm-network
restart: unless-stopped
# Shared bridge network for inter-service DNS (vllm, inference-engine, ...).
networks:
  llm-network:
    driver: bridge

# Named volumes; explicit `name:` keeps them stable across project renames.
volumes:
  # Hugging Face model cache — avoids re-downloading weights on restart.
  vllm-cache:
    name: local-llm-vllm-cache
  prometheus-data:
    name: local-llm-prometheus-data
  grafana-data:
    name: local-llm-grafana-data
# ============================================================================
# Usage Guide
# ============================================================================
#
# 1. Prerequisites:
# ./scripts/setup-wsl-gpu.sh
#
# 2. Configure environment (.env.production):
# cp .env.example .env.production
# # Edit .env.production with your settings
#
# 3. Start production stack:
# docker-compose -f docker-compose.prod.yml --env-file .env.production up -d
#
# 4. Check status:
# docker-compose -f docker-compose.prod.yml ps
#
# 5. View logs:
# docker-compose -f docker-compose.prod.yml logs -f
#
# 6. Monitor vLLM model loading (first time):
# docker-compose -f docker-compose.prod.yml logs -f vllm
#
# 7. Access services:
# - Gateway API: http://localhost:3160
# - Inference Engine: http://localhost:3161
# - vLLM Direct: http://localhost:8000
# - Grafana: http://localhost:3000 (admin/admin)
# - Prometheus: http://localhost:9090
#
# 8. Health checks:
# curl http://localhost:3160/health
# curl http://localhost:3161/health
# curl http://localhost:8000/health
#
# 9. Test inference:
# curl -X POST http://localhost:3160/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","messages":[{"role":"user","content":"Hello!"}]}'
#
# 10. Stop stack:
# docker-compose -f docker-compose.prod.yml down
#
# ============================================================================
# Environment Variables
# ============================================================================
#
# Required:
# HUGGING_FACE_HUB_TOKEN - For gated model access
#
# Optional (with defaults):
# VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2
# VLLM_GPU_MEMORY_UTILIZATION=0.9
# VLLM_MAX_MODEL_LEN=8192
# VLLM_TENSOR_PARALLEL_SIZE=1
# VLLM_MAX_LORA_RANK=64
# VLLM_MAX_LORAS=4
# VLLM_MAX_NUM_BATCHED_TOKENS=4096
# VLLM_MAX_NUM_SEQS=8
# GRAFANA_ADMIN_USER=admin
# GRAFANA_ADMIN_PASSWORD=admin
#
# ============================================================================