local-llm-agent/docker-compose.vllm.yml
Adrian Flores Cortes 3def230d58 Initial commit: local-llm-agent infrastructure project
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 16:42:45 -06:00

202 lines
6.1 KiB
YAML

# Local LLM Agent - Docker Compose for vLLM (Production GPU)
# ===========================================================================
#
# This compose uses vLLM for high-performance GPU inference with:
# - Continuous batching
# - Multi-LoRA support
# - Prometheus metrics
#
# Prerequisites:
# - NVIDIA GPU with CUDA support
# - nvidia-container-toolkit installed
# - Run ./scripts/setup-wsl-gpu.sh first
#
# Usage:
# docker-compose -f docker-compose.vllm.yml up -d
#
# ===========================================================================
services:
  # vLLM - High Performance LLM Inference
  vllm:
    image: vllm/vllm-openai:latest
    container_name: local-llm-vllm
    ports:
      - "8000:8000"
    volumes:
      # Model cache (persisted in the named volume below)
      - vllm-cache:/root/.cache/huggingface
      # LoRA adapters, mounted read-only
      - ./lora-adapters:/lora-adapters:ro
    environment:
      - HF_HOME=/root/.cache/huggingface
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
    # Folded scalar: all flags collapse into a single argument string
    # appended to the image's OpenAI-compatible server entrypoint.
    command: >
      --model ${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
      --host 0.0.0.0
      --port 8000
      --gpu-memory-utilization ${VLLM_GPU_MEMORY_UTILIZATION:-0.9}
      --max-model-len ${VLLM_MAX_MODEL_LEN:-8192}
      --tensor-parallel-size ${VLLM_TENSOR_PARALLEL_SIZE:-1}
      --enable-lora
      --max-lora-rank ${VLLM_MAX_LORA_RANK:-64}
      --max-loras ${VLLM_MAX_LORAS:-4}
      --lora-extra-vocab-size ${VLLM_LORA_EXTRA_VOCAB_SIZE:-256}
      --max-num-batched-tokens ${VLLM_MAX_NUM_BATCHED_TOKENS:-4096}
      --max-num-seqs ${VLLM_MAX_NUM_SEQS:-8}
      --enable-prefix-caching
      --disable-log-requests
    # GPU reservation; requires nvidia-container-toolkit on the host.
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s  # vLLM needs time to load model
# Inference Engine (Python FastAPI) - vLLM Mode
inference-engine:
build:
context: ./apps/inference-engine
dockerfile: Dockerfile
container_name: local-llm-inference
ports:
- "3161:3161"
environment:
- INFERENCE_PORT=3161
- INFERENCE_HOST=0.0.0.0
- INFERENCE_BACKEND=vllm
- VLLM_HOST=http://vllm:8000
- VLLM_MODEL=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
- VLLM_ENABLE_LORA=true
- VLLM_MAX_LORAS=${VLLM_MAX_LORAS:-4}
- LOG_LEVEL=info
depends_on:
vllm:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3161/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
# API Gateway (NestJS)
gateway:
build:
context: ./apps/gateway
dockerfile: Dockerfile
container_name: local-llm-gateway
ports:
- "3160:3160"
environment:
- NODE_ENV=production
- GATEWAY_PORT=3160
- INFERENCE_HOST=inference-engine
- INFERENCE_PORT=3161
- MODEL_NAME=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
- TIER_SMALL_LATENCY_TARGET_MS=2000
- TIER_MAIN_LATENCY_TARGET_MS=10000
depends_on:
inference-engine:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3160/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
networks:
  llm-network:
    driver: bridge

volumes:
  # Persists downloaded Hugging Face model weights across restarts.
  vllm-cache:
    name: local-llm-vllm-cache
# ============================================================================
# Usage:
# ============================================================================
#
# 1. First, setup WSL GPU (if not done):
# ./scripts/setup-wsl-gpu.sh
#
# 2. Configure model (optional, default is Mistral-7B):
# export VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2
#
# 3. Start the stack:
# docker-compose -f docker-compose.vllm.yml up -d
#
# 4. Monitor vLLM startup (first time downloads model ~15GB):
# docker-compose -f docker-compose.vllm.yml logs -f vllm
#
# 5. Verify health:
# curl http://localhost:8000/health # vLLM direct
# curl http://localhost:3161/health # Inference Engine
# curl http://localhost:3160/health # Gateway
#
# 6. Test inference:
# curl -X POST http://localhost:3160/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","messages":[{"role":"user","content":"Hello!"}]}'
#
# 7. Check GPU usage:
# nvidia-smi
#
# 8. Stop:
# docker-compose -f docker-compose.vllm.yml down
#
# ============================================================================
# LoRA Adapters:
# ============================================================================
#
# Place LoRA adapters in ./lora-adapters/ directory:
#
# lora-adapters/
# ├── erp-core/
# │ ├── adapter_config.json
# │ └── adapter_model.safetensors
# └── trading/
# ├── adapter_config.json
# └── adapter_model.safetensors
#
# Use adapters in requests:
# curl -X POST http://localhost:3160/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","messages":[...],"lora_adapter":"erp-core"}'
#
# ============================================================================
# Recommended Models by VRAM:
# ============================================================================
#
# 8GB VRAM:
# - mistralai/Mistral-7B-Instruct-v0.2 (quantized)
# - TheBloke/Mistral-7B-Instruct-v0.2-GPTQ
#
# 12-16GB VRAM:
# - mistralai/Mistral-7B-Instruct-v0.2
# - codellama/CodeLlama-7b-Instruct-hf
#
# 24GB+ VRAM:
# - mistralai/Mistral-7B-Instruct-v0.2 (multiple LoRA)
# - codellama/CodeLlama-13b-Instruct-hf
# - TheBloke/Llama-2-13B-chat-GPTQ
#
# ============================================================================