# Local LLM Agent - Docker Compose for vLLM (Production GPU)
# ===========================================================================
#
# This compose uses vLLM for high-performance GPU inference with:
# - Continuous batching
# - Multi-LoRA support
# - Prometheus metrics
#
# Prerequisites:
# - NVIDIA GPU with CUDA support
# - nvidia-container-toolkit installed
# - Run ./scripts/setup-wsl-gpu.sh first
#
# Usage:
#   docker-compose -f docker-compose.vllm.yml up -d
#
# ===========================================================================

services:
  # vLLM - High Performance LLM Inference (OpenAI-compatible server on :8000)
  vllm:
    image: vllm/vllm-openai:latest
    container_name: local-llm-vllm
    # NOTE(review): vLLM's Docker docs recommend host IPC (or a large
    # --shm-size) because PyTorch workers exchange tensors via shared memory
    # and the container default of 64MB is too small — confirm for your setup.
    ipc: host
    ports:
      - "8000:8000"
    volumes:
      # Model cache — persists downloaded HF weights across container restarts
      - vllm-cache:/root/.cache/huggingface
      # LoRA adapters, mounted read-only (see "LoRA Adapters" notes below)
      - ./lora-adapters:/lora-adapters:ro
    environment:
      - HF_HOME=/root/.cache/huggingface
      # Empty default so compose does not warn when the token is unset
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
    # Folded block scalar (>) joins the lines below into one argument string
    # appended to the image's entrypoint (the vLLM OpenAI API server).
    command: >
      --model ${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
      --host 0.0.0.0
      --port 8000
      --gpu-memory-utilization ${VLLM_GPU_MEMORY_UTILIZATION:-0.9}
      --max-model-len ${VLLM_MAX_MODEL_LEN:-8192}
      --tensor-parallel-size ${VLLM_TENSOR_PARALLEL_SIZE:-1}
      --enable-lora
      --max-lora-rank ${VLLM_MAX_LORA_RANK:-64}
      --max-loras ${VLLM_MAX_LORAS:-4}
      --lora-extra-vocab-size ${VLLM_LORA_EXTRA_VOCAB_SIZE:-256}
      --max-num-batched-tokens ${VLLM_MAX_NUM_BATCHED_TOKENS:-4096}
      --max-num-seqs ${VLLM_MAX_NUM_SEQS:-8}
      --enable-prefix-caching
      --disable-log-requests
    # GPU reservation — requires nvidia-container-toolkit on the host
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s  # vLLM needs time to download/load the model
|
|
|
# Inference Engine (Python FastAPI) - vLLM Mode
|
|
inference-engine:
|
|
build:
|
|
context: ./apps/inference-engine
|
|
dockerfile: Dockerfile
|
|
container_name: local-llm-inference
|
|
ports:
|
|
- "3161:3161"
|
|
environment:
|
|
- INFERENCE_PORT=3161
|
|
- INFERENCE_HOST=0.0.0.0
|
|
- INFERENCE_BACKEND=vllm
|
|
- VLLM_HOST=http://vllm:8000
|
|
- VLLM_MODEL=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
|
|
- VLLM_ENABLE_LORA=true
|
|
- VLLM_MAX_LORAS=${VLLM_MAX_LORAS:-4}
|
|
- LOG_LEVEL=info
|
|
depends_on:
|
|
vllm:
|
|
condition: service_healthy
|
|
networks:
|
|
- llm-network
|
|
restart: unless-stopped
|
|
healthcheck:
|
|
test: ["CMD", "curl", "-f", "http://localhost:3161/health"]
|
|
interval: 30s
|
|
timeout: 10s
|
|
retries: 3
|
|
start_period: 10s
|
|
|
|
# API Gateway (NestJS)
|
|
gateway:
|
|
build:
|
|
context: ./apps/gateway
|
|
dockerfile: Dockerfile
|
|
container_name: local-llm-gateway
|
|
ports:
|
|
- "3160:3160"
|
|
environment:
|
|
- NODE_ENV=production
|
|
- GATEWAY_PORT=3160
|
|
- INFERENCE_HOST=inference-engine
|
|
- INFERENCE_PORT=3161
|
|
- MODEL_NAME=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
|
|
- TIER_SMALL_LATENCY_TARGET_MS=2000
|
|
- TIER_MAIN_LATENCY_TARGET_MS=10000
|
|
depends_on:
|
|
inference-engine:
|
|
condition: service_healthy
|
|
networks:
|
|
- llm-network
|
|
restart: unless-stopped
|
|
healthcheck:
|
|
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3160/health"]
|
|
interval: 30s
|
|
timeout: 10s
|
|
retries: 3
|
|
start_period: 10s
|

# Private bridge network shared by all three services
networks:
  llm-network:
    driver: bridge

# Named volume for the HF model cache; explicit name so it survives
# `docker-compose down` under a different project name
volumes:
  vllm-cache:
    name: local-llm-vllm-cache

# ============================================================================
# Usage:
# ============================================================================
#
# 1. First, setup WSL GPU (if not done):
#    ./scripts/setup-wsl-gpu.sh
#
# 2. Configure model (optional, default is Mistral-7B):
#    export VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2
#
# 3. Start the stack:
#    docker-compose -f docker-compose.vllm.yml up -d
#
# 4. Monitor vLLM startup (first time downloads the model, ~15GB):
#    docker-compose -f docker-compose.vllm.yml logs -f vllm
#
# 5. Verify health:
#    curl http://localhost:8000/health   # vLLM direct
#    curl http://localhost:3161/health   # Inference Engine
#    curl http://localhost:3160/health   # Gateway
#
# 6. Test inference:
#    curl -X POST http://localhost:3160/v1/chat/completions \
#      -H "Content-Type: application/json" \
#      -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","messages":[{"role":"user","content":"Hello!"}]}'
#
# 7. Check GPU usage:
#    nvidia-smi
#
# 8. Stop:
#    docker-compose -f docker-compose.vllm.yml down
#
# ============================================================================
# LoRA Adapters:
# ============================================================================
#
# Place LoRA adapters in the ./lora-adapters/ directory:
#
#   lora-adapters/
#   ├── erp-core/
#   │   ├── adapter_config.json
#   │   └── adapter_model.safetensors
#   └── trading/
#       ├── adapter_config.json
#       └── adapter_model.safetensors
#
# Use adapters in requests:
#   curl -X POST http://localhost:3160/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","messages":[...],"lora_adapter":"erp-core"}'
#
# ============================================================================
# Recommended Models by VRAM:
# ============================================================================
#
# 8GB VRAM:
#   - mistralai/Mistral-7B-Instruct-v0.2 (quantized)
#   - TheBloke/Mistral-7B-Instruct-v0.2-GPTQ
#
# 12-16GB VRAM:
#   - mistralai/Mistral-7B-Instruct-v0.2
#   - codellama/CodeLlama-7b-Instruct-hf
#
# 24GB+ VRAM:
#   - mistralai/Mistral-7B-Instruct-v0.2 (multiple LoRA)
#   - codellama/CodeLlama-13b-Instruct-hf
#   - TheBloke/Llama-2-13B-chat-GPTQ
#
# ============================================================================