# Local LLM Agent - Docker Compose for vLLM (Production GPU)
# ===========================================================================
#
# This compose uses vLLM for high-performance GPU inference with:
#   - Continuous batching
#   - Multi-LoRA support
#   - Prometheus metrics
#
# Prerequisites:
#   - NVIDIA GPU with CUDA support
#   - nvidia-container-toolkit installed
#   - Run ./scripts/setup-wsl-gpu.sh first
#
# Usage:
#   docker-compose -f docker-compose.vllm.yml up -d
#
# ===========================================================================

services:
  # vLLM - High Performance LLM Inference
  vllm:
    image: vllm/vllm-openai:latest
    container_name: local-llm-vllm
    ports:
      - "8000:8000"
    volumes:
      # Model cache (persists HF downloads across container restarts)
      - vllm-cache:/root/.cache/huggingface
      # LoRA adapters, mounted read-only.
      # NOTE(review): adapters mounted here are not registered with vLLM by
      # this file (no --lora-modules flag below) — presumably the
      # inference-engine registers them at runtime; verify.
      - ./lora-adapters:/lora-adapters:ro
    environment:
      - HF_HOME=/root/.cache/huggingface
      - HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN:-}
    # All tunables are overridable via environment, with safe defaults.
    command: >
      --model ${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
      --host 0.0.0.0
      --port 8000
      --gpu-memory-utilization ${VLLM_GPU_MEMORY_UTILIZATION:-0.9}
      --max-model-len ${VLLM_MAX_MODEL_LEN:-8192}
      --tensor-parallel-size ${VLLM_TENSOR_PARALLEL_SIZE:-1}
      --enable-lora
      --max-lora-rank ${VLLM_MAX_LORA_RANK:-64}
      --max-loras ${VLLM_MAX_LORAS:-4}
      --lora-extra-vocab-size ${VLLM_LORA_EXTRA_VOCAB_SIZE:-256}
      --max-num-batched-tokens ${VLLM_MAX_NUM_BATCHED_TOKENS:-4096}
      --max-num-seqs ${VLLM_MAX_NUM_SEQS:-8}
      --enable-prefix-caching
      --disable-log-requests
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      # NOTE(review): assumes curl is present in the vllm/vllm-openai
      # image — confirm, or switch to a python-based probe.
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 120s  # vLLM needs time to load model

  # Inference Engine (Python FastAPI) - vLLM Mode
  inference-engine:
    build:
      context: ./apps/inference-engine
      dockerfile: Dockerfile
    container_name: local-llm-inference
    ports:
      - "3161:3161"
    environment:
      - INFERENCE_PORT=3161
      - INFERENCE_HOST=0.0.0.0
      - INFERENCE_BACKEND=vllm
      - VLLM_HOST=http://vllm:8000
      - VLLM_MODEL=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
      - VLLM_ENABLE_LORA=true
      - VLLM_MAX_LORAS=${VLLM_MAX_LORAS:-4}
      - LOG_LEVEL=info
    depends_on:
      vllm:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3161/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  # API Gateway (NestJS)
  gateway:
    build:
      context: ./apps/gateway
      dockerfile: Dockerfile
    container_name: local-llm-gateway
    ports:
      - "3160:3160"
    environment:
      - NODE_ENV=production
      - GATEWAY_PORT=3160
      - INFERENCE_HOST=inference-engine
      - INFERENCE_PORT=3161
      - MODEL_NAME=${VLLM_MODEL:-mistralai/Mistral-7B-Instruct-v0.2}
      - TIER_SMALL_LATENCY_TARGET_MS=2000
      - TIER_MAIN_LATENCY_TARGET_MS=10000
    depends_on:
      inference-engine:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3160/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

networks:
  llm-network:
    driver: bridge

volumes:
  vllm-cache:
    name: local-llm-vllm-cache

# ============================================================================
# Usage:
# ============================================================================
#
# 1. First, setup WSL GPU (if not done):
#    ./scripts/setup-wsl-gpu.sh
#
# 2. Configure model (optional, default is Mistral-7B):
#    export VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2
#
# 3. Start the stack:
#    docker-compose -f docker-compose.vllm.yml up -d
#
# 4. Monitor vLLM startup (first time downloads model ~15GB):
#    docker-compose -f docker-compose.vllm.yml logs -f vllm
#
# 5. Verify health:
#    curl http://localhost:8000/health   # vLLM direct
#    curl http://localhost:3161/health   # Inference Engine
#    curl http://localhost:3160/health   # Gateway
#
# 6. Test inference:
#    curl -X POST http://localhost:3160/v1/chat/completions \
#      -H "Content-Type: application/json" \
#      -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","messages":[{"role":"user","content":"Hello!"}]}'
#
# 7. Check GPU usage:
#    nvidia-smi
#
# 8. Stop:
#    docker-compose -f docker-compose.vllm.yml down
#
# ============================================================================
# LoRA Adapters:
# ============================================================================
#
# Place LoRA adapters in ./lora-adapters/ directory:
#
#   lora-adapters/
#   ├── erp-core/
#   │   ├── adapter_config.json
#   │   └── adapter_model.safetensors
#   └── trading/
#       ├── adapter_config.json
#       └── adapter_model.safetensors
#
# Use adapters in requests:
#   curl -X POST http://localhost:3160/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model":"mistralai/Mistral-7B-Instruct-v0.2","messages":[...],"lora_adapter":"erp-core"}'
#
# ============================================================================
# Recommended Models by VRAM:
# ============================================================================
#
# 8GB VRAM:
#   - mistralai/Mistral-7B-Instruct-v0.2 (quantized)
#   - TheBloke/Mistral-7B-Instruct-v0.2-GPTQ
#
# 12-16GB VRAM:
#   - mistralai/Mistral-7B-Instruct-v0.2
#   - codellama/CodeLlama-7b-Instruct-hf
#
# 24GB+ VRAM:
#   - mistralai/Mistral-7B-Instruct-v0.2 (multiple LoRA)
#   - codellama/CodeLlama-13b-Instruct-hf
#   - TheBloke/Llama-2-13B-chat-GPTQ
#
# ============================================================================