# local-llm-agent/.env.production
# Adrian Flores Cortes 3def230d58 Initial commit: local-llm-agent infrastructure project
# Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
# 2026-02-02 16:42:45 -06:00
#
# 106 lines
# 3.3 KiB
# Plaintext

# Local LLM Agent - Production Environment Configuration
# ============================================================================
# Copy this file to .env and configure for your environment
# ============================================================================
# ============================================================================
# vLLM Configuration
# ============================================================================
# Model to load (HuggingFace model ID)
VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2
# HuggingFace token for gated models (optional)
# Get token from: https://huggingface.co/settings/tokens
# SECURITY: never commit a real token — keep the value empty in this template
# and set it only in the untracked .env copy (or inject via a secrets manager).
HUGGING_FACE_HUB_TOKEN=
# GPU memory utilization (0.1 - 1.0)
# Lower = more room for LoRA adapters, higher = more KV cache
VLLM_GPU_MEMORY_UTILIZATION=0.9
# Maximum model context length
VLLM_MAX_MODEL_LEN=8192
# Tensor parallelism (for multi-GPU setups)
VLLM_TENSOR_PARALLEL_SIZE=1
# ============================================================================
# Multi-LoRA Configuration
# ============================================================================
# Maximum LoRA rank to support
VLLM_MAX_LORA_RANK=64
# Maximum number of LoRA adapters to load simultaneously
VLLM_MAX_LORAS=4
# Extra vocabulary size for LoRA adapters
VLLM_LORA_EXTRA_VOCAB_SIZE=256
# ============================================================================
# Continuous Batching Configuration
# ============================================================================
# Maximum tokens to process in a batch
VLLM_MAX_NUM_BATCHED_TOKENS=4096
# Maximum sequences to process simultaneously
VLLM_MAX_NUM_SEQS=8
# ============================================================================
# Gateway Configuration
# ============================================================================
# API Gateway port
GATEWAY_PORT=3160
# Inference engine connection
INFERENCE_HOST=inference-engine
INFERENCE_PORT=3161
# ============================================================================
# Monitoring Configuration
# ============================================================================
# Grafana admin credentials
# SECURITY: "changeme_in_production" is a placeholder — replace it before any
# deployment, and prefer injecting the password via a Docker/Compose secret or
# secrets manager rather than storing it in this file.
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=changeme_in_production
# Grafana URL (for email notifications, etc.)
GRAFANA_ROOT_URL=http://localhost:3000
# ============================================================================
# Logging
# ============================================================================
# Log level: debug, info, warning, error
LOG_LEVEL=info
# ============================================================================
# Security
# ============================================================================
# Enable/disable rate limiting
RATE_LIMIT_ENABLED=true
# Requests per minute per client
RATE_LIMIT_RPM=60
# ============================================================================
# Notes
# ============================================================================
#
# Model VRAM Requirements (approximate):
# - TinyLlama (1.1B): ~2GB
# - Phi-2 (2.7B): ~6GB
# - Mistral-7B (7B): ~14GB
# - Llama-2-13B (13B): ~26GB
#
# With Multi-LoRA, add ~500MB per adapter
#
# Recommended GPU:
# - Development: RTX 3060 (12GB)
# - Production: RTX 3090 (24GB) or RTX 4090 (24GB)
#
# ============================================================================