# Local LLM Agent - Production Environment Configuration
# ============================================================================
# Copy this file to .env and configure for your environment
# ============================================================================

# ============================================================================
# vLLM Configuration
# ============================================================================

# Model to load (HuggingFace model ID)
VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2

# HuggingFace token for gated models (optional)
# Get a token from: https://huggingface.co/settings/tokens
HUGGING_FACE_HUB_TOKEN=

# GPU memory utilization (0.1 - 1.0)
# Lower = more room for LoRA adapters, higher = more KV cache
VLLM_GPU_MEMORY_UTILIZATION=0.9

# Maximum model context length (tokens)
VLLM_MAX_MODEL_LEN=8192

# Tensor parallelism degree (number of GPUs, for multi-GPU setups)
VLLM_TENSOR_PARALLEL_SIZE=1

# ============================================================================
# Multi-LoRA Configuration
# ============================================================================

# Maximum LoRA rank to support
VLLM_MAX_LORA_RANK=64

# Maximum number of LoRA adapters to load simultaneously
VLLM_MAX_LORAS=4

# Extra vocabulary size reserved for LoRA adapters
VLLM_LORA_EXTRA_VOCAB_SIZE=256

# ============================================================================
# Continuous Batching Configuration
# ============================================================================

# Maximum tokens to process in a single batch
VLLM_MAX_NUM_BATCHED_TOKENS=4096

# Maximum sequences to process simultaneously
VLLM_MAX_NUM_SEQS=8

# ============================================================================
# Gateway Configuration
# ============================================================================

# API Gateway port
GATEWAY_PORT=3160

# Inference engine connection (hostname and port)
INFERENCE_HOST=inference-engine
INFERENCE_PORT=3161

# ============================================================================
# Monitoring Configuration
# ============================================================================

# Grafana admin credentials
# SECURITY: change the default password before deploying to production
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=changeme_in_production

# Grafana root URL (used for email notifications, etc.)
GRAFANA_ROOT_URL=http://localhost:3000

# ============================================================================
# Logging
# ============================================================================

# Log level: one of debug, info, warning, error
LOG_LEVEL=info

# ============================================================================
# Security
# ============================================================================

# Enable/disable rate limiting (true/false)
RATE_LIMIT_ENABLED=true

# Allowed requests per minute per client
RATE_LIMIT_RPM=60

# ============================================================================
# Notes
# ============================================================================
#
# Approximate model VRAM requirements:
# - TinyLlama (1.1B): ~2GB
# - Phi-2 (2.7B): ~6GB
# - Mistral-7B (7B): ~14GB
# - Llama-2-13B (13B): ~26GB
#
# With Multi-LoRA, add ~500MB of VRAM per loaded adapter
#
# Recommended GPUs:
# - Development: RTX 3060 (12GB)
# - Production: RTX 3090 (24GB) or RTX 4090 (24GB)
#
# ============================================================================