# Local LLM Agent - Production Environment Configuration
# ============================================================================
# Copy this file to .env and configure for your environment
# ============================================================================

# ============================================================================
# vLLM Configuration
# ============================================================================

# Model to load (HuggingFace model ID)
VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2

# HuggingFace token for gated models (optional)
# Get token from: https://huggingface.co/settings/tokens
HUGGING_FACE_HUB_TOKEN=

# GPU memory utilization (0.1 - 1.0)
# Lower = more room for LoRA adapters, higher = more KV cache
VLLM_GPU_MEMORY_UTILIZATION=0.9

# Maximum model context length
VLLM_MAX_MODEL_LEN=8192

# Tensor parallelism (for multi-GPU setups)
VLLM_TENSOR_PARALLEL_SIZE=1

# ============================================================================
# Multi-LoRA Configuration
# ============================================================================

# Maximum LoRA rank to support
VLLM_MAX_LORA_RANK=64

# Maximum number of LoRA adapters to load simultaneously
VLLM_MAX_LORAS=4

# Extra vocabulary size for LoRA adapters
VLLM_LORA_EXTRA_VOCAB_SIZE=256

# ============================================================================
# Continuous Batching Configuration
# ============================================================================

# Maximum tokens to process in a batch
VLLM_MAX_NUM_BATCHED_TOKENS=4096

# Maximum sequences to process simultaneously
VLLM_MAX_NUM_SEQS=8

# ============================================================================
# Gateway Configuration
# ============================================================================

# API Gateway port
GATEWAY_PORT=3160

# Inference engine connection
INFERENCE_HOST=inference-engine
INFERENCE_PORT=3161

# ============================================================================
# Monitoring Configuration
# ============================================================================

# Grafana admin credentials
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=changeme_in_production

# Grafana URL (for email notifications, etc.)
GRAFANA_ROOT_URL=http://localhost:3000

# ============================================================================
# Logging
# ============================================================================

# Log level: debug, info, warning, error
LOG_LEVEL=info

# ============================================================================
# Security
# ============================================================================

# Enable/disable rate limiting
RATE_LIMIT_ENABLED=true

# Requests per minute per client
RATE_LIMIT_RPM=60

# ============================================================================
# Notes
# ============================================================================
#
# Model VRAM Requirements (approximate):
# - TinyLlama (1.1B): ~2GB
# - Phi-2 (2.7B): ~6GB
# - Mistral-7B (7B): ~14GB
# - Llama-2-13B (13B): ~26GB
#
# With Multi-LoRA, add ~500MB per adapter
#
# Recommended GPU:
# - Development: RTX 3060 (12GB)
# - Production: RTX 3090 (24GB) or RTX 4090 (24GB)
#
# ============================================================================