# Local LLM Agent - Production Environment Configuration
# ============================================================================
# Copy this file to .env and configure for your environment
# ============================================================================

# ============================================================================
# vLLM Configuration
# ============================================================================

# Model to load (HuggingFace model ID)
VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2

# HuggingFace token for gated models (optional)
# Get a token from: https://huggingface.co/settings/tokens
HUGGING_FACE_HUB_TOKEN=

# GPU memory utilization (0.1 - 1.0)
# Lower = more room for LoRA adapters, higher = more KV cache
VLLM_GPU_MEMORY_UTILIZATION=0.9

# Maximum model context length (tokens)
VLLM_MAX_MODEL_LEN=8192

# Tensor parallelism degree (number of GPUs, for multi-GPU setups)
VLLM_TENSOR_PARALLEL_SIZE=1

# ============================================================================
# Multi-LoRA Configuration
# ============================================================================

# Maximum LoRA rank to support
VLLM_MAX_LORA_RANK=64

# Maximum number of LoRA adapters to load simultaneously
VLLM_MAX_LORAS=4

# Extra vocabulary size reserved for LoRA adapters
VLLM_LORA_EXTRA_VOCAB_SIZE=256

# ============================================================================
# Continuous Batching Configuration
# ============================================================================

# Maximum tokens to process in a single batch
VLLM_MAX_NUM_BATCHED_TOKENS=4096

# Maximum sequences to process simultaneously
VLLM_MAX_NUM_SEQS=8

# ============================================================================
# Gateway Configuration
# ============================================================================

# API Gateway port
GATEWAY_PORT=3160

# Inference engine connection (hostname and port)
INFERENCE_HOST=inference-engine
INFERENCE_PORT=3161

# ============================================================================
# Monitoring Configuration
# ============================================================================

# Grafana admin credentials
# SECURITY: change the default password before deploying to production
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=changeme_in_production

# Grafana root URL (used for email notifications, etc.)
GRAFANA_ROOT_URL=http://localhost:3000

# ============================================================================
# Logging
# ============================================================================

# Log level: one of debug, info, warning, error
LOG_LEVEL=info

# ============================================================================
# Security
# ============================================================================

# Enable/disable rate limiting (true/false)
RATE_LIMIT_ENABLED=true

# Allowed requests per minute per client
RATE_LIMIT_RPM=60

# ============================================================================
# Notes
# ============================================================================
#
# Approximate model VRAM requirements:
# - TinyLlama (1.1B): ~2GB
# - Phi-2 (2.7B): ~6GB
# - Mistral-7B (7B): ~14GB
# - Llama-2-13B (13B): ~26GB
#
# With Multi-LoRA, add ~500MB of VRAM per loaded adapter
#
# Recommended GPUs:
# - Development: RTX 3060 (12GB)
# - Production: RTX 3090 (24GB) or RTX 4090 (24GB)
#
# ============================================================================