# Local LLM Agent - Environment Configuration
# Version: 0.5.0 (Phase 3 - Production)
# Copy to .env and adjust values as needed

# =============================================================================
# Gateway Configuration
# =============================================================================
GATEWAY_PORT=3160
GATEWAY_HOST=0.0.0.0
NODE_ENV=development

# =============================================================================
# Inference Engine Configuration
# =============================================================================
INFERENCE_PORT=3161
INFERENCE_HOST=0.0.0.0

# Backend selection: ollama | vllm
# - ollama: Development/CPU mode
# - vllm: Production/GPU mode
INFERENCE_BACKEND=ollama

# =============================================================================
# Ollama Configuration (Development)
# =============================================================================
OLLAMA_HOST=http://localhost:11434
OLLAMA_MODEL=tinyllama

# =============================================================================
# vLLM Configuration (Production)
# =============================================================================
VLLM_HOST=http://localhost:8000
VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2

# GPU memory utilization (0.1 - 1.0)
VLLM_GPU_MEMORY_UTILIZATION=0.9

# Maximum model context length
VLLM_MAX_MODEL_LEN=8192

# Tensor parallelism (for multi-GPU)
VLLM_TENSOR_PARALLEL_SIZE=1

# =============================================================================
# Multi-LoRA Configuration
# =============================================================================
# Enable LoRA adapter support
VLLM_ENABLE_LORA=true

# Maximum LoRA rank
VLLM_MAX_LORA_RANK=64

# Maximum adapters to load simultaneously
VLLM_MAX_LORAS=4

# Extra vocabulary size for LoRA
VLLM_LORA_EXTRA_VOCAB_SIZE=256

# =============================================================================
# Continuous Batching Configuration
# =============================================================================
# Maximum tokens per batch
VLLM_MAX_NUM_BATCHED_TOKENS=4096

# Maximum sequences per batch
VLLM_MAX_NUM_SEQS=8

# Enable prefix caching
VLLM_ENABLE_PREFIX_CACHING=true

# =============================================================================
# Model Configuration
# =============================================================================
MODEL_NAME=tinyllama

# Token limits
MAX_TOKENS_LIMIT=4096
DEFAULT_MAX_TOKENS=512
DEFAULT_TEMPERATURE=0.7
DEFAULT_TOP_P=0.9

# =============================================================================
# Tier Configuration
# =============================================================================
# Tier Small - Fast responses for simple tasks
TIER_SMALL_MAX_TOKENS=512
TIER_SMALL_MAX_CONTEXT=4096
TIER_SMALL_LATENCY_TARGET_MS=500

# Tier Main - Standard capabilities
TIER_MAIN_MAX_TOKENS=2048
TIER_MAIN_MAX_CONTEXT=16384
TIER_MAIN_LATENCY_TARGET_MS=2000

# Tier Large - Extended capabilities
TIER_LARGE_MAX_TOKENS=4096
TIER_LARGE_MAX_CONTEXT=32768
TIER_LARGE_LATENCY_TARGET_MS=5000

# =============================================================================
# Retry Configuration
# =============================================================================
RETRY_MAX_ATTEMPTS=3
RETRY_BASE_DELAY_MS=1000

# =============================================================================
# Timeout Configuration (milliseconds)
# =============================================================================
REQUEST_TIMEOUT_MS=60000
CONNECT_TIMEOUT_MS=5000

# =============================================================================
# Monitoring Configuration
# =============================================================================
# Grafana admin credentials
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=admin

# Grafana URL
GRAFANA_ROOT_URL=http://localhost:3000

# =============================================================================
# Rate Limiting
# =============================================================================
RATE_LIMIT_ENABLED=true
RATE_LIMIT_RPM=60

# =============================================================================
# Database Configuration (Optional)
# =============================================================================
DATABASE_HOST=localhost
DATABASE_PORT=5432
DATABASE_NAME=local_llm_dev
DATABASE_USER=isem
DATABASE_PASSWORD=isem_dev

# =============================================================================
# Redis Configuration (Optional - for caching)
# =============================================================================
REDIS_HOST=localhost
REDIS_PORT=6379
REDIS_DB=9

# =============================================================================
# Logging
# =============================================================================
LOG_LEVEL=info
LOG_FORMAT=json

# =============================================================================
# HuggingFace (for gated models)
# =============================================================================
# Get token from: https://huggingface.co/settings/tokens
HUGGING_FACE_HUB_TOKEN=

# =============================================================================
# Service Version
# =============================================================================
SERVICE_VERSION=0.5.0