# local-llm-agent/.env.example
# Adrian Flores Cortes 3def230d58 Initial commit: local-llm-agent infrastructure project
# Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
# 2026-02-02 16:42:45 -06:00
#
# 160 lines, 5.2 KiB, Plaintext
# Local LLM Agent - Environment Configuration
# Version: 0.5.0 (Phase 3 - Production)
# Copy to .env and adjust values as needed
# =============================================================================
# Gateway Configuration
# =============================================================================
GATEWAY_PORT=3160
GATEWAY_HOST=0.0.0.0
NODE_ENV=development
# =============================================================================
# Inference Engine Configuration
# =============================================================================
INFERENCE_PORT=3161
INFERENCE_HOST=0.0.0.0
# Backend selection: ollama | vllm
# - ollama: Development/CPU mode
# - vllm: Production/GPU mode
INFERENCE_BACKEND=ollama
# =============================================================================
# Ollama Configuration (Development)
# =============================================================================
OLLAMA_HOST=http://localhost:11434
OLLAMA_MODEL=tinyllama
# =============================================================================
# vLLM Configuration (Production)
# =============================================================================
VLLM_HOST=http://localhost:8000
VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2
# GPU memory utilization (0.1 - 1.0)
VLLM_GPU_MEMORY_UTILIZATION=0.9
# Maximum model context length
VLLM_MAX_MODEL_LEN=8192
# Tensor parallelism (for multi-GPU)
VLLM_TENSOR_PARALLEL_SIZE=1
# =============================================================================
# Multi-LoRA Configuration
# =============================================================================
# Enable LoRA adapter support
VLLM_ENABLE_LORA=true
# Maximum LoRA rank
VLLM_MAX_LORA_RANK=64
# Maximum adapters to load simultaneously
VLLM_MAX_LORAS=4
# Extra vocabulary size for LoRA
VLLM_LORA_EXTRA_VOCAB_SIZE=256
# =============================================================================
# Continuous Batching Configuration
# =============================================================================
# Maximum tokens per batch
VLLM_MAX_NUM_BATCHED_TOKENS=4096
# Maximum sequences per batch
VLLM_MAX_NUM_SEQS=8
# Enable prefix caching
VLLM_ENABLE_PREFIX_CACHING=true
# =============================================================================
# Model Configuration
# =============================================================================
MODEL_NAME=tinyllama
# Token limits
MAX_TOKENS_LIMIT=4096
DEFAULT_MAX_TOKENS=512
DEFAULT_TEMPERATURE=0.7
DEFAULT_TOP_P=0.9
# =============================================================================
# Tier Configuration
# =============================================================================
# Tier Small - Fast responses for simple tasks
TIER_SMALL_MAX_TOKENS=512
TIER_SMALL_MAX_CONTEXT=4096
TIER_SMALL_LATENCY_TARGET_MS=500
# Tier Main - Standard capabilities
TIER_MAIN_MAX_TOKENS=2048
TIER_MAIN_MAX_CONTEXT=16384
TIER_MAIN_LATENCY_TARGET_MS=2000
# Tier Large - Extended capabilities
TIER_LARGE_MAX_TOKENS=4096
TIER_LARGE_MAX_CONTEXT=32768
TIER_LARGE_LATENCY_TARGET_MS=5000
# =============================================================================
# Retry Configuration
# =============================================================================
RETRY_MAX_ATTEMPTS=3
RETRY_BASE_DELAY_MS=1000
# =============================================================================
# Timeout Configuration (milliseconds)
# =============================================================================
REQUEST_TIMEOUT_MS=60000
CONNECT_TIMEOUT_MS=5000
# =============================================================================
# Monitoring Configuration
# =============================================================================
# Grafana admin credentials
# SECURITY: these are the stock defaults — change them before exposing
# Grafana beyond localhost
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=admin
# Grafana URL
GRAFANA_ROOT_URL=http://localhost:3000
# =============================================================================
# Rate Limiting
# =============================================================================
RATE_LIMIT_ENABLED=true
RATE_LIMIT_RPM=60
# =============================================================================
# Database Configuration (Optional)
# =============================================================================
DATABASE_HOST=localhost
DATABASE_PORT=5432
DATABASE_NAME=local_llm_dev
DATABASE_USER=isem
# NOTE: development-only credential — override with a strong secret in production
DATABASE_PASSWORD=isem_dev
# =============================================================================
# Redis Configuration (Optional - for caching)
# =============================================================================
REDIS_HOST=localhost
REDIS_PORT=6379
# Redis logical database index (0-15 in a default Redis install)
REDIS_DB=9
# =============================================================================
# Logging
# =============================================================================
LOG_LEVEL=info
LOG_FORMAT=json
# =============================================================================
# HuggingFace (for gated models)
# =============================================================================
# Get token from: https://huggingface.co/settings/tokens
HUGGING_FACE_HUB_TOKEN=
# =============================================================================
# Service Version
# =============================================================================
SERVICE_VERSION=0.5.0