# Local LLM Agent - Environment Configuration
# Version: 0.5.0 (Phase 3 - Production)
# Copy to .env and adjust values as needed

# =============================================================================
# Gateway Configuration
# =============================================================================
GATEWAY_PORT=3160
GATEWAY_HOST=0.0.0.0
NODE_ENV=development

# =============================================================================
# Inference Engine Configuration
# =============================================================================
INFERENCE_PORT=3161
INFERENCE_HOST=0.0.0.0

# Backend selection: ollama | vllm
# - ollama: Development/CPU mode
# - vllm: Production/GPU mode
INFERENCE_BACKEND=ollama

# =============================================================================
# Ollama Configuration (Development)
# =============================================================================
OLLAMA_HOST=http://localhost:11434
OLLAMA_MODEL=tinyllama

# =============================================================================
# vLLM Configuration (Production)
# =============================================================================
VLLM_HOST=http://localhost:8000
VLLM_MODEL=mistralai/Mistral-7B-Instruct-v0.2

# GPU memory utilization (0.1 - 1.0)
VLLM_GPU_MEMORY_UTILIZATION=0.9

# Maximum model context length
VLLM_MAX_MODEL_LEN=8192

# Tensor parallelism (for multi-GPU)
VLLM_TENSOR_PARALLEL_SIZE=1

# =============================================================================
# Multi-LoRA Configuration
# =============================================================================
# Enable LoRA adapter support
VLLM_ENABLE_LORA=true

# Maximum LoRA rank
VLLM_MAX_LORA_RANK=64

# Maximum adapters to load simultaneously
VLLM_MAX_LORAS=4

# Extra vocabulary size for LoRA
VLLM_LORA_EXTRA_VOCAB_SIZE=256

# =============================================================================
# Continuous Batching Configuration
# =============================================================================
# Maximum tokens per batch
VLLM_MAX_NUM_BATCHED_TOKENS=4096

# Maximum sequences per batch
VLLM_MAX_NUM_SEQS=8

# Enable prefix caching
VLLM_ENABLE_PREFIX_CACHING=true

# =============================================================================
# Model Configuration
# =============================================================================
MODEL_NAME=tinyllama

# Token limits
MAX_TOKENS_LIMIT=4096
DEFAULT_MAX_TOKENS=512
DEFAULT_TEMPERATURE=0.7
DEFAULT_TOP_P=0.9

# =============================================================================
# Tier Configuration
# =============================================================================
# Tier Small - Fast responses for simple tasks
TIER_SMALL_MAX_TOKENS=512
TIER_SMALL_MAX_CONTEXT=4096
TIER_SMALL_LATENCY_TARGET_MS=500

# Tier Main - Standard capabilities
TIER_MAIN_MAX_TOKENS=2048
TIER_MAIN_MAX_CONTEXT=16384
TIER_MAIN_LATENCY_TARGET_MS=2000

# Tier Large - Extended capabilities
TIER_LARGE_MAX_TOKENS=4096
TIER_LARGE_MAX_CONTEXT=32768
TIER_LARGE_LATENCY_TARGET_MS=5000

# =============================================================================
# Retry Configuration
# =============================================================================
RETRY_MAX_ATTEMPTS=3
RETRY_BASE_DELAY_MS=1000

# =============================================================================
# Timeout Configuration (milliseconds)
# =============================================================================
REQUEST_TIMEOUT_MS=60000
CONNECT_TIMEOUT_MS=5000

# =============================================================================
# Monitoring Configuration
# =============================================================================
# Grafana admin credentials
# SECURITY: development defaults only — change before any production deployment
GRAFANA_ADMIN_USER=admin
GRAFANA_ADMIN_PASSWORD=admin

# Grafana URL
GRAFANA_ROOT_URL=http://localhost:3000

# =============================================================================
# Rate Limiting
# =============================================================================
RATE_LIMIT_ENABLED=true
RATE_LIMIT_RPM=60

# =============================================================================
# Database Configuration (Optional)
# =============================================================================
DATABASE_HOST=localhost
DATABASE_PORT=5432
DATABASE_NAME=local_llm_dev
DATABASE_USER=isem
DATABASE_PASSWORD=isem_dev

# =============================================================================
# Redis Configuration (Optional - for caching)
# =============================================================================
REDIS_HOST=localhost
REDIS_PORT=6379
REDIS_DB=9

# =============================================================================
# Logging
# =============================================================================
LOG_LEVEL=info
LOG_FORMAT=json

# =============================================================================
# HuggingFace (for gated models)
# =============================================================================
# Get token from: https://huggingface.co/settings/tokens
HUGGING_FACE_HUB_TOKEN=

# =============================================================================
# Service Version
# =============================================================================
SERVICE_VERSION=0.5.0