local-llm-agent/apps/inference-engine/.env.example
Adrian Flores Cortes 3def230d58 Initial commit: local-llm-agent infrastructure project
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-02 16:42:45 -06:00

79 lines
2.5 KiB
Plaintext

# Local LLM Agent - Inference Engine Configuration
# Copy this file to .env and modify as needed
# =============================================================================
# Server Configuration
# =============================================================================
# Port to run the inference engine on (default: 3161)
INFERENCE_PORT=3161
# Host to bind to (default: 0.0.0.0 for Docker, localhost for local dev)
INFERENCE_HOST=0.0.0.0
# =============================================================================
# Backend Configuration
# =============================================================================
# Which backend to use: ollama or vllm (default: ollama)
INFERENCE_BACKEND=ollama
# Ollama server URL (default: http://localhost:11434)
OLLAMA_HOST=http://localhost:11434
# Default model to use (default: tinyllama for testing; use gpt-oss-20b for production)
OLLAMA_MODEL=tinyllama
# =============================================================================
# Model Constraints
# =============================================================================
# Maximum tokens allowed in a request (default: 4096)
MAX_TOKENS_LIMIT=4096
# Minimum tokens allowed in a request (default: 1)
MIN_TOKENS=1
# Default max tokens if not specified (default: 512)
DEFAULT_MAX_TOKENS=512
# Default temperature if not specified (default: 0.7)
DEFAULT_TEMPERATURE=0.7
# Default top_p if not specified (default: 0.9)
DEFAULT_TOP_P=0.9
# =============================================================================
# Timeouts (milliseconds)
# =============================================================================
# Total request timeout (default: 60000 = 60s)
REQUEST_TIMEOUT_MS=60000
# Connection timeout (default: 5000 = 5s)
CONNECT_TIMEOUT_MS=5000
# =============================================================================
# Retry Configuration
# =============================================================================
# Maximum retry attempts for health checks (default: 3)
RETRY_MAX_ATTEMPTS=3
# Base delay between retries in ms (default: 1000)
RETRY_BASE_DELAY_MS=1000
# =============================================================================
# Cache Configuration
# =============================================================================
# Time to cache model list in seconds (default: 60)
MODELS_CACHE_TTL_SECONDS=60
# =============================================================================
# Logging
# =============================================================================
# Log level: debug, info, warning, error (default: info)
LOG_LEVEL=info