# Local LLM Agent - Inference Engine Configuration
# Copy this file to .env and modify as needed

# =============================================================================
# Server Configuration
# =============================================================================

# Port to run the inference engine on (default: 3161)
INFERENCE_PORT=3161

# Host to bind to (default: 0.0.0.0 for Docker, localhost for local dev)
INFERENCE_HOST=0.0.0.0

# =============================================================================
# Backend Configuration
# =============================================================================

# Which backend to use: ollama or vllm (default: ollama)
INFERENCE_BACKEND=ollama

# Ollama server URL (default: http://localhost:11434)
OLLAMA_HOST=http://localhost:11434

# Default model to use (default: tinyllama for testing, gpt-oss-20b for prod)
OLLAMA_MODEL=tinyllama

# =============================================================================
# Model Constraints
# =============================================================================

# Maximum tokens allowed in a request (default: 4096)
MAX_TOKENS_LIMIT=4096

# Minimum tokens (default: 1)
MIN_TOKENS=1

# Default max tokens if not specified (default: 512)
DEFAULT_MAX_TOKENS=512

# Default temperature if not specified (default: 0.7)
DEFAULT_TEMPERATURE=0.7

# Default top_p if not specified (default: 0.9)
DEFAULT_TOP_P=0.9

# =============================================================================
# Timeouts (milliseconds)
# =============================================================================

# Total request timeout (default: 60000 = 60s)
REQUEST_TIMEOUT_MS=60000

# Connection timeout (default: 5000 = 5s)
CONNECT_TIMEOUT_MS=5000

# =============================================================================
# Retry Configuration
# =============================================================================

# Maximum retry attempts for health checks (default: 3)
RETRY_MAX_ATTEMPTS=3

# Base delay between retries in ms (default: 1000)
RETRY_BASE_DELAY_MS=1000

# =============================================================================
# Cache Configuration
# =============================================================================

# Time to cache model list in seconds (default: 60)
MODELS_CACHE_TTL_SECONDS=60

# =============================================================================
# Logging
# =============================================================================

# Log level: debug, info, warning, error (default: info)
LOG_LEVEL=info