# Local LLM Agent - Inference Engine Configuration
# Copy this file to .env and modify as needed

# =============================================================================
# Server Configuration
# =============================================================================

# Port to run the inference engine on (default: 3161)
INFERENCE_PORT=3161

# Host to bind to (default: 0.0.0.0 for Docker, localhost for local dev)
INFERENCE_HOST=0.0.0.0

# =============================================================================
# Backend Configuration
# =============================================================================

# Which backend to use: ollama or vllm (default: ollama)
INFERENCE_BACKEND=ollama

# Ollama server URL (default: http://localhost:11434)
OLLAMA_HOST=http://localhost:11434

# Default model to use (default: tinyllama for testing, gpt-oss-20b for prod)
OLLAMA_MODEL=tinyllama

# =============================================================================
# Model Constraints
# =============================================================================

# Maximum tokens allowed in a request (default: 4096)
MAX_TOKENS_LIMIT=4096

# Minimum tokens (default: 1)
MIN_TOKENS=1

# Default max tokens if not specified (default: 512)
DEFAULT_MAX_TOKENS=512

# Default temperature if not specified (default: 0.7)
DEFAULT_TEMPERATURE=0.7

# Default top_p if not specified (default: 0.9)
DEFAULT_TOP_P=0.9

# =============================================================================
# Timeouts (milliseconds)
# =============================================================================

# Total request timeout (default: 60000 = 60s)
REQUEST_TIMEOUT_MS=60000

# Connection timeout (default: 5000 = 5s)
CONNECT_TIMEOUT_MS=5000

# =============================================================================
# Retry Configuration
# =============================================================================

# Maximum retry attempts for health checks (default: 3)
RETRY_MAX_ATTEMPTS=3

# Base delay between retries in ms (default: 1000)
RETRY_BASE_DELAY_MS=1000

# =============================================================================
# Cache Configuration
# =============================================================================

# Time to cache model list in seconds (default: 60)
MODELS_CACHE_TTL_SECONDS=60

# =============================================================================
# Logging
# =============================================================================

# Log level: debug, info, warning, error (default: info)
LOG_LEVEL=info