# Local LLM Agent - Inference Engine Configuration
# Copy this file to .env and modify as needed

# =============================================================================
# Server Configuration
# =============================================================================

# Port to run the inference engine on (default: 3161)
INFERENCE_PORT=3161

# Host to bind to (default: 0.0.0.0 for Docker, localhost for local dev)
INFERENCE_HOST=0.0.0.0

# =============================================================================
# Backend Configuration
# =============================================================================

# Which backend to use: ollama or vllm (default: ollama)
INFERENCE_BACKEND=ollama

# Ollama server URL (default: http://localhost:11434)
OLLAMA_HOST=http://localhost:11434

# Default model to use (default: tinyllama for testing, gpt-oss-20b for prod)
OLLAMA_MODEL=tinyllama

# =============================================================================
# Model Constraints
# =============================================================================

# Maximum tokens allowed in a request (default: 4096)
MAX_TOKENS_LIMIT=4096

# Minimum tokens (default: 1)
MIN_TOKENS=1

# Default max tokens if not specified (default: 512)
DEFAULT_MAX_TOKENS=512

# Default temperature if not specified (default: 0.7)
DEFAULT_TEMPERATURE=0.7

# Default top_p if not specified (default: 0.9)
DEFAULT_TOP_P=0.9

# =============================================================================
# Timeouts (milliseconds)
# =============================================================================

# Total request timeout (default: 60000 = 60s)
REQUEST_TIMEOUT_MS=60000

# Connection timeout (default: 5000 = 5s)
CONNECT_TIMEOUT_MS=5000

# =============================================================================
# Retry Configuration
# =============================================================================

# Maximum retry attempts for health checks (default: 3)
RETRY_MAX_ATTEMPTS=3

# Base delay between retries in ms (default: 1000)
RETRY_BASE_DELAY_MS=1000

# =============================================================================
# Cache Configuration
# =============================================================================

# Time to cache model list in seconds (default: 60)
MODELS_CACHE_TTL_SECONDS=60

# =============================================================================
# Logging
# =============================================================================

# Log level: debug, info, warning, error (default: info)
LOG_LEVEL=info