# local-llm-agent/docker-compose.yml
# Adrian Flores Cortes 3def230d58 Initial commit: local-llm-agent infrastructure project
# Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
# 2026-02-02 16:42:45 -06:00
# Local LLM Agent - Docker Compose for Development
# Usage: docker-compose up -d
#
# This compose file includes:
# - Gateway (NestJS)             - port 3160
# - Inference Engine (Python)    - port 3161
# - Ollama (LLM Runtime)         - port 11434
#
# To use a GPU with Ollama, uncomment the deploy section in the ollama service.
services:
  # Ollama - LLM Runtime
  ollama:
    image: ollama/ollama:latest
    container_name: local-llm-ollama
    ports:
      # Quoted host:container mapping (avoids YAML 1.1 sexagesimal parsing).
      - "11434:11434"
    volumes:
      # Persist pulled models across container restarts.
      - ollama-data:/root/.ollama
    networks:
      - llm-network
    restart: unless-stopped
    # Uncomment to use a GPU (requires nvidia-container-toolkit on the host).
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: 1
    #           capabilities: [gpu]
    healthcheck:
      # "ollama list" exits non-zero until the server is ready to serve requests.
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 5
      # Generous start period: first boot may initialize the model store.
      start_period: 30s
# Inference Engine (Python FastAPI)
inference-engine:
build:
context: ./apps/inference-engine
dockerfile: Dockerfile
container_name: local-llm-inference
ports:
- "3161:3161"
environment:
- INFERENCE_PORT=3161
- INFERENCE_HOST=0.0.0.0
- INFERENCE_BACKEND=ollama
- OLLAMA_HOST=http://ollama:11434
- OLLAMA_MODEL=tinyllama
- MODEL_NAME=tinyllama
- LOG_LEVEL=info
depends_on:
ollama:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3161/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
# API Gateway (NestJS)
gateway:
build:
context: ./apps/gateway
dockerfile: Dockerfile
container_name: local-llm-gateway
ports:
- "3160:3160"
environment:
- NODE_ENV=development
- GATEWAY_PORT=3160
- INFERENCE_HOST=inference-engine
- INFERENCE_PORT=3161
- MODEL_NAME=tinyllama
- TIER_SMALL_LATENCY_TARGET_MS=5000
- TIER_MAIN_LATENCY_TARGET_MS=15000
depends_on:
inference-engine:
condition: service_healthy
networks:
- llm-network
restart: unless-stopped
healthcheck:
test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3160/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
# Shared bridge network so services resolve each other by service name.
networks:
  llm-network:
    driver: bridge

# Named volume persisting Ollama's downloaded models.
volumes:
  ollama-data:
    name: local-llm-ollama-data
# ============================================================================
# Uso:
# ============================================================================
#
# 1. Levantar todo el stack:
# docker-compose up -d
#
# 2. Ver logs:
# docker-compose logs -f
#
# 3. Descargar modelo en Ollama (primera vez):
# docker exec -it local-llm-ollama ollama pull tinyllama
#
# 4. Probar endpoints:
# curl http://localhost:3160/health
# curl http://localhost:3160/v1/models
#
# 5. Chat completion:
# curl -X POST http://localhost:3160/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{"model":"tinyllama","messages":[{"role":"user","content":"Hi"}]}'
#
# 6. Detener:
# docker-compose down
#
# ============================================================================
# Modelos recomendados:
# ============================================================================
#
# Para testing (CPU, rápido):
# - tinyllama (1.1B params, ~700MB)
# - phi (2.7B params, ~1.6GB)
#
# Para producción (GPU recomendado):
# - llama2 (7B params)
# - codellama (7B params)
# - mistral (7B params)
#
# Descargar modelo:
# docker exec -it local-llm-ollama ollama pull <model-name>
#