# Local LLM Agent - Docker Compose for Development
# Usage: docker-compose up -d
#
# This compose file includes:
# - Gateway (NestJS) - port 3160
# - Inference Engine (Python) - port 3161
# - Ollama (LLM Runtime) - port 11434
#
# To use a GPU with Ollama, uncomment the deploy section in the ollama service

services:
  # Ollama - LLM Runtime
  ollama:
    image: ollama/ollama:latest
    container_name: local-llm-ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama-data:/root/.ollama
    networks:
      - llm-network
    restart: unless-stopped
    # Uncomment to use a GPU (requires nvidia-container-toolkit)
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: 1
    #           capabilities: [gpu]
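    # Alternative (a sketch, not part of this file's setup): keep this file
    # CPU-only and put the GPU reservation in a docker-compose.override.yml,
    # which Compose merges automatically when both files sit in the project root:
    #
    #   services:
    #     ollama:
    #       deploy:
    #         resources:
    #           reservations:
    #             devices:
    #               - driver: nvidia
    #                 count: 1
    #                 capabilities: [gpu]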
    healthcheck:
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s

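  # The healthcheck above is what the `condition: service_healthy` entries below
  # wait on: inference-engine starts only after `ollama list` succeeds inside
  # the ollama container.
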
  # Inference Engine (Python FastAPI)
  inference-engine:
    build:
      context: ./apps/inference-engine
      dockerfile: Dockerfile
    container_name: local-llm-inference
    ports:
      - "3161:3161"
    environment:
      - INFERENCE_PORT=3161
      - INFERENCE_HOST=0.0.0.0
      - INFERENCE_BACKEND=ollama
      - OLLAMA_HOST=http://ollama:11434
      - OLLAMA_MODEL=tinyllama
      - MODEL_NAME=tinyllama
      - LOG_LEVEL=info
    depends_on:
      ollama:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3161/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

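  # The curl-based healthcheck above assumes curl is installed in the
  # inference-engine image. If it is not, one possible sketch is to reuse the
  # Python interpreter already present in a FastAPI image (an assumption about
  # the base image):
  #
  #   healthcheck:
  #     test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3161/health')"]
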
  # API Gateway (NestJS)
  gateway:
    build:
      context: ./apps/gateway
      dockerfile: Dockerfile
    container_name: local-llm-gateway
    ports:
      - "3160:3160"
    environment:
      - NODE_ENV=development
      - GATEWAY_PORT=3160
      - INFERENCE_HOST=inference-engine
      - INFERENCE_PORT=3161
      - MODEL_NAME=tinyllama
      - TIER_SMALL_LATENCY_TARGET_MS=5000
      - TIER_MAIN_LATENCY_TARGET_MS=15000
    depends_on:
      inference-engine:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3160/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

networks:
  llm-network:
    driver: bridge

volumes:
  ollama-data:
    name: local-llm-ollama-data

# ============================================================================
# Usage:
# ============================================================================
#
# 1. Bring up the whole stack:
#    docker-compose up -d
#
# 2. Follow the logs:
#    docker-compose logs -f
#
# 3. Pull the model into Ollama (first run only):
#    docker exec -it local-llm-ollama ollama pull tinyllama
#
# 4. Test the endpoints:
#    curl http://localhost:3160/health
#    curl http://localhost:3160/v1/models
#
# 5. Chat completion:
#    curl -X POST http://localhost:3160/v1/chat/completions \
#      -H "Content-Type: application/json" \
#      -d '{"model":"tinyllama","messages":[{"role":"user","content":"Hi"}]}'
#
# 6. Stop the stack:
#    docker-compose down
#
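# Optional sanity checks (assuming the stack above is already running):
#
#    docker exec -it local-llm-ollama ollama list    # confirm the model was pulled
#    curl http://localhost:3161/health               # query the inference engine directly
#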
# ============================================================================
# Recommended models:
# ============================================================================
#
# For testing (CPU, fast):
# - tinyllama (1.1B params, ~700MB)
# - phi (2.7B params, ~1.6GB)
#
# For production (GPU recommended):
# - llama2 (7B params)
# - codellama (7B params)
# - mistral (7B params)
#
# Pull a model:
#    docker exec -it local-llm-ollama ollama pull <model-name>
#
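# Switching models (a sketch; `mistral` is only an example, any model listed
# above works the same way): change OLLAMA_MODEL and MODEL_NAME in the
# inference-engine environment and MODEL_NAME in the gateway environment, then:
#
#    docker exec -it local-llm-ollama ollama pull mistral
#    docker-compose up -d inference-engine gateway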