# Local LLM Agent - Docker Compose for Development
# Usage: docker-compose up -d
#
# This compose file includes:
# - Gateway (NestJS) - port 3160
# - Inference Engine (Python) - port 3161
# - Ollama (LLM Runtime) - port 11434
#
# To use a GPU with Ollama, uncomment the deploy section in the ollama service

services:
  # Ollama - LLM Runtime
  ollama:
    image: ollama/ollama:latest
    container_name: local-llm-ollama
    ports:
      - "11434:11434"
    volumes:
      - ollama-data:/root/.ollama
    networks:
      - llm-network
    restart: unless-stopped
    # Uncomment to use a GPU (requires nvidia-container-toolkit)
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: 1
    #           capabilities: [gpu]
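    # Alternative (a sketch, not part of this file's setup): keep this file
    # CPU-only and put the GPU reservation in a docker-compose.override.yml,
    # which Compose merges automatically when both files sit in the project root:
    #
    #   services:
    #     ollama:
    #       deploy:
    #         resources:
    #           reservations:
    #             devices:
    #               - driver: nvidia
    #                 count: 1
    #                 capabilities: [gpu]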
    healthcheck:
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s

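  # The healthcheck above is what the `condition: service_healthy` entries below
  # wait on: inference-engine starts only after `ollama list` succeeds inside
  # the ollama container.
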
  # Inference Engine (Python FastAPI)
  inference-engine:
    build:
      context: ./apps/inference-engine
      dockerfile: Dockerfile
    container_name: local-llm-inference
    ports:
      - "3161:3161"
    environment:
      - INFERENCE_PORT=3161
      - INFERENCE_HOST=0.0.0.0
      - INFERENCE_BACKEND=ollama
      - OLLAMA_HOST=http://ollama:11434
      - OLLAMA_MODEL=tinyllama
      - MODEL_NAME=tinyllama
      - LOG_LEVEL=info
    depends_on:
      ollama:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:3161/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

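  # The curl-based healthcheck above assumes curl is installed in the
  # inference-engine image. If it is not, one possible sketch is to reuse the
  # Python interpreter already present in a FastAPI image (an assumption about
  # the base image):
  #
  #   healthcheck:
  #     test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:3161/health')"]
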
  # API Gateway (NestJS)
  gateway:
    build:
      context: ./apps/gateway
      dockerfile: Dockerfile
    container_name: local-llm-gateway
    ports:
      - "3160:3160"
    environment:
      - NODE_ENV=development
      - GATEWAY_PORT=3160
      - INFERENCE_HOST=inference-engine
      - INFERENCE_PORT=3161
      - MODEL_NAME=tinyllama
      - TIER_SMALL_LATENCY_TARGET_MS=5000
      - TIER_MAIN_LATENCY_TARGET_MS=15000
    depends_on:
      inference-engine:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3160/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

networks:
  llm-network:
    driver: bridge

volumes:
  ollama-data:
    name: local-llm-ollama-data

# ============================================================================
# Usage:
# ============================================================================
#
# 1. Bring up the whole stack:
#    docker-compose up -d
#
# 2. Follow the logs:
#    docker-compose logs -f
#
# 3. Pull the model into Ollama (first run only):
#    docker exec -it local-llm-ollama ollama pull tinyllama
#
# 4. Test the endpoints:
#    curl http://localhost:3160/health
#    curl http://localhost:3160/v1/models
#
# 5. Chat completion:
#    curl -X POST http://localhost:3160/v1/chat/completions \
#      -H "Content-Type: application/json" \
#      -d '{"model":"tinyllama","messages":[{"role":"user","content":"Hi"}]}'
#
# 6. Stop the stack:
#    docker-compose down
#
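# Optional sanity checks (assuming the stack above is already running):
#
#    docker exec -it local-llm-ollama ollama list    # confirm the model was pulled
#    curl http://localhost:3161/health               # query the inference engine directly
#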
# ============================================================================
# Recommended models:
# ============================================================================
#
# For testing (CPU, fast):
# - tinyllama (1.1B params, ~700MB)
# - phi (2.7B params, ~1.6GB)
#
# For production (GPU recommended):
# - llama2 (7B params)
# - codellama (7B params)
# - mistral (7B params)
#
# Pull a model:
#    docker exec -it local-llm-ollama ollama pull <model-name>
#
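# Switching models (a sketch; `mistral` is only an example, any model listed
# above works the same way): change OLLAMA_MODEL and MODEL_NAME in the
# inference-engine environment and MODEL_NAME in the gateway environment, then:
#
#    docker exec -it local-llm-ollama ollama pull mistral
#    docker-compose up -d inference-engine gateway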