# Local LLM Agent - Docker Compose for Development
# Usage: docker-compose up -d
#
# This compose file includes:
# - Gateway (NestJS) - port 3160
# - Inference Engine (Python) - port 3161
# - Ollama (LLM Runtime) - port 11434
#
# To use a GPU with Ollama, uncomment the deploy section in the ollama service

services:
  # Ollama - LLM Runtime
  ollama:
    image: ollama/ollama:latest
    container_name: local-llm-ollama
    ports:
      # Quoted to avoid YAML 1.1 sexagesimal parsing of HOST:CONTAINER pairs
      - "11434:11434"
    volumes:
      - ollama-data:/root/.ollama
    networks:
      - llm-network
    restart: unless-stopped
    # Uncomment to use GPU (requires nvidia-container-toolkit)
    # deploy:
    #   resources:
    #     reservations:
    #       devices:
    #         - driver: nvidia
    #           count: 1
    #           capabilities: [gpu]
    healthcheck:
      # "ollama list" succeeds only once the Ollama server is responding
      test: ["CMD", "ollama", "list"]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 30s

  # Inference Engine (Python FastAPI)
  inference-engine:
    build:
      context: ./apps/inference-engine
      dockerfile: Dockerfile
    container_name: local-llm-inference
    ports:
      - "3161:3161"
    environment:
      - INFERENCE_PORT=3161
      - INFERENCE_HOST=0.0.0.0
      - INFERENCE_BACKEND=ollama
      # Service-name DNS on the compose network, not localhost
      - OLLAMA_HOST=http://ollama:11434
      - OLLAMA_MODEL=tinyllama
      - MODEL_NAME=tinyllama
      - LOG_LEVEL=info
    depends_on:
      # Wait for Ollama's healthcheck, not just container start
      ollama:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      # NOTE(review): requires curl inside the inference image — verify the
      # Dockerfile installs it, otherwise the healthcheck never passes
      test: ["CMD", "curl", "-f", "http://localhost:3161/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  # API Gateway (NestJS)
  gateway:
    build:
      context: ./apps/gateway
      dockerfile: Dockerfile
    container_name: local-llm-gateway
    ports:
      - "3160:3160"
    environment:
      - NODE_ENV=development
      - GATEWAY_PORT=3160
      - INFERENCE_HOST=inference-engine
      - INFERENCE_PORT=3161
      - MODEL_NAME=tinyllama
      - TIER_SMALL_LATENCY_TARGET_MS=5000
      - TIER_MAIN_LATENCY_TARGET_MS=15000
    depends_on:
      # Gateway starts only after the inference engine reports healthy
      inference-engine:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      # NOTE(review): requires wget inside the gateway image — verify the
      # Dockerfile provides it (busybox wget in alpine-based images works)
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:3160/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

networks:
  llm-network:
    driver: bridge

volumes:
  ollama-data:
    name: local-llm-ollama-data

# ============================================================================
# Usage:
# ============================================================================
#
# 1. Bring up the whole stack:
#    docker-compose up -d
#
# 2. Follow logs:
#    docker-compose logs -f
#
# 3. Pull a model into Ollama (first run only):
#    docker exec -it local-llm-ollama ollama pull tinyllama
#
# 4. Probe the endpoints:
#    curl http://localhost:3160/health
#    curl http://localhost:3160/v1/models
#
# 5. Chat completion:
#    curl -X POST http://localhost:3160/v1/chat/completions \
#      -H "Content-Type: application/json" \
#      -d '{"model":"tinyllama","messages":[{"role":"user","content":"Hi"}]}'
#
# 6. Tear down:
#    docker-compose down
#
# ============================================================================
# Recommended models:
# ============================================================================
#
# For testing (CPU, fast):
# - tinyllama (1.1B params, ~700MB)
# - phi (2.7B params, ~1.6GB)
#
# For production (GPU recommended):
# - llama2 (7B params)
# - codellama (7B params)
# - mistral (7B params)
#
# Pull a model:
#   docker exec -it local-llm-ollama ollama pull <model>
#