# Local LLM Agent - Monitoring Stack (Prometheus + Grafana)
# ===========================================================================
#
# Add this to your existing compose setup for monitoring:
#   docker-compose -f docker-compose.yml -f docker-compose.monitoring.yml up -d
#
# For vLLM production:
#   docker-compose -f docker-compose.vllm.yml -f docker-compose.monitoring.yml up -d
#
# ===========================================================================
services:
  # -------------------------------------------------------------------------
  # Prometheus - metrics collection and storage (30-day retention).
  # Config is mounted read-only from ./monitoring/prometheus.yml.
  # -------------------------------------------------------------------------
  prometheus:
    image: prom/prometheus:v2.48.0
    container_name: local-llm-prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./monitoring/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=30d'
      # --web.enable-lifecycle allows config reload via POST /-/reload.
      - '--web.enable-lifecycle'
      # SECURITY NOTE: the admin API permits unauthenticated TSDB deletion
      # over HTTP. Keep only if port 9090 is not exposed beyond localhost.
      - '--web.enable-admin-api'
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "wget", "--no-verbose", "--tries=1", "--spider", "http://localhost:9090/-/healthy"]
      interval: 30s
      timeout: 10s
      retries: 3

  # -------------------------------------------------------------------------
  # Grafana - dashboards and visualization.
  # Waits for Prometheus to be healthy before starting (depends_on condition).
  # Provisioning (datasources/dashboards) is mounted read-only.
  # -------------------------------------------------------------------------
  grafana:
    image: grafana/grafana:10.2.0
    container_name: local-llm-grafana
    ports:
      - "3000:3000"
    environment:
      # SECURITY NOTE: default admin/admin credentials - override for any
      # non-local deployment, e.g. GF_SECURITY_ADMIN_PASSWORD from a secret.
      - GF_SECURITY_ADMIN_USER=admin
      - GF_SECURITY_ADMIN_PASSWORD=admin
      - GF_USERS_ALLOW_SIGN_UP=false
      - GF_SERVER_ROOT_URL=http://localhost:3000
      - GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH=/var/lib/grafana/dashboards/local-llm.json
    volumes:
      - grafana-data:/var/lib/grafana
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards:ro
    depends_on:
      prometheus:
        condition: service_healthy
    networks:
      - llm-network
    restart: unless-stopped
    healthcheck:
      test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
      interval: 30s
      timeout: 10s
      retries: 3

  # -------------------------------------------------------------------------
  # Node Exporter - host system metrics (CPU, memory, disk, network).
  # Optional; remove this service if host metrics are not needed.
  # -------------------------------------------------------------------------
  node-exporter:
    image: prom/node-exporter:v1.7.0
    container_name: local-llm-node-exporter
    ports:
      - "9100:9100"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # FIX: /:/rootfs is mounted above but was never referenced, so the
      # filesystem collector read the container's own root. Point it at the
      # host mount so disk metrics reflect the host.
      - '--path.rootfs=/rootfs'
      # $$ escapes '$' for Compose variable interpolation.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    networks:
      - llm-network
    restart: unless-stopped

  # NVIDIA GPU Exporter (GPU metrics; requires nvidia-container-toolkit).
  # Uncomment if using GPU.
  # nvidia-gpu-exporter:
  #   image: utkuozdemir/nvidia_gpu_exporter:1.2.0
  #   container_name: local-llm-gpu-exporter
  #   ports:
  #     - "9835:9835"
  #   deploy:
  #     resources:
  #       reservations:
  #         devices:
  #           - driver: nvidia
  #             count: all
  #             capabilities: [gpu]
  #   networks:
  #     - llm-network
  #   restart: unless-stopped
# Shared network: must already exist (created by the base compose stack),
# hence `external: true` - this file never creates or tears it down.
networks:
  llm-network:
    external: true

# Named volumes persist metrics and dashboards across container restarts.
volumes:
  prometheus-data:
    name: local-llm-prometheus-data
  grafana-data:
    name: local-llm-grafana-data
# ============================================================================
# Usage:
# ============================================================================
#
# 1. Start monitoring with the development stack:
#    docker-compose -f docker-compose.yml -f docker-compose.monitoring.yml up -d
#
# 2. Start monitoring with the vLLM stack:
#    docker-compose -f docker-compose.vllm.yml -f docker-compose.monitoring.yml up -d
#
# 3. Access:
#    - Prometheus:    http://localhost:9090
#    - Grafana:       http://localhost:3000 (admin/admin)
#    - Node Exporter: http://localhost:9100/metrics
#
# 4. Check the agent's metrics endpoint:
#    curl http://localhost:3161/metrics
#
# 5. Useful Prometheus queries:
#    - Request rate:      rate(llm_requests_total[5m])
#    - P99 latency:       histogram_quantile(0.99, rate(llm_request_duration_seconds_bucket[5m]))
#    - Error rate:        rate(llm_errors_total[5m])
#    - Tokens per second: rate(llm_completion_tokens_total[5m])
#
# ============================================================================