# workspace-v1/orchestration/inventarios/MONITORING-CONFIG.yml

# ═══════════════════════════════════════════════════════════════════════════════
# CONFIGURACION DE MONITOREO - NEXUS WORKSPACE
# ═══════════════════════════════════════════════════════════════════════════════
#
# Version: 1.0.0
# Fecha: 2026-01-04
# Responsable: @PERFIL_MONITORING_AGENT
# Proposito: Configuracion centralizada de observabilidad
#
# ═══════════════════════════════════════════════════════════════════════════════

# Document metadata. Every value is quoted so it stays a string: the date
# would otherwise be read as a timestamp by YAML 1.1 loaders, and a leading
# "@" is a reserved indicator in plain scalars.
version: '1.0.0'
fecha_actualizacion: '2026-01-04'
responsable: '@PERFIL_MONITORING_AGENT'

# ─────────────────────────────────────────────────────────────────────────────────
# PROMETHEUS
# ─────────────────────────────────────────────────────────────────────────────────

# Prometheus: server endpoint, on-disk paths, retention setting, the list of
# scrape jobs, and the Alertmanager target.
prometheus:
  url: 'http://localhost:9090'
  config_path: '/etc/prometheus/prometheus.yml'
  data_path: '/var/lib/prometheus'
  retencion: '15d'

  scrape_config:
    # Global scrape and rule-evaluation cadence.
    global:
      scrape_interval: '15s'
      evaluation_interval: '15s'

    # One entry per scrape job; every target is on localhost.
    scrape_jobs:
      # Node Exporter - server metrics
      - job_name: 'node'
        static_configs:
          - targets: ['localhost:9100']
            labels:
              instance: 'vps-principal'

      # PostgreSQL Exporter
      - job_name: 'postgresql'
        static_configs:
          - targets: ['localhost:9187']
            labels:
              database: 'gamilit_platform'

      # Nginx Exporter (if it is configured)
      - job_name: 'nginx'
        static_configs:
          - targets: ['localhost:9113']

      # Applications - Gamilit
      - job_name: 'gamilit-api'
        metrics_path: '/metrics'
        static_configs:
          - targets: ['localhost:3006']
            labels:
              project: 'gamilit'
              type: 'api'
              environment: 'production'

      # Applications - Trading Platform (once it is in production);
      # currently labelled as staging.
      - job_name: 'trading-api'
        metrics_path: '/metrics'
        static_configs:
          - targets: ['localhost:4000']
            labels:
              project: 'trading'
              type: 'api'
              environment: 'staging'

      # ML service of the trading project (also labelled as staging).
      - job_name: 'trading-ml'
        metrics_path: '/metrics'
        static_configs:
          - targets: ['localhost:5000']
            labels:
              project: 'trading'
              type: 'ml-service'
              environment: 'staging'

  # Alertmanager target; same host:port as alertmanager.url in this file.
  alerting:
    alertmanagers:
      - static_configs:
          - targets: ['localhost:9093']

# ─────────────────────────────────────────────────────────────────────────────────
# GRAFANA
# ─────────────────────────────────────────────────────────────────────────────────

# Grafana: endpoint, config path, datasources, dashboard catalogue, and the
# folder/interval used for UI-managed alerts.
grafana:
  url: 'http://localhost:9091'
  # ${...} values are placeholders; presumably resolved from the environment
  # by whatever consumes this file - TODO confirm.
  admin_user: '${GRAFANA_ADMIN_USER}'
  config_path: '/etc/grafana/grafana.ini'

  datasources:
    # Metrics datasource, flagged as the default one.
    - name: 'Prometheus'
      type: 'prometheus'
      url: 'http://localhost:9090'
      access: 'proxy'
      is_default: true

    # SQL datasource pointing at the gamilit_platform database.
    - name: 'PostgreSQL-Gamilit'
      type: 'postgres'
      url: 'localhost:5432'
      database: 'gamilit_platform'
      user: '${DB_GRAFANA_USER}'

  dashboards:
    # Infrastructure dashboards.
    sistema:
      - uid: 'node-exporter'
        nombre: 'Server Overview'
        descripcion: 'CPU, Memory, Disk, Network'
        tags: ['infrastructure', 'node']

      - uid: 'postgresql-overview'
        nombre: 'PostgreSQL Performance'
        descripcion: 'Conexiones, queries, locks'
        tags: ['database', 'postgresql']

      - uid: 'nginx-overview'
        nombre: 'Nginx Traffic'
        descripcion: 'Requests, status codes, latency'
        tags: ['infrastructure', 'nginx']

    # Per-project application dashboards, each with its list of panels.
    aplicaciones:
      - uid: 'gamilit-api'
        nombre: 'Gamilit API Dashboard'
        descripcion: 'Requests, latency, errors, business metrics'
        tags: ['gamilit', 'api']
        paneles:
          - 'Request Rate'
          - 'Response Time P50/P95/P99'
          - 'Error Rate'
          - 'Active Users'
          - 'Exercises Completed'

      - uid: 'gamilit-gamification'
        nombre: 'Gamilit Gamification'
        descripcion: 'XP, logros, economia virtual'
        tags: ['gamilit', 'business']
        paneles:
          - 'XP Awarded per Hour'
          - 'Achievements Unlocked'
          - 'ML Coins Circulation'
          - 'Level Distribution'

      - uid: 'trading-platform'
        nombre: 'Trading Platform Overview'
        descripcion: 'Trading activity, ML predictions'
        tags: ['trading', 'api']
        paneles:
          - 'Trades per Minute'
          - 'Prediction Accuracy'
          - 'Model Latency'
          - 'Portfolio Value'

  # Folder and evaluation interval for alerts managed from the Grafana UI.
  alertas_ui:
    folder: 'Alertas NEXUS'
    evaluation_interval: '1m'

# ─────────────────────────────────────────────────────────────────────────────────
# ALERTMANAGER
# ─────────────────────────────────────────────────────────────────────────────────

# Alertmanager: endpoint, notification receivers, routing by severity, and
# inhibition rules. Secrets are referenced through ${...} placeholders and
# are never stored in this file.
alertmanager:
  url: "http://localhost:9093"
  config_path: "/etc/alertmanager/alertmanager.yml"

  # Notification channels. Each name is referenced from `routes` below.
  receivers:
    - name: "slack-critical"
      slack_configs:
        - api_url: "${SLACK_WEBHOOK_CRITICAL}"
          channel: "#alertas-criticas"
          send_resolved: true
          title: "{{ .Status | toUpper }}: {{ .CommonLabels.alertname }}"
          text: "{{ .CommonAnnotations.summary }}"

    - name: "slack-warnings"
      slack_configs:
        - api_url: "${SLACK_WEBHOOK_WARNINGS}"
          channel: "#alertas-warnings"
          send_resolved: true

    - name: "email-critical"
      email_configs:
        - to: "${ALERT_EMAIL}"
          from: "alertas@isem.dev"
          smarthost: "smtp.gmail.com:587"
          require_tls: true

    - name: "pagerduty-critical"
      pagerduty_configs:
        - service_key: "${PAGERDUTY_SERVICE_KEY}"
          severity: "critical"

  # Critical alerts fan out to Slack, e-mail and PagerDuty: `continue: true`
  # lets evaluation fall through to the next matching route instead of
  # stopping at the first match. Warnings go to Slack only.
  routes:
    - match:
        severity: "critical"
      receiver: "slack-critical"
      continue: true

    - match:
        severity: "critical"
      receiver: "email-critical"
      continue: true

    # Previously missing: the pagerduty-critical receiver was defined but no
    # route referenced it, so it could never be notified.
    - match:
        severity: "critical"
      receiver: "pagerduty-critical"

    - match:
        severity: "warning"
      receiver: "slack-warnings"

  # A firing critical alert mutes warning alerts that share the same
  # alertname and instance.
  # NOTE(review): the rules under alert_rules use a different alertname per
  # severity (e.g. DiskSpaceLow vs DiskSpaceCritical), so with `alertname` in
  # `equal` this rule looks like it never matches them - confirm intent.
  inhibit_rules:
    - source_match:
        severity: "critical"
      target_match:
        severity: "warning"
      equal: ["alertname", "instance"]

# ─────────────────────────────────────────────────────────────────────────────────
# REGLAS DE ALERTA
# ─────────────────────────────────────────────────────────────────────────────────

# Alert rule catalogue, grouped by domain. Each rule carries:
#   nombre    - alert name
#   expr      - PromQL condition
#   for       - how long the condition must hold before the alert fires
#   severidad - severity label ("warning" or "critical")
#   resumen   - short summary text
#   accion    - suggested first response
alert_rules:
  # Host-level rules based on node exporter metrics.
  infraestructura:
    # rate() rather than irate(): irate only uses the last two samples of the
    # window, so brief dips keep resetting the `for` timer of an alert.
    - nombre: "HighCPUUsage"
      expr: "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode='idle'}[5m])) * 100) > 80"
      for: "5m"
      severidad: "warning"
      resumen: "CPU usage > 80% for 5 minutes"
      accion: "Verificar procesos, considerar scaling"

    - nombre: "HighMemoryUsage"
      expr: "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85"
      for: "5m"
      severidad: "warning"
      resumen: "Memory usage > 85%"
      accion: "Verificar memory leaks, reiniciar servicios"

    # Root filesystem usage: warning above 80% used, critical above 95% used.
    - nombre: "DiskSpaceLow"
      expr: "(1 - (node_filesystem_avail_bytes{mountpoint='/'} / node_filesystem_size_bytes{mountpoint='/'})) * 100 > 80"
      for: "10m"
      severidad: "warning"
      resumen: "Disk space < 20% free"
      accion: "Limpiar logs, backups antiguos"

    - nombre: "DiskSpaceCritical"
      expr: "(1 - (node_filesystem_avail_bytes{mountpoint='/'} / node_filesystem_size_bytes{mountpoint='/'})) * 100 > 95"
      for: "5m"
      severidad: "critical"
      resumen: "Disk space < 5% free"
      accion: "URGENTE: Liberar espacio inmediatamente"

  # HTTP-level rules for the application scrape jobs.
  aplicaciones:
    # Both sides are aggregated by job before dividing. Dividing the raw
    # vectors pairs each 5xx series only with itself (identical label sets),
    # which yields a ratio of 1 whenever any 5xx request occurs.
    - nombre: "HighErrorRate"
      expr: "sum by(job) (rate(http_requests_total{status=~'5..'}[5m])) / sum by(job) (rate(http_requests_total[5m])) > 0.05"
      for: "5m"
      severidad: "critical"
      resumen: "Error rate > 5%"
      accion: "Revisar logs, rollback si necesario"

    - nombre: "HighLatency"
      expr: "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2"
      for: "5m"
      severidad: "warning"
      resumen: "P95 latency > 2 seconds"
      accion: "Revisar queries lentos, cache"

    # Matches every scrape job whose name starts with "gamilit" or "trading".
    - nombre: "ServiceDown"
      expr: "up{job=~'gamilit.*|trading.*'} == 0"
      for: "1m"
      severidad: "critical"
      resumen: "Service is down"
      accion: "Reiniciar servicio, verificar logs"

  # PostgreSQL rules based on the PostgreSQL exporter metrics.
  base_de_datos:
    - nombre: "PostgreSQLDown"
      expr: "pg_up == 0"
      for: "1m"
      severidad: "critical"
      resumen: "PostgreSQL is down"
      accion: "Reiniciar PostgreSQL, verificar logs"

    # NOTE(review): if the exporter exposes this metric per database/state,
    # the threshold is applied per series rather than to the total - confirm
    # whether a sum() is needed.
    - nombre: "TooManyConnections"
      expr: "pg_stat_activity_count > 80"
      for: "5m"
      severidad: "warning"
      resumen: "PostgreSQL connections > 80"
      accion: "Revisar connection pooling"

    - nombre: "SlowQueries"
      expr: "rate(pg_stat_statements_seconds_total[5m]) / rate(pg_stat_statements_calls_total[5m]) > 1"
      for: "10m"
      severidad: "warning"
      resumen: "Average query time > 1 second"
      accion: "Revisar queries, agregar indices"

  # Business-level rules on Gamilit application counters.
  negocio:
    # Summed across all series: the counter is labelled by difficulty and
    # subject, so without sum() the alert would fire whenever any single
    # combination was idle instead of when there is no activity at all.
    - nombre: "NoUserActivity"
      expr: "sum(rate(gamilit_exercises_completed_total[30m])) == 0"
      for: "1h"
      severidad: "warning"
      resumen: "No exercises completed in 1 hour"
      labels:
        project: "gamilit"
      accion: "Verificar si hay problema o es hora de baja actividad"

    - nombre: "AbnormalXPRate"
      expr: "rate(gamilit_xp_awarded_total[5m]) > 10000"
      for: "5m"
      severidad: "warning"
      resumen: "Abnormal XP award rate - possible exploit"
      labels:
        project: "gamilit"
      accion: "Verificar actividad sospechosa"

# ─────────────────────────────────────────────────────────────────────────────────
# UPTIME MONITORING (EXTERNO)
# ─────────────────────────────────────────────────────────────────────────────────

# External uptime checks: provider, plan, and the list of monitored URLs.
uptime_monitoring:
  proveedor: 'UptimeRobot'
  plan: 'Free/Pro'

  # Each monitor is an HTTP check on a 5-minute interval with its own
  # notification channels.
  monitores:
    - nombre: 'Gamilit Website'
      url: 'https://gamilit.com'
      tipo: 'HTTP'
      intervalo: '5m'
      alertas: ['email', 'slack']

    # This check additionally carries a keyword ("ok").
    - nombre: 'Gamilit API Health'
      url: 'https://api.gamilit.com/health'
      tipo: 'HTTP'
      intervalo: '5m'
      keyword: 'ok'
      alertas: ['email', 'slack']

    - nombre: 'Trading Staging'
      url: 'https://trading-staging.isem.dev'
      tipo: 'HTTP'
      intervalo: '5m'
      alertas: ['email']

# ─────────────────────────────────────────────────────────────────────────────────
# LOGGING
# ─────────────────────────────────────────────────────────────────────────────────

# Logging: where application and system logs live, the current
# centralization status, and the commands used to search them.
logging:
  # Application logs are handled through PM2 with rotation settings.
  aplicaciones:
    metodo: 'PM2 logs + rotacion'
    ubicacion: '~/.pm2/logs/'
    rotacion:
      max_size: '10M'
      retain: 7

  sistema:
    journald: true
    ubicacion: '/var/log/'

  # Logs are local today; Loki + Grafana is noted as a future option.
  centralizacion:
    actual: 'local'
    futuro: 'considerar Loki + Grafana'

  # Command templates; {app} stands for the PM2 application name.
  busqueda:
    comando: 'pm2 logs {app} --lines 100'
    filtrar: 'pm2 logs {app} | grep ERROR'

# ─────────────────────────────────────────────────────────────────────────────────
# METRICAS DE APLICACION (INSTRUMENTACION)
# ─────────────────────────────────────────────────────────────────────────────────

# Application instrumentation: for each project, the metrics endpoint and
# port plus the catalogue of exposed metrics (name, type, labels).
metricas_aplicacion:
  gamilit:
    endpoint: '/metrics'
    puerto: 3006
    libreria: '@nestjs/terminus + prom-client'
    metricas:
      - nombre: 'http_requests_total'
        tipo: 'counter'
        labels: ['method', 'path', 'status']

      # Histogram with explicit bucket boundaries.
      - nombre: 'http_request_duration_seconds'
        tipo: 'histogram'
        labels: ['method', 'path']
        buckets: [0.1, 0.5, 1, 2, 5]

      - nombre: 'gamilit_exercises_completed_total'
        tipo: 'counter'
        labels: ['difficulty', 'subject']

      - nombre: 'gamilit_xp_awarded_total'
        tipo: 'counter'
        labels: ['source']

      # Gauge declared without labels.
      - nombre: 'gamilit_active_users'
        tipo: 'gauge'

  trading_platform:
    endpoint: '/metrics'
    puerto: 4000
    metricas:
      - nombre: 'trades_executed_total'
        tipo: 'counter'
        labels: ['symbol', 'side']

      - nombre: 'prediction_accuracy'
        tipo: 'gauge'
        labels: ['model']

      # Histogram declared without labels or buckets.
      - nombre: 'model_inference_duration_seconds'
        tipo: 'histogram'

# ─────────────────────────────────────────────────────────────────────────────────
# RUNBOOKS (PROCEDIMIENTOS DE RESPUESTA)
# ─────────────────────────────────────────────────────────────────────────────────

# Runbooks: folder holding the response procedures, plus, per alert name,
# the runbook file and its ordered list of steps.
runbooks:
  ubicacion: 'orchestration/runbooks/'

  # {app} and {port} in the steps are placeholders for the affected service.
  documentos:
    - alerta: 'ServiceDown'
      runbook: 'RUNBOOK-SERVICE-DOWN.md'
      pasos:
        - 'Verificar status: pm2 status'
        - 'Revisar logs: pm2 logs {app}'
        - 'Reiniciar: pm2 restart {app}'
        - 'Verificar health: curl localhost:{port}/health'
        - 'Si persiste: verificar recursos del sistema'

    - alerta: 'HighErrorRate'
      runbook: 'RUNBOOK-HIGH-ERROR-RATE.md'
      pasos:
        - 'Identificar errores: grep ERROR en logs'
        - 'Verificar ultimos deploys'
        - 'Considerar rollback'
        - 'Notificar a equipo de desarrollo'

    - alerta: 'DiskSpaceCritical'
      runbook: 'RUNBOOK-DISK-SPACE.md'
      pasos:
        - 'Identificar uso: du -sh /*'
        - 'Limpiar logs: pm2 flush'
        - 'Limpiar Docker: docker system prune'
        - 'Verificar backups locales'
        - 'Considerar expansion de disco'

# ─────────────────────────────────────────────────────────────────────────────────
# REFERENCIAS
# ─────────────────────────────────────────────────────────────────────────────────

# Cross-references: owning profile, the production inventory file, and the
# runbooks folder (same path as runbooks.ubicacion).
referencias:
  perfil_responsable: '@PERFIL_MONITORING_AGENT'
  production_inventory: 'orchestration/inventarios/PRODUCTION-INVENTORY.yml'
  runbooks_folder: 'orchestration/runbooks/'

# ═══════════════════════════════════════════════════════════════════════════════
# FIN DE CONFIGURACION
# ═══════════════════════════════════════════════════════════════════════════════