Sistema completo de gestión de tokens para subagentes NEXUS v4.0: Nuevas directivas SIMCO: - SIMCO-SUBAGENTE.md: Protocolo para agentes en modo subagente - SIMCO-CCA-SUBAGENTE.md: CCA ligero para subagentes (~1,500 tokens) - SIMCO-CONTROL-TOKENS.md: Gestión de límites de tokens - SIMCO-DELEGACION-PARALELA.md: Delegación paralela Perfiles compact (~250 tokens cada uno): - PERFIL-BACKEND-COMPACT.md - PERFIL-FRONTEND-COMPACT.md - PERFIL-DATABASE-COMPACT.md - PERFIL-DEVOPS-COMPACT.md - PERFIL-ML-COMPACT.md - PERFIL-GENERIC-SUBAGENT.md Templates de delegación escalonados: - TEMPLATE-DELEGACION-MINIMA.md (~250 tokens) - TEMPLATE-DELEGACION-ESTANDAR.md (~600 tokens) - TEMPLATE-DELEGACION-COMPLETA.md (~1,800 tokens) Nuevos perfiles especializados: - PERFIL-MCP-ARCHITECT.md - PERFIL-MCP-DEVELOPER.md - PERFIL-RAG-ENGINEER.md - PERFIL-CICD-SPECIALIST.md - PERFIL-PRODUCTION-MANAGER.md - PERFIL-MONITORING-AGENT.md - PERFIL-SECRETS-MANAGER.md - PERFIL-PROPAGATION-TRACKER.md Checklists y documentación: - CHECKLIST-PRE-DELEGACION.md - Análisis y planes de implementación Métricas de mejora: - ~59% reducción de tokens por delegación - Perfiles compact: 69% más ligeros - CCA subagente: 85% más ligero 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
457 lines
17 KiB
YAML
457 lines
17 KiB
YAML
# ═══════════════════════════════════════════════════════════════════════════════
# CONFIGURACION DE MONITOREO - NEXUS WORKSPACE
# ═══════════════════════════════════════════════════════════════════════════════
#
# Version: 1.0.0
# Fecha: 2026-01-04
# Responsable: @PERFIL_MONITORING_AGENT
# Proposito: Configuracion centralizada de observabilidad
#
# ═══════════════════════════════════════════════════════════════════════════════
|
|
|
|
# Document metadata: config version, date of last update, and owning profile.
# All three stay quoted so they load as strings (a bare date would be typed as
# a date by YAML 1.1 loaders, and a leading '@' is a reserved indicator).
version: '1.0.0'
fecha_actualizacion: '2026-01-04'
responsable: '@PERFIL_MONITORING_AGENT'
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
# PROMETHEUS
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Prometheus server: where it lives, how long it keeps data, and the scrape
# jobs / Alertmanager endpoint it is expected to be configured with.
# NOTE(review): indentation was lost in this copy of the file. `scrape_jobs`
# and `alerting` are nested under `scrape_config` here (mirroring the sections
# of a prometheus.yml) -- confirm against whatever consumes this file.
prometheus:
  url: 'http://localhost:9090'
  config_path: '/etc/prometheus/prometheus.yml'
  data_path: '/var/lib/prometheus'
  retencion: '15d'

  scrape_config:
    global:
      scrape_interval: '15s'
      evaluation_interval: '15s'

    scrape_jobs:
      # Node exporter: host-level metrics
      - job_name: node
        static_configs:
          - targets:
              - 'localhost:9100'
            labels:
              instance: vps-principal

      # PostgreSQL exporter
      - job_name: postgresql
        static_configs:
          - targets:
              - 'localhost:9187'
            labels:
              database: gamilit_platform

      # Nginx exporter (only if it has been set up)
      - job_name: nginx
        static_configs:
          - targets:
              - 'localhost:9113'

      # Application: Gamilit API
      - job_name: gamilit-api
        metrics_path: /metrics
        static_configs:
          - targets:
              - 'localhost:3006'
            labels:
              project: gamilit
              type: api
              environment: production

      # Application: Trading Platform (labelled staging until it reaches prod)
      - job_name: trading-api
        metrics_path: /metrics
        static_configs:
          - targets:
              - 'localhost:4000'
            labels:
              project: trading
              type: api
              environment: staging

      - job_name: trading-ml
        metrics_path: /metrics
        static_configs:
          - targets:
              - 'localhost:5000'
            labels:
              project: trading
              type: ml-service
              environment: staging

    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - 'localhost:9093'
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
# GRAFANA
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Grafana: instance location, provisioned datasources, the dashboard
# catalogue (system vs. application), and alerting-UI settings.
# NOTE(review): nesting was reconstructed after indentation loss; `alertas_ui`
# is assumed to be a direct child of `grafana` -- confirm.
# NOTE(review): port 9091 is not Grafana's stock port; presumably remapped on
# purpose -- verify against the deployed grafana.ini.
grafana:
  url: 'http://localhost:9091'
  admin_user: '${GRAFANA_ADMIN_USER}'
  config_path: '/etc/grafana/grafana.ini'

  datasources:
    - name: Prometheus
      type: prometheus
      url: 'http://localhost:9090'
      access: proxy
      is_default: true

    - name: PostgreSQL-Gamilit
      type: postgres
      url: 'localhost:5432'
      database: gamilit_platform
      user: '${DB_GRAFANA_USER}'

  dashboards:
    # Infrastructure-level dashboards
    sistema:
      - uid: node-exporter
        nombre: 'Server Overview'
        descripcion: 'CPU, Memory, Disk, Network'
        tags:
          - infrastructure
          - node

      - uid: postgresql-overview
        nombre: 'PostgreSQL Performance'
        descripcion: 'Conexiones, queries, locks'
        tags:
          - database
          - postgresql

      - uid: nginx-overview
        nombre: 'Nginx Traffic'
        descripcion: 'Requests, status codes, latency'
        tags:
          - infrastructure
          - nginx

    # Per-application dashboards; `paneles` lists the panel titles
    aplicaciones:
      - uid: gamilit-api
        nombre: 'Gamilit API Dashboard'
        descripcion: 'Requests, latency, errors, business metrics'
        tags:
          - gamilit
          - api
        paneles:
          - 'Request Rate'
          - 'Response Time P50/P95/P99'
          - 'Error Rate'
          - 'Active Users'
          - 'Exercises Completed'

      - uid: gamilit-gamification
        nombre: 'Gamilit Gamification'
        descripcion: 'XP, logros, economia virtual'
        tags:
          - gamilit
          - business
        paneles:
          - 'XP Awarded per Hour'
          - 'Achievements Unlocked'
          - 'ML Coins Circulation'
          - 'Level Distribution'

      - uid: trading-platform
        nombre: 'Trading Platform Overview'
        descripcion: 'Trading activity, ML predictions'
        tags:
          - trading
          - api
        paneles:
          - 'Trades per Minute'
          - 'Prediction Accuracy'
          - 'Model Latency'
          - 'Portfolio Value'

  alertas_ui:
    folder: 'Alertas NEXUS'
    evaluation_interval: '1m'
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
# ALERTMANAGER
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Alertmanager: notification receivers, severity-based routing, and
# inhibition. Secrets (webhooks, e-mail target, PagerDuty key) are referenced
# through ${ENV_VAR} placeholders rather than stored here.
alertmanager:
  url: 'http://localhost:9093'
  config_path: '/etc/alertmanager/alertmanager.yml'

  receivers:
    - name: slack-critical
      slack_configs:
        - api_url: '${SLACK_WEBHOOK_CRITICAL}'
          channel: '#alertas-criticas'
          send_resolved: true
          title: '{{ .Status | toUpper }}: {{ .CommonLabels.alertname }}'
          text: '{{ .CommonAnnotations.summary }}'

    - name: slack-warnings
      slack_configs:
        - api_url: '${SLACK_WEBHOOK_WARNINGS}'
          channel: '#alertas-warnings'
          send_resolved: true

    - name: email-critical
      email_configs:
        - to: '${ALERT_EMAIL}'
          from: 'alertas@isem.dev'
          smarthost: 'smtp.gmail.com:587'
          require_tls: true

    # NOTE(review): no entry under `routes` targets this receiver, so as
    # written nothing is ever sent to PagerDuty -- confirm that is intended.
    - name: pagerduty-critical
      pagerduty_configs:
        - service_key: '${PAGERDUTY_SERVICE_KEY}'
          severity: critical

  # `continue: true` lets a critical alert fall through to the next route,
  # so it reaches both Slack and e-mail.
  routes:
    - match:
        severity: critical
      receiver: slack-critical
      continue: true

    - match:
        severity: critical
      receiver: email-critical
      continue: true

    - match:
        severity: warning
      receiver: slack-warnings

  # A firing critical alert mutes warnings that share the listed labels.
  # NOTE(review): every rule in alert_rules has a single severity per
  # alertname, so requiring an equal `alertname` means this never matches
  # (e.g. DiskSpaceCritical does not mute DiskSpaceLow) -- confirm intent.
  inhibit_rules:
    - source_match:
        severity: critical
      target_match:
        severity: warning
      equal:
        - alertname
        - instance
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
# REGLAS DE ALERTA
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Alert rule catalogue, grouped by domain. Each entry carries:
#   nombre    - alert name
#   expr      - PromQL condition
#   for       - how long the condition must hold before firing
#   severidad - severity used for routing (warning / critical)
#   resumen   - one-line summary
#   accion    - suggested first response
alert_rules:
  infraestructura:
    - nombre: "HighCPUUsage"
      # rate() instead of irate(): irate only uses the last two samples, so a
      # momentary dip can reset the `for` timer and delay or suppress the alert.
      expr: "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode='idle'}[5m])) * 100) > 80"
      for: "5m"
      severidad: "warning"
      resumen: "CPU usage > 80% for 5 minutes"
      accion: "Verificar procesos, considerar scaling"

    - nombre: "HighMemoryUsage"
      expr: "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85"
      for: "5m"
      severidad: "warning"
      resumen: "Memory usage > 85%"
      accion: "Verificar memory leaks, reiniciar servicios"

    - nombre: "DiskSpaceLow"
      expr: "(1 - (node_filesystem_avail_bytes{mountpoint='/'} / node_filesystem_size_bytes{mountpoint='/'})) * 100 > 80"
      for: "10m"
      severidad: "warning"
      resumen: "Disk space < 20% free"
      accion: "Limpiar logs, backups antiguos"

    - nombre: "DiskSpaceCritical"
      expr: "(1 - (node_filesystem_avail_bytes{mountpoint='/'} / node_filesystem_size_bytes{mountpoint='/'})) * 100 > 95"
      for: "5m"
      severidad: "critical"
      resumen: "Disk space < 5% free"
      accion: "URGENTE: Liberar espacio inmediatamente"

  aplicaciones:
    - nombre: "HighErrorRate"
      # Both sides are aggregated per job before dividing. Without the sum,
      # PromQL matches series one-to-one on identical label sets, so every 5xx
      # series was divided by itself (ratio 1) and the alert fired on any
      # sustained 5xx traffic regardless of the real error percentage.
      expr: "sum by(job) (rate(http_requests_total{status=~'5..'}[5m])) / sum by(job) (rate(http_requests_total[5m])) > 0.05"
      for: "5m"
      severidad: "critical"
      resumen: "Error rate > 5%"
      accion: "Revisar logs, rollback si necesario"

    - nombre: "HighLatency"
      expr: "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2"
      for: "5m"
      severidad: "warning"
      resumen: "P95 latency > 2 seconds"
      accion: "Revisar queries lentos, cache"

    - nombre: "ServiceDown"
      expr: "up{job=~'gamilit.*|trading.*'} == 0"
      for: "1m"
      severidad: "critical"
      resumen: "Service is down"
      accion: "Reiniciar servicio, verificar logs"

  base_de_datos:
    - nombre: "PostgreSQLDown"
      expr: "pg_up == 0"
      for: "1m"
      severidad: "critical"
      resumen: "PostgreSQL is down"
      accion: "Reiniciar PostgreSQL, verificar logs"

    - nombre: "TooManyConnections"
      # Summed per instance so the threshold applies to the total connection
      # count, matching the summary text.
      # NOTE(review): presumably the exporter splits this metric by
      # database/state labels -- verify against the deployed exporter.
      expr: "sum by(instance) (pg_stat_activity_count) > 80"
      for: "5m"
      severidad: "warning"
      resumen: "PostgreSQL connections > 80"
      accion: "Revisar connection pooling"

    - nombre: "SlowQueries"
      # NOTE(review): metric names depend on the exporter's custom queries;
      # confirm both series exist before relying on this rule.
      expr: "rate(pg_stat_statements_seconds_total[5m]) / rate(pg_stat_statements_calls_total[5m]) > 1"
      for: "10m"
      severidad: "warning"
      resumen: "Average query time > 1 second"
      accion: "Revisar queries, agregar indices"

  negocio:
    - nombre: "NoUserActivity"
      # Aggregated across label values: the counter is declared with
      # difficulty/subject labels, so the un-summed form fired whenever any
      # single combination was idle even while other exercises were completing.
      expr: "sum by(job) (rate(gamilit_exercises_completed_total[30m])) == 0"
      for: "1h"
      severidad: "warning"
      resumen: "No exercises completed in 1 hour"
      labels:
        project: "gamilit"
      accion: "Verificar si hay problema o es hora de baja actividad"

    - nombre: "AbnormalXPRate"
      expr: "rate(gamilit_xp_awarded_total[5m]) > 10000"
      for: "5m"
      severidad: "warning"
      resumen: "Abnormal XP award rate - possible exploit"
      labels:
        project: "gamilit"
      accion: "Verificar actividad sospechosa"
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
# UPTIME MONITORING (EXTERNO)
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
|
|
# External uptime checks. Each monitor names the URL probed, the probe type,
# the check interval, and the channels notified on failure.
uptime_monitoring:
  proveedor: UptimeRobot
  plan: 'Free/Pro'

  monitores:
    - nombre: 'Gamilit Website'
      url: 'https://gamilit.com'
      tipo: HTTP
      intervalo: '5m'
      alertas:
        - email
        - slack

    # `keyword` is the text this monitor is configured to look for
    - nombre: 'Gamilit API Health'
      url: 'https://api.gamilit.com/health'
      tipo: HTTP
      intervalo: '5m'
      keyword: ok
      alertas:
        - email
        - slack

    - nombre: 'Trading Staging'
      url: 'https://trading-staging.isem.dev'
      tipo: HTTP
      intervalo: '5m'
      alertas:
        - email
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
# LOGGING
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Logging: where application and system logs live, how they rotate, the
# current/future centralisation approach, and handy search commands.
# NOTE(review): nesting reconstructed after indentation loss; `rotacion` is
# assumed to belong to `aplicaciones` -- confirm.
logging:
  aplicaciones:
    metodo: 'PM2 logs + rotacion'
    ubicacion: '~/.pm2/logs/'
    rotacion:
      max_size: '10M'
      retain: 7

  sistema:
    journald: true
    ubicacion: '/var/log/'

  centralizacion:
    actual: local
    futuro: 'considerar Loki + Grafana'

  # {app} is a placeholder for the PM2 process name
  busqueda:
    comando: 'pm2 logs {app} --lines 100'
    filtrar: 'pm2 logs {app} | grep ERROR'
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
# METRICAS DE APLICACION (INSTRUMENTACION)
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Application instrumentation: for each app, the metrics endpoint/port and
# the metrics it declares (name, Prometheus type, label names, and histogram
# buckets where given).
metricas_aplicacion:
  gamilit:
    endpoint: /metrics
    puerto: 3006
    libreria: '@nestjs/terminus + prom-client'
    metricas:
      - nombre: http_requests_total
        tipo: counter
        labels:
          - method
          - path
          - status

      - nombre: http_request_duration_seconds
        tipo: histogram
        labels:
          - method
          - path
        buckets:
          - 0.1
          - 0.5
          - 1
          - 2
          - 5

      - nombre: gamilit_exercises_completed_total
        tipo: counter
        labels:
          - difficulty
          - subject

      - nombre: gamilit_xp_awarded_total
        tipo: counter
        labels:
          - source

      - nombre: gamilit_active_users
        tipo: gauge

  trading_platform:
    endpoint: /metrics
    puerto: 4000
    metricas:
      - nombre: trades_executed_total
        tipo: counter
        labels:
          - symbol
          - side

      - nombre: prediction_accuracy
        tipo: gauge
        labels:
          - model

      - nombre: model_inference_duration_seconds
        tipo: histogram
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
# RUNBOOKS (PROCEDIMIENTOS DE RESPUESTA)
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Runbooks: maps an alert name to its runbook document and an ordered list of
# response steps. Step text stays quoted because most entries contain ': ',
# which would otherwise be parsed as a nested mapping.
runbooks:
  ubicacion: 'orchestration/runbooks/'

  documentos:
    - alerta: ServiceDown
      runbook: 'RUNBOOK-SERVICE-DOWN.md'
      pasos:
        - 'Verificar status: pm2 status'
        - 'Revisar logs: pm2 logs {app}'
        - 'Reiniciar: pm2 restart {app}'
        - 'Verificar health: curl localhost:{port}/health'
        - 'Si persiste: verificar recursos del sistema'

    - alerta: HighErrorRate
      runbook: 'RUNBOOK-HIGH-ERROR-RATE.md'
      pasos:
        - 'Identificar errores: grep ERROR en logs'
        - 'Verificar ultimos deploys'
        - 'Considerar rollback'
        - 'Notificar a equipo de desarrollo'

    - alerta: DiskSpaceCritical
      runbook: 'RUNBOOK-DISK-SPACE.md'
      pasos:
        - 'Identificar uso: du -sh /*'
        - 'Limpiar logs: pm2 flush'
        - 'Limpiar Docker: docker system prune'
        - 'Verificar backups locales'
        - 'Considerar expansion de disco'
|
|
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
# REFERENCIAS
|
|
# ─────────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Cross-references: owning profile and related repository paths.
referencias:
  perfil_responsable: '@PERFIL_MONITORING_AGENT'
  production_inventory: 'orchestration/inventarios/PRODUCTION-INVENTORY.yml'
  runbooks_folder: 'orchestration/runbooks/'
|
|
|
|
# ═══════════════════════════════════════════════════════════════════════════════
|
|
# FIN DE CONFIGURACION
|
|
# ═══════════════════════════════════════════════════════════════════════════════
|