workspace-v1/orchestration/inventarios/MONITORING-CONFIG.yml
rckrdmrd ff3038f183 feat(orchestration): Add subagent token management system
Sistema completo de gestión de tokens para subagentes NEXUS v4.0:

Nuevas directivas SIMCO:
- SIMCO-SUBAGENTE.md: Protocolo para agentes en modo subagente
- SIMCO-CCA-SUBAGENTE.md: CCA ligero para subagentes (~1,500 tokens)
- SIMCO-CONTROL-TOKENS.md: Gestión de límites de tokens
- SIMCO-DELEGACION-PARALELA.md: Delegación paralela

Perfiles compact (~250 tokens cada uno):
- PERFIL-BACKEND-COMPACT.md
- PERFIL-FRONTEND-COMPACT.md
- PERFIL-DATABASE-COMPACT.md
- PERFIL-DEVOPS-COMPACT.md
- PERFIL-ML-COMPACT.md
- PERFIL-GENERIC-SUBAGENT.md

Templates de delegación escalonados:
- TEMPLATE-DELEGACION-MINIMA.md (~250 tokens)
- TEMPLATE-DELEGACION-ESTANDAR.md (~600 tokens)
- TEMPLATE-DELEGACION-COMPLETA.md (~1,800 tokens)

Nuevos perfiles especializados:
- PERFIL-MCP-ARCHITECT.md
- PERFIL-MCP-DEVELOPER.md
- PERFIL-RAG-ENGINEER.md
- PERFIL-CICD-SPECIALIST.md
- PERFIL-PRODUCTION-MANAGER.md
- PERFIL-MONITORING-AGENT.md
- PERFIL-SECRETS-MANAGER.md
- PERFIL-PROPAGATION-TRACKER.md

Checklists y documentación:
- CHECKLIST-PRE-DELEGACION.md
- Análisis y planes de implementación

Métricas de mejora:
- ~59% reducción de tokens por delegación
- Perfiles compact: 69% más ligeros
- CCA subagente: 85% más ligero

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-07 04:43:01 -06:00

457 lines
17 KiB
YAML

# ═══════════════════════════════════════════════════════════════════════════════
# CONFIGURACION DE MONITOREO - NEXUS WORKSPACE
# ═══════════════════════════════════════════════════════════════════════════════
#
# Version: 1.0.0
# Fecha: 2026-01-04
# Responsable: @PERFIL_MONITORING_AGENT
# Proposito: Configuracion centralizada de observabilidad
#
# ═══════════════════════════════════════════════════════════════════════════════
# Version of this monitoring inventory; quoted so it stays a string.
version: '1.0.0'
# Date of the last update; quoted so parsers do not coerce it to a date type.
fecha_actualizacion: '2026-01-04'
# Responsible agent profile; the leading "@" makes quoting mandatory.
responsable: '@PERFIL_MONITORING_AGENT'
# ─────────────────────────────────────────────────────────────────────────────────
# PROMETHEUS
# ─────────────────────────────────────────────────────────────────────────────────
# Prometheus server: endpoint, on-disk locations, scrape jobs and the
# Alertmanager instance it sends alerts to.
prometheus:
  url: "http://localhost:9090"
  config_path: "/etc/prometheus/prometheus.yml"
  data_path: "/var/lib/prometheus"
  retencion: "15d"

  scrape_config:
    global:
      scrape_interval: "15s"
      evaluation_interval: "15s"

    # NOTE(review): scrape_jobs is assumed to be a child of scrape_config
    # (sibling of global) - TODO confirm against the consumer of this file.
    scrape_jobs:
      # Node Exporter - server metrics
      - job_name: "node"
        static_configs:
          - targets: ["localhost:9100"]
            labels:
              instance: "vps-principal"

      # PostgreSQL Exporter
      - job_name: "postgresql"
        static_configs:
          - targets: ["localhost:9187"]
            labels:
              database: "gamilit_platform"

      # Nginx Exporter (if configured)
      - job_name: "nginx"
        static_configs:
          - targets: ["localhost:9113"]

      # Applications - Gamilit
      - job_name: "gamilit-api"
        metrics_path: "/metrics"
        static_configs:
          - targets: ["localhost:3006"]
            labels:
              project: "gamilit"
              type: "api"
              environment: "production"

      # Applications - Trading Platform (currently labelled staging; revisit
      # once it is in production)
      - job_name: "trading-api"
        metrics_path: "/metrics"
        static_configs:
          - targets: ["localhost:4000"]
            labels:
              project: "trading"
              type: "api"
              environment: "staging"

      - job_name: "trading-ml"
        metrics_path: "/metrics"
        static_configs:
          - targets: ["localhost:5000"]
            labels:
              project: "trading"
              type: "ml-service"
              environment: "staging"

  # Same host:port as the alertmanager.url entry of this file.
  alerting:
    alertmanagers:
      - static_configs:
          - targets: ["localhost:9093"]
# ─────────────────────────────────────────────────────────────────────────────────
# GRAFANA
# ─────────────────────────────────────────────────────────────────────────────────
# Grafana: endpoint, datasources, the dashboards expected to exist (grouped
# into system and application dashboards) and alert-UI settings.
grafana:
  # NOTE(review): Grafana's stock http_port is 3000; 9091 is presumably a
  # deliberate override in grafana.ini - confirm.
  url: "http://localhost:9091"
  # "${...}" values are placeholders, presumably substituted from the
  # environment by whatever consumes this file - verify.
  admin_user: "${GRAFANA_ADMIN_USER}"
  config_path: "/etc/grafana/grafana.ini"

  datasources:
    - name: "Prometheus"
      type: "prometheus"
      url: "http://localhost:9090"
      access: "proxy"
      is_default: true

    - name: "PostgreSQL-Gamilit"
      type: "postgres"
      url: "localhost:5432"
      database: "gamilit_platform"
      user: "${DB_GRAFANA_USER}"

  dashboards:
    sistema:
      - uid: "node-exporter"
        nombre: "Server Overview"
        descripcion: "CPU, Memory, Disk, Network"
        tags: ["infrastructure", "node"]

      - uid: "postgresql-overview"
        nombre: "PostgreSQL Performance"
        descripcion: "Conexiones, queries, locks"
        tags: ["database", "postgresql"]

      - uid: "nginx-overview"
        nombre: "Nginx Traffic"
        descripcion: "Requests, status codes, latency"
        tags: ["infrastructure", "nginx"]

    aplicaciones:
      - uid: "gamilit-api"
        nombre: "Gamilit API Dashboard"
        descripcion: "Requests, latency, errors, business metrics"
        tags: ["gamilit", "api"]
        paneles:
          - "Request Rate"
          - "Response Time P50/P95/P99"
          - "Error Rate"
          - "Active Users"
          - "Exercises Completed"

      - uid: "gamilit-gamification"
        nombre: "Gamilit Gamification"
        descripcion: "XP, logros, economia virtual"
        tags: ["gamilit", "business"]
        paneles:
          - "XP Awarded per Hour"
          - "Achievements Unlocked"
          - "ML Coins Circulation"
          - "Level Distribution"

      - uid: "trading-platform"
        nombre: "Trading Platform Overview"
        descripcion: "Trading activity, ML predictions"
        tags: ["trading", "api"]
        paneles:
          - "Trades per Minute"
          - "Prediction Accuracy"
          - "Model Latency"
          - "Portfolio Value"

  alertas_ui:
    folder: "Alertas NEXUS"
    evaluation_interval: "1m"
# ─────────────────────────────────────────────────────────────────────────────────
# ALERTMANAGER
# ─────────────────────────────────────────────────────────────────────────────────
# Alertmanager: endpoint, notification receivers, severity-based routing and
# inhibition rules.
alertmanager:
  url: "http://localhost:9093"
  config_path: "/etc/alertmanager/alertmanager.yml"

  receivers:
    - name: "slack-critical"
      slack_configs:
        - api_url: "${SLACK_WEBHOOK_CRITICAL}"
          channel: "#alertas-criticas"
          send_resolved: true
          title: "{{ .Status | toUpper }}: {{ .CommonLabels.alertname }}"
          text: "{{ .CommonAnnotations.summary }}"

    - name: "slack-warnings"
      slack_configs:
        - api_url: "${SLACK_WEBHOOK_WARNINGS}"
          channel: "#alertas-warnings"
          send_resolved: true

    - name: "email-critical"
      email_configs:
        - to: "${ALERT_EMAIL}"
          from: "alertas@isem.dev"
          smarthost: "smtp.gmail.com:587"
          require_tls: true

    # NOTE(review): no entry under routes references this receiver, so as
    # written critical alerts reach Slack and email but never PagerDuty -
    # confirm whether that is intentional.
    - name: "pagerduty-critical"
      pagerduty_configs:
        - service_key: "${PAGERDUTY_SERVICE_KEY}"
          severity: "critical"

  # "continue: true" lets a critical alert fall through to the next route, so
  # it is delivered to both the Slack and the email receiver.
  routes:
    - match:
        severity: "critical"
      receiver: "slack-critical"
      continue: true

    - match:
        severity: "critical"
      receiver: "email-critical"
      continue: true

    - match:
        severity: "warning"
      receiver: "slack-warnings"

  # NOTE(review): with "alertname" in equal, a warning is muted only by a
  # critical alert of the SAME name. The rules under alert_rules use distinct
  # names per severity (e.g. DiskSpaceLow vs DiskSpaceCritical), so this rule
  # presumably never matches - verify the intended label set.
  inhibit_rules:
    - source_match:
        severity: "critical"
      target_match:
        severity: "warning"
      equal: ["alertname", "instance"]
# ─────────────────────────────────────────────────────────────────────────────────
# REGLAS DE ALERTA
# ─────────────────────────────────────────────────────────────────────────────────
# Alert rule catalogue grouped by domain. Each entry holds the rule name
# (nombre), the PromQL expression (expr), the hold duration (for), a severity
# (severidad), a summary (resumen) and the suggested operator action (accion).
alert_rules:
  infraestructura:
    - nombre: "HighCPUUsage"
      # rate() rather than irate(): irate only uses the last two samples, so a
      # momentary dip can reset the "for" timer and keep the alert from firing.
      expr: "100 - (avg by(instance) (rate(node_cpu_seconds_total{mode='idle'}[5m])) * 100) > 80"
      for: "5m"
      severidad: "warning"
      resumen: "CPU usage > 80% for 5 minutes"
      accion: "Verificar procesos, considerar scaling"

    - nombre: "HighMemoryUsage"
      expr: "(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85"
      for: "5m"
      severidad: "warning"
      resumen: "Memory usage > 85%"
      accion: "Verificar memory leaks, reiniciar servicios"

    - nombre: "DiskSpaceLow"
      expr: "(1 - (node_filesystem_avail_bytes{mountpoint='/'} / node_filesystem_size_bytes{mountpoint='/'})) * 100 > 80"
      for: "10m"
      severidad: "warning"
      resumen: "Disk space < 20% free"
      accion: "Limpiar logs, backups antiguos"

    - nombre: "DiskSpaceCritical"
      expr: "(1 - (node_filesystem_avail_bytes{mountpoint='/'} / node_filesystem_size_bytes{mountpoint='/'})) * 100 > 95"
      for: "5m"
      severidad: "critical"
      resumen: "Disk space < 5% free"
      accion: "URGENTE: Liberar espacio inmediatamente"

  aplicaciones:
    - nombre: "HighErrorRate"
      # Both sides are aggregated per job before dividing. Dividing the raw
      # vectors pairs each 5xx series only with itself (identical label set),
      # which yields a ratio of 1 as soon as any 5xx response is recorded.
      expr: "sum by(job) (rate(http_requests_total{status=~'5..'}[5m])) / sum by(job) (rate(http_requests_total[5m])) > 0.05"
      for: "5m"
      severidad: "critical"
      resumen: "Error rate > 5%"
      accion: "Revisar logs, rollback si necesario"

    - nombre: "HighLatency"
      expr: "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 2"
      for: "5m"
      severidad: "warning"
      resumen: "P95 latency > 2 seconds"
      accion: "Revisar queries lentos, cache"

    - nombre: "ServiceDown"
      expr: "up{job=~'gamilit.*|trading.*'} == 0"
      for: "1m"
      severidad: "critical"
      resumen: "Service is down"
      accion: "Reiniciar servicio, verificar logs"

  base_de_datos:
    - nombre: "PostgreSQLDown"
      expr: "pg_up == 0"
      for: "1m"
      severidad: "critical"
      resumen: "PostgreSQL is down"
      accion: "Reiniciar PostgreSQL, verificar logs"

    - nombre: "TooManyConnections"
      expr: "pg_stat_activity_count > 80"
      for: "5m"
      severidad: "warning"
      resumen: "PostgreSQL connections > 80"
      accion: "Revisar connection pooling"

    - nombre: "SlowQueries"
      expr: "rate(pg_stat_statements_seconds_total[5m]) / rate(pg_stat_statements_calls_total[5m]) > 1"
      for: "10m"
      severidad: "warning"
      resumen: "Average query time > 1 second"
      accion: "Revisar queries, agregar indices"

  negocio:
    - nombre: "NoUserActivity"
      # sum() collapses the per-label series of the counter, so the alert fires
      # only when NO exercises are completed at all, not when a single
      # difficulty/subject combination happens to be idle.
      expr: "sum(rate(gamilit_exercises_completed_total[30m])) == 0"
      for: "1h"
      severidad: "warning"
      resumen: "No exercises completed in 1 hour"
      labels:
        project: "gamilit"
      accion: "Verificar si hay problema o es hora de baja actividad"

    - nombre: "AbnormalXPRate"
      expr: "rate(gamilit_xp_awarded_total[5m]) > 10000"
      for: "5m"
      severidad: "warning"
      resumen: "Abnormal XP award rate - possible exploit"
      labels:
        project: "gamilit"
      accion: "Verificar actividad sospechosa"
# ─────────────────────────────────────────────────────────────────────────────────
# UPTIME MONITORING (EXTERNO)
# ─────────────────────────────────────────────────────────────────────────────────
# External uptime checks: provider, plan and the list of monitored URLs with
# their check type, interval and alert channels.
uptime_monitoring:
  proveedor: "UptimeRobot"
  plan: "Free/Pro"

  monitores:
    - nombre: "Gamilit Website"
      url: "https://gamilit.com"
      tipo: "HTTP"
      intervalo: "5m"
      alertas: ["email", "slack"]

    - nombre: "Gamilit API Health"
      url: "https://api.gamilit.com/health"
      tipo: "HTTP"
      intervalo: "5m"
      # Presumably the text the response body must contain - confirm against
      # the provider's keyword-monitor settings.
      keyword: "ok"
      alertas: ["email", "slack"]

    - nombre: "Trading Staging"
      url: "https://trading-staging.isem.dev"
      tipo: "HTTP"
      intervalo: "5m"
      alertas: ["email"]
# ─────────────────────────────────────────────────────────────────────────────────
# LOGGING
# ─────────────────────────────────────────────────────────────────────────────────
# Logging: where application and system logs live, how they are rotated, the
# current centralisation status and the commands used to inspect them.
logging:
  aplicaciones:
    metodo: "PM2 logs + rotacion"
    ubicacion: "~/.pm2/logs/"
    rotacion:
      max_size: "10M"
      retain: 7

  sistema:
    journald: true
    ubicacion: "/var/log/"

  centralizacion:
    actual: "local"
    futuro: "considerar Loki + Grafana"

  # "{app}" is a placeholder for the PM2 process name.
  busqueda:
    comando: "pm2 logs {app} --lines 100"
    filtrar: "pm2 logs {app} | grep ERROR"
# ─────────────────────────────────────────────────────────────────────────────────
# METRICAS DE APLICACION (INSTRUMENTACION)
# ─────────────────────────────────────────────────────────────────────────────────
# Application instrumentation: per project, the metrics endpoint/port and the
# metrics exposed there (name, type, labels, histogram buckets). Several of
# these metric names are referenced by the expressions under alert_rules.
metricas_aplicacion:
  gamilit:
    endpoint: "/metrics"
    # Same port as the "gamilit-api" scrape target under prometheus.
    puerto: 3006
    libreria: "@nestjs/terminus + prom-client"
    metricas:
      - nombre: "http_requests_total"
        tipo: "counter"
        labels: ["method", "path", "status"]

      - nombre: "http_request_duration_seconds"
        tipo: "histogram"
        labels: ["method", "path"]
        buckets: [0.1, 0.5, 1, 2, 5]

      - nombre: "gamilit_exercises_completed_total"
        tipo: "counter"
        labels: ["difficulty", "subject"]

      - nombre: "gamilit_xp_awarded_total"
        tipo: "counter"
        labels: ["source"]

      - nombre: "gamilit_active_users"
        tipo: "gauge"

  trading_platform:
    endpoint: "/metrics"
    # Same port as the "trading-api" scrape target under prometheus.
    puerto: 4000
    metricas:
      - nombre: "trades_executed_total"
        tipo: "counter"
        labels: ["symbol", "side"]

      - nombre: "prediction_accuracy"
        tipo: "gauge"
        labels: ["model"]

      - nombre: "model_inference_duration_seconds"
        tipo: "histogram"
# ─────────────────────────────────────────────────────────────────────────────────
# RUNBOOKS (PROCEDIMIENTOS DE RESPUESTA)
# ─────────────────────────────────────────────────────────────────────────────────
# Runbooks: maps an alert name (matching a "nombre" under alert_rules) to its
# runbook document and the ordered response steps.
runbooks:
  ubicacion: "orchestration/runbooks/"

  documentos:
    - alerta: "ServiceDown"
      runbook: "RUNBOOK-SERVICE-DOWN.md"
      pasos:
        - "Verificar status: pm2 status"
        - "Revisar logs: pm2 logs {app}"
        - "Reiniciar: pm2 restart {app}"
        - "Verificar health: curl localhost:{port}/health"
        - "Si persiste: verificar recursos del sistema"

    - alerta: "HighErrorRate"
      runbook: "RUNBOOK-HIGH-ERROR-RATE.md"
      pasos:
        - "Identificar errores: grep ERROR en logs"
        - "Verificar ultimos deploys"
        - "Considerar rollback"
        - "Notificar a equipo de desarrollo"

    - alerta: "DiskSpaceCritical"
      runbook: "RUNBOOK-DISK-SPACE.md"
      pasos:
        - "Identificar uso: du -sh /*"
        - "Limpiar logs: pm2 flush"
        - "Limpiar Docker: docker system prune"
        - "Verificar backups locales"
        - "Considerar expansion de disco"
# ─────────────────────────────────────────────────────────────────────────────────
# REFERENCIAS
# ─────────────────────────────────────────────────────────────────────────────────
# Cross-references to the owning profile and to related files/folders.
referencias:
  perfil_responsable: "@PERFIL_MONITORING_AGENT"
  production_inventory: "orchestration/inventarios/PRODUCTION-INVENTORY.yml"
  # Same path as runbooks.ubicacion in this file.
  runbooks_folder: "orchestration/runbooks/"
# ═══════════════════════════════════════════════════════════════════════════════
# FIN DE CONFIGURACION
# ═══════════════════════════════════════════════════════════════════════════════