erp-core/docs/07-devops/scripts/health-check.sh

265 lines
8.0 KiB
Bash
Executable File

#!/bin/bash
# =====================================================
# ERP GENERIC - Health Check Script
# Comprehensive system health check
#
# Environment overrides (default in parentheses):
#   BACKEND_URL    (http://localhost:3000)
#   FRONTEND_URL   (http://localhost:5173)
#   POSTGRES_HOST  (postgres)
#   POSTGRES_PORT  (5432)
#   POSTGRES_DB    (erp_generic)
#   POSTGRES_USER  (erp_user)
#   REDIS_HOST     (redis)
#   REDIS_PORT     (6379)
#   REDIS_PASSWORD (empty)
# =====================================================

# Strict mode: stop on unhandled errors, unset variables and failed pipelines.
set -o errexit -o nounset -o pipefail

# --- Application endpoints -----------------------------------------------
: "${BACKEND_URL:=http://localhost:3000}"
: "${FRONTEND_URL:=http://localhost:5173}"

# --- PostgreSQL (read from the POSTGRES_* variables) ------------------------
DB_HOST=${POSTGRES_HOST:-postgres}
DB_PORT=${POSTGRES_PORT:-5432}
DB_NAME=${POSTGRES_DB:-erp_generic}
DB_USER=${POSTGRES_USER:-erp_user}

# --- Redis -----------------------------------------------------------------
: "${REDIS_HOST:=redis}"
: "${REDIS_PORT:=6379}"
: "${REDIS_PASSWORD:=}"

# --- ANSI color sequences (expanded later by the printing helpers) ----------
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # reset / no color

# Overall result: 0 = healthy, 1 = unhealthy.
EXIT_CODE=0
# Helper functions
#######################################
# Print a section banner: an empty line, then the title framed by two rules.
# Arguments: $1 - section title, printed verbatim
# Outputs:   four lines on stdout
#######################################
print_header() {
  local title=$1
  local rule="=========================================="
  # The title goes through printf '%s' so that a value such as "-n" or "-e"
  # is printed as text instead of being consumed as an echo option.
  printf '\n%s\n%s\n%s\n' "$rule" "$title" "$rule"
}
#######################################
# Report a passing check by printing the message in green.
# Globals:   GREEN, NC (read)
# Arguments: $1 - message; backslash escapes in it are expanded
# Outputs:   one colored line on stdout
#######################################
check_success() {
  local msg=$1
  printf '%b\n' "${GREEN}${msg}${NC}"
}
#######################################
# Report a non-fatal finding by printing the message in yellow.
# The overall exit code is left untouched.
# Globals:   YELLOW, NC (read)
# Arguments: $1 - message; backslash escapes in it are expanded
# Outputs:   one colored line on stdout
#######################################
check_warning() {
  local msg=$1
  printf '%b\n' "${YELLOW}${msg}${NC}"
}
#######################################
# Report a failed check: print the message in red and mark the run unhealthy.
# Globals:   RED, NC (read); EXIT_CODE (set to 1)
# Arguments: $1 - message; backslash escapes in it are expanded
# Outputs:   one colored line on stdout
#######################################
check_failure() {
  local msg=$1
  printf '%b\n' "${RED}${msg}${NC}"
  EXIT_CODE=1
}
# Main health check
echo "===== ERP Generic Health Check ====="
echo "Started at: $(date)"
echo ""

# 1. Backend API Health Check
# The /health endpoint is requested exactly once and its body is reused for
# the detailed checks. Doing the assignment inside the `if` condition keeps a
# curl failure from tripping `set -e` and aborting the remaining checks.
print_header "1. Backend API"
if HEALTH_RESPONSE=$(curl -f -s --max-time 5 "${BACKEND_URL}/health" 2> /dev/null); then
  check_success "Backend API is responding"
  echo "Response: $HEALTH_RESPONSE"

  # Dependency status is only parsed when jq is available.
  if command -v jq &> /dev/null; then
    # jq prints "null" for a missing key and fails on non-JSON input; in both
    # cases the value is not "up", so the dependency is reported as FAILED.
    DB_STATUS=$(jq -r '.checks.database.status' <<< "$HEALTH_RESPONSE" 2> /dev/null || echo "unknown")
    REDIS_STATUS=$(jq -r '.checks.redis.status' <<< "$HEALTH_RESPONSE" 2> /dev/null || echo "unknown")

    if [ "$DB_STATUS" = "up" ]; then
      check_success "Database connection: OK"
    else
      check_failure "Database connection: FAILED"
    fi

    if [ "$REDIS_STATUS" = "up" ]; then
      check_success "Redis connection: OK"
    else
      check_failure "Redis connection: FAILED"
    fi
  fi
else
  check_failure "Backend API is not responding"
fi
# 2. Frontend Health Check
# A single GET against FRONTEND_URL; only curl's exit status matters, the
# response body and any error output are discarded.
print_header "2. Frontend"
frontend_rc=0
curl --fail --silent --max-time 5 "${FRONTEND_URL}" > /dev/null 2>&1 || frontend_rc=$?
case "$frontend_rc" in
  0) check_success "Frontend is accessible" ;;
  *) check_failure "Frontend is not accessible" ;;
esac
# 3. PostgreSQL Database Check
print_header "3. PostgreSQL Database"

# The password is handed to psql through the environment. POSTGRES_PASSWORD
# is treated as optional: an unguarded expansion would abort the whole script
# under `set -u` when the variable is not set.
PGPASSWORD="${POSTGRES_PASSWORD:-}"
export PGPASSWORD

#######################################
# Run one SQL statement against $DB_NAME and print the bare result.
# Globals:   DB_HOST, DB_PORT, DB_USER, DB_NAME (read)
# Arguments: $1 - SQL text
# Outputs:   unaligned, tuples-only query result on stdout; stderr discarded
# Returns:   psql's exit status
#######################################
pg_scalar() {
  # -w: never prompt for a password, so an unattended run cannot block.
  psql -w -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -tAc "$1" 2> /dev/null
}

if pg_isready -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" > /dev/null 2>&1; then
  check_success "PostgreSQL is accepting connections"

  # Check database exists: compare the first column of the database list
  # exactly. awk consumes all of its input, so psql is never cut short, and
  # a name such as "erp" no longer matches "erp-test".
  if psql -w -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -lqtA \
    | awk -F'|' -v db="$DB_NAME" '$1 == db { found = 1 } END { exit !found }'; then
    check_success "Database '$DB_NAME' exists"

    # Check connection count. current_database() is the database we are
    # connected to (-d "$DB_NAME"), so no shell value is spliced into SQL.
    CONNECTION_COUNT=$(pg_scalar "SELECT count(*) FROM pg_stat_activity WHERE datname = current_database();" || echo "0")
    MAX_CONNECTIONS=$(pg_scalar "SHOW max_connections;" || echo "100")
    # Fall back to the defaults if a query returned something non-numeric,
    # otherwise the arithmetic below would be a fatal syntax error.
    [[ "$CONNECTION_COUNT" =~ ^[0-9]+$ ]] || CONNECTION_COUNT=0
    [[ "$MAX_CONNECTIONS" =~ ^[0-9]+$ ]] || MAX_CONNECTIONS=100
    echo "Active connections: $CONNECTION_COUNT / $MAX_CONNECTIONS"
    if [ "$CONNECTION_COUNT" -gt $((MAX_CONNECTIONS * 80 / 100)) ]; then
      check_warning "Connection pool is >80% utilized"
    else
      check_success "Connection pool is healthy"
    fi

    # Check table count (everything outside the system schemas)
    TABLE_COUNT=$(pg_scalar "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema NOT IN ('pg_catalog', 'information_schema');" || echo "0")
    [[ "$TABLE_COUNT" =~ ^[0-9]+$ ]] || TABLE_COUNT=0
    echo "Tables in database: $TABLE_COUNT"
    if [ "$TABLE_COUNT" -gt 0 ]; then
      check_success "Database has tables"
    else
      check_warning "Database has no tables (not initialized?)"
    fi

    # Check the deadlock counter kept in pg_stat_database
    DEADLOCK_COUNT=$(pg_scalar "SELECT deadlocks FROM pg_stat_database WHERE datname = current_database();" || echo "0")
    [[ "$DEADLOCK_COUNT" =~ ^[0-9]+$ ]] || DEADLOCK_COUNT=0
    if [ "$DEADLOCK_COUNT" -gt 0 ]; then
      check_warning "Database has $DEADLOCK_COUNT deadlocks recorded"
    fi
  else
    check_failure "Database '$DB_NAME' does not exist"
  fi
else
  check_failure "PostgreSQL is not accepting connections"
fi
# 4. Redis Cache Check
print_header "4. Redis Cache"

# The redis-cli invocation is kept as an array so that every argument, in
# particular a password containing spaces or glob characters, stays one word.
REDIS_CLI=(redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT")
if [ -n "$REDIS_PASSWORD" ]; then
  # NOTE(review): -a puts the password on the command line, where it is
  # visible in the process list; consider an environment-based alternative.
  REDIS_CLI+=(-a "$REDIS_PASSWORD")
fi

#######################################
# Extract one field from `redis-cli INFO` output.
# Arguments: $1 - exact field name, e.g. redis_version
# Inputs:    INFO text ("name:value" lines, possibly CRLF-terminated) on stdin
# Outputs:   the value of the first matching field; nothing if it is absent
#######################################
redis_info_value() {
  awk -F: -v key="$1" '
    !seen && $1 == key {
      value = $2
      sub(/\r$/, "", value)
      print value
      seen = 1
    }
  '
}

# Require the actual PONG reply rather than only a zero exit status.
if [ "$("${REDIS_CLI[@]}" ping 2> /dev/null)" = "PONG" ]; then
  check_success "Redis is responding"

  # Check Redis info. `|| true` keeps a failed or incomplete INFO reply from
  # aborting the script under `set -e` / `pipefail`.
  REDIS_VERSION=$("${REDIS_CLI[@]}" INFO server 2> /dev/null | redis_info_value redis_version || true)
  echo "Redis version: ${REDIS_VERSION:-unknown}"
  CONNECTED_CLIENTS=$("${REDIS_CLI[@]}" INFO clients 2> /dev/null | redis_info_value connected_clients || true)
  echo "Connected clients: ${CONNECTED_CLIENTS:-unknown}"
  USED_MEMORY=$("${REDIS_CLI[@]}" INFO memory 2> /dev/null | redis_info_value used_memory_human || true)
  echo "Used memory: ${USED_MEMORY:-unknown}"
  check_success "Redis is healthy"
else
  check_failure "Redis is not responding"
fi
# 5. Disk Space Check
# Thresholds as implemented: below 70 is healthy, 70-84 is a warning,
# 85 and above is a failure.
print_header "5. Disk Space"

#######################################
# Pull the capacity percentage out of `df -P` output.
# Inputs:  `df -P` output for a single filesystem on stdin
# Outputs: the fifth column of the data row without its trailing "%";
#          nothing if there is no data row
#######################################
df_capacity_percent() {
  awk 'NR == 2 { sub(/%$/, "", $5); print $5 }'
}

# -P selects the POSIX output format, which keeps each filesystem on one
# line; `|| true` stops a df failure from aborting the script under `set -e`.
DISK_USAGE=$(df -P / 2> /dev/null | df_capacity_percent || true)

if [[ "$DISK_USAGE" =~ ^[0-9]+$ ]]; then
  echo "Disk usage: ${DISK_USAGE}%"
  if [ "$DISK_USAGE" -lt 70 ]; then
    check_success "Disk space is healthy"
  elif [ "$DISK_USAGE" -lt 85 ]; then
    check_warning "Disk space is >70% utilized"
  else
    check_failure "Disk space is critically low (>85%)"
  fi
else
  # An unreadable value is reported as such instead of being misclassified
  # as a critically full disk.
  check_warning "Unable to determine disk usage"
fi
# 6. CPU & Memory Check
print_header "6. System Resources"

#######################################
# Extract the first CPU percentage (user time) from top's summary line.
# Handles "%Cpu(s):  1.3 us, ...", the glued form "%Cpu(s):100.0 us, ..."
# and the older "Cpu(s):  1.3%us, ..." layout.
# Inputs:  `top -bn1` output on stdin
# Outputs: the number with "." as decimal separator; nothing if no line
#          containing "Cpu(s):" is present
#######################################
top_cpu_user_percent() {
  awk '
    !seen && /Cpu\(s\)/ {
      line = $0
      sub(/^.*Cpu\(s\):[ \t]*/, "", line)
      if (match(line, /^[0-9]+([.,][0-9]+)?/)) {
        value = substr(line, RSTART, RLENGTH)
        sub(/,/, ".", value)
        print value
        seen = 1
      }
    }
  '
}

if command -v free &> /dev/null; then
  # Row 2 of `free` is the memory row: used ($3) as a share of total ($2).
  # The $2 > 0 guard avoids a division by zero inside awk.
  MEMORY_USAGE=$(free 2> /dev/null | awk 'NR == 2 && $2 > 0 { printf "%.0f", $3 * 100 / $2 }' || true)
  if [[ "$MEMORY_USAGE" =~ ^[0-9]+$ ]]; then
    echo "Memory usage: ${MEMORY_USAGE}%"
    if [ "$MEMORY_USAGE" -lt 80 ]; then
      check_success "Memory usage is healthy"
    else
      check_warning "Memory usage is high (>80%)"
    fi
  else
    check_warning "Unable to determine memory usage"
  fi
fi

if command -v top &> /dev/null; then
  # LC_ALL=C keeps the decimal separator predictable; `|| true` prevents an
  # unsupported top variant from aborting the script under `set -e`.
  CPU_USAGE=$(LC_ALL=C top -bn1 2> /dev/null | top_cpu_user_percent || true)
  if [[ "$CPU_USAGE" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    echo "CPU usage: ${CPU_USAGE}%"
    # Note: This is a simple check, may need adjustment based on system.
    # The value is passed with -v rather than spliced into the awk program.
    if awk -v cpu="$CPU_USAGE" 'BEGIN { exit !(cpu < 80) }'; then
      check_success "CPU usage is healthy"
    else
      check_warning "CPU usage is high (>80%)"
    fi
  else
    check_warning "Unable to determine CPU usage"
  fi
fi
# 7. Docker Containers Check (if applicable)
print_header "7. Docker Containers"
if command -v docker &> /dev/null; then
  # `docker ps` exits non-zero when the daemon cannot be reached. Capturing
  # it inside the `if` condition keeps that failure from aborting the script
  # (set -e / pipefail) before the summary is printed.
  if ERP_CONTAINERS=$(docker ps --filter "name=erp" --format "{{.Names}}" 2> /dev/null); then
    # Count non-empty lines. grep -c prints 0 and exits 1 when nothing
    # matches, so only its exit status is neutralised.
    RUNNING_CONTAINERS=$(grep -c . <<< "$ERP_CONTAINERS" || true)
    echo "Running ERP containers: $RUNNING_CONTAINERS"
    if [ "$RUNNING_CONTAINERS" -gt 0 ]; then
      check_success "Docker containers are running"
      # List container status (informational only; a failure here is ignored)
      docker ps --filter "name=erp" --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}" || true
    else
      check_warning "No ERP containers are running"
    fi
  else
    check_warning "Docker is installed but the daemon is not reachable"
  fi
else
  echo "Docker is not installed or not in PATH"
fi
# 8. Log Errors Check
# Counts every line containing "ERROR" in the backend log file. No time
# window is applied: the whole file is scanned.
print_header "8. Recent Errors in Logs"

# Log location; overridable through the environment, defaulting to the
# original fixed path.
BACKEND_LOG_FILE="${BACKEND_LOG_FILE:-/var/log/erp-generic/backend.log}"

#######################################
# Count the lines containing "ERROR" in a file.
# Arguments: $1 - path to the log file
# Outputs:   a single integer on stdout; 0 if the file cannot be read
#######################################
count_log_errors() {
  local log_file=$1
  local count
  # grep -c prints "0" AND exits 1 when nothing matches. Only the exit
  # status is neutralised here; appending a fallback with `|| echo 0` inside
  # the substitution would yield the two-line value "0<newline>0".
  count=$(grep -c -- "ERROR" "$log_file" 2> /dev/null) || true
  printf '%s\n' "${count:-0}"
}

if [ -f "$BACKEND_LOG_FILE" ]; then
  ERROR_COUNT=$(count_log_errors "$BACKEND_LOG_FILE")
  echo "Errors in last log file: $ERROR_COUNT"
  if [ "$ERROR_COUNT" -eq 0 ]; then
    check_success "No errors in recent logs"
  elif [ "$ERROR_COUNT" -lt 10 ]; then
    check_warning "Found $ERROR_COUNT errors in logs"
  else
    check_failure "Found $ERROR_COUNT errors in logs (investigate!)"
  fi
else
  echo "Log file not found: $BACKEND_LOG_FILE"
fi
# Summary
# Prints the overall verdict derived from EXIT_CODE and ends the script with
# that same value as its exit status (0 = healthy, 1 = unhealthy).
print_header "Health Check Summary"
if ! [ "$EXIT_CODE" -eq 0 ]; then
  printf '%b\n' "${RED}✗ Some systems are unhealthy (see above)${NC}"
else
  printf '%b\n' "${GREEN}✓ All systems are healthy${NC}"
fi
printf '\n'
printf 'Completed at: %s\n' "$(date)"
printf '%s\n' "===== End of Health Check ====="

# Propagate the aggregated result to the caller.
exit "$EXIT_CODE"