#!/bin/bash
# Health check for all services — sends alert email on failure
# Runs on prod (91.98.164.18) via cron

# File that log() appends timestamped entries to.
LOG="/var/log/health-check.log"
# Recipient of the alert email sent when at least one check fails.
ALERT_EMAIL="katlun@gmail.com"
# Accumulates one entry per failed check. Each entry ends with a literal
# backslash-n ("\n") that is only expanded into a newline when the alert
# email body is rendered at the end of the script.
FAILURES=""

# Append one timestamped line to the health-check log.
# Globals:   LOG (read) - path of the log file
# Arguments: $1 - message text
log() {
    local stamp
    stamp=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$stamp" "$1" >> "$LOG"
}

# Probe an HTTP endpoint and record a failure when the status code returned
# by curl differs from the expected one.
# Globals:   FAILURES (appended to)
# Arguments: $1 - short name used in the alert text
#            $2 - URL to request
#            $3 - expected HTTP status code
check_http() {
    local name=$1 url=$2 expected=$3
    # Keep the status code local so it cannot leak into other checks.
    local code
    # --connect-timeout only bounds the TCP connect; --max-time bounds the
    # whole transfer so a server that accepts the connection but never
    # answers cannot stall the cron run.
    code=$(curl -s -o /dev/null -w "%{http_code}" \
        --connect-timeout 10 --max-time 30 "$url" 2>/dev/null)
    if [ "$code" != "$expected" ]; then
        FAILURES="${FAILURES}FAIL: ${name} — expected ${expected}, got ${code}\n"
        log "FAIL: ${name} (${url}) — expected ${expected}, got ${code}"
    fi
}

# Record a failure for a systemd unit that `systemctl is-active` does not
# report as active.
# Globals:   FAILURES (appended to)
# Arguments: $1 - unit name
check_service() {
    local unit=$1
    local msg="FAIL: service ${unit} is not running"

    # Active unit: nothing to report.
    if systemctl is-active --quiet "$unit" 2>/dev/null; then
        return 0
    fi

    FAILURES+="${msg}\n"
    log "$msg"
}

# Record a failure for a Docker container whose `.State.Running` field, as
# printed by `docker inspect`, does not contain "true".
# Globals:   FAILURES (appended to)
# Arguments: $1 - container name
check_container() {
    local ctr=$1
    local running msg

    running=$(docker inspect --format='{{.State.Running}}' "$ctr" 2>/dev/null)
    case "$running" in
        *true*) return 0 ;;   # container is up, nothing to report
    esac

    msg="FAIL: container ${ctr} is not running"
    FAILURES+="${msg}\n"
    log "$msg"
}

# Warn when a filesystem's usage reaches the given percentage, and fail
# loudly when the usage cannot be determined at all (e.g. the path does not
# exist), instead of silently treating that as healthy.
# Globals:   FAILURES (appended to)
# Arguments: $1 - path on the filesystem to check
#            $2 - usage threshold in percent (integer)
check_disk() {
    local mount=$1 threshold=$2
    local usage
    # -P forces the POSIX single-line-per-filesystem layout, so row 2 is
    # always the data row and column 5 the capacity percentage.
    usage=$(df -P "$mount" 2>/dev/null | awk 'NR == 2 { sub(/%/, "", $5); print $5 }')

    # An empty or non-numeric value means df failed or printed something
    # unexpected; report it rather than letting the numeric test error out.
    if ! [[ "$usage" =~ ^[0-9]+$ ]]; then
        FAILURES="${FAILURES}FAIL: disk ${mount} usage could not be determined\n"
        log "FAIL: disk ${mount} usage could not be determined"
        return
    fi

    if [ "$usage" -ge "$threshold" ]; then
        FAILURES="${FAILURES}WARN: disk ${mount} at ${usage}% (threshold ${threshold}%)\n"
        log "WARN: disk ${mount} at ${usage}%"
    fi
}

# Warn when the "available" memory reported by `free -m` (column 7 of the
# "Mem:" row) drops below a minimum, and fail loudly when that value cannot
# be parsed instead of silently treating it as healthy.
# Globals:   FAILURES (appended to)
# Arguments: $1 - optional minimum available memory in MB (default 300)
check_ram() {
    local min_mb=${1:-300}
    local free_mb
    # LC_ALL=C keeps the row label as "Mem:" regardless of the system locale.
    free_mb=$(LC_ALL=C free -m 2>/dev/null | awk '/^Mem:/ { print $7 }')

    # Empty or non-numeric: free is missing or its layout lacks column 7.
    if ! [[ "$free_mb" =~ ^[0-9]+$ ]]; then
        FAILURES="${FAILURES}FAIL: RAM availability could not be determined\n"
        log "FAIL: RAM availability could not be determined"
        return
    fi

    if [ "$free_mb" -lt "$min_mb" ]; then
        FAILURES="${FAILURES}WARN: RAM available only ${free_mb}MB\n"
        log "WARN: RAM available ${free_mb}MB"
    fi
}

# --- Services ---
# systemd units that must be active.
for unit in accounting finance nginx sqf oryon-counter; do
    check_service "$unit"
done

# --- Containers ---
# Docker containers that must be running.
for ctr in \
    accounting-accounting-db-1 \
    mailcowdockerized-postfix-mailcow-1 \
    mailcowdockerized-dovecot-mailcow-1; do
    check_container "$ctr"
done

# --- HTTP endpoints ---
# One "name|url|expected status" entry per endpoint.
http_checks=(
    "accounting|http://localhost:8000/|302"
    "finance|http://localhost:8001/|200"
    "sqf|http://localhost:8002/|302"
)
for entry in "${http_checks[@]}"; do
    IFS='|' read -r label endpoint status <<<"$entry"
    check_http "$label" "$endpoint" "$status"
done

# --- Disk ---
# Every listed mount shares the same usage threshold (percent).
for mount_point in "/" "/mnt/HC_Volume_105169608"; do
    check_disk "$mount_point" 85
done

# --- RAM ---
check_ram

# --- SSL certs (warn if < 14 days) ---

# Warn when the certificate served on localhost:443 for a domain (selected
# via SNI) expires in fewer than the given number of days. A certificate
# that cannot be fetched, or whose expiry date cannot be parsed, is reported
# as a failure instead of being skipped silently.
# Globals:   FAILURES (appended to)
# Arguments: $1 - domain name sent as SNI
#            $2 - optional minimum remaining days (default 14)
check_ssl_cert() {
    local domain=$1 min_days=${2:-14}
    local expiry expiry_epoch now_epoch days_left

    # stdin from /dev/null makes s_client exit right after the handshake;
    # -enddate prints "notAfter=<date>", of which only the date is kept.
    expiry=$(openssl s_client -connect localhost:443 -servername "$domain" </dev/null 2>/dev/null \
        | openssl x509 -noout -enddate 2>/dev/null \
        | cut -d= -f2)
    if [ -z "$expiry" ]; then
        FAILURES="${FAILURES}FAIL: SSL cert ${domain} could not be read\n"
        log "FAIL: SSL cert ${domain} could not be read"
        return
    fi

    # Without this guard an unparseable date leaves the epoch empty and the
    # arithmetic below would report a bogus, hugely negative day count.
    if ! expiry_epoch=$(date -d "$expiry" +%s 2>/dev/null) || [ -z "$expiry_epoch" ]; then
        FAILURES="${FAILURES}FAIL: SSL cert ${domain} has an unparseable expiry date\n"
        log "FAIL: SSL cert ${domain} has an unparseable expiry date (${expiry})"
        return
    fi

    now_epoch=$(date +%s)
    days_left=$(( (expiry_epoch - now_epoch) / 86400 ))
    if [ "$days_left" -lt "$min_days" ]; then
        FAILURES="${FAILURES}WARN: SSL cert ${domain} expires in ${days_left} days\n"
        log "WARN: SSL cert ${domain} expires in ${days_left} days"
    fi
}

for domain in accounting.windyviews.com finance.windyviews.com windyviews.com sqf.windyviews.com; do
    check_ssl_cert "$domain"
done

# --- Send alert if any failures ---

# Print the complete alert email (headers, blank line, body) on stdout.
# FAILURES is rendered with %b so its literal "\n" separators become
# newlines while any "%" in the text (e.g. "disk / at 90% (threshold 85%)")
# is printed verbatim instead of being parsed as a printf format directive.
# Globals: ALERT_EMAIL (read), FAILURES (read)
build_alert_message() {
    printf 'Subject: [ALERT] windyviews.com health check failed\n'
    printf 'From: noreply@windyviews.com\n'
    printf 'To: %s\n' "$ALERT_EMAIL"
    printf 'Content-Type: text/plain; charset=utf-8\n\n'
    printf '%b\n' "$FAILURES"
    printf 'Timestamp: %s\nServer: %s\n' "$(date)" "$(hostname)"
}

# Deliver the alert through sendmail inside the postfix container.
# The message is piped over stdin (docker exec -i) rather than spliced into
# an `sh -c` command string, so quotes or "%" in the failure text cannot
# break or alter the command executed in the container.
# Globals: ALERT_EMAIL (read)
# Returns: 0 when sendmail accepted the message, 1 otherwise
send_alert() {
    log "Sending alert email"
    if ! build_alert_message \
        | docker exec -i mailcowdockerized-postfix-mailcow-1 \
            /usr/sbin/sendmail -f noreply@windyviews.com "$ALERT_EMAIL"; then
        # The mail container may itself be one of the failed checks; leave a
        # trace in the log so the lost alert is at least visible there.
        log "FAIL: could not send alert email"
        return 1
    fi
}

if [ -n "$FAILURES" ]; then
    send_alert
else
    log "OK: all checks passed"
fi
