router: add route-health.sh for gateway failover via Telegraf/InfluxDB

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

router: add route-health.sh for gateway failover via Telegraf/InfluxDB
2e9b341e · Vitaly Lipatov · 1efc636a · 2e9b341e
Commit 2e9b341e authored Feb 18, 2026 by Vitaly Lipatov
Hide whitespace changes
Inline Side-by-side

Showing with 178 additions and 0 deletions

route-health.sh router/route-health.sh +178 -0

No files found.
--- a/router/route-health.sh
+++ b/router/route-health.sh
+#!/bin/sh
+# Health-check failover for route groups via Telegraf/InfluxDB
+# Monitors gateway health and flushes dead routes when live alternatives exist
+#
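+# Usage: route-health.sh [--show]
+#   --show  print the InfluxDB data and per-group status; do not write state
+#           files, flush routing tables or run route-update.sh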
+# Run from the script's own directory so the relative paths below resolve
+# regardless of the caller's working directory.
+cd "$(dirname "$(realpath "$0")")" || exit
+# Per-group configuration directories; groups under ROUTES6_DIR are handled
+# with "ip -6", all others with plain "ip".
+ROUTES_DIR=routes.d
+ROUTES6_DIR=routes6.d
+# Root of the persistent per-group state (down_count, health, ...).
+STATE_DIR=.state
+# InfluxDB HTTP query endpoint and the database holding the ping measurements.
+INFLUXDB_URL="http://10.20.30.130:8086/query"
+INFLUXDB_DB="gateways"
+# A dead group is flushed only once its down counter reaches this value.
+DOWN_THRESHOLD=2
+# Non-empty when --show was given: report only, no state-file writes,
+# no table flushes, no route reload.
+SHOW=
+[ "$1" = "--show" ] && SHOW=1
+# log MESSAGE... - print the arguments to stdout prefixed with HH:MM:SS.
+# printf is used instead of echo so that a backslash in a message is never
+# interpreted as an escape sequence by shells whose echo does that.
+log() { printf '%s %s\n' "$(date '+%H:%M:%S')" "$*" ; }
+# --- Query InfluxDB for packet loss ---
+# Temp file receiving one "gateway packet_loss" line per gateway; removed on exit.
+HEALTH_DATA=$(mktemp) || { log "ERROR: Failed to create temporary file" ; exit 1 ; }
+trap 'rm -f "$HEALTH_DATA"' EXIT
+# Latest percent_packet_loss per gateway tag within the last 3 minutes.
+# -s: silent, -G: send the urlencoded data as a GET query string; the request
+# is abandoned after 5 seconds and curl's own error output is discarded.
+response=$(curl -sG "$INFLUXDB_URL" \
+    --data-urlencode "db=$INFLUXDB_DB" \
+    --data-urlencode "q=SELECT last(percent_packet_loss) FROM ping WHERE time > now() - 3m GROUP BY gateway" \
+    --max-time 5 2>/dev/null)
+if [ -z "$response" ] ; then
+    log "ERROR: Failed to query InfluxDB"
+    exit 1
+fi
+# Parse JSON → "gateway packet_loss" lines
+# The response is fed with printf, not echo: some sh implementations expand
+# backslash escapes in echo arguments, which would corrupt the JSON.
+if command -v jq >/dev/null 2>&1 ; then
+    printf '%s\n' "$response" | jq -r '.results[0].series[]? | "\(.tags.gateway) \(.values[0][1])"' > "$HEALTH_DATA"
+else
+    # Fallback without jq: put every series on its own line by breaking after
+    # the "]]}," that closes a series' values array, then pull out the tag and
+    # the value. Breaking at every "}," would cut right after the tags object
+    # and leave the gateway name and its value on different lines, so the
+    # extraction below could never match.
+    # NOTE(review): "\n" in a sed replacement is not guaranteed by POSIX;
+    # GNU sed supports it.
+    printf '%s\n' "$response" | sed 's/]]},/]]}\n/g' | \
+        sed -n 's/.*"gateway":"\([^"]*\)".*\[\["[^"]*",\([0-9.]*\)\].*/\1 \2/p' > "$HEALTH_DATA"
+fi
+# An empty result file means the response carried no usable series.
+if [ ! -s "$HEALTH_DATA" ] ; then
+    log "ERROR: No health data from InfluxDB"
+    exit 1
+fi
+if [ -n "$SHOW" ] ; then
+    log "InfluxDB data (last 3m):"
+    sed 's/^/  /' "$HEALTH_DATA"
+    echo
+fi
+# --- Evaluate health for all monitored groups ---
+# A group is a subdirectory of $ROUTES_DIR or $ROUTES6_DIR. It is monitored when
+# it contains a "monitor" file whose first non-comment, non-empty line is the
+# gateway tag to look up in $HEALTH_DATA. Classification by packet loss:
+#   >= 100 -> dead, < 50 -> healthy, anything else -> degraded.
+# A tag with no data in the query window is treated as dead. Degraded groups
+# are added to neither list, so they are neither flushed nor recovered.
+healthy_count=0
+dead_count=0
+# Space-separated lists of "<routes_dir>/<group>" paths (iterated unquoted
+# later, so group names must not contain whitespace).
+dead_groups=""
+healthy_groups=""
+for routes_dir in "$ROUTES_DIR" "$ROUTES6_DIR" ; do
+    [ -d "$routes_dir" ] || continue
+    for gwdir in "$routes_dir"/*/ ; do
+        # Also skips the literal pattern when the glob matches nothing.
+        [ -d "$gwdir" ] || continue
+        name=$(basename "$gwdir")
+        [ -f "$gwdir/monitor" ] || continue
+        tag=$(grep -v '^#' "$gwdir/monitor" | grep -m1 .)
+        [ -n "$tag" ] || continue
+        # Exact, literal match on the gateway column; the first match wins.
+        # A regex match would let "." in a tag match any character and could
+        # return several gateways' values at once, producing a multi-line
+        # status that matches no case arm below. Appending "" forces a string
+        # (not numeric) comparison in awk.
+        loss=$(awk -v tag="$tag" '($1 "") == tag { print $2 ; exit }' "$HEALTH_DATA")
+        state_path="$routes_dir/$name"
+        if [ -z "$loss" ] ; then
+            status="dead"
+            loss_display="no_data"
+        else
+            status=$(echo "$loss" | awk '{
+                if ($1 >= 100) print "dead"
+                else if ($1 < 50) print "healthy"
+                else print "degraded"
+            }')
+            loss_display="${loss}%"
+        fi
+        [ -n "$SHOW" ] && log "[$name] $tag: loss=$loss_display -> $status"
+        case "$status" in
+            healthy)
+                healthy_count=$((healthy_count + 1))
+                healthy_groups="${healthy_groups:+$healthy_groups }$state_path"
+                ;;
+            dead)
+                dead_count=$((dead_count + 1))
+                dead_groups="${dead_groups:+$dead_groups }$state_path"
+                ;;
+        esac
+    done
+done
+[ -n "$SHOW" ] && log "Summary: healthy=$healthy_count dead=$dead_count"
+# Nothing monitored — exit
+[ "$healthy_count" -eq 0 ] && [ "$dead_count" -eq 0 ] && exit 0
+# --- Process dead groups ---
+# For every dead group: bump its persisted down counter and, once the counter
+# reaches DOWN_THRESHOLD and at least one group is healthy, flush the group's
+# routing table and record health=down. With --show only the log lines are
+# produced. $dead_groups is expanded unquoted on purpose (space-separated list).
+for state_path in $dead_groups ; do
+    name=$(basename "$state_path")
+    routes_dir=$(dirname "$state_path")
+    group_state="$STATE_DIR/$state_path"
+    # Report-only mode must not touch the state tree.
+    [ -z "$SHOW" ] && mkdir -p "$group_state"
+    # Read table number from group config
+    table=""
+    [ -f "$state_path/table" ] && table=$(grep -v '^#' "$state_path/table" | grep -m1 .)
+    # Flapping protection: require DOWN_THRESHOLD dead checks; the counter is
+    # cleared again when the group is seen healthy.
+    down_count=0
+    [ -f "$group_state/down_count" ] && read -r down_count < "$group_state/down_count"
+    # An empty or corrupted counter file must not break the arithmetic below.
+    case "$down_count" in
+        ''|*[!0-9]*) down_count=0 ;;
+    esac
+    down_count=$((down_count + 1))
+    if [ "$down_count" -lt "$DOWN_THRESHOLD" ] ; then
+        log "[$name] Down ($down_count/$DOWN_THRESHOLD), waiting"
+        [ -z "$SHOW" ] && echo "$down_count" > "$group_state/down_count"
+        continue
+    fi
+    # Safety: don't flush if no healthy alternatives (last resort)
+    if [ "$healthy_count" -eq 0 ] ; then
+        log "[$name] Down but no healthy alternatives, keeping routes"
+        [ -z "$SHOW" ] && echo "$down_count" > "$group_state/down_count"
+        continue
+    fi
+    # Already flushed?
+    if [ -f "$group_state/health" ] && [ "$(cat "$group_state/health")" = "down" ] ; then
+        [ -n "$SHOW" ] && log "[$name] Already down"
+        continue
+    fi
+    # Without a configured table there is nothing to flush; say so instead of
+    # announcing a flush that never happens.
+    if [ -z "$table" ] ; then
+        log "[$name] Dead (${down_count}x confirmed), but no table configured, nothing to flush"
+        continue
+    fi
+    # Flush the routing table
+    # $ipcmd is expanded unquoted on purpose so "ip -6" splits into two words.
+    ipcmd="ip"
+    [ "$routes_dir" = "$ROUTES6_DIR" ] && ipcmd="ip -6"
+    log "[$name] Dead (${down_count}x confirmed), flushing table $table"
+    if [ -z "$SHOW" ] ; then
+        # Flush errors are discarded; the group is marked down regardless.
+        $ipcmd route flush table "$table" 2>/dev/null
+        echo "down" > "$group_state/health"
+        echo "$down_count" > "$group_state/down_count"
+    fi
+done
+# --- Recover healthy groups that were previously down ---
+# Every healthy group gets its down counter cleared. A group whose recorded
+# health is "down" is marked "up" and loses its hash/resolved state files,
+# after which route-update.sh is run once for all recovered groups.
+need_reload=
+for state_path in $healthy_groups ; do
+    name=$(basename "$state_path")
+    if [ -z "$SHOW" ] ; then
+        # Healthy again: flap counting starts over
+        rm -f "$STATE_DIR/$state_path/down_count"
+    fi
+    # Only groups recorded as "down" need recovery
+    if [ ! -f "$STATE_DIR/$state_path/health" ] ; then
+        continue
+    fi
+    if [ "$(cat "$STATE_DIR/$state_path/health")" != "down" ] ; then
+        continue
+    fi
+    log "[$name] Recovered, scheduling route reload"
+    # Report-only mode stops after the log line
+    [ -n "$SHOW" ] && continue
+    echo "up" > "$STATE_DIR/$state_path/health"
+    # Drop the route-update state files so the group is reloaded on the next
+    # route-update.sh run
+    rm -f "$STATE_DIR/$state_path/hash" "$STATE_DIR/$state_path/resolved"
+    need_reload=1
+done
+# Reload routes for recovered groups
+if [ -n "$need_reload" ] ; then
+    log "Running route-update.sh for recovered groups"
+    [ -z "$SHOW" ] && ./route-update.sh
+fi