Commit 2e9b341e authored by Vitaly Lipatov

router: add route-health.sh for gateway failover via Telegraf/InfluxDB

parent 1efc636a
#!/bin/sh
# route-health.sh - gateway failover for route groups, driven by the ping
# metrics that Telegraf stores in InfluxDB.
#
# Each monitored group is classified from its packet loss; a group confirmed
# dead has its routing table flushed, provided another group is healthy.
#
# Usage: route-health.sh [--show]
#   --show   report the collected data and the decisions; state files and
#            routing tables are left untouched

# All paths below are relative to the directory that holds this script
# (symlinks resolved by realpath).
script_dir=$(dirname "$(realpath "$0")")
cd "$script_dir" || exit

# Where InfluxDB is queried and which database holds the ping metrics.
INFLUXDB_DB="gateways"
INFLUXDB_URL="http://10.20.30.130:8086/query"

# Route group definitions (IPv4 / IPv6) and the per-group state directory.
ROUTES_DIR="routes.d"
ROUTES6_DIR="routes6.d"
STATE_DIR=".state"

# Number of consecutive failed checks required before a group is flushed.
DOWN_THRESHOLD=2

# SHOW is non-empty only when the first argument is --show.
SHOW=
case "$1" in
    --show) SHOW=1 ;;
esac
# log MESSAGE...
# Write MESSAGE (all arguments joined by spaces) to stdout on a single line,
# prefixed with the current time as HH:MM:SS.
# printf is used rather than echo because echo implementations differ on
# whether backslash sequences in the text are expanded; group names and
# gateway tags must be printed literally whichever shell provides /bin/sh.
log() { printf '%s %s\n' "$(date '+%H:%M:%S')" "$*" ; }
# --- Query InfluxDB for packet loss ---
# HEALTH_DATA is a temporary file of "gateway packet_loss" lines; it is
# removed again whenever the script exits.
HEALTH_DATA=$(mktemp)
trap 'rm -f "$HEALTH_DATA"' EXIT

# Latest percent_packet_loss sample per gateway tag within the last 3 minutes.
influx_query="SELECT last(percent_packet_loss) FROM ping WHERE time > now() - 3m GROUP BY gateway"
influx_reply=$(curl --silent --get --max-time 5 \
    --data-urlencode "db=$INFLUXDB_DB" \
    --data-urlencode "q=$influx_query" \
    "$INFLUXDB_URL" 2>/dev/null)

# An empty reply means curl could not obtain anything from the server.
[ -n "$influx_reply" ] || { log "ERROR: Failed to query InfluxDB" ; exit 1 ; }

# Parse JSON → "gateway packet_loss" lines
if ! command -v jq >/dev/null 2>&1 ; then
    # No jq available: put each series on its own line, then pull out the
    # gateway tag and the numeric value that follows the timestamp.
    # NOTE(review): relies on sed turning \n in the replacement into a
    # newline (GNU sed does) — confirm on the target system.
    echo "$influx_reply" | sed 's/},/}\n/g' | \
        sed -n 's/.*"gateway":"\([^"]*\)".*\[\["[^"]*",\([0-9.]*\)\].*/\1 \2/p' > "$HEALTH_DATA"
else
    echo "$influx_reply" | jq -r '.results[0].series[]? | "\(.tags.gateway) \(.values[0][1])"' > "$HEALTH_DATA"
fi

# Without a single parsed line there is nothing to base decisions on.
[ -s "$HEALTH_DATA" ] || { log "ERROR: No health data from InfluxDB" ; exit 1 ; }

# In --show mode, dump the parsed data before evaluating it.
if [ -n "$SHOW" ] ; then
    log "InfluxDB data (last 3m):"
    sed 's/^/ /' "$HEALTH_DATA"
    echo
fi
# --- Evaluate health for all monitored groups ---

# lookup_loss TAG FILE
# Print the packet-loss value recorded for gateway TAG in FILE, which holds
# "gateway packet_loss" lines. Prints nothing when TAG is absent.
# TAG is compared as a literal string (both sides are forced to strings so
# awk never falls back to numeric comparison); a tag such as 10.0.0.1 must
# not match 10x0x0x1 the way an unescaped regex would. Only the first
# matching line is used, so the result is always a single value.
lookup_loss() {
    awk -v tag="$1" '($1 "") == (tag "") { print $2; exit }' "$2"
}

# classify_loss LOSS
# Map a packet-loss percentage to a status word on stdout:
#   LOSS >= 100 -> dead, LOSS < 50 -> healthy, anything between -> degraded.
classify_loss() {
    echo "$1" | awk '{
        if ($1 >= 100) print "dead"
        else if ($1 < 50) print "healthy"
        else print "degraded"
    }'
}

# Results of the scan: counters plus space-separated lists of group paths
# ("<routes dir>/<group name>") for the dead and the healthy groups.
# Degraded groups are counted in neither list.
healthy_count=0
dead_count=0
dead_groups=""
healthy_groups=""
for routes_dir in "$ROUTES_DIR" "$ROUTES6_DIR" ; do
    [ -d "$routes_dir" ] || continue
    for gwdir in "$routes_dir"/*/ ; do
        [ -d "$gwdir" ] || continue
        name=$(basename "$gwdir")
        # Only groups with a "monitor" file take part in health checking.
        [ -f "$gwdir/monitor" ] || continue
        # The gateway tag is the first non-comment, non-empty line.
        tag=$(grep -v '^#' "$gwdir/monitor" | grep -m1 .)
        [ -n "$tag" ] || continue
        loss=$(lookup_loss "$tag" "$HEALTH_DATA")
        state_path="$routes_dir/$name"
        if [ -z "$loss" ] ; then
            # No sample for this tag in the queried window counts as dead.
            status="dead"
            loss_display="no_data"
        else
            status=$(classify_loss "$loss")
            loss_display="${loss}%"
        fi
        [ -n "$SHOW" ] && log "[$name] $tag: loss=$loss_display -> $status"
        case "$status" in
            healthy)
                healthy_count=$((healthy_count + 1))
                healthy_groups="${healthy_groups:+$healthy_groups }$state_path"
                ;;
            dead)
                dead_count=$((dead_count + 1))
                dead_groups="${dead_groups:+$dead_groups }$state_path"
                ;;
        esac
    done
done
[ -n "$SHOW" ] && log "Summary: healthy=$healthy_count dead=$dead_count"
# Nothing monitored — exit
[ "$healthy_count" -eq 0 ] && [ "$dead_count" -eq 0 ] && exit 0

# --- Process dead groups ---

# read_down_count FILE
# Print the failed-check counter stored in FILE. Prints 0 when FILE is
# missing, empty, or holds anything other than a plain decimal number without
# leading zeros, so a damaged state file can never break the arithmetic below.
read_down_count() {
    _rdc_value=
    [ -f "$1" ] && read -r _rdc_value < "$1"
    case "$_rdc_value" in
        ''|*[!0-9]*|0?*) echo 0 ;;
        *) echo "$_rdc_value" ;;
    esac
}

# $dead_groups is a space-separated list, so it is expanded unquoted on purpose.
for state_path in $dead_groups ; do
    name=$(basename "$state_path")
    routes_dir=$(dirname "$state_path")
    group_state="$STATE_DIR/$state_path"
    # --show must not modify anything, so the state directory is only
    # created on a real run.
    [ -z "$SHOW" ] && mkdir -p "$group_state"

    # Read table number from group config (first non-comment, non-empty line)
    table=""
    [ -f "$state_path/table" ] && table=$(grep -v '^#' "$state_path/table" | grep -m1 .)

    # Flapping protection: require DOWN_THRESHOLD consecutive checks
    down_count=$(read_down_count "$group_state/down_count")
    down_count=$((down_count + 1))
    if [ "$down_count" -lt "$DOWN_THRESHOLD" ] ; then
        log "[$name] Down ($down_count/$DOWN_THRESHOLD), waiting"
        [ -z "$SHOW" ] && echo "$down_count" > "$group_state/down_count"
        continue
    fi

    # Safety: don't flush if no healthy alternatives (last resort)
    if [ "$healthy_count" -eq 0 ] ; then
        log "[$name] Down but no healthy alternatives, keeping routes"
        [ -z "$SHOW" ] && echo "$down_count" > "$group_state/down_count"
        continue
    fi

    # Already flushed?
    if [ -f "$group_state/health" ] && [ "$(cat "$group_state/health")" = "down" ] ; then
        [ -n "$SHOW" ] && log "[$name] Already down"
        continue
    fi

    # Without a table number there is nothing to flush; say so instead of
    # announcing a flush that never happens. No state is recorded.
    if [ -z "$table" ] ; then
        log "[$name] Dead (${down_count}x confirmed), but no table configured, nothing to flush"
        continue
    fi

    # Flush the routing table (groups under ROUTES6_DIR use "ip -6")
    ipcmd="ip"
    [ "$routes_dir" = "$ROUTES6_DIR" ] && ipcmd="ip -6"
    log "[$name] Dead (${down_count}x confirmed), flushing table $table"
    if [ -z "$SHOW" ] ; then
        # $ipcmd is left unquoted on purpose so "ip -6" splits into two words.
        # The flush is best effort: its errors are discarded and the group is
        # marked down regardless of the outcome.
        # shellcheck disable=SC2086
        $ipcmd route flush table "$table" 2>/dev/null
        echo "down" > "$group_state/health"
        echo "$down_count" > "$group_state/down_count"
    fi
done
# --- Recover healthy groups that were previously down ---
# need_reload becomes non-empty once at least one group has been switched
# back from "down" to "up" during this run.
need_reload=
for state_path in $healthy_groups ; do
    name=$(basename "$state_path")
    group_state="$STATE_DIR/$state_path"

    # A healthy check clears the failed-check counter (real runs only).
    if [ -z "$SHOW" ] ; then
        rm -f "$group_state/down_count"
    fi

    # Only groups whose recorded health is "down" need recovering.
    if [ ! -f "$group_state/health" ] ; then
        continue
    fi
    recorded_health=$(cat "$group_state/health")
    if [ "$recorded_health" != "down" ] ; then
        continue
    fi

    log "[$name] Recovered, scheduling route reload"
    if [ -z "$SHOW" ] ; then
        echo "up" > "$group_state/health"
        # Dropping the route-update state files makes the next
        # route-update.sh run reload this group.
        rm -f "$group_state/hash" "$group_state/resolved"
        need_reload=1
    fi
done

# Reload routes for recovered groups
if [ -n "$need_reload" ] ; then
    log "Running route-update.sh for recovered groups"
    [ -z "$SHOW" ] && ./route-update.sh
fi
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment