Commit 7560ceb6 authored by Vitaly Lipatov

route-health.sh: add default route management based on gateway health

When a group has set-default in its options file, route-health.sh now reads the group's gateways in priority order and picks the first healthy one as the default route. Also refactored health checking into a get_health() helper and added an rt_tables lookup fallback for table resolution.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
parent 9db9337d
#!/bin/sh
# Health-check failover for route groups via Telegraf/InfluxDB
# Monitors gateway health and flushes dead routes when live alternatives exist
# Manages default route for groups with set-default option
#
# Usage: route-health.sh [--show]
# Run from the directory that holds the real script file (symlinks resolved),
# so every relative path below is relative to that directory; abort if cd fails.
cd "$(dirname "$(realpath "$0")")" || exit
# Pull in shared helpers from the sibling "functions" file
# (presumably where log and resolve_gw come from — confirm against that file).
. ./functions
# Per-group configuration directories; the second one is driven with "ip -6".
ROUTES_DIR=routes.d
ROUTES6_DIR=routes6.d
# Root under which per-group state directories are created.
STATE_DIR=.state
......@@ -52,6 +55,32 @@ if [ -n "$SHOW" ] ; then
echo
fi
# Get health status for a monitor tag
# Usage: get_health TAG
# Globals: HEALTH_DATA (read) - file with lines of the form "TAG LOSS"
# Outputs: exactly one word on stdout: healthy, degraded or dead
#   dead     - no data for TAG (or no readable data file), or loss >= 100
#   healthy  - loss < 50
#   degraded - anything in between
get_health()
{
    local tag="$1"

    # No tag or no readable data file means we know nothing: report dead.
    # (Also keeps awk from falling back to stdin on an empty file name.)
    if [ -z "$tag" ] || [ ! -r "$HEALTH_DATA" ] ; then
        echo "dead"
        return
    fi

    # Compare the first field literally (the tag is not a regex) and stop at
    # the first matching line, so duplicate entries cannot yield two answers.
    awk -v tag="$tag" '
        $1 == tag {
            found = 1
            if ($2 == "" || $2 >= 100) print "dead"
            else if ($2 < 50) print "healthy"
            else print "degraded"
            exit
        }
        END { if (!found) print "dead" }
    ' "$HEALTH_DATA"
}
# Check if option is set in group's options file
# Usage: has_option GROUP_DIR OPTION
# Returns: 0 if some line of GROUP_DIR/options is exactly OPTION,
#          non-zero otherwise (including when the file does not exist)
has_option()
{
    [ -f "$1/options" ] || return 1
    # -x: whole-line match, -F: OPTION is a literal string, not a regex;
    # -e keeps an OPTION starting with "-" from being parsed as a grep flag.
    grep -qxF -e "$2" "$1/options" 2>/dev/null
}
# --- Evaluate health for all monitored groups ---
healthy_count=0
dead_count=0
......@@ -68,20 +97,12 @@ for routes_dir in "$ROUTES_DIR" "$ROUTES6_DIR" ; do
tag=$(grep -v '^#' "$gwdir/monitor" | grep -m1 .)
[ -n "$tag" ] || continue
status=$(get_health "$tag")
loss=$(grep "^${tag} " "$HEALTH_DATA" | awk '{print $2}')
state_path="$routes_dir/$name"
loss_display="${loss:-no_data}%"
[ -z "$loss" ] && loss_display="no_data"
if [ -z "$loss" ] ; then
status="dead"
loss_display="no_data"
else
status=$(echo "$loss" | awk '{
if ($1 >= 100) print "dead"
else if ($1 < 50) print "healthy"
else print "degraded"
}')
loss_display="${loss}%"
fi
state_path="$routes_dir/$name"
[ -n "$SHOW" ] && log "[$name] $tag: loss=$loss_display -> $status"
......@@ -103,6 +124,74 @@ done
# Nothing monitored — exit
[ "$healthy_count" -eq 0 ] && [ "$dead_count" -eq 0 ] && exit 0
# --- Manage default route for groups with set-default ---
# For every group whose options file contains "set-default", walk the group's
# gateway file from top to bottom (priority order) and make the first usable
# gateway the default route of that address family.
# A gateway is usable when get_health reports it "healthy", or when no group
# with a monitor file resolves to the same address (treated as unmonitored).
for routes_dir in "$ROUTES_DIR" "$ROUTES6_DIR" ; do
    [ -d "$routes_dir" ] || continue

    # Expanded unquoted below on purpose so "ip -6" splits into command + flag
    ipcmd="ip"
    [ "$routes_dir" = "$ROUTES6_DIR" ] && ipcmd="ip -6"

    for gwdir in "$routes_dir"/*/ ; do
        [ -d "$gwdir" ] || continue
        has_option "$gwdir" "set-default" || continue

        name=$(basename "$gwdir")
        best_gw=""

        # Read gateways in priority order, pick first healthy.
        # The "|| [ -n ... ]" part keeps a last line without trailing newline,
        # matching what the grep-based readers of the same file accept.
        while IFS= read -r line || [ -n "$line" ] ; do
            # Skip blank lines and comments
            case "$line" in
                ''|'#'*) continue ;;
            esac

            gw_ip=$(resolve_gw "$line" "$ipcmd")
            [ -z "$gw_ip" ] && continue

            # Find monitor tag for this gateway IP:
            # look through all monitored groups for one whose first gateway
            # resolves to the same address
            gw_tag=""
            for check_dir in "$routes_dir"/*/ ; do
                [ -d "$check_dir" ] || continue
                [ -f "$check_dir/monitor" ] || continue
                check_gw=$(grep -v '^#' "$check_dir/gateway" 2>/dev/null | grep -m1 .)
                check_ip=$(resolve_gw "$check_gw" "$ipcmd" 2>/dev/null)
                if [ "$check_ip" = "$gw_ip" ] ; then
                    gw_tag=$(grep -v '^#' "$check_dir/monitor" | grep -m1 .)
                    break
                fi
            done

            if [ -z "$gw_tag" ] ; then
                # No monitor for this gateway — assume healthy (first unmonitored wins)
                [ -n "$SHOW" ] && log "[$name] $gw_ip: no monitor, assuming healthy"
                best_gw="$gw_ip"
                break
            fi

            status=$(get_health "$gw_tag")
            [ -n "$SHOW" ] && log "[$name] $gw_ip ($gw_tag): $status"
            if [ "$status" = "healthy" ] ; then
                best_gw="$gw_ip"
                break
            fi
        done < "$gwdir/gateway"

        if [ -z "$best_gw" ] ; then
            log "[$name] No healthy gateway found, keeping current default"
            continue
        fi

        # Third field of the first "default ..." line, i.e. the address in
        # "default via ADDR ..."
        cur_default=$($ipcmd route show default | awk '/default/ {print $3; exit}')
        if [ "$cur_default" != "$best_gw" ] ; then
            log "[$name] Switching default: $cur_default -> $best_gw"
            # --show is a dry run: report the switch but do not touch routing
            if [ -z "$SHOW" ] ; then
                # Report a failed switch instead of silently leaving the old route
                $ipcmd route replace default via "$best_gw" \
                    || log "[$name] Failed to switch default to $best_gw"
            fi
        else
            [ -n "$SHOW" ] && log "[$name] Default already via $best_gw"
        fi
    done
done
# --- Process dead groups ---
for state_path in $dead_groups ; do
name=$(basename "$state_path")
......@@ -110,9 +199,13 @@ for state_path in $dead_groups ; do
mkdir -p "$STATE_DIR/$state_path"
# Read table number from group config
# Read table number from group config or rt_tables
table=""
[ -f "$state_path/table" ] && table=$(grep -v '^#' "$state_path/table" | grep -m1 .)
if [ -f "$state_path/table" ] ; then
table=$(grep -v '^#' "$state_path/table" | grep -m1 .)
else
table=$(awk -v n="$name" '$2 == n { print $1; exit }' /etc/iproute2/rt_tables 2>/dev/null)
fi
# Flapping protection: require DOWN_THRESHOLD consecutive checks
down_count=0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment