#!/bin/bash
# Usage:
#   e.g. bash monitor_node.sh "namada_node" "http://localhost:26657" "your-healthchecks-id" 600
# This checks whether the node behind the given RPC endpoint (port 26657 above) is synced.
# Error 3 is reported only when the block height has not advanced and the last block is
# older than 600 seconds (10 minutes). See below for the other error types.
# Make sure to create a cronjob like:
#   */10 * * * * /bin/bash /path/to/monitor_node.sh "namada_node" "http://localhost:26657" "your-healthchecks-id" 600
# (the */10 interval and the 600-second threshold do not have to match)
# Input (parameters):
#   $1 - Title for the node (e.g., namada_node)
#   $2 - Local RPC endpoint (default: http://localhost:26657)
#   $3 - Healthchecks unique ID (optional)
#   $4 - Stale block threshold in seconds (optional, default: 300 [5 minutes])
# CAUTION: Do not set the threshold too low; your server's clock may drift from the
# node's block timestamps. Omitting the fourth argument (600 above) defaults it to 300.
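# A healthy run appends a line like the following to .logs/<title>_health.log
# (illustrative values only):
#   [2024-01-01 00:10:00] Node is healthy. Latest block: 123456, Timestamp: 2024-01-01T00:09:58Z.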
TITLE=${1:-"node"}
RPC_URL=${2:-"http://localhost:26657"}
HC_ID=${3:-""}
STALE_THRESHOLD=${4:-300}
# Output (error types):
# 0 = No error
# 1 = Node down
# 2 = Catching up
# 3 = Stale blocks
ERROR=0
# Directories and files
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")
LOGS_DIR="${SCRIPT_DIR}/.logs"
LOG_FILE="${LOGS_DIR}/${TITLE}_health.log"
BLOCK_TRACK_FILE="${LOGS_DIR}/${TITLE}_block_height"
# Make sure the .logs folder exists and create the log file
mkdir -p "$LOGS_DIR"
if [ ! -f "$LOG_FILE" ]; then
    echo "=== Monitoring ${TITLE} ===" > "$LOG_FILE"
fi
# Log purging mechanism
MAX_LOG_SIZE=1048576 # 1 MB (1024 * 1024 bytes)
LOG_SIZE=$(stat -c %s "$LOG_FILE")
if [ "$LOG_SIZE" -ge "$MAX_LOG_SIZE" ]; then
mv "$LOG_FILE" "${LOG_FILE}.old"
# In case you want to retain old logs use this instead:
# mv "$LOG_FILE" "${LOG_FILE}_$(date +%Y%m%d_%H%M%S).old"
echo "=== Monitoring ${TITLE} ===" > "$LOG_FILE"
fi
# Fetch node and server data
NODE_STATUS=$(curl -s --connect-timeout 5 "$RPC_URL/status")
SERVER_TIME_UNIX=$(date +%s)
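# The RPC /status response (CometBFT/Tendermint) exposes the sync fields parsed below,
# roughly shaped like this trimmed example:
#   {"result": {"sync_info": {"latest_block_height": "123456",
#                             "catching_up": false,
#                             "latest_block_time": "2024-01-01T00:09:58.000000000Z"}}}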
if [ -z "$NODE_STATUS" ]; then
    ERROR=1
    MESSAGE="Error 1: Node is down. Unable to reach RPC."
else
    BLOCK_HEIGHT=$(echo "$NODE_STATUS" | jq -r '.result.sync_info.latest_block_height')
    CATCHING_UP=$(echo "$NODE_STATUS" | jq -r '.result.sync_info.catching_up')
    BLOCK_TIME=$(echo "$NODE_STATUS" | jq -r '.result.sync_info.latest_block_time')
    BLOCK_TIME_UNIX=$(date -d "$BLOCK_TIME" +%s 2>/dev/null || echo "0")
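    # If the timestamp cannot be parsed, BLOCK_TIME_UNIX falls back to 0, which makes
    # ELAPSED_TIME very large and lets the stale-block check below still trigger.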
    ELAPSED_TIME=$((SERVER_TIME_UNIX - BLOCK_TIME_UNIX))
    # Read the last known block height
    if [ -f "$BLOCK_TRACK_FILE" ]; then
        LAST_KNOWN_BLOCK_HEIGHT=$(cat "$BLOCK_TRACK_FILE")
    else
        LAST_KNOWN_BLOCK_HEIGHT=0
    fi
    # Update the block height tracker
    echo "$BLOCK_HEIGHT" > "$BLOCK_TRACK_FILE"
    if [ "$CATCHING_UP" == "true" ]; then
        ERROR=2
        MESSAGE="Error 2: Node is catching up. Block height: $BLOCK_HEIGHT."
    elif (( BLOCK_HEIGHT == LAST_KNOWN_BLOCK_HEIGHT && ELAPSED_TIME > STALE_THRESHOLD )); then
        ERROR=3
        MESSAGE="Error 3: No block progress in the last $ELAPSED_TIME seconds. Last block height: $BLOCK_HEIGHT at $BLOCK_TIME."
    else
        MESSAGE="Node is healthy. Latest block: $BLOCK_HEIGHT, Timestamp: $BLOCK_TIME."
    fi
fi
# Log the message to the log file
echo "[$(date -d "@$SERVER_TIME_UNIX" "+%Y-%m-%d %H:%M:%S")] $MESSAGE" >> "$LOG_FILE"
# Ping Healthchecks if HC_ID is provided
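# hc-ping.com accepts an exit-status suffix (https://hc-ping.com/<uuid>/<0-255>); any
# non-zero value marks the check as failed, so ERROR doubles as the reported status.
# The piped log lines are stored by Healthchecks as the ping body.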
if [ -n "$HC_ID" ]; then
    tail -n 5 "$LOG_FILE" | curl -fsS -m 10 --retry 5 --data-binary @- "https://hc-ping.com/$HC_ID/$ERROR"
fi
echo "$MESSAGE"