-
Notifications
You must be signed in to change notification settings - Fork 19
/
Copy pathcheck.sh
executable file
·213 lines (193 loc) · 7.69 KB
/
check.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
#!/bin/bash
# URLs for raptoreum explorers. Main and backup one.
URL=( 'https://explorer.raptoreum.com/' 'https://raptor.mopsus.com/' )
URL_ID=0
BOOTSTRAP_TAR='https://bootstrap.raptoreum.com/bootstraps/bootstrap.tar.xz'
POSE_SCORE=0
PREV_SCORE=0
LOCAL_HEIGHT=0
# Variables provided by the cron job environment.
# They should also be added to .bashrc for interactive use.
#RAPTOREUM_CLI -> Path to the raptoreum-cli
#CONFIG_DIR/HOME -> Path to "$HOME/.raptoreumcore/"
# Add your NODE_PROTX here if you forgot or provided wrong hash during node
# installation.
#NODE_PROTX=
# Fill in defaults for the variables that the cron job environment normally
# provides, so the script also works when it is started manually.
if [[ -z $RAPTOREUM_CLI ]]; then
  # command -v is a shell builtin, so the lookup does not depend on an
  # external `which` whose output on a miss varies between systems.
  RAPTOREUM_CLI=$(command -v raptoreum-cli)
fi
if [[ -z $CONFIG_DIR ]]; then
  if [[ -z $HOME ]]; then
    # No HOME in the environment: fall back to the conventional location.
    HOME="/home/$USER/"
  fi
  CONFIG_DIR="$HOME/.raptoreumcore/"
fi
# Echo the first argument back when it is an optionally signed integer or
# decimal number; echo "-1" for anything else (including a missing argument).
function GetNumber () {
  local candidate="${1}"
  local number_pattern='^[+-]?[0-9]+([.][0-9]+)?$'
  if ! [[ ${candidate} =~ ${number_pattern} ]]; then
    # Not numeric: report the sentinel value the callers test for.
    echo "-1"
    return
  fi
  echo "${candidate}"
}
# Print the numeric content of the file named by $1, or "-1" when the file is
# missing, unreadable or does not hold a single number (see GetNumber).
# Arguments: $1 - path of the state file to read.
function ReadValue () {
  # The path is quoted so names with spaces work and an empty argument makes
  # cat fail (-> "-1") instead of silently reading from stdin.
  GetNumber "$(cat -- "${1}" 2>/dev/null)"
}
# Run raptoreum-cli with the given arguments and pass its stdout through.
# Globals:   RAPTOREUM_CLI (read), READ_CLI_TIMEOUT (read, optional).
# Arguments: "$@" - forwarded verbatim to the CLI, one word per argument.
# Outputs:   whatever the CLI prints; additionally "-1" when the call had to
#            be aborted because it exceeded the time limit.
# Returns:   always 0; callers only inspect the captured output.
# The limit defaults to 61 seconds and can be overridden with
# READ_CLI_TIMEOUT (any duration accepted by timeout).
function ReadCli () {
  local -r limit="${READ_CLI_TIMEOUT:-61}"
  local status
  # RAPTOREUM_CLI is intentionally left unquoted, exactly as before, so a value
  # that carries extra options keeps working. stdin comes from /dev/null so the
  # CLI can never block on input.
  # timeout sends TERM when the limit is reached and KILL 5 seconds later if
  # the process is still alive.
  timeout -k 5 "${limit}" ${RAPTOREUM_CLI} "$@" </dev/null
  status=$?
  # 124: stopped by timeout's TERM; 137: it needed the follow-up KILL.
  if (( status == 124 || status == 137 )); then
    echo -1
  fi
  return 0
}
# Send raptoreumd a plain killall, wait 90 seconds, then probe the daemon
# through the CLI. If the probe does not yield a block count (GetNumber turns
# that into a negative value) the daemon is killed with signal 9.
# Globals: LOCAL_HEIGHT (written with the probed block count).
function tryToKillDaemonGracefullyFirst() {
  echo "$(date -u) Trying to kill daemon gracefully..."
  killall raptoreumd
  sleep 90s
  LOCAL_HEIGHT=$(GetNumber "$(ReadCli getblockcount)")
  if ! (( LOCAL_HEIGHT < 0 )); then
    # The CLI answered with a block count, so a daemon is up again.
    echo "$(date -u) Daemon has restarted..."
    return
  fi
  echo "$(date -u) Unable to kill daemon gracefully, force kill it..."
  killall -9 raptoreumd
}
# Fetch the node's PoSe penalty from the explorer and restart the daemon when
# the penalty has grown since the previous run.
# Globals:   NODE_PROTX, URL (read); URL_ID, POSE_SCORE, PREV_SCORE (written).
# Files:     /tmp/pose_score (previous score, rewritten on every call),
#            /tmp/was_stuck (set after a restart, removed when the score drops).
# Exits the whole script after triggering a restart, so the block height check
# is skipped in that run.
function CheckPoSe () {
  local protx_query="api/protx?command=info&protxhash=${NODE_PROTX}"
  # Check if the Node PoSe score is changing.
  if [[ -n ${NODE_PROTX} ]]; then
    POSE_SCORE=$(curl -s "${URL[$URL_ID]}${protx_query}" | jq -r '.state.PoSePenalty')
    # Neither a number nor jq's "null": the explorer did not give a usable
    # answer, so move on to the next explorer in URL and ask once more.
    # The reply is quoted so it is validated as a whole instead of being
    # word-split or glob-expanded first.
    if [[ $(GetNumber "${POSE_SCORE}") -lt 0 && ${POSE_SCORE} != "null" ]]; then
      URL_ID=$(( (URL_ID + 1) % ${#URL[@]} ))
      POSE_SCORE=$(curl -s "${URL[$URL_ID]}${protx_query}" | jq -r '.state.PoSePenalty')
    fi
    if [[ ${POSE_SCORE} == "null" ]]; then
      echo "$(date -u) Your NODE_PROTX is invalid, please insert your NODE_PROTX hash in line #18 of check.sh script."
    elif (( $(GetNumber "${POSE_SCORE}") == -1 )); then
      echo "$(date -u) Could not get PoSe score for the node. It is possible both explorers are down."
    fi
    # Normalise: anything that is not a number becomes -1 from here on.
    POSE_SCORE=$(GetNumber "${POSE_SCORE}")
  else
    echo "$(date -u) Your NODE_PROTX is empty. Please reinitialize the node again or add it in line #18 of check.sh script."
  fi
  PREV_SCORE=$(ReadValue "/tmp/pose_score")
  echo "${POSE_SCORE}" >/tmp/pose_score
  # Check if we should restart raptoreumd according to the PoSe score.
  if (( POSE_SCORE > 0 )); then
    if (( POSE_SCORE > PREV_SCORE )); then
      echo "$(date -u) Score increased from ${PREV_SCORE} to ${POSE_SCORE}. Send kill signal..."
      tryToKillDaemonGracefullyFirst
      echo "1" >/tmp/was_stuck
      # Do not check node height after killing raptoreumd it is sure to be stuck.
      exit
    elif (( POSE_SCORE < PREV_SCORE )); then
      echo "$(date -u) Score decreased from ${PREV_SCORE} to ${POSE_SCORE}. Wait..."
      rm /tmp/was_stuck 2>/dev/null
    fi
    # POSE_SCORE == PREV_SCORE falls through so the caller checks the node
    # block height next.
  fi
}
# Compare the local block height with the explorer's and decide whether the
# daemon is healthy, needs a restart, or is stuck beyond what a restart fixes.
# Globals:   URL, POSE_SCORE, PREV_SCORE (read); URL_ID, NETWORK_HEIGHT,
#            PREV_HEIGHT, LOCAL_HEIGHT (written).
# Files:     /tmp/height (previous local height, rewritten on every call),
#            /tmp/was_stuck (marks that one restart was already attempted).
# Returns:   1 when a restart was already tried and the node is still behind
#            (the caller should force-unstuck it); 0 in every other case.
function CheckBlockHeight () {
  local was_stuck
  # The explorer reply is quoted so it is validated as a whole; a multi-word
  # reply is rejected instead of being accepted by its first word.
  NETWORK_HEIGHT=$(GetNumber "$(curl -s "${URL[$URL_ID]}api/getblockcount")")
  if (( NETWORK_HEIGHT < 0 )); then
    # No usable answer: move on to the next explorer in URL and retry once.
    URL_ID=$(( (URL_ID + 1) % ${#URL[@]} ))
    NETWORK_HEIGHT=$(GetNumber "$(curl -s "${URL[$URL_ID]}api/getblockcount")")
  fi
  PREV_HEIGHT=$(ReadValue "/tmp/height")
  # Check local block height.
  LOCAL_HEIGHT=$(GetNumber "$(ReadCli getblockcount)")
  echo "${LOCAL_HEIGHT}" >/tmp/height
  # Only judge the height when the PoSe score is unchanged or there is no
  # usable previous score.
  if [[ $POSE_SCORE -eq $PREV_SCORE || $PREV_SCORE -eq -1 ]]; then
    echo -n "$(date -u) Node height (${LOCAL_HEIGHT}/${NETWORK_HEIGHT})."
    # Is it stuck? Compare with network block height. Allow some slippage.
    if [[ $((NETWORK_HEIGHT - LOCAL_HEIGHT)) -gt 3 || $NETWORK_HEIGHT == -1 ]]; then
      # Negative (-1) means the marker file is absent: no restart tried yet.
      was_stuck=$(ReadValue "/tmp/was_stuck")
      if (( LOCAL_HEIGHT > PREV_HEIGHT )); then
        # Node is still syncing?
        rm /tmp/was_stuck 2>/dev/null
        echo " Increased from ${PREV_HEIGHT} -> ${LOCAL_HEIGHT}. Wait..."
      elif [[ $LOCAL_HEIGHT -gt 0 && ${was_stuck} -lt 0 ]]; then
        # Node is behind the network height and it is the first attempt at
        # unstucking. LOCAL_HEIGHT > 0 means the CLI answered, but the height
        # did not change compared to the previous check.
        echo "1" >/tmp/was_stuck
        echo " Height difference is more than 3 blocks behind the network. Send kill signal..."
        tryToKillDaemonGracefullyFirst
      elif [[ ${was_stuck} -lt 0 ]]; then
        # Node was not able to respond. It is probably stuck but try to restart
        # it once before trying to bootstrap or restore it.
        echo "1" >/tmp/was_stuck
        echo " Node was unresponsive for the first time. Send kill signal..."
        tryToKillDaemonGracefullyFirst
      else
        # A restart was already tried and did not help; the node is most
        # probably syncing a wrong chain branch. A simple raptoreumd kill will
        # not fix that, so tell the caller to force-unstuck the chain.
        echo " Node seems to be hardstuck and is trying to sync forked chain. Try to force unstuck..."
        return 1
      fi
    else
      rm /tmp/was_stuck 2>/dev/null
      echo " Daemon seems ok..."
    fi
  fi
  return 0
}
# Replace the node's chain data with a bootstrap snapshot.
# Globals:   HOME, BOOTSTRAP_TAR, CONFIG_DIR (read).
# Files:     resets /tmp/height and /tmp/prev_stuck to 0; uses /tmp/bootstrap
#            as the unpacking area and removes it afterwards.
# Returns:   1 without touching anything when CONFIG_DIR is empty; otherwise
#            the status of the final `systemctl start`.
function BootstrapChain () {
  local -r staging="/tmp/bootstrap"
  local -r local_archive="${HOME}/bootstrap/bootstrap.tar.gz"
  # With an empty CONFIG_DIR the cleanup below would target /blocks,
  # /chainstate, ... in the filesystem root, so refuse to run.
  if [[ -z ${CONFIG_DIR} ]]; then
    echo "$(date -u) CONFIG_DIR is empty, refusing to re-bootstrap."
    return 1
  fi
  echo "$(date -u) Re-Bootstrap the node chain."
  echo "0" >/tmp/height
  echo "0" >/tmp/prev_stuck
  echo "$(date -u) Download and prepare rtm-bootstrap."
  rm -rf "${staging}" 2>/dev/null
  mkdir -p "${staging}" 2>/dev/null
  if [[ -f ${local_archive} ]]; then
    # A local archive is preferred over downloading.
    # NOTE(review): chainbackup.sh in the current directory is moved aside
    # while the archive is unpacked, presumably to protect it from being
    # overwritten - confirm whether this is still needed.
    mv chainbackup.sh temp.sh
    tar xzf "${local_archive}" -C "${staging}"
    mv temp.sh chainbackup.sh
  else
    curl -L "$BOOTSTRAP_TAR" | tar xJ -C "${staging}/"
  fi
  # Stop service and kill raptoreumd.
  echo "$(date -u) Kill raptoreumd."
  sudo systemctl stop raptoreum
  killall -9 raptoreumd 2>/dev/null
  echo "$(date -u) Clean ${CONFIG_DIR}."
  # Paths are quoted so a CONFIG_DIR containing spaces is handled as one path.
  rm -rf -- "${CONFIG_DIR}"/{blocks,chainstate,evodb,llmq}
  echo "$(date -u) Insert Bootstrap data."
  mv -- "${staging}"/{blocks,chainstate,evodb,llmq} "${CONFIG_DIR}/"
  rm -rf "${staging}" 2>/dev/null
  echo "$(date -u) Bootstrap complete."
  sudo systemctl start raptoreum
}
# Try to force-unstuck the local node by asking it to reconsider the chain
# from 10 blocks below its current height.
# Globals:   LOCAL_HEIGHT (read); RECONSIDER, HASH (written).
# Files:     /tmp/prev_stuck (height of the previous attempt), /tmp/height.
# Returns:   0 when reconsiderblock was issued and printed nothing;
#            1 when the CLI is unresponsive, the block hash could not be
#            obtained, or the node is stuck at the same height as last time.
function ReconsiderBlock () {
  # If raptoreum-cli is responsive and it is stuck in a different place than before.
  if [[ $LOCAL_HEIGHT -gt 0 && $LOCAL_HEIGHT -gt $(ReadValue "/tmp/prev_stuck") ]]; then
    # Node is still responsive but is stuck on the wrong branch/fork.
    RECONSIDER=$(( LOCAL_HEIGHT - 10 ))
    HASH=$(ReadCli getblockhash "${RECONSIDER}")
    # Only a hexadecimal string can be a block hash. An empty reply, an error
    # text or ReadCli's "-1" timeout marker must not reach reconsiderblock,
    # because its empty output would then be mistaken for success.
    if [[ ${HASH} =~ ^[0-9a-fA-F]+$ ]]; then
      echo "$(date -u) Reconsider chain from 10 blocks before current one ${RECONSIDER}."
      if [[ -z $(ReadCli reconsiderblock "${HASH}") ]]; then
        echo "${RECONSIDER}" >/tmp/height
        echo "${LOCAL_HEIGHT}" >/tmp/prev_stuck
        return 0
      fi
    fi
  fi
  # raptoreum-cli is/was unresponsive in at least 1 step
  return 1
}
# Check the PoSe score according to the explorer data. CheckPoSe exits the
# script itself when it has just restarted the daemon.
CheckPoSe
# Escalate step by step: each later step runs only when the one before it
# returned non-zero (height check -> reconsider block -> full re-bootstrap).
if ! CheckBlockHeight; then
  if ! ReconsiderBlock; then
    BootstrapChain
  fi
fi