diff --git a/README.md b/README.md index 6589b6c..ac1a0b9 100644 --- a/README.md +++ b/README.md @@ -60,23 +60,20 @@ in the bot's directory. - `run {start,stop,restart}`: execute the relevant action for the bot. - `run update [ref]`: restart the bot with the branch or PR - For branch: `ssh user@remote '/home/benchbot/bench-bot/run update master'` - - For PR: `ssh user@remote '/home/benchbot/bench-bot/run update pull/number/head:branch'` + - For PR: `ssh user@remote '/home/benchbot/bench-bot/run update pull/number/head:branch'` e.g. `pull/1/head:master` -### Monitoring Service commands - -- `run monitor {install,uninstall}`: install or uninstall the monitoring - service -- `run monitor {start,restart,stop,status,...}`: acts as a wrapper for - `systemctl` - ### Logs -The logs will be output to the systemd journal: - -`sudo journalctl -u benchbot-monitor.service` +- Logs from the systemd journal: + - `journalctl --follow --identifier benchbot` follows the output as if you + were running the command in the foreground + - `journalctl --pagerend --identifier benchbot` goes to the end of the output + in pager mode and therefore allows you paginate through the history + - This log is cleared between machine restarts -As well as to `./log.txt`. +- Full log history: `less +G /home/benchbot/bench-bot/log.txt` + - This log is only cleared manually # Required Github settings diff --git a/bench.js b/bench.js index 268c34a..5ea814c 100644 --- a/bench.js +++ b/bench.js @@ -40,7 +40,7 @@ function BenchContext(app, config) { stdout = result.stdout } catch (err) { error = true - app.log.fatal({ + config.logFatal({ msg: "Caught exception in command execution", error: err, }) @@ -532,7 +532,7 @@ function benchmarkRuntime(app, config) { ) if (last.error) { extraInfo = `ERROR: Unable to commit file ${outputFile}` - app.log.fatal({ + config.logFatal({ msg: extraInfo, stdout: last.stdout, stderr: last.stderr, @@ -546,7 +546,7 @@ function benchmarkRuntime(app, config) { ) if (last.error) { extraInfo = `ERROR: Unable to push ${outputFile}` - app.log.fatal({ + config.logFatal({ msg: extraInfo, stdout: last.stdout, stderr: last.stderr, @@ -556,7 +556,7 @@ function benchmarkRuntime(app, config) { } catch (error) { extraInfo = "NOTE: Caught exception while trying to push commits to the repository" - app.log.fatal({ msg: extraInfo, error }) + config.logFatal({ msg: extraInfo, error }) } } } diff --git a/index.js b/index.js index 279127e..4a05239 100644 --- a/index.js +++ b/index.js @@ -9,7 +9,7 @@ const githubCommentLimitLength = 65536 const githubCommentLimitTruncateMessage = "..." let isTerminating = false -let appFatalLogger = undefined +let logFatal = undefined for (const event of ["uncaughtException", "unhandledRejection"]) { process.on(event, function (error, origin) { @@ -19,8 +19,8 @@ for (const event of ["uncaughtException", "unhandledRejection"]) { isTerminating = true try { - if (appFatalLogger) { - appFatalLogger({ event, error, origin }) + if (logFatal) { + logFatal({ event, error, origin }) } } catch (error) { console.error({ level: "error", event, error, origin, exception }) @@ -35,7 +35,25 @@ module.exports = (app) => { app.log("Running in debug mode") } - appFatalLogger = app.log.fatal + // Crash the server on Probot failures or errors + // We retain the original error handlers on logError and logFatal so that the + // application can still report errors on the expected channels + // This is necessary to work around problems in reconnection issues from our + // event source + // (https://github.com/paritytech/bench-bot/issues/83#issuecomment-1024283664) + // FIXME: This is suboptimal and we should not have to stop the application in + // case of errors + // The server will automatically restarted on failures in ./run + const logError = app.log.error + app.log.error = function(...args) { + logError(...args) + process.exit(1) + } + logFatal = app.log.fatal + app.log.fatal = function(...args) { + logFatal(...args) + process.exit(1) + } const baseBranch = process.env.BASE_BRANCH || "master" app.log.debug(`base branch: ${baseBranch}`) @@ -140,6 +158,7 @@ module.exports = (app) => { id: action, extra, getPushDomain, + logFatal } let report @@ -156,10 +175,10 @@ module.exports = (app) => { } if (report.isError) { - app.log.error(report.message) + logError(report.message) if (report.error) { - app.log.error(report.error) + logError(report.error) } const output = `${report.message}${report.error ? `: ${report.error.toString()}` : "" @@ -221,7 +240,7 @@ ${extraInfo} body, }) } catch (error) { - app.log.fatal({ + logFatal({ error, repo, owner, diff --git a/run b/run index 640bdf9..f4360f1 100755 --- a/run +++ b/run @@ -32,19 +32,13 @@ check_executables() { done } -check_monitor_runtime_executables() { - executables=( - tail inotifywait wc cut - ) - check_executables -} - check_app_runtime_executables() { if [ -e ~/.cargo/env ]; then . ~/.cargo/env fi executables=( - rustup cargo git bash + rustup cargo git bash systemd-cat tee yarn awk tmux ionice sudo nice + wc cut ) check_executables } @@ -55,15 +49,7 @@ remote_repo="https://github.com/paritytech/$remote_repo_name" benchbot_user="benchbot" benchbot_session="/tmp/bench-bot" install_location="/home/$benchbot_user/bench-bot" - -exec_log_dir_parent="/home/$benchbot_user" -exec_log_dir="$install_location" -exec_log_file_name="log.txt" -exec_log_file="$exec_log_dir/$exec_log_file_name" - -monitor_service="benchbot-monitor" -monitor_service_dir="/usr/lib/systemd/system" -monitor_service_file="$monitor_service_dir/$monitor_service.service" +log_file="$install_location/log.txt" print_help_and_exit() { echo " @@ -76,11 +62,6 @@ Commands: start, stop, restart: Execute the relevant subcommand for the bot's process. - monitor: - Use 'monitor install' or 'monitor uninstall' for setting up the bot's - monitoring service. - Otherwise, the arguments are forwarded to systemctl. - update [ref]: Pull a ref (branch or pull request) from $remote_repo, install it and restart the bot. @@ -231,12 +212,12 @@ handle_exec() { exit_with_error "the $benchbot_user user is already running a process" fi - if [ -e "$exec_log_file" ]; then - local start_from_line="$(wc -l "$exec_log_file" | cut -d ' ' -f1)" - exit_if_error "Failed to count the lines in $exec_log_file" + if [ -e "$log_file" ]; then + local start_from_line="$(wc -l "$log_file" | cut -d ' ' -f1)" + exit_if_error "Failed to count the lines in $log_file" start_from_line=$(( start_from_line + 1 )) else - echo "" > "$exec_log_file" + echo "" > "$log_file" unset start_from_line fi @@ -248,26 +229,33 @@ handle_exec() { ;; esac + # don't want to spam errors in case something is broken; at least + # wait 1 second between attempts in the "while true" loop below sudo ionice -c 1 -n 0 sudo nice -n -19 sudo -u $benchbot_user \ tmux new-session -d bash -c " . ~/.cargo/env && cd \"$install_location\" && git config --local user.name 'Parity Bot' && git config --local user.email admin@parity.io && + git config --local advice.detachedHead false && yarn && - ${env_vars:-} yarn start 2>&1 | tee -a \"$exec_log_file\" + while true; do + ${env_vars:-} yarn start 2>&1 | \ + tee -a \"$log_file\" | \ + awk '{ print \"bb: \" \$0 }' | \ + systemd-cat -t benchbot; + sleep 1; + done " exit_if_error "Failed to create tmux session for user $benchbot_user" echo -e "\nNote: the command will still be running after quitting this terminal. Use \"run stop\" for stopping it.\n" - tail "--lines=+${start_from_line:-0}" -f "$exec_log_file" + tail "--lines=+${start_from_line:-0}" -f "$log_file" ;; stop) if pgrep -u benchbot &>/dev/null; then sudo pkill -u benchbot - else - return 0 fi ;; restart) @@ -280,156 +268,6 @@ handle_exec() { esac } -stop_follow_log_file() { - if [ ! "${follow_log_file_tail_pid:-}" ]; then - return - fi - - kill -9 "$follow_log_file_tail_pid" - exit_if_error "Failed to kill tail process $follow_log_file_tail_pid" - - unset follow_log_file_tail_pid -} - -start_follow_log_file() { - stop_follow_log_file - - local start_from_line="$(wc -l "$exec_log_file" | cut -d ' ' -f1)" - exit_if_error "Failed to count the lines in $exec_log_file" - start_from_line=$(( start_from_line + 1 )) - tail "--lines=+$start_from_line" -f "$exec_log_file" | awk '{ print "bb: " $0 }' & - follow_log_file_tail_pid=$? -} - -parse_log_file_notification_line() { - if [[ ! "$1" =~ ^([^[:space:]]+)[[:space:]]+(.*) ]]; then - exit_with_error "Notification line did not have the expected format" - fi -} - -follow_log_file() { - while true; do - # Monitor the log file while it exists - if [ -e "$exec_log_dir" ]; then - start_follow_log_file - - while IFS= read line; do - parse_log_file_notification_line "$line" - - local event="${BASH_REMATCH[1]}" - case "$event" in - DELETE_SELF) - break - ;; - esac - - local file="${BASH_REMATCH[2]}" - if [ "$file" != "$exec_log_file_name" ]; then - continue - fi - - case "$event" in - CREATE) - start_follow_log_file - ;; - DELETE) - stop_follow_log_file - ;; - *) - exit_with_error "Unhandled event $event for $exec_log_dir" - ;; - esac - done < <(inotifywait -e create,delete,delete_self --format '%e %f' --monitor --quiet "$exec_log_dir") - # If the log file does not exist, then wait for the log file's directory to - # be created - elif [ -e "$exec_log_dir_parent" ]; then - while IFS= read line; do - parse_log_file_notification_line "$line" - - local event="${BASH_REMATCH[1]}" - case "$event" in - DELETE_SELF) - break - ;; - CREATE) - if [ "$exec_log_dir_parent/$file" = "$exec_log_dir" ]; then - break - fi - ;; - *) - exit_with_error "Unhandled event $event for $exec_log_dir_parent" - ;; - esac - done < <(inotifywait -e create,delete_self --format '%e %f' --monitor --quiet "$exec_log_dir_parent") - else - exit_with_error "Unable to watch '$exec_log_dir_parent' for '$exec_log_dir'" - fi - done -} - -handle_monitor() { - local cmd="$1" - shift - - case "$cmd" in - install) - if [ "${1:-}" != "--force" ] && [ -e "$monitor_service_file" ]; then - return - fi - - &>/dev/null sudo mkdir -p "$monitor_service_dir" - - echo " - [Unit] - Description=Bench Bot Monitor - Documentation=$remote_repo - - [Service] - ExecStart=sh -c \"'$install_location/run' follow_log_file\" - Restart=always - RestartSec=30 - CapabilityBoundingSet= - LockPersonality=true - NoNewPrivileges=true - PrivateDevices=true - PrivateMounts=true - PrivateTmp=true - PrivateUsers=true - ProtectControlGroups=true - ProtectHostname=true - ProtectKernelModules=true - ProtectKernelTunables=true - ProtectSystem=strict - RemoveIPC=true - RestrictNamespaces=true - RestrictSUIDSGID=true - SystemCallArchitectures=native - - [Install] - WantedBy=default.target - " | sudo tee "$monitor_service_file" >/dev/null - - exit_if_error "Failed to create service file at $monitor_service_file" - - if [ -e "$exec_log_file" ] || [ -e "$exec_log_dir_parent" ]; then - sudo systemctl enable --now "$monitor_service" - else - log "The service \"$monitor_service\" was not activated (is the bot cloned at \"$install_location\"?). Start it later with with \"run monitor enable --now\"." - fi - ;; - uninstall) - if systemctl is-active --quiet "$monitor_service"; then - sudo systemctl disable --now "$monitor_service" - exit_if_error "Failed to disable service $monitor_service" - fi - sudo rm "$monitor_service_file" - ;; - *) - sudo systemctl "$cmd" "$monitor_service" "$@" - ;; - esac -} - main() { local cmd="$1" shift @@ -440,9 +278,6 @@ main() { $as_benchbot bash -c "'${BASH_SOURCE[0]}' check_app_runtime_executables" exit_if_error ;; - monitor) - check_monitor_runtime_executables - ;; esac case "$cmd" in @@ -468,13 +303,6 @@ main() { bash -c "'${BASH_SOURCE[0]}' start $@" exit_if_error "Failed to start" ;; - monitor) - "handle_$cmd" "$@" - local exit_code=$? - echo "Exit code: $exit_code" - exit $exit_code - ;; - follow_log_file | \ install_repo | \ install_ref | \ install_deps | \ @@ -489,8 +317,6 @@ main() { $as_benchbot bash -c "'${BASH_SOURCE[0]}' install_repo" exit_if_error "Failed to install repository" - - handle_monitor install ;; help) print_help_and_exit 0