Skip to content

Commit

Permalink
lightningd: don't abort on incorrect versions, but try to re-exec.
Browse files Browse the repository at this point in the history
You still shouldn't do this (you could get some transient failures),
but at least you have a decent chance if you reinstall over a running
daemon, instead of getting confusing internal errors if message
formats have changed.

Changelog-Added: lightningd: we now try to restart if subdaemons are upgraded underneath us.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Fixes: ElementsProject#4346
  • Loading branch information
rustyrussell committed Apr 8, 2021
1 parent c747297 commit bd75b5c
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 4 deletions.
3 changes: 2 additions & 1 deletion contrib/pyln-testing/pyln/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1216,7 +1216,8 @@ def split_options(self, opts):
'random_hsm',
'feerates',
'wait_for_bitcoind_sync',
'allow_bad_gossip'
'allow_bad_gossip',
'start',
]
node_opts = {k: v for k, v in opts.items() if k in node_opt_keys}
cli_opts = {k: v for k, v in opts.items() if k not in node_opt_keys}
Expand Down
34 changes: 34 additions & 0 deletions lightningd/lightningd.c
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ static struct lightningd *new_lightningd(const tal_t *ctx)
ld->listen = true;
ld->autolisten = true;
ld->reconnect = true;
ld->try_reexec = false;

/*~ This is from ccan/timer: it is efficient for the case where timers
* are deleted before expiry (as is common with timeouts) using an
Expand Down Expand Up @@ -854,6 +855,8 @@ int main(int argc, char *argv[])
struct rlimit nofile = {1024, 1024};
int sigchld_rfd;
int exit_code = 0;
char **orig_argv;
bool try_reexec;

/*~ Make sure that we limit ourselves to something reasonable. Modesty
* is a virtue. */
Expand Down Expand Up @@ -888,6 +891,17 @@ int main(int argc, char *argv[])
ld = new_lightningd(NULL);
ld->state = LD_STATE_RUNNING;

/*~ We store an copy of our arguments before parsing mangles them, so
* we can re-exec if versions of subdaemons change. Note the use of
* notleak() since our leak-detector can't find orig_argv on the
* stack. */
orig_argv = notleak(tal_arr(ld, char *, argc + 1));
for (size_t i = 1; i < argc; i++)
orig_argv[i] = tal_strdup(orig_argv, argv[i]);
/*~ Turn argv[0] into an absolute path (if not already) */
orig_argv[0] = path_join(orig_argv, take(path_cwd(NULL)), argv[0]);
orig_argv[argc] = NULL;

/* Figure out where our daemons are first. */
ld->daemon_dir = find_daemon_dir(ld, argv[0]);
if (!ld->daemon_dir)
Expand Down Expand Up @@ -1144,6 +1158,11 @@ int main(int argc, char *argv[])
* ld->payments, so clean that up. */
clean_tmpctx();

/* Gather these before we free ld! */
try_reexec = ld->try_reexec;
if (try_reexec)
tal_steal(NULL, orig_argv);

/* Free this last: other things may clean up timers. */
timers = tal_steal(NULL, ld->timers);
tal_free(ld);
Expand All @@ -1161,6 +1180,21 @@ int main(int argc, char *argv[])
tal_free(stop_response);
}

/* Were we supposed to restart ourselves? */
if (try_reexec) {
long max_fd;

/* Give a reasonable chance for the install to finish. */
sleep(5);

/* Close all filedescriptors except stdin/stdout/stderr */
max_fd = sysconf(_SC_OPEN_MAX);
for (int i = STDERR_FILENO+1; i < max_fd; i++)
close(i);
execv(orig_argv[0], orig_argv);
err(1, "Failed to re-exec ourselves after version change");
}

/*~ Farewell. Next stop: hsmd/hsmd.c. */
return exit_code;
}
3 changes: 3 additions & 0 deletions lightningd/lightningd.h
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,9 @@ struct lightningd {

/* The round-robin list of channels, for use when doing MPP. */
u64 rr_counter;

/* Should we re-exec ourselves instead of just exiting? */
bool try_reexec;
};

/* Turning this on allows a tal allocation to return NULL, rather than aborting.
Expand Down
10 changes: 7 additions & 3 deletions lightningd/subd.c
Original file line number Diff line number Diff line change
Expand Up @@ -412,8 +412,12 @@ static bool handle_version(struct subd *sd, const u8 *msg)
return false;

if (!streq(ver, version())) {
fatal("subdaemon %s version '%s' not '%s'",
sd->name, ver, version());
log_broken(sd->log, "version '%s' not '%s': restarting",
ver, version());
sd->ld->try_reexec = true;
/* Return us to toplevel lightningd.c */
io_break(sd->ld);
return false;
}
return true;
}
Expand Down Expand Up @@ -473,7 +477,7 @@ static struct io_plan *sd_msg_read(struct io_conn *conn, struct subd *sd)
goto next;
case WIRE_STATUS_VERSION:
if (!handle_version(sd, sd->msg_in))
goto malformed;
goto close;
goto next;
}

Expand Down
11 changes: 11 additions & 0 deletions tests/plugins/badopeningd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#! /bin/sh

# If this file exists, we send that message back, then sleep.
if [ $# = 0 -a -f openingd-version ]; then
# lightningd expects us to write to stdin!
cat openingd-version >&0
sleep 10
exit 0
fi

exec `cat openingd-real` "$@"
31 changes: 31 additions & 0 deletions tests/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -2507,3 +2507,34 @@ def test_listforwards(node_factory, bitcoind):
# out_channel=c24
c24_forwards = l2.rpc.listforwards(out_channel=c24)['forwards']
assert len(c24_forwards) == 1


def test_version_reexec(node_factory, bitcoind):
badopeningd = os.path.join(os.path.dirname(__file__), "plugins", "badopeningd.sh")
version = subprocess.check_output(['lightningd/lightningd',
'--version']).decode('utf-8').splitlines()[0]

l1, l2 = node_factory.get_nodes(2, opts=[{'subdaemon': 'openingd:' + badopeningd,
'start': False,
'allow_broken_log': True},
{}])
# We use a file to tell our openingd wrapper where the real one is
with open(os.path.join(l1.daemon.lightning_dir, TEST_NETWORK, "openingd-real"), 'w') as f:
f.write(os.path.abspath('lightningd/lightning_openingd'))

l1.start()
# This is a "version" message
verfile = os.path.join(l1.daemon.lightning_dir, TEST_NETWORK, "openingd-version")
with open(verfile, 'wb') as f:
f.write(bytes.fromhex('0000000d' # len
'fff6')) # type
f.write(bytes('badversion\0', encoding='utf8'))

l1.rpc.connect(l2.info['id'], 'localhost', l2.port)

l1.daemon.wait_for_log("openingd.*version 'badversion' not '{}': restarting".format(version))

# Now "fix" it, it should restart.
os.unlink(verfile)
l1.daemon.wait_for_log("Server started with public key")

0 comments on commit bd75b5c

Please sign in to comment.