diff --git a/channeld/channeld.c b/channeld/channeld.c index 42cdfa61c310..7f33b1a8c5d9 100644 --- a/channeld/channeld.c +++ b/channeld/channeld.c @@ -3040,8 +3040,6 @@ static void init_channel(struct peer *peer) assert(!(fcntl(MASTER_FD, F_GETFL) & O_NONBLOCK)); - status_setup_sync(MASTER_FD); - msg = wire_sync_read(tmpctx, MASTER_FD); if (!fromwire_channeld_init(peer, msg, &chainparams, @@ -3225,6 +3223,8 @@ int main(int argc, char *argv[]) subdaemon_setup(argc, argv); + status_setup_sync(MASTER_FD); + peer = tal(NULL, struct peer); peer->expecting_pong = false; timers_init(&peer->timers, time_mono()); diff --git a/common/status.c b/common/status.c index c54944bb4846..42612de05e52 100644 --- a/common/status.c +++ b/common/status.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,9 @@ void status_setup_sync(int fd) assert(!status_conn); status_fd = fd; setup_logging_sighandler(); + + /* Send version now. */ + status_send(take(towire_status_version(NULL, version()))); } static void destroy_daemon_conn(struct daemon_conn *dc UNUSED) @@ -75,6 +79,9 @@ void status_setup_async(struct daemon_conn *master) tal_add_destructor(master, destroy_daemon_conn); setup_logging_sighandler(); + + /* Send version now. */ + status_send(take(towire_status_version(NULL, version()))); } void status_send(const u8 *msg TAKES) diff --git a/common/status_wire.csv b/common/status_wire.csv index d427af9d2b6f..93bf2e89ebec 100644 --- a/common/status_wire.csv +++ b/common/status_wire.csv @@ -23,4 +23,7 @@ msgtype,status_peer_connection_lost,0xFFF3 msgtype,status_peer_billboard,0xFFF5 msgdata,status_peer_billboard,perm,bool, msgdata,status_peer_billboard,happenings,wirestring, + +msgtype,status_version,0xFFF6 +msgdata,status_version,version,wirestring, # Note: 0xFFFF is reserved for MSG_PASS_FD! diff --git a/common/status_wiregen.c b/common/status_wiregen.c index 39864d0b3623..e56e59aac151 100644 --- a/common/status_wiregen.c +++ b/common/status_wiregen.c @@ -25,6 +25,7 @@ const char *status_wire_name(int e) case WIRE_STATUS_FAIL: return "WIRE_STATUS_FAIL"; case WIRE_STATUS_PEER_CONNECTION_LOST: return "WIRE_STATUS_PEER_CONNECTION_LOST"; case WIRE_STATUS_PEER_BILLBOARD: return "WIRE_STATUS_PEER_BILLBOARD"; + case WIRE_STATUS_VERSION: return "WIRE_STATUS_VERSION"; } snprintf(invalidbuf, sizeof(invalidbuf), "INVALID %i", e); @@ -39,6 +40,7 @@ bool status_wire_is_defined(u16 type) case WIRE_STATUS_FAIL:; case WIRE_STATUS_PEER_CONNECTION_LOST:; case WIRE_STATUS_PEER_BILLBOARD:; + case WIRE_STATUS_VERSION:; return true; } return false; @@ -191,4 +193,25 @@ bool fromwire_status_peer_billboard(const tal_t *ctx, const void *p, bool *perm, *happenings = fromwire_wirestring(ctx, &cursor, &plen); return cursor != NULL; } -// SHA256STAMP:67770c6a9a4205f10a455ea925afc2be4f853642b1e7ed70e3867221c64b7abc + +/* WIRE: STATUS_VERSION */ +u8 *towire_status_version(const tal_t *ctx, const wirestring *version) +{ + u8 *p = tal_arr(ctx, u8, 0); + + towire_u16(&p, WIRE_STATUS_VERSION); + towire_wirestring(&p, version); + + return memcheck(p, tal_count(p)); +} +bool fromwire_status_version(const tal_t *ctx, const void *p, wirestring **version) +{ + const u8 *cursor = p; + size_t plen = tal_count(p); + + if (fromwire_u16(&cursor, &plen) != WIRE_STATUS_VERSION) + return false; + *version = fromwire_wirestring(ctx, &cursor, &plen); + return cursor != NULL; +} +// SHA256STAMP:8e1ba9cbc812c8aad76c5049fcecefea2d706a100423c93d3c3be0afcbee851e diff --git a/common/status_wiregen.h b/common/status_wiregen.h index 854ca95fbdaa..c41cb23111ee 100644 --- a/common/status_wiregen.h +++ b/common/status_wiregen.h @@ -17,6 +17,7 @@ enum status_wire { WIRE_STATUS_FAIL = 0xFFF2, WIRE_STATUS_PEER_CONNECTION_LOST = 0xFFF3, WIRE_STATUS_PEER_BILLBOARD = 0xFFF5, + WIRE_STATUS_VERSION = 0xFFF6, }; const char *status_wire_name(int e); @@ -51,6 +52,10 @@ bool fromwire_status_peer_connection_lost(const void *p); u8 *towire_status_peer_billboard(const tal_t *ctx, bool perm, const wirestring *happenings); bool fromwire_status_peer_billboard(const tal_t *ctx, const void *p, bool *perm, wirestring **happenings); +/* WIRE: STATUS_VERSION */ +u8 *towire_status_version(const tal_t *ctx, const wirestring *version); +bool fromwire_status_version(const tal_t *ctx, const void *p, wirestring **version); + #endif /* LIGHTNING_COMMON_STATUS_WIREGEN_H */ -// SHA256STAMP:67770c6a9a4205f10a455ea925afc2be4f853642b1e7ed70e3867221c64b7abc +// SHA256STAMP:8e1ba9cbc812c8aad76c5049fcecefea2d706a100423c93d3c3be0afcbee851e diff --git a/contrib/pyln-testing/pyln/testing/utils.py b/contrib/pyln-testing/pyln/testing/utils.py index 944fd2c86ef5..911968136384 100644 --- a/contrib/pyln-testing/pyln/testing/utils.py +++ b/contrib/pyln-testing/pyln/testing/utils.py @@ -1216,7 +1216,8 @@ def split_options(self, opts): 'random_hsm', 'feerates', 'wait_for_bitcoind_sync', - 'allow_bad_gossip' + 'allow_bad_gossip', + 'start', ] node_opts = {k: v for k, v in opts.items() if k in node_opt_keys} cli_opts = {k: v for k, v in opts.items() if k not in node_opt_keys} diff --git a/gossipd/gossipd.c b/gossipd/gossipd.c index b8e6fc065bb9..1990a53e055f 100644 --- a/gossipd/gossipd.c +++ b/gossipd/gossipd.c @@ -1155,6 +1155,10 @@ static struct io_plan *gossip_init(struct io_conn *conn, /* Fire up the seeker! */ daemon->seeker = new_seeker(daemon); + /* connectd is already started, and uses this fd to ask us things. */ + daemon->connectd = daemon_conn_new(daemon, CONNECTD_FD, + connectd_req, NULL, daemon); + return daemon_conn_read_next(conn, daemon->master); } @@ -1952,10 +1956,6 @@ int main(int argc, char *argv[]) status_setup_async(daemon->master); - /* connectd is already started, and uses this fd to ask us things. */ - daemon->connectd = daemon_conn_new(daemon, CONNECTD_FD, - connectd_req, NULL, daemon); - /* This loop never exits. io_loop() only returns if a timer has * expired, or io_break() is called, or all fds are closed. We don't * use io_break and closing the lightningd fd calls master_gone() diff --git a/lightningd/lightningd.c b/lightningd/lightningd.c index 3aa98da41366..e8c6cb1b8069 100644 --- a/lightningd/lightningd.c +++ b/lightningd/lightningd.c @@ -210,6 +210,7 @@ static struct lightningd *new_lightningd(const tal_t *ctx) ld->listen = true; ld->autolisten = true; ld->reconnect = true; + ld->try_reexec = false; /*~ This is from ccan/timer: it is efficient for the case where timers * are deleted before expiry (as is common with timeouts) using an @@ -854,6 +855,8 @@ int main(int argc, char *argv[]) struct rlimit nofile = {1024, 1024}; int sigchld_rfd; int exit_code = 0; + char **orig_argv; + bool try_reexec; /*~ Make sure that we limit ourselves to something reasonable. Modesty * is a virtue. */ @@ -888,6 +891,17 @@ int main(int argc, char *argv[]) ld = new_lightningd(NULL); ld->state = LD_STATE_RUNNING; + /*~ We store an copy of our arguments before parsing mangles them, so + * we can re-exec if versions of subdaemons change. Note the use of + * notleak() since our leak-detector can't find orig_argv on the + * stack. */ + orig_argv = notleak(tal_arr(ld, char *, argc + 1)); + for (size_t i = 1; i < argc; i++) + orig_argv[i] = tal_strdup(orig_argv, argv[i]); + /*~ Turn argv[0] into an absolute path (if not already) */ + orig_argv[0] = path_join(orig_argv, take(path_cwd(NULL)), argv[0]); + orig_argv[argc] = NULL; + /* Figure out where our daemons are first. */ ld->daemon_dir = find_daemon_dir(ld, argv[0]); if (!ld->daemon_dir) @@ -1144,6 +1158,11 @@ int main(int argc, char *argv[]) * ld->payments, so clean that up. */ clean_tmpctx(); + /* Gather these before we free ld! */ + try_reexec = ld->try_reexec; + if (try_reexec) + tal_steal(NULL, orig_argv); + /* Free this last: other things may clean up timers. */ timers = tal_steal(NULL, ld->timers); tal_free(ld); @@ -1161,6 +1180,21 @@ int main(int argc, char *argv[]) tal_free(stop_response); } + /* Were we supposed to restart ourselves? */ + if (try_reexec) { + long max_fd; + + /* Give a reasonable chance for the install to finish. */ + sleep(5); + + /* Close all filedescriptors except stdin/stdout/stderr */ + max_fd = sysconf(_SC_OPEN_MAX); + for (int i = STDERR_FILENO+1; i < max_fd; i++) + close(i); + execv(orig_argv[0], orig_argv); + err(1, "Failed to re-exec ourselves after version change"); + } + /*~ Farewell. Next stop: hsmd/hsmd.c. */ return exit_code; } diff --git a/lightningd/lightningd.h b/lightningd/lightningd.h index bef1bbc8fb8f..2e14aab26c72 100644 --- a/lightningd/lightningd.h +++ b/lightningd/lightningd.h @@ -278,6 +278,9 @@ struct lightningd { /* The round-robin list of channels, for use when doing MPP. */ u64 rr_counter; + + /* Should we re-exec ourselves instead of just exiting? */ + bool try_reexec; }; /* Turning this on allows a tal allocation to return NULL, rather than aborting. diff --git a/lightningd/subd.c b/lightningd/subd.c index c53bc586e12d..0a9b0c2beb53 100644 --- a/lightningd/subd.c +++ b/lightningd/subd.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -12,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -402,6 +404,29 @@ static bool handle_set_billboard(struct subd *sd, const u8 *msg) return true; } +static bool handle_version(struct subd *sd, const u8 *msg) +{ + char *ver; + + if (!fromwire_status_version(msg, msg, &ver)) + return false; + + if (!streq(ver, version())) { + log_broken(sd->log, "version '%s' not '%s': restarting", + ver, version()); + sd->ld->try_reexec = true; + /* Return us to toplevel lightningd.c */ + io_break(sd->ld); + return false; + } + + sd->rcvd_version = true; + /* In case there are outgoing msgs, we can send now. */ + msg_wake(sd->outq); + + return true; +} + static struct io_plan *sd_msg_read(struct io_conn *conn, struct subd *sd) { int type = fromwire_peektype(sd->msg_in); @@ -455,6 +480,10 @@ static struct io_plan *sd_msg_read(struct io_conn *conn, struct subd *sd) if (!handle_set_billboard(sd, sd->msg_in)) goto malformed; goto next; + case WIRE_STATUS_VERSION: + if (!handle_version(sd, sd->msg_in)) + goto close; + goto next; } if (sd->channel) { @@ -581,10 +610,15 @@ static void destroy_subd(struct subd *sd) static struct io_plan *msg_send_next(struct io_conn *conn, struct subd *sd) { - const u8 *msg = msg_dequeue(sd->outq); + const u8 *msg; int fd; + /* Don't send if we haven't read version! */ + if (!sd->rcvd_version) + return msg_queue_wait(conn, sd->outq, msg_send_next, sd); + /* Nothing to do? Wait for msg_enqueue. */ + msg = msg_dequeue(sd->outq); if (!msg) return msg_queue_wait(conn, sd->outq, msg_send_next, sd); @@ -678,6 +712,7 @@ static struct subd *new_subd(struct lightningd *ld, tal_add_destructor(sd, destroy_subd); list_head_init(&sd->reqs); sd->channel = channel; + sd->rcvd_version = false; if (node_id) sd->node_id = tal_dup(sd, struct node_id, node_id); else diff --git a/lightningd/subd.h b/lightningd/subd.h index 7fc584cad63d..94fabf9e3a32 100644 --- a/lightningd/subd.h +++ b/lightningd/subd.h @@ -32,6 +32,9 @@ struct subd { /* If we are associated with a single channel, this points to it. */ void *channel; + /* Have we received the version msg yet? Don't send until we do. */ + bool rcvd_version; + /* For logging */ struct log *log; const struct node_id *node_id; diff --git a/lightningd/test/run-find_my_abspath.c b/lightningd/test/run-find_my_abspath.c index 245b5cb18fe9..946f38e27788 100644 --- a/lightningd/test/run-find_my_abspath.c +++ b/lightningd/test/run-find_my_abspath.c @@ -90,6 +90,9 @@ bool fromwire_status_peer_billboard(const tal_t *ctx UNNEEDED, const void *p UNN /* Generated stub for fromwire_status_peer_error */ bool fromwire_status_peer_error(const tal_t *ctx UNNEEDED, const void *p UNNEEDED, struct channel_id *channel UNNEEDED, wirestring **desc UNNEEDED, bool *warning UNNEEDED, struct per_peer_state **pps UNNEEDED, u8 **error_for_them UNNEEDED) { fprintf(stderr, "fromwire_status_peer_error called!\n"); abort(); } +/* Generated stub for fromwire_status_version */ +bool fromwire_status_version(const tal_t *ctx UNNEEDED, const void *p UNNEEDED, wirestring **version UNNEEDED) +{ fprintf(stderr, "fromwire_status_version called!\n"); abort(); } /* Generated stub for gossip_init */ void gossip_init(struct lightningd *ld UNNEEDED, int connectd_fd UNNEEDED) { fprintf(stderr, "gossip_init called!\n"); abort(); } diff --git a/tests/plugins/badopeningd.sh b/tests/plugins/badopeningd.sh new file mode 100755 index 000000000000..b674ca2a4745 --- /dev/null +++ b/tests/plugins/badopeningd.sh @@ -0,0 +1,11 @@ +#! /bin/sh + +# If this file exists, we send that message back, then sleep. +if [ $# = 0 ] && [ -f openingd-version ]; then + # lightningd expects us to write to stdin! + cat openingd-version >&0 + sleep 10 + exit 0 +fi + +exec "$(cat openingd-real)" "$@" diff --git a/tests/test_misc.py b/tests/test_misc.py index e45aff0b06d6..af07f716e9cb 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -2507,3 +2507,33 @@ def test_listforwards(node_factory, bitcoind): # out_channel=c24 c24_forwards = l2.rpc.listforwards(out_channel=c24)['forwards'] assert len(c24_forwards) == 1 + + +def test_version_reexec(node_factory, bitcoind): + badopeningd = os.path.join(os.path.dirname(__file__), "plugins", "badopeningd.sh") + version = subprocess.check_output(['lightningd/lightningd', + '--version']).decode('utf-8').splitlines()[0] + + l1, l2 = node_factory.get_nodes(2, opts=[{'subdaemon': 'openingd:' + badopeningd, + 'start': False, + 'allow_broken_log': True}, + {}]) + # We use a file to tell our openingd wrapper where the real one is + with open(os.path.join(l1.daemon.lightning_dir, TEST_NETWORK, "openingd-real"), 'w') as f: + f.write(os.path.abspath('lightningd/lightning_openingd')) + + l1.start() + # This is a "version" message + verfile = os.path.join(l1.daemon.lightning_dir, TEST_NETWORK, "openingd-version") + with open(verfile, 'wb') as f: + f.write(bytes.fromhex('0000000d' # len + 'fff6')) # type + f.write(bytes('badversion\0', encoding='utf8')) + + l1.rpc.connect(l2.info['id'], 'localhost', l2.port) + + l1.daemon.wait_for_log("openingd.*version 'badversion' not '{}': restarting".format(version)) + + # Now "fix" it, it should restart. + os.unlink(verfile) + l1.daemon.wait_for_log("Server started with public key")