From 0d18a1e85fdb9967419dc9f7a5905452d065c84a Mon Sep 17 00:00:00 2001 From: Allen Farris Date: Thu, 28 Jan 2021 13:18:05 -0800 Subject: [PATCH] implement FAILOVER command (#8315) Implement FAILOVER command, which coordinates failover between the server and one of its replicas. --- src/networking.c | 6 +- src/replication.c | 336 ++++++++++++++++++++++++++++++++- src/server.c | 31 ++- src/server.h | 21 +++ tests/integration/failover.tcl | 290 ++++++++++++++++++++++++++++ tests/support/util.tcl | 10 +- tests/test_helper.tcl | 1 + 7 files changed, 679 insertions(+), 16 deletions(-) create mode 100644 tests/integration/failover.tcl diff --git a/src/networking.c b/src/networking.c index 893359ec53..da611675c9 100644 --- a/src/networking.c +++ b/src/networking.c @@ -2680,7 +2680,7 @@ NULL c->argc == 4)) { /* CLIENT PAUSE TIMEOUT [WRITE|ALL] */ - long long duration; + mstime_t end; int type = CLIENT_PAUSE_ALL; if (c->argc == 4) { if (!strcasecmp(c->argv[3]->ptr,"write")) { @@ -2694,9 +2694,9 @@ NULL } } - if (getTimeoutFromObjectOrReply(c,c->argv[2],&duration, + if (getTimeoutFromObjectOrReply(c,c->argv[2],&end, UNIT_MILLISECONDS) != C_OK) return; - pauseClients(duration, type); + pauseClients(end, type); addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"tracking") && c->argc >= 3) { /* CLIENT TRACKING (on|off) [REDIRECT ] [BCAST] [PREFIX first] diff --git a/src/replication.c b/src/replication.c index a9ec30e74e..f23fcb6de8 100644 --- a/src/replication.c +++ b/src/replication.c @@ -721,6 +721,36 @@ void syncCommand(client *c) { /* ignore SYNC if already slave or in monitor mode */ if (c->flags & CLIENT_SLAVE) return; + /* Check if this is a failover request to a replica with the same replid and + * become a master if so. */ + if (c->argc > 3 && !strcasecmp(c->argv[0]->ptr,"psync") && + !strcasecmp(c->argv[3]->ptr,"failover")) + { + serverLog(LL_WARNING, "Failover request received for replid %s.", + (unsigned char *)c->argv[1]->ptr); + if (!server.masterhost) { + addReplyError(c, "PSYNC FAILOVER can't be sent to a master."); + return; + } + + if (!strcasecmp(c->argv[1]->ptr,server.replid)) { + replicationUnsetMaster(); + sds client = catClientInfoString(sdsempty(),c); + serverLog(LL_NOTICE, + "MASTER MODE enabled (failover request from '%s')",client); + sdsfree(client); + } else { + addReplyError(c, "PSYNC FAILOVER replid must match my replid."); + return; + } + } + + /* Don't let replicas sync with us while we're failing over */ + if (server.failover_state != NO_FAILOVER) { + addReplyError(c,"-NOMASTERLINK Can't SYNC while failing over"); + return; + } + /* Refuse SYNC requests if we are a slave but the link with our master * is not ok... */ if (server.masterhost && server.repl_state != REPL_STATE_CONNECTED) { @@ -2031,8 +2061,15 @@ int slaveTryPartialResynchronization(connection *conn, int read_reply) { memcpy(psync_offset,"-1",3); } - /* Issue the PSYNC command */ - reply = sendCommand(conn,"PSYNC",psync_replid,psync_offset,NULL); + /* Issue the PSYNC command, if this is a master with a failover in + * progress then send the failover argument to the replica to cause it + * to become a master */ + if (server.failover_state == FAILOVER_IN_PROGRESS) { + reply = sendCommand(conn,"PSYNC",psync_replid,psync_offset,"FAILOVER",NULL); + } else { + reply = sendCommand(conn,"PSYNC",psync_replid,psync_offset,NULL); + } + if (reply != NULL) { serverLog(LL_WARNING,"Unable to send PSYNC to master: %s",reply); sdsfree(reply); @@ -2356,6 +2393,7 @@ void syncWithMaster(connection *conn) { if (server.repl_state == REPL_STATE_SEND_PSYNC) { if (slaveTryPartialResynchronization(conn,0) == PSYNC_WRITE_ERROR) { err = sdsnew("Write error sending the PSYNC command."); + abortFailover("Write error to failover target"); goto write_error; } server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY; @@ -2373,6 +2411,18 @@ void syncWithMaster(connection *conn) { psync_result = slaveTryPartialResynchronization(conn,1); if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */ + /* Check the status of the planned failover. We expect PSYNC_CONTINUE, + * but there is nothing technically wrong with a full resync which + * could happen in edge cases. */ + if (server.failover_state == FAILOVER_IN_PROGRESS) { + if (psync_result == PSYNC_CONTINUE || psync_result == PSYNC_FULLRESYNC) { + clearFailoverState(); + } else { + abortFailover("Failover target rejected psync request"); + return; + } + } + /* If the master is in an transient error, we should try to PSYNC * from scratch later, so go to the error path. This happens when * the server is loading the dataset or is not connected with its @@ -2678,6 +2728,11 @@ void replicaofCommand(client *c) { return; } + if (server.failover_state != NO_FAILOVER) { + addReplyError(c,"REPLICAOF not allowed while failing over."); + return; + } + /* The special host/port combination "NO" "ONE" turns the instance * into a master. Otherwise the new master address is set. */ if (!strcasecmp(c->argv[1]->ptr,"no") && @@ -3211,6 +3266,10 @@ long long replicationGetSlaveOffset(void) { void replicationCron(void) { static long long replication_cron_loops = 0; + /* Check failover status first, to see if we need to start + * handling the failover. */ + updateFailoverStatus(); + /* Non blocking connection timeout? */ if (server.masterhost && (server.repl_state == REPL_STATE_CONNECTING || @@ -3268,8 +3327,9 @@ void replicationCron(void) { * alter the replication offsets of master and slave, and will no longer * match the one stored into 'mf_master_offset' state. */ int manual_failover_in_progress = - server.cluster_enabled && - server.cluster->mf_end && + ((server.cluster_enabled && + server.cluster->mf_end) || + server.failover_end_time) && checkClientPauseTimeoutAndReturnIfPaused(); if (!manual_failover_in_progress) { @@ -3423,3 +3483,271 @@ void replicationStartPendingFork(void) { } } } + +/* Find replica at IP:PORT from replica list */ +static client *findReplica(char *host, int port) { + listIter li; + listNode *ln; + client *replica; + + listRewind(server.slaves,&li); + while((ln = listNext(&li))) { + replica = ln->value; + char ip[NET_IP_STR_LEN], *replicaip = replica->slave_ip; + + if (replicaip[0] == '\0') { + if (connPeerToString(replica->conn, ip, sizeof(ip), NULL) == -1) + continue; + replicaip = ip; + } + + if (!strcasecmp(host, replicaip) && + (port == replica->slave_listening_port)) + return replica; + } + + return NULL; +} + +const char *getFailoverStateString() { + switch(server.failover_state) { + case NO_FAILOVER: return "no-failover"; + case FAILOVER_IN_PROGRESS: return "failover-in-progress"; + case FAILOVER_WAIT_FOR_SYNC: return "waiting-for-sync"; + default: return "unknown"; + } +} + +/* Resets the internal failover configuration, this needs + * to be called after a failover either succeeds or fails + * as it includes the client unpause. */ +void clearFailoverState() { + server.failover_end_time = 0; + server.force_failover = 0; + zfree(server.target_replica_host); + server.target_replica_host = NULL; + server.target_replica_port = 0; + server.failover_state = NO_FAILOVER; + unpauseClients(); +} + +/* Abort an ongoing failover if one is going on. */ +void abortFailover(const char *err) { + if (server.failover_state == NO_FAILOVER) return; + + if (server.target_replica_host) { + serverLog(LL_NOTICE,"FAILOVER to %s:%d aborted: %s", + server.target_replica_host,server.target_replica_port,err); + } else { + serverLog(LL_NOTICE,"FAILOVER to any replica aborted: %s",err); + } + if (server.failover_state == FAILOVER_IN_PROGRESS) { + replicationUnsetMaster(); + } + clearFailoverState(); +} + +/* + * FAILOVER [TO [FORCE]] [ABORT] [TIMEOUT ] + * + * This command will coordinate a failover between the master and one + * of its replicas. The happy path contains the following steps: + * 1) The master will initiate a client pause write, to stop replication + * traffic. + * 2) The master will periodically check if any of its replicas has + * consumed the entire replication stream through acks. + * 3) Once any replica has caught up, the master will itself become a replica. + * 4) The master will send a PSYNC FAILOVER request to the target replica, which + * if accepted will cause the replica to become the new master and start a sync. + * + * FAILOVER ABORT is the only way to abort a failover command, as replicaof + * will be disabled. This may be needed if the failover is unable to progress. + * + * The optional arguments [TO ] allows designating a specific replica + * to be failed over to. + * + * FORCE flag indicates that even if the target replica is not caught up, + * failover to it anyway. This must be specified with a timeout and a target + * HOST and IP. + * + * TIMEOUT indicates how long should the primary wait for + * a replica to sync up before aborting. If not specified, the failover + * will attempt forever and must be manually aborted. + */ +void failoverCommand(client *c) { + if (server.cluster_enabled) { + addReplyError(c,"FAILOVER not allowed in cluster mode. " + "Use CLUSTER FAILOVER command instead."); + return; + } + + /* Handle special case for abort */ + if ((c->argc == 2) && !strcasecmp(c->argv[1]->ptr,"abort")) { + if (server.failover_state == NO_FAILOVER) { + addReplyError(c, "No failover in progress."); + return; + } + + abortFailover("Failover manually aborted"); + addReply(c,shared.ok); + return; + } + + long timeout_in_ms = 0; + int force_flag = 0; + long port = 0; + char *host = NULL; + + /* Parse the command for syntax and arguments. */ + for (int j = 1; j < c->argc; j++) { + if (!strcasecmp(c->argv[j]->ptr,"timeout") && (j + 1 < c->argc) && + timeout_in_ms == 0) + { + if (getLongFromObjectOrReply(c,c->argv[j + 1], + &timeout_in_ms,NULL) != C_OK) return; + if (timeout_in_ms <= 0) { + addReplyError(c,"FAILOVER timeout must be greater than 0"); + return; + } + j++; + } else if (!strcasecmp(c->argv[j]->ptr,"to") && (j + 2 < c->argc) && + !host) + { + if (getLongFromObjectOrReply(c,c->argv[j + 2],&port,NULL) != C_OK) + return; + host = c->argv[j + 1]->ptr; + j += 2; + } else if (!strcasecmp(c->argv[j]->ptr,"force") && !force_flag) { + force_flag = 1; + } else { + addReplyErrorObject(c,shared.syntaxerr); + return; + } + } + + if (server.failover_state != NO_FAILOVER) { + addReplyError(c,"FAILOVER already in progress."); + return; + } + + if (server.masterhost) { + addReplyError(c,"FAILOVER is not valid when server is a replica."); + return; + } + + if (listLength(server.slaves) == 0) { + addReplyError(c,"FAILOVER requires connected replicas."); + return; + } + + if (force_flag && (!timeout_in_ms || !host)) { + addReplyError(c,"FAILOVER with force option requires both a timeout " + "and target HOST and IP."); + return; + } + + /* If a replica address was provided, validate that it is connected. */ + if (host) { + client *replica = findReplica(host, port); + + if (replica == NULL) { + addReplyError(c,"FAILOVER target HOST and IP is not " + "a replica."); + return; + } + + /* Check if requested replica is online */ + if (replica->replstate != SLAVE_STATE_ONLINE) { + addReplyError(c,"FAILOVER target replica is not online."); + return; + } + + server.target_replica_host = zstrdup(host); + server.target_replica_port = port; + serverLog(LL_NOTICE,"FAILOVER requested to %s:%ld.",host,port); + } else { + serverLog(LL_NOTICE,"FAILOVER requested to any replica."); + } + + mstime_t now = mstime(); + if (timeout_in_ms) { + server.failover_end_time = now + timeout_in_ms; + } + + server.force_failover = force_flag; + server.failover_state = FAILOVER_WAIT_FOR_SYNC; + /* Cluster failover will unpause eventually */ + pauseClients(LLONG_MAX,CLIENT_PAUSE_WRITE); + addReply(c,shared.ok); +} + +/* Failover cron function, checks coordinated failover state. + * + * Implementation note: The current implementation calls replicationSetMaster() + * to start the failover request, this has some unintended side effects if the + * failover doesn't work like blocked clients will be unblocked and replicas will + * be disconnected. This could be optimized further. + */ +void updateFailoverStatus(void) { + if (server.failover_state != FAILOVER_WAIT_FOR_SYNC) return; + mstime_t now = server.mstime; + + /* Check if failover operation has timed out */ + if (server.failover_end_time && server.failover_end_time <= now) { + if (server.force_failover) { + serverLog(LL_NOTICE, + "FAILOVER to %s:%d time out exceeded, failing over.", + server.target_replica_host, server.target_replica_port); + server.failover_state = FAILOVER_IN_PROGRESS; + /* If timeout has expired force a failover if requested. */ + replicationSetMaster(server.target_replica_host, + server.target_replica_port); + return; + } else { + /* Force was not requested, so timeout. */ + abortFailover("Replica never caught up before timeout"); + return; + } + } + + /* Check to see if the replica has caught up so failover can start */ + client *replica = NULL; + if (server.target_replica_host) { + replica = findReplica(server.target_replica_host, + server.target_replica_port); + } else { + listIter li; + listNode *ln; + + listRewind(server.slaves,&li); + /* Find any replica that has matched our repl_offset */ + while((ln = listNext(&li))) { + replica = ln->value; + if (replica->repl_ack_off == server.master_repl_offset) { + char ip[NET_IP_STR_LEN], *replicaip = replica->slave_ip; + + if (replicaip[0] == '\0') { + if (connPeerToString(replica->conn,ip,sizeof(ip),NULL) == -1) + continue; + replicaip = ip; + } + + /* We are now failing over to this specific node */ + server.target_replica_host = zstrdup(replicaip); + server.target_replica_port = replica->slave_listening_port; + break; + } + } + } + + /* We've found a replica that is caught up */ + if (replica && (replica->repl_ack_off == server.master_repl_offset)) { + server.failover_state = FAILOVER_IN_PROGRESS; + serverLog(LL_NOTICE, + "Failover target %s:%d is synced, failing over.", + server.target_replica_host, server.target_replica_port); + /* Designated replica is caught up, failover to it. */ + replicationSetMaster(server.target_replica_host, + server.target_replica_port); + } +} diff --git a/src/server.c b/src/server.c index 8b8da8c898..1166c4b3e7 100644 --- a/src/server.c +++ b/src/server.c @@ -752,7 +752,7 @@ struct redisCommand redisCommandTable[] = { "admin no-script", 0,NULL,0,0,0,0,0,0}, - {"psync",syncCommand,3, + {"psync",syncCommand,-3, "admin no-script", 0,NULL,0,0,0,0,0,0}, @@ -1092,6 +1092,10 @@ struct redisCommand redisCommandTable[] = { {"reset",resetCommand,1, "no-script ok-stale ok-loading fast @connection", + 0,NULL,0,0,0,0,0,0}, + + {"failover",failoverCommand,-1, + "admin no-script ok-stale", 0,NULL,0,0,0,0,0,0} }; @@ -2185,8 +2189,15 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) { checkClientPauseTimeoutAndReturnIfPaused(); /* Replication cron function -- used to reconnect to master, - * detect transfer failures, start background RDB transfers and so forth. */ - run_with_period(1000) replicationCron(); + * detect transfer failures, start background RDB transfers and so forth. + * + * If Redis is trying to failover then run the replication cron faster so + * progress on the handshake happens more quickly. */ + if (server.failover_state != NO_FAILOVER) { + run_with_period(100) replicationCron(); + } else { + run_with_period(1000) replicationCron(); + } /* Run the Redis Cluster cron. */ run_with_period(100) { @@ -2397,6 +2408,11 @@ void beforeSleep(struct aeEventLoop *eventLoop) { server.get_ack_from_slaves = 0; } + /* We may have recieved updates from clients about their current offset. NOTE: + * this can't be done where the ACK is recieved since failover will disconnect + * our clients. */ + updateFailoverStatus(); + /* Send the invalidation messages to clients participating to the * client side caching protocol in broadcasting (BCAST) mode. */ trackingBroadcastInvalidationMessages(); @@ -2651,6 +2667,13 @@ void initServerConfig(void) { server.repl_backlog_off = 0; server.repl_no_slaves_since = time(NULL); + /* Failover related */ + server.failover_end_time = 0; + server.force_failover = 0; + server.target_replica_host = NULL; + server.target_replica_port = 0; + server.failover_state = NO_FAILOVER; + /* Client output buffer limits */ for (j = 0; j < CLIENT_TYPE_OBUF_COUNT; j++) server.client_obuf_limits[j] = clientBufferLimitsDefaults[j]; @@ -4991,6 +5014,7 @@ sds genRedisInfoString(const char *section) { } } info = sdscatprintf(info, + "master_failover_state:%s\r\n" "master_replid:%s\r\n" "master_replid2:%s\r\n" "master_repl_offset:%lld\r\n" @@ -4999,6 +5023,7 @@ sds genRedisInfoString(const char *section) { "repl_backlog_size:%lld\r\n" "repl_backlog_first_byte_offset:%lld\r\n" "repl_backlog_histlen:%lld\r\n", + getFailoverStateString(), server.replid, server.replid2, server.master_repl_offset, diff --git a/src/server.h b/src/server.h index 349c887cb9..02ba8f7b9d 100644 --- a/src/server.h +++ b/src/server.h @@ -320,6 +320,14 @@ typedef enum { REPL_STATE_CONNECTED, /* Connected to master */ } repl_state; +/* The state of an in progress coordinated failover */ +typedef enum { + NO_FAILOVER = 0, /* No failover in progress */ + FAILOVER_WAIT_FOR_SYNC, /* Waiting for target replica to catch up */ + FAILOVER_IN_PROGRESS /* Waiting for target replica to accept + * PSYNC FAILOVER request. */ +} failover_state; + /* State of slaves from the POV of the master. Used in client->replstate. * In SEND_BULK and ONLINE state the slave receives new updates * in its output queue. In the WAIT_BGSAVE states instead the server is waiting @@ -1577,6 +1585,14 @@ struct redisServer { char *bgsave_cpulist; /* cpu affinity list of bgsave process. */ /* Sentinel config */ struct sentinelConfig *sentinel_config; /* sentinel config to load at startup time. */ + /* Coordinate failover info */ + mstime_t failover_end_time; /* Deadline for failover command. */ + int force_failover; /* If true then failover will be foreced at the + * deadline, otherwise failover is aborted. */ + char *target_replica_host; /* Failover target host. If null during a + * failover then any replica can be used. */ + int target_replica_port; /* Failover target port */ + int failover_state; /* Failover state */ }; typedef struct pubsubPattern { @@ -1997,6 +2013,10 @@ void feedReplicationBacklog(void *ptr, size_t len); void showLatestBacklog(void); void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask); void rdbPipeWriteHandlerConnRemoved(struct connection *conn); +void clearFailoverState(void); +void updateFailoverStatus(void); +void abortFailover(const char *err); +const char *getFailoverStateString(); /* Generic persistence functions */ void startLoadingFile(FILE* fp, char* filename, int rdbflags); @@ -2637,6 +2657,7 @@ void lolwutCommand(client *c); void aclCommand(client *c); void stralgoCommand(client *c); void resetCommand(client *c); +void failoverCommand(client *c); #if defined(__GNUC__) void *calloc(size_t count, size_t size) __attribute__ ((deprecated)); diff --git a/tests/integration/failover.tcl b/tests/integration/failover.tcl new file mode 100644 index 0000000000..c6818700d7 --- /dev/null +++ b/tests/integration/failover.tcl @@ -0,0 +1,290 @@ +start_server {tags {"failover"}} { +start_server {} { +start_server {} { + set node_0 [srv 0 client] + set node_0_host [srv 0 host] + set node_0_port [srv 0 port] + set node_0_pid [srv 0 pid] + + set node_1 [srv -1 client] + set node_1_host [srv -1 host] + set node_1_port [srv -1 port] + set node_1_pid [srv -1 pid] + + set node_2 [srv -2 client] + set node_2_host [srv -2 host] + set node_2_port [srv -2 port] + set node_2_pid [srv -2 pid] + + proc assert_digests_match {n1 n2 n3} { + assert_equal [$n1 debug digest] [$n2 debug digest] + assert_equal [$n2 debug digest] [$n3 debug digest] + } + + test {failover command fails without connected replica} { + catch { $node_0 failover to $node_1_host $node_1_port } err + if {! [string match "ERR*" $err]} { + fail "failover command succeeded when replica not connected" + } + } + + test {setup replication for following tests} { + $node_1 replicaof $node_0_host $node_0_port + $node_2 replicaof $node_0_host $node_0_port + wait_for_sync $node_1 + wait_for_sync $node_2 + } + + test {failover command fails with invalid host} { + catch { $node_0 failover to invalidhost $node_1_port } err + assert_match "ERR*" $err + } + + test {failover command fails with invalid port} { + catch { $node_0 failover to $node_1_host invalidport } err + assert_match "ERR*" $err + } + + test {failover command fails with just force and timeout} { + catch { $node_0 FAILOVER FORCE TIMEOUT 100} err + assert_match "ERR*" $err + } + + test {failover command fails when sent to a replica} { + catch { $node_1 failover to $node_1_host $node_1_port } err + assert_match "ERR*" $err + } + + test {failover command fails with force without timeout} { + catch { $node_0 failover to $node_1_host $node_1_port FORCE } err + assert_match "ERR*" $err + } + + test {failover command to specific replica works} { + set initial_psyncs [s -1 sync_partial_ok] + set initial_syncs [s -1 sync_full] + + # Generate a delta between primary and replica + set load_handler [start_write_load $node_0_host $node_0_port 5] + exec kill -SIGSTOP [srv -1 pid] + wait_for_condition 50 100 { + [s 0 total_commands_processed] > 100 + } else { + fail "Node 0 did not accept writes" + } + exec kill -SIGCONT [srv -1 pid] + + # Execute the failover + $node_0 failover to $node_1_host $node_1_port + + # Wait for failover to end + wait_for_condition 50 100 { + [s 0 master_failover_state] == "no-failover" + } else { + fail "Failover from node 0 to node 1 did not finish" + } + stop_write_load $load_handler + $node_2 replicaof $node_1_host $node_1_port + wait_for_sync $node_0 + wait_for_sync $node_2 + + assert_match *slave* [$node_0 role] + assert_match *master* [$node_1 role] + assert_match *slave* [$node_2 role] + + # We should accept psyncs from both nodes + assert_equal [expr [s -1 sync_partial_ok] - $initial_psyncs] 2 + assert_equal [expr [s -1 sync_full] - $initial_psyncs] 0 + assert_digests_match $node_0 $node_1 $node_2 + } + + test {failover command to any replica works} { + set initial_psyncs [s -2 sync_partial_ok] + set initial_syncs [s -2 sync_full] + + wait_for_ofs_sync $node_1 $node_2 + # We stop node 0 to and make sure node 2 is selected + exec kill -SIGSTOP $node_0_pid + $node_1 set CASE 1 + $node_1 FAILOVER + + # Wait for failover to end + wait_for_condition 50 100 { + [s -1 master_failover_state] == "no-failover" + } else { + fail "Failover from node 1 to node 2 did not finish" + } + exec kill -SIGCONT $node_0_pid + $node_0 replicaof $node_2_host $node_2_port + + wait_for_sync $node_0 + wait_for_sync $node_1 + + assert_match *slave* [$node_0 role] + assert_match *slave* [$node_1 role] + assert_match *master* [$node_2 role] + + # We should accept Psyncs from both nodes + assert_equal [expr [s -2 sync_partial_ok] - $initial_psyncs] 2 + assert_equal [expr [s -1 sync_full] - $initial_psyncs] 0 + assert_digests_match $node_0 $node_1 $node_2 + } + + test {failover to a replica with force works} { + set initial_psyncs [s 0 sync_partial_ok] + set initial_syncs [s 0 sync_full] + + exec kill -SIGSTOP $node_0_pid + # node 0 will never acknowledge this write + $node_2 set case 2 + $node_2 failover to $node_0_host $node_0_port TIMEOUT 100 FORCE + + # Wait for node 0 to give up on sync attempt and start failover + wait_for_condition 50 100 { + [s -2 master_failover_state] == "failover-in-progress" + } else { + fail "Failover from node 2 to node 0 did not timeout" + } + + # Quick check that everyone is a replica, we never want a + # state where there are two masters. + assert_match *slave* [$node_1 role] + assert_match *slave* [$node_2 role] + + exec kill -SIGCONT $node_0_pid + + # Wait for failover to end + wait_for_condition 50 100 { + [s -2 master_failover_state] == "no-failover" + } else { + fail "Failover from node 2 to node 0 did not finish" + } + $node_1 replicaof $node_0_host $node_0_port + + wait_for_sync $node_1 + wait_for_sync $node_2 + + assert_match *master* [$node_0 role] + assert_match *slave* [$node_1 role] + assert_match *slave* [$node_2 role] + + assert_equal [count_log_message -2 "time out exceeded, failing over."] 1 + + # We should accept both psyncs, although this is the condition we might not + # since we didn't catch up. + assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 2 + assert_equal [expr [s 0 sync_full] - $initial_syncs] 0 + assert_digests_match $node_0 $node_1 $node_2 + } + + test {failover with timeout aborts if replica never catches up} { + set initial_psyncs [s 0 sync_partial_ok] + set initial_syncs [s 0 sync_full] + + # Stop replica so it never catches up + exec kill -SIGSTOP [srv -1 pid] + $node_0 SET CASE 1 + + $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 500 + # Wait for failover to end + wait_for_condition 50 20 { + [s 0 master_failover_state] == "no-failover" + } else { + fail "Failover from node_0 to replica did not finish" + } + + exec kill -SIGCONT [srv -1 pid] + + # We need to make sure the nodes actually sync back up + wait_for_ofs_sync $node_0 $node_1 + wait_for_ofs_sync $node_0 $node_2 + + assert_match *master* [$node_0 role] + assert_match *slave* [$node_1 role] + assert_match *slave* [$node_2 role] + + # Since we never caught up, there should be no syncs + assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 0 + assert_equal [expr [s 0 sync_full] - $initial_syncs] 0 + assert_digests_match $node_0 $node_1 $node_2 + } + + test {failovers can be aborted} { + set initial_psyncs [s 0 sync_partial_ok] + set initial_syncs [s 0 sync_full] + + # Stop replica so it never catches up + exec kill -SIGSTOP [srv -1 pid] + $node_0 SET CASE 2 + + $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 60000 + assert_match [s 0 master_failover_state] "waiting-for-sync" + + # Sanity check that read commands are still accepted + $node_0 GET CASE + + $node_0 failover abort + assert_match [s 0 master_failover_state] "no-failover" + + exec kill -SIGCONT [srv -1 pid] + + # Just make sure everything is still synced + wait_for_ofs_sync $node_0 $node_1 + wait_for_ofs_sync $node_0 $node_2 + + assert_match *master* [$node_0 role] + assert_match *slave* [$node_1 role] + assert_match *slave* [$node_2 role] + + # Since we never caught up, there should be no syncs + assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 0 + assert_equal [expr [s 0 sync_full] - $initial_syncs] 0 + assert_digests_match $node_0 $node_1 $node_2 + } + + test {failover aborts if target rejects sync request} { + set initial_psyncs [s 0 sync_partial_ok] + set initial_syncs [s 0 sync_full] + + # We block psync, so the failover will fail + $node_1 acl setuser default -psync + + # We pause the target long enough to send a write command + # during the pause. This write will not be interrupted. + exec kill -SIGSTOP [srv -1 pid] + set rd [redis_deferring_client] + $rd SET FOO BAR + $node_0 failover to $node_1_host $node_1_port + exec kill -SIGCONT [srv -1 pid] + + # Wait for failover to end + wait_for_condition 50 100 { + [s 0 master_failover_state] == "no-failover" + } else { + fail "Failover from node_0 to replica did not finish" + } + + assert_equal [$rd read] "OK" + $rd close + + # restore access to psync + $node_1 acl setuser default +psync + + # We need to make sure the nodes actually sync back up + wait_for_sync $node_1 + wait_for_sync $node_2 + + assert_match *master* [$node_0 role] + assert_match *slave* [$node_1 role] + assert_match *slave* [$node_2 role] + + # We will cycle all of our replicas here and force a psync. + assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 2 + assert_equal [expr [s 0 sync_full] - $initial_syncs] 0 + + assert_equal [count_log_message 0 "Failover target rejected psync request"] 1 + assert_digests_match $node_0 $node_1 $node_2 + } +} +} +} diff --git a/tests/support/util.tcl b/tests/support/util.tcl index 86f2753c22..e693959947 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -86,12 +86,10 @@ proc waitForBgrewriteaof r { } proc wait_for_sync r { - while 1 { - if {[status $r master_link_status] eq "down"} { - after 10 - } else { - break - } + wait_for_condition 50 100 { + [status $r master_link_status] eq "up" + } else { + fail "replica didn't sync in time" } } diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index f947c024a5..2b78547808 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -52,6 +52,7 @@ set ::all_tests { integration/psync2 integration/psync2-reg integration/psync2-pingoff + integration/failover integration/redis-cli integration/redis-benchmark unit/pubsub