From 0d18a1e85fdb9967419dc9f7a5905452d065c84a Mon Sep 17 00:00:00 2001
From: Allen Farris <farrisa@amazon.com>
Date: Thu, 28 Jan 2021 13:18:05 -0800
Subject: [PATCH] implement FAILOVER command (#8315)

Implement FAILOVER command, which coordinates failover
between the server and one of its replicas.
---
 src/networking.c               |   6 +-
 src/replication.c              | 336 ++++++++++++++++++++++++++++++++-
 src/server.c                   |  31 ++-
 src/server.h                   |  21 +++
 tests/integration/failover.tcl | 290 ++++++++++++++++++++++++++++
 tests/support/util.tcl         |  10 +-
 tests/test_helper.tcl          |   1 +
 7 files changed, 679 insertions(+), 16 deletions(-)
 create mode 100644 tests/integration/failover.tcl
diff --git a/src/networking.c b/src/networking.c
index 893359ec53..da611675c9 100644
--- a/src/networking.c
+++ b/src/networking.c
@@ -2680,7 +2680,7 @@ NULL
                                                         c->argc == 4))
     {
         /* CLIENT PAUSE TIMEOUT [WRITE|ALL] */
-        long long duration;
+        mstime_t end;
         int type = CLIENT_PAUSE_ALL;
         if (c->argc == 4) {
             if (!strcasecmp(c->argv[3]->ptr,"write")) {
@@ -2694,9 +2694,9 @@ NULL
             }
         }
 
-        if (getTimeoutFromObjectOrReply(c,c->argv[2],&duration,
+        if (getTimeoutFromObjectOrReply(c,c->argv[2],&end,
             UNIT_MILLISECONDS) != C_OK) return;
-        pauseClients(duration, type);
+        pauseClients(end, type);
         addReply(c,shared.ok);
     } else if (!strcasecmp(c->argv[1]->ptr,"tracking") && c->argc >= 3) {
         /* CLIENT TRACKING (on|off) [REDIRECT <id>] [BCAST] [PREFIX first]
diff --git a/src/replication.c b/src/replication.c
index a9ec30e74e..f23fcb6de8 100644
--- a/src/replication.c
+++ b/src/replication.c
@@ -721,6 +721,36 @@ void syncCommand(client *c) {
     /* ignore SYNC if already slave or in monitor mode */
     if (c->flags & CLIENT_SLAVE) return;
 
+    /* Check if this is a failover request to a replica with the same replid and
+     * become a master if so. */
+    if (c->argc > 3 && !strcasecmp(c->argv[0]->ptr,"psync") && 
+        !strcasecmp(c->argv[3]->ptr,"failover"))
+    {
+        serverLog(LL_WARNING, "Failover request received for replid %s.",
+            (unsigned char *)c->argv[1]->ptr);
+        if (!server.masterhost) {
+            addReplyError(c, "PSYNC FAILOVER can't be sent to a master.");
+            return;
+        }
+
+        if (!strcasecmp(c->argv[1]->ptr,server.replid)) {
+            replicationUnsetMaster();
+            sds client = catClientInfoString(sdsempty(),c);
+            serverLog(LL_NOTICE,
+                "MASTER MODE enabled (failover request from '%s')",client);
+            sdsfree(client);
+        } else {
+            addReplyError(c, "PSYNC FAILOVER replid must match my replid.");
+            return;            
+        }
+    }
+
+    /* Don't let replicas sync with us while we're failing over */
+    if (server.failover_state != NO_FAILOVER) {
+        addReplyError(c,"-NOMASTERLINK Can't SYNC while failing over");
+        return;
+    }
+
     /* Refuse SYNC requests if we are a slave but the link with our master
      * is not ok... */
     if (server.masterhost && server.repl_state != REPL_STATE_CONNECTED) {
@@ -2031,8 +2061,15 @@ int slaveTryPartialResynchronization(connection *conn, int read_reply) {
             memcpy(psync_offset,"-1",3);
         }
 
-        /* Issue the PSYNC command */
-        reply = sendCommand(conn,"PSYNC",psync_replid,psync_offset,NULL);
+        /* Issue the PSYNC command, if this is a master with a failover in
+         * progress then send the failover argument to the replica to cause it
+         * to become a master */
+        if (server.failover_state == FAILOVER_IN_PROGRESS) {
+            reply = sendCommand(conn,"PSYNC",psync_replid,psync_offset,"FAILOVER",NULL);
+        } else {
+            reply = sendCommand(conn,"PSYNC",psync_replid,psync_offset,NULL);
+        }
+
         if (reply != NULL) {
             serverLog(LL_WARNING,"Unable to send PSYNC to master: %s",reply);
             sdsfree(reply);
@@ -2356,6 +2393,7 @@ void syncWithMaster(connection *conn) {
     if (server.repl_state == REPL_STATE_SEND_PSYNC) {
         if (slaveTryPartialResynchronization(conn,0) == PSYNC_WRITE_ERROR) {
             err = sdsnew("Write error sending the PSYNC command.");
+            abortFailover("Write error to failover target");
             goto write_error;
         }
         server.repl_state = REPL_STATE_RECEIVE_PSYNC_REPLY;
@@ -2373,6 +2411,18 @@ void syncWithMaster(connection *conn) {
     psync_result = slaveTryPartialResynchronization(conn,1);
     if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */
 
+    /* Check the status of the planned failover. We expect PSYNC_CONTINUE,
+     * but there is nothing technically wrong with a full resync which
+     * could happen in edge cases. */
+    if (server.failover_state == FAILOVER_IN_PROGRESS) {
+        if (psync_result == PSYNC_CONTINUE || psync_result == PSYNC_FULLRESYNC) {
+            clearFailoverState();
+        } else {
+            abortFailover("Failover target rejected psync request");
+            return;
+        }
+    }
+
     /* If the master is in an transient error, we should try to PSYNC
      * from scratch later, so go to the error path. This happens when
      * the server is loading the dataset or is not connected with its
@@ -2678,6 +2728,11 @@ void replicaofCommand(client *c) {
         return;
     }
 
+    if (server.failover_state != NO_FAILOVER) {
+        addReplyError(c,"REPLICAOF not allowed while failing over.");
+        return;
+    }
+
     /* The special host/port combination "NO" "ONE" turns the instance
      * into a master. Otherwise the new master address is set. */
     if (!strcasecmp(c->argv[1]->ptr,"no") &&
@@ -3211,6 +3266,10 @@ long long replicationGetSlaveOffset(void) {
 void replicationCron(void) {
     static long long replication_cron_loops = 0;
 
+    /* Check failover status first, to see if we need to start
+     * handling the failover. */
+    updateFailoverStatus();
+
     /* Non blocking connection timeout? */
     if (server.masterhost &&
         (server.repl_state == REPL_STATE_CONNECTING ||
@@ -3268,8 +3327,9 @@ void replicationCron(void) {
          * alter the replication offsets of master and slave, and will no longer
          * match the one stored into 'mf_master_offset' state. */
         int manual_failover_in_progress =
-            server.cluster_enabled &&
-            server.cluster->mf_end &&
+            ((server.cluster_enabled &&
+              server.cluster->mf_end) ||
+            server.failover_end_time) &&
             checkClientPauseTimeoutAndReturnIfPaused();
 
         if (!manual_failover_in_progress) {
@@ -3423,3 +3483,271 @@ void replicationStartPendingFork(void) {
         }
     }
 }
+
+/* Find replica at IP:PORT from replica list */
+static client *findReplica(char *host, int port) {
+    listIter li;
+    listNode *ln;
+    client *replica;
+
+    listRewind(server.slaves,&li);
+    while((ln = listNext(&li))) {
+        replica = ln->value;
+        char ip[NET_IP_STR_LEN], *replicaip = replica->slave_ip;
+
+        if (replicaip[0] == '\0') {
+            if (connPeerToString(replica->conn, ip, sizeof(ip), NULL) == -1)
+                continue;
+            replicaip = ip;
+        }
+
+        if (!strcasecmp(host, replicaip) &&
+                (port == replica->slave_listening_port))
+            return replica;
+    }
+
+    return NULL;
+}
+
+const char *getFailoverStateString() {
+    switch(server.failover_state) {
+        case NO_FAILOVER: return "no-failover";
+        case FAILOVER_IN_PROGRESS: return "failover-in-progress";
+        case FAILOVER_WAIT_FOR_SYNC: return "waiting-for-sync";
+        default: return "unknown";
+    }
+}
+
+/* Resets the internal failover configuration, this needs
+ * to be called after a failover either succeeds or fails
+ * as it includes the client unpause. */
+void clearFailoverState() {
+    server.failover_end_time = 0;
+    server.force_failover = 0;
+    zfree(server.target_replica_host);
+    server.target_replica_host = NULL;
+    server.target_replica_port = 0;
+    server.failover_state = NO_FAILOVER;
+    unpauseClients();
+}
+
+/* Abort an ongoing failover if one is going on. */
+void abortFailover(const char *err) {
+    if (server.failover_state == NO_FAILOVER) return;
+
+    if (server.target_replica_host) {
+        serverLog(LL_NOTICE,"FAILOVER to %s:%d aborted: %s",
+            server.target_replica_host,server.target_replica_port,err);  
+    } else {
+        serverLog(LL_NOTICE,"FAILOVER to any replica aborted: %s",err);  
+    }
+    if (server.failover_state == FAILOVER_IN_PROGRESS) {
+        replicationUnsetMaster();
+    }
+    clearFailoverState();
+}
+
+/* 
+ * FAILOVER [TO <HOST> <IP> [FORCE]] [ABORT] [TIMEOUT <timeout>] 
+ * 
+ * This command will coordinate a failover between the master and one
+ * of its replicas. The happy path contains the following steps:
+ * 1) The master will initiate a client pause write, to stop replication
+ * traffic.
+ * 2) The master will periodically check if any of its replicas has
+ * consumed the entire replication stream through acks. 
+ * 3) Once any replica has caught up, the master will itself become a replica.
+ * 4) The master will send a PSYNC FAILOVER request to the target replica, which
+ * if accepted will cause the replica to become the new master and start a sync.
+ * 
+ * FAILOVER ABORT is the only way to abort a failover command, as replicaof
+ * will be disabled. This may be needed if the failover is unable to progress. 
+ * 
+ * The optional arguments [TO <HOST> <IP>] allows designating a specific replica
+ * to be failed over to.
+ * 
+ * FORCE flag indicates that even if the target replica is not caught up,
+ * failover to it anyway. This must be specified with a timeout and a target
+ * HOST and IP.
+ * 
+ * TIMEOUT <timeout> indicates how long should the primary wait for 
+ * a replica to sync up before aborting. If not specified, the failover
+ * will attempt forever and must be manually aborted.
+ */
+void failoverCommand(client *c) {
+    if (server.cluster_enabled) {
+        addReplyError(c,"FAILOVER not allowed in cluster mode. "
+                        "Use CLUSTER FAILOVER command instead.");
+        return;
+    }
+    
+    /* Handle special case for abort */
+    if ((c->argc == 2) && !strcasecmp(c->argv[1]->ptr,"abort")) {
+        if (server.failover_state == NO_FAILOVER) {
+            addReplyError(c, "No failover in progress.");
+            return;
+        }
+
+        abortFailover("Failover manually aborted");
+        addReply(c,shared.ok);
+        return;
+    }
+
+    long timeout_in_ms = 0;
+    int force_flag = 0;
+    long port = 0;
+    char *host = NULL;
+
+    /* Parse the command for syntax and arguments. */
+    for (int j = 1; j < c->argc; j++) {
+        if (!strcasecmp(c->argv[j]->ptr,"timeout") && (j + 1 < c->argc) &&
+            timeout_in_ms == 0)
+        {
+            if (getLongFromObjectOrReply(c,c->argv[j + 1],
+                        &timeout_in_ms,NULL) != C_OK) return;
+            if (timeout_in_ms <= 0) {
+                addReplyError(c,"FAILOVER timeout must be greater than 0");
+                return;
+            }
+            j++;
+        } else if (!strcasecmp(c->argv[j]->ptr,"to") && (j + 2 < c->argc) &&
+            !host) 
+        {
+            if (getLongFromObjectOrReply(c,c->argv[j + 2],&port,NULL) != C_OK)
+                return;
+            host = c->argv[j + 1]->ptr;
+            j += 2;
+        } else if (!strcasecmp(c->argv[j]->ptr,"force") && !force_flag) {
+            force_flag = 1;
+        } else {
+            addReplyErrorObject(c,shared.syntaxerr);
+            return;
+        }
+    }
+
+    if (server.failover_state != NO_FAILOVER) {
+        addReplyError(c,"FAILOVER already in progress.");
+        return;
+    }
+
+    if (server.masterhost) {
+        addReplyError(c,"FAILOVER is not valid when server is a replica.");
+        return;
+    }
+
+    if (listLength(server.slaves) == 0) {
+        addReplyError(c,"FAILOVER requires connected replicas.");
+        return; 
+    }
+
+    if (force_flag && (!timeout_in_ms || !host)) {
+        addReplyError(c,"FAILOVER with force option requires both a timeout "
+            "and target HOST and IP.");
+        return;     
+    }
+
+    /* If a replica address was provided, validate that it is connected. */
+    if (host) {
+        client *replica = findReplica(host, port);
+
+        if (replica == NULL) {
+            addReplyError(c,"FAILOVER target HOST and IP is not "
+                            "a replica.");
+            return;
+        }
+
+        /* Check if requested replica is online */
+        if (replica->replstate != SLAVE_STATE_ONLINE) {
+            addReplyError(c,"FAILOVER target replica is not online.");
+            return;
+        }
+
+        server.target_replica_host = zstrdup(host);
+        server.target_replica_port = port;
+        serverLog(LL_NOTICE,"FAILOVER requested to %s:%ld.",host,port);
+    } else {
+        serverLog(LL_NOTICE,"FAILOVER requested to any replica.");
+    }
+
+    mstime_t now = mstime();
+    if (timeout_in_ms) {
+        server.failover_end_time = now + timeout_in_ms;
+    }
+    
+    server.force_failover = force_flag;
+    server.failover_state = FAILOVER_WAIT_FOR_SYNC;
+    /* Cluster failover will unpause eventually */
+    pauseClients(LLONG_MAX,CLIENT_PAUSE_WRITE);
+    addReply(c,shared.ok);
+}
+
+/* Failover cron function, checks coordinated failover state. 
+ *
+ * Implementation note: The current implementation calls replicationSetMaster()
+ * to start the failover request, this has some unintended side effects if the
+ * failover doesn't work like blocked clients will be unblocked and replicas will
+ * be disconnected. This could be optimized further.
+ */
+void updateFailoverStatus(void) {
+    if (server.failover_state != FAILOVER_WAIT_FOR_SYNC) return;
+    mstime_t now = server.mstime;
+
+    /* Check if failover operation has timed out */
+    if (server.failover_end_time && server.failover_end_time <= now) {
+        if (server.force_failover) {
+            serverLog(LL_NOTICE,
+                "FAILOVER to %s:%d time out exceeded, failing over.",
+                server.target_replica_host, server.target_replica_port);
+            server.failover_state = FAILOVER_IN_PROGRESS;
+            /* If timeout has expired force a failover if requested. */
+            replicationSetMaster(server.target_replica_host,
+                server.target_replica_port);
+            return;
+        } else {
+            /* Force was not requested, so timeout. */
+            abortFailover("Replica never caught up before timeout");
+            return;
+        }
+    }
+
+    /* Check to see if the replica has caught up so failover can start */
+    client *replica = NULL;
+    if (server.target_replica_host) {
+        replica = findReplica(server.target_replica_host, 
+            server.target_replica_port);
+    } else {
+        listIter li;
+        listNode *ln;
+
+        listRewind(server.slaves,&li);
+        /* Find any replica that has matched our repl_offset */
+        while((ln = listNext(&li))) {
+            replica = ln->value;
+            if (replica->repl_ack_off == server.master_repl_offset) {
+                char ip[NET_IP_STR_LEN], *replicaip = replica->slave_ip;
+
+                if (replicaip[0] == '\0') {
+                    if (connPeerToString(replica->conn,ip,sizeof(ip),NULL) == -1)
+                        continue;
+                    replicaip = ip;
+                }
+
+                /* We are now failing over to this specific node */
+                server.target_replica_host = zstrdup(replicaip);
+                server.target_replica_port = replica->slave_listening_port;
+                break;
+            }
+        }
+    }
+
+    /* We've found a replica that is caught up */
+    if (replica && (replica->repl_ack_off == server.master_repl_offset)) {
+        server.failover_state = FAILOVER_IN_PROGRESS;
+        serverLog(LL_NOTICE,
+                "Failover target %s:%d is synced, failing over.",
+                server.target_replica_host, server.target_replica_port);
+        /* Designated replica is caught up, failover to it. */
+        replicationSetMaster(server.target_replica_host,
+            server.target_replica_port);
+    }
+}
diff --git a/src/server.c b/src/server.c
index 8b8da8c898..1166c4b3e7 100644
--- a/src/server.c
+++ b/src/server.c
@@ -752,7 +752,7 @@ struct redisCommand redisCommandTable[] = {
      "admin no-script",
      0,NULL,0,0,0,0,0,0},
 
-    {"psync",syncCommand,3,
+    {"psync",syncCommand,-3,
      "admin no-script",
      0,NULL,0,0,0,0,0,0},
 
@@ -1092,6 +1092,10 @@ struct redisCommand redisCommandTable[] = {
 
     {"reset",resetCommand,1,
      "no-script ok-stale ok-loading fast @connection",
+     0,NULL,0,0,0,0,0,0},
+
+    {"failover",failoverCommand,-1,
+     "admin no-script ok-stale",
      0,NULL,0,0,0,0,0,0}
 };
 
@@ -2185,8 +2189,15 @@ int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
     checkClientPauseTimeoutAndReturnIfPaused();
 
     /* Replication cron function -- used to reconnect to master,
-     * detect transfer failures, start background RDB transfers and so forth. */
-    run_with_period(1000) replicationCron();
+     * detect transfer failures, start background RDB transfers and so forth. 
+     * 
+     * If Redis is trying to failover then run the replication cron faster so
+     * progress on the handshake happens more quickly. */
+    if (server.failover_state != NO_FAILOVER) {
+        run_with_period(100) replicationCron();
+    } else {
+        run_with_period(1000) replicationCron();
+    }
 
     /* Run the Redis Cluster cron. */
     run_with_period(100) {
@@ -2397,6 +2408,11 @@ void beforeSleep(struct aeEventLoop *eventLoop) {
         server.get_ack_from_slaves = 0;
     }
 
+    /* We may have recieved updates from clients about their current offset. NOTE:
+     * this can't be done where the ACK is recieved since failover will disconnect 
+     * our clients. */
+    updateFailoverStatus();
+
     /* Send the invalidation messages to clients participating to the
      * client side caching protocol in broadcasting (BCAST) mode. */
     trackingBroadcastInvalidationMessages();
@@ -2651,6 +2667,13 @@ void initServerConfig(void) {
     server.repl_backlog_off = 0;
     server.repl_no_slaves_since = time(NULL);
 
+    /* Failover related */
+    server.failover_end_time = 0;
+    server.force_failover = 0;
+    server.target_replica_host = NULL;
+    server.target_replica_port = 0;
+    server.failover_state = NO_FAILOVER;
+
     /* Client output buffer limits */
     for (j = 0; j < CLIENT_TYPE_OBUF_COUNT; j++)
         server.client_obuf_limits[j] = clientBufferLimitsDefaults[j];
@@ -4991,6 +5014,7 @@ sds genRedisInfoString(const char *section) {
             }
         }
         info = sdscatprintf(info,
+            "master_failover_state:%s\r\n"
             "master_replid:%s\r\n"
             "master_replid2:%s\r\n"
             "master_repl_offset:%lld\r\n"
@@ -4999,6 +5023,7 @@ sds genRedisInfoString(const char *section) {
             "repl_backlog_size:%lld\r\n"
             "repl_backlog_first_byte_offset:%lld\r\n"
             "repl_backlog_histlen:%lld\r\n",
+            getFailoverStateString(),
             server.replid,
             server.replid2,
             server.master_repl_offset,
diff --git a/src/server.h b/src/server.h
index 349c887cb9..02ba8f7b9d 100644
--- a/src/server.h
+++ b/src/server.h
@@ -320,6 +320,14 @@ typedef enum {
     REPL_STATE_CONNECTED,       /* Connected to master */
 } repl_state;
 
+/* The state of an in progress coordinated failover */
+typedef enum {
+    NO_FAILOVER = 0,        /* No failover in progress */
+    FAILOVER_WAIT_FOR_SYNC, /* Waiting for target replica to catch up */
+    FAILOVER_IN_PROGRESS    /* Waiting for target replica to accept
+                             * PSYNC FAILOVER request. */
+} failover_state;
+
 /* State of slaves from the POV of the master. Used in client->replstate.
  * In SEND_BULK and ONLINE state the slave receives new updates
  * in its output queue. In the WAIT_BGSAVE states instead the server is waiting
@@ -1577,6 +1585,14 @@ struct redisServer {
     char *bgsave_cpulist; /* cpu affinity list of bgsave process. */
     /* Sentinel config */
     struct sentinelConfig *sentinel_config; /* sentinel config to load at startup time. */
+    /* Coordinate failover info */
+    mstime_t failover_end_time; /* Deadline for failover command. */
+    int force_failover; /* If true then failover will be foreced at the
+                         * deadline, otherwise failover is aborted. */
+    char *target_replica_host; /* Failover target host. If null during a
+                                * failover then any replica can be used. */
+    int target_replica_port; /* Failover target port */
+    int failover_state; /* Failover state */
 };
 
 typedef struct pubsubPattern {
@@ -1997,6 +2013,10 @@ void feedReplicationBacklog(void *ptr, size_t len);
 void showLatestBacklog(void);
 void rdbPipeReadHandler(struct aeEventLoop *eventLoop, int fd, void *clientData, int mask);
 void rdbPipeWriteHandlerConnRemoved(struct connection *conn);
+void clearFailoverState(void);
+void updateFailoverStatus(void);
+void abortFailover(const char *err);
+const char *getFailoverStateString();
 
 /* Generic persistence functions */
 void startLoadingFile(FILE* fp, char* filename, int rdbflags);
@@ -2637,6 +2657,7 @@ void lolwutCommand(client *c);
 void aclCommand(client *c);
 void stralgoCommand(client *c);
 void resetCommand(client *c);
+void failoverCommand(client *c);
 
 #if defined(__GNUC__)
 void *calloc(size_t count, size_t size) __attribute__ ((deprecated));
diff --git a/tests/integration/failover.tcl b/tests/integration/failover.tcl
new file mode 100644
index 0000000000..c6818700d7
--- /dev/null
+++ b/tests/integration/failover.tcl
@@ -0,0 +1,290 @@
+start_server {tags {"failover"}} {
+start_server {} {
+start_server {} {
+    set node_0 [srv 0 client]
+    set node_0_host [srv 0 host]
+    set node_0_port [srv 0 port]
+    set node_0_pid [srv 0 pid]
+
+    set node_1 [srv -1 client]
+    set node_1_host [srv -1 host]
+    set node_1_port [srv -1 port]
+    set node_1_pid [srv -1 pid]
+
+    set node_2 [srv -2 client]
+    set node_2_host [srv -2 host]
+    set node_2_port [srv -2 port]
+    set node_2_pid [srv -2 pid]
+
+    proc assert_digests_match {n1 n2 n3} {
+        assert_equal [$n1 debug digest] [$n2 debug digest]
+        assert_equal [$n2 debug digest] [$n3 debug digest]
+    }
+
+    test {failover command fails without connected replica} {
+        catch { $node_0 failover to $node_1_host $node_1_port } err
+        if {! [string match "ERR*" $err]} {
+            fail "failover command succeeded when replica not connected"
+        }
+    }
+
+    test {setup replication for following tests} {
+        $node_1 replicaof $node_0_host $node_0_port
+        $node_2 replicaof $node_0_host $node_0_port
+        wait_for_sync $node_1
+        wait_for_sync $node_2
+    }
+
+    test {failover command fails with invalid host} {
+        catch { $node_0 failover to invalidhost $node_1_port } err
+        assert_match "ERR*" $err
+    }
+
+    test {failover command fails with invalid port} {
+        catch { $node_0 failover to $node_1_host invalidport } err
+        assert_match "ERR*" $err
+    }
+
+    test {failover command fails with just force and timeout} {
+        catch { $node_0 FAILOVER FORCE TIMEOUT 100} err
+        assert_match "ERR*" $err
+    }
+
+    test {failover command fails when sent to a replica} {
+        catch { $node_1 failover to $node_1_host $node_1_port } err
+        assert_match "ERR*" $err
+    }
+
+    test {failover command fails with force without timeout} {
+        catch { $node_0 failover to $node_1_host $node_1_port FORCE } err
+        assert_match "ERR*" $err
+    }
+
+    test {failover command to specific replica works} {
+        set initial_psyncs [s -1 sync_partial_ok]
+        set initial_syncs [s -1 sync_full]
+
+        # Generate a delta between primary and replica
+        set load_handler [start_write_load $node_0_host $node_0_port 5]
+        exec kill -SIGSTOP [srv -1 pid]
+        wait_for_condition 50 100 {
+            [s 0 total_commands_processed] > 100
+        } else {
+            fail "Node 0 did not accept writes"
+        }
+        exec kill -SIGCONT [srv -1 pid]
+
+        # Execute the failover
+        $node_0 failover to $node_1_host $node_1_port
+
+        # Wait for failover to end
+        wait_for_condition 50 100 {
+            [s 0 master_failover_state] == "no-failover"
+        } else {
+            fail "Failover from node 0 to node 1 did not finish"
+        }
+        stop_write_load $load_handler
+        $node_2 replicaof $node_1_host $node_1_port
+        wait_for_sync $node_0
+        wait_for_sync $node_2
+
+        assert_match *slave* [$node_0 role]
+        assert_match *master* [$node_1 role]
+        assert_match *slave* [$node_2 role]
+
+        # We should accept psyncs from both nodes
+        assert_equal [expr [s -1 sync_partial_ok] - $initial_psyncs] 2
+        assert_equal [expr [s -1 sync_full] - $initial_psyncs] 0
+        assert_digests_match $node_0 $node_1 $node_2
+    }
+
+    test {failover command to any replica works} {
+        set initial_psyncs [s -2 sync_partial_ok]
+        set initial_syncs [s -2 sync_full]
+
+        wait_for_ofs_sync $node_1 $node_2
+        # We stop node 0 to and make sure node 2 is selected
+        exec kill -SIGSTOP $node_0_pid
+        $node_1 set CASE 1
+        $node_1 FAILOVER
+
+        # Wait for failover to end
+        wait_for_condition 50 100 {
+            [s -1 master_failover_state] == "no-failover"
+        } else {
+            fail "Failover from node 1 to node 2 did not finish"
+        }
+        exec kill -SIGCONT $node_0_pid
+        $node_0 replicaof $node_2_host $node_2_port
+
+        wait_for_sync $node_0
+        wait_for_sync $node_1
+
+        assert_match *slave* [$node_0 role]
+        assert_match *slave* [$node_1 role]
+        assert_match *master* [$node_2 role]
+
+        # We should accept Psyncs from both nodes
+        assert_equal [expr [s -2 sync_partial_ok] - $initial_psyncs] 2
+        assert_equal [expr [s -1 sync_full] - $initial_psyncs] 0
+        assert_digests_match $node_0 $node_1 $node_2
+    }
+
+    test {failover to a replica with force works} {
+        set initial_psyncs [s 0 sync_partial_ok]
+        set initial_syncs [s 0 sync_full]
+
+        exec kill -SIGSTOP $node_0_pid
+        # node 0 will never acknowledge this write
+        $node_2 set case 2
+        $node_2 failover to $node_0_host $node_0_port TIMEOUT 100 FORCE
+
+        # Wait for node 0 to give up on sync attempt and start failover
+        wait_for_condition 50 100 {
+            [s -2 master_failover_state] == "failover-in-progress"
+        } else {
+            fail "Failover from node 2 to node 0 did not timeout"
+        }
+
+        # Quick check that everyone is a replica, we never want a 
+        # state where there are two masters.
+        assert_match *slave* [$node_1 role]
+        assert_match *slave* [$node_2 role]
+
+        exec kill -SIGCONT $node_0_pid
+
+        # Wait for failover to end
+        wait_for_condition 50 100 {
+            [s -2 master_failover_state] == "no-failover"
+        } else {
+            fail "Failover from node 2 to node 0 did not finish"
+        }
+        $node_1 replicaof $node_0_host $node_0_port
+
+        wait_for_sync $node_1
+        wait_for_sync $node_2
+
+        assert_match *master* [$node_0 role]
+        assert_match *slave* [$node_1 role]
+        assert_match *slave* [$node_2 role]
+
+        assert_equal [count_log_message -2 "time out exceeded, failing over."] 1
+
+        # We should accept both psyncs, although this is the condition we might not
+        # since we didn't catch up.
+        assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 2
+        assert_equal [expr [s 0 sync_full] - $initial_syncs] 0
+        assert_digests_match $node_0 $node_1 $node_2
+    }
+
+    test {failover with timeout aborts if replica never catches up} {
+        set initial_psyncs [s 0 sync_partial_ok]
+        set initial_syncs [s 0 sync_full]
+
+        # Stop replica so it never catches up
+        exec kill -SIGSTOP [srv -1 pid]
+        $node_0 SET CASE 1
+        
+        $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 500
+        # Wait for failover to end
+        wait_for_condition 50 20 {
+            [s 0 master_failover_state] == "no-failover"
+        } else {
+            fail "Failover from node_0 to replica did not finish"
+        }
+
+        exec kill -SIGCONT [srv -1 pid]
+
+        # We need to make sure the nodes actually sync back up
+        wait_for_ofs_sync $node_0 $node_1
+        wait_for_ofs_sync $node_0 $node_2
+
+        assert_match *master* [$node_0 role]
+        assert_match *slave* [$node_1 role]
+        assert_match *slave* [$node_2 role]
+
+        # Since we never caught up, there should be no syncs
+        assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 0
+        assert_equal [expr [s 0 sync_full] - $initial_syncs] 0
+        assert_digests_match $node_0 $node_1 $node_2
+    }
+
+    test {failovers can be aborted} {
+        set initial_psyncs [s 0 sync_partial_ok]
+        set initial_syncs [s 0 sync_full]
+    
+        # Stop replica so it never catches up
+        exec kill -SIGSTOP [srv -1 pid]
+        $node_0 SET CASE 2
+        
+        $node_0 failover to [srv -1 host] [srv -1 port] TIMEOUT 60000
+        assert_match [s 0 master_failover_state] "waiting-for-sync"
+
+        # Sanity check that read commands are still accepted
+        $node_0 GET CASE
+
+        $node_0 failover abort
+        assert_match [s 0 master_failover_state] "no-failover"
+
+        exec kill -SIGCONT [srv -1 pid]
+
+        # Just make sure everything is still synced
+        wait_for_ofs_sync $node_0 $node_1
+        wait_for_ofs_sync $node_0 $node_2
+
+        assert_match *master* [$node_0 role]
+        assert_match *slave* [$node_1 role]
+        assert_match *slave* [$node_2 role]
+
+        # Since we never caught up, there should be no syncs
+        assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 0
+        assert_equal [expr [s 0 sync_full] - $initial_syncs] 0
+        assert_digests_match $node_0 $node_1 $node_2
+    }
+
+    test {failover aborts if target rejects sync request} {
+        set initial_psyncs [s 0 sync_partial_ok]
+        set initial_syncs [s 0 sync_full]
+
+        # We block psync, so the failover will fail
+        $node_1 acl setuser default -psync
+
+        # We pause the target long enough to send a write command
+        # during the pause. This write will not be interrupted.
+        exec kill -SIGSTOP [srv -1 pid]
+        set rd [redis_deferring_client]
+        $rd SET FOO BAR
+        $node_0 failover to $node_1_host $node_1_port
+        exec kill -SIGCONT [srv -1 pid]
+
+        # Wait for failover to end
+        wait_for_condition 50 100 {
+            [s 0 master_failover_state] == "no-failover"
+        } else {
+            fail "Failover from node_0 to replica did not finish"
+        }
+
+        assert_equal [$rd read] "OK"
+        $rd close
+
+        # restore access to psync
+        $node_1 acl setuser default +psync
+
+        # We need to make sure the nodes actually sync back up
+        wait_for_sync $node_1
+        wait_for_sync $node_2
+
+        assert_match *master* [$node_0 role]
+        assert_match *slave* [$node_1 role]
+        assert_match *slave* [$node_2 role]
+
+        # We will cycle all of our replicas here and force a psync.
+        assert_equal [expr [s 0 sync_partial_ok] - $initial_psyncs] 2
+        assert_equal [expr [s 0 sync_full] - $initial_syncs] 0
+
+        assert_equal [count_log_message 0 "Failover target rejected psync request"] 1
+        assert_digests_match $node_0 $node_1 $node_2
+    }
+}
+}
+}
diff --git a/tests/support/util.tcl b/tests/support/util.tcl
index 86f2753c22..e693959947 100644
--- a/tests/support/util.tcl
+++ b/tests/support/util.tcl
@@ -86,12 +86,10 @@ proc waitForBgrewriteaof r {
 }
 
 proc wait_for_sync r {
-    while 1 {
-        if {[status $r master_link_status] eq "down"} {
-            after 10
-        } else {
-            break
-        }
+    wait_for_condition 50 100 {
+        [status $r master_link_status] eq "up"
+    } else {
+        fail "replica didn't sync in time"
     }
 }
 
diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl
index f947c024a5..2b78547808 100644
--- a/tests/test_helper.tcl
+++ b/tests/test_helper.tcl
@@ -52,6 +52,7 @@ set ::all_tests {
     integration/psync2
     integration/psync2-reg
     integration/psync2-pingoff
+    integration/failover
     integration/redis-cli
     integration/redis-benchmark
     unit/pubsub