Trigger manual failover on SIGTERM / shutdown to cluster primary #1091

Open

Wants to merge 31 commits into base: unstable
Changes from 1 commit

Commits (31)
6ab8888
Trigger manual failover on SIGTERM / shutdown to cluster primary
enjoy-binbin Sep 30, 2024
4b49f03
fix typo
enjoy-binbin Sep 30, 2024
f9ca731
Merge remote-tracking branch 'upstream/unstable' into shutdown_failover
enjoy-binbin Oct 6, 2024
df0ef8d
add comment in the test
enjoy-binbin Oct 6, 2024
594fd5a
Merge remote-tracking branch 'upstream/unstable' into shutdown_failover
enjoy-binbin Oct 21, 2024
519eb2a
removing mf_is_primary_failover
enjoy-binbin Oct 21, 2024
32043dd
try to fix test
enjoy-binbin Oct 27, 2024
e7b33fa
try to stable the test
enjoy-binbin Oct 27, 2024
d6649e5
Move the logic to clusterHandleServerShutdown
enjoy-binbin Oct 27, 2024
64831c9
Adjust the tests
enjoy-binbin Oct 28, 2024
b06a8c4
Do shutdown failover only when offset is match
enjoy-binbin Oct 28, 2024
5f7b429
Merge remote-tracking branch 'upstream/unstable' into shutdown_failover
enjoy-binbin Jan 8, 2025
e56a360
remove count++ and fix confilct
enjoy-binbin Jan 8, 2025
0ccc4e4
Merge remote-tracking branch 'upstream/unstable' into shutdown_failover
enjoy-binbin Jan 23, 2025
c9bfd69
CLUSTER FAILOVER replicaid node-id
enjoy-binbin Jan 23, 2025
c8037a1
code review v1
enjoy-binbin Jan 24, 2025
d70036b
Merge remote-tracking branch 'upstream/unstable' into shutdown_failover
enjoy-binbin Feb 5, 2025
7d55db6
code review: remove error reply check, add cross-version test
enjoy-binbin Feb 5, 2025
4d5da8a
fix format and add log to debug the test
enjoy-binbin Feb 7, 2025
a1f957c
Update valkey.conf
enjoy-binbin Feb 7, 2025
37147e8
minor fixes in regexp, avoid matching the second line
enjoy-binbin Feb 7, 2025
bf60ed6
Merge remote-tracking branch 'upstream/unstable' into shutdown_failover
enjoy-binbin Feb 7, 2025
27b6f6d
code review from Ping
enjoy-binbin Feb 11, 2025
61dd999
Merge remote-tracking branch 'upstream/unstable' into shutdown_failover
enjoy-binbin Feb 11, 2025
8423921
Fix test
enjoy-binbin Feb 11, 2025
ed8c9bb
Merge remote-tracking branch 'upstream/unstable' into shutdown_failover
enjoy-binbin Feb 25, 2025
9e00910
change to use replconf set-cluster-node-id
enjoy-binbin Feb 25, 2025
8cba555
Update valkey.conf
enjoy-binbin Feb 25, 2025
ade48cb
Change nodeid to sds
enjoy-binbin Feb 25, 2025
6b5cf7f
code review from Ping, add the assert <= 128
enjoy-binbin Feb 25, 2025
e3fdb7c
Fix the index
enjoy-binbin Feb 25, 2025
26 changes: 21 additions & 5 deletions src/cluster_legacy.c
@@ -3506,10 +3506,14 @@ int clusterProcessPacket(clusterLink *link) {
* a non zero number of slots, and its currentEpoch is greater or
* equal to epoch where this node started the election. */
if (clusterNodeIsVotingPrimary(sender) && sender_claimed_current_epoch >= server.cluster->failover_auth_epoch) {
server.cluster->failover_auth_count++;
/* Maybe we reached a quorum here, set a flag to make sure
* we check ASAP. */
clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
/* todo: see if this is needed. */
/* My primary has already voted for me, so don't count it anymore. */
if (!(sender == myself->replicaof && server.cluster->mf_is_primary_failover)) {
server.cluster->failover_auth_count++;
/* Maybe we reached a quorum here, set a flag to make sure
* we check ASAP. */
clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
}
}
} else if (type == CLUSTERMSG_TYPE_MFSTART) {
/* This message is acceptable only if I'm a primary and the sender
@@ -4592,6 +4596,11 @@ void clusterHandleReplicaFailover(void) {
if (server.cluster->mf_end) {
server.cluster->failover_auth_time = mstime();
server.cluster->failover_auth_rank = 0;
/* todo: see if this is needed. */
/* This is a failover triggered by my primary, let's count its vote. */
if (server.cluster->mf_is_primary_failover) {
server.cluster->failover_auth_count++;
}
Contributor

It's not very good that we add some conditional logic about how we are counting the votes. It makes the voting algorithm more complex to analyze.

I understand we need this special case if the primary exits immediately after sending CLUSTER FAILOVER FORCE. Maybe we can avoid it if the primary waits for the failover auth requests and actually votes, before the primary shuts down..?

Member Author

Yes, it is not very good, so I added a todo. I am also not sure about this logic; in fact we can skip it and the failover will still work, since I think we will still have enough votes. I added it just to open the discussion, so we can decide whether we should handle this case.
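
For context, a minimal standalone sketch (not part of the patch) of the quorum arithmetic being discussed, assuming the usual (voting primaries / 2) + 1 majority rule: in a three-primary cluster the replica needs two votes, so the election can still succeed even when the shutting-down primary's vote is not pre-counted.

#include <stdio.h>

/* Sketch only: quorum needed to win a failover election, assuming a
 * majority of voting primaries is required. */
static int needed_quorum(int voting_primaries) {
    return voting_primaries / 2 + 1;
}

int main(void) {
    int voting_primaries = 3;                             /* hypothetical cluster size */
    int quorum = needed_quorum(voting_primaries);         /* 2 */
    int reachable_votes = voting_primaries - 1;           /* the shutting-down primary does not vote */
    printf("quorum=%d, reachable votes=%d -> %s\n", quorum, reachable_votes,
           reachable_votes >= quorum ? "election still succeeds" : "election stalls");
    return 0;
}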

clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
}
serverLog(LL_NOTICE,
@@ -4816,6 +4825,7 @@ void resetManualFailover(void) {
}
server.cluster->mf_end = 0; /* No manual failover in progress. */
server.cluster->mf_can_start = 0;
server.cluster->mf_is_primary_failover = 0;
server.cluster->mf_replica = NULL;
server.cluster->mf_primary_offset = -1;
}
@@ -4844,6 +4854,7 @@ void clusterHandleManualFailover(void) {
/* Our replication offset matches the primary replication offset
* announced after clients were paused. We can start the failover. */
server.cluster->mf_can_start = 1;
server.cluster->mf_is_primary_failover = 0;
serverLog(LL_NOTICE, "All primary replication stream processed, "
"manual failover can start.");
clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
@@ -6730,8 +6741,13 @@ int clusterCommandSpecial(client *c) {
/* If this is a forced failover, we don't need to talk with our
* primary to agree about the offset. We just failover taking over
* it without coordination. */
serverLog(LL_NOTICE, "Forced failover user request accepted (user request from '%s').", client);
if (c == server.primary) {
serverLog(LL_NOTICE, "Forced failover primary request accepted (primary request from '%s').", client);
} else {
serverLog(LL_NOTICE, "Forced failover user request accepted (user request from '%s').", client);
}
server.cluster->mf_can_start = 1;
server.cluster->mf_is_primary_failover = 1;
/* We can start a manual failover as soon as possible, setting a flag
* here so that we don't need to waiting for the cron to kick in. */
clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER);
1 change: 1 addition & 0 deletions src/cluster_legacy.h
@@ -393,6 +393,7 @@ struct clusterState {
or -1 if still not received. */
int mf_can_start; /* If non-zero signal that the manual failover
can start requesting primary vote. */
int mf_is_primary_failover; /* The manual failover was triggered by my primary. */
/* The following fields are used by primaries to take state on elections. */
uint64_t lastVoteEpoch; /* Epoch of the last vote granted. */
int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */
1 change: 1 addition & 0 deletions src/config.c
@@ -3135,6 +3135,7 @@ standardConfig static_configs[] = {
createBoolConfig("enable-debug-assert", NULL, IMMUTABLE_CONFIG | HIDDEN_CONFIG, server.enable_debug_assert, 0, NULL, NULL),
createBoolConfig("cluster-slot-stats-enabled", NULL, MODIFIABLE_CONFIG, server.cluster_slot_stats_enabled, 0, NULL, NULL),
createBoolConfig("hide-user-data-from-log", NULL, MODIFIABLE_CONFIG, server.hide_user_data_from_log, 1, NULL, NULL),
createBoolConfig("auto-failover-on-shutdown", NULL, MODIFIABLE_CONFIG, server.auto_failover_on_shutdown, 0, NULL, NULL),

/* String Configs */
createStringConfig("aclfile", NULL, IMMUTABLE_CONFIG, ALLOW_EMPTY_STRING, server.acl_filename, "", NULL, NULL),
24 changes: 24 additions & 0 deletions src/server.c
@@ -4306,6 +4306,7 @@ int finishShutdown(void) {
int force = server.shutdown_flags & SHUTDOWN_FORCE;

/* Log a warning for each replica that is lagging. */
client *best_replica = NULL;
listIter replicas_iter;
listNode *replicas_list_node;
int num_replicas = 0, num_lagging_replicas = 0;
@@ -4320,6 +4321,14 @@
replicationGetReplicaName(replica), server.primary_repl_offset - replica->repl_ack_off, lag,
replstateToString(replica->repl_state));
}
/* Find the best replica, that is, the replica with the largest offset. */
if (replica->repl_state == REPLICA_STATE_ONLINE) {
if (best_replica == NULL) {
best_replica = replica;
} else if (replica->repl_ack_off > best_replica->repl_ack_off) {
best_replica = replica;
}
}
}
if (num_replicas > 0) {
serverLog(LL_NOTICE, "%d of %d replicas are in sync when shutting down.", num_replicas - num_lagging_replicas,
@@ -4419,6 +4428,21 @@
* send them pending writes. */
flushReplicasOutputBuffers();

if (server.auto_failover_on_shutdown && server.cluster_enabled && best_replica) {
/* Sending a CLUSTER FAILOVER FORCE to the best replica. */
const char *buf = "*3\r\n$7\r\nCLUSTER\r\n$8\r\nFAILOVER\r\n$5\r\nFORCE\r\n";
if (connWrite(best_replica->conn, buf, strlen(buf)) == (int)strlen(buf)) {
serverLog(LL_NOTICE, "Sending CLUSTER FAILOVER FORCE to replica %s succeeded.",
replicationGetReplicaName(best_replica));
} else {
serverLog(LL_WARNING, "Failed to send CLUSTER FAILOVER FORCE to replica: %s", strerror(errno));
}
}

if (server.auto_failover_on_shutdown && server.cluster_enabled && !best_replica) {
serverLog(LL_WARNING, "Unable to find a replica to perform an auto failover on shutdown.");
}

/* Close the listening sockets. Apparently this allows faster restarts. */
closeListeningSockets(1);

1 change: 1 addition & 0 deletions src/server.h
@@ -2169,6 +2169,7 @@ struct valkeyServer {
unsigned long cluster_blacklist_ttl; /* Duration in seconds that a node is denied re-entry into
* the cluster after it is forgotten with CLUSTER FORGET. */
int cluster_slot_stats_enabled; /* Cluster slot usage statistics tracking enabled. */
int auto_failover_on_shutdown; /* Trigger manual failover on shutdown to primary. */
/* Debug config that goes along with cluster_drop_packet_filter. When set, the link is closed on packet drop. */
uint32_t debug_cluster_close_link_on_packet_drop : 1;
sds cached_cluster_slot_info[CACHE_CONN_TYPE_MAX]; /* Index in array is a bitwise or of CACHE_CONN_TYPE_* */
18 changes: 18 additions & 0 deletions tests/support/util.tcl
@@ -126,6 +126,24 @@ proc wait_replica_online r {
}
}

proc get_replica_acked_offset {primary replica_ip replica_port} {
set infostr [$primary info replication]
if {[regexp -lineanchor "^slave\\d:ip=$replica_ip,port=$replica_port,.*,offset=(\\d+).*\r\n" $infostr _ value]} {
return $value
}
}

proc wait_replica_acked_ofs {primary replica_ip replica_port} {
$primary config set repl-ping-replica-period 3600
wait_for_condition 50 100 {
[status $primary master_repl_offset] eq [get_replica_acked_offset $primary $replica_ip $replica_port]
} else {
puts "INFO REPLICATION: [$primary info replication]"
fail "replica acked offset didn't match in time"
}
$primary config set repl-ping-replica-period 10
}

proc wait_for_ofs_sync {r1 r2} {
wait_for_condition 50 100 {
[status $r1 master_repl_offset] eq [status $r2 master_repl_offset]
68 changes: 68 additions & 0 deletions tests/unit/cluster/auto-failover-on-shutdown.tcl
@@ -0,0 +1,68 @@
proc shutdown_on_how {srv_id how} {
if {$how == "shutdown"} {
catch {R $srv_id shutdown nosave}
} elseif {$how == "sigterm"} {
exec kill -SIGTERM [s -$srv_id process_id]
}
}

proc test_main {how} {
test "auto-failover-on-shutdown will always pick a best replica and send CLUSTER FAILOVER - $how" {
set primary [srv 0 client]
set replica1 [srv -3 client]
set replica1_pid [s -3 process_id]
set replica2 [srv -6 client]
set replica2_ip [srv -6 host]
set replica2_port [srv -6 port]

# Pause a replica so it has no chance to catch up with the offset.
pause_process $replica1_pid

# The primary writes some data to increase the offset.
for {set i 0} {$i < 10} {incr i} {
$primary incr key_991803
}

# Wait for replica2 to catch up with the offset.
wait_replica_acked_ofs $primary $replica2_ip $replica2_port

# Shutdown the primary.
shutdown_on_how 0 $how

# Wait for replica2 to become a primary.
wait_for_condition 1000 50 {
[s -6 role] eq {master}
} else {
puts "s -6 role: [s -6 role]"
fail "Failover does not happened"
}

# Make sure that the expected logs are printed.
verify_log_message 0 "*Sending CLUSTER FAILOVER FORCE to replica*" 0
verify_log_message -6 "*Forced failover primary request accepted*" 0

resume_process $replica1_pid
}

test "Unable to find a replica to perform an auto failover - $how" {
set primary [srv -6 client]
set replica1 [srv -3 client]
set replica1_pid [s -3 process_id]

pause_process $replica1_pid

$primary client kill type replica
shutdown_on_how 6 $how
wait_for_log_messages -6 {"*Unable to find a replica to perform an auto failover on shutdown*"} 0 1000 10

resume_process $replica1_pid
}
}

start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000 shutdown-timeout 0 auto-failover-on-shutdown yes}} {
test_main "shutdown"
}

start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000 shutdown-timeout 0 auto-failover-on-shutdown yes}} {
test_main "sigterm"
}
4 changes: 4 additions & 0 deletions valkey.conf
@@ -1594,6 +1594,10 @@ aof-timestamp-enabled no
# shutdown-on-sigint default
# shutdown-on-sigterm default

# TODO
#
# auto-failover-on-shutdown no

################ NON-DETERMINISTIC LONG BLOCKING COMMANDS #####################

# Maximum time in milliseconds for EVAL scripts, functions and in some cases