diff --git a/src/commands.def b/src/commands.def index ecc77126af..a6873cebce 100644 --- a/src/commands.def +++ b/src/commands.def @@ -5577,7 +5577,9 @@ struct COMMAND_ARG SENTINEL_DEBUG_Args[] = { #ifndef SKIP_CMD_HISTORY_TABLE /* SENTINEL FAILOVER history */ -#define SENTINEL_FAILOVER_History NULL +commandHistory SENTINEL_FAILOVER_History[] = { +{"8.1.0","`COORDINATED` option."}, +}; #endif #ifndef SKIP_CMD_TIPS_TABLE @@ -5593,6 +5595,7 @@ struct COMMAND_ARG SENTINEL_DEBUG_Args[] = { /* SENTINEL FAILOVER argument table */ struct COMMAND_ARG SENTINEL_FAILOVER_Args[] = { {MAKE_ARG("primary-name",ARG_TYPE_STRING,-1,NULL,NULL,NULL,CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("coordinated",ARG_TYPE_PURE_TOKEN,-1,"COORDINATED",NULL,"8.1.0",CMD_ARG_OPTIONAL,0,NULL)}, }; /********** SENTINEL FLUSHCONFIG ********************/ @@ -6055,7 +6058,7 @@ struct COMMAND_STRUCT SENTINEL_Subcommands[] = { {MAKE_CMD("ckquorum","Checks for a Sentinel quorum.",NULL,"2.8.4",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_CKQUORUM_History,0,SENTINEL_CKQUORUM_Tips,0,sentinelCommand,3,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_CKQUORUM_Keyspecs,0,NULL,1),.args=SENTINEL_CKQUORUM_Args}, {MAKE_CMD("config","Configures Sentinel.","O(N) when N is the number of configuration parameters provided","6.2.0",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_CONFIG_History,1,SENTINEL_CONFIG_Tips,0,sentinelCommand,-4,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_CONFIG_Keyspecs,0,NULL,1),.args=SENTINEL_CONFIG_Args}, {MAKE_CMD("debug","Lists or updates the current configurable parameters of Sentinel.","O(N) where N is the number of configurable parameters","7.0.0",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_DEBUG_History,0,SENTINEL_DEBUG_Tips,0,sentinelCommand,-2,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_DEBUG_Keyspecs,0,NULL,1),.args=SENTINEL_DEBUG_Args}, -{MAKE_CMD("failover","Forces a Sentinel failover.",NULL,"2.8.4",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_FAILOVER_History,0,SENTINEL_FAILOVER_Tips,0,sentinelCommand,3,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_FAILOVER_Keyspecs,0,NULL,1),.args=SENTINEL_FAILOVER_Args}, +{MAKE_CMD("failover","Forces a Sentinel failover.",NULL,"2.8.4",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_FAILOVER_History,1,SENTINEL_FAILOVER_Tips,0,sentinelCommand,-3,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_FAILOVER_Keyspecs,0,NULL,2),.args=SENTINEL_FAILOVER_Args}, {MAKE_CMD("flushconfig","Rewrites the Sentinel configuration file.","O(1)","2.8.4",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_FLUSHCONFIG_History,0,SENTINEL_FLUSHCONFIG_Tips,0,sentinelCommand,2,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_FLUSHCONFIG_Keyspecs,0,NULL,0)}, {MAKE_CMD("get-master-addr-by-name","Returns the port and address of a primary instance.","O(1)","2.8.4",CMD_DOC_DEPRECATED,"`SENTINEL GET-PRIMARY-ADDR-BY-NAME`","8.0.0","sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_GET_MASTER_ADDR_BY_NAME_History,0,SENTINEL_GET_MASTER_ADDR_BY_NAME_Tips,0,sentinelCommand,3,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_GET_MASTER_ADDR_BY_NAME_Keyspecs,0,NULL,1),.args=SENTINEL_GET_MASTER_ADDR_BY_NAME_Args}, {MAKE_CMD("get-primary-addr-by-name","Returns the port and address of a primary 
instance.","O(1)","8.0.0",CMD_DOC_NONE,NULL,NULL,"sentinel",COMMAND_GROUP_SENTINEL,SENTINEL_GET_PRIMARY_ADDR_BY_NAME_History,0,SENTINEL_GET_PRIMARY_ADDR_BY_NAME_Tips,0,sentinelCommand,3,CMD_ADMIN|CMD_SENTINEL|CMD_ONLY_SENTINEL,0,SENTINEL_GET_PRIMARY_ADDR_BY_NAME_Keyspecs,0,NULL,1),.args=SENTINEL_GET_PRIMARY_ADDR_BY_NAME_Args}, diff --git a/src/commands/sentinel-failover.json b/src/commands/sentinel-failover.json index 8e7c3ea3e7..a35577a0e6 100644 --- a/src/commands/sentinel-failover.json +++ b/src/commands/sentinel-failover.json @@ -3,9 +3,15 @@ "summary": "Forces a Sentinel failover.", "group": "sentinel", "since": "2.8.4", - "arity": 3, + "arity": -3, "container": "SENTINEL", "function": "sentinelCommand", + "history": [ + [ + "8.1.0", + "`COORDINATED` option." + ] + ], "command_flags": [ "ADMIN", "SENTINEL", @@ -13,12 +19,19 @@ ], "reply_schema": { "const": "OK", - "description": "Force a fail over as if the primary was not reachable, and without asking for agreement to other Sentinels." + "description": "Force a fail over of the primary. Without options, the fail over is executed immediately as if the primary was not reachable. Using `COORDINATED`, fail over seeking agreement from other Sentinels and using coordinated fail over." }, "arguments": [ { "name": "primary-name", "type": "string" + }, + { + "token": "COORDINATED", + "name": "coordinated", + "type": "pure-token", + "optional": true, + "since": "8.1.0" } ] } diff --git a/src/sentinel.c b/src/sentinel.c index ccd3ccbdca..81430a17c8 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -77,6 +77,7 @@ typedef struct sentinelAddr { #define SRI_FORCE_FAILOVER (1 << 11) /* Force failover with primary up. */ #define SRI_SCRIPT_KILL_SENT (1 << 12) /* SCRIPT KILL already sent on -BUSY */ #define SRI_PRIMARY_REBOOT (1 << 13) /* Primary was detected as rebooting */ +#define SRI_COORD_FAILOVER (1 << 14) /* Coordinated failover with primary up. */ /* Note: when adding new flags, please check the flags section in addReplySentinelValkeyInstance. */ /* Note: times are in milliseconds. */ @@ -138,6 +139,14 @@ static mstime_t sentinel_default_failover_timeout = 60 * 3 * 1000; #define SENTINEL_SIMFAILURE_CRASH_AFTER_ELECTION (1 << 0) #define SENTINEL_SIMFAILURE_CRASH_AFTER_PROMOTION (1 << 1) +/* The failover status of a monitored Valkey instance */ +#define SENTINEL_MONITORED_INSTANCE_FAILOVER_NS 0 /* Not supported*/ +#define SENTINEL_MONITORED_INSTANCE_NO_FAILOVER 1 /* Failover supported, no failover ongoing */ +#define SENTINEL_MONITORED_INSTANCE_FAILOVER 2 /* Failover supported, failover ongoing */ + +/* sentinelAskPrimaryStateToOtherSentinels flags */ +#define SENTINEL_ASK_FORCED (1 << 0) + /* The link to a sentinelValkeyInstance. When we have the same set of Sentinels * monitoring many primaries, we have different instances representing the * same Sentinels, one per primary, and we need to share the hiredis connections @@ -212,6 +221,9 @@ typedef struct sentinelValkeyInstance { mstime_t role_reported_time; mstime_t replica_conf_change_time; /* Last time replica primary addr changed. */ + int monitored_instance_failover_state; /* Whether the Valkey instance supports/reports a failover in "master_failover_state" */ + mstime_t monitored_instance_failover_change_time; /* Last time monitored_instance_failover_state changed. */ + /* Primary specific. */ dict *sentinels; /* Other sentinels monitoring the same primary. */ dict *replicas; /* Replicas for this primary instance. 
*/ @@ -403,6 +415,7 @@ sentinelValkeyInstance *sentinelSelectReplica(sentinelValkeyInstance *primary); void sentinelScheduleScriptExecution(char *path, ...); void sentinelStartFailover(sentinelValkeyInstance *primary); void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata); +int sentinelKillClients(sentinelValkeyInstance *ri); int sentinelSendReplicaOf(sentinelValkeyInstance *ri, const sentinelAddr *addr); char *sentinelVoteLeader(sentinelValkeyInstance *primary, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch); int sentinelFlushConfig(void); @@ -411,6 +424,7 @@ int sentinelSendPing(sentinelValkeyInstance *ri); int sentinelForceHelloUpdateForPrimary(sentinelValkeyInstance *primary); sentinelValkeyInstance *getSentinelValkeyInstanceByAddrAndRunID(dict *instances, char *ip, int port, char *runid); void sentinelSimFailureCrash(void); +void sentinelAskPrimaryStateToOtherSentinels(sentinelValkeyInstance *primary, int flags); /* ========================= Dictionary types =============================== */ @@ -1346,6 +1360,9 @@ sentinelValkeyInstance *createSentinelValkeyInstance(char *name, ri->role_reported_time = mstime(); ri->replica_conf_change_time = mstime(); + ri->monitored_instance_failover_state = SENTINEL_MONITORED_INSTANCE_FAILOVER_NS; + ri->monitored_instance_failover_change_time = mstime(); + /* Add into the right table. */ dictAdd(table, ri->name, ri); return ri; @@ -2538,6 +2555,15 @@ void sentinelRefreshInstanceInfo(sentinelValkeyInstance *ri, const char *info) { /* replica_announced: */ if (sdslen(l) >= 18 && !memcmp(l, "replica_announced:", 18)) ri->replica_announced = atoi(l + 18); } + + /* master_failover_state: */ + if (sdslen(l) >= 22 && !memcmp(l, "master_failover_state:", 22)) { + int failover_state = (sdslen(l) >= 33 && !memcmp(l, "master_failover_state:no-failover", 33)) ? SENTINEL_MONITORED_INSTANCE_NO_FAILOVER : SENTINEL_MONITORED_INSTANCE_FAILOVER; + if (failover_state != ri->monitored_instance_failover_state) { + ri->monitored_instance_failover_state = failover_state; + ri->monitored_instance_failover_change_time = mstime(); + } + } } ri->info_refresh = mstime(); sdsfreesplitres(lines, numlines); @@ -2585,6 +2611,10 @@ void sentinelRefreshInstanceInfo(sentinelValkeyInstance *ri, const char *info) { sentinelFlushConfig(); sentinelEvent(LL_WARNING, "+promoted-slave", ri, "%@"); if (sentinel.simfailure_flags & SENTINEL_SIMFAILURE_CRASH_AFTER_PROMOTION) sentinelSimFailureCrash(); + if (ri->primary->flags & SRI_COORD_FAILOVER) { + sentinelKillClients(ri->primary); + sentinelKillClients(ri); + } sentinelEvent(LL_WARNING, "+failover-state-reconf-slaves", ri->primary, "%@"); sentinelCallClientReconfScript(ri->primary, SENTINEL_LEADER, "start", ri->primary->addr, ri->addr); sentinelForceHelloUpdateForPrimary(ri->primary); @@ -2602,6 +2632,18 @@ void sentinelRefreshInstanceInfo(sentinelValkeyInstance *ri, const char *info) { } } + /* Handle replicas stuck in failover for too long. 
*/ + if ((ri->flags & SRI_REPLICA) && role == SRI_REPLICA && ri->monitored_instance_failover_state == SENTINEL_MONITORED_INSTANCE_FAILOVER) { + mstime_t wait_time = ri->primary->failover_timeout; + + /* Make sure the primary is sane before reconfiguring this instance */ + if (sentinelPrimaryLooksSane(ri->primary) && sentinelValkeyInstanceNoDownFor(ri, wait_time) && + mstime() - ri->monitored_instance_failover_change_time > wait_time) { + int retval = sentinelSendReplicaOf(ri, ri->primary->addr); + if (retval == C_OK) sentinelEvent(LL_NOTICE, "+fix-slave-config", ri, "%@"); + } + } + /* Handle replicas replicating to a different primary address. */ if ((ri->flags & SRI_REPLICA) && role == SRI_REPLICA && (ri->replica_primary_port != ri->primary->addr->port || @@ -3009,7 +3051,13 @@ void sentinelSendPeriodicCommands(sentinelValkeyInstance *ri) { /* PUBLISH hello messages to all the three kinds of instances. */ if ((now - ri->last_pub_time) > sentinel_publish_period) { - sentinelSendHello(ri); + /* Don't publish if the command would block because client writes are blocked: + - during coordinated failover + - during Valkey failover (this may be a failover that is stuck) */ + if (((ri->flags & SRI_COORD_FAILOVER) == 0 || ri->failover_state > SENTINEL_FAILOVER_STATE_WAIT_PROMOTION) && + ri->monitored_instance_failover_state != SENTINEL_MONITORED_INSTANCE_FAILOVER) { + sentinelSendHello(ri); + } } } @@ -3266,6 +3314,7 @@ void addReplySentinelValkeyInstance(client *c, sentinelValkeyInstance *ri) { if (ri->flags & SRI_RECONF_INPROG) flags = sdscat(flags, "reconf_inprog,"); if (ri->flags & SRI_RECONF_DONE) flags = sdscat(flags, "reconf_done,"); if (ri->flags & SRI_FORCE_FAILOVER) flags = sdscat(flags, "force_failover,"); + if (ri->flags & SRI_COORD_FAILOVER) flags = sdscat(flags, "coordinated_failover,"); if (ri->flags & SRI_SCRIPT_KILL_SENT) flags = sdscat(flags, "script_kill_sent,"); if (ri->flags & SRI_PRIMARY_REBOOT) flags = sdscat(flags, "master_reboot,"); @@ -3702,7 +3751,7 @@ void sentinelCommand(client *c) { " Or update current configurable parameters values (one or more).", "GET-PRIMARY-ADDR-BY-NAME ", " Return the ip and port number of the primary with that name.", - "FAILOVER ", + "FAILOVER [COORDINATED]", " Manually failover a primary node without asking for agreement from other", " Sentinels", "FLUSHCONFIG", @@ -3838,9 +3887,23 @@ void sentinelCommand(client *c) { } else if (!strcasecmp(c->argv[1]->ptr, "failover")) { /* SENTINEL FAILOVER */ sentinelValkeyInstance *ri; + int coordinated = 0; - if (c->argc != 3) goto numargserr; - if ((ri = sentinelGetPrimaryByNameOrReplyError(c, c->argv[2])) == NULL) return; + if (c->argc < 3 || c->argc > 4) goto numargserr; + if ((ri = sentinelGetPrimaryByNameOrReplyError(c, c->argv[2])) == NULL) + return; + if (c->argc == 4) { + if (!strcasecmp(c->argv[3]->ptr, "coordinated")) { + coordinated = 1; + if (ri->monitored_instance_failover_state == SENTINEL_MONITORED_INSTANCE_FAILOVER_NS) { + addReplyError(c, "-NOGOODPRIMARY Primary does not support FAILOVER command"); + return; + } + } else { + addReplyError(c, "Unknown failover option specified"); + return; + } + } if (ri->flags & SRI_FAILOVER_IN_PROGRESS) { addReplyError(c, "-INPROG Failover already in progress"); return; @@ -3850,8 +3913,19 @@ void sentinelCommand(client *c) { return; } serverLog(LL_NOTICE, "Executing user requested FAILOVER of '%s'", ri->name); + ri->s_down_since_time = mstime(); + ri->flags |= SRI_S_DOWN; sentinelStartFailover(ri); - ri->flags |= SRI_FORCE_FAILOVER; + if (coordinated) 
{ + ri->flags |= SRI_COORD_FAILOVER; + /* Initiate a leader election. The SENTINEL_FAILOVER_STATE_WAIT_START + state will wait until we are elected. */ + sentinelAskPrimaryStateToOtherSentinels(ri, SENTINEL_ASK_FORCED); + } else { + /* SRI_FORCE_FAILOVER will cause the SENTINEL_FAILOVER_STATE_WAIT_START + state to regard us as leader (without election). */ + ri->flags |= SRI_FORCE_FAILOVER; + } addReply(c, shared.ok); } else if (!strcasecmp(c->argv[1]->ptr, "pending-scripts")) { /* SENTINEL PENDING-SCRIPTS */ @@ -4464,7 +4538,6 @@ void sentinelReceiveIsPrimaryDownReply(redisAsyncContext *c, void *reply, void * * SENTINEL IS-PRIMARY-DOWN-BY-ADDR requests to other sentinels * in order to get the replies that allow to reach the quorum * needed to mark the primary in ODOWN state and trigger a failover. */ -#define SENTINEL_ASK_FORCED (1 << 0) void sentinelAskPrimaryStateToOtherSentinels(sentinelValkeyInstance *primary, int flags) { dictIterator *di; dictEntry *de; @@ -4634,6 +4707,109 @@ char *sentinelGetLeader(sentinelValkeyInstance *primary, uint64_t epoch) { return winner; } +/* Send FAILOVER to the specified instance using the specified timeout. + * Additionally, issue a "CLIENT PAUSE WRITE" using the same timeout to + * keep clients in blocked state after the failover succeeded, since they + * don't expect to be connected to a replica suddenly (they will be + * disconnected in the next state of the failover). + * + * The command returns C_OK if the FAILOVER command was accepted for + * (later) delivery otherwise C_ERR. The command replies are just + * discarded. */ +int sentinelFailoverTo(sentinelValkeyInstance *ri, const sentinelAddr *addr, mstime_t timeout) { + char portstr[32]; + const char *host; + int retval; + + host = announceSentinelAddr(addr); + ll2string(portstr, sizeof(portstr), addr->port); + + /* Note that we don't check the replies returned by commands, since we + * will observe instead the effects in the next INFO output. */ + retval = redisAsyncCommand(ri->link->cc, + sentinelDiscardReplyCallback, ri, "%s", + sentinelInstanceMapCommand(ri, "MULTI")); + if (retval == C_ERR) return retval; + ri->link->pending_commands++; + + retval = redisAsyncCommand(ri->link->cc, + sentinelDiscardReplyCallback, ri, "%s PAUSE %d WRITE", + sentinelInstanceMapCommand(ri, "CLIENT"), + timeout); + if (retval == C_ERR) return retval; + ri->link->pending_commands++; + + retval = redisAsyncCommand(ri->link->cc, + sentinelDiscardReplyCallback, ri, "%s TO %s %s TIMEOUT %d", + sentinelInstanceMapCommand(ri, "FAILOVER"), + host, portstr, timeout); + if (retval == C_ERR) return retval; + ri->link->pending_commands++; + + retval = redisAsyncCommand(ri->link->cc, + sentinelDiscardReplyCallback, ri, "%s", + sentinelInstanceMapCommand(ri, "EXEC")); + if (retval == C_ERR) return retval; + ri->link->pending_commands++; + + return C_OK; +} + +/* Kill all existing client connections (because the role of the server switched during + * failover) and unblock new clients. Additionally, send CONFIG REWRITE command + * in order to store the new configuration on disk when possible (that is, + * if the server was started with a configuration file). + * + * The command returns C_OK if the commands were accepted for + * (later) delivery otherwise C_ERR. The command replies are just + * discarded. 
*/ +int sentinelKillClients(sentinelValkeyInstance *ri) { + int retval; + + /* 1) Rewrite the configuration (the instance just switched roles) + * 2) Disconnect all clients (but this one sending the command) in order + * to trigger the ask-master-on-reconnection protocol for connected + * clients. + * 3) Unblock client writes (which include PUBLISH). + * + * Note that we don't check the replies returned by commands, since we + * will observe instead the effects in the next INFO output. */ + retval = redisAsyncCommand(ri->link->cc, + sentinelDiscardReplyCallback, ri, "%s", + sentinelInstanceMapCommand(ri, "MULTI")); + if (retval == C_ERR) return retval; + ri->link->pending_commands++; + + retval = redisAsyncCommand(ri->link->cc, + sentinelDiscardReplyCallback, ri, "%s REWRITE", + sentinelInstanceMapCommand(ri, "CONFIG")); + if (retval == C_ERR) return retval; + ri->link->pending_commands++; + + for (int type = 0; type < 2; type++) { + retval = redisAsyncCommand(ri->link->cc, + sentinelDiscardReplyCallback, ri, "%s KILL TYPE %s", + sentinelInstanceMapCommand(ri, "CLIENT"), + type == 0 ? "normal" : "pubsub"); + if (retval == C_ERR) return retval; + ri->link->pending_commands++; + } + + retval = redisAsyncCommand(ri->link->cc, + sentinelDiscardReplyCallback, ri, "%s UNPAUSE", + sentinelInstanceMapCommand(ri, "CLIENT")); + if (retval == C_ERR) return retval; + ri->link->pending_commands++; + + retval = redisAsyncCommand(ri->link->cc, + sentinelDiscardReplyCallback, ri, "%s", + sentinelInstanceMapCommand(ri, "EXEC")); + if (retval == C_ERR) return retval; + ri->link->pending_commands++; + + return C_OK; +} + /* Send REPLICAOF to the specified instance, always followed by a * CONFIG REWRITE command in order to store the new configuration on disk * when possible (that is, if the instance is recent enough to support @@ -4661,9 +4837,11 @@ int sentinelSendReplicaOf(sentinelValkeyInstance *ri, const sentinelAddr *addr) /* In order to send REPLICAOF in a safe way, we send a transaction performing * the following tasks: - * 1) Reconfigure the instance according to the specified host/port params. - * 2) Rewrite the configuration. - * 3) Disconnect all clients (but this one sending the command) in order + * 1) Abort a potentially ongoing Valkey FAILOVER (that may be stuck). REPLICAOF + * can't be used during a failover. + * 2) Reconfigure the instance according to the specified host/port params. + * 3) Rewrite the configuration. + * 4) Disconnect all clients (but this one sending the command) in order * to trigger the ask-primary-on-reconnection protocol for connected * clients. 
* @@ -4674,6 +4852,13 @@ int sentinelSendReplicaOf(sentinelValkeyInstance *ri, const sentinelAddr *addr) if (retval == C_ERR) return retval; ri->link->pending_commands++; + if (ri->monitored_instance_failover_state != SENTINEL_MONITORED_INSTANCE_FAILOVER_NS) { + retval = redisAsyncCommand(ri->link->cc, sentinelDiscardReplyCallback, ri, "%s ABORT", + sentinelInstanceMapCommand(ri, "FAILOVER")); + if (retval == C_ERR) return retval; + ri->link->pending_commands++; + } + retval = redisAsyncCommand(ri->link->cc, sentinelDiscardReplyCallback, ri, "%s %s %s", sentinelInstanceMapCommand(ri, "SLAVEOF"), host, portstr); if (retval == C_ERR) return retval; @@ -4906,6 +5091,38 @@ void sentinelFailoverSelectReplica(sentinelValkeyInstance *ri) { } } +void sentinelFailoverSendFailover(sentinelValkeyInstance *ri) { + int retval; + mstime_t time_passed = mstime() - ri->failover_state_change_time; + + /* If we don't have enough time left (1 second) for the FAILOVER command + * timeout, then abort the failover. */ + if (ri->failover_timeout - time_passed < 1000) { + sentinelEvent(LL_WARNING, "-failover-abort-master-timeout", ri, "%@"); + sentinelAbortFailover(ri); + return; + } + /* We can't send the command to the master if it is now + * disconnected. Retry again and again with this state (until the timeout + * is reached and we abort the failover.) */ + if (ri->link->disconnected) { + return; + } + + /* Send FAILOVER command to switch the role of the primary and the + * promoted replica. We actually register a generic callback for this + * command as we don't really care about the reply. We check if it worked + * indirectly observing if INFO returns a different role (master instead of + * slave). */ + retval = sentinelFailoverTo(ri, ri->promoted_replica->addr, ri->failover_timeout - time_passed); + if (retval != C_OK) return; + sentinelEvent(LL_NOTICE, "+failover-state-wait-promotion", + ri->promoted_replica, "%@"); + ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION; + ri->failover_state_change_time = mstime(); +} + + void sentinelFailoverSendReplicaOfNoOne(sentinelValkeyInstance *ri) { int retval; @@ -5077,7 +5294,12 @@ void sentinelFailoverStateMachine(sentinelValkeyInstance *ri) { switch (ri->failover_state) { case SENTINEL_FAILOVER_STATE_WAIT_START: sentinelFailoverWaitStart(ri); break; case SENTINEL_FAILOVER_STATE_SELECT_REPLICA: sentinelFailoverSelectReplica(ri); break; - case SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE: sentinelFailoverSendReplicaOfNoOne(ri); break; + case SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE: + if (!(ri->flags & SRI_COORD_FAILOVER)) + sentinelFailoverSendReplicaOfNoOne(ri); + else + sentinelFailoverSendFailover(ri); + break; case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: sentinelFailoverWaitPromotion(ri); break; case SENTINEL_FAILOVER_STATE_RECONF_REPLICAS: sentinelFailoverReconfNextReplica(ri); break; } @@ -5092,7 +5314,7 @@ void sentinelAbortFailover(sentinelValkeyInstance *ri) { serverAssert(ri->flags & SRI_FAILOVER_IN_PROGRESS); serverAssert(ri->failover_state <= SENTINEL_FAILOVER_STATE_WAIT_PROMOTION); - ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS | SRI_FORCE_FAILOVER); + ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS | SRI_FORCE_FAILOVER | SRI_COORD_FAILOVER); ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = mstime(); if (ri->promoted_replica) { diff --git a/tests/sentinel/tests/16-config-set-config-get.tcl b/tests/sentinel/tests/16-config-set-config-get.tcl index f9831f8e86..9afe501736 100644 --- 
a/tests/sentinel/tests/16-config-set-config-get.tcl +++ b/tests/sentinel/tests/16-config-set-config-get.tcl @@ -56,3 +56,10 @@ test "SENTINEL CONFIG SET, wrong number of arguments" { fail "Expected to return Missing argument error" } } + +test "Undo SENTINEL CONFIG SET changes" { + foreach_sentinel_id id { + assert_equal {OK} [S $id SENTINEL CONFIG SET resolve-hostnames no announce-port 0] + } + assert_match {*no*0*} [S 1 SENTINEL CONFIG GET resolve-hostnames announce-port] +} diff --git a/tests/sentinel/tests/17-manual-coordinated.tcl b/tests/sentinel/tests/17-manual-coordinated.tcl new file mode 100644 index 0000000000..b8be58ccc5 --- /dev/null +++ b/tests/sentinel/tests/17-manual-coordinated.tcl @@ -0,0 +1,167 @@ +# Test manual coordinated failover + +source "../tests/includes/init-tests.tcl" + +foreach_sentinel_id id { + S $id sentinel debug info-period 2000 + S $id sentinel debug publish-period 1000 +} + +test "Manual coordinated failover works" { + set old_port [RPort $master_id] + set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] + assert {[lindex $addr 1] == $old_port} + + set rd [valkey_client valkey $master_id] + $rd reconnect 0 + $rd DEL FOO + + # Since we reduced the info-period (default 10000) above immediately, + # sentinel - replica may not have enough time to exchange INFO and update + # the replica's info-period, so the test may get a NOGOODSLAVE. + wait_for_condition 300 50 { + [catch {S 0 SENTINEL FAILOVER mymaster COORDINATED}] == 0 + } else { + catch {S 0 SENTINEL FAILOVER mymaster COORDINATED} reply + puts [S 0 SENTINEL REPLICAS mymaster] + fail "Sentinel manual failover did not work, got: $reply" + } + + # After sending SENTINEL FAILOVER, continue writing to the primary + # to observe the final data consistency. We must get disconnected + # when the failover finishes (and we must see no error). 
+ for {set j 1} {$j < 1000000} {incr j} { + catch {$rd INCR FOO} reply + if {[string match "*I/O error*" $reply]} { + break + } + assert_equal $j $reply + set val $reply + } + + foreach_sentinel_id id { + wait_for_condition 1000 50 { + [lindex [S $id SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] 1] != $old_port + } else { + fail "At least one Sentinel did not receive failover info" + } + } + set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] + set master_id [get_instance_id_by_port valkey [lindex $addr 1]] +} + +test "New primary [join $addr {:}] role matches" { + assert {[RI $master_id role] eq {master}} +} + +test "The old primary is already reconfigured as a replica" { + assert {[RI 0 master_port] == [lindex $addr 1]} +} + +test "No blocked clients (by sentinel hello PUBLISH commands) on the old primary" { + assert {[RI 0 blocked_clients] eq "0"} +} + +test "All the other replicas now point to the new primary" { + foreach_valkey_id id { + if {$id != $master_id && $id != 0} { + wait_for_condition 1000 50 { + [RI $id master_port] == [lindex $addr 1] + } else { + fail "Valkey ID $id not configured to replicate with new master" + } + } + } +} + +test "Check data consistency" { + # New primary must be synced already + assert_equal $val [R $master_id GET FOO] + # Replicas will get the value eventually + foreach_valkey_id id { + wait_for_condition 100 50 { + [R $id GET FOO] == $val + } else { + fail "Data is not consistent" + } + } +} + +test "Failover fails and times out" { + set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] + set master_id [get_instance_id_by_port valkey [lindex $addr 1]] + foreach_valkey_id id { + # Do not accept PSYNC (used during FAILOVER) on any replica + if {$id != $master_id} { + R $id ACL SETUSER default -psync + } + } + wait_for_condition 300 50 { + [catch {S 0 SENTINEL FAILOVER mymaster COORDINATED}] == 0 + } else { + catch {S 0 SENTINEL FAILOVER mymaster COORDINATED} reply + fail "Sentinel manual failover did not work, got: $reply" + } + + catch {S 0 SENTINEL FAILOVER mymaster COORDINATED} reply + assert_match {*INPROG*} $reply ;# Failover already in progress + + wait_for_condition 300 50 { + [string match "*failover_in_progress*" [dict get [S 0 SENTINEL PRIMARY mymaster] flags]] == 0 + } else { + fail "Failover did not timeout" + } + foreach_valkey_id id { + # Re-enable psync + R $id ACL SETUSER default +psync + } +} + +test "No change after failed failover: primary [join $addr {:}] role matches" { + assert {[RI $master_id role] eq {master}} +} + +test "No change after failed failover: All the other replicas still point to the primary" { + foreach_valkey_id id { + if {$id != $master_id} { + assert {[RI $id master_port] == [lindex $addr 1]} + } + } +} + +test "No change after failed failover: All sentinels agree on primary" { + foreach_sentinel_id id { + assert {[lindex [S $id SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] 1] == [lindex $addr 1]} + } +} + +foreach flag {crash-after-election crash-after-promotion} { + # Before each SIMULATE-FAILURE test, re-source init-tests to get a clean environment + source "../tests/includes/init-tests.tcl" + + test "SENTINEL SIMULATE-FAILURE $flag works" { + assert_equal {OK} [S 0 SENTINEL SIMULATE-FAILURE $flag] + + # Trigger a failover, failover will trigger leader election, replica promotion + # Sentinel may enter failover and exit before the command, catch it and allow it + wait_for_condition 300 50 { + [catch {S 0 SENTINEL FAILOVER mymaster COORDINATED}] == 0 + || + ([catch {S 0 SENTINEL FAILOVER mymaster 
COORDINATED} reply] == 1 && + [string match {*couldn't open socket: connection refused*} $reply]) + } else { + catch {S 0 SENTINEL FAILOVER mymaster COORDINATED} reply + fail "Sentinel manual failover did not work, got: $reply" + } + + # Wait for sentinel to exit (due to simulate-failure flags) + wait_for_condition 1000 50 { + [catch {S 0 PING}] == 1 + } else { + fail "Sentinel set $flag but did not exit" + } + assert_error {*couldn't open socket: connection refused*} {S 0 PING} + + restart_instance sentinel 0 + } +} diff --git a/tests/sentinel/tests/18-stuck-failover.tcl b/tests/sentinel/tests/18-stuck-failover.tcl new file mode 100644 index 0000000000..cfa96137dd --- /dev/null +++ b/tests/sentinel/tests/18-stuck-failover.tcl @@ -0,0 +1,53 @@ +# Test Sentinel recovery of a node stuck in failover state + +source "../tests/includes/init-tests.tcl" + +foreach_sentinel_id id { + S $id sentinel debug info-period 2000 + S $id sentinel debug publish-period 1000 +} + +test "Sentinel is able to reconfigure a node that is stuck in failover state" { + # Pick a replica + foreach_valkey_id id { + if {$id != $master_id} { + set failover_to_port [RPort $id] + set failover_to_pid [get_instance_attrib valkey $id pid] + } + } + + wait_for_condition 1000 50 { + [string match "*flags master link-pending-commands*" [S 0 SENTINEL PRIMARY mymaster] ] + } else { + fail "Primary flags did not settle to just 'master'" + } + + pause_process $failover_to_pid + + R $master_id failover to 127.0.0.1 $failover_to_port TIMEOUT 10 FORCE + + wait_for_condition 1000 50 { + [string match "no-failover" [RI $master_id master_failover_state]] + } else { + fail "Failover was not aborted by Sentinel" + } + resume_process $failover_to_pid +} + +test "All the replicas now point to the new primary" { + set old_master_id $master_id + set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] + set master_id [get_instance_id_by_port valkey [lindex $addr 1]] + + assert {$old_master_id ne $master_id} + + foreach_valkey_id id { + if {$id != $master_id} { + wait_for_condition 1000 50 { + [RI $id master_port] == [lindex $addr 1] + } else { + fail "Valkey ID $id not configured to replicate with new master" + } + } + } +} diff --git a/tests/sentinel/tests/includes/utils.tcl b/tests/sentinel/tests/includes/utils.tcl index 7d963de19d..caa8f249c6 100644 --- a/tests/sentinel/tests/includes/utils.tcl +++ b/tests/sentinel/tests/includes/utils.tcl @@ -10,6 +10,15 @@ proc restart_killed_instances {} { } } +proc verify_sentinel_connect_sentinels {id} { + foreach sentinel [S $id SENTINEL SENTINELS mymaster] { + if {[string match "*disconnected*" [dict get $sentinel flags]]} { + return 0 + } + } + return 1 +} + proc verify_sentinel_auto_discovery {} { set sentinels [llength $::sentinel_instances] foreach_sentinel_id id { @@ -18,5 +27,10 @@ proc verify_sentinel_auto_discovery {} { } else { fail "At least some sentinel can't detect some other sentinel" } + wait_for_condition 1000 50 { + [verify_sentinel_connect_sentinels $id] == 1 + } else { + fail "At least some sentinel can't connect to some other sentinel" + } } }
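
Note (not part of the patch): the sketch below is a minimal, illustrative hiredis client showing how operator tooling might invoke the new `COORDINATED` option and handle the error replies introduced or exercised by this change (NOGOODPRIMARY, INPROG, and the pre-existing NOGOODSLAVE that the tests retry on). The Sentinel address 127.0.0.1:26379 and the primary name "mymaster" are assumptions for the example, not values mandated by the patch.

#include <stdio.h>
#include <hiredis/hiredis.h>

int main(void) {
    /* Assumed Sentinel endpoint; adjust to your deployment. */
    redisContext *c = redisConnect("127.0.0.1", 26379);
    if (c == NULL || c->err) {
        fprintf(stderr, "connect failed: %s\n", c ? c->errstr : "out of memory");
        return 1;
    }

    /* Request a coordinated failover of the assumed primary "mymaster". */
    redisReply *reply = redisCommand(c, "SENTINEL FAILOVER %s COORDINATED", "mymaster");
    if (reply == NULL) {
        fprintf(stderr, "command failed: %s\n", c->errstr);
    } else if (reply->type == REDIS_REPLY_ERROR) {
        /* Errors a caller should expect, per this patch and its tests:
         *   NOGOODPRIMARY - the primary does not report master_failover_state,
         *                   so it cannot run the FAILOVER command;
         *   INPROG        - a failover is already in progress;
         *   NOGOODSLAVE   - no suitable replica yet (the tests simply retry). */
        fprintf(stderr, "failover refused: %s\n", reply->str);
    } else {
        /* "+OK" only means the request was accepted; Sentinel then drives the
         * coordinated failover state machine asynchronously. */
        printf("failover accepted: %s\n", reply->str);
    }

    if (reply) freeReplyObject(reply);
    redisFree(c);
    return 0;
}

redisCommand returns void*, so the assignment above is fine in C; a C++ build would need an explicit cast to redisReply*.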