From d3837f2e9c21baa34271866fd92bc946d84b1460 Mon Sep 17 00:00:00 2001 From: Sankar <1890648+srgsanky@users.noreply.github.com> Date: Sun, 16 Jun 2024 20:37:09 -0700 Subject: [PATCH] Make cluster meet reliable under link failures (#461) When there is a link failure while an ongoing MEET request is sent the sending node stops sending anymore MEET and starts sending PINGs. Since every node responds to PINGs from unknown nodes with a PONG, the receiving node never adds the sending node. But the sending node adds the receiving node when it sees a PONG. This can lead to asymmetry in cluster membership. This changes makes the sender keep sending MEET until it sees a PONG, avoiding the asymmetry. --------- Signed-off-by: Sankar <1890648+srgsanky@users.noreply.github.com> --- src/cluster.c | 31 +++++-- src/debug.c | 10 ++- src/server.h | 2 + tests/unit/cluster/cluster-multiple-meets.tcl | 83 +++++++++++++++++++ tests/unit/cluster/cluster-reliable-meet.tcl | 71 ++++++++++++++++ 5 files changed, 189 insertions(+), 8 deletions(-) create mode 100644 tests/unit/cluster/cluster-multiple-meets.tcl create mode 100644 tests/unit/cluster/cluster-reliable-meet.tcl diff --git a/src/cluster.c b/src/cluster.c index 3fa97e4831..1be4de2c91 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2747,6 +2747,11 @@ int clusterProcessPacket(clusterLink *link) { if (type == server.cluster_drop_packet_filter) { serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); + if (server.debug_cluster_close_link_on_packet_drop) { + freeClusterLink(link); + serverLog(LL_WARNING, "Closing link for matching packet type %hu", type); + return 0; + } return 1; } @@ -2921,6 +2926,14 @@ int clusterProcessPacket(clusterLink *link) { serverLog(LL_DEBUG,"%s packet received: %.40s", clusterGetMessageTypeString(type), link->node ? link->node->name : "NULL"); + + if (sender && (sender->flags & CLUSTER_NODE_MEET)) { + /* Once we get a response for MEET from the sender, we can stop sending more MEET. */ + sender->flags &= ~CLUSTER_NODE_MEET; + serverLog(LL_NOTICE, "Successfully completed handshake with %.40s (%s)", sender->name, + sender->human_nodename); + } + if (!link->inbound) { if (nodeInHandshake(link->node)) { /* If we already have this node, try to change the @@ -3339,12 +3352,18 @@ void clusterLinkConnectHandler(connection *conn) { * replaced by the clusterSendPing() call. */ node->ping_sent = old_ping_sent; } - /* We can clear the flag after the first packet is sent. - * If we'll never receive a PONG, we'll never send new packets - * to this node. Instead after the PONG is received and we - * are no longer in meet/handshake status, we want to send - * normal PING packets. */ - node->flags &= ~CLUSTER_NODE_MEET; + + /* NOTE: Assume the current node is A and is asked to MEET another node B. + * Once A sends MEET to B, it cannot clear the MEET flag for B until it + * gets a response from B. If the MEET packet is not accepted by B due to + * link failure, A must continue sending MEET. If A doesn't continue sending + * MEET, A will know about B, but B will never add A. Every node always + * responds to PINGs from unknown nodes with a PONG, so A will know about B + * and continue sending PINGs. But B won't add A until it sees a MEET (or it + * gets to know about A from a trusted third node C). In this case, clearing + * the MEET flag here leads to asymmetry in the cluster membership. So, we + * clear the MEET flag in clusterProcessPacket. + */ serverLog(LL_DEBUG,"Connecting with Node %.40s at %s:%d", node->name, node->ip, node->cport); diff --git a/src/debug.c b/src/debug.c index a595d6fac3..ecbe5656d1 100644 --- a/src/debug.c +++ b/src/debug.c @@ -431,6 +431,9 @@ void debugCommand(client *c) { " Show low level info about `key` and associated value.", "DROP-CLUSTER-PACKET-FILTER ", " Drop all packets that match the filtered type. Set to -1 allow all packets.", +"CLOSE-CLUSTER-LINK-ON-PACKET-DROP <0|1>", +" This is effective only when DROP-CLUSTER-PACKET-FILTER is set to a valid packet type.", +" When set to 1, the cluster link is closed after dropping a packet based on the filter.", "OOM", " Crash the server simulating an out-of-memory error.", "PANIC", @@ -599,8 +602,11 @@ NULL if (getLongFromObjectOrReply(c, c->argv[2], &packet_type, NULL) != C_OK) return; server.cluster_drop_packet_filter = packet_type; - addReply(c,shared.ok); - } else if (!strcasecmp(c->argv[1]->ptr,"object") && c->argc == 3) { + addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr, "close-cluster-link-on-packet-drop") && c->argc == 3) { + server.debug_cluster_close_link_on_packet_drop = atoi(c->argv[2]->ptr) != 0; + addReply(c, shared.ok); + } else if (!strcasecmp(c->argv[1]->ptr, "object") && c->argc == 3) { dictEntry *de; robj *val; char *strenc; diff --git a/src/server.h b/src/server.h index f4c890401e..d7430858a0 100644 --- a/src/server.h +++ b/src/server.h @@ -1997,6 +1997,8 @@ struct redisServer { unsigned long long cluster_link_msg_queue_limit_bytes; /* Memory usage limit on individual link msg queue */ int cluster_drop_packet_filter; /* Debug config that allows tactically * dropping packets of a specific type */ + /* Debug config that goes along with cluster_drop_packet_filter. When set, the link is closed on packet drop. */ + uint32_t debug_cluster_close_link_on_packet_drop : 1; /* Scripting */ mstime_t busy_reply_threshold; /* Script / module timeout in milliseconds */ int pre_command_oom_state; /* OOM before command (script?) was started */ diff --git a/tests/unit/cluster/cluster-multiple-meets.tcl b/tests/unit/cluster/cluster-multiple-meets.tcl new file mode 100644 index 0000000000..07a2582133 --- /dev/null +++ b/tests/unit/cluster/cluster-multiple-meets.tcl @@ -0,0 +1,83 @@ +# make sure the test infra won't use SELECT +set old_singledb $::singledb +set ::singledb 1 + +tags {tls:skip external:skip cluster} { + set base_conf [list cluster-enabled yes] + start_multiple_servers 2 [list overrides $base_conf] { + test "Cluster nodes are reachable" { + for {set id 0} {$id < [llength $::servers]} {incr id} { + # Every node should be reachable. + wait_for_condition 1000 50 { + ([catch {R $id ping} ping_reply] == 0) && + ($ping_reply eq {PONG}) + } else { + catch {R $id ping} err + fail "Node #$id keeps replying '$err' to PING." + } + } + } + + test "Before slots allocation, all nodes report cluster failure" { + wait_for_cluster_state fail + } + + set CLUSTER_PACKET_TYPE_PONG 1 + set CLUSTER_PACKET_TYPE_NONE -1 + + test "Cluster nodes haven't met each other" { + assert {[llength [get_cluster_nodes 1]] == 1} + assert {[llength [get_cluster_nodes 0]] == 1} + } + + test "Allocate slots" { + cluster_allocate_slots 2 0;# primaries replicas + } + + test "Multiple MEETs from Node 1 to Node 0 should work" { + # Make 1 drop the PONG responses to MEET + R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_PONG + # It is important to close the connection on drop, otherwise a subsequent MEET won't be sent + R 1 DEBUG CLOSE-CLUSTER-LINK-ON-PACKET-DROP 1 + + R 1 CLUSTER MEET 127.0.0.1 [srv 0 port] + + # Wait for at least a few MEETs to be sent so that we are sure that 1 is dropping the response to MEET. + wait_for_condition 1000 50 { + [CI 0 cluster_stats_messages_meet_received] > 1 && + [CI 1 cluster_state] eq {fail} && [CI 0 cluster_state] eq {ok} + } else { + fail "Cluster node 1 never sent multiple MEETs to 0" + } + + # 0 will be connected to 1, but 1 won't see that 0 is connected + assert {[llength [get_cluster_nodes 1 connected]] == 1} + assert {[llength [get_cluster_nodes 0 connected]] == 2} + + # Drop incoming and outgoing links from/to 1 + R 0 DEBUG CLUSTERLINK KILL ALL [R 1 CLUSTER MYID] + + # Wait for 0 to know about 1 again after 1 sends a MEET + wait_for_condition 1000 50 { + [llength [get_cluster_nodes 0 connected]] == 2 + } else { + fail "Cluster node 1 never sent multiple MEETs to 0" + } + + # Undo packet drop + R 1 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE + R 1 DEBUG CLOSE-CLUSTER-LINK-ON-PACKET-DROP 0 + + # Both a and b will turn to cluster state ok + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {ok} && [CI 0 cluster_state] eq {ok} && + [CI 1 cluster_stats_messages_meet_sent] == [CI 0 cluster_stats_messages_meet_received] + } else { + fail "1 cluster_state:[CI 1 cluster_state], 0 cluster_state: [CI 0 cluster_state]" + } + } + } ;# stop servers +} ;# tags + +set ::singledb $old_singledb + diff --git a/tests/unit/cluster/cluster-reliable-meet.tcl b/tests/unit/cluster/cluster-reliable-meet.tcl new file mode 100644 index 0000000000..41da97ab9b --- /dev/null +++ b/tests/unit/cluster/cluster-reliable-meet.tcl @@ -0,0 +1,71 @@ +# make sure the test infra won't use SELECT +set old_singledb $::singledb +set ::singledb 1 + +tags {tls:skip external:skip cluster} { + set base_conf [list cluster-enabled yes] + start_multiple_servers 2 [list overrides $base_conf] { + test "Cluster nodes are reachable" { + for {set id 0} {$id < [llength $::servers]} {incr id} { + # Every node should be reachable. + wait_for_condition 1000 50 { + ([catch {R $id ping} ping_reply] == 0) && + ($ping_reply eq {PONG}) + } else { + catch {R $id ping} err + fail "Node #$id keeps replying '$err' to PING." + } + } + } + + test "Before slots allocation, all nodes report cluster failure" { + wait_for_cluster_state fail + } + + set CLUSTER_PACKET_TYPE_MEET 2 + set CLUSTER_PACKET_TYPE_NONE -1 + + test "Cluster nodes haven't met each other" { + assert {[llength [get_cluster_nodes 1]] == 1} + assert {[llength [get_cluster_nodes 0]] == 1} + } + + test "Allocate slots" { + cluster_allocate_slots 2 0 + } + + test "MEET is reliable when target drops the initial MEETs" { + # Make 0 drop the initial MEET messages due to link failure + R 0 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_MEET + R 0 DEBUG CLOSE-CLUSTER-LINK-ON-PACKET-DROP 1 + + R 1 CLUSTER MEET 127.0.0.1 [srv 0 port] + + # Wait for at least a few MEETs to be sent so that we are sure that 0 is + # dropping them. + wait_for_condition 1000 50 { + [CI 0 cluster_stats_messages_meet_received] >= 3 + } else { + fail "Cluster node 1 never sent multiple MEETs to 0" + } + + # Make sure the nodes still don't know about each other + assert {[llength [get_cluster_nodes 1 connected]] == 1} + assert {[llength [get_cluster_nodes 0 connected]] == 1} + + R 0 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE + + # If the MEET is reliable, both a and b will turn to cluster state ok + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {ok} && [CI 0 cluster_state] eq {ok} && + [CI 0 cluster_stats_messages_meet_received] >= 4 && + [CI 1 cluster_stats_messages_meet_sent] == [CI 0 cluster_stats_messages_meet_received] + } else { + fail "1 cluster_state:[CI 1 cluster_state], 0 cluster_state: [CI 0 cluster_state]" + } + } + } ;# stop servers +} ;# tags + +set ::singledb $old_singledb +