Add links to network disconnect troubleshooting (elastic#112330)

Makes the docs added in elastic#112271 more discoverable.
thecoop · Sep 9, 2024 · 1977a71 · 1977a71
1 parent 939153e
commit 1977a71
Show file tree

Hide file tree

Showing 6 changed files with 31 additions and 12 deletions.
diff --git a/docs/changelog/112330.yaml b/docs/changelog/112330.yaml
@@ -0,0 +1,5 @@
+pr: 112330
+summary: Add links to network disconnect troubleshooting
+area: Network
+type: enhancement
+issues: []
diff --git a/docs/reference/modules/transport.asciidoc b/docs/reference/modules/transport.asciidoc
@@ -185,16 +185,18 @@ configured, and defaults otherwise to `transport.tcp.reuse_address`.
 
 A transport connection between two nodes is made up of a number of long-lived
 TCP connections, some of which may be idle for an extended period of time.
-Nonetheless, Elasticsearch requires these connections to remain open, and it
-can disrupt the operation of your cluster if any inter-node connections are
-closed by an external influence such as a firewall. It is important to
-configure your network to preserve long-lived idle connections between
-Elasticsearch nodes, for instance by leaving `*.tcp.keep_alive` enabled and
-ensuring that the keepalive interval is shorter than any timeout that might
-cause idle connections to be closed, or by setting `transport.ping_schedule` if
-keepalives cannot be configured. Devices which drop connections when they reach
-a certain age are a common source of problems to Elasticsearch clusters, and
-must not be used.
+Nonetheless, {es} requires these connections to remain open, and it can disrupt
+the operation of your cluster if any inter-node connections are closed by an
+external influence such as a firewall. It is important to configure your network
+to preserve long-lived idle connections between {es} nodes, for instance by
+leaving `*.tcp.keep_alive` enabled and ensuring that the keepalive interval is
+shorter than any timeout that might cause idle connections to be closed, or by
+setting `transport.ping_schedule` if keepalives cannot be configured. Devices
+which drop connections when they reach a certain age are a common source of
+problems for {es} clusters, and must not be used.
+
+For information about troubleshooting unexpected network disconnections, see
+<<troubleshooting-unstable-cluster-network>>.
 
 [[request-compression]]
 ===== Request compression

diff --git a/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java b/server/src/main/java/org/elasticsearch/common/ReferenceDocs.java
@@ -43,6 +43,7 @@ public enum ReferenceDocs {
     UNSTABLE_CLUSTER_TROUBLESHOOTING,
     LAGGING_NODE_TROUBLESHOOTING,
     SHARD_LOCK_TROUBLESHOOTING,
+    NETWORK_DISCONNECT_TROUBLESHOOTING,
     CONCURRENT_REPOSITORY_WRITERS,
     ARCHIVE_INDICES,
     HTTP_TRACER,

diff --git a/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java b/server/src/main/java/org/elasticsearch/transport/ClusterConnectionManager.java
@@ -12,6 +12,7 @@
 import org.elasticsearch.action.ActionListener;
 import org.elasticsearch.action.support.ContextPreservingActionListener;
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.common.ReferenceDocs;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.common.util.concurrent.ListenableFuture;
@@ -237,7 +238,13 @@ private void connectToNodeOrRetry(
                                     if (connectingRefCounter.hasReferences() == false) {
                                         logger.trace("connection manager shut down, closing transport connection to [{}]", node);
                                     } else if (conn.hasReferences()) {
-                                        logger.info("transport connection to [{}] closed by remote", node.descriptionWithoutAttributes());
+                                        logger.info(
+                                            """
+                                                transport connection to [{}] closed by remote; \
+                                                if unexpected, see [{}] for troubleshooting guidance""",
+                                            node.descriptionWithoutAttributes(),
+                                            ReferenceDocs.NETWORK_DISCONNECT_TROUBLESHOOTING
+                                        );
                                         // In production code we only close connections via ref-counting, so this message confirms that a
                                         // 'node-left ... reason: disconnected' event was caused by external factors. Put differently, if a
                                         // node leaves the cluster with "reason: disconnected" but without this message being logged then

diff --git a/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json b/server/src/main/resources/org/elasticsearch/common/reference-docs-links.json
@@ -4,6 +4,7 @@
   "UNSTABLE_CLUSTER_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html",
   "LAGGING_NODE_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-lagging",
   "SHARD_LOCK_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-shardlockobtainfailedexception",
+  "NETWORK_DISCONNECT_TROUBLESHOOTING": "troubleshooting-unstable-cluster.html#troubleshooting-unstable-cluster-network",
   "CONCURRENT_REPOSITORY_WRITERS": "diagnosing-corrupted-repositories.html",
   "ARCHIVE_INDICES": "archive-indices.html",
   "HTTP_TRACER": "modules-network.html#http-rest-request-tracer",

diff --git a/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java b/server/src/test/java/org/elasticsearch/transport/ClusterConnectionManagerTests.java
@@ -188,7 +188,10 @@ public void testDisconnectLogging() {
                     "remotely-triggered close message",
                     ClusterConnectionManager.class.getCanonicalName(),
                     Level.INFO,
-                    "transport connection to [" + remoteClose.descriptionWithoutAttributes() + "] closed by remote"
+                    "transport connection to ["
+                        + remoteClose.descriptionWithoutAttributes()
+                        + "] closed by remote; "
+                        + "if unexpected, see [https://www.elastic.co/guide/en/elasticsearch/reference/*] for troubleshooting guidance"
                 )
             );
             mockLog.addExpectation(