From c2caeb4f328c364036f27c6ecd56fce57de96650 Mon Sep 17 00:00:00 2001 From: RS146BIJAY Date: Mon, 22 Jul 2024 11:18:26 +0530 Subject: [PATCH] Caching avg total bytes and avg free bytes inside ClusterInfo Signed-off-by: RS146BIJAY --- .../org/opensearch/cluster/ClusterInfo.java | 38 +++++++++++++++++++ .../decider/DiskThresholdDecider.java | 11 +++--- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/server/src/main/java/org/opensearch/cluster/ClusterInfo.java b/server/src/main/java/org/opensearch/cluster/ClusterInfo.java index 4c38d6fd99f5d..868c92623dc6d 100644 --- a/server/src/main/java/org/opensearch/cluster/ClusterInfo.java +++ b/server/src/main/java/org/opensearch/cluster/ClusterInfo.java @@ -33,6 +33,7 @@ package org.opensearch.cluster; import org.opensearch.Version; +import org.opensearch.cluster.routing.RoutingNode; import org.opensearch.cluster.routing.ShardRouting; import org.opensearch.common.annotation.PublicApi; import org.opensearch.core.common.io.stream.StreamInput; @@ -68,9 +69,12 @@ public class ClusterInfo implements ToXContentFragment, Writeable { final Map routingToDataPath; final Map reservedSpace; final Map nodeFileCacheStats; + private long avgTotalBytes; + private long avgFreeByte; protected ClusterInfo() { this(Map.of(), Map.of(), Map.of(), Map.of(), Map.of(), Map.of()); + calculateAvgFreeAndTotalBytes(mostAvailableSpaceUsage); } /** @@ -97,6 +101,7 @@ public ClusterInfo( this.routingToDataPath = routingToDataPath; this.reservedSpace = reservedSpace; this.nodeFileCacheStats = nodeFileCacheStats; + calculateAvgFreeAndTotalBytes(mostAvailableSpaceUsage); } public ClusterInfo(StreamInput in) throws IOException { @@ -117,6 +122,39 @@ public ClusterInfo(StreamInput in) throws IOException { } else { this.nodeFileCacheStats = Map.of(); } + + calculateAvgFreeAndTotalBytes(mostAvailableSpaceUsage); + } + + /** + * Returns a {@link DiskUsage} for the {@link RoutingNode} using the + * average usage of other nodes in the disk usage map. + * @param usages Map of nodeId to DiskUsage for all known nodes + */ + private void calculateAvgFreeAndTotalBytes(final Map usages) { + if (usages == null || usages.isEmpty()) { + this.avgTotalBytes = 0; + this.avgFreeByte = 0; + return; + } + + long totalBytes = 0; + long freeBytes = 0; + for (DiskUsage du : usages.values()) { + totalBytes += du.getTotalBytes(); + freeBytes += du.getFreeBytes(); + } + + this.avgTotalBytes = totalBytes / usages.size(); + this.avgFreeByte = freeBytes / usages.size(); + } + + public long getAvgFreeByte() { + return avgFreeByte; + } + + public long getAvgTotalBytes() { + return avgTotalBytes; } @Override diff --git a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDecider.java b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDecider.java index efa5115939d3c..c6a124d808927 100644 --- a/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDecider.java +++ b/server/src/main/java/org/opensearch/cluster/routing/allocation/decider/DiskThresholdDecider.java @@ -140,9 +140,8 @@ public static long sizeOfRelocatingShards( // Where reserved space is unavailable (e.g. stats are out-of-sync) compute a conservative estimate for initialising shards final List initializingShards = node.shardsWithState(ShardRoutingState.INITIALIZING); - initializingShards.removeIf(shardRouting -> reservedSpace.containsShardId(shardRouting.shardId())); for (ShardRouting routing : initializingShards) { - if (routing.relocatingNodeId() == null) { + if (routing.relocatingNodeId() == null || reservedSpace.containsShardId(routing.shardId())) { // in practice the only initializing-but-not-relocating shards with a nonzero expected shard size will be ones created // by a resize (shrink/split/clone) operation which we expect to happen using hard links, so they shouldn't be taking // any additional space and can be ignored here @@ -230,7 +229,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing // subtractLeavingShards is passed as false here, because they still use disk space, and therefore we should be extra careful // and take the size into account - final DiskUsageWithRelocations usage = getDiskUsage(node, allocation, usages, false); + final DiskUsageWithRelocations usage = getDiskUsage(node, allocation, usages, clusterInfo.getAvgFreeByte(), clusterInfo.getAvgTotalBytes(), false); // First, check that the node currently over the low watermark double freeDiskPercentage = usage.getFreeDiskAsPercentage(); // Cache the used disk percentage for displaying disk percentages consistent with documentation @@ -492,7 +491,7 @@ public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAl // subtractLeavingShards is passed as true here, since this is only for shards remaining, we will *eventually* have enough disk // since shards are moving away. No new shards will be incoming since in canAllocate we pass false for this check. - final DiskUsageWithRelocations usage = getDiskUsage(node, allocation, usages, true); + final DiskUsageWithRelocations usage = getDiskUsage(node, allocation, usages, clusterInfo.getAvgFreeByte(), clusterInfo.getAvgTotalBytes(), false); final String dataPath = clusterInfo.getDataPath(shardRouting); // If this node is already above the high threshold, the shard cannot remain (get it off!) final double freeDiskPercentage = usage.getFreeDiskAsPercentage(); @@ -581,13 +580,15 @@ private DiskUsageWithRelocations getDiskUsage( RoutingNode node, RoutingAllocation allocation, final Map usages, + final long avgFreeBytes, + final long avgTotalBytes, boolean subtractLeavingShards ) { DiskUsage usage = usages.get(node.nodeId()); if (usage == null) { // If there is no usage, and we have other nodes in the cluster, // use the average usage for all nodes as the usage for this node - usage = averageUsage(node, usages); + usage = new DiskUsage(node.nodeId(), node.node().getName(), "_na_", avgTotalBytes, avgFreeBytes); if (logger.isDebugEnabled()) { logger.debug( "unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",