From b2ee3c0907e2b80115694d837f4f56414f0023ba Mon Sep 17 00:00:00 2001 From: Leon Gao Date: Sun, 12 Sep 2021 23:52:38 -0700 Subject: [PATCH 1/9] Add balancer metrics --- .../hadoop/hdfs/server/balancer/Balancer.java | 21 ++++- .../hdfs/server/balancer/BalancerMetrics.java | 80 +++++++++++++++++++ .../server/balancer/TestBalancerService.java | 35 ++++++++ 3 files changed, 135 insertions(+), 1 deletion(-) create mode 100644 hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/BalancerMetrics.java diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java index 67ecd5a4d24ff..0c9750392cc2c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Balancer.java @@ -40,6 +40,8 @@ import java.util.concurrent.atomic.AtomicInteger; import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.source.JvmMetrics; import org.apache.hadoop.thirdparty.com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.hdfs.DFSUtilClient; import org.slf4j.Logger; @@ -226,6 +228,7 @@ public class Balancer { private final long maxSizeToMove; private final long defaultBlockSize; private final boolean sortTopNodes; + private final BalancerMetrics metrics; // all data node lists private final Collection overUtilized = new LinkedList(); @@ -357,6 +360,7 @@ static int getFailedTimesSinceLastSuccessfulBalance() { this.defaultBlockSize = getLongBytes(conf, DFSConfigKeys.DFS_BLOCK_SIZE_KEY, DFSConfigKeys.DFS_BLOCK_SIZE_DEFAULT); + this.metrics = BalancerMetrics.create(this); } private static long getCapacity(DatanodeStorageReport report, StorageType t) { @@ -454,6 +458,8 @@ private long init(List reports) { } logUtilizationCollections(); + metrics.setNumOfOverUtilizedNodes(overUtilized.size()); + metrics.setNumOfUnderUtilizedNodes(underUtilized.size()); Preconditions.checkState(dispatcher.getStorageGroupMap().size() == overUtilized.size() + underUtilized.size() + aboveAvgUtilized.size() @@ -636,7 +642,11 @@ void resetData(Configuration conf) { this.belowAvgUtilized.clear(); this.underUtilized.clear(); this.policy.reset(); - dispatcher.reset(conf);; + dispatcher.reset(conf); + } + + NameNodeConnector getNnc() { + return nnc; } static class Result { @@ -710,8 +720,10 @@ Result newResult(ExitStatus exitStatus) { /** Run an iteration for all datanodes. */ Result runOneIteration() { try { + metrics.setIterateRunning(true); final List reports = dispatcher.init(); final long bytesLeftToMove = init(reports); + metrics.setBytesLeftToMove(bytesLeftToMove); if (bytesLeftToMove == 0) { return newResult(ExitStatus.SUCCESS, bytesLeftToMove, 0); } else { @@ -766,6 +778,7 @@ Result runOneIteration() { System.out.println(e + ". Exiting ..."); return newResult(ExitStatus.INTERRUPTED); } finally { + metrics.setIterateRunning(false); dispatcher.shutdownNow(); } } @@ -848,6 +861,10 @@ static int run(Collection namenodes, final BalancerParameters p, static int run(Collection namenodes, Collection nsIds, final BalancerParameters p, Configuration conf) throws IOException, InterruptedException { + DefaultMetricsSystem.initialize("Balancer"); + JvmMetrics.create("Balancer", + conf.get(DFSConfigKeys.DFS_METRICS_SESSION_ID_KEY), + DefaultMetricsSystem.instance()); if (!p.getRunAsService()) { return doBalance(namenodes, nsIds, p, conf); } @@ -893,6 +910,8 @@ static int run(Collection namenodes, Collection nsIds, time2Str(scheduleInterval)); Thread.sleep(scheduleInterval); } + DefaultMetricsSystem.shutdown(); + // normal stop return 0; } diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/BalancerMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/BalancerMetrics.java new file mode 100644 index 0000000000000..fd8e1ceba2296 --- /dev/null +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/BalancerMetrics.java @@ -0,0 +1,80 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hdfs.server.balancer; + +import org.apache.hadoop.metrics2.annotation.Metric; +import org.apache.hadoop.metrics2.annotation.Metrics; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; +import org.apache.hadoop.metrics2.lib.MutableGaugeInt; +import org.apache.hadoop.metrics2.lib.MutableGaugeLong; + +/** + * Metrics for individual Balancer. + */ +@Metrics(about="Balancer metrics", context="dfs") +class BalancerMetrics { + + private final Balancer balancer; + + @Metric("If a balancer iterate is running") + private MutableGaugeInt iterateRunning; + + @Metric("Bytes left to move to make cluster balanced") + private MutableGaugeLong bytesLeftToMove; + + @Metric("Number of under utilized nodes") + private MutableGaugeInt numOfUnderUtilizedNodes; + + @Metric("Number of over utilized nodes") + private MutableGaugeInt numOfOverUtilizedNodes; + + private BalancerMetrics(Balancer b) { + this.balancer = b; + } + + public static BalancerMetrics create(Balancer b) { + BalancerMetrics m = new BalancerMetrics(b); + return DefaultMetricsSystem.instance().register( + m.getName(), null, m); + } + + String getName() { + return "Balancer-" + balancer.getNnc().getBlockpoolID(); + } + + @Metric("Bytes that already moved in current doBalance run.") + public long getBytesMovedInCurrentRun() { + return balancer.getNnc().getBytesMoved().get(); + } + + void setIterateRunning(boolean iterateRunning) { + this.iterateRunning.set(iterateRunning ? 1 : 0); + } + + void setBytesLeftToMove(long bytesLeftToMove) { + this.bytesLeftToMove.set(bytesLeftToMove); + } + + void setNumOfUnderUtilizedNodes(int numOfUnderUtilizedNodes) { + this.numOfUnderUtilizedNodes.set(numOfUnderUtilizedNodes); + } + + void setNumOfOverUtilizedNodes(int numOfOverUtilizedNodes) { + this.numOfOverUtilizedNodes.set(numOfOverUtilizedNodes); + } +} \ No newline at end of file diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java index f1fab273bdb71..bdfe36e82357b 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java @@ -28,11 +28,15 @@ import org.apache.hadoop.hdfs.client.HdfsClientConfigKeys; import org.apache.hadoop.hdfs.protocol.ClientProtocol; import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil; +import org.apache.hadoop.metrics2.MetricsRecordBuilder; +import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; import org.apache.hadoop.test.GenericTestUtils; +import org.apache.hadoop.test.MetricsAsserts; import org.apache.hadoop.util.Tool; import org.junit.Test; import java.util.concurrent.TimeUnit; +import java.util.function.Supplier; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -129,8 +133,39 @@ public void testBalancerServiceBalanceTwice() throws Exception { newBalancerService(conf, new String[] {"-asService"}); balancerThread.start(); + // Check metrics + final String balancerMetricsName = "Balancer-" + + cluster.getNameNode(0).getNamesystem().getBlockPoolId(); + GenericTestUtils.waitFor(new Supplier() { + @Override + public Boolean get() { + // Validate metrics after metrics system initiated. + if (DefaultMetricsSystem.instance().getSource(balancerMetricsName) == null) { + return false; + } + MetricsRecordBuilder rb = MetricsAsserts.getMetrics(balancerMetricsName); + if (rb != null && MetricsAsserts.getLongGauge("BytesLeftToMove", rb) == 500) { + if (MetricsAsserts.getIntGauge("NumOfUnderUtilizedNodes", rb) != 1) { + return false; + } + if (MetricsAsserts.getIntGauge("NumOfOverUtilizedNodes", rb) != 0) { + return false; + } + if (MetricsAsserts.getIntGauge("IterateRunning", rb) != 1) { + return false; + } + return true; + } + return false; + } + }, 100, 2000); + TestBalancer.waitForBalancer(totalUsedSpace, totalCapacity, client, cluster, BalancerParameters.DEFAULT); + + MetricsRecordBuilder rb = MetricsAsserts.getMetrics(balancerMetricsName); + assertTrue(MetricsAsserts.getLongGauge("BytesMovedInCurrentRun", rb) >= 500); + cluster.triggerHeartbeats(); cluster.triggerBlockReports(); From d030119bf26f57b09e77b722cc4f2b377e44f60f Mon Sep 17 00:00:00 2001 From: Leon Gao Date: Mon, 13 Sep 2021 10:23:47 -0700 Subject: [PATCH 2/9] Fix checkstyle --- .../org/apache/hadoop/hdfs/server/balancer/BalancerMetrics.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/BalancerMetrics.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/BalancerMetrics.java index fd8e1ceba2296..77f3795bf6904 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/BalancerMetrics.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/BalancerMetrics.java @@ -27,7 +27,7 @@ * Metrics for individual Balancer. */ @Metrics(about="Balancer metrics", context="dfs") -class BalancerMetrics { +final class BalancerMetrics { private final Balancer balancer; From 7f2b835b869e3824f4d629649040eade999e395b Mon Sep 17 00:00:00 2001 From: Leon Gao Date: Mon, 13 Sep 2021 22:51:12 -0700 Subject: [PATCH 3/9] Use lambda --- .../server/balancer/TestBalancerService.java | 32 ++++++++----------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java index bdfe36e82357b..6a9a7d6fc602c 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java @@ -136,35 +136,29 @@ public void testBalancerServiceBalanceTwice() throws Exception { // Check metrics final String balancerMetricsName = "Balancer-" + cluster.getNameNode(0).getNamesystem().getBlockPoolId(); - GenericTestUtils.waitFor(new Supplier() { - @Override - public Boolean get() { - // Validate metrics after metrics system initiated. - if (DefaultMetricsSystem.instance().getSource(balancerMetricsName) == null) { + GenericTestUtils.waitFor( () -> { + // Validate metrics after metrics system initiated. + if (DefaultMetricsSystem.instance().getSource(balancerMetricsName) == null) { + return false; + } + MetricsRecordBuilder rb = MetricsAsserts.getMetrics(balancerMetricsName); + if (rb != null && MetricsAsserts.getLongGauge("BytesLeftToMove", rb) > 0) { + if (MetricsAsserts.getIntGauge("NumOfUnderUtilizedNodes", rb) != 1) { return false; } - MetricsRecordBuilder rb = MetricsAsserts.getMetrics(balancerMetricsName); - if (rb != null && MetricsAsserts.getLongGauge("BytesLeftToMove", rb) == 500) { - if (MetricsAsserts.getIntGauge("NumOfUnderUtilizedNodes", rb) != 1) { - return false; - } - if (MetricsAsserts.getIntGauge("NumOfOverUtilizedNodes", rb) != 0) { - return false; - } - if (MetricsAsserts.getIntGauge("IterateRunning", rb) != 1) { - return false; - } - return true; + if (MetricsAsserts.getIntGauge("NumOfOverUtilizedNodes", rb) != 0) { + return false; } - return false; + return true; } + return false; }, 100, 2000); TestBalancer.waitForBalancer(totalUsedSpace, totalCapacity, client, cluster, BalancerParameters.DEFAULT); MetricsRecordBuilder rb = MetricsAsserts.getMetrics(balancerMetricsName); - assertTrue(MetricsAsserts.getLongGauge("BytesMovedInCurrentRun", rb) >= 500); + assertTrue(MetricsAsserts.getLongGauge("BytesMovedInCurrentRun", rb) > 0); cluster.triggerHeartbeats(); cluster.triggerBlockReports(); From 0f6fd168ed026db3e845119cc334066b64f310ee Mon Sep 17 00:00:00 2001 From: Leon Gao Date: Mon, 13 Sep 2021 22:54:16 -0700 Subject: [PATCH 4/9] Use lambda --- .../hdfs/server/balancer/TestBalancerService.java | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java index 6a9a7d6fc602c..d4c4e557559d7 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java @@ -142,16 +142,7 @@ public void testBalancerServiceBalanceTwice() throws Exception { return false; } MetricsRecordBuilder rb = MetricsAsserts.getMetrics(balancerMetricsName); - if (rb != null && MetricsAsserts.getLongGauge("BytesLeftToMove", rb) > 0) { - if (MetricsAsserts.getIntGauge("NumOfUnderUtilizedNodes", rb) != 1) { - return false; - } - if (MetricsAsserts.getIntGauge("NumOfOverUtilizedNodes", rb) != 0) { - return false; - } - return true; - } - return false; + return rb != null && MetricsAsserts.getLongGauge("BytesLeftToMove", rb) > 0; }, 100, 2000); TestBalancer.waitForBalancer(totalUsedSpace, totalCapacity, client, From e8eae1d66205000ea0bb58ee42d339e5a904a912 Mon Sep 17 00:00:00 2001 From: Leon Gao Date: Tue, 14 Sep 2021 09:14:17 -0700 Subject: [PATCH 5/9] Fix checkstyle --- .../hadoop/hdfs/server/balancer/TestBalancerService.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java index d4c4e557559d7..0ec0431555d46 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/balancer/TestBalancerService.java @@ -36,7 +36,6 @@ import org.junit.Test; import java.util.concurrent.TimeUnit; -import java.util.function.Supplier; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -136,7 +135,7 @@ public void testBalancerServiceBalanceTwice() throws Exception { // Check metrics final String balancerMetricsName = "Balancer-" + cluster.getNameNode(0).getNamesystem().getBlockPoolId(); - GenericTestUtils.waitFor( () -> { + GenericTestUtils.waitFor(() -> { // Validate metrics after metrics system initiated. if (DefaultMetricsSystem.instance().getSource(balancerMetricsName) == null) { return false; From 08946a0724ee69490517310c1b3cc2aa43aa1ca7 Mon Sep 17 00:00:00 2001 From: Leon Gao Date: Tue, 14 Sep 2021 11:38:34 -0700 Subject: [PATCH 6/9] trigger build From f357415315bb230e365fc2b2e5ee4d6e9cb94115 Mon Sep 17 00:00:00 2001 From: Leon Gao Date: Tue, 14 Sep 2021 13:35:27 -0700 Subject: [PATCH 7/9] trigger build From c181de687421df7d76cc36da500751934d3e1818 Mon Sep 17 00:00:00 2001 From: Leon Gao Date: Tue, 14 Sep 2021 17:20:18 -0700 Subject: [PATCH 8/9] trigger build From ce496140c284f037fd5968d2ad04a026ffdb2d87 Mon Sep 17 00:00:00 2001 From: Leon Gao Date: Wed, 15 Sep 2021 21:33:23 -0700 Subject: [PATCH 9/9] Trigger Build