Skip to content

Commit

Permalink
[SPARK-35002][YARN][TESTS][FOLLOW-UP] Fix java.net.BindException in M…
Browse files Browse the repository at this point in the history
…iniYARNCluster

This PR fixes two tests below:

https://github.com/apache/spark/runs/2320161984

```
[info] YarnShuffleIntegrationSuite:
[info] org.apache.spark.deploy.yarn.YarnShuffleIntegrationSuite *** ABORTED *** (228 milliseconds)
[info]   org.apache.hadoop.yarn.exceptions.YarnRuntimeException: org.apache.hadoop.yarn.webapp.WebAppException: Error starting http server
[info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.startResourceManager(MiniYARNCluster.java:373)
[info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.access$300(MiniYARNCluster.java:128)
[info]   at org.apache.hadoop.yarn.server.MiniYARNCluster$ResourceManagerWrapper.serviceStart(MiniYARNCluster.java:503)
[info]   at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
[info]   at org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:121)
[info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.serviceStart(MiniYARNCluster.java:322)
[info]   at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
[info]   at org.apache.spark.deploy.yarn.BaseYarnClusterSuite.beforeAll(BaseYarnClusterSuite.scala:95)
...
[info]   Cause: java.net.BindException: Port in use: fv-az186-831:0
[info]   at org.apache.hadoop.http.HttpServer2.constructBindException(HttpServer2.java:1231)
[info]   at org.apache.hadoop.http.HttpServer2.bindForSinglePort(HttpServer2.java:1253)
[info]   at org.apache.hadoop.http.HttpServer2.openListeners(HttpServer2.java:1316)
[info]   at org.apache.hadoop.http.HttpServer2.start(HttpServer2.java:1167)
[info]   at org.apache.hadoop.yarn.webapp.WebApps$Builder.start(WebApps.java:449)
[info]   at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.startWepApp(ResourceManager.java:1247)
[info]   at org.apache.hadoop.yarn.server.resourcemanager.ResourceManager.serviceStart(ResourceManager.java:1356)
[info]   at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
[info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.startResourceManager(MiniYARNCluster.java:365)
[info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.access$300(MiniYARNCluster.java:128)
[info]   at org.apache.hadoop.yarn.server.MiniYARNCluster$ResourceManagerWrapper.serviceStart(MiniYARNCluster.java:503)
[info]   at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
[info]   at org.apache.hadoop.service.CompositeService.serviceStart(CompositeService.java:121)
[info]   at org.apache.hadoop.yarn.server.MiniYARNCluster.serviceStart(MiniYARNCluster.java:322)
[info]   at org.apache.hadoop.service.AbstractService.start(AbstractService.java:194)
[info]   at org.apache.spark.deploy.yarn.BaseYarnClusterSuite.beforeAll(BaseYarnClusterSuite.scala:95)
[info]   at org.scalatest.BeforeAndAfterAll.liftedTree1$1(BeforeAndAfterAll.scala:212)
[info]   at org.scalatest.BeforeAndAfterAll.run(BeforeAndAfterAll.scala:210)
[info]   at org.scalatest.BeforeAndAfterAll.run$(BeforeAndAfterAll.scala:208)
[info]   at org.apache.spark.SparkFunSuite.run(SparkFunSuite.scala:61)
...
```

https://github.com/apache/spark/runs/2323342094

```
[info] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadSecret started
[error] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadSecret failed: java.lang.AssertionError: Connecting to /10.1.0.161:39895 timed out (120000 ms), took 120.081 sec
[error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadSecret(ExternalShuffleSecuritySuite.java:85)
[error]     ...
[info] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadAppId started
[error] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadAppId failed: java.lang.AssertionError: Connecting to /10.1.0.198:44633 timed out (120000 ms), took 120.08 sec
[error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testBadAppId(ExternalShuffleSecuritySuite.java:76)
[error]     ...
[info] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testValid started
[error] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testValid failed: java.io.IOException: Connecting to /10.1.0.119:43575 timed out (120000 ms), took 120.089 sec
[error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:285)
[error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
[error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:230)
[error]     at org.apache.spark.network.shuffle.ExternalBlockStoreClient.registerWithShuffleServer(ExternalBlockStoreClient.java:211)
[error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.validate(ExternalShuffleSecuritySuite.java:108)
[error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testValid(ExternalShuffleSecuritySuite.java:68)
[error]     ...
[info] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testEncryption started
[error] Test org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testEncryption failed: java.io.IOException: Connecting to /10.1.0.248:35271 timed out (120000 ms), took 120.014 sec
[error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:285)
[error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:218)
[error]     at org.apache.spark.network.client.TransportClientFactory.createClient(TransportClientFactory.java:230)
[error]     at org.apache.spark.network.shuffle.ExternalBlockStoreClient.registerWithShuffleServer(ExternalBlockStoreClient.java:211)
[error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.validate(ExternalShuffleSecuritySuite.java:108)
[error]     at org.apache.spark.network.shuffle.ExternalShuffleSecuritySuite.testEncryption(ExternalShu
```

For Yarn cluster suites, its difficult to fix. This PR makes it skipped if it fails to bind.
For shuffle related suites, it uses local host

To make the tests stable

No, dev-only.

Its tested in GitHub Actions: https://github.com/HyukjinKwon/spark/runs/2340210765

Closes #32126 from HyukjinKwon/SPARK-35002-followup.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: Yuming Wang <yumwang@ebay.com>
(cherry picked from commit a153efa)
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
  • Loading branch information
HyukjinKwon committed Apr 16, 2021
1 parent 0033804 commit 34d4da5
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
public class TestUtils {
public static String getLocalHost() {
try {
return InetAddress.getLocalHost().getHostAddress();
return (System.getenv().containsKey("SPARK_LOCAL_IP"))?
System.getenv("SPARK_LOCAL_IP"):
InetAddress.getLocalHost().getHostAddress();
} catch (Exception e) {
throw new RuntimeException(e);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ import scala.concurrent.duration._
import com.google.common.io.Files
import org.apache.hadoop.yarn.conf.YarnConfiguration
import org.apache.hadoop.yarn.server.MiniYARNCluster
import org.scalatest.{BeforeAndAfterAll, Matchers}
import org.scalactic.source.Position
import org.scalatest.{BeforeAndAfterAll, Matchers, Tag}
import org.scalatest.concurrent.Eventually._

import org.apache.spark._
Expand All @@ -40,6 +41,7 @@ import org.apache.spark.util.Utils

abstract class BaseYarnClusterSuite
extends SparkFunSuite with BeforeAndAfterAll with Matchers with Logging {
private var isBindSuccessful = true

// log4j configuration for the YARN containers, so that their output is collected
// by YARN instead of trying to overwrite unit-tests.log.
Expand All @@ -63,6 +65,14 @@ abstract class BaseYarnClusterSuite

def newYarnConfig(): YarnConfiguration

override protected def test(testName: String, testTags: Tag*)(testFun: => Any)
(implicit pos: Position): Unit = {
super.test(testName, testTags: _*) {
assume(isBindSuccessful, "Mini Yarn cluster should be able to bind.")
testFun
}
}

override def beforeAll(): Unit = {
super.beforeAll()

Expand All @@ -79,9 +89,16 @@ abstract class BaseYarnClusterSuite
yarnConf.set("yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage",
"100.0")

yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1)
yarnCluster.init(yarnConf)
yarnCluster.start()
try {
yarnCluster = new MiniYARNCluster(getClass().getName(), 1, 1, 1)
yarnCluster.init(yarnConf)
yarnCluster.start()
} catch {
case e: Throwable if org.apache.commons.lang3.exception.ExceptionUtils.indexOfThrowable(
e, classOf[java.net.BindException]) != -1 =>
isBindSuccessful = false
return
}

// There's a race in MiniYARNCluster in which start() may return before the RM has updated
// its address in the configuration. You can see this in the logs by noticing that when
Expand Down Expand Up @@ -117,7 +134,7 @@ abstract class BaseYarnClusterSuite

override def afterAll(): Unit = {
try {
yarnCluster.stop()
if (yarnCluster != null) yarnCluster.stop()
} finally {
super.afterAll()
}
Expand Down

0 comments on commit 34d4da5

Please sign in to comment.