From 97eab7c641a81e0c994f348c9bac71c5481857b0 Mon Sep 17 00:00:00 2001 From: Yu Gan Date: Mon, 14 Dec 2020 18:44:34 +0800 Subject: [PATCH] KE-19992 s3 read retry before accessing FileStatus properties (#191) Co-authored-by: Yu Gan --- .../SpecificParquetRecordReaderBase.java | 2 + .../parquet/ParquetFileFormat.scala | 2 + .../apache/spark/sql/util/S3FileUtils.scala | 39 +++++++++++++++++++ .../spark/sql/hive/HiveStrategies.scala | 3 +- 4 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 sql/core/src/main/scala/org/apache/spark/sql/util/S3FileUtils.scala diff --git a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java index 6264d6341c65a..0b3d755833b15 100644 --- a/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java +++ b/sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/SpecificParquetRecordReaderBase.java @@ -53,6 +53,7 @@ import org.apache.spark.sql.internal.SQLConf; import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.types.StructType$; +import org.apache.spark.sql.util.S3FileUtils; import org.apache.spark.util.AccumulatorV2; /** @@ -149,6 +150,7 @@ protected void initialize(String path, List columns) throws IOException config.setBoolean(SQLConf.PARQUET_INT96_AS_TIMESTAMP().key(), false); this.file = new Path(path); + S3FileUtils.tryOpenClose(config, this.file); long length = this.file.getFileSystem(config).getFileStatus(this.file).getLen(); ParquetReadOptions options = HadoopReadOptions diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index d3ac077ccf4a7..9eb448d8ac687 100644 --- 
a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -50,6 +50,7 @@ import org.apache.spark.sql.execution.vectorized.{OffHeapColumnVector, OnHeapCol import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.S3FileUtils import org.apache.spark.util.{SerializableConfiguration, ThreadUtils} class ParquetFileFormat @@ -264,6 +265,7 @@ class ParquetFileFormat val sharedConf = broadcastedHadoopConf.value.value + S3FileUtils.tryOpenClose(sharedConf, filePath) lazy val footerFileMetaData = ParquetFooterReader.readFooter(sharedConf, filePath, SKIP_ROW_GROUPS).getFileMetaData val datetimeRebaseMode = DataSourceUtils.datetimeRebaseMode( diff --git a/sql/core/src/main/scala/org/apache/spark/sql/util/S3FileUtils.scala b/sql/core/src/main/scala/org/apache/spark/sql/util/S3FileUtils.scala new file mode 100644 index 0000000000000..792bd660ba758 --- /dev/null +++ b/sql/core/src/main/scala/org/apache/spark/sql/util/S3FileUtils.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.util
+
+import java.io.{FileNotFoundException, IOException}
+
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.Path
+
+import org.apache.spark.internal.Logging
+
+object S3FileUtils extends Logging {
+  /** Read retry before FileStatus access: opens and closes `fp` when its scheme is s3*. */
+  @throws(classOf[IOException])
+  def tryOpenClose(conf: Configuration, fp: Path): Unit = {
+    val fs = fp.getFileSystem(conf)
+    // URI scheme is used: the base FileSystem.getScheme throws UnsupportedOperationException.
+    if (Option(fs.getUri.getScheme).exists(_.startsWith("s3"))) {
+      logInfo(s"Try open-close $fp")
+      try fs.open(fp).close() // a directory (e.g. a table location) cannot be opened
+      catch { case _: FileNotFoundException if fs.getFileStatus(fp).isDirectory => () }
+    }
+  }
+}
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
index c8a5c03bdd2d6..19b5c0e54d884 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveStrategies.scala
@@ -21,7 +21,6 @@ import java.io.IOException
 import java.util.Locale
 
 import org.apache.hadoop.fs.{FileSystem, Path}
-
 import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.catalog._
 import org.apache.spark.sql.catalyst.expressions._
@@ -34,6 +33,7 @@ import org.apache.spark.sql.execution.datasources.{CreateTable, DataSourceStrate
 import org.apache.spark.sql.hive.execution._
 import org.apache.spark.sql.hive.execution.HiveScriptTransformationExec
 import org.apache.spark.sql.internal.HiveSerDe
+import org.apache.spark.sql.util.S3FileUtils
 
 
 /**
@@ -123,6 +123,7 @@ class DetermineTableStats(session: SparkSession) extends Rule[LogicalPlan] {
         val hadoopConf = session.sessionState.newHadoopConf()
         val tablePath = new Path(table.location)
         val fs: FileSystem = tablePath.getFileSystem(hadoopConf)
+        S3FileUtils.tryOpenClose(hadoopConf, tablePath)
         fs.getContentSummary(tablePath).getLength
       } catch {
         case e: IOException =>