Skip to content

Commit

Permalink
[SPARK-32381][CORE][FOLLOWUP][TEST-HADOOP2.7] Don't remove Serializab…
Browse files Browse the repository at this point in the history
…leFileStatus and SerializableBlockLocation for Hadoop 2.7

### What changes were proposed in this pull request?

Revert the change in #29959 and don't remove `SerializableFileStatus` and `SerializableBlockLocation`.

### Why are the changes needed?

In Hadoop 2.7 `FileStatus` and `BlockLocation` are not serializable, so we still need the two wrapper classes.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

N/A

Closes #30447 from sunchao/SPARK-32381-followup.

Authored-by: Chao Sun <sunchao@apple.com>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
  • Loading branch information
sunchao authored and dongjoon-hyun committed Nov 21, 2020
1 parent 530c0a8 commit b623c03
Showing 1 changed file with 60 additions and 1 deletion.
61 changes: 60 additions & 1 deletion core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -136,12 +136,53 @@ private[spark] object HadoopFSUtils extends Logging {
parallelismMax = 0)
(path, leafFiles)
}.iterator
}.map { case (path, statuses) =>
val serializableStatuses = statuses.map { status =>
// Turn FileStatus into SerializableFileStatus so we can send it back to the driver
val blockLocations = status match {
case f: LocatedFileStatus =>
f.getBlockLocations.map { loc =>
SerializableBlockLocation(
loc.getNames,
loc.getHosts,
loc.getOffset,
loc.getLength)
}

case _ =>
Array.empty[SerializableBlockLocation]
}

SerializableFileStatus(
status.getPath.toString,
status.getLen,
status.isDirectory,
status.getReplication,
status.getBlockSize,
status.getModificationTime,
status.getAccessTime,
blockLocations)
}
(path.toString, serializableStatuses)
}.collect()
} finally {
sc.setJobDescription(previousJobDescription)
}

statusMap.toSeq
// turn SerializableFileStatus back to Status
statusMap.map { case (path, serializableStatuses) =>
val statuses = serializableStatuses.map { f =>
val blockLocations = f.blockLocations.map { loc =>
new BlockLocation(loc.names, loc.hosts, loc.offset, loc.length)
}
new LocatedFileStatus(
new FileStatus(
f.length, f.isDir, f.blockReplication, f.blockSize, f.modificationTime,
new Path(f.path)),
blockLocations)
}
(new Path(path), statuses)
}
}

// scalastyle:off argcount
Expand Down Expand Up @@ -291,4 +332,22 @@ private[spark] object HadoopFSUtils extends Logging {
resolvedLeafStatuses
}
// scalastyle:on argcount

/** A serializable variant of HDFS's BlockLocation. This is required by Hadoop 2.7. */
private case class SerializableBlockLocation(
names: Array[String],
hosts: Array[String],
offset: Long,
length: Long)

/** A serializable variant of HDFS's FileStatus. This is required by Hadoop 2.7. */
private case class SerializableFileStatus(
path: String,
length: Long,
isDir: Boolean,
blockReplication: Short,
blockSize: Long,
modificationTime: Long,
accessTime: Long,
blockLocations: Array[SerializableBlockLocation])
}

0 comments on commit b623c03

Please sign in to comment.