From 8abc83f41b750978c8746e47553648b518af1e22 Mon Sep 17 00:00:00 2001 From: Dan King Date: Thu, 30 Nov 2023 13:20:02 -0500 Subject: [PATCH] [query] fix #13998 CHANGELOG: Fix #13998 which appeared in 0.2.58 and prevented reading from a networked filesystem mounted within the filesystem of the worker node for certain pipelines (those that did not trigger "lowering"). We use the IndexReader in `PartitionNativeIntervalReader`, `PartitionNativeReaderIndexed`, and `PartitionZippedIndexedNativeReader`. 1. `PartitionNativeIntervalReader` is only used by `query_table`. 2. `PartitionNativeReaderIndexed` is only used by `IndexedRVDSpec2.readTableStage` which is used by `TableNativeReader` when there is a new partitioner. 3. `PartitionZippedIndexedNativeReader` is only sued by `AbstractRVDSpec.readZippedLowered` when there is a new partitioner. Two is for tables, three is for matrix tables. In `readZippedLowered` we explicitly [drop the file protocol](https://github.com/hail-is/hail/blob/1dedf3c63f9aabf1b6ce538165360056f82f76e4/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala#L154-L155): ``` val absPathLeft = removeFileProtocol(pathLeft) val absPathRight = removeFileProtocol(pathRight) ``` We have done this, by various names, since this lowered code path was added. I added `removeFileProtocol` because stripping the protocol in Query-on-Batch prevented the reading and writing of gs:// URIs, the only URIs I could read in QoB. `uriPath` (the function whose use I replaced with `removeFileProtocol`) was added by Cotton [a very long time ago](https://github.com/hail-is/hail/commit/92a9936e11d2f56b88390bee5dc4de489e188f02). It seems he added it so that he could use HDFS to generate a temporary file path on the local filesystem but pass the file path to binary tools that know nothing of HDFS and file:// URIs. [`readIndexedPartitions`](https://github.com/hail-is/hail/blob/2b0aded9206849252b453dd80710cea8d2156793/hail/src/main/scala/is/hail/HailContext.scala#L421-L440) which *does not* strip any protocols from the path. This has gone undetected because we never try to read data through the OS's filesystem. We always use gs://, Azure, or s3:// because we do not test in environments that have a networked file system mounted in the OS's filesystem. To replicate this bug (and add a test for it), we would need a cluster with a lustre file system (or another networked filesystem). This would be a fairly large lift. The fix is trivial: just never intentionally strip the protocol! --- hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala | 6 +++--- hail/src/main/scala/is/hail/utils/package.scala | 9 --------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala b/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala index b95670552c3..384784a8785 100644 --- a/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala +++ b/hail/src/main/scala/is/hail/rvd/AbstractRVDSpec.scala @@ -151,8 +151,8 @@ object AbstractRVDSpec { indexSpecLeft, indexSpecRight, specLeft.key, uidFieldName) - val absPathLeft = removeFileProtocol(pathLeft) - val absPathRight = removeFileProtocol(pathRight) + val absPathLeft = pathLeft + val absPathRight = pathRight val partsAndIntervals: IndexedSeq[(String, Interval)] = if (specLeft.key.isEmpty) { specLeft.partFiles.map { p => (p, null) } } else { @@ -438,7 +438,7 @@ case class IndexedRVDSpec2( val rSpec = typedCodecSpec val reader = ir.PartitionNativeReaderIndexed(rSpec, indexSpec, part.kType.fieldNames, uidFieldName) - val absPath = removeFileProtocol(path) + val absPath = path val partPaths = tmpPartitioner.rangeBounds.map { b => partFiles(part.lowerBoundInterval(b)) } diff --git a/hail/src/main/scala/is/hail/utils/package.scala b/hail/src/main/scala/is/hail/utils/package.scala index b1d4c61518a..ade36e71226 100644 --- a/hail/src/main/scala/is/hail/utils/package.scala +++ b/hail/src/main/scala/is/hail/utils/package.scala @@ -351,15 +351,6 @@ package object utils extends Logging def uriPath(uri: String): String = new URI(uri).getPath - def removeFileProtocol(uriString: String): String = { - val uri = new URI(uriString) - if (uri.getScheme == "file") { - uri.getPath - } else { - uri.toString - } - } - // NB: can't use Nothing here because it is not a super type of Null private object flattenOrNullInstance extends FlattenOrNull[Array]