From a8a26010f372fdefa9209362478b1267f973e522 Mon Sep 17 00:00:00 2001 From: Oliver <20188437+olivergrabinski@users.noreply.github.com> Date: Wed, 22 May 2024 10:23:38 +0200 Subject: [PATCH] Support for s3 multipart checksums (#4991) --- .../resources/contexts/indexing-metadata.json | 1 + .../resources/defaults/default-mapping.json | 3 ++ .../plugins/storage/files/model/Digest.scala | 20 +++++++-- .../storages/operations/s3/HeadObject.scala | 21 ++++++--- .../operations/s3/HeadObjectSuite.scala | 44 +++++++++++++++++++ 5 files changed, 80 insertions(+), 9 deletions(-) create mode 100644 delta/plugins/storage/src/test/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/storages/operations/s3/HeadObjectSuite.scala diff --git a/delta/plugins/elasticsearch/src/main/resources/contexts/indexing-metadata.json b/delta/plugins/elasticsearch/src/main/resources/contexts/indexing-metadata.json index 32a8255fc7..60fcaa72d0 100644 --- a/delta/plugins/elasticsearch/src/main/resources/contexts/indexing-metadata.json +++ b/delta/plugins/elasticsearch/src/main/resources/contexts/indexing-metadata.json @@ -7,6 +7,7 @@ "@type": "@id" }, "_digest": "https://bluebrain.github.io/nexus/vocabulary/digest", + "_numberOfParts": "https://bluebrain.github.io/nexus/vocabulary/numberOfParts", "_bytes": "https://bluebrain.github.io/nexus/vocabulary/bytes", "_value": "https://bluebrain.github.io/nexus/vocabulary/value", "_rev": "https://bluebrain.github.io/nexus/vocabulary/rev", diff --git a/delta/plugins/elasticsearch/src/main/resources/defaults/default-mapping.json b/delta/plugins/elasticsearch/src/main/resources/defaults/default-mapping.json index add1b374df..197cf61db7 100644 --- a/delta/plugins/elasticsearch/src/main/resources/defaults/default-mapping.json +++ b/delta/plugins/elasticsearch/src/main/resources/defaults/default-mapping.json @@ -179,6 +179,9 @@ }, "_value": { "type": "keyword" + }, + "_numberOfParts": { + "type": "long" } } }, diff --git 
a/delta/plugins/storage/src/main/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/files/model/Digest.scala b/delta/plugins/storage/src/main/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/files/model/Digest.scala
index f92f410245..cd10fe0dfc 100644
--- a/delta/plugins/storage/src/main/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/files/model/Digest.scala
+++ b/delta/plugins/storage/src/main/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/files/model/Digest.scala
@@ -23,6 +23,18 @@ object Digest {
     */
   final case class ComputedDigest(algorithm: DigestAlgorithm, value: String) extends Digest
 
+  /**
+    * The digest S3 provides for a file that was uploaded in several parts.
+    *
+    * @param algorithm
+    *   the algorithm the digest was computed with
+    * @param value
+    *   the value of the digest
+    * @param numberOfParts
+    *   how many parts the digest was computed from
+    */
+  final case class MultiPartDigest(algorithm: DigestAlgorithm, value: String, numberOfParts: Int) extends Digest
+
   /**
     * A digest that does not yield a value because it is still being computed
     */
@@ -36,8 +48,10 @@ object Digest {
   final case object NotComputedDigest extends Digest
 
   implicit val digestEncoder: Encoder.AsObject[Digest] = Encoder.encodeJsonObject.contramapObject {
-    case ComputedDigest(algorithm, value) => JsonObject("_algorithm" -> algorithm.asJson, "_value" -> value.asJson)
-    case NotComputedDigest                => JsonObject("_value" -> "".asJson)
-    case NoDigest                         => JsonObject("_value" -> "".asJson)
+    // Digests without a value are encoded with an empty `_value` and no `_algorithm`
+    case NotComputedDigest | NoDigest               => JsonObject("_value" -> "".asJson)
+    case ComputedDigest(algo, checksum)             => JsonObject("_algorithm" -> algo.asJson, "_value" -> checksum.asJson)
+    case MultiPartDigest(algo, checksum, partCount) =>
+      JsonObject("_algorithm" -> algo.asJson, "_value" -> checksum.asJson, "_numberOfParts" -> partCount.asJson)
   }
 }
diff --git
a/delta/plugins/storage/src/main/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/storages/operations/s3/HeadObject.scala b/delta/plugins/storage/src/main/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/storages/operations/s3/HeadObject.scala index 10d0671ac7..c8e613e3d4 100644 --- a/delta/plugins/storage/src/main/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/storages/operations/s3/HeadObject.scala +++ b/delta/plugins/storage/src/main/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/storages/operations/s3/HeadObject.scala @@ -17,12 +17,21 @@ object HeadObject { // It is highly likely for S3 to return an erroneous value here ContentType.parse(value).toOption } - val digestValue = Option(response.checksumSHA256).map { encodedChecksum => - Hex.valueOf(Base64.getDecoder.decode(encodedChecksum)) - } - val digest = digestValue.fold(Digest.none) { value => - ComputedDigest(DigestAlgorithm.SHA256, value) - } + + val digest = Option(response.checksumSHA256()) + .map { encodedChecksum => + val multiPartDigest = """^(.*)-(\d+)$""".r + encodedChecksum match { + case multiPartDigest(value, parts) => + val digestValue = Hex.valueOf(Base64.getDecoder.decode(value)) + Digest.MultiPartDigest(DigestAlgorithm.SHA256, digestValue, parts.toInt) + case _ => + val digestValue = Hex.valueOf(Base64.getDecoder.decode(encodedChecksum)) + ComputedDigest(DigestAlgorithm.SHA256, digestValue) + } + } + .getOrElse(Digest.none) + HeadObject( response.contentLength(), contentType, diff --git a/delta/plugins/storage/src/test/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/storages/operations/s3/HeadObjectSuite.scala b/delta/plugins/storage/src/test/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/storages/operations/s3/HeadObjectSuite.scala new file mode 100644 index 0000000000..177b09ca09 --- /dev/null +++ b/delta/plugins/storage/src/test/scala/ch/epfl/bluebrain/nexus/delta/plugins/storage/storages/operations/s3/HeadObjectSuite.scala @@ -0,0 +1,44 @@ +package 
ch.epfl.bluebrain.nexus.delta.plugins.storage.storages.operations.s3
+
+import ch.epfl.bluebrain.nexus.delta.plugins.storage.files.model.Digest
+import ch.epfl.bluebrain.nexus.delta.plugins.storage.storages.model.DigestAlgorithm
+import ch.epfl.bluebrain.nexus.testkit.mu.NexusSuite
+import software.amazon.awssdk.services.s3.model.HeadObjectResponse
+
+/**
+  * Verifies the [[Digest]] that [[HeadObject]] derives from the SHA256 checksum of a `HeadObjectResponse`.
+  */
+class HeadObjectSuite extends NexusSuite {
+
+  /**
+    * Builds a response that only carries the given SHA256 checksum and returns the digest parsed from it.
+    */
+  private def parsedDigest(encodedChecksum: String) = {
+    val headResponse = HeadObjectResponse.builder().checksumSHA256(encodedChecksum).build()
+    HeadObject(headResponse).digest
+  }
+
+  test("HeadObject should correctly parse a standard S3 SHA256 digest") {
+    // Checksum without a "-<parts>" suffix
+    val plainChecksum = "44ImQwqlEWtD75zMbO3GeJCOj4oO2lMb+VW6l6zJ3sc="
+    val expected      = Digest.ComputedDigest(
+      DigestAlgorithm.SHA256,
+      "e38226430aa5116b43ef9ccc6cedc678908e8f8a0eda531bf955ba97acc9dec7"
+    )
+
+    assertEquals(parsedDigest(plainChecksum), expected)
+  }
+
+  test("HeadObject should correctly parse a multipart S3 SHA256 digest") {
+    // Checksum followed by "-13", the number of parts expected below
+    val multiPartChecksum = "kFsM2p15+Jbp2K0FIF0y1zIWlEJOt5052qlU8IRQPtM=-13"
+    val expected          = Digest.MultiPartDigest(
+      DigestAlgorithm.SHA256,
+      "905b0cda9d79f896e9d8ad05205d32d7321694424eb79d39daa954f084503ed3",
+      13
+    )
+
+    assertEquals(parsedDigest(multiPartChecksum), expected)
+  }
+
+}