diff --git a/polaris-core/src/main/java/org/apache/polaris/core/entity/AsyncTaskType.java b/polaris-core/src/main/java/org/apache/polaris/core/entity/AsyncTaskType.java index 32c478e86..6e3ce9ae5 100644 --- a/polaris-core/src/main/java/org/apache/polaris/core/entity/AsyncTaskType.java +++ b/polaris-core/src/main/java/org/apache/polaris/core/entity/AsyncTaskType.java @@ -23,7 +23,8 @@ public enum AsyncTaskType { ENTITY_CLEANUP_SCHEDULER(1), - FILE_CLEANUP(2); + MANIFEST_FILE_CLEANUP(2), + METADATA_FILE_BATCH_CLEANUP(3); private final int typeCode; diff --git a/polaris-core/src/testFixtures/java/org/apache/polaris/core/persistence/BasePolarisMetaStoreManagerTest.java b/polaris-core/src/testFixtures/java/org/apache/polaris/core/persistence/BasePolarisMetaStoreManagerTest.java index bf97e4128..1b5483afe 100644 --- a/polaris-core/src/testFixtures/java/org/apache/polaris/core/persistence/BasePolarisMetaStoreManagerTest.java +++ b/polaris-core/src/testFixtures/java/org/apache/polaris/core/persistence/BasePolarisMetaStoreManagerTest.java @@ -240,7 +240,7 @@ private static TaskEntity createTask(String taskName, long id) { .setName(taskName) .withData("data") .setId(id) - .withTaskType(AsyncTaskType.FILE_CLEANUP) + .withTaskType(AsyncTaskType.MANIFEST_FILE_CLEANUP) .setCreateTimestamp(Instant.now().toEpochMilli()) .build(); } diff --git a/polaris-service/src/main/java/org/apache/polaris/service/task/ManifestFileCleanupTaskHandler.java b/polaris-service/src/main/java/org/apache/polaris/service/task/ManifestFileCleanupTaskHandler.java index e17e24fd7..441a56b27 100644 --- a/polaris-service/src/main/java/org/apache/polaris/service/task/ManifestFileCleanupTaskHandler.java +++ b/polaris-service/src/main/java/org/apache/polaris/service/task/ManifestFileCleanupTaskHandler.java @@ -42,10 +42,13 @@ import org.slf4j.LoggerFactory; /** - * {@link TaskHandler} responsible for deleting all of the files in a manifest and the manifest - * itself. Since data files may be present in multiple manifests across different snapshots, we - * assume a data file that doesn't exist is missing because it was already deleted by another task. + * {@link TaskHandler} responsible for deleting table files: 1. Manifest files: It contains all the + * files in a manifest and the manifest itself. Since data files may be present in multiple + * manifests across different snapshots, we assume a data file that doesn't exist is missing because + * it was already deleted by another task. 2. Table metadata files: It contains previous metadata + * and statistics files, which are grouped and deleted in batch */ +// TODO: Rename this class since we introducing metadata cleanup here public class ManifestFileCleanupTaskHandler implements TaskHandler { public static final int MAX_ATTEMPTS = 3; public static final int FILE_DELETION_RETRY_MILLIS = 100; @@ -62,66 +65,119 @@ public ManifestFileCleanupTaskHandler( @Override public boolean canHandleTask(TaskEntity task) { - return task.getTaskType() == AsyncTaskType.FILE_CLEANUP; + return task.getTaskType() == AsyncTaskType.MANIFEST_FILE_CLEANUP + || task.getTaskType() == AsyncTaskType.METADATA_FILE_BATCH_CLEANUP; } @Override public boolean handleTask(TaskEntity task) { ManifestCleanupTask cleanupTask = task.readData(ManifestCleanupTask.class); - ManifestFile manifestFile = decodeManifestData(cleanupTask.getManifestFileData()); TableIdentifier tableId = cleanupTask.getTableId(); try (FileIO authorizedFileIO = fileIOSupplier.apply(task)) { - - // if the file doesn't exist, we assume that another task execution was successful, but failed - // to drop the task entity. Log a warning and return success - if (!TaskUtils.exists(manifestFile.path(), authorizedFileIO)) { + if (task.getTaskType() == AsyncTaskType.MANIFEST_FILE_CLEANUP) { + ManifestFile manifestFile = decodeManifestData(cleanupTask.getManifestFileData()); + return cleanUpManifestFile(manifestFile, authorizedFileIO, tableId); + } else if (task.getTaskType() == AsyncTaskType.METADATA_FILE_BATCH_CLEANUP) { + return cleanUpMetadataFiles(cleanupTask.getMetadataFiles(), authorizedFileIO, tableId); + } else { LOGGER .atWarn() - .addKeyValue("manifestFile", manifestFile.path()) .addKeyValue("tableId", tableId) - .log("Manifest cleanup task scheduled, but manifest file doesn't exist"); - return true; - } - - ManifestReader dataFiles = ManifestFiles.read(manifestFile, authorizedFileIO); - List> dataFileDeletes = - StreamSupport.stream( - Spliterators.spliteratorUnknownSize(dataFiles.iterator(), Spliterator.IMMUTABLE), - false) - .map( - file -> - tryDelete( - tableId, authorizedFileIO, manifestFile, file.path().toString(), null, 1)) - .toList(); - LOGGER.debug( - "Scheduled {} data files to be deleted from manifest {}", - dataFileDeletes.size(), - manifestFile.path()); - try { - // wait for all data files to be deleted, then wait for the manifest itself to be deleted - CompletableFuture.allOf(dataFileDeletes.toArray(CompletableFuture[]::new)) - .thenCompose( - (v) -> { - LOGGER - .atInfo() - .addKeyValue("manifestFile", manifestFile.path()) - .log("All data files in manifest deleted - deleting manifest"); - return tryDelete( - tableId, authorizedFileIO, manifestFile, manifestFile.path(), null, 1); - }) - .get(); - return true; - } catch (InterruptedException e) { - LOGGER.error( - "Interrupted exception deleting data files from manifest {}", manifestFile.path(), e); - throw new RuntimeException(e); - } catch (ExecutionException e) { - LOGGER.error("Unable to delete data files from manifest {}", manifestFile.path(), e); + .log("Unknown task type {}", task.getTaskType()); return false; } } } + private boolean cleanUpManifestFile( + ManifestFile manifestFile, FileIO fileIO, TableIdentifier tableId) { + // if the file doesn't exist, we assume that another task execution was successful, but + // failed to drop the task entity. Log a warning and return success + if (!TaskUtils.exists(manifestFile.path(), fileIO)) { + LOGGER + .atWarn() + .addKeyValue("manifestFile", manifestFile.path()) + .addKeyValue("tableId", tableId) + .log("Manifest cleanup task scheduled, but manifest file doesn't exist"); + return true; + } + + ManifestReader dataFiles = ManifestFiles.read(manifestFile, fileIO); + List> dataFileDeletes = + StreamSupport.stream( + Spliterators.spliteratorUnknownSize(dataFiles.iterator(), Spliterator.IMMUTABLE), + false) + .map(file -> tryDelete(tableId, fileIO, manifestFile, file.path().toString(), null, 1)) + .toList(); + LOGGER.debug( + "Scheduled {} data files to be deleted from manifest {}", + dataFileDeletes.size(), + manifestFile.path()); + try { + // wait for all data files to be deleted, then wait for the manifest itself to be deleted + CompletableFuture.allOf(dataFileDeletes.toArray(CompletableFuture[]::new)) + .thenCompose( + (v) -> { + LOGGER + .atInfo() + .addKeyValue("manifestFile", manifestFile.path()) + .log("All data files in manifest deleted - deleting manifest"); + return tryDelete(tableId, fileIO, manifestFile, manifestFile.path(), null, 1); + }) + .get(); + return true; + } catch (InterruptedException e) { + LOGGER.error( + "Interrupted exception deleting data files from manifest {}", manifestFile.path(), e); + throw new RuntimeException(e); + } catch (ExecutionException e) { + LOGGER.error("Unable to delete data files from manifest {}", manifestFile.path(), e); + return false; + } + } + + private boolean cleanUpMetadataFiles( + List metadataFiles, FileIO fileIO, TableIdentifier tableId) { + List validFiles = + metadataFiles.stream().filter(file -> TaskUtils.exists(file, fileIO)).toList(); + if (validFiles.isEmpty()) { + LOGGER + .atWarn() + .addKeyValue("metadataFiles", metadataFiles.toString()) + .addKeyValue("tableId", tableId) + .log("Table metadata cleanup task scheduled, but the none of the file in batch exists"); + return true; + } + if (validFiles.size() < metadataFiles.size()) { + List missingFiles = + metadataFiles.stream().filter(file -> !TaskUtils.exists(file, fileIO)).toList(); + LOGGER + .atWarn() + .addKeyValue("metadataFiles", metadataFiles.toString()) + .addKeyValue("missingFiles", missingFiles) + .addKeyValue("tableId", tableId) + .log( + "Table metadata cleanup task scheduled, but {} files in the batch are missing", + missingFiles.size()); + } + + // Schedule the deletion for each file asynchronously + List> deleteFutures = + validFiles.stream().map(file -> tryDelete(tableId, fileIO, null, file, null, 1)).toList(); + + try { + // Wait for all delete operations to finish + CompletableFuture allDeletes = + CompletableFuture.allOf(deleteFutures.toArray(new CompletableFuture[0])); + allDeletes.join(); + } catch (Exception e) { + LOGGER.error("Exception detected during metadata file deletion", e); + return false; + } + + return true; + } + private static ManifestFile decodeManifestData(String manifestFileData) { try { return ManifestFiles.decode(Base64.decodeBase64(manifestFileData)); @@ -134,16 +190,16 @@ private CompletableFuture tryDelete( TableIdentifier tableId, FileIO fileIO, ManifestFile manifestFile, - String dataFile, + String file, Throwable e, int attempt) { if (e != null && attempt <= MAX_ATTEMPTS) { LOGGER .atWarn() - .addKeyValue("dataFile", dataFile) + .addKeyValue("file", file) .addKeyValue("attempt", attempt) .addKeyValue("error", e.getMessage()) - .log("Error encountered attempting to delete data file"); + .log("Error encountered attempting to delete file"); } if (attempt > MAX_ATTEMPTS && e != null) { return CompletableFuture.failedFuture(e); @@ -155,15 +211,15 @@ private CompletableFuture tryDelete( // file's existence, but then it is deleted before we have a chance to // send the delete request. In such a case, we should retry // and find - if (TaskUtils.exists(dataFile, fileIO)) { - fileIO.deleteFile(dataFile); + if (TaskUtils.exists(file, fileIO)) { + fileIO.deleteFile(file); } else { LOGGER .atInfo() - .addKeyValue("dataFile", dataFile) - .addKeyValue("manifestFile", manifestFile.path()) + .addKeyValue("file", file) + .addKeyValue("manifestFile", manifestFile != null ? manifestFile.path() : "") .addKeyValue("tableId", tableId) - .log("Manifest cleanup task scheduled, but data file doesn't exist"); + .log("table file cleanup task scheduled, but data file doesn't exist"); } }, executorService) @@ -171,11 +227,11 @@ private CompletableFuture tryDelete( newEx -> { LOGGER .atWarn() - .addKeyValue("dataFile", dataFile) - .addKeyValue("tableIdentifer", tableId) - .addKeyValue("manifestFile", manifestFile.path()) + .addKeyValue("dataFile", file) + .addKeyValue("tableIdentifier", tableId) + .addKeyValue("manifestFile", manifestFile != null ? manifestFile.path() : "") .log("Exception caught deleting data file from manifest", newEx); - return tryDelete(tableId, fileIO, manifestFile, dataFile, newEx, attempt + 1); + return tryDelete(tableId, fileIO, manifestFile, file, newEx, attempt + 1); }, CompletableFuture.delayedExecutor( FILE_DELETION_RETRY_MILLIS, TimeUnit.MILLISECONDS, executorService)); @@ -185,12 +241,18 @@ private CompletableFuture tryDelete( public static final class ManifestCleanupTask { private TableIdentifier tableId; private String manifestFileData; + private List metadataFiles; public ManifestCleanupTask(TableIdentifier tableId, String manifestFileData) { this.tableId = tableId; this.manifestFileData = manifestFileData; } + public ManifestCleanupTask(TableIdentifier tableId, List metadataFiles) { + this.tableId = tableId; + this.metadataFiles = metadataFiles; + } + public ManifestCleanupTask() {} public TableIdentifier getTableId() { @@ -209,17 +271,26 @@ public void setManifestFileData(String manifestFileData) { this.manifestFileData = manifestFileData; } + public List getMetadataFiles() { + return metadataFiles; + } + + public void setMetadataFiles(List metadataFiles) { + this.metadataFiles = metadataFiles; + } + @Override public boolean equals(Object object) { if (this == object) return true; if (!(object instanceof ManifestCleanupTask that)) return false; return Objects.equals(tableId, that.tableId) - && Objects.equals(manifestFileData, that.manifestFileData); + && Objects.equals(manifestFileData, that.manifestFileData) + && Objects.equals(metadataFiles, that.metadataFiles); } @Override public int hashCode() { - return Objects.hash(tableId, manifestFileData); + return Objects.hash(tableId, manifestFileData, metadataFiles); } } } diff --git a/polaris-service/src/main/java/org/apache/polaris/service/task/TableCleanupTaskHandler.java b/polaris-service/src/main/java/org/apache/polaris/service/task/TableCleanupTaskHandler.java index 7f323174b..cc722bbb5 100644 --- a/polaris-service/src/main/java/org/apache/polaris/service/task/TableCleanupTaskHandler.java +++ b/polaris-service/src/main/java/org/apache/polaris/service/task/TableCleanupTaskHandler.java @@ -18,11 +18,14 @@ */ package org.apache.polaris.service.task; +import java.util.ArrayList; import java.util.List; import java.util.UUID; import java.util.function.Function; import java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.iceberg.ManifestFile; +import org.apache.iceberg.StatisticsFile; import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableMetadataParser; import org.apache.iceberg.io.FileIO; @@ -49,6 +52,7 @@ public class TableCleanupTaskHandler implements TaskHandler { private final TaskExecutor taskExecutor; private final MetaStoreManagerFactory metaStoreManagerFactory; private final Function fileIOSupplier; + private static final String BATCH_SIZE_CONFIG_KEY = "TABLE_METADATA_CLEANUP_BATCH_SIZE"; public TableCleanupTaskHandler( TaskExecutor taskExecutor, @@ -99,51 +103,28 @@ public boolean handleTask(TaskEntity cleanupTask) { TableMetadata tableMetadata = TableMetadataParser.read(fileIO, tableEntity.getMetadataLocation()); - // read the manifest list for each snapshot. dedupe the manifest files and schedule a - // cleanupTask - // for each manifest file and its data files to be deleted + Stream manifestCleanupTasks = + getManifestTaskStream( + cleanupTask, + tableMetadata, + fileIO, + tableEntity, + metaStoreManager, + polarisCallContext); + + // TODO: handle partition statistics files + Stream metadataFileCleanupTasks = + getMetadataTaskStream( + cleanupTask, + tableMetadata, + fileIO, + tableEntity, + metaStoreManager, + polarisCallContext); + List taskEntities = - tableMetadata.snapshots().stream() - .flatMap(sn -> sn.allManifests(fileIO).stream()) - // distinct by manifest path, since multiple snapshots will contain the same - // manifest - .collect(Collectors.toMap(ManifestFile::path, Function.identity(), (mf1, mf2) -> mf1)) - .values() - .stream() - .filter(mf -> TaskUtils.exists(mf.path(), fileIO)) - .map( - mf -> { - // append a random uuid to the task name to avoid any potential conflict - // when - // storing the task entity. It's better to have duplicate tasks than to risk - // not storing the rest of the task entities. If a duplicate deletion task - // is - // queued, it will check for the manifest file's existence and simply exit - // if - // the task has already been handled. - String taskName = - cleanupTask.getName() + "_" + mf.path() + "_" + UUID.randomUUID(); - LOGGER - .atDebug() - .addKeyValue("taskName", taskName) - .addKeyValue("tableIdentifier", tableEntity.getTableIdentifier()) - .addKeyValue("metadataLocation", tableEntity.getMetadataLocation()) - .addKeyValue("manifestFile", mf.path()) - .log("Queueing task to delete manifest file"); - return new TaskEntity.Builder() - .setName(taskName) - .setId(metaStoreManager.generateNewEntityId(polarisCallContext).getId()) - .setCreateTimestamp(polarisCallContext.getClock().millis()) - .withTaskType(AsyncTaskType.FILE_CLEANUP) - .withData( - new ManifestFileCleanupTaskHandler.ManifestCleanupTask( - tableEntity.getTableIdentifier(), TaskUtils.encodeManifestFile(mf))) - .setId(metaStoreManager.generateNewEntityId(polarisCallContext).getId()) - // copy the internal properties, which will have storage info - .setInternalProperties(cleanupTask.getInternalPropertiesAsMap()) - .build(); - }) - .toList(); + Stream.concat(manifestCleanupTasks, metadataFileCleanupTasks).toList(); + List createdTasks = metaStoreManager .createEntitiesIfNotExist(polarisCallContext, null, taskEntities) @@ -154,10 +135,12 @@ public boolean handleTask(TaskEntity cleanupTask) { .addKeyValue("tableIdentifier", tableEntity.getTableIdentifier()) .addKeyValue("metadataLocation", tableEntity.getMetadataLocation()) .addKeyValue("taskCount", taskEntities.size()) - .log("Successfully queued tasks to delete manifests - deleting table metadata file"); + .log( + "Successfully queued tasks to delete manifests, previous metadata, and statistics files - deleting table metadata file"); for (PolarisBaseEntity createdTask : createdTasks) { taskExecutor.addTaskHandlerContext(createdTask.getId(), CallContext.getCurrentContext()); } + fileIO.deleteFile(tableEntity.getMetadataLocation()); return true; @@ -165,4 +148,109 @@ public boolean handleTask(TaskEntity cleanupTask) { } return false; } + + private Stream getManifestTaskStream( + TaskEntity cleanupTask, + TableMetadata tableMetadata, + FileIO fileIO, + TableLikeEntity tableEntity, + PolarisMetaStoreManager metaStoreManager, + PolarisCallContext polarisCallContext) { + // read the manifest list for each snapshot. dedupe the manifest files and schedule a + // cleanupTask + // for each manifest file and its data files to be deleted + return tableMetadata.snapshots().stream() + .flatMap(sn -> sn.allManifests(fileIO).stream()) + // distinct by manifest path, since multiple snapshots will contain the same + // manifest + .collect(Collectors.toMap(ManifestFile::path, Function.identity(), (mf1, mf2) -> mf1)) + .values() + .stream() + .filter(mf -> TaskUtils.exists(mf.path(), fileIO)) + .map( + mf -> { + // append a random uuid to the task name to avoid any potential conflict + // when + // storing the task entity. It's better to have duplicate tasks than to risk + // not storing the rest of the task entities. If a duplicate deletion task + // is + // queued, it will check for the manifest file's existence and simply exit + // if + // the task has already been handled. + String taskName = cleanupTask.getName() + "_" + mf.path() + "_" + UUID.randomUUID(); + LOGGER + .atDebug() + .addKeyValue("taskName", taskName) + .addKeyValue("tableIdentifier", tableEntity.getTableIdentifier()) + .addKeyValue("metadataLocation", tableEntity.getMetadataLocation()) + .addKeyValue("manifestFile", mf.path()) + .log("Queueing task to delete manifest file"); + return new TaskEntity.Builder() + .setName(taskName) + .setId(metaStoreManager.generateNewEntityId(polarisCallContext).getId()) + .setCreateTimestamp(polarisCallContext.getClock().millis()) + .withTaskType(AsyncTaskType.MANIFEST_FILE_CLEANUP) + .withData( + new ManifestFileCleanupTaskHandler.ManifestCleanupTask( + tableEntity.getTableIdentifier(), TaskUtils.encodeManifestFile(mf))) + .setId(metaStoreManager.generateNewEntityId(polarisCallContext).getId()) + // copy the internal properties, which will have storage info + .setInternalProperties(cleanupTask.getInternalPropertiesAsMap()) + .build(); + }); + } + + private Stream getMetadataTaskStream( + TaskEntity cleanupTask, + TableMetadata tableMetadata, + FileIO fileIO, + TableLikeEntity tableEntity, + PolarisMetaStoreManager metaStoreManager, + PolarisCallContext polarisCallContext) { + int batchSize = + polarisCallContext + .getConfigurationStore() + .getConfiguration(polarisCallContext, BATCH_SIZE_CONFIG_KEY, 10); + return getMetadataFileBatches(tableMetadata, batchSize).stream() + .map( + metadataBatch -> { + String taskName = + String.join( + "_", + cleanupTask.getName(), + metadataBatch.toString(), + UUID.randomUUID().toString()); + LOGGER + .atDebug() + .addKeyValue("taskName", taskName) + .addKeyValue("tableIdentifier", tableEntity.getTableIdentifier()) + .addKeyValue("metadataFiles", metadataBatch.toString()) + .log( + "Queueing task to delete metadata files (prev metadata and statistics files)"); + return new TaskEntity.Builder() + .setName(taskName) + .setId(metaStoreManager.generateNewEntityId(polarisCallContext).getId()) + .setCreateTimestamp(polarisCallContext.getClock().millis()) + .withTaskType(AsyncTaskType.METADATA_FILE_BATCH_CLEANUP) + .withData( + new ManifestFileCleanupTaskHandler.ManifestCleanupTask( + tableEntity.getTableIdentifier(), metadataBatch)) + .setInternalProperties(cleanupTask.getInternalPropertiesAsMap()) + .build(); + }); + } + + private List> getMetadataFileBatches(TableMetadata tableMetadata, int batchSize) { + List> result = new ArrayList<>(); + List metadataFiles = + Stream.concat( + tableMetadata.previousFiles().stream().map(TableMetadata.MetadataLogEntry::file), + tableMetadata.statisticsFiles().stream().map(StatisticsFile::path)) + .toList(); + + for (int i = 0; i < metadataFiles.size(); i += batchSize) { + result.add(metadataFiles.subList(i, Math.min(i + batchSize, metadataFiles.size()))); + } + return result; + } } diff --git a/polaris-service/src/test/java/org/apache/polaris/service/task/ManifestFileCleanupTaskHandlerTest.java b/polaris-service/src/test/java/org/apache/polaris/service/task/ManifestFileCleanupTaskHandlerTest.java index 711e661f2..709c0ad87 100644 --- a/polaris-service/src/test/java/org/apache/polaris/service/task/ManifestFileCleanupTaskHandlerTest.java +++ b/polaris-service/src/test/java/org/apache/polaris/service/task/ManifestFileCleanupTaskHandlerTest.java @@ -19,17 +19,24 @@ package org.apache.polaris.service.task; import static java.nio.charset.StandardCharsets.UTF_8; +import static org.assertj.core.api.Assertions.assertThat; import static org.assertj.core.api.Assertions.assertThatPredicate; import java.io.IOException; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.UUID; +import java.util.concurrent.CompletableFuture; import java.util.concurrent.Executors; import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Stream; import org.apache.commons.codec.binary.Base64; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.ManifestFiles; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.StatisticsFile; +import org.apache.iceberg.TableMetadata; import org.apache.iceberg.catalog.Namespace; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.inmemory.InMemoryFileIO; @@ -75,7 +82,7 @@ public void testCleanupFileNotExists() throws IOException { fileIO.deleteFile(manifestFile.path()); TaskEntity task = new TaskEntity.Builder() - .withTaskType(AsyncTaskType.FILE_CLEANUP) + .withTaskType(AsyncTaskType.MANIFEST_FILE_CLEANUP) .withData( new ManifestFileCleanupTaskHandler.ManifestCleanupTask( tableIdentifier, @@ -105,7 +112,7 @@ public void testCleanupFileManifestExistsDataFilesDontExist() throws IOException fileIO, "manifest1.avro", 100L, "dataFile1.parquet", "dataFile2.parquet"); TaskEntity task = new TaskEntity.Builder() - .withTaskType(AsyncTaskType.FILE_CLEANUP) + .withTaskType(AsyncTaskType.MANIFEST_FILE_CLEANUP) .withData( new ManifestFileCleanupTaskHandler.ManifestCleanupTask( tableIdentifier, @@ -150,7 +157,7 @@ public void close() { TaskTestUtils.manifestFile(fileIO, "manifest1.avro", 100L, dataFile1Path, dataFile2Path); TaskEntity task = new TaskEntity.Builder() - .withTaskType(AsyncTaskType.FILE_CLEANUP) + .withTaskType(AsyncTaskType.MANIFEST_FILE_CLEANUP) .withData( new ManifestFileCleanupTaskHandler.ManifestCleanupTask( tableIdentifier, @@ -213,7 +220,7 @@ public void deleteFile(String location) { TaskTestUtils.manifestFile(fileIO, "manifest1.avro", 100L, dataFile1Path, dataFile2Path); TaskEntity task = new TaskEntity.Builder() - .withTaskType(AsyncTaskType.FILE_CLEANUP) + .withTaskType(AsyncTaskType.MANIFEST_FILE_CLEANUP) .withData( new ManifestFileCleanupTaskHandler.ManifestCleanupTask( tableIdentifier, @@ -226,4 +233,228 @@ public void deleteFile(String location) { assertThatPredicate((String f) -> TaskUtils.exists(f, fileIO)).rejects(dataFile2Path); } } + + @Test + public void testMetadataFileCleanup() throws IOException { + PolarisCallContext polarisCallContext = + new PolarisCallContext( + metaStoreManagerFactory.getOrCreateSessionSupplier(realmContext).get(), + new PolarisDefaultDiagServiceImpl()); + try (CallContext callCtx = CallContext.of(realmContext, polarisCallContext)) { + CallContext.setCurrentContext(callCtx); + FileIO fileIO = + new InMemoryFileIO() { + @Override + public void close() { + // no-op + } + }; + TableIdentifier tableIdentifier = + TableIdentifier.of(Namespace.of("db1", "schema1"), "table1"); + ManifestFileCleanupTaskHandler handler = + new ManifestFileCleanupTaskHandler((task) -> fileIO, Executors.newSingleThreadExecutor()); + + long snapshotId1 = 100L; + ManifestFile manifestFile1 = + TaskTestUtils.manifestFile( + fileIO, "manifest1.avro", snapshotId1, "dataFile1.parquet", "dataFile2.parquet"); + ManifestFile manifestFile2 = + TaskTestUtils.manifestFile( + fileIO, "manifest2.avro", snapshotId1, "dataFile3.parquet", "dataFile4.parquet"); + Snapshot snapshot = + TaskTestUtils.newSnapshot( + fileIO, "manifestList.avro", 1, snapshotId1, 99L, manifestFile1, manifestFile2); + StatisticsFile statisticsFile1 = + TaskTestUtils.writeStatsFile( + snapshot.snapshotId(), + snapshot.sequenceNumber(), + "/metadata/" + UUID.randomUUID() + ".stats", + fileIO); + String firstMetadataFile = "v1-295495059.metadata.json"; + TableMetadata firstMetadata = + TaskTestUtils.writeTableMetadata( + fileIO, firstMetadataFile, List.of(statisticsFile1), snapshot); + assertThat(TaskUtils.exists(firstMetadataFile, fileIO)).isTrue(); + + ManifestFile manifestFile3 = + TaskTestUtils.manifestFile( + fileIO, "manifest3.avro", snapshot.snapshotId() + 1, "dataFile5.parquet"); + Snapshot snapshot2 = + TaskTestUtils.newSnapshot( + fileIO, + "manifestList2.avro", + snapshot.sequenceNumber() + 1, + snapshot.snapshotId() + 1, + snapshot.snapshotId(), + manifestFile1, + manifestFile3); // exclude manifest2 from the new snapshot + StatisticsFile statisticsFile2 = + TaskTestUtils.writeStatsFile( + snapshot2.snapshotId(), + snapshot2.sequenceNumber(), + "/metadata/" + UUID.randomUUID() + ".stats", + fileIO); + String secondMetadataFile = "v1-295495060.metadata.json"; + TableMetadata secondMetadata = + TaskTestUtils.writeTableMetadata( + fileIO, + secondMetadataFile, + firstMetadata, + firstMetadataFile, + List.of(statisticsFile2), + snapshot2); + assertThat(TaskUtils.exists(firstMetadataFile, fileIO)).isTrue(); + assertThat(TaskUtils.exists(secondMetadataFile, fileIO)).isTrue(); + + List cleanupFiles = + Stream.concat( + secondMetadata.previousFiles().stream() + .map(TableMetadata.MetadataLogEntry::file) + .filter(file -> TaskUtils.exists(file, fileIO)), + secondMetadata.statisticsFiles().stream() + .map(StatisticsFile::path) + .filter(file -> TaskUtils.exists(file, fileIO))) + .toList(); + + TaskEntity task = + new TaskEntity.Builder() + .withTaskType(AsyncTaskType.METADATA_FILE_BATCH_CLEANUP) + .withData( + new ManifestFileCleanupTaskHandler.ManifestCleanupTask( + tableIdentifier, cleanupFiles)) + .setName(UUID.randomUUID().toString()) + .build(); + + assertThatPredicate(handler::canHandleTask).accepts(task); + assertThatPredicate(handler::handleTask).accepts(task); + + assertThatPredicate((String file) -> TaskUtils.exists(file, fileIO)) + .rejects(firstMetadataFile); + assertThatPredicate((String file) -> TaskUtils.exists(file, fileIO)) + .rejects(statisticsFile1.path()); + assertThatPredicate((String file) -> TaskUtils.exists(file, fileIO)) + .rejects(statisticsFile2.path()); + } + } + + @Test + public void testMetadataFileCleanupIfFileNotExist() throws IOException { + PolarisCallContext polarisCallContext = + new PolarisCallContext( + metaStoreManagerFactory.getOrCreateSessionSupplier(realmContext).get(), + new PolarisDefaultDiagServiceImpl()); + try (CallContext callCtx = CallContext.of(realmContext, polarisCallContext)) { + CallContext.setCurrentContext(callCtx); + FileIO fileIO = new InMemoryFileIO(); + TableIdentifier tableIdentifier = + TableIdentifier.of(Namespace.of("db1", "schema1"), "table1"); + ManifestFileCleanupTaskHandler handler = + new ManifestFileCleanupTaskHandler((task) -> fileIO, Executors.newSingleThreadExecutor()); + long snapshotId = 100L; + ManifestFile manifestFile = + TaskTestUtils.manifestFile( + fileIO, "manifest1.avro", snapshotId, "dataFile1.parquet", "dataFile2.parquet"); + TestSnapshot snapshot = + TaskTestUtils.newSnapshot(fileIO, "manifestList.avro", 1, snapshotId, 99L, manifestFile); + String metadataFile = "v1-49494949.metadata.json"; + StatisticsFile statisticsFile = + TaskTestUtils.writeStatsFile( + snapshot.snapshotId(), + snapshot.sequenceNumber(), + "/metadata/" + UUID.randomUUID() + ".stats", + fileIO); + TaskTestUtils.writeTableMetadata(fileIO, metadataFile, List.of(statisticsFile), snapshot); + + fileIO.deleteFile(statisticsFile.path()); + assertThat(TaskUtils.exists(statisticsFile.path(), fileIO)).isFalse(); + + TaskEntity task = + new TaskEntity.Builder() + .withTaskType(AsyncTaskType.METADATA_FILE_BATCH_CLEANUP) + .withData( + new ManifestFileCleanupTaskHandler.ManifestCleanupTask( + tableIdentifier, List.of(statisticsFile.path()))) + .setName(UUID.randomUUID().toString()) + .build(); + assertThatPredicate(handler::canHandleTask).accepts(task); + assertThatPredicate(handler::handleTask).accepts(task); + } + } + + @Test + public void testCleanupWithRetries() throws IOException { + PolarisCallContext polarisCallContext = + new PolarisCallContext( + metaStoreManagerFactory.getOrCreateSessionSupplier(realmContext).get(), + new PolarisDefaultDiagServiceImpl()); + try (CallContext callCtx = CallContext.of(realmContext, polarisCallContext)) { + CallContext.setCurrentContext(callCtx); + Map retryCounter = new HashMap<>(); + FileIO fileIO = + new InMemoryFileIO() { + @Override + public void close() { + // no-op + } + + @Override + public void deleteFile(String location) { + int attempts = + retryCounter + .computeIfAbsent(location, k -> new AtomicInteger(0)) + .incrementAndGet(); + if (attempts < 3) { + throw new RuntimeException("Simulating failure to test retries"); + } else { + super.deleteFile(location); + } + } + }; + TableIdentifier tableIdentifier = + TableIdentifier.of(Namespace.of("db1", "schema1"), "table1"); + ManifestFileCleanupTaskHandler handler = + new ManifestFileCleanupTaskHandler((task) -> fileIO, Executors.newSingleThreadExecutor()); + long snapshotId = 100L; + ManifestFile manifestFile = + TaskTestUtils.manifestFile( + fileIO, "manifest1.avro", snapshotId, "dataFile1.parquet", "dataFile2.parquet"); + TestSnapshot snapshot = + TaskTestUtils.newSnapshot(fileIO, "manifestList.avro", 1, snapshotId, 99L, manifestFile); + String metadataFile = "v1-49494949.metadata.json"; + StatisticsFile statisticsFile = + TaskTestUtils.writeStatsFile( + snapshot.snapshotId(), + snapshot.sequenceNumber(), + "/metadata/" + UUID.randomUUID() + ".stats", + fileIO); + TaskTestUtils.writeTableMetadata(fileIO, metadataFile, List.of(statisticsFile), snapshot); + assertThat(TaskUtils.exists(statisticsFile.path(), fileIO)).isTrue(); + + TaskEntity task = + new TaskEntity.Builder() + .withTaskType(AsyncTaskType.METADATA_FILE_BATCH_CLEANUP) + .withData( + new ManifestFileCleanupTaskHandler.ManifestCleanupTask( + tableIdentifier, List.of(statisticsFile.path()))) + .setName(UUID.randomUUID().toString()) + .build(); + + CompletableFuture future = + CompletableFuture.runAsync( + () -> { + assertThatPredicate(handler::canHandleTask).accepts(task); + handler.handleTask(task); // this will schedule the batch deletion + }); + + // Wait for all async tasks to finish + future.join(); + + // Check if the file was successfully deleted after retries + assertThat(TaskUtils.exists(statisticsFile.path(), fileIO)).isFalse(); + + // Ensure that retries happened as expected + assertThat(retryCounter.containsKey(statisticsFile.path())).isTrue(); + assertThat(retryCounter.get(statisticsFile.path()).get()).isEqualTo(3); + } + } } diff --git a/polaris-service/src/test/java/org/apache/polaris/service/task/TableCleanupTaskHandlerTest.java b/polaris-service/src/test/java/org/apache/polaris/service/task/TableCleanupTaskHandlerTest.java index ab9f9324c..383ac029d 100644 --- a/polaris-service/src/test/java/org/apache/polaris/service/task/TableCleanupTaskHandlerTest.java +++ b/polaris-service/src/test/java/org/apache/polaris/service/task/TableCleanupTaskHandlerTest.java @@ -22,10 +22,13 @@ import java.io.IOException; import java.util.List; +import java.util.UUID; import org.apache.commons.codec.binary.Base64; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.Snapshot; +import org.apache.iceberg.StatisticsFile; +import org.apache.iceberg.TableMetadata; import org.apache.iceberg.catalog.Namespace; import org.apache.iceberg.catalog.TableIdentifier; import org.apache.iceberg.inmemory.InMemoryFileIO; @@ -76,7 +79,13 @@ public void testTableCleanup() throws IOException { TestSnapshot snapshot = TaskTestUtils.newSnapshot(fileIO, "manifestList.avro", 1, snapshotId, 99L, manifestFile); String metadataFile = "v1-49494949.metadata.json"; - TaskTestUtils.writeTableMetadata(fileIO, metadataFile, snapshot); + StatisticsFile statisticsFile = + TaskTestUtils.writeStatsFile( + snapshot.snapshotId(), + snapshot.sequenceNumber(), + "/metadata/" + UUID.randomUUID() + ".stats", + fileIO); + TaskTestUtils.writeTableMetadata(fileIO, metadataFile, List.of(statisticsFile), snapshot); TaskEntity task = new TaskEntity.Builder() @@ -97,19 +106,30 @@ public void testTableCleanup() throws IOException { assertThat( metaStoreManagerFactory .getOrCreateMetaStoreManager(realmContext) - .loadTasks(polarisCallContext, "test", 1) + .loadTasks(polarisCallContext, "test", 2) .getEntities()) - .hasSize(1) - .satisfiesExactly( + .hasSize(2) + .satisfiesExactlyInAnyOrder( taskEntity -> assertThat(taskEntity) .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) - .extracting(entity -> TaskEntity.of(entity)) - .returns(AsyncTaskType.FILE_CLEANUP, TaskEntity::getTaskType) + .extracting(TaskEntity::of) + .returns(AsyncTaskType.MANIFEST_FILE_CLEANUP, TaskEntity::getTaskType) .returns( new ManifestFileCleanupTaskHandler.ManifestCleanupTask( tableIdentifier, Base64.encodeBase64String(ManifestFiles.encode(manifestFile))), + entity -> + entity.readData( + ManifestFileCleanupTaskHandler.ManifestCleanupTask.class)), + taskEntity -> + assertThat(taskEntity) + .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) + .extracting(TaskEntity::of) + .returns(AsyncTaskType.METADATA_FILE_BATCH_CLEANUP, TaskEntity::getTaskType) + .returns( + new ManifestFileCleanupTaskHandler.ManifestCleanupTask( + tableIdentifier, List.of(statisticsFile.path())), entity -> entity.readData( ManifestFileCleanupTaskHandler.ManifestCleanupTask.class))); @@ -243,8 +263,8 @@ public void close() { taskEntity -> assertThat(taskEntity) .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) - .extracting(entity -> TaskEntity.of(entity)) - .returns(AsyncTaskType.FILE_CLEANUP, TaskEntity::getTaskType) + .extracting(TaskEntity::of) + .returns(AsyncTaskType.MANIFEST_FILE_CLEANUP, TaskEntity::getTaskType) .returns( new ManifestFileCleanupTaskHandler.ManifestCleanupTask( tableIdentifier, @@ -255,8 +275,8 @@ public void close() { taskEntity -> assertThat(taskEntity) .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) - .extracting(entity -> TaskEntity.of(entity)) - .returns(AsyncTaskType.FILE_CLEANUP, TaskEntity::getTaskType) + .extracting(TaskEntity::of) + .returns(AsyncTaskType.MANIFEST_FILE_CLEANUP, TaskEntity::getTaskType) .returns( new ManifestFileCleanupTaskHandler.ManifestCleanupTask( tableIdentifier, @@ -303,7 +323,20 @@ public void testTableCleanupMultipleSnapshots() throws IOException { manifestFile1, manifestFile3); // exclude manifest2 from the new snapshot String metadataFile = "v1-295495059.metadata.json"; - TaskTestUtils.writeTableMetadata(fileIO, metadataFile, snapshot, snapshot2); + StatisticsFile statisticsFile1 = + TaskTestUtils.writeStatsFile( + snapshot.snapshotId(), + snapshot.sequenceNumber(), + "/metadata/" + UUID.randomUUID() + ".stats", + fileIO); + StatisticsFile statisticsFile2 = + TaskTestUtils.writeStatsFile( + snapshot2.snapshotId(), + snapshot2.sequenceNumber(), + "/metadata/" + UUID.randomUUID() + ".stats", + fileIO); + TaskTestUtils.writeTableMetadata( + fileIO, metadataFile, List.of(statisticsFile1, statisticsFile2), snapshot, snapshot2); TaskEntity task = new TaskEntity.Builder() @@ -318,13 +351,48 @@ public void testTableCleanupMultipleSnapshots() throws IOException { Assertions.assertThatPredicate(handler::canHandleTask).accepts(task); CallContext.setCurrentContext(CallContext.of(realmContext, polarisCallContext)); + handler.handleTask(task); - assertThat( - metaStoreManagerFactory - .getOrCreateMetaStoreManager(realmContext) - .loadTasks(polarisCallContext, "test", 5) - .getEntities()) + List entities = + metaStoreManagerFactory + .getOrCreateMetaStoreManager(realmContext) + .loadTasks(polarisCallContext, "test", 5) + .getEntities(); + + List manifestCleanupTasks = + entities.stream() + .filter( + entity -> { + AsyncTaskType taskType = TaskEntity.of(entity).getTaskType(); + return taskType == AsyncTaskType.MANIFEST_FILE_CLEANUP; + }) + .toList(); + List metadataCleanupTasks = + entities.stream() + .filter( + entity -> { + AsyncTaskType taskType = TaskEntity.of(entity).getTaskType(); + return taskType == AsyncTaskType.METADATA_FILE_BATCH_CLEANUP; + }) + .toList(); + + assertThat(metadataCleanupTasks) + .hasSize(1) + .satisfiesExactlyInAnyOrder( + taskEntity -> + assertThat(taskEntity) + .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) + .extracting(TaskEntity::of) + .returns( + new ManifestFileCleanupTaskHandler.ManifestCleanupTask( + tableIdentifier, + List.of(statisticsFile1.path(), statisticsFile2.path())), + entity -> + entity.readData( + ManifestFileCleanupTaskHandler.ManifestCleanupTask.class))); + + assertThat(manifestCleanupTasks) // all three manifests should be present, even though one is excluded from the latest // snapshot .hasSize(3) @@ -332,7 +400,168 @@ public void testTableCleanupMultipleSnapshots() throws IOException { taskEntity -> assertThat(taskEntity) .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) - .extracting(entity -> TaskEntity.of(entity)) + .extracting(TaskEntity::of) + .returns( + new ManifestFileCleanupTaskHandler.ManifestCleanupTask( + tableIdentifier, + Base64.encodeBase64String(ManifestFiles.encode(manifestFile1))), + entity -> + entity.readData( + ManifestFileCleanupTaskHandler.ManifestCleanupTask.class)), + taskEntity -> + assertThat(taskEntity) + .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) + .extracting(TaskEntity::of) + .returns( + new ManifestFileCleanupTaskHandler.ManifestCleanupTask( + tableIdentifier, + Base64.encodeBase64String(ManifestFiles.encode(manifestFile2))), + entity -> + entity.readData( + ManifestFileCleanupTaskHandler.ManifestCleanupTask.class)), + taskEntity -> + assertThat(taskEntity) + .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) + .extracting(TaskEntity::of) + .returns( + new ManifestFileCleanupTaskHandler.ManifestCleanupTask( + tableIdentifier, + Base64.encodeBase64String(ManifestFiles.encode(manifestFile3))), + entity -> + entity.readData( + ManifestFileCleanupTaskHandler.ManifestCleanupTask.class))); + } + } + + @Test + public void testTableCleanupMultipleMetadata() throws IOException { + PolarisCallContext polarisCallContext = + new PolarisCallContext( + metaStoreManagerFactory.getOrCreateSessionSupplier(realmContext).get(), + new PolarisDefaultDiagServiceImpl()); + try (CallContext callCtx = CallContext.of(realmContext, polarisCallContext)) { + CallContext.setCurrentContext(callCtx); + FileIO fileIO = new InMemoryFileIO(); + TableIdentifier tableIdentifier = + TableIdentifier.of(Namespace.of("db1", "schema1"), "table1"); + TableCleanupTaskHandler handler = + new TableCleanupTaskHandler(Mockito.mock(), metaStoreManagerFactory, (task) -> fileIO); + long snapshotId1 = 100L; + ManifestFile manifestFile1 = + TaskTestUtils.manifestFile( + fileIO, "manifest1.avro", snapshotId1, "dataFile1.parquet", "dataFile2.parquet"); + ManifestFile manifestFile2 = + TaskTestUtils.manifestFile( + fileIO, "manifest2.avro", snapshotId1, "dataFile3.parquet", "dataFile4.parquet"); + Snapshot snapshot = + TaskTestUtils.newSnapshot( + fileIO, "manifestList.avro", 1, snapshotId1, 99L, manifestFile1, manifestFile2); + StatisticsFile statisticsFile1 = + TaskTestUtils.writeStatsFile( + snapshot.snapshotId(), + snapshot.sequenceNumber(), + "/metadata/" + UUID.randomUUID() + ".stats", + fileIO); + String firstMetadataFile = "v1-295495059.metadata.json"; + TableMetadata firstMetadata = + TaskTestUtils.writeTableMetadata( + fileIO, firstMetadataFile, List.of(statisticsFile1), snapshot); + assertThat(TaskUtils.exists(firstMetadataFile, fileIO)).isTrue(); + + ManifestFile manifestFile3 = + TaskTestUtils.manifestFile( + fileIO, "manifest3.avro", snapshot.snapshotId() + 1, "dataFile5.parquet"); + Snapshot snapshot2 = + TaskTestUtils.newSnapshot( + fileIO, + "manifestList2.avro", + snapshot.sequenceNumber() + 1, + snapshot.snapshotId() + 1, + snapshot.snapshotId(), + manifestFile1, + manifestFile3); // exclude manifest2 from the new snapshot + StatisticsFile statisticsFile2 = + TaskTestUtils.writeStatsFile( + snapshot2.snapshotId(), + snapshot2.sequenceNumber(), + "/metadata/" + UUID.randomUUID() + ".stats", + fileIO); + String secondMetadataFile = "v1-295495060.metadata.json"; + TaskTestUtils.writeTableMetadata( + fileIO, + secondMetadataFile, + firstMetadata, + firstMetadataFile, + List.of(statisticsFile2), + snapshot2); + assertThat(TaskUtils.exists(firstMetadataFile, fileIO)).isTrue(); + assertThat(TaskUtils.exists(secondMetadataFile, fileIO)).isTrue(); + + TaskEntity task = + new TaskEntity.Builder() + .withTaskType(AsyncTaskType.ENTITY_CLEANUP_SCHEDULER) + .withData( + new TableLikeEntity.Builder(tableIdentifier, secondMetadataFile) + .setName("table1") + .setCatalogId(1) + .setCreateTimestamp(100) + .build()) + .build(); + + Assertions.assertThatPredicate(handler::canHandleTask).accepts(task); + + CallContext.setCurrentContext(CallContext.of(realmContext, polarisCallContext)); + + handler.handleTask(task); + + List entities = + metaStoreManagerFactory + .getOrCreateMetaStoreManager(realmContext) + .loadTasks(polarisCallContext, "test", 6) + .getEntities(); + + List manifestCleanupTasks = + entities.stream() + .filter( + entity -> { + AsyncTaskType taskType = TaskEntity.of(entity).getTaskType(); + return taskType == AsyncTaskType.MANIFEST_FILE_CLEANUP; + }) + .toList(); + List metadataCleanupTasks = + entities.stream() + .filter( + entity -> { + AsyncTaskType taskType = TaskEntity.of(entity).getTaskType(); + return taskType == AsyncTaskType.METADATA_FILE_BATCH_CLEANUP; + }) + .toList(); + + assertThat(metadataCleanupTasks) + .hasSize(1) + .satisfiesExactlyInAnyOrder( + taskEntity -> + assertThat(taskEntity) + .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) + .extracting(TaskEntity::of) + .returns( + new ManifestFileCleanupTaskHandler.ManifestCleanupTask( + tableIdentifier, + List.of( + firstMetadataFile, + statisticsFile1.path(), + statisticsFile2.path())), + entity -> + entity.readData( + ManifestFileCleanupTaskHandler.ManifestCleanupTask.class))); + + assertThat(manifestCleanupTasks) + .hasSize(3) + .satisfiesExactlyInAnyOrder( + taskEntity -> + assertThat(taskEntity) + .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) + .extracting(TaskEntity::of) .returns( new ManifestFileCleanupTaskHandler.ManifestCleanupTask( tableIdentifier, @@ -343,7 +572,7 @@ public void testTableCleanupMultipleSnapshots() throws IOException { taskEntity -> assertThat(taskEntity) .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) - .extracting(entity -> TaskEntity.of(entity)) + .extracting(TaskEntity::of) .returns( new ManifestFileCleanupTaskHandler.ManifestCleanupTask( tableIdentifier, @@ -354,7 +583,7 @@ public void testTableCleanupMultipleSnapshots() throws IOException { taskEntity -> assertThat(taskEntity) .returns(PolarisEntityType.TASK.getCode(), PolarisBaseEntity::getTypeCode) - .extracting(entity -> TaskEntity.of(entity)) + .extracting(TaskEntity::of) .returns( new ManifestFileCleanupTaskHandler.ManifestCleanupTask( tableIdentifier, diff --git a/polaris-service/src/test/java/org/apache/polaris/service/task/TaskTestUtils.java b/polaris-service/src/test/java/org/apache/polaris/service/task/TaskTestUtils.java index 818f87654..376ca1afa 100644 --- a/polaris-service/src/test/java/org/apache/polaris/service/task/TaskTestUtils.java +++ b/polaris-service/src/test/java/org/apache/polaris/service/task/TaskTestUtils.java @@ -20,6 +20,7 @@ import jakarta.annotation.Nonnull; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.util.Arrays; import java.util.List; @@ -27,6 +28,8 @@ import org.apache.iceberg.DataFile; import org.apache.iceberg.DataFiles; import org.apache.iceberg.FileFormat; +import org.apache.iceberg.GenericBlobMetadata; +import org.apache.iceberg.GenericStatisticsFile; import org.apache.iceberg.ManifestFile; import org.apache.iceberg.ManifestFiles; import org.apache.iceberg.ManifestWriter; @@ -34,12 +37,16 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; import org.apache.iceberg.SortOrder; +import org.apache.iceberg.StatisticsFile; import org.apache.iceberg.TableMetadata; import org.apache.iceberg.TableMetadataParser; import org.apache.iceberg.avro.Avro; import org.apache.iceberg.io.FileAppender; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.io.PositionOutputStream; +import org.apache.iceberg.puffin.Blob; +import org.apache.iceberg.puffin.Puffin; +import org.apache.iceberg.puffin.PuffinWriter; import org.apache.iceberg.types.Types; public class TaskTestUtils { @@ -62,25 +69,55 @@ static ManifestFile manifestFile( return writer.toManifestFile(); } - static void writeTableMetadata(FileIO fileIO, String metadataFile, Snapshot... snapshots) + static TableMetadata writeTableMetadata(FileIO fileIO, String metadataFile, Snapshot... snapshots) throws IOException { - TableMetadata.Builder tmBuidler = - TableMetadata.buildFromEmpty() - .setLocation("path/to/table") - .addSchema( - new Schema( - List.of(Types.NestedField.of(1, false, "field1", Types.StringType.get()))), - 1) - .addSortOrder(SortOrder.unsorted()) - .assignUUID(UUID.randomUUID().toString()) - .addPartitionSpec(PartitionSpec.unpartitioned()); + return writeTableMetadata(fileIO, metadataFile, null, null, null, snapshots); + } + + static TableMetadata writeTableMetadata( + FileIO fileIO, + String metadataFile, + List statisticsFiles, + Snapshot... snapshots) + throws IOException { + return writeTableMetadata(fileIO, metadataFile, null, null, statisticsFiles, snapshots); + } + + static TableMetadata writeTableMetadata( + FileIO fileIO, + String metadataFile, + TableMetadata prevMetadata, + String prevMetadataFile, + List statisticsFiles, + Snapshot... snapshots) + throws IOException { + TableMetadata.Builder tmBuilder; + if (prevMetadata == null) { + tmBuilder = TableMetadata.buildFromEmpty(); + } else { + tmBuilder = TableMetadata.buildFrom(prevMetadata).setPreviousFileLocation(prevMetadataFile); + } + tmBuilder + .setLocation("path/to/table") + .addSchema( + new Schema(List.of(Types.NestedField.of(1, false, "field1", Types.StringType.get()))), + 1) + .addSortOrder(SortOrder.unsorted()) + .assignUUID(UUID.randomUUID().toString()) + .addPartitionSpec(PartitionSpec.unpartitioned()); + + int statisticsFileIndex = 0; for (Snapshot snapshot : snapshots) { - tmBuidler.addSnapshot(snapshot); + tmBuilder.addSnapshot(snapshot); + if (statisticsFiles != null) { + tmBuilder.setStatistics(snapshot.snapshotId(), statisticsFiles.get(statisticsFileIndex++)); + } } - TableMetadata tableMetadata = tmBuidler.build(); + TableMetadata tableMetadata = tmBuilder.build(); PositionOutputStream out = fileIO.newOutputFile(metadataFile).createOrOverwrite(); out.write(TableMetadataParser.toJson(tableMetadata).getBytes(StandardCharsets.UTF_8)); out.close(); + return tableMetadata; } static @Nonnull TestSnapshot newSnapshot( @@ -103,4 +140,26 @@ static void writeTableMetadata(FileIO fileIO, String metadataFile, Snapshot... s new TestSnapshot(sequenceNumber, snapshotId, parentSnapshot, 1L, manifestListLocation); return snapshot; } + + public static StatisticsFile writeStatsFile( + long snapshotId, long snapshotSequenceNumber, String statsLocation, FileIO fileIO) + throws IOException { + try (PuffinWriter puffinWriter = Puffin.write(fileIO.newOutputFile(statsLocation)).build()) { + puffinWriter.add( + new Blob( + "some-blob-type", + List.of(1), + snapshotId, + snapshotSequenceNumber, + ByteBuffer.wrap("blob content".getBytes(StandardCharsets.UTF_8)))); + puffinWriter.finish(); + + return new GenericStatisticsFile( + snapshotId, + statsLocation, + puffinWriter.fileSize(), + puffinWriter.footerSize(), + puffinWriter.writtenBlobsMetadata().stream().map(GenericBlobMetadata::from).toList()); + } + } }