From 981de86defbd1be41ae47e033c91abc774fc56f2 Mon Sep 17 00:00:00 2001 From: Nikhil Collooru Date: Wed, 10 Jul 2024 15:23:55 -0700 Subject: [PATCH] Use data size for directory listing cache --- presto-hdfs-core/pom.xml | 6 +++ .../facebook/presto/hive/BlockLocation.java | 8 ++++ .../facebook/presto/hive/HiveFileInfo.java | 11 +++++ .../presto/hive/CachingDirectoryLister.java | 46 +++++++++++++++---- .../presto/hive/HiveClientConfig.java | 13 +++--- .../hive/TestBackgroundHiveSplitLoader.java | 9 ++-- .../presto/hive/TestHiveClientConfig.java | 7 +-- 7 files changed, 77 insertions(+), 23 deletions(-) diff --git a/presto-hdfs-core/pom.xml b/presto-hdfs-core/pom.xml index 4af29d95bb4c6..213e14c8ac570 100644 --- a/presto-hdfs-core/pom.xml +++ b/presto-hdfs-core/pom.xml @@ -65,6 +65,12 @@ drift-api + + org.openjdk.jol + jol-core + provided + + org.testng diff --git a/presto-hdfs-core/src/main/java/com/facebook/presto/hive/BlockLocation.java b/presto-hdfs-core/src/main/java/com/facebook/presto/hive/BlockLocation.java index 0743a9ddcdf4a..c3089d8b1fae9 100644 --- a/presto-hdfs-core/src/main/java/com/facebook/presto/hive/BlockLocation.java +++ b/presto-hdfs-core/src/main/java/com/facebook/presto/hive/BlockLocation.java @@ -17,6 +17,7 @@ import com.facebook.drift.annotations.ThriftField; import com.facebook.drift.annotations.ThriftStruct; import com.google.common.collect.ImmutableList; +import org.openjdk.jol.info.ClassLayout; import javax.annotation.Nullable; @@ -30,6 +31,8 @@ @ThriftStruct public class BlockLocation { + private static final long INSTANCE_SIZE = ClassLayout.parseClass(BlockLocation.class).instanceSize(); + private final List hosts; private final long offset; private final long length; @@ -83,6 +86,11 @@ public long getLength() return length; } + public long getRetainedSizeInBytes() + { + return INSTANCE_SIZE + hosts.stream().mapToLong(String::length).reduce(0, Long::sum); + } + @Override public boolean equals(Object o) { diff --git a/presto-hdfs-core/src/main/java/com/facebook/presto/hive/HiveFileInfo.java b/presto-hdfs-core/src/main/java/com/facebook/presto/hive/HiveFileInfo.java index 2c781096467b3..1ec09d5996edb 100644 --- a/presto-hdfs-core/src/main/java/com/facebook/presto/hive/HiveFileInfo.java +++ b/presto-hdfs-core/src/main/java/com/facebook/presto/hive/HiveFileInfo.java @@ -19,6 +19,7 @@ import com.google.common.collect.ImmutableMap; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; +import org.openjdk.jol.info.ClassLayout; import java.io.IOException; import java.util.List; @@ -34,6 +35,8 @@ public class HiveFileInfo implements Comparable { + private static final long INSTANCE_SIZE = ClassLayout.parseClass(HiveFileInfo.class).instanceSize(); + private final String path; private final boolean isDirectory; private final List blockLocations; @@ -130,6 +133,14 @@ public Path getPath() return new Path(path); } + public long getRetainedSizeInBytes() + { + long blockLocationsSizeInBytes = blockLocations.stream().map(BlockLocation::getRetainedSizeInBytes).reduce(0L, Long::sum); + long extraFileInfoSizeInBytes = extraFileInfo.map(bytes -> bytes.length).orElse(0); + long customSplitInfoSizeInBytes = customSplitInfo.entrySet().stream().mapToLong(e -> e.getKey().length() + e.getValue().length()).reduce(0, Long::sum); + return INSTANCE_SIZE + path.length() + blockLocationsSizeInBytes + extraFileInfoSizeInBytes + customSplitInfoSizeInBytes; + } + @Override public boolean equals(Object o) { diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/CachingDirectoryLister.java b/presto-hive/src/main/java/com/facebook/presto/hive/CachingDirectoryLister.java index f9e398280dc67..7090493afe580 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/CachingDirectoryLister.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/CachingDirectoryLister.java @@ -24,8 +24,10 @@ import com.google.common.cache.Weigher; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; +import io.airlift.units.DataSize; import io.airlift.units.Duration; import org.apache.hadoop.fs.Path; +import org.openjdk.jol.info.ClassLayout; import org.weakref.jmx.Managed; import javax.inject.Inject; @@ -46,12 +48,13 @@ import static com.facebook.presto.spi.StandardErrorCode.INVALID_PROCEDURE_ARGUMENT; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static java.lang.Math.toIntExact; import static java.util.Objects.requireNonNull; public class CachingDirectoryLister implements DirectoryLister { - private final Cache> cache; + private final Cache cache; private final CachedTableChecker cachedTableChecker; private final DirectoryLister delegate; @@ -61,16 +64,16 @@ public CachingDirectoryLister(@ForCachingDirectoryLister DirectoryLister delegat this( delegate, hiveClientConfig.getFileStatusCacheExpireAfterWrite(), - hiveClientConfig.getFileStatusCacheMaxSize(), + hiveClientConfig.getFileStatusCacheMaxRetainedSize(), hiveClientConfig.getFileStatusCacheTables()); } - public CachingDirectoryLister(DirectoryLister delegate, Duration expireAfterWrite, long maxSize, List tables) + public CachingDirectoryLister(DirectoryLister delegate, Duration expireAfterWrite, DataSize maxSize, List tables) { this.delegate = requireNonNull(delegate, "delegate is null"); cache = CacheBuilder.newBuilder() - .maximumWeight(maxSize) - .weigher((Weigher>) (key, value) -> value.size()) + .maximumWeight(maxSize.toBytes()) + .weigher((Weigher) (key, value) -> toIntExact(key.length() + value.getRetainedSizeInBytes())) .expireAfterWrite(expireAfterWrite.toMillis(), TimeUnit.MILLISECONDS) .recordStats() .build(); @@ -91,8 +94,9 @@ public Iterator list( if (hiveDirectoryContext.isCacheable()) { // DO NOT USE Caching, when cache is disabled. // This is useful for debugging issues, when cache is explicitly disabled via session property. - List files = cache.getIfPresent(path.toString()); - if (files != null) { + ValueHolder value = Optional.ofNullable(cache.getIfPresent(path.toString())).orElse(null); + if (value != null) { + List files = value.getFiles(); runtimeStats.addMetricValue(DIRECTORY_LISTING_CACHE_HIT, NONE, 1); runtimeStats.addMetricValue(DIRECTORY_LISTING_TIME_NANOS, NANO, System.nanoTime() - startTime); runtimeStats.addMetricValue(FILES_READ_COUNT, NONE, files.size()); @@ -122,7 +126,7 @@ public boolean hasNext() if (!hasNext) { runtimeStats.addMetricValue(FILES_READ_COUNT, NONE, files.size()); if (enableCaching) { - cache.put(path.toString(), ImmutableList.copyOf(files)); + cache.put(path.toString(), new ValueHolder(files)); } } return hasNext; @@ -145,8 +149,8 @@ public void invalidateDirectoryListCache(Optional directoryPath) throw new PrestoException(INVALID_PROCEDURE_ARGUMENT, "Directory path can not be a empty string"); } - List files = cache.getIfPresent(directoryPath.get()); - if (files == null) { + ValueHolder value = cache.getIfPresent(directoryPath.get()); + if (value == null) { throw new PrestoException(INVALID_PROCEDURE_ARGUMENT, "Given directory path is not cached : " + directoryPath); } cache.invalidate(directoryPath.get()); @@ -204,6 +208,28 @@ public long getSize() return cache.size(); } + private static class ValueHolder + { + private static final long INSTANCE_SIZE = ClassLayout.parseClass(ValueHolder.class).instanceSize(); + + private final List files; + + public ValueHolder(List files) + { + this.files = ImmutableList.copyOf(requireNonNull(files, "files is null")); + } + + public List getFiles() + { + return files; + } + + public long getRetainedSizeInBytes() + { + return INSTANCE_SIZE + files.stream().map(HiveFileInfo::getRetainedSizeInBytes).reduce(0L, Long::sum); + } + } + private static class CachedTableChecker { private final Set cachedTableNames; diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java index 0c8eaddda2809..f4d9c7e00a1d4 100644 --- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java +++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientConfig.java @@ -47,6 +47,7 @@ import static com.facebook.presto.hive.HiveStorageFormat.ORC; import static com.google.common.base.Preconditions.checkArgument; import static io.airlift.units.DataSize.Unit.BYTE; +import static io.airlift.units.DataSize.Unit.KILOBYTE; import static io.airlift.units.DataSize.Unit.MEGABYTE; import static java.lang.String.format; import static java.util.Locale.ENGLISH; @@ -163,7 +164,7 @@ public class HiveClientConfig private boolean parquetPushdownFilterEnabled; private boolean adaptiveFilterReorderingEnabled = true; private Duration fileStatusCacheExpireAfterWrite = new Duration(0, TimeUnit.SECONDS); - private long fileStatusCacheMaxSize; + private DataSize fileStatusCacheMaxRetainedSize = new DataSize(0, KILOBYTE); private List fileStatusCacheTables = ImmutableList.of(); private DataSize pageFileStripeMaxSize = new DataSize(24, MEGABYTE); @@ -857,15 +858,15 @@ public HiveClientConfig setFileStatusCacheTables(String fileStatusCacheTables) return this; } - public long getFileStatusCacheMaxSize() + public DataSize getFileStatusCacheMaxRetainedSize() { - return fileStatusCacheMaxSize; + return fileStatusCacheMaxRetainedSize; } - @Config("hive.file-status-cache-size") - public HiveClientConfig setFileStatusCacheMaxSize(long fileStatusCacheMaxSize) + @Config("hive.file-status-cache.max-retained-size") + public HiveClientConfig setFileStatusCacheMaxRetainedSize(DataSize fileStatusCacheMaxRetainedSize) { - this.fileStatusCacheMaxSize = fileStatusCacheMaxSize; + this.fileStatusCacheMaxRetainedSize = fileStatusCacheMaxRetainedSize; return this; } diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/TestBackgroundHiveSplitLoader.java b/presto-hive/src/test/java/com/facebook/presto/hive/TestBackgroundHiveSplitLoader.java index ed09968f7e881..a8299615211f4 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/TestBackgroundHiveSplitLoader.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/TestBackgroundHiveSplitLoader.java @@ -86,6 +86,7 @@ import static com.google.common.util.concurrent.MoreExecutors.directExecutor; import static io.airlift.slice.Slices.utf8Slice; import static io.airlift.units.DataSize.Unit.GIGABYTE; +import static io.airlift.units.DataSize.Unit.KILOBYTE; import static io.airlift.units.DataSize.Unit.MEGABYTE; import static java.util.concurrent.Executors.newCachedThreadPool; import static org.testng.Assert.assertEquals; @@ -284,21 +285,21 @@ public void testCachingDirectoryLister() new CachingDirectoryLister( new HadoopDirectoryLister(), new Duration(5, TimeUnit.MINUTES), - 1000, + new DataSize(100, KILOBYTE), ImmutableList.of("test_dbname.test_table")), "test_dbname.test_table"); testCachingDirectoryLister( new CachingDirectoryLister( new HadoopDirectoryLister(), new Duration(5, TimeUnit.MINUTES), - 1000, + new DataSize(100, KILOBYTE), ImmutableList.of("*")), "*"); testCachingDirectoryLister( new CachingDirectoryLister( new HadoopDirectoryLister(), new Duration(5, TimeUnit.MINUTES), - 1000, + new DataSize(100, KILOBYTE), ImmutableList.of("*")), ""); assertThrows( @@ -307,7 +308,7 @@ public void testCachingDirectoryLister() new CachingDirectoryLister( new HadoopDirectoryLister(), new Duration(5, TimeUnit.MINUTES), - 1000, + new DataSize(100, KILOBYTE), ImmutableList.of("*", "test_dbname.test_table")), "*,test_dbname.test_table")); } diff --git a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java index 831cf8907969a..376884f805b7d 100644 --- a/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java +++ b/presto-hive/src/test/java/com/facebook/presto/hive/TestHiveClientConfig.java @@ -38,6 +38,7 @@ import static com.facebook.presto.hive.HiveStorageFormat.ORC; import static com.facebook.presto.hive.TestHiveUtil.nonDefaultTimeZone; import static io.airlift.units.DataSize.Unit.BYTE; +import static io.airlift.units.DataSize.Unit.KILOBYTE; import static io.airlift.units.DataSize.Unit.MEGABYTE; public class TestHiveClientConfig @@ -123,7 +124,7 @@ public void testDefaults() .setParquetPushdownFilterEnabled(false) .setAdaptiveFilterReorderingEnabled(true) .setFileStatusCacheExpireAfterWrite(new Duration(0, TimeUnit.SECONDS)) - .setFileStatusCacheMaxSize(0) + .setFileStatusCacheMaxRetainedSize(new DataSize(0, KILOBYTE)) .setFileStatusCacheTables("") .setPageFileStripeMaxSize(new DataSize(24, Unit.MEGABYTE)) .setBucketFunctionTypeForExchange(HIVE_COMPATIBLE) @@ -251,7 +252,7 @@ public void testExplicitPropertyMappings() .put("hive.parquet.pushdown-filter-enabled", "true") .put("hive.adaptive-filter-reordering-enabled", "false") .put("hive.file-status-cache-tables", "foo.bar1, foo.bar2") - .put("hive.file-status-cache-size", "1000") + .put("hive.file-status-cache.max-retained-size", "500MB") .put("hive.file-status-cache-expire-time", "30m") .put("hive.pagefile.writer.stripe-max-size", "1kB") .put("hive.bucket-function-type-for-exchange", "PRESTO_NATIVE") @@ -375,7 +376,7 @@ public void testExplicitPropertyMappings() .setParquetPushdownFilterEnabled(true) .setAdaptiveFilterReorderingEnabled(false) .setFileStatusCacheTables("foo.bar1,foo.bar2") - .setFileStatusCacheMaxSize(1000) + .setFileStatusCacheMaxRetainedSize((new DataSize(500, MEGABYTE))) .setFileStatusCacheExpireAfterWrite(new Duration(30, TimeUnit.MINUTES)) .setPageFileStripeMaxSize(new DataSize(1, Unit.KILOBYTE)) .setBucketFunctionTypeForExchange(PRESTO_NATIVE)