From 6413d50b1cce7f53be219a3ae24dc9e7af79a69d Mon Sep 17 00:00:00 2001 From: Zac Blanco Date: Thu, 28 Mar 2024 21:27:31 -0700 Subject: [PATCH] [Iceberg] Add histogram statistic support Utilizes the sketch_kll function to generate histograms and store them into the Iceberg table's puffin files for table-level statistic storage. Histograms are always collected by ANALYZE, but they are not used by the cost calculator unless enabled via optimizer.use-histograms --- .../com/facebook/presto/common/Utils.java | 196 ++++++++++++++- .../presto/common/predicate/Marker.java | 5 + .../presto/common/predicate/Range.java | 10 + .../common/predicate/SortedRangeSet.java | 22 ++ .../common/predicate/TestSortedRangeSet.java | 62 +++++ presto-iceberg/pom.xml | 15 +- .../iceberg/IcebergAbstractMetadata.java | 2 +- .../presto/iceberg/IcebergConfig.java | 17 ++ .../iceberg/IcebergSessionProperties.java | 10 + .../presto/iceberg/TableStatisticsMaker.java | 104 ++++++-- .../iceberg/statistics/KllHistogram.java | 219 +++++++++++++++++ .../iceberg/IcebergDistributedTestBase.java | 223 ++++++++++++++++- .../presto/iceberg/TestIcebergConfig.java | 7 +- .../iceberg/statistics/TestKllHistogram.java | 225 ++++++++++++++++++ .../cost/ComparisonStatsCalculator.java | 7 +- ...ConnectorFilterStatsCalculatorService.java | 5 + .../facebook/presto/cost/JoinStatsRule.java | 6 +- .../cost/PlanNodeStatsEstimateMath.java | 9 +- .../facebook/presto/cost/StatisticRange.java | 28 +-- .../presto/sql/rewrite/ShowStatsRewrite.java | 5 + .../presto/cost/TestHistogramCalculator.java | 100 -------- .../cost/TestPlanNodeStatsEstimateMath.java | 18 +- .../cost/TestVariableStatsEstimate.java | 4 +- ...ApproximateStatsOutputRowCountMatcher.java | 54 +++++ .../planner/assertions/PlanMatchPattern.java | 6 + presto-spi/pom.xml | 6 + .../spi/statistics/ColumnStatistics.java | 7 + .../DisjointRangeDomainHistogram.java | 134 ++++++----- .../spi/statistics}/HistogramCalculator.java | 65 +++-- .../UniformDistributionHistogram.java | 11 +- .../TestDisjointRangeDomainHistogram.java | 84 ++++--- .../presto/spi/statistics}/TestHistogram.java | 3 +- .../statistics/TestHistogramCalculator.java | 101 ++++++++ .../spi/statistics}/TestUniformHistogram.java | 7 +- 34 files changed, 1465 insertions(+), 312 deletions(-) create mode 100644 presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/KllHistogram.java create mode 100644 presto-iceberg/src/test/java/com/facebook/presto/iceberg/statistics/TestKllHistogram.java delete mode 100644 presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java create mode 100644 presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/ApproximateStatsOutputRowCountMatcher.java rename {presto-main/src/main/java/com/facebook/presto/cost => presto-spi/src/main/java/com/facebook/presto/spi/statistics}/DisjointRangeDomainHistogram.java (74%) rename {presto-main/src/main/java/com/facebook/presto/cost => presto-spi/src/main/java/com/facebook/presto/spi/statistics}/HistogramCalculator.java (69%) rename {presto-main/src/main/java/com/facebook/presto/cost => presto-spi/src/main/java/com/facebook/presto/spi/statistics}/UniformDistributionHistogram.java (90%) rename {presto-main/src/test/java/com/facebook/presto/cost => presto-spi/src/test/java/com/facebook/presto/spi/statistics}/TestDisjointRangeDomainHistogram.java (81%) rename {presto-main/src/test/java/com/facebook/presto/cost => presto-spi/src/test/java/com/facebook/presto/spi/statistics}/TestHistogram.java (97%) create mode 100644 presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogramCalculator.java rename {presto-main/src/test/java/com/facebook/presto/cost => presto-spi/src/test/java/com/facebook/presto/spi/statistics}/TestUniformHistogram.java (93%) diff --git a/presto-common/src/main/java/com/facebook/presto/common/Utils.java b/presto-common/src/main/java/com/facebook/presto/common/Utils.java index 447f1dd87e79a..5e1e18185bd72 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/Utils.java +++ b/presto-common/src/main/java/com/facebook/presto/common/Utils.java @@ -18,8 +18,15 @@ import com.facebook.presto.common.predicate.Primitives; import com.facebook.presto.common.type.Type; +import javax.annotation.Nullable; + +import java.util.Arrays; +import java.util.function.Supplier; + import static com.facebook.presto.common.type.TypeUtils.readNativeValue; import static com.facebook.presto.common.type.TypeUtils.writeNativeValue; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; public final class Utils { @@ -30,7 +37,7 @@ private Utils() public static Block nativeValueToBlock(Type type, Object object) { if (object != null && !Primitives.wrap(type.getJavaType()).isInstance(object)) { - throw new IllegalArgumentException(String.format("Object '%s' does not match type %s", object, type.getJavaType())); + throw new IllegalArgumentException(format("Object '%s' does not match type %s", object, type.getJavaType())); } BlockBuilder blockBuilder = type.createBlockBuilder(null, 1); writeNativeValue(type, blockBuilder, object); @@ -48,4 +55,191 @@ public static void checkArgument(boolean expression) throw new IllegalArgumentException(); } } + + public static void checkArgument(boolean expression, String message, Object... args) + { + if (!expression) { + throw new IllegalArgumentException(format(message, args)); + } + } + + /** + * Returns a supplier which caches the instance retrieved during the first call to {@code get()} + * and returns that value on subsequent calls to {@code get()}. + */ + public static Supplier memoizedSupplier(Supplier delegate) + { + if (delegate instanceof MemoizingSupplier) { + return delegate; + } + return new MemoizingSupplier<>(delegate); + } + + static class MemoizingSupplier + implements Supplier + { + volatile Supplier delegate; + volatile boolean initialized; + // "value" does not need to be volatile; visibility piggy-backs + // on volatile read of "initialized". + @Nullable T value; + + MemoizingSupplier(Supplier delegate) + { + this.delegate = requireNonNull(delegate); + } + + @Override + public T get() + { + // A 2-field variant of Double Checked Locking. + if (!initialized) { + synchronized (this) { + if (!initialized) { + T t = delegate.get(); + value = t; + initialized = true; + // Release the delegate to GC. + delegate = null; + return t; + } + } + } + return value; + } + + @Override + public String toString() + { + Supplier delegate = this.delegate; + return "Suppliers.memoize(" + + (delegate == null ? "" : delegate) + + ")"; + } + } + + public static ToStringHelper toStringHelper(Object self) + { + return new ToStringHelper(self.getClass().getSimpleName()); + } + + public static ToStringHelper toStringHelper(String className) + { + return new ToStringHelper(className); + } + + public static final class ToStringHelper + { + private final String className; + private final ValueHolder holderHead = new ValueHolder(); + private ValueHolder holderTail = holderHead; + private boolean omitNullValues; + + private ToStringHelper(String className) + { + this.className = requireNonNull(className); + } + + public ToStringHelper omitNullValues() + { + omitNullValues = true; + return this; + } + + public ToStringHelper add(String name, @Nullable Object value) + { + return addHolder(name, value); + } + + public ToStringHelper add(String name, boolean value) + { + return addHolder(name, String.valueOf(value)); + } + + public ToStringHelper add(String name, char value) + { + return addHolder(name, String.valueOf(value)); + } + + public ToStringHelper add(String name, double value) + { + return addHolder(name, String.valueOf(value)); + } + + public ToStringHelper add(String name, float value) + { + return addHolder(name, String.valueOf(value)); + } + + public ToStringHelper add(String name, int value) + { + return addHolder(name, String.valueOf(value)); + } + + public ToStringHelper add(String name, long value) + { + return addHolder(name, String.valueOf(value)); + } + + @Override + public String toString() + { + // create a copy to keep it consistent in case value changes + boolean omitNullValuesSnapshot = omitNullValues; + String nextSeparator = ""; + StringBuilder builder = new StringBuilder(32).append(className).append('{'); + for (ValueHolder valueHolder = holderHead.next; + valueHolder != null; + valueHolder = valueHolder.next) { + Object value = valueHolder.value; + if (!omitNullValuesSnapshot || value != null) { + builder.append(nextSeparator); + nextSeparator = ", "; + + if (valueHolder.name != null) { + builder.append(valueHolder.name).append('='); + } + if (value != null && value.getClass().isArray()) { + Object[] objectArray = {value}; + String arrayString = Arrays.deepToString(objectArray); + builder.append(arrayString, 1, arrayString.length() - 1); + } + else { + builder.append(value); + } + } + } + return builder.append('}').toString(); + } + + private ValueHolder addHolder() + { + ValueHolder valueHolder = new ValueHolder(); + holderTail.next = valueHolder; + holderTail = valueHolder; + return valueHolder; + } + + private ToStringHelper addHolder(@Nullable Object value) + { + ValueHolder valueHolder = addHolder(); + valueHolder.value = value; + return this; + } + + private ToStringHelper addHolder(String name, @Nullable Object value) + { + ValueHolder valueHolder = addHolder(); + valueHolder.value = value; + valueHolder.name = requireNonNull(name); + return this; + } + + private static final class ValueHolder + { + @Nullable String name; + @Nullable Object value; + @Nullable ValueHolder next; + } + } } diff --git a/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java b/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java index f20a87065bcfa..76a58d147a281 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java +++ b/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java @@ -129,6 +129,11 @@ public Object getValue() return Utils.blockToNativeValue(type, valueBlock.get()); } + public Optional getObjectValue() + { + return valueBlock.map(block -> Utils.blockToNativeValue(type, block)); + } + public Object getPrintableValue(SqlFunctionProperties properties) { if (!valueBlock.isPresent()) { diff --git a/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java b/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java index d00a3c77df0e2..501996cc6aa82 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java +++ b/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java @@ -247,6 +247,16 @@ public boolean equals(Object obj) Objects.equals(this.high, other.high); } + @Override + public String toString() + { + return (low.getBound() == Marker.Bound.EXACTLY ? "[" : "(") + + low.getObjectValue().orElse(Double.NEGATIVE_INFINITY) + + ".." + + high.getObjectValue().orElse(Double.POSITIVE_INFINITY) + + (high.getBound() == Marker.Bound.EXACTLY ? "]" : ")"); + } + private void appendQuotedValue(StringBuilder buffer, Marker marker, SqlFunctionProperties properties) { buffer.append('"'); diff --git a/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java b/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java index 5f1988be005d1..4af54a8e2a685 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java +++ b/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java @@ -168,6 +168,28 @@ public Object getSingleValue() return lowIndexedRanges.values().iterator().next().getSingleValue(); } + /** + * Build a new {@link SortedRangeSet} that contains ranges which lie within the argument range + * + * @param span the range which the new set should span + * @return a new range set + */ + public SortedRangeSet subRangeSet(Range span) + { + Builder builder = new Builder(type); + + for (Range range : getOrderedRanges()) { + if (span.contains(range)) { + builder.add(range); + } + else if (span.overlaps(range)) { + builder.add(range.intersect(span)); + } + } + + return builder.build(); + } + @Override public boolean containsValue(Object value) { diff --git a/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java b/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java index f754681359a5a..087073c432bd5 100644 --- a/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java +++ b/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java @@ -26,6 +26,9 @@ import com.google.common.collect.Iterables; import org.testng.annotations.Test; +import java.util.Arrays; +import java.util.stream.Collectors; + import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.common.type.BooleanType.BOOLEAN; import static com.facebook.presto.common.type.DoubleType.DOUBLE; @@ -500,6 +503,65 @@ public void testCanonicalize() assertDifferentSet(SortedRangeSet.all(BIGINT), SortedRangeSet.all(BOOLEAN), true); } + @Test + public void testSubRangeSet() + { + // test subrange no overlap below and above + assertEquals(SortedRangeSet.of(Range.lessThan(BIGINT, 10L)) + .subRangeSet(Range.greaterThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + assertEquals(SortedRangeSet.of(Range.greaterThan(BIGINT, 10L)) + .subRangeSet(Range.lessThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + assertEquals(SortedRangeSet.of(Range.greaterThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.lessThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + assertEquals(SortedRangeSet.of(Range.lessThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.greaterThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + + // test with equal bounds + assertEquals(SortedRangeSet.of(Range.lessThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.greaterThanOrEqual(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 1); + assertEquals(SortedRangeSet.of(Range.greaterThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.lessThanOrEqual(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 1); + assertEquals(SortedRangeSet.of(Range.lessThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.greaterThanOrEqual(BIGINT, 10L)) + .getOrderedRanges().get(0), Range.range(BIGINT, 10L, true, 10L, true)); + // two ranges + assertEquals(SortedRangeSet.of(Range.lessThan(BIGINT, -10L), Range.greaterThan(BIGINT, 10L)) + .subRangeSet(Range.range(BIGINT, -20L, true, 20L, true)).getOrderedRanges(), + Arrays.stream(new Range[] { + Range.range(BIGINT, -20L, true, -10L, false), + Range.range(BIGINT, 10L, false, 20L, true)}) + .collect(Collectors.toList())); + // range entirely contained + assertEquals(SortedRangeSet.of( + Range.lessThan(BIGINT, -10L), + Range.greaterThan(BIGINT, 10L), + Range.range(BIGINT, -5L, true, 5L, true)) + .subRangeSet(Range.range(BIGINT, -20L, true, 20L, true)).getOrderedRanges(), + Arrays.stream(new Range[] { + Range.range(BIGINT, -20L, true, -10L, false), + Range.range(BIGINT, -5L, true, 5L, true), + Range.range(BIGINT, 10L, false, 20L, true)}) + .collect(Collectors.toList())); + } + private void assertSameSet(SortedRangeSet set1, SortedRangeSet set2, boolean removeSafeConstants) throws Exception { diff --git a/presto-iceberg/pom.xml b/presto-iceberg/pom.xml index f8d65d59164df..239a5c65b0e93 100644 --- a/presto-iceberg/pom.xml +++ b/presto-iceberg/pom.xml @@ -510,19 +510,16 @@ presto-cache compile - com.facebook.presto presto-main test - - com.facebook.presto - presto-parser - test + com.facebook.presto + presto-parser + test - com.facebook.presto presto-analyzer @@ -597,7 +594,7 @@ org.apache.iceberg iceberg-core - 1.5.0 + ${dep.iceberg.version} tests test @@ -627,6 +624,10 @@ + + org.apache.commons + commons-math3 + diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java index 8c4b3e1182f5c..a0c348a422b5e 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java @@ -606,7 +606,7 @@ public TableStatisticsMetadata getStatisticsCollectionMetadata(ConnectorSession MetricsConfig metricsConfig = MetricsConfig.forTable(table); Set columnStatistics = tableMetadata.getColumns().stream() .filter(column -> !column.isHidden() && metricsConfig.columnMode(column.getName()) != None.get()) - .flatMap(meta -> getSupportedColumnStatistics(meta.getName(), meta.getType()).stream()) + .flatMap(meta -> getSupportedColumnStatistics(session, meta.getName(), meta.getType()).stream()) .collect(toImmutableSet()); Set tableStatistics = ImmutableSet.of(ROW_COUNT); diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java index 74464f938f516..eb2c0871c80b2 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java @@ -23,6 +23,7 @@ import javax.validation.constraints.DecimalMax; import javax.validation.constraints.DecimalMin; +import javax.validation.constraints.Max; import javax.validation.constraints.Min; import javax.validation.constraints.NotNull; @@ -59,6 +60,7 @@ public class IcebergConfig private int metadataPreviousVersionsMax = METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT; private boolean metadataDeleteAfterCommit = METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT; private int metricsMaxInferredColumn = METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT; + private int statisticsKllSketchKParameter = 1024; private EnumSet hiveStatisticsMergeFlags = EnumSet.noneOf(ColumnStatisticType.class); private String fileIOImpl = HadoopFileIO.class.getName(); @@ -395,4 +397,19 @@ public IcebergConfig setMetricsMaxInferredColumn(int metricsMaxInferredColumn) this.metricsMaxInferredColumn = metricsMaxInferredColumn; return this; } + + public int getStatisticsKllSketchKParameter() + { + return this.statisticsKllSketchKParameter; + } + + @Config("iceberg.statistics-kll-sketch-k-parameter") + @Min(8) + @Max(65535) + @ConfigDescription("K parameter for KLL sketches when generating histogram statistics") + public IcebergConfig setStatisticsKllSketchKParameter(int kllSketchKParameter) + { + this.statisticsKllSketchKParameter = kllSketchKParameter; + return this; + } } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java index 5a597d97051b4..57f954801f2f7 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java @@ -64,6 +64,7 @@ public final class IcebergSessionProperties public static final String HIVE_METASTORE_STATISTICS_MERGE_STRATEGY = "hive_statistics_merge_strategy"; public static final String STATISTIC_SNAPSHOT_RECORD_DIFFERENCE_WEIGHT = "statistic_snapshot_record_difference_weight"; public static final String ROWS_FOR_METADATA_OPTIMIZATION_THRESHOLD = "rows_for_metadata_optimization_threshold"; + public static final String STATISTICS_KLL_SKETCH_K_PARAMETER = "statistics_kll_sketch_k_parameter"; private final List> sessionProperties; @@ -184,6 +185,10 @@ public IcebergSessionProperties( "of an Iceberg table exceeds this threshold, metadata optimization would be skipped for " + "the table. A value of 0 means skip metadata optimization directly.", icebergConfig.getRowsForMetadataOptimizationThreshold(), + false)) + .add(integerProperty(STATISTICS_KLL_SKETCH_K_PARAMETER, + "The K parameter for the Apache DataSketches KLL sketch when computing histogram statistics", + icebergConfig.getStatisticsKllSketchKParameter(), false)); nessieConfig.ifPresent((config) -> propertiesBuilder @@ -313,4 +318,9 @@ public static String getNessieReferenceHash(ConnectorSession session) { return session.getProperty(NESSIE_REFERENCE_HASH, String.class); } + + public static int getStatisticsKllSketchKParameter(ConnectorSession session) + { + return session.getProperty(STATISTICS_KLL_SKETCH_K_PARAMETER, Integer.class); + } } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java index 9da3de696f620..6457353e3bded 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java @@ -15,10 +15,14 @@ import com.facebook.airlift.log.Logger; import com.facebook.presto.common.block.Block; +import com.facebook.presto.common.predicate.Range; import com.facebook.presto.common.predicate.TupleDomain; +import com.facebook.presto.common.type.DecimalType; import com.facebook.presto.common.type.FixedWidthType; +import com.facebook.presto.common.type.KllSketchType; import com.facebook.presto.common.type.TypeManager; import com.facebook.presto.hive.NodeVersion; +import com.facebook.presto.iceberg.statistics.KllHistogram; import com.facebook.presto.spi.ConnectorSession; import com.facebook.presto.spi.Constraint; import com.facebook.presto.spi.PrestoException; @@ -26,11 +30,14 @@ import com.facebook.presto.spi.statistics.ColumnStatisticType; import com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.ComputedStatistics; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.theta.CompactSketch; import org.apache.iceberg.ContentFile; @@ -58,6 +65,8 @@ import org.apache.iceberg.types.Types; import org.apache.iceberg.util.Pair; +import javax.annotation.Nullable; + import java.io.IOException; import java.io.UncheckedIOException; import java.nio.ByteBuffer; @@ -76,6 +85,8 @@ import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.common.type.DateType.DATE; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.common.type.TimeType.TIME; import static com.facebook.presto.common.type.TimestampType.TIMESTAMP; import static com.facebook.presto.common.type.TimestampWithTimeZoneType.TIMESTAMP_WITH_TIME_ZONE; import static com.facebook.presto.common.type.TypeUtils.isNumericType; @@ -85,15 +96,20 @@ import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR; import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; import static com.facebook.presto.iceberg.IcebergSessionProperties.getStatisticSnapshotRecordDifferenceWeight; +import static com.facebook.presto.iceberg.IcebergSessionProperties.getStatisticsKllSketchKParameter; import static com.facebook.presto.iceberg.IcebergUtil.getIdentityPartitions; import static com.facebook.presto.iceberg.Partition.toMap; +import static com.facebook.presto.iceberg.TypeConverter.toPrestoType; +import static com.facebook.presto.iceberg.statistics.KllHistogram.isKllHistogramSupportedType; import static com.facebook.presto.iceberg.util.StatisticsUtil.calculateAndSetTableSize; import static com.facebook.presto.iceberg.util.StatisticsUtil.formatIdentifier; +import static com.facebook.presto.spi.statistics.ColumnStatisticType.HISTOGRAM; import static com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; import static com.facebook.presto.spi.statistics.SourceInfo.ConfidenceLevel.HIGH; import static com.google.common.collect.ImmutableList.toImmutableList; import static com.google.common.collect.Iterables.getOnlyElement; +import static com.google.common.collect.Iterators.getOnlyElement; import static java.lang.Long.parseLong; import static java.lang.Math.abs; import static java.lang.String.format; @@ -106,6 +122,7 @@ public class TableStatisticsMaker private static final Logger log = Logger.get(TableStatisticsMaker.class); private static final String ICEBERG_THETA_SKETCH_BLOB_TYPE_ID = "apache-datasketches-theta-v1"; private static final String ICEBERG_DATA_SIZE_BLOB_TYPE_ID = "presto-sum-data-size-bytes-v1"; + private static final String ICEBERG_KLL_SKETCH_BLOB_TYPE_ID = "presto-kll-sketch-bytes-v1"; private static final String ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY = "ndv"; private static final String ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY = "data_size"; private final Table icebergTable; @@ -122,11 +139,13 @@ private TableStatisticsMaker(Table icebergTable, ConnectorSession session, TypeM private static final Map puffinStatWriters = ImmutableMap.builder() .put(NUMBER_OF_DISTINCT_VALUES, TableStatisticsMaker::generateNDVBlob) .put(TOTAL_SIZE_IN_BYTES, TableStatisticsMaker::generateStatSizeBlob) + .put(HISTOGRAM, TableStatisticsMaker::generateKllSketchBlob) .build(); private static final Map puffinStatReaders = ImmutableMap.builder() .put(ICEBERG_THETA_SKETCH_BLOB_TYPE_ID, TableStatisticsMaker::readNDVBlob) .put(ICEBERG_DATA_SIZE_BLOB_TYPE_ID, TableStatisticsMaker::readDataSizeBlob) + .put(ICEBERG_KLL_SKETCH_BLOB_TYPE_ID, TableStatisticsMaker::readKllSketchBlob) .build(); public static TableStatistics getTableStatistics(ConnectorSession session, TypeManager typeManager, Optional> currentPredicate, Constraint constraint, IcebergTableHandle tableHandle, Table icebergTable, List columns) @@ -209,7 +228,23 @@ private TableStatistics makeTableStatistics(IcebergTableHandle tableHandle, Opti Object min = summary.getMinValues().get(fieldId); Object max = summary.getMaxValues().get(fieldId); if (min instanceof Number && max instanceof Number) { - columnBuilder.setRange(Optional.of(new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue()))); + DoubleRange range = new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue()); + columnBuilder.setRange(Optional.of(range)); + + // the histogram is generated by scanning the entire dataset. It is possible that + // the constraint prevents scanning portions of the table. Given that we know the + // range that the scan provides for a particular column, bound the histogram to the + // scanned range. + + // timestamp and time values in iceberg are stored in microseconds. Presto processes them + // in milliseconds by default, so our range for the sketch must be adjusted + if (columnHandle.getType().equals(TIMESTAMP) || columnHandle.getType().equals(TIME)) { + range = new DoubleRange(range.getMin() / 1000, range.getMax() / 1000); + } + final DoubleRange histRange = range; + columnBuilder.setHistogram(columnBuilder.getHistogram() + .map(histogram -> DisjointRangeDomainHistogram + .addConjunction(histogram, Range.range(DOUBLE, histRange.getMin(), true, histRange.getMax(), true)))); } result.setColumnStatistics(columnHandle, columnBuilder.build()); } @@ -309,9 +344,8 @@ private void writeTableStatistics(NodeVersion nodeVersion, IcebergTableHandle ta .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)) .forEach((key, value) -> { Optional.ofNullable(puffinStatWriters.get(key.getStatisticType())) - .ifPresent(generator -> { - writer.add(generator.generate(key, value, icebergTable, snapshot)); - }); + .flatMap(generator -> Optional.ofNullable(generator.generate(key, value, icebergTable, snapshot, typeManager))) + .ifPresent(writer::add); }); writer.finish(); icebergTable.updateStatistics().setStatistics( @@ -336,7 +370,8 @@ private void writeTableStatistics(NodeVersion nodeVersion, IcebergTableHandle ta @FunctionalInterface private interface PuffinBlobGenerator { - Blob generate(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot); + @Nullable + Blob generate(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager); } @FunctionalInterface @@ -345,12 +380,12 @@ private interface PuffinBlobReader /** * Reads the stats from the blob and then updates the stats builder argument. */ - void read(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder stats); + void read(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder stats, Table icebergTable, TypeManager typeManager); } - private static Blob generateNDVBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot) + private static Blob generateNDVBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager) { - int id = getFieldId(metadata, icebergTable); + int id = getField(metadata, icebergTable, snapshot).fieldId(); ByteBuffer raw = VARBINARY.getSlice(value, 0).toByteBuffer(); CompactSketch sketch = CompactSketch.wrap(Memory.wrap(raw, ByteOrder.nativeOrder())); return new Blob( @@ -363,9 +398,9 @@ private static Blob generateNDVBlob(ColumnStatisticMetadata metadata, Block valu ImmutableMap.of(ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY, Long.toString((long) sketch.getEstimate()))); } - private static Blob generateStatSizeBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot) + private static Blob generateStatSizeBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager) { - int id = getFieldId(metadata, icebergTable); + int id = getField(metadata, icebergTable, snapshot).fieldId(); long size = BIGINT.getLong(value, 0); return new Blob( ICEBERG_DATA_SIZE_BLOB_TYPE_ID, @@ -377,7 +412,26 @@ private static Blob generateStatSizeBlob(ColumnStatisticMetadata metadata, Block ImmutableMap.of(ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY, Long.toString(size))); } - private static void readNDVBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics) + private static Blob generateKllSketchBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager) + { + Types.NestedField field = getField(metadata, icebergTable, snapshot); + KllSketchType sketchType = new KllSketchType(toPrestoType(field.type(), typeManager)); + Slice sketchSlice = sketchType.getSlice(value, 0); + if (value.isNull(0)) { + // this can occur when all inputs to the sketch are null + return null; + } + return new Blob( + ICEBERG_KLL_SKETCH_BLOB_TYPE_ID, + ImmutableList.of(field.fieldId()), + snapshot.snapshotId(), + snapshot.sequenceNumber(), + sketchSlice.toByteBuffer(), + null, + ImmutableMap.of()); + } + + private static void readNDVBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics, Table icebergTable, TypeManager typeManager) { Optional.ofNullable(metadata.properties().get(ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY)) .ifPresent(ndvProp -> { @@ -392,7 +446,7 @@ private static void readNDVBlob(BlobMetadata metadata, ByteBuffer blob, ColumnSt }); } - private static void readDataSizeBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics) + private static void readDataSizeBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics, Table icebergTable, TypeManager typeManager) { Optional.ofNullable(metadata.properties().get(ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY)) .ifPresent(sizeProp -> { @@ -407,9 +461,17 @@ private static void readDataSizeBlob(BlobMetadata metadata, ByteBuffer blob, Col }); } - private static int getFieldId(ColumnStatisticMetadata metadata, Table icebergTable) + private static void readKllSketchBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics, Table icebergTable, TypeManager typeManager) + { + statistics.setHistogram(Optional.ofNullable(icebergTable.schemas().get(icebergTable.snapshot(metadata.snapshotId()).schemaId())) + .map(schema -> toPrestoType(schema.findType(getOnlyElement(metadata.inputFields().iterator())), typeManager)) + .map(prestoType -> new KllHistogram(Slices.wrappedBuffer(blob), prestoType))); + } + + private static Types.NestedField getField(ColumnStatisticMetadata metadata, Table icebergTable, Snapshot snapshot) { - return Optional.ofNullable(icebergTable.schema().findField(metadata.getColumnName())).map(Types.NestedField::fieldId) + return Optional.ofNullable(icebergTable.schemas().get(snapshot.schemaId())) + .map(schema -> schema.findField(metadata.getColumnName())) .orElseThrow(() -> { log.warn("failed to find column name %s in schema of table %s", metadata.getColumnName(), icebergTable.name()); return new PrestoException(ICEBERG_INVALID_METADATA, format("failed to find column name %s in schema of table %s", metadata.getColumnName(), icebergTable.name())); @@ -513,7 +575,7 @@ private Map loadStatisticsFile(StatisticsFile if (value == null) { value = ColumnStatistics.builder(); } - statReader.read(metadata, blob, value); + statReader.read(metadata, blob, value, icebergTable, typeManager); return value; }); }); @@ -527,7 +589,7 @@ private Map loadStatisticsFile(StatisticsFile return ImmutableMap.copyOf(result); } - public static List getSupportedColumnStatistics(String columnName, com.facebook.presto.common.type.Type type) + public static List getSupportedColumnStatistics(ConnectorSession session, String columnName, com.facebook.presto.common.type.Type type) { ImmutableList.Builder supportedStatistics = ImmutableList.builder(); // all types which support being passed to the sketch_theta function @@ -538,6 +600,16 @@ public static List getSupportedColumnStatistics(String columnName, format("RETURN sketch_theta(%s)", formatIdentifier(columnName)), ImmutableList.of(columnName))); } + if (isKllHistogramSupportedType(type)) { + String histogramFunctionFmt = "RETURN sketch_kll_with_k(%s, CAST(%s as bigint))"; + if (type instanceof DecimalType) { + histogramFunctionFmt = "RETURN sketch_kll_with_k(CAST(%s as double), CAST(%s as bigint))"; + } + supportedStatistics.add(HISTOGRAM.getColumnStatisticMetadataWithCustomFunction(columnName, + format(histogramFunctionFmt, formatIdentifier(columnName), getStatisticsKllSketchKParameter(session)), + ImmutableList.of(columnName))); + } + if (!(type instanceof FixedWidthType)) { supportedStatistics.add(TOTAL_SIZE_IN_BYTES.getColumnStatisticMetadata(columnName)); } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/KllHistogram.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/KllHistogram.java new file mode 100644 index 0000000000000..f9055d503e989 --- /dev/null +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/KllHistogram.java @@ -0,0 +1,219 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.iceberg.statistics; + +import com.facebook.presto.common.type.AbstractIntType; +import com.facebook.presto.common.type.AbstractLongType; +import com.facebook.presto.common.type.AbstractVarcharType; +import com.facebook.presto.common.type.DecimalType; +import com.facebook.presto.common.type.Decimals; +import com.facebook.presto.common.type.SqlDecimal; +import com.facebook.presto.common.type.Type; +import com.facebook.presto.spi.PrestoException; +import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.Estimate; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.annotations.VisibleForTesting; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.apache.datasketches.common.ArrayOfBooleansSerDe; +import org.apache.datasketches.common.ArrayOfDoublesSerDe; +import org.apache.datasketches.common.ArrayOfItemsSerDe; +import org.apache.datasketches.common.ArrayOfLongsSerDe; +import org.apache.datasketches.common.ArrayOfStringsSerDe; +import org.apache.datasketches.kll.KllItemsSketch; +import org.apache.datasketches.memory.Memory; + +import java.math.BigInteger; +import java.util.Comparator; +import java.util.function.Function; + +import static com.facebook.presto.common.type.BooleanType.BOOLEAN; +import static com.facebook.presto.common.type.Decimals.isLongDecimal; +import static com.facebook.presto.common.type.Decimals.isShortDecimal; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.common.type.RealType.REAL; +import static com.facebook.presto.common.type.SmallintType.SMALLINT; +import static com.facebook.presto.common.type.TinyintType.TINYINT; +import static com.facebook.presto.common.type.TypeUtils.isNumericType; +import static com.facebook.presto.spi.StandardErrorCode.INVALID_ARGUMENTS; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Verify.verify; +import static java.nio.ByteOrder.LITTLE_ENDIAN; +import static java.util.Objects.requireNonNull; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +public class KllHistogram + implements ConnectorHistogram +{ + // since the actual type parameter is only known at runtime, we can't concretely specify it + private final KllItemsSketch sketch; + private final Type type; + private final Function toDouble; + private final Function fromDouble; + + @SuppressWarnings({"unchecked", "rawtypes"}) + @JsonCreator + public KllHistogram(@JsonProperty("sketch") Slice bytes, @JsonProperty("type") Type type) + { + verify(isKllHistogramSupportedType(type), "histograms do not currently support type " + type.getDisplayName()); + this.type = requireNonNull(type, "type is null"); + SketchParameters parameters = getSketchParameters(type); + // the actual sketch can only accept the same object types which generated it + // however, the API can only accept or generate double types. We cast the inputs + // and results to/from double to satisfy the underlying sketch type. + if (parameters.getSerde().getClassOfT().equals(Double.class)) { + toDouble = x -> (double) x; + fromDouble = x -> x; + } + else if (parameters.getSerde().getClassOfT().equals(Long.class)) { + // dual cast to auto-box/unbox from Double/Long for sketch + toDouble = x -> (double) (long) x; + fromDouble = x -> (long) (double) x; + } + else { + throw new PrestoException(INVALID_ARGUMENTS, "can't create kll sketch from type: " + type); + } + sketch = KllItemsSketch.wrap(Memory.wrap(bytes.toByteBuffer(), LITTLE_ENDIAN), parameters.getComparator(), parameters.getSerde()); + } + + public static boolean isKllHistogramSupportedType(Type type) + { + try { + getSketchParameters(type); + return isNumericType(type) || type instanceof AbstractIntType || type instanceof AbstractLongType; + } + catch (PrestoException e) { + return false; + } + } + + @JsonProperty + public Slice getSketch() + { + return Slices.wrappedBuffer(sketch.toByteArray()); + } + + @JsonProperty + public Type getType() + { + return type; + } + + @VisibleForTesting + @SuppressWarnings("rawtypes") + public KllItemsSketch getKllSketch() + { + return sketch; + } + + @Override + public Estimate cumulativeProbability(double value, boolean inclusive) + { + return Estimate.of(sketch.getRank(fromDouble.apply(value), inclusive ? INCLUSIVE : EXCLUSIVE)); + } + + @Override + public Estimate inverseCumulativeProbability(double percentile) + { + return Estimate.of(toDouble.apply(sketch.getQuantile(percentile))); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("type", type) + .add("k", this.sketch.getK()) + .add("N", this.sketch.getN()) + .add("retained", this.sketch.getNumRetained()) + .add("min", this.sketch.getMinItem()) + .add("max", this.sketch.getMaxItem()) + .add("p50", sketch.getQuantile(0.5)) + .add("p75", sketch.getQuantile(0.75)) + .add("p90", sketch.getQuantile(0.90)) + .add("p99", sketch.getQuantile(0.99)) + .add("p99.9", sketch.getQuantile(0.999)) + .toString(); + } + + private static class SketchParameters + { + private final Comparator comparator; + private final ArrayOfItemsSerDe serde; + private final Function conversion; + + public SketchParameters(Comparator comparator, ArrayOfItemsSerDe serde, Function conversion) + { + this.comparator = comparator; + this.serde = serde; + this.conversion = conversion; + } + + public SketchParameters(Comparator comparator, ArrayOfItemsSerDe serde) + { + this(comparator, serde, Function.identity()); + } + + public Comparator getComparator() + { + return comparator; + } + + public ArrayOfItemsSerDe getSerde() + { + return serde; + } + + public Function getConversion() + { + return conversion; + } + } + + private static SketchParameters getSketchParameters(Type type) + { + if (type.equals(REAL)) { + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe(), + (Object intValue) -> (double) Float.intBitsToFloat(((Long) intValue).intValue())); + } + else if (isShortDecimal(type)) { + DecimalType decimalType = (DecimalType) type; + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe(), + (Object longValue) -> new SqlDecimal(BigInteger.valueOf((long) longValue), decimalType.getScale(), decimalType.getScale()).toBigDecimal().doubleValue()); + } + else if (isLongDecimal(type)) { + DecimalType decimalType = (DecimalType) type; + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe(), + (Object encodedDecimal) -> new SqlDecimal(Decimals.decodeUnscaledValue((Slice) encodedDecimal), decimalType.getScale(), decimalType.getPrecision()).toBigDecimal().doubleValue()); + } + else if (type.equals(DOUBLE)) { + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe()); + } + else if (type.equals(BOOLEAN)) { + return new SketchParameters<>(Boolean::compareTo, new ArrayOfBooleansSerDe()); + } + else if (type instanceof AbstractIntType || type instanceof AbstractLongType || type.equals(SMALLINT) || type.equals(TINYINT)) { + return new SketchParameters<>(Long::compareTo, new ArrayOfLongsSerDe()); + } + else if (type instanceof AbstractVarcharType) { + return new SketchParameters<>(String::compareTo, new ArrayOfStringsSerDe(), (Object slice) -> ((Slice) slice).toStringUtf8()); + } + else { + throw new PrestoException(INVALID_ARGUMENTS, "Unsupported type for KLL sketch: " + type); + } + } +} diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java index f2f2873921966..f53ba0b918402 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java @@ -21,6 +21,9 @@ import com.facebook.presto.common.transaction.TransactionId; import com.facebook.presto.common.type.FixedWidthType; import com.facebook.presto.common.type.TimeZoneKey; +import com.facebook.presto.common.type.Type; +import com.facebook.presto.common.type.TypeParameter; +import com.facebook.presto.hive.BaseHiveColumnHandle; import com.facebook.presto.hive.HdfsConfiguration; import com.facebook.presto.hive.HdfsConfigurationInitializer; import com.facebook.presto.hive.HdfsContext; @@ -38,6 +41,8 @@ import com.facebook.presto.spi.analyzer.MetadataResolver; import com.facebook.presto.spi.security.AllowAllAccessControl; import com.facebook.presto.spi.statistics.ColumnStatistics; +import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.testing.MaterializedResult; @@ -49,6 +54,7 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import org.apache.commons.math3.distribution.NormalDistribution; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -91,18 +97,26 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.UUID; import java.util.function.BiConsumer; +import java.util.function.Consumer; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; import static com.facebook.presto.SystemSessionProperties.LEGACY_TIMESTAMP; +import static com.facebook.presto.SystemSessionProperties.OPTIMIZER_USE_HISTOGRAMS; import static com.facebook.presto.common.type.BigintType.BIGINT; +import static com.facebook.presto.common.type.DateType.DATE; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static com.facebook.presto.common.type.IntegerType.INTEGER; +import static com.facebook.presto.common.type.RealType.REAL; +import static com.facebook.presto.common.type.TimeType.TIME; import static com.facebook.presto.common.type.TimeZoneKey.UTC_KEY; +import static com.facebook.presto.common.type.TimestampType.TIMESTAMP; import static com.facebook.presto.common.type.VarcharType.VARCHAR; import static com.facebook.presto.hive.BaseHiveColumnHandle.ColumnType.SYNTHESIZED; import static com.facebook.presto.hive.HiveCommonSessionProperties.PARQUET_BATCH_READ_OPTIMIZATION_ENABLED; @@ -125,11 +139,16 @@ import static com.facebook.presto.testing.TestingConnectorSession.SESSION; import static com.facebook.presto.testing.assertions.Assert.assertEquals; import static com.facebook.presto.tests.sql.TestTable.randomTableSuffix; +import static com.facebook.presto.type.DecimalParametricType.DECIMAL; +import static com.google.common.collect.ImmutableMap.toImmutableMap; import static java.lang.String.format; import static java.util.Objects.requireNonNull; +import static java.util.function.Function.identity; import static org.apache.iceberg.SnapshotSummary.TOTAL_DATA_FILES_PROP; import static org.apache.iceberg.SnapshotSummary.TOTAL_DELETE_FILES_PROP; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotEquals; +import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertTrue; @Test(singleThreaded = true) @@ -1419,6 +1438,57 @@ public void testMetadataDeleteOnUnPartitionedTableWithDeleteFiles() } } + @DataProvider(name = "validHistogramTypes") + public Object[][] validHistogramTypesDataProvider() + { + return new Object[][] { + // types not supported in Iceberg connector, but that histogram could support + // {TINYINT, new String[]{"1", "2", "10"}}, + // {SMALLINT, new String[]{"1", "2", "10"}}, + // {TIMESTAMP_WITH_TIME_ZONE, new String[]{"now() + interval '1' hour", "now() + interval '2' hour"}}, + // requires update to Kll sketch function because real are stored as ints but converted to float bits + {REAL, new String[] {"1.0", "2.0", "3.0"}}, + // iceberg stores microsecond precision but presto calculates on millisecond precision + // need a fix to properly convert for the optimizer. + {TIMESTAMP, new String[] {"localtimestamp + interval '1' hour", "localtimestamp + interval '2' hour"}}, + // supported types + {TIME, new String[] {"localtime", "localtime + interval '1' hour"}}, + {INTEGER, new String[] {"1", "5", "9"}}, + {BIGINT, new String[] {"2", "4", "6"}}, + {DOUBLE, new String[] {"1.0", "3.1", "4.6"}}, + {DECIMAL.createType(ImmutableList.of(TypeParameter.of(2L), TypeParameter.of(1L))), new String[] {"0.0", "3.0", "4.0"}}, + {DATE, new String[] {"date '2024-01-01'", "date '2024-03-30'", "date '2024-05-30'"}}, + }; + } + + /** + * Verifies that the histogram is returned after ANALYZE for a variety of types + */ + @Test(dataProvider = "validHistogramTypes") + public void testHistogramStorage(Type type, Object[] values) + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS create_histograms"); + assertQuerySucceeds(String.format("CREATE TABLE create_histograms (c %s)", type.getDisplayName())); + assertQuerySucceeds(String.format("INSERT INTO create_histograms VALUES %s", Joiner.on(", ").join(values))); + assertQuerySucceeds(session, "ANALYZE create_histograms"); + TableStatistics tableStatistics = getTableStats("create_histograms"); + Map nameToHandle = tableStatistics.getColumnStatistics().keySet() + .stream().map(IcebergColumnHandle.class::cast) + .collect(Collectors.toMap(BaseHiveColumnHandle::getName, identity())); + assertNotNull(nameToHandle.get("c")); + IcebergColumnHandle handle = nameToHandle.get("c"); + ColumnStatistics statistics = tableStatistics.getColumnStatistics().get(handle); + assertTrue(statistics.getHistogram().isPresent()); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS create_histograms"); + } + } + @Test public void testMetadataDeleteOnPartitionedTableWithDeleteFiles() { @@ -1830,10 +1900,10 @@ public void testFilterWithRemainingPredicate(boolean pushdownFilterEnabled) @Language("SQL") String query = "SELECT * FROM test_filterstats_remaining_predicate WHERE (i = 10 AND j = 11) OR (i = 20 AND j = 21)"; if (pushdownFilterEnabled) { assertPlan(session, query, - output( - exchange( - tableScan("test_filterstats_remaining_predicate") - .withOutputRowCount(1)))); + output( + exchange( + tableScan("test_filterstats_remaining_predicate") + .withOutputRowCount(1)))); } else { assertPlan(session, query, @@ -1845,6 +1915,151 @@ public void testFilterWithRemainingPredicate(boolean pushdownFilterEnabled) assertQuerySucceeds("DROP TABLE test_filterstats_remaining_predicate"); } + public void testAllNullHistogramColumn() + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_all_nulls"); + assertQuerySucceeds("CREATE TABLE histogram_all_nulls (c bigint)"); + TableStatistics stats = getTableStats("histogram_all_nulls", Optional.empty(), session); + assertFalse(stats.getColumnStatistics().values().stream().findFirst().isPresent()); + assertUpdate("INSERT INTO histogram_all_nulls VALUES NULL, NULL, NULL, NULL, NULL", 5); + stats = getTableStats("histogram_all_nulls", Optional.empty(), session); + assertFalse(stats.getColumnStatistics().values().stream().findFirst() + .get().getHistogram().isPresent()); + assertQuerySucceeds(session, "ANALYZE histogram_all_nulls"); + stats = getTableStats("histogram_all_nulls", Optional.empty(), session); + assertFalse(stats.getColumnStatistics().values().stream().findFirst() + .get().getHistogram().isPresent()); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_all_nulls"); + } + } + + @Test(dataProvider = "validHistogramTypes") + public void testHistogramShowStats(Type type, Object[] values) + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS create_histograms"); + assertQuerySucceeds(String.format("CREATE TABLE show_histograms (c %s)", type.getDisplayName())); + assertQuerySucceeds(String.format("INSERT INTO show_histograms VALUES %s", Joiner.on(", ").join(values))); + assertQuerySucceeds(session, "ANALYZE show_histograms"); + TableStatistics tableStatistics = getTableStats("show_histograms", Optional.empty(), session); + Map> histogramByColumnName = tableStatistics.getColumnStatistics() + .entrySet() + .stream() + .collect(toImmutableMap( + entry -> ((IcebergColumnHandle) entry.getKey()).getName(), + entry -> entry.getValue().getHistogram())); + MaterializedResult stats = getQueryRunner().execute("SHOW STATS for show_histograms"); + stats.getMaterializedRows() + .forEach(row -> { + String name = (String) row.getField(0); + String histogram = (String) row.getField(7); + assertEquals(Optional.ofNullable(histogramByColumnName.get(name)) + .flatMap(identity()) + .map(Objects::toString).orElse(null), + histogram); + }); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS show_histograms"); + } + } + + /** + * Verifies that when the users opts-in to using histograms that the + * optimizer estimates reflect the actual dataset for a variety of filter + * types (LTE, GT, EQ, NE) on a non-uniform data distribution + */ + @Test + public void testHistogramsUsedInOptimization() + { + Session histogramSession = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + // standard-normal distribution should have vastly different estimates than uniform at the tails (e.g. -3, +3) + NormalDistribution dist = new NormalDistribution(0, 1); + double[] values = dist.sample(1000); + Arrays.sort(values); + + try { + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_validation"); + assertQuerySucceeds("CREATE TABLE histogram_validation (c double)"); + assertQuerySucceeds(String.format("INSERT INTO histogram_validation VALUES %s", Joiner.on(", ").join(Arrays.stream(values).iterator()))); + assertQuerySucceeds(histogramSession, "ANALYZE histogram_validation"); + Consumer assertFilters = (value) -> { + // use Math.abs because if the value isn't found, the returned value of binary + // search is (- insert index). The absolute value index tells us roughly how + // many records would have been returned regardless of if the actual value is in the + // dataset + double estimatedRowCount = Math.abs(Arrays.binarySearch(values, value)); + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c <= " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(estimatedRowCount, 25)); + // check that inverse filter equals roughly the inverse number of rows + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c > " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(Math.max(0.0, values.length - estimatedRowCount), 25)); + // having an exact random double value from the distribution exist more than once is exceedingly rare. + // the histogram calculation should return 1 (and the inverse) in both situations + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c = " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(1.0, 25)); + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c != " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(values.length - 1, 25)); + }; + + assertFilters.accept(values[1]); // choose 1 greater than the min value + assertFilters.accept(-2.0); // should be very unlikely to generate a distribution where all values > -2.0 + assertFilters.accept(-1.0); + assertFilters.accept(0.0); + assertFilters.accept(1.0); + assertFilters.accept(2.0); // should be very unlikely to generate a distribution where all values < 2.0 + assertFilters.accept(values[values.length - 2]); // choose 1 less than the max value + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_validation"); + } + } + + /** + * Verifies that the data in the histogram matches the mins/maxs of the values + * in the table when created + */ + @Test(dataProvider = "validHistogramTypes") + public void testHistogramReconstruction(Type type, Object[] values) + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS verify_histograms"); + assertQuerySucceeds(String.format("CREATE TABLE verify_histograms (c %s)", type.getDisplayName())); + assertQuerySucceeds(String.format("INSERT INTO verify_histograms VALUES %s", Joiner.on(", ").join(values))); + assertQuerySucceeds(session, "ANALYZE verify_histograms"); + TableStatistics tableStatistics = getTableStats("verify_histograms", Optional.empty(), session); + Map nameToHandle = tableStatistics.getColumnStatistics().keySet() + .stream().map(IcebergColumnHandle.class::cast) + .collect(Collectors.toMap(BaseHiveColumnHandle::getName, identity())); + assertNotNull(nameToHandle.get("c")); + IcebergColumnHandle handle = nameToHandle.get("c"); + ColumnStatistics statistics = tableStatistics.getColumnStatistics().get(handle); + ConnectorHistogram histogram = statistics.getHistogram().get(); + DoubleRange range = statistics.getRange().get(); + double min = range.getMin(); + double max = range.getMax(); + assertEquals(histogram.inverseCumulativeProbability(0.0).getValue(), min); + assertEquals(histogram.inverseCumulativeProbability(1.0).getValue(), max); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS verify_histograms"); + } + } + private void testCheckDeleteFiles(Table icebergTable, int expectedSize, List expectedFileContent) { // check delete file list diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java index cbfeebf72797e..ca5da9444de01 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java @@ -66,7 +66,8 @@ public void testDefaults() .setSplitManagerThreads(Runtime.getRuntime().availableProcessors()) .setMetadataPreviousVersionsMax(METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT) .setMetadataDeleteAfterCommit(METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT) - .setMetricsMaxInferredColumn(METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT)); + .setMetricsMaxInferredColumn(METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT) + .setStatisticsKllSketchKParameter(1024)); } @Test @@ -97,6 +98,7 @@ public void testExplicitPropertyMappings() .put("iceberg.metadata-previous-versions-max", "1") .put("iceberg.metadata-delete-after-commit", "true") .put("iceberg.metrics-max-inferred-column", "16") + .put("iceberg.statistics-kll-sketch-k-parameter", "4096") .build(); IcebergConfig expected = new IcebergConfig() @@ -123,7 +125,8 @@ public void testExplicitPropertyMappings() .setSplitManagerThreads(42) .setMetadataPreviousVersionsMax(1) .setMetadataDeleteAfterCommit(true) - .setMetricsMaxInferredColumn(16); + .setMetricsMaxInferredColumn(16) + .setStatisticsKllSketchKParameter(4096); assertFullMapping(properties, expected); } diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/statistics/TestKllHistogram.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/statistics/TestKllHistogram.java new file mode 100644 index 0000000000000..57b3236997ecf --- /dev/null +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/statistics/TestKllHistogram.java @@ -0,0 +1,225 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.iceberg.statistics; + +import com.facebook.presto.common.type.CharType; +import com.facebook.presto.common.type.Type; +import com.facebook.presto.common.type.VarcharType; +import com.google.common.base.VerifyException; +import io.airlift.slice.Slices; +import org.apache.datasketches.common.ArrayOfDoublesSerDe; +import org.apache.datasketches.common.ArrayOfLongsSerDe; +import org.apache.datasketches.common.ArrayOfStringsSerDe; +import org.apache.datasketches.kll.KllItemsSketch; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.nio.ByteBuffer; +import java.util.List; +import java.util.concurrent.ThreadLocalRandom; +import java.util.stream.Collector; +import java.util.stream.Collectors; +import java.util.stream.DoubleStream; +import java.util.stream.IntStream; +import java.util.stream.LongStream; + +import static com.facebook.presto.common.type.BigintType.BIGINT; +import static com.facebook.presto.common.type.BooleanType.BOOLEAN; +import static com.facebook.presto.common.type.DateType.DATE; +import static com.facebook.presto.common.type.DecimalType.createDecimalType; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.common.type.IntegerType.INTEGER; +import static com.facebook.presto.common.type.RealType.REAL; +import static com.facebook.presto.common.type.TimeType.TIME; +import static com.facebook.presto.common.type.TimestampType.TIMESTAMP; +import static com.facebook.presto.common.type.TimestampType.TIMESTAMP_MICROSECONDS; +import static com.facebook.presto.common.type.TimestampWithTimeZoneType.TIMESTAMP_WITH_TIME_ZONE; +import static com.facebook.presto.common.type.VarcharType.VARCHAR; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertThrows; + +public class TestKllHistogram +{ + private final KllHistogram basicHistogram; + + public TestKllHistogram() + { + basicHistogram = generateDoublesHistogram(); + } + + @SuppressWarnings("unchecked") + @Test + public void testSimpleCreation() + { + KllItemsSketch sketch = KllItemsSketch.newHeapInstance(Double::compareTo, new ArrayOfDoublesSerDe()); + DoubleStream.iterate(0.0, i -> i + 1).limit(100).forEach(sketch::update); + KllHistogram histogram = new KllHistogram(Slices.wrappedBuffer(ByteBuffer.wrap(sketch.toByteArray())), DOUBLE); + assertSketchesEqual(histogram.getKllSketch(), sketch); + } + + @Test + public void testInverseCumulativeProbMin() + { + assertEquals(basicHistogram.inverseCumulativeProbability(0.0).getValue(), 0.0, 1E-8); + } + + @Test + public void testInverseCumulativeProbMax() + { + assertEquals(basicHistogram.inverseCumulativeProbability(1.0).getValue(), 99.0, 1E-8); + } + + @Test + public void testInverseCumulativeMiddle() + { + assertEquals(basicHistogram.inverseCumulativeProbability(0.5).getValue(), 49.0, 1E-8); + } + + @Test + public void testCumulativeMinInclusive() + { + assertEquals(basicHistogram.cumulativeProbability(0.0, true).getValue(), 0.01, 1E-8); + } + + @Test + public void testCumulativeMinExclusive() + { + assertEquals(basicHistogram.cumulativeProbability(0.0, false).getValue(), 0.0, 1E-8); + } + + @Test + public void testCumulativeMidExclusive() + { + assertEquals(basicHistogram.cumulativeProbability(49.0, false).getValue(), 0.49, 1E-8); + } + + @Test + public void testCumulativeMidInclusive() + { + assertEquals(basicHistogram.cumulativeProbability(49.0, true).getValue(), 0.5, 1E-8); + } + + @Test + public void testCumulativeMaxExclusive() + { + assertEquals(basicHistogram.cumulativeProbability(99.0, false).getValue(), 0.99, 1E-8); + } + + @Test + public void testCumulativeMaxInclusive() + { + assertEquals(basicHistogram.cumulativeProbability(99.0, true).getValue(), 1.0, 1E-8); + } + + @DataProvider(name = "kllSupportedTypes") + public static Object[][] kllHistogramTypeDataProvider() + { + return new Object[][] { + // long decimal (represented by Slice.class), currently not supported + // {createDecimalType(), TestKllHistogram.generateLongSketch()}, + {INTEGER, TestKllHistogram.generateLongSketch()}, + {BIGINT, TestKllHistogram.generateLongSketch()}, + {DOUBLE, TestKllHistogram.generateDoubleSketch()}, + {createDecimalType(3, 1), TestKllHistogram.generateDoubleSketch()}, + {DATE, TestKllHistogram.generateLongSketch()}, + {createDecimalType(38, 0), TestKllHistogram.generateDoubleSketch()}, + {TIME, generateLongSketch()}, + {TIMESTAMP_WITH_TIME_ZONE, generateLongSketch()}, + {TIMESTAMP, generateLongSketch()}, + {REAL, generateDoubleSketch()}, + {TIMESTAMP_MICROSECONDS, generateLongSketch()}, + }; + } + + @DataProvider(name = "kllUnsupportedTypes") + public static Object[][] unsupportedKllHistogramTypes() + { + return new Object[][] { + // long decimal (represented by Slice.class), currently not supported + {CharType.createCharType(0)}, + {CharType.createCharType(100)}, + {BOOLEAN}, + {VARCHAR}, + {VarcharType.createVarcharType(10)} + }; + } + + @SuppressWarnings("rawtypes") + @Test(dataProvider = "kllSupportedTypes") + public void testTypeCreation(Type type, KllItemsSketch sketch) + { + KllHistogram histogram = new KllHistogram(Slices.wrappedBuffer(sketch.toByteArray()), type); + double value = histogram.inverseCumulativeProbability(0.5).getValue(); + double probability = histogram.cumulativeProbability(49.0, true).getValue(); + assertEquals(probability, 0.5); + assertEquals(value, 49.0); + } + + @Test(dataProvider = "kllUnsupportedTypes") + public void testUnsupportedKllTypes(Type type) + { + assertThrows(VerifyException.class, () -> { + new KllHistogram(null, type); + }); + } + + /** + * @return generates a histogram of doubles from [0.0, 99.9] in intervals of 1.0 + */ + private static KllHistogram generateDoublesHistogram() + { + return new KllHistogram(Slices.wrappedBuffer(ByteBuffer.wrap(generateDoubleSketch().toByteArray())), DOUBLE); + } + + private static KllItemsSketch generateLongSketch() + { + KllItemsSketch sketch = KllItemsSketch.newHeapInstance(Long::compareTo, new ArrayOfLongsSerDe()); + LongStream.iterate(0, i -> i + 1).limit(100).forEach(sketch::update); + return sketch; + } + + private static KllItemsSketch generateDoubleSketch() + { + KllItemsSketch sketch = KllItemsSketch.newHeapInstance(Double::compareTo, new ArrayOfDoublesSerDe()); + DoubleStream.iterate(0.0, i -> i + 1).limit(100).forEach(sketch::update); + return sketch; + } + + private static KllItemsSketch generateStringSketch(int maxLen) + { + KllItemsSketch sketch = KllItemsSketch.newHeapInstance(String::compareTo, new ArrayOfStringsSerDe()); + List strings = LongStream.iterate(0, i -> i + 1).boxed() + .limit(100) + .map(idx -> IntStream.iterate(0, i -> 1 + 1) + .limit(ThreadLocalRandom.current().nextInt(1, maxLen)) + .mapToObj(i -> 'a' + (char) (ThreadLocalRandom.current().nextInt() % 26)) + .collect(Collector.of(StringBuilder::new, + StringBuilder::append, + StringBuilder::append, + StringBuilder::toString))) + .collect(Collectors.toList()); + strings.forEach(sketch::update); + return sketch; + } + + private static void assertSketchesEqual(KllItemsSketch sketch, KllItemsSketch other) + { + assertEquals(other.getK(), sketch.getK()); + assertEquals(other.getN(), sketch.getN()); + assertEquals(other.getMinItem(), sketch.getMinItem()); + assertEquals(other.getMaxItem(), sketch.getMaxItem()); + assertEquals(other.getSortedView().getCumulativeWeights(), sketch.getSortedView().getCumulativeWeights()); + assertEquals(other.getSortedView().getQuantiles(), sketch.getSortedView().getQuantiles()); + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java b/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java index 306e00cfb9079..efc05bf4a20a6 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java @@ -17,7 +17,10 @@ import com.facebook.presto.Session; import com.facebook.presto.SystemSessionProperties; import com.facebook.presto.spi.relation.VariableReferenceExpression; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; import com.facebook.presto.spi.statistics.Estimate; +import com.facebook.presto.spi.statistics.HistogramCalculator; +import com.facebook.presto.spi.statistics.UniformDistributionHistogram; import com.facebook.presto.sql.tree.ComparisonExpression; import java.util.Optional; @@ -156,7 +159,7 @@ private PlanNodeStatsEstimate estimateFilterRange( .setStatisticsRange(intersectRange) .setNullsFraction(0.0); if (useHistograms) { - symbolNewEstimate.setHistogram(expressionStatistics.getHistogram().map(expressionHistogram -> DisjointRangeDomainHistogram.addConjunction(expressionHistogram, intersectRange))); + symbolNewEstimate.setHistogram(expressionStatistics.getHistogram().map(expressionHistogram -> DisjointRangeDomainHistogram.addConjunction(expressionHistogram, intersectRange.toPrestoRange()))); } estimate = estimate.mapVariableColumnStatistics(expressionVariable.get(), oldStats -> symbolNewEstimate.build()); @@ -171,7 +174,7 @@ private double calculateFilterFactor(VariableStatsEstimate variableStatistics, S Estimate filterEstimate; if (useHistograms) { Estimate distinctEstimate = isNaN(variableStatistics.getDistinctValuesCount()) ? Estimate.unknown() : Estimate.of(variableRange.getDistinctValuesCount()); - filterEstimate = HistogramCalculator.calculateFilterFactor(intersectRange, variableStatistics.getHistogram().orElse(new UniformDistributionHistogram(variableStatistics.getLowValue(), variableStatistics.getHighValue())), distinctEstimate, true); + filterEstimate = HistogramCalculator.calculateFilterFactor(intersectRange.toPrestoRange(), intersectRange.getDistinctValuesCount(), variableStatistics.getHistogram().orElse(new UniformDistributionHistogram(variableStatistics.getLowValue(), variableStatistics.getHighValue())), distinctEstimate, true); if (log.isDebugEnabled()) { double expressionFilter = variableRange.overlapPercentWith(intersectRange); if (!Double.isNaN(expressionFilter) && diff --git a/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java b/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java index 7a2e804aa7e17..85a51c2b6bb51 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java @@ -141,6 +141,11 @@ private static ColumnStatistics toColumnStatistics(VariableStatsEstimate variabl if (!Double.isNaN(variableStatsEstimate.getLowValue()) && !Double.isNaN(variableStatsEstimate.getHighValue())) { builder.setRange(new DoubleRange(variableStatsEstimate.getLowValue(), variableStatsEstimate.getHighValue())); } + + if (variableStatsEstimate.getHistogram().isPresent()) { + builder.setHistogram(variableStatsEstimate.getHistogram()); + } + return builder.build(); } } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java b/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java index 6547c0b463e1c..fdf4cf1b4bcea 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java @@ -34,10 +34,10 @@ import static com.facebook.presto.SystemSessionProperties.getDefaultJoinSelectivityCoefficient; import static com.facebook.presto.SystemSessionProperties.shouldOptimizerUseHistograms; -import static com.facebook.presto.cost.DisjointRangeDomainHistogram.addConjunction; import static com.facebook.presto.cost.FilterStatsCalculator.UNKNOWN_FILTER_COEFFICIENT; import static com.facebook.presto.cost.VariableStatsEstimate.buildFrom; import static com.facebook.presto.expressions.LogicalRowExpressions.extractConjuncts; +import static com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram.addConjunction; import static com.facebook.presto.sql.analyzer.ExpressionTreeUtils.getNodeLocation; import static com.facebook.presto.sql.planner.plan.Patterns.join; import static com.facebook.presto.sql.tree.ComparisonExpression.Operator.EQUAL; @@ -250,7 +250,7 @@ private PlanNodeStatsEstimate filterByAuxiliaryClause(PlanNodeStatsEstimate stat .setStatisticsRange(intersect) .setDistinctValuesCount(retainedNdv); if (useHistograms) { - newLeftStats.setHistogram(leftStats.getHistogram().map(leftHistogram -> addConjunction(leftHistogram, intersect))); + newLeftStats.setHistogram(leftStats.getHistogram().map(leftHistogram -> addConjunction(leftHistogram, intersect.toPrestoRange()))); } VariableStatsEstimate.Builder newRightStats = buildFrom(rightStats) @@ -258,7 +258,7 @@ private PlanNodeStatsEstimate filterByAuxiliaryClause(PlanNodeStatsEstimate stat .setStatisticsRange(intersect) .setDistinctValuesCount(retainedNdv); if (useHistograms) { - newRightStats.setHistogram(rightStats.getHistogram().map(rightHistogram -> addConjunction(rightHistogram, intersect))); + newRightStats.setHistogram(rightStats.getHistogram().map(rightHistogram -> addConjunction(rightHistogram, intersect.toPrestoRange()))); } PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.buildFrom(stats) diff --git a/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java b/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java index 1b2797e18a8af..2a280ae2524ba 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java @@ -14,10 +14,11 @@ package com.facebook.presto.cost; import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; import java.util.Optional; -import static com.facebook.presto.cost.DisjointRangeDomainHistogram.addConjunction; +import static com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram.addConjunction; import static com.google.common.base.Preconditions.checkArgument; import static java.lang.Double.NaN; import static java.lang.Double.isNaN; @@ -139,7 +140,7 @@ public PlanNodeStatsEstimate capStats(PlanNodeStatsEstimate stats, PlanNodeStats double cappedNullsFraction = cappedRowCount == 0 ? 1 : cappedNumberOfNulls / cappedRowCount; newSymbolStats.setNullsFraction(cappedNullsFraction); if (shouldUseHistograms) { - newSymbolStats.setHistogram(symbolStats.getHistogram().map(symbolHistogram -> addConjunction(symbolHistogram, new StatisticRange(newLow, newHigh, 0)))); + newSymbolStats.setHistogram(symbolStats.getHistogram().map(symbolHistogram -> addConjunction(symbolHistogram, new StatisticRange(newLow, newHigh, 0).toPrestoRange()))); } result.addVariableStatistics(symbol, newSymbolStats.build()); @@ -296,8 +297,8 @@ private VariableStatsEstimate addColumnStats( .setNullsFraction(newNullsFraction); if (shouldUseHistograms) { Optional newHistogram = RangeAdditionStrategy.INTERSECT == strategy ? - leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addConjunction(leftHistogram, rightRange)) : - leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addDisjunction(leftHistogram, rightRange)); + leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addConjunction(leftHistogram, rightRange.toPrestoRange())) : + leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addDisjunction(leftHistogram, rightRange.toPrestoRange())); statistics.setHistogram(newHistogram); } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java b/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java index 4d80e13cb92cc..060e02bc8b07b 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java @@ -13,19 +13,19 @@ */ package com.facebook.presto.cost; +import com.facebook.presto.common.predicate.Range; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.common.collect.BoundType; -import com.google.common.collect.Range; import java.util.Objects; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; import static com.facebook.presto.util.MoreMath.nearlyEqual; import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkArgument; -import static java.lang.Double.NEGATIVE_INFINITY; import static java.lang.Double.NaN; -import static java.lang.Double.POSITIVE_INFINITY; import static java.lang.Double.isFinite; import static java.lang.Double.isInfinite; import static java.lang.Double.isNaN; @@ -36,9 +36,6 @@ public class StatisticRange { - protected static final double INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.25; - protected static final double INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.5; - // TODO unify field and method names with SymbolStatsEstimate /** * {@code NaN} represents empty range ({@code high} must be {@code NaN} too) @@ -222,19 +219,12 @@ public StatisticRange addAndCollapseDistinctValues(StatisticRange other) return expandRangeWithNewDistinct(newDistinctValues, other); } - public Range toRange() + public Range toPrestoRange() { - return Range.range(low, openLow ? BoundType.OPEN : BoundType.CLOSED, high, openHigh ? BoundType.OPEN : BoundType.CLOSED); - } - - public static StatisticRange fromRange(Range range) - { - return new StatisticRange( - range.hasLowerBound() ? range.lowerEndpoint() : NEGATIVE_INFINITY, - !range.hasLowerBound() || range.lowerBoundType() == BoundType.OPEN, - range.hasUpperBound() ? range.upperEndpoint() : POSITIVE_INFINITY, - !range.hasUpperBound() || range.upperBoundType() == BoundType.OPEN, - NaN); + if (low == high) { + return Range.equal(DOUBLE, low); + } + return Range.range(DOUBLE, low, !openLow, high, !openHigh); } private StatisticRange expandRangeWithNewDistinct(double newDistinctValues, StatisticRange other) diff --git a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java index 7a7396824e091..801cb6f337732 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java @@ -21,6 +21,7 @@ import com.facebook.presto.common.type.IntegerType; import com.facebook.presto.common.type.RealType; import com.facebook.presto.common.type.SmallintType; +import com.facebook.presto.common.type.SqlTime; import com.facebook.presto.common.type.SqlTimestamp; import com.facebook.presto.common.type.TinyintType; import com.facebook.presto.common.type.Type; @@ -80,6 +81,7 @@ import static com.facebook.presto.common.type.SqlTimestamp.MICROSECONDS_PER_MILLISECOND; import static com.facebook.presto.common.type.StandardTypes.DOUBLE; import static com.facebook.presto.common.type.StandardTypes.VARCHAR; +import static com.facebook.presto.common.type.TimeType.TIME; import static com.facebook.presto.common.type.TimestampType.TIMESTAMP; import static com.facebook.presto.metadata.MetadataUtil.createQualifiedObjectName; import static com.facebook.presto.sql.QueryUtil.aliased; @@ -373,6 +375,9 @@ private Expression toStringLiteral(Type type, double value) if (type.equals(TIMESTAMP)) { return new StringLiteral(new SqlTimestamp(round(value) / MICROSECONDS_PER_MILLISECOND, session.getSqlFunctionProperties().getTimeZoneKey(), MILLISECONDS).toString()); } + if (type.equals(TIME)) { + return new StringLiteral(new SqlTime(round(value)).toString()); + } throw new IllegalArgumentException("Unexpected type: " + type); } } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java b/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java deleted file mode 100644 index ddccfdfe3c065..0000000000000 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.facebook.presto.cost; - -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import org.testng.annotations.Test; - -import static com.facebook.presto.cost.HistogramCalculator.calculateFilterFactor; -import static java.lang.Double.NEGATIVE_INFINITY; -import static java.lang.Double.NaN; -import static java.lang.Double.POSITIVE_INFINITY; -import static org.testng.Assert.assertEquals; - -public class TestHistogramCalculator -{ - @Test - public void testCalculateFilterFactor() - { - StatisticRange zeroToTen = range(0, 10, 10); - StatisticRange empty = StatisticRange.empty(); - - // Equal ranges - assertFilterFactor(Estimate.of(1.0), zeroToTen, uniformHist(0, 10), 5); - assertFilterFactor(Estimate.of(1.0), zeroToTen, uniformHist(0, 10), 20); - - // Some overlap - assertFilterFactor(Estimate.of(0.5), range(5, 3000, 5), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // Single value overlap - assertFilterFactor(Estimate.of(1.0 / zeroToTen.getDistinctValuesCount()), range(3, 3, 1), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - assertFilterFactor(Estimate.of(1.0 / zeroToTen.getDistinctValuesCount()), range(10, 100, 357), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // No overlap - assertFilterFactor(Estimate.zero(), range(20, 30, 10), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // Empty ranges - assertFilterFactor(Estimate.zero(), zeroToTen, uniformHist(empty), empty.getDistinctValuesCount()); - assertFilterFactor(Estimate.zero(), empty, uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // no test for (empty, empty) since any return value is correct - assertFilterFactor(Estimate.zero(), unboundedRange(10), uniformHist(empty), empty.getDistinctValuesCount()); - assertFilterFactor(Estimate.zero(), empty, uniformHist(unboundedRange(10)), 10); - - // Unbounded (infinite), NDV-based - assertFilterFactor(Estimate.of(0.5), unboundedRange(10), uniformHist(unboundedRange(20)), 20); - assertFilterFactor(Estimate.of(1.0), unboundedRange(20), uniformHist(unboundedRange(10)), 10); - - // NEW TESTS (TPC-H Q2) - // unbounded ranges - assertFilterFactor(Estimate.of(.5), unboundedRange(0.5), uniformHist(unboundedRange(NaN)), NaN); - // unbounded ranges with limited distinct values - assertFilterFactor(Estimate.of(0.2), unboundedRange(1.0), - domainConstrained(unboundedRange(5.0), uniformHist(unboundedRange(7.0))), 5.0); - } - - private static StatisticRange range(double low, double high, double distinctValues) - { - return new StatisticRange(low, high, distinctValues); - } - - private static StatisticRange unboundedRange(double distinctValues) - { - return new StatisticRange(NEGATIVE_INFINITY, POSITIVE_INFINITY, distinctValues); - } - - private static void assertFilterFactor(Estimate expected, StatisticRange range, ConnectorHistogram histogram, double totalDistinctValues) - { - assertEquals( - calculateFilterFactor(range, histogram, Estimate.estimateFromDouble(totalDistinctValues), true), - expected); - } - - private static ConnectorHistogram uniformHist(StatisticRange range) - { - return uniformHist(range.getLow(), range.getHigh()); - } - - private static ConnectorHistogram uniformHist(double low, double high) - { - return new UniformDistributionHistogram(low, high); - } - - private static ConnectorHistogram domainConstrained(StatisticRange range, ConnectorHistogram source) - { - return DisjointRangeDomainHistogram.addDisjunction(source, range); - } -} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java b/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java index dc2e60bb46e8d..79e47477bc129 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java @@ -15,6 +15,8 @@ import com.facebook.presto.spi.relation.VariableReferenceExpression; import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; +import com.facebook.presto.spi.statistics.UniformDistributionHistogram; import org.testng.annotations.Test; import java.util.Optional; @@ -374,31 +376,31 @@ public void testAddHistograms() assertEquals(calculator.addStatsAndCollapseDistinctValues(unknownRowCount, unknownRowCount).getVariableStatistics(VARIABLE).getHistogram(), Optional.empty()); // check when rows are available histograms are added properly. - ConnectorHistogram addedSameRange = DisjointRangeDomainHistogram.addDisjunction(unknownNullsFraction.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen); + ConnectorHistogram addedSameRange = DisjointRangeDomainHistogram.addDisjunction(unknownNullsFraction.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen.toPrestoRange()); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndSumDistinctValues, addedSameRange); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndCollapseDistinctValues, addedSameRange); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndMaxDistinctValues, addedSameRange); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndIntersect, addedSameRange); // check when only a sub-range is added, that the histogram still represents the full range - ConnectorHistogram fullRangeFirst = DisjointRangeDomainHistogram.addDisjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen); - ConnectorHistogram intersectedRangeSecond = DisjointRangeDomainHistogram.addConjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToFive); + ConnectorHistogram fullRangeFirst = DisjointRangeDomainHistogram.addDisjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen.toPrestoRange()); + ConnectorHistogram intersectedRangeSecond = DisjointRangeDomainHistogram.addConjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToFive.toPrestoRange()); assertAddStatsHistogram(first, second, calculator::addStatsAndSumDistinctValues, fullRangeFirst); assertAddStatsHistogram(first, second, calculator::addStatsAndCollapseDistinctValues, fullRangeFirst); assertAddStatsHistogram(first, second, calculator::addStatsAndMaxDistinctValues, fullRangeFirst); assertAddStatsHistogram(first, second, calculator::addStatsAndIntersect, intersectedRangeSecond); // check when two ranges overlap, the new stats span both ranges - ConnectorHistogram fullRangeSecondThird = DisjointRangeDomainHistogram.addDisjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen); - ConnectorHistogram intersectedRangeSecondThird = DisjointRangeDomainHistogram.addConjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen); + ConnectorHistogram fullRangeSecondThird = DisjointRangeDomainHistogram.addDisjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen.toPrestoRange()); + ConnectorHistogram intersectedRangeSecondThird = DisjointRangeDomainHistogram.addConjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen.toPrestoRange()); assertAddStatsHistogram(second, third, calculator::addStatsAndSumDistinctValues, fullRangeSecondThird); assertAddStatsHistogram(second, third, calculator::addStatsAndCollapseDistinctValues, fullRangeSecondThird); assertAddStatsHistogram(second, third, calculator::addStatsAndMaxDistinctValues, fullRangeSecondThird); assertAddStatsHistogram(second, third, calculator::addStatsAndIntersect, intersectedRangeSecondThird); // check when two ranges partially overlap, the addition/intersection is applied correctly - ConnectorHistogram fullRangeThirdFourth = DisjointRangeDomainHistogram.addDisjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven); - ConnectorHistogram intersectedRangeThirdFourth = DisjointRangeDomainHistogram.addConjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven); + ConnectorHistogram fullRangeThirdFourth = DisjointRangeDomainHistogram.addDisjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven.toPrestoRange()); + ConnectorHistogram intersectedRangeThirdFourth = DisjointRangeDomainHistogram.addConjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven.toPrestoRange()); assertAddStatsHistogram(third, fourth, calculator::addStatsAndSumDistinctValues, fullRangeThirdFourth); assertAddStatsHistogram(third, fourth, calculator::addStatsAndCollapseDistinctValues, fullRangeThirdFourth); assertAddStatsHistogram(third, fourth, calculator::addStatsAndMaxDistinctValues, fullRangeThirdFourth); @@ -419,7 +421,7 @@ private static PlanNodeStatsEstimate statistics(double rowCount, double totalSiz .setNullsFraction(nullsFraction) .setAverageRowSize(averageRowSize) .setStatisticsRange(range) - .setHistogram(Optional.of(DisjointRangeDomainHistogram.addConjunction(new UniformDistributionHistogram(range.getLow(), range.getHigh()), range))) + .setHistogram(Optional.of(DisjointRangeDomainHistogram.addConjunction(new UniformDistributionHistogram(range.getLow(), range.getHigh()), range.toPrestoRange()))) .build()) .build(); } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java b/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java index b26665eb30af4..b0f364d1f34e1 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java @@ -15,7 +15,7 @@ package com.facebook.presto.cost; import com.facebook.airlift.json.JsonCodec; -import com.google.common.collect.Range; +import com.facebook.presto.spi.statistics.UniformDistributionHistogram; import org.testng.annotations.Test; import java.util.Optional; @@ -33,7 +33,7 @@ public void testSkipHistogramSerialization() VariableStatsEstimate estimate = VariableStatsEstimate.builder() .setAverageRowSize(100) .setDistinctValuesCount(100) - .setStatisticsRange(StatisticRange.fromRange(Range.open(1.0d, 2.0d))) + .setStatisticsRange(new StatisticRange(55, 65, 100)) .setHistogram(Optional.of(new UniformDistributionHistogram(55, 65))) .setNullsFraction(0.1) .build(); diff --git a/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/ApproximateStatsOutputRowCountMatcher.java b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/ApproximateStatsOutputRowCountMatcher.java new file mode 100644 index 0000000000000..720adc0a0bf5b --- /dev/null +++ b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/ApproximateStatsOutputRowCountMatcher.java @@ -0,0 +1,54 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.sql.planner.assertions; + +import com.facebook.presto.Session; +import com.facebook.presto.cost.StatsProvider; +import com.facebook.presto.metadata.Metadata; +import com.facebook.presto.spi.plan.PlanNode; + +import static com.google.common.base.Verify.verify; + +public class ApproximateStatsOutputRowCountMatcher + implements Matcher +{ + private final double expectedOutputRowCount; + private final double error; + + ApproximateStatsOutputRowCountMatcher(double expectedOutputRowCount, double error) + { + verify(error >= 0.0, "error must be >= 0.0"); + verify(expectedOutputRowCount >= 0.0, "expectedOutputRowCount must be >= 0.0"); + this.expectedOutputRowCount = expectedOutputRowCount; + this.error = error; + } + + @Override + public boolean shapeMatches(PlanNode node) + { + return true; + } + + @Override + public MatchResult detailMatches(PlanNode node, StatsProvider stats, Session session, Metadata metadata, SymbolAliases symbolAliases) + { + return new MatchResult(Math.abs(stats.getStats(node).getOutputRowCount() - expectedOutputRowCount) < error); + } + + @Override + public String toString() + { + return "approximateExpectedOutputRowCount(" + expectedOutputRowCount + ", " + error + ")"; + } +} diff --git a/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java index d55a6bc544223..60c17c5cbdc01 100644 --- a/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java +++ b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java @@ -815,6 +815,12 @@ public PlanMatchPattern withOutputRowCount(boolean exactMatch, String expectedSo return this; } + public PlanMatchPattern withApproximateOutputRowCount(double expectedOutputRowCount, double error) + { + matchers.add(new ApproximateStatsOutputRowCountMatcher(expectedOutputRowCount, error)); + return this; + } + public PlanMatchPattern withOutputSize(double expectedOutputSize) { matchers.add(new StatsOutputSizeMatcher(expectedOutputSize)); diff --git a/presto-spi/pom.xml b/presto-spi/pom.xml index 2454397e48c72..50e975b0ef95f 100644 --- a/presto-spi/pom.xml +++ b/presto-spi/pom.xml @@ -126,5 +126,11 @@ assertj-core test + + + org.apache.commons + commons-math3 + test + diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java index bec6fb5c31299..82a2b7da25e19 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java @@ -23,6 +23,8 @@ public final class ColumnStatistics { + public static final double INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.25; + public static final double INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.5; private static final ColumnStatistics EMPTY = new ColumnStatistics(Estimate.unknown(), Estimate.unknown(), Estimate.unknown(), Optional.empty(), Optional.empty()); private final Estimate nullsFraction; @@ -209,6 +211,11 @@ public Builder setHistogram(Optional histogram) return this; } + public Optional getHistogram() + { + return histogram; + } + public Builder mergeWith(Builder other) { if (nullsFraction.isUnknown()) { diff --git a/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DisjointRangeDomainHistogram.java similarity index 74% rename from presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java rename to presto-spi/src/main/java/com/facebook/presto/spi/statistics/DisjointRangeDomainHistogram.java index 1d4b44140de41..32b383ef42053 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DisjointRangeDomainHistogram.java @@ -12,35 +12,33 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; +import com.facebook.presto.common.predicate.Marker; +import com.facebook.presto.common.predicate.Range; +import com.facebook.presto.common.predicate.SortedRangeSet; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.common.base.Suppliers; -import com.google.common.collect.BoundType; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Range; -import com.google.common.collect.RangeSet; -import com.google.common.collect.TreeRangeSet; -import java.util.Collection; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; import java.util.NoSuchElementException; import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.function.Supplier; -import static com.facebook.presto.cost.HistogramCalculator.calculateFilterFactor; -import static com.facebook.presto.util.MoreMath.max; -import static com.facebook.presto.util.MoreMath.min; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.facebook.presto.common.Utils.checkArgument; +import static com.facebook.presto.common.Utils.memoizedSupplier; +import static com.facebook.presto.common.Utils.toStringHelper; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static java.lang.Double.NEGATIVE_INFINITY; +import static java.lang.Double.NaN; import static java.lang.Double.POSITIVE_INFINITY; import static java.lang.Double.isFinite; +import static java.lang.Math.max; +import static java.lang.Math.min; import static java.util.Objects.hash; import static java.util.Objects.requireNonNull; @@ -71,31 +69,27 @@ public class DisjointRangeDomainHistogram private final ConnectorHistogram source; // use RangeSet as the internal representation of the ranges, but the constructor arguments // use StatisticRange to support serialization and deserialization. - private final Supplier> rangeSet; - private final Set> ranges; + private final Supplier rangeSet; + private final Set ranges; @JsonCreator - public DisjointRangeDomainHistogram(@JsonProperty("source") ConnectorHistogram source, @JsonProperty("ranges") Collection ranges) - { - this(source, ranges.stream().map(StatisticRange::toRange).collect(toImmutableSet())); - } - - public DisjointRangeDomainHistogram(ConnectorHistogram source, Set> ranges) + public DisjointRangeDomainHistogram(ConnectorHistogram source, Set ranges) { this.source = requireNonNull(source, "source is null"); this.ranges = requireNonNull(ranges, "ranges is null"); - this.rangeSet = Suppliers.memoize(() -> { - RangeSet rangeSet = TreeRangeSet.create(); - rangeSet.addAll(ranges); + this.rangeSet = memoizedSupplier(() -> { + SortedRangeSet rangeSet = SortedRangeSet.copyOf(DOUBLE, new ArrayList<>(ranges)); return rangeSet.subRangeSet(getSourceSpan(this.source)); }); } - private static Range getSourceSpan(ConnectorHistogram source) + private static Range getSourceSpan(ConnectorHistogram source) { - return Range.closed( + return Range.range(DOUBLE, source.inverseCumulativeProbability(0.0).orElse(() -> NEGATIVE_INFINITY), - source.inverseCumulativeProbability(1.0).orElse(() -> POSITIVE_INFINITY)); + true, + source.inverseCumulativeProbability(1.0).orElse(() -> POSITIVE_INFINITY), + true); } @JsonProperty @@ -105,14 +99,14 @@ public ConnectorHistogram getSource() } @JsonProperty - public Set getRanges() + public SortedRangeSet getRanges() { - return rangeSet.get().asRanges().stream().map(StatisticRange::fromRange).collect(toImmutableSet()); + return rangeSet.get(); } public DisjointRangeDomainHistogram(ConnectorHistogram source) { - this(source, ImmutableSet.>of()); + this(source, Collections.emptySet()); } @Override @@ -126,17 +120,22 @@ public Estimate cumulativeProbability(double value, boolean inclusive) if (Double.isNaN(value)) { return Estimate.unknown(); } - Optional> optionalSpan = getSpan(); + Optional optionalSpan = getSpan(); if (!optionalSpan.isPresent()) { return Estimate.of(0.0); } - Range span = optionalSpan.get(); - if (value <= span.lowerEndpoint()) { + Range span = optionalSpan.get(); + if (value <= span.getLowValue().map(Double.class::cast) + .orElse(NEGATIVE_INFINITY)) { return Estimate.of(0.0); } - Range input = Range.range(span.lowerEndpoint(), span.lowerBoundType(), value, inclusive ? BoundType.CLOSED : BoundType.OPEN); + Range input = Range.range(DOUBLE, + span.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY), + span.getLow().getBound() == Marker.Bound.EXACTLY, + value, + inclusive); Estimate fullSetOverlap = calculateRangeSetOverlap(rangeSet.get()); - RangeSet spanned = rangeSet.get().subRangeSet(input); + SortedRangeSet spanned = rangeSet.get().subRangeSet(input); Estimate spannedOverlap = calculateRangeSetOverlap(spanned); return spannedOverlap.flatMap(spannedProbability -> @@ -148,11 +147,11 @@ public Estimate cumulativeProbability(double value, boolean inclusive) })); } - private Estimate calculateRangeSetOverlap(RangeSet ranges) + private Estimate calculateRangeSetOverlap(SortedRangeSet ranges) { // we require knowing bounds on all ranges double cumulativeTotal = 0.0; - for (Range range : ranges.asRanges()) { + for (Range range : ranges.getOrderedRanges()) { Estimate rangeProbability = getRangeProbability(range); if (rangeProbability.isUnknown()) { return Estimate.unknown(); @@ -169,9 +168,9 @@ private Estimate calculateRangeSetOverlap(RangeSet ranges) * @param range the range over the source domain * @return estimate of the total probability the range covers in the source */ - private Estimate getRangeProbability(Range range) + private Estimate getRangeProbability(Range range) { - return calculateFilterFactor(StatisticRange.fromRange(range), source, Estimate.unknown(), false); + return HistogramCalculator.calculateFilterFactor(range, NaN, source, Estimate.unknown(), false); } @Override @@ -185,17 +184,19 @@ public Estimate inverseCumulativeProbability(double percentile) // rangedPercentile = percentile - percentileLow // // percentileLow + (rangedPercentile * rangePercentileLength) - Optional> optionalSpan = getSpan(); + Optional optionalSpan = getSpan(); if (!optionalSpan.isPresent()) { return Estimate.unknown(); } - Range span = optionalSpan.get(); - if (percentile == 0.0 && isFinite(span.lowerEndpoint())) { - return source.inverseCumulativeProbability(0.0).map(sourceMin -> max(span.lowerEndpoint(), sourceMin)); + Range span = optionalSpan.get(); + double lower = span.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY); + double upper = span.getHighValue().map(Double.class::cast).orElse(POSITIVE_INFINITY); + if (percentile == 0.0 && isFinite(lower)) { + return source.inverseCumulativeProbability(0.0).map(sourceMin -> max(lower, sourceMin)); } - if (percentile == 1.0 && isFinite(span.upperEndpoint())) { - return source.inverseCumulativeProbability(1.0).map(sourceMax -> min(span.upperEndpoint(), sourceMax)); + if (percentile == 1.0 && isFinite(upper)) { + return source.inverseCumulativeProbability(1.0).map(sourceMax -> min(upper, sourceMax)); } Estimate totalCumulativeEstimate = calculateRangeSetOverlap(rangeSet.get()); @@ -209,9 +210,9 @@ public Estimate inverseCumulativeProbability(double percentile) } double cumulativeProbabilityNewDomain = 0.0; double lastRangeEstimateSourceDomain = 0.0; - Range currentRange = null; + Range currentRange = null; // find the range where the percentile falls - for (Range range : rangeSet.get().asRanges()) { + for (Range range : rangeSet.get().getOrderedRanges()) { Estimate rangeEstimate = getRangeProbability(range); if (rangeEstimate.isUnknown()) { return Estimate.unknown(); @@ -227,7 +228,8 @@ public Estimate inverseCumulativeProbability(double percentile) // no ranges to iterate over. Did a constraint cut the entire domain of values? return Estimate.unknown(); } - Estimate rangeLeftSourceEstimate = source.cumulativeProbability(currentRange.lowerEndpoint(), currentRange.lowerBoundType() == BoundType.OPEN); + Double currentLow = currentRange.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY); + Estimate rangeLeftSourceEstimate = source.cumulativeProbability(currentLow, !currentRange.isLowInclusive()); if (rangeLeftSourceEstimate.isUnknown()) { return Estimate.unknown(); } @@ -246,12 +248,10 @@ public Estimate inverseCumulativeProbability(double percentile) * @param other the new range to add to the set. * @return a new {@link DisjointRangeDomainHistogram} */ - public DisjointRangeDomainHistogram addDisjunction(StatisticRange other) + public DisjointRangeDomainHistogram addDisjunction(Range other) { - Set> ranges = ImmutableSet.>builder() - .addAll(this.ranges) - .add(other.toRange()) - .build(); + Set ranges = new HashSet<>(this.ranges); + ranges.add(other); return new DisjointRangeDomainHistogram(source, ranges); } @@ -262,9 +262,9 @@ public DisjointRangeDomainHistogram addDisjunction(StatisticRange other) * @param other the range that should enclose the set. * @return a new {@link DisjointRangeDomainHistogram} where */ - public DisjointRangeDomainHistogram addConjunction(StatisticRange other) + public DisjointRangeDomainHistogram addConjunction(Range other) { - return new DisjointRangeDomainHistogram(source, rangeSet.get().subRangeSet(other.toRange()).asRanges()); + return new DisjointRangeDomainHistogram(source, new HashSet<>(rangeSet.get().subRangeSet(other).getOrderedRanges())); } /** @@ -283,17 +283,17 @@ public DisjointRangeDomainHistogram addConjunction(StatisticRange other) * @param range the range representing the conjunction to add * @return a new histogram with the conjunction applied. */ - public static ConnectorHistogram addDisjunction(ConnectorHistogram histogram, StatisticRange range) + public static ConnectorHistogram addDisjunction(ConnectorHistogram histogram, Range range) { if (histogram instanceof DisjointRangeDomainHistogram) { return ((DisjointRangeDomainHistogram) histogram).addDisjunction(range); } - return new DisjointRangeDomainHistogram(histogram, ImmutableSet.of(range.toRange())); + return new DisjointRangeDomainHistogram(histogram, Collections.singleton(range)); } /** - * Similar to {@link #addDisjunction(ConnectorHistogram, StatisticRange)} this method constrains + * Similar to {@link #addDisjunction(ConnectorHistogram, Range)} this method constrains * the entire domain such that all ranges in the set intersect with the given range * argument to this method. *
@@ -304,22 +304,24 @@ public static ConnectorHistogram addDisjunction(ConnectorHistogram histogram, St * @param range the range of values that the entire histogram's domain must fall within * @return a histogram with the new range constraint */ - public static ConnectorHistogram addConjunction(ConnectorHistogram histogram, StatisticRange range) + public static ConnectorHistogram addConjunction(ConnectorHistogram histogram, Range range) { if (histogram instanceof DisjointRangeDomainHistogram) { return ((DisjointRangeDomainHistogram) histogram).addConjunction(range); } - return new DisjointRangeDomainHistogram(histogram, ImmutableSet.of(range.toRange())); + return new DisjointRangeDomainHistogram(histogram, Collections.singleton(range)); } /** * @return the span if it exists, empty otherwise */ - private Optional> getSpan() + private Optional getSpan() { try { - return Optional.of(rangeSet.get().span()); + return Optional.of(rangeSet.get()) + .filter(set -> !set.isNone()) // prevent exception + .map(SortedRangeSet::getSpan); } catch (NoSuchElementException e) { return Optional.empty(); @@ -331,7 +333,7 @@ public String toString() { return toStringHelper(this) .add("source", this.source) - .add("rangeSet", this.rangeSet) + .add("rangeSet", this.rangeSet.get()) .toString(); } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/HistogramCalculator.java similarity index 69% rename from presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java rename to presto-spi/src/main/java/com/facebook/presto/spi/statistics/HistogramCalculator.java index 12525b6120ccb..2ae539efd5e79 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/HistogramCalculator.java @@ -12,12 +12,14 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import com.google.common.math.DoubleMath; +import com.facebook.presto.common.predicate.Range; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; +import static java.lang.Double.NEGATIVE_INFINITY; +import static java.lang.Double.POSITIVE_INFINITY; import static java.lang.Double.isFinite; import static java.lang.Double.isNaN; import static java.lang.Math.min; @@ -43,16 +45,19 @@ private HistogramCalculator() * heuristic would have been used * @return an estimate, x, where 0.0 <= x <= 1.0. */ - public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHistogram histogram, Estimate totalDistinctValues, boolean useHeuristics) + public static Estimate calculateFilterFactor(Range range, double rangeDistinctValues, ConnectorHistogram histogram, Estimate totalDistinctValues, boolean useHeuristics) { - boolean openHigh = range.getOpenHigh(); - boolean openLow = range.getOpenLow(); + boolean openHigh = !range.isLowInclusive(); + boolean openLow = !range.isHighInclusive(); Estimate min = histogram.inverseCumulativeProbability(0.0); Estimate max = histogram.inverseCumulativeProbability(1.0); + double rangeLow = range.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY); + double rangeHigh = range.getHighValue().map(Double.class::cast).orElse(POSITIVE_INFINITY); + double rangeLength = rangeHigh - rangeLow; // range is either above or below histogram - if ((!max.isUnknown() && (openHigh ? max.getValue() <= range.getLow() : max.getValue() < range.getLow())) - || (!min.isUnknown() && (openLow ? min.getValue() >= range.getHigh() : min.getValue() > range.getHigh()))) { + if ((!max.isUnknown() && (openHigh ? max.getValue() <= rangeLow : max.getValue() < rangeLow)) + || (!min.isUnknown() && (openLow ? min.getValue() >= rangeHigh : min.getValue() > rangeHigh))) { return Estimate.of(0.0); } @@ -63,14 +68,14 @@ public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHist return Estimate.unknown(); } - if (range.length() == 0.0) { + if (rangeLength == 0.0) { return totalDistinctValues.map(distinct -> 1.0 / distinct); } - if (isFinite(range.length())) { - return Estimate.of(StatisticRange.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + if (isFinite(rangeLength)) { + return Estimate.of(INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } - return Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + return Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } // we know the bounds are both known, so calculate the percentile for each bound @@ -82,8 +87,8 @@ public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHist // thus for the "lowPercentile" calculation we should pass "false" to be non-inclusive // (same as openness) however, on the high-end we want the inclusivity to be the opposite // of the openness since if it's open, we _don't_ want to include the bound. - Estimate lowPercentile = histogram.cumulativeProbability(range.getLow(), openLow); - Estimate highPercentile = histogram.cumulativeProbability(range.getHigh(), !openHigh); + Estimate lowPercentile = histogram.cumulativeProbability(rangeLow, openLow); + Estimate highPercentile = histogram.cumulativeProbability(rangeHigh, !openHigh); // both bounds are probably infinity, use the infinite-infinite heuristic if (lowPercentile.isUnknown() || highPercentile.isUnknown()) { @@ -91,26 +96,26 @@ public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHist return Estimate.unknown(); } // in the case the histogram has no values - if (totalDistinctValues.equals(Estimate.zero()) || range.getDistinctValuesCount() == 0.0) { + if (totalDistinctValues.equals(Estimate.zero()) || rangeDistinctValues == 0.0) { return Estimate.of(0.0); } // in the case only one is unknown if (((lowPercentile.isUnknown() && !highPercentile.isUnknown()) || (!lowPercentile.isUnknown() && highPercentile.isUnknown())) && - isFinite(range.length())) { - return Estimate.of(StatisticRange.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + isFinite(rangeLength)) { + return Estimate.of(INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } - if (range.length() == 0.0) { + if (rangeLength == 0.0) { return totalDistinctValues.map(distinct -> 1.0 / distinct); } - if (!isNaN(range.getDistinctValuesCount())) { - return totalDistinctValues.map(distinct -> min(1.0, range.getDistinctValuesCount() / distinct)); + if (!isNaN(rangeDistinctValues)) { + return totalDistinctValues.map(distinct -> min(1.0, rangeDistinctValues / distinct)); } - return Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + return Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } // in the case the range is a single value, this can occur if the input @@ -134,15 +139,23 @@ public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHist } return totalDistinctValues.flatMap(totalDistinct -> { - if (DoubleMath.fuzzyEquals(totalDistinct, 0.0, 1E-6)) { + if (fuzzyEquals(totalDistinct, 0.0, 1E-6)) { return Estimate.of(1.0); } - return Estimate.of(min(1.0, range.getDistinctValuesCount() / totalDistinct)); + return Estimate.of(min(1.0, rangeDistinctValues / totalDistinct)); }) - // in the case totalDistinct is NaN or 0 - .or(() -> Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR)); + // in the case totalDistinct is NaN or 0 + .or(() -> Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR)); } return lowPercentile.flatMap(lowPercent -> highPercentile.map(highPercent -> highPercent - lowPercent)); } + + private static boolean fuzzyEquals(double a, double b, double eps) + { + return Math.copySign(a - b, 1.0) <= eps + // copySign(x, 1.0) is a branch-free version of abs(x), but with different NaN semantics + || (a == b) // needed to ensure that infinities equal themselves + || (Double.isNaN(a) && Double.isNaN(b)); + } } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/UniformDistributionHistogram.java similarity index 90% rename from presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java rename to presto-spi/src/main/java/com/facebook/presto/spi/statistics/UniformDistributionHistogram.java index d06232d1fb608..dc91ef42f684f 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/UniformDistributionHistogram.java @@ -12,16 +12,13 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Verify.verify; +import static com.facebook.presto.common.Utils.checkArgument; +import static com.facebook.presto.common.Utils.toStringHelper; import static java.lang.Double.isInfinite; import static java.lang.Double.isNaN; import static java.lang.Math.max; @@ -46,7 +43,7 @@ public UniformDistributionHistogram( @JsonProperty("lowValue") double lowValue, @JsonProperty("highValue") double highValue) { - verify(isNaN(lowValue) || isNaN(highValue) || (lowValue <= highValue), "lowValue must be <= highValue"); + checkArgument(isNaN(lowValue) || isNaN(highValue) || (lowValue <= highValue), "lowValue must be <= highValue"); this.lowValue = lowValue; this.highValue = highValue; } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDisjointRangeDomainHistogram.java similarity index 81% rename from presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java rename to presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDisjointRangeDomainHistogram.java index 1cbddcc781c58..823156f2a7b75 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDisjointRangeDomainHistogram.java @@ -12,20 +12,20 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Range; import org.apache.commons.math3.distribution.NormalDistribution; import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.distribution.UniformRealDistribution; import org.testng.annotations.Test; import java.util.List; -import java.util.stream.Collectors; +import static com.facebook.presto.common.predicate.Range.greaterThanOrEqual; +import static com.facebook.presto.common.predicate.Range.lessThanOrEqual; +import static com.facebook.presto.common.predicate.Range.range; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static org.testng.Assert.assertEquals; public class TestDisjointRangeDomainHistogram @@ -39,9 +39,9 @@ public void testBasicDisjointRanges() { ConnectorHistogram source = new UniformDistributionHistogram(0, 100); ConnectorHistogram constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(0d, 25d))); + .addDisjunction(source, rangeOpen(0d, 25d)); constrained = DisjointRangeDomainHistogram - .addDisjunction(constrained, StatisticRange.fromRange(Range.open(75d, 100d))); + .addDisjunction(constrained, rangeOpen(75d, 100d)); assertEquals(constrained.inverseCumulativeProbability(0.75).getValue(), 87.5); assertEquals(constrained.inverseCumulativeProbability(0.0).getValue(), 0.0); assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 100); @@ -59,7 +59,7 @@ public void testSingleDisjointRange() // no overlap, left bound ConnectorHistogram constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(-10d, -5d))); + .addDisjunction(source, rangeOpen(-10d, -5d)); for (int i = -11; i < 12; i++) { assertEquals(constrained.cumulativeProbability(i, true).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(i, false).getValue(), 0.0, 1E-8); @@ -68,7 +68,7 @@ public void testSingleDisjointRange() assertEquals(constrained.inverseCumulativeProbability(1.0), Estimate.unknown()); // partial overlap left bound - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(-2d, 2d))); + constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(-2d, 2d))); assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(-1, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 1E-8); @@ -82,7 +82,7 @@ public void testSingleDisjointRange() assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 2d, 1E-8); //full overlap - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(3d, 4d))); + constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(3d, 4d))); assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(1, false).getValue(), 0.0, 1E-8); @@ -96,7 +96,7 @@ public void testSingleDisjointRange() assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 4d, 1E-8); //right side overlap - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(8d, 12d))); + constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(8d, 12d))); assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(5, false).getValue(), 0.0, 1E-8); @@ -114,7 +114,7 @@ public void testSingleDisjointRange() // no overlap, right bound constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(15d, 20d))); + .addDisjunction(source, rangeOpen(15d, 20d)); for (int i = 15; i < 20; i++) { assertEquals(constrained.cumulativeProbability(i, true).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(i, false).getValue(), 0.0, 1E-8); @@ -132,8 +132,8 @@ public void testMultipleDisjunction() { StandardNormalHistogram source = new StandardNormalHistogram(); RealDistribution dist = source.getDistribution(); - ConnectorHistogram constrained = disjunction(source, Range.closed(-2d, -1d)); - constrained = disjunction(constrained, Range.closed(1d, 2d)); + ConnectorHistogram constrained = disjunction(source, rangeClosed(-2d, -1d)); + constrained = disjunction(constrained, rangeClosed(1d, 2d)); double rangeLeftProb = dist.cumulativeProbability(-1) - dist.cumulativeProbability(-2); double rangeRightProb = dist.cumulativeProbability(2) - dist.cumulativeProbability(1); double sumRangeProb = rangeLeftProb + rangeRightProb; @@ -156,7 +156,7 @@ public void testNormalDistribution() // standard normal StandardNormalHistogram source = new StandardNormalHistogram(); RealDistribution dist = source.getDistribution(); - ConnectorHistogram constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(-1d, 1d))); + ConnectorHistogram constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(-1d, 1d))); assertEquals(constrained.cumulativeProbability(-1.0, true).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0.0, true).getValue(), 0.5, 1E-8); assertEquals(constrained.cumulativeProbability(1.0, true).getValue(), 1.0, 1E-8); @@ -179,16 +179,16 @@ public void testNormalDistribution() public void testAddDisjunction() { ConnectorHistogram source = new UniformDistributionHistogram(0, 100); - DisjointRangeDomainHistogram constrained = disjunction(source, Range.open(-1d, 2d)); - assertEquals(constrained.getRanges().size(), 1); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 2d)); - constrained = disjunction(constrained, Range.open(1d, 10d)); + DisjointRangeDomainHistogram constrained = disjunction(source, rangeOpen(-1d, 2d)); + assertEquals(constrained.getRanges().getOrderedRanges().size(), 1); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 0d, true, 2d, false)); + constrained = disjunction(constrained, rangeOpen(1d, 10d)); assertEquals(ranges(constrained).size(), 1); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 10d)); - constrained = disjunction(constrained, Range.closedOpen(50d, 100d)); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 0d, true, 10d, false)); + constrained = disjunction(constrained, range(DOUBLE, 50d, true, 100d, false)); assertEquals(ranges(constrained).size(), 2); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 10d)); - assertEquals(ranges(constrained).get(1), Range.closedOpen(50d, 100d)); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 0d, true, 10d, false)); + assertEquals(ranges(constrained).get(1), range(DOUBLE, 50d, true, 100d, false)); } /** @@ -198,30 +198,40 @@ public void testAddDisjunction() public void testAddConjunction() { ConnectorHistogram source = new UniformDistributionHistogram(0, 100); - DisjointRangeDomainHistogram constrained = disjunction(source, Range.open(10d, 90d)); - assertEquals(constrained.getRanges().size(), 1); - assertEquals(ranges(constrained).get(0), Range.open(10d, 90d)); - constrained = conjunction(constrained, Range.atMost(50d)); + DisjointRangeDomainHistogram constrained = disjunction(source, rangeOpen(10d, 90d)); + assertEquals(constrained.getRanges().getOrderedRanges().size(), 1); + assertEquals(ranges(constrained).get(0), rangeOpen(10d, 90d)); + constrained = conjunction(constrained, lessThanOrEqual(DOUBLE, 50d)); assertEquals(ranges(constrained).size(), 1); - assertEquals(ranges(constrained).get(0), Range.openClosed(10d, 50d)); - constrained = conjunction(constrained, Range.atLeast(25d)); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 10d, false, 50d, true)); + constrained = conjunction(constrained, greaterThanOrEqual(DOUBLE, 25d)); assertEquals(ranges(constrained).size(), 1); - assertEquals(ranges(constrained).get(0), Range.closed(25d, 50d)); + assertEquals(ranges(constrained).get(0), rangeClosed(25d, 50d)); } - private static DisjointRangeDomainHistogram disjunction(ConnectorHistogram source, Range range) + private static DisjointRangeDomainHistogram disjunction(ConnectorHistogram source, com.facebook.presto.common.predicate.Range range) { - return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addDisjunction(source, StatisticRange.fromRange(range)); + return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addDisjunction(source, range); } - private static DisjointRangeDomainHistogram conjunction(ConnectorHistogram source, Range range) + private static DisjointRangeDomainHistogram conjunction(ConnectorHistogram source, com.facebook.presto.common.predicate.Range range) { - return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addConjunction(source, StatisticRange.fromRange(range)); + return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addConjunction(source, range); } - private static List> ranges(DisjointRangeDomainHistogram hist) + private static List ranges(DisjointRangeDomainHistogram hist) { - return hist.getRanges().stream().map(StatisticRange::toRange).collect(Collectors.toList()); + return hist.getRanges().getOrderedRanges(); + } + + private static com.facebook.presto.common.predicate.Range rangeOpen(double low, double high) + { + return range(DOUBLE, low, false, high, false); + } + + private static com.facebook.presto.common.predicate.Range rangeClosed(double low, double high) + { + return range(DOUBLE, low, true, high, true); } private static class StandardNormalHistogram @@ -263,7 +273,7 @@ ConnectorHistogram createHistogram() return new DisjointRangeDomainHistogram( new UniformDistributionHistogram( distribution.getSupportLowerBound(), distribution.getSupportUpperBound())) - .addDisjunction(new StatisticRange(0.0, 100.0, 0.0)); + .addDisjunction(rangeClosed(0.0, 100.0)); } @Override diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogram.java similarity index 97% rename from presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java rename to presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogram.java index 26c68b7e5730e..341870d138bdc 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogram.java @@ -12,9 +12,8 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; import org.apache.commons.math3.distribution.RealDistribution; import org.testng.annotations.Test; diff --git a/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogramCalculator.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogramCalculator.java new file mode 100644 index 0000000000000..0632e14247b1b --- /dev/null +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogramCalculator.java @@ -0,0 +1,101 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.facebook.presto.spi.statistics; + +import com.facebook.presto.common.predicate.Range; +import org.testng.annotations.Test; + +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.spi.statistics.HistogramCalculator.calculateFilterFactor; +import static java.lang.Double.NEGATIVE_INFINITY; +import static java.lang.Double.NaN; +import static java.lang.Double.POSITIVE_INFINITY; +import static org.testng.Assert.assertEquals; + +public class TestHistogramCalculator +{ + @Test + public void testCalculateFilterFactor() + { + Range zeroToTen = range(0, 10); + Range empty = Range.range(DOUBLE, NaN, true, NaN, true); + + // Equal ranges + assertFilterFactor(Estimate.of(1.0), zeroToTen, 10, uniformHist(0, 10), 5); + assertFilterFactor(Estimate.of(1.0), zeroToTen, 10, uniformHist(0, 10), 20); + + // Some overlap + assertFilterFactor(Estimate.of(0.5), range(5, 3000), 5, uniformHist(zeroToTen), 10); + + // Single value overlap + assertFilterFactor(Estimate.of(1.0 / 10), range(3, 3), 1, uniformHist(zeroToTen), 10); + assertFilterFactor(Estimate.of(1.0 / 10), range(10, 100), 357, uniformHist(zeroToTen), 10); + + // No overlap + assertFilterFactor(Estimate.zero(), range(20, 30), 10, uniformHist(zeroToTen), 10); + + // Empty ranges + assertFilterFactor(Estimate.zero(), zeroToTen, 10, uniformHist(empty), 0); + assertFilterFactor(Estimate.zero(), empty, 0, uniformHist(zeroToTen), 10); + + // no test for (empty, empty) since any return value is correct + assertFilterFactor(Estimate.zero(), unboundedRange(), 10, uniformHist(empty), 0); + assertFilterFactor(Estimate.zero(), empty, 0, uniformHist(unboundedRange()), 10); + + // Unbounded (infinite), NDV-based + assertFilterFactor(Estimate.of(0.5), unboundedRange(), 10, uniformHist(unboundedRange()), 20); + assertFilterFactor(Estimate.of(1.0), unboundedRange(), 20, uniformHist(unboundedRange()), 10); + + // NEW TESTS (TPC-H Q2) + // unbounded ranges + assertFilterFactor(Estimate.of(.5), unboundedRange(), 0.5, uniformHist(unboundedRange()), NaN); + // unbounded ranges with limited distinct values + assertFilterFactor(Estimate.of(0.2), unboundedRange(), 1.0, + domainConstrained(unboundedRange(), uniformHist(unboundedRange())), 5.0); + } + + private static Range range(double low, double high) + { + return Range.range(DOUBLE, low, true, high, true); + } + + private static Range unboundedRange() + { + return Range.all(DOUBLE); + } + + private static void assertFilterFactor(Estimate expected, Range range, double distinctValues, ConnectorHistogram histogram, double totalDistinctValues) + { + assertEquals( + calculateFilterFactor(range, distinctValues, histogram, Estimate.estimateFromDouble(totalDistinctValues), true), + expected); + } + + private static ConnectorHistogram uniformHist(Range range) + { + return uniformHist(range.getLow().getObjectValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY), + range.getHigh().getObjectValue().map(Double.class::cast).orElse(POSITIVE_INFINITY)); + } + + private static ConnectorHistogram uniformHist(double low, double high) + { + return new UniformDistributionHistogram(low, high); + } + + private static ConnectorHistogram domainConstrained(Range range, ConnectorHistogram source) + { + return DisjointRangeDomainHistogram.addDisjunction(source, range); + } +} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestUniformHistogram.java similarity index 93% rename from presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java rename to presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestUniformHistogram.java index 395bc3f6e7518..e1d3dc0b6f162 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestUniformHistogram.java @@ -12,11 +12,8 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import com.google.common.base.VerifyException; import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.distribution.UniformRealDistribution; import org.testng.annotations.Test; @@ -48,7 +45,7 @@ RealDistribution getDistribution() @Test public void testInvalidConstruction() { - assertThrows(VerifyException.class, () -> new UniformDistributionHistogram(2.0, 1.0)); + assertThrows(IllegalArgumentException.class, () -> new UniformDistributionHistogram(2.0, 1.0)); } @Test