diff --git a/presto-common/src/main/java/com/facebook/presto/common/Utils.java b/presto-common/src/main/java/com/facebook/presto/common/Utils.java index 812c100effbf..2015411c6ce1 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/Utils.java +++ b/presto-common/src/main/java/com/facebook/presto/common/Utils.java @@ -18,8 +18,15 @@ import com.facebook.presto.common.predicate.Primitives; import com.facebook.presto.common.type.Type; +import javax.annotation.Nullable; + +import java.util.Arrays; +import java.util.function.Supplier; + import static com.facebook.presto.common.type.TypeUtils.readNativeValue; import static com.facebook.presto.common.type.TypeUtils.writeNativeValue; +import static java.lang.String.format; +import static java.util.Objects.requireNonNull; public final class Utils { @@ -30,7 +37,7 @@ private Utils() public static Block nativeValueToBlock(Type type, Object object) { if (object != null && !Primitives.wrap(type.getJavaType()).isInstance(object)) { - throw new IllegalArgumentException(String.format("Object '%s' does not match type %s", object, type.getJavaType())); + throw new IllegalArgumentException(format("Object '%s' does not match type %s", object, type.getJavaType())); } BlockBuilder blockBuilder = type.createBlockBuilder(null, 1); writeNativeValue(type, blockBuilder, object); @@ -49,10 +56,201 @@ public static void checkArgument(boolean expression) } } - public static void checkArgument(boolean expression, String errorMessage) + public static void checkArgument(boolean expression, String message, Object... args) { if (!expression) { - throw new IllegalArgumentException(errorMessage); + throw new IllegalArgumentException(format(message, args)); + } + } + + /** + * Returns a supplier which caches the instance retrieved during the first call to {@code get()} + * and returns that value on subsequent calls to {@code get()}. 
+ */ + public static Supplier memoizedSupplier(Supplier delegate) + { + if (delegate instanceof MemoizingSupplier) { + return delegate; + } + return new MemoizingSupplier<>(delegate); + } + + /** + * Vendored from Guava + */ + static class MemoizingSupplier + implements Supplier + { + volatile Supplier delegate; + volatile boolean initialized; + // "value" does not need to be volatile; visibility piggy-backs + // on volatile read of "initialized". + @Nullable T value; + + MemoizingSupplier(Supplier delegate) + { + this.delegate = requireNonNull(delegate); + } + + @Override + public T get() + { + // A 2-field variant of Double Checked Locking. + if (!initialized) { + synchronized (this) { + if (!initialized) { + T t = delegate.get(); + value = t; + initialized = true; + // Release the delegate to GC. + delegate = null; + return t; + } + } + } + return value; + } + + @Override + public String toString() + { + Supplier delegate = this.delegate; + return "Suppliers.memoize(" + + (delegate == null ? "" : delegate) + + ")"; + } + } + + public static ToStringHelper toStringHelper(Object self) + { + return new ToStringHelper(self.getClass().getSimpleName()); + } + + public static ToStringHelper toStringHelper(Class self) + { + return new ToStringHelper(self.getSimpleName()); + } + + public static ToStringHelper toStringHelper(String className) + { + return new ToStringHelper(className); + } + + /** + * Vendored class from Guava. 
+ */ + public static final class ToStringHelper + { + private final String className; + private final ValueHolder holderHead = new ValueHolder(); + private ValueHolder holderTail = holderHead; + private boolean omitNullValues; + + private ToStringHelper(String className) + { + this.className = requireNonNull(className); + } + + public ToStringHelper omitNullValues() + { + omitNullValues = true; + return this; + } + + public ToStringHelper add(String name, @Nullable Object value) + { + return addHolder(name, value); + } + + public ToStringHelper add(String name, boolean value) + { + return addHolder(name, String.valueOf(value)); + } + + public ToStringHelper add(String name, char value) + { + return addHolder(name, String.valueOf(value)); + } + + public ToStringHelper add(String name, double value) + { + return addHolder(name, String.valueOf(value)); + } + + public ToStringHelper add(String name, float value) + { + return addHolder(name, String.valueOf(value)); + } + + public ToStringHelper add(String name, int value) + { + return addHolder(name, String.valueOf(value)); + } + + public ToStringHelper add(String name, long value) + { + return addHolder(name, String.valueOf(value)); + } + + @Override + public String toString() + { + // create a copy to keep it consistent in case value changes + boolean omitNullValuesSnapshot = omitNullValues; + String nextSeparator = ""; + StringBuilder builder = new StringBuilder(32).append(className).append('{'); + for (ValueHolder valueHolder = holderHead.next; + valueHolder != null; + valueHolder = valueHolder.next) { + Object value = valueHolder.value; + if (!omitNullValuesSnapshot || value != null) { + builder.append(nextSeparator); + nextSeparator = ", "; + + if (valueHolder.name != null) { + builder.append(valueHolder.name).append('='); + } + if (value != null && value.getClass().isArray()) { + Object[] objectArray = {value}; + String arrayString = Arrays.deepToString(objectArray); + builder.append(arrayString, 1, 
arrayString.length() - 1); + } + else { + builder.append(value); + } + } + } + return builder.append('}').toString(); + } + + private ValueHolder addHolder() + { + ValueHolder valueHolder = new ValueHolder(); + holderTail.next = valueHolder; + holderTail = valueHolder; + return valueHolder; + } + + private ToStringHelper addHolder(@Nullable Object value) + { + ValueHolder valueHolder = addHolder(); + valueHolder.value = value; + return this; + } + + private ToStringHelper addHolder(String name, @Nullable Object value) + { + ValueHolder valueHolder = addHolder(); + valueHolder.value = value; + valueHolder.name = requireNonNull(name); + return this; + } + + private static final class ValueHolder + { + @Nullable String name; + @Nullable Object value; + @Nullable ValueHolder next; } } } diff --git a/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java b/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java index f20a87065bcf..76a58d147a28 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java +++ b/presto-common/src/main/java/com/facebook/presto/common/predicate/Marker.java @@ -129,6 +129,11 @@ public Object getValue() return Utils.blockToNativeValue(type, valueBlock.get()); } + public Optional getObjectValue() + { + return valueBlock.map(block -> Utils.blockToNativeValue(type, block)); + } + public Object getPrintableValue(SqlFunctionProperties properties) { if (!valueBlock.isPresent()) { diff --git a/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java b/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java index d00a3c77df0e..501996cc6aa8 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java +++ b/presto-common/src/main/java/com/facebook/presto/common/predicate/Range.java @@ -247,6 +247,16 @@ public boolean equals(Object obj) Objects.equals(this.high, other.high); } + @Override + public String toString() + 
{ + return (low.getBound() == Marker.Bound.EXACTLY ? "[" : "(") + + low.getObjectValue().orElse(Double.NEGATIVE_INFINITY) + + ".." + + high.getObjectValue().orElse(Double.POSITIVE_INFINITY) + + (high.getBound() == Marker.Bound.EXACTLY ? "]" : ")"); + } + private void appendQuotedValue(StringBuilder buffer, Marker marker, SqlFunctionProperties properties) { buffer.append('"'); diff --git a/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java b/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java index 5f1988be005d..4af54a8e2a68 100644 --- a/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java +++ b/presto-common/src/main/java/com/facebook/presto/common/predicate/SortedRangeSet.java @@ -168,6 +168,28 @@ public Object getSingleValue() return lowIndexedRanges.values().iterator().next().getSingleValue(); } + /** + * Build a new {@link SortedRangeSet} that contains ranges which lie within the argument range + * + * @param span the range which the new set should span + * @return a new range set + */ + public SortedRangeSet subRangeSet(Range span) + { + Builder builder = new Builder(type); + + for (Range range : getOrderedRanges()) { + if (span.contains(range)) { + builder.add(range); + } + else if (span.overlaps(range)) { + builder.add(range.intersect(span)); + } + } + + return builder.build(); + } + @Override public boolean containsValue(Object value) { diff --git a/presto-common/src/test/java/com/facebook/presto/common/TestToStringHelper.java b/presto-common/src/test/java/com/facebook/presto/common/TestToStringHelper.java new file mode 100644 index 000000000000..05a675520783 --- /dev/null +++ b/presto-common/src/test/java/com/facebook/presto/common/TestToStringHelper.java @@ -0,0 +1,386 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.common; + +import com.facebook.presto.common.Utils.ToStringHelper; +import com.google.common.collect.ImmutableMap; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Map; + +import static com.facebook.presto.common.Utils.toStringHelper; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertThrows; +import static org.testng.Assert.assertTrue; + +public class TestToStringHelper +{ + @Test + public void testConstructorInstance() + { + String toTest = toStringHelper(this).toString(); + assertEquals("TestToStringHelper{}", toTest); + } + + @Test + public void testConstructorLenientInstance() + { + String toTest = toStringHelper(this).toString(); + assertTrue(toTest.matches(".*\\{\\}"), toTest); + } + + @Test + public void testConstructorInnerClass() + { + String toTest = toStringHelper(new TestClass()).toString(); + assertEquals("TestClass{}", toTest); + } + + @Test + public void testConstructorLenientInnerClass() + { + String toTest = toStringHelper(new TestClass()).toString(); + assertTrue(toTest.matches(".*\\{\\}"), toTest); + } + + @Test + public void testConstructorAnonymousClass() + { + String toTest = toStringHelper(new Object() {}).toString(); + assertEquals("{}", toTest); + } + + @Test + public void testConstructorLenientAnonymousClass() + { + String toTest = toStringHelper(new Object() {}).toString(); + assertTrue(toTest.matches(".*\\{\\}"), toTest); + } + + @Test + public void testConstructorClassObject() + { + String toTest = 
toStringHelper(TestClass.class).toString(); + assertEquals("TestClass{}", toTest); + } + + @Test + public void testConstructorLenientClassObject() + { + String toTest = toStringHelper(TestClass.class).toString(); + assertTrue(toTest.matches(".*\\{\\}"), toTest); + } + + @Test + public void testConstructorStringObject() + { + String toTest = toStringHelper("FooBar").toString(); + assertEquals("FooBar{}", toTest); + } + + @Test + public void testToStringHelperLocalInnerClass() + { + // Local inner classes have names ending like "Outer.$1Inner" + class LocalInnerClass {} + String toTest = toStringHelper(new LocalInnerClass()).toString(); + assertEquals("LocalInnerClass{}", toTest); + } + + @Test + public void testToStringHelperLenientLocalInnerClass() + { + class LocalInnerClass {} + String toTest = toStringHelper(new LocalInnerClass()).toString(); + assertTrue(toTest.matches(".*\\{\\}"), toTest); + } + + @Test + public void testToStringHelperLocalInnerNestedClass() + { + class LocalInnerClass + { + class LocalInnerNestedClass {} + } + String toTest = + toStringHelper(new LocalInnerClass().new LocalInnerNestedClass()).toString(); + assertEquals("LocalInnerNestedClass{}", toTest); + } + + @Test + public void testToStringHelperLenientLocalInnerNestedClass() + { + class LocalInnerClass + { + class LocalInnerNestedClass {} + } + String toTest = + toStringHelper(new LocalInnerClass().new LocalInnerNestedClass()).toString(); + assertTrue(toTest.matches(".*\\{\\}"), toTest); + } + + @Test + public void testToStringHelperMoreThanNineAnonymousClasses() + { + // The nth anonymous class has a name ending like "Outer.$n" + Object unused1 = new Object() {}; + Object unused2 = new Object() {}; + Object unused3 = new Object() {}; + Object unused4 = new Object() {}; + Object unused5 = new Object() {}; + Object unused6 = new Object() {}; + Object unused7 = new Object() {}; + Object unused8 = new Object() {}; + Object unused9 = new Object() {}; + Object o10 = new Object() {}; + String 
toTest = toStringHelper(o10).toString(); + assertEquals("{}", toTest); + } + + @Test + public void testToStringHelperLenientMoreThanNineAnonymousClasses() + { + // The nth anonymous class has a name ending like "Outer.$n" + Object unused1 = new Object() {}; + Object unused2 = new Object() {}; + Object unused3 = new Object() {}; + Object unused4 = new Object() {}; + Object unused5 = new Object() {}; + Object unused6 = new Object() {}; + Object unused7 = new Object() {}; + Object unused8 = new Object() {}; + Object unused9 = new Object() {}; + Object o10 = new Object() {}; + String toTest = toStringHelper(o10).toString(); + assertTrue(toTest.matches(".*\\{\\}"), toTest); + } + + // all remaining test are on an inner class with various fields + @Test + public void testToStringOneField() + { + String toTest = toStringHelper(new TestClass()).add("field1", "Hello").toString(); + assertEquals("TestClass{field1=Hello}", toTest); + } + + @Test + public void testToStringOneIntegerField() + { + String toTest = + toStringHelper(new TestClass()).add("field1", Integer.valueOf(42)).toString(); + assertEquals("TestClass{field1=42}", toTest); + } + + @Test + public void testToStringNullInteger() + { + String toTest = + toStringHelper(new TestClass()).add("field1", (Integer) null).toString(); + assertEquals("TestClass{field1=null}", toTest); + } + + @Test + public void testToStringLenientOneField() + { + String toTest = toStringHelper(new TestClass()).add("field1", "Hello").toString(); + assertTrue(toTest.matches(".*\\{field1\\=Hello\\}"), toTest); + } + + @Test + public void testToStringLenientOneIntegerField() + { + String toTest = + toStringHelper(new TestClass()).add("field1", Integer.valueOf(42)).toString(); + assertTrue(toTest.matches(".*\\{field1\\=42\\}"), toTest); + } + + @Test + public void testToStringLenientNullInteger() + { + String toTest = + toStringHelper(new TestClass()).add("field1", (Integer) null).toString(); + assertTrue(toTest.matches(".*\\{field1\\=null\\}"), 
toTest); + } + + @Test + public void testToStringComplexFields() + { + Map map = + ImmutableMap.builder().put("abc", 1).put("def", 2).put("ghi", 3).build(); + String toTest = + toStringHelper(new TestClass()) + .add("field1", "This is string.") + .add("field2", Arrays.asList("abc", "def", "ghi")) + .add("field3", map) + .toString(); + final String expected = + "TestClass{" + + "field1=This is string., field2=[abc, def, ghi], field3={abc=1, def=2, ghi=3}}"; + + assertEquals(expected, toTest); + } + + @Test + public void testToStringLenientComplexFields() + { + Map map = + ImmutableMap.builder().put("abc", 1).put("def", 2).put("ghi", 3).build(); + String toTest = + toStringHelper(new TestClass()) + .add("field1", "This is string.") + .add("field2", Arrays.asList("abc", "def", "ghi")) + .add("field3", map) + .toString(); + final String expectedRegex = + ".*\\{" + + "field1\\=This is string\\., " + + "field2\\=\\[abc, def, ghi\\], " + + "field3=\\{abc\\=1, def\\=2, ghi\\=3\\}\\}"; + + assertTrue(toTest.matches(expectedRegex), toTest); + } + + @Test + public void testToStringAddWithNullName() + { + ToStringHelper helper = toStringHelper(new TestClass()); + assertThrows(NullPointerException.class, () -> helper.add(null, "Hello")); + } + + @Test + public void testToStringAddWithNullValue() + { + final String result = toStringHelper(new TestClass()).add("Hello", null).toString(); + + assertEquals("TestClass{Hello=null}", result); + } + + @Test + public void testToStringLenientAddWithNullValue() + { + final String result = toStringHelper(new TestClass()).add("Hello", null).toString(); + assertTrue(result.matches(".*\\{Hello\\=null\\}"), result); + } + + @Test + public void testToStringOmitNullValuesOneField() + { + String toTest = + toStringHelper(new TestClass()).omitNullValues().add("field1", null).toString(); + assertEquals("TestClass{}", toTest); + } + + @Test + public void testToStringOmitNullValuesManyFieldsFirstNull() + { + String toTest = + toStringHelper(new 
TestClass()) + .omitNullValues() + .add("field1", null) + .add("field2", "Googley") + .add("field3", "World") + .toString(); + assertEquals("TestClass{field2=Googley, field3=World}", toTest); + } + + @Test + public void testToStringOmitNullValuesManyFieldsOmitAfterNull() + { + String toTest = + toStringHelper(new TestClass()) + .add("field1", null) + .add("field2", "Googley") + .add("field3", "World") + .omitNullValues() + .toString(); + assertEquals("TestClass{field2=Googley, field3=World}", toTest); + } + + @Test + public void testToStringOmitNullValuesManyFieldsLastNull() + { + String toTest = + toStringHelper(new TestClass()) + .omitNullValues() + .add("field1", "Hello") + .add("field2", "Googley") + .add("field3", null) + .toString(); + assertEquals("TestClass{field1=Hello, field2=Googley}", toTest); + } + + @Test + public void testToStringOmitNullValuesDifferentOrder() + { + String expected = "TestClass{field1=Hello, field2=Googley, field3=World}"; + String toTest1 = + toStringHelper(new TestClass()) + .omitNullValues() + .add("field1", "Hello") + .add("field2", "Googley") + .add("field3", "World") + .toString(); + String toTest2 = + toStringHelper(new TestClass()) + .add("field1", "Hello") + .add("field2", "Googley") + .omitNullValues() + .add("field3", "World") + .toString(); + assertEquals(expected, toTest1); + assertEquals(expected, toTest2); + } + + @Test + public void testToStringOmitNullValuesCanBeCalledManyTimes() + { + String toTest = toStringHelper(new TestClass()) + .omitNullValues() + .omitNullValues() + .add("field1", "Hello") + .omitNullValues() + .add("field2", "Googley") + .omitNullValues() + .add("field3", "World") + .toString(); + assertEquals("TestClass{field1=Hello, field2=Googley, field3=World}", toTest); + } + + @Test + public void testToStringHelperWithArrays() + { + String[] strings = {"hello", "world"}; + int[] ints = {2, 42}; + Object[] objects = {"obj"}; + String[] arrayWithNull = {null}; + Object[] empty = {}; + String toTest = + 
toStringHelper("TSH") + .add("strings", strings) + .add("ints", ints) + .add("objects", objects) + .add("arrayWithNull", arrayWithNull) + .add("empty", empty) + .toString(); + assertEquals( + "TSH{strings=[hello, world], ints=[2, 42], objects=[obj], arrayWithNull=[null], empty=[]}", + toTest); + } + + /** + * Test class for testing formatting of inner classes. + */ + private static class TestClass {} +} diff --git a/presto-common/src/test/java/com/facebook/presto/common/TestUtils.java b/presto-common/src/test/java/com/facebook/presto/common/TestUtils.java new file mode 100644 index 000000000000..646a44fc919c --- /dev/null +++ b/presto-common/src/test/java/com/facebook/presto/common/TestUtils.java @@ -0,0 +1,148 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.facebook.presto.common; + +import org.testng.annotations.Test; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Function; +import java.util.function.Supplier; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertSame; +import static org.testng.Assert.fail; + +public class TestUtils +{ + @Test + public void testCheckArgumentFailWithMessage() + { + try { + Utils.checkArgument(false, "test %s", "test"); + fail(); + } + catch (IllegalArgumentException e) { + assertEquals(e.getMessage(), "test test"); + } + } + + @Test + public void testCheckArgumentPassWithMessage() + { + try { + Utils.checkArgument(true, "test %s", "test"); + } + catch (IllegalArgumentException e) { + fail(); + } + } + + @Test + public void testMemoizedSupplierThreadSafe() + throws Throwable + { + Function, Supplier> memoizer = + supplier -> Utils.memoizedSupplier(supplier); + testSupplierThreadSafe(memoizer); + } + + /** + * Vendored from Guava + */ + private void testSupplierThreadSafe(Function, Supplier> memoizer) + throws Throwable + { + final AtomicInteger count = new AtomicInteger(0); + final AtomicReference thrown = new AtomicReference<>(null); + final int numThreads = 3; + final Thread[] threads = new Thread[numThreads]; + final long timeout = TimeUnit.SECONDS.toNanos(60); + + final Supplier supplier = + new Supplier() + { + boolean isWaiting(Thread thread) + { + switch (thread.getState()) { + case BLOCKED: + case WAITING: + case TIMED_WAITING: + return true; + default: + return false; + } + } + + int waitingThreads() + { + int waitingThreads = 0; + for (Thread thread : threads) { + if (isWaiting(thread)) { + waitingThreads++; + } + } + return waitingThreads; + } + + @Override + @SuppressWarnings("ThreadPriorityCheck") // doing our best to test for races + public Boolean 
get() + { + // Check that this method is called exactly once, by the first + // thread to synchronize. + long t0 = System.nanoTime(); + while (waitingThreads() != numThreads - 1) { + if (System.nanoTime() - t0 > timeout) { + thrown.set( + new TimeoutException( + "timed out waiting for other threads to block" + + " synchronizing on supplier")); + break; + } + Thread.yield(); + } + count.getAndIncrement(); + return Boolean.TRUE; + } + }; + + final Supplier memoizedSupplier = memoizer.apply(supplier); + + for (int i = 0; i < numThreads; i++) { + threads[i] = + new Thread() + { + @Override + public void run() + { + assertSame(Boolean.TRUE, memoizedSupplier.get()); + } + }; + } + for (Thread t : threads) { + t.start(); + } + for (Thread t : threads) { + t.join(); + } + + if (thrown.get() != null) { + throw thrown.get(); + } + assertEquals(1, count.get()); + } +} diff --git a/presto-common/src/test/java/com/facebook/presto/common/predicate/TestMarker.java b/presto-common/src/test/java/com/facebook/presto/common/predicate/TestMarker.java index 362fc8f29c97..1eb4085e7222 100644 --- a/presto-common/src/test/java/com/facebook/presto/common/predicate/TestMarker.java +++ b/presto-common/src/test/java/com/facebook/presto/common/predicate/TestMarker.java @@ -191,6 +191,16 @@ public void testCanonicalize() assertDifferentMarker(Marker.upperUnbounded(BIGINT), Marker.lowerUnbounded(BIGINT), true); } + @Test + public void testGetValue() + { + assertTrue(Marker.exactly(BIGINT, 1L).getObjectValue().isPresent()); + assertTrue(Marker.above(BIGINT, 1L).getObjectValue().isPresent()); + assertTrue(Marker.below(BIGINT, 1L).getObjectValue().isPresent()); + assertFalse(Marker.upperUnbounded(BIGINT).getObjectValue().isPresent()); + assertFalse(Marker.lowerUnbounded(BIGINT).getObjectValue().isPresent()); + } + private void assertSameMarker(Marker marker1, Marker marker2, boolean removeConstants) throws Exception { diff --git 
a/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java b/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java index f754681359a5..087073c432bd 100644 --- a/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java +++ b/presto-common/src/test/java/com/facebook/presto/common/predicate/TestSortedRangeSet.java @@ -26,6 +26,9 @@ import com.google.common.collect.Iterables; import org.testng.annotations.Test; +import java.util.Arrays; +import java.util.stream.Collectors; + import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.common.type.BooleanType.BOOLEAN; import static com.facebook.presto.common.type.DoubleType.DOUBLE; @@ -500,6 +503,65 @@ public void testCanonicalize() assertDifferentSet(SortedRangeSet.all(BIGINT), SortedRangeSet.all(BOOLEAN), true); } + @Test + public void testSubRangeSet() + { + // test subrange no overlap below and above + assertEquals(SortedRangeSet.of(Range.lessThan(BIGINT, 10L)) + .subRangeSet(Range.greaterThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + assertEquals(SortedRangeSet.of(Range.greaterThan(BIGINT, 10L)) + .subRangeSet(Range.lessThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + assertEquals(SortedRangeSet.of(Range.greaterThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.lessThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + assertEquals(SortedRangeSet.of(Range.lessThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.greaterThan(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 0); + + // test with equal bounds + assertEquals(SortedRangeSet.of(Range.lessThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.greaterThanOrEqual(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 1); + assertEquals(SortedRangeSet.of(Range.greaterThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.lessThanOrEqual(BIGINT, 10L)) + .getOrderedRanges() + .size(), + 1); + 
assertEquals(SortedRangeSet.of(Range.lessThanOrEqual(BIGINT, 10L)) + .subRangeSet(Range.greaterThanOrEqual(BIGINT, 10L)) + .getOrderedRanges().get(0), Range.range(BIGINT, 10L, true, 10L, true)); + // two ranges + assertEquals(SortedRangeSet.of(Range.lessThan(BIGINT, -10L), Range.greaterThan(BIGINT, 10L)) + .subRangeSet(Range.range(BIGINT, -20L, true, 20L, true)).getOrderedRanges(), + Arrays.stream(new Range[] { + Range.range(BIGINT, -20L, true, -10L, false), + Range.range(BIGINT, 10L, false, 20L, true)}) + .collect(Collectors.toList())); + // range entirely contained + assertEquals(SortedRangeSet.of( + Range.lessThan(BIGINT, -10L), + Range.greaterThan(BIGINT, 10L), + Range.range(BIGINT, -5L, true, 5L, true)) + .subRangeSet(Range.range(BIGINT, -20L, true, 20L, true)).getOrderedRanges(), + Arrays.stream(new Range[] { + Range.range(BIGINT, -20L, true, -10L, false), + Range.range(BIGINT, -5L, true, 5L, true), + Range.range(BIGINT, 10L, false, 20L, true)}) + .collect(Collectors.toList())); + } + private void assertSameSet(SortedRangeSet set1, SortedRangeSet set2, boolean removeSafeConstants) throws Exception { diff --git a/presto-docs/src/main/sphinx/admin/properties.rst b/presto-docs/src/main/sphinx/admin/properties.rst index 0435d11c2966..d38e8057c591 100644 --- a/presto-docs/src/main/sphinx/admin/properties.rst +++ b/presto-docs/src/main/sphinx/admin/properties.rst @@ -863,6 +863,18 @@ on a per-query basis using the ``treat-low-confidence-zero-estimation-as-unknown Enable retry for failed queries who can potentially be helped by HBO. This can also be specified on a per-query basis using the ``retry-query-with-history-based-optimization`` session property. +``optimizer.use-histograms`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +* **Type:** ``boolean`` +* **Default Value:** ``false`` + +Enables the optimizer to use histograms when available to perform cost estimate calculations +during query optimization. 
When set to ``false``, this parameter does not prevent histograms +from being collected by ``ANALYZE``, but prevents them from being used during query +optimization. This behavior can be controlled on a per-query basis using the +``optimizer_use_histograms`` session property. + Planner Properties ------------------ diff --git a/presto-docs/src/main/sphinx/optimizer/statistics.rst b/presto-docs/src/main/sphinx/optimizer/statistics.rst index a0c562abeb4a..eeb763575c82 100644 --- a/presto-docs/src/main/sphinx/optimizer/statistics.rst +++ b/presto-docs/src/main/sphinx/optimizer/statistics.rst @@ -6,8 +6,9 @@ Presto supports statistics based optimizations for queries. For a query to take advantage of these optimizations, Presto must have statistical information for the tables in that query. -Table statistics are provided to the query planner by connectors. Currently, the -only connector that supports statistics is the :doc:`/connector/hive`. +Table statistics are provided to the query planner by connectors. Implementing +support for table statistics is optional. The decision is left to the authors +of the connector. 
Table Layouts ------------- @@ -30,23 +31,23 @@ Available Statistics The following statistics are available in Presto: - * For a table: +* For a table: - * **row count**: the total number of rows in the table layout + * **row count**: the total number of rows in the table layout - * For each column in a table: +* For each column in a table: - * **data size**: the size of the data that needs to be read - * **nulls fraction**: the fraction of null values - * **distinct value count**: the number of distinct values - * **low value**: the smallest value in the column - * **high value**: the largest value in the column + * **data size**: the size of the data that needs to be read + * **nulls fraction**: the fraction of null values + * **distinct value count**: the number of distinct values + * **low value**: the smallest value in the column + * **high value**: the largest value in the column + * **histogram**: A connector-dependent histogram data structure. The set of statistics available for a particular query depends on the connector being used and can also vary by table or even by table layout. For example, the Hive connector does not currently provide statistics on data size. -Table statistics can be displayed via the Presto SQL interface using the -:doc:`/sql/show-stats` command. For the Hive connector, refer to the -:ref:`Hive connector ` documentation to learn how to update table -statistics. +Table statistics can be fetched using the :doc:`/sql/show-stats` query. +For the Hive connector, refer to the :ref:`Hive connector ` +documentation to learn how to update table statistics. 
diff --git a/presto-docs/src/main/sphinx/sql/show-stats.rst b/presto-docs/src/main/sphinx/sql/show-stats.rst index 51dae9df5dac..79b5ec48ffa6 100644 --- a/presto-docs/src/main/sphinx/sql/show-stats.rst +++ b/presto-docs/src/main/sphinx/sql/show-stats.rst @@ -60,3 +60,6 @@ The following table lists the returned columns and what statistics they represen - The highest value found in this column - ``NULL`` in the table summary row. Available for columns of DATE, integer, floating-point, and fixed-precision data types. + * - ``histogram`` + - The histogram for this column + - A summary of the underlying histogram is displayed in a human-readable format. ``NULL`` in the table summary row. diff --git a/presto-hive-metastore/src/main/java/com/facebook/presto/hive/HiveStatisticsUtil.java b/presto-hive-metastore/src/main/java/com/facebook/presto/hive/HiveStatisticsUtil.java index f05c524f0012..b138a899c94c 100644 --- a/presto-hive-metastore/src/main/java/com/facebook/presto/hive/HiveStatisticsUtil.java +++ b/presto-hive-metastore/src/main/java/com/facebook/presto/hive/HiveStatisticsUtil.java @@ -28,11 +28,13 @@ import java.util.Map; import java.util.Optional; import java.util.OptionalLong; +import java.util.Set; import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.hive.metastore.Statistics.fromComputedStatistics; import static com.facebook.presto.spi.statistics.TableStatisticType.ROW_COUNT; import static com.google.common.base.Verify.verify; +import static com.google.common.collect.ImmutableMap.toImmutableMap; public final class HiveStatisticsUtil { @@ -61,9 +63,14 @@ public static PartitionStatistics createPartitionStatistics( ConnectorSession session, Map columnTypes, ComputedStatistics computedStatistics, + Set supportedColumnStatistics, DateTimeZone timeZone) { - Map computedColumnStatistics = computedStatistics.getColumnStatistics(); + Map computedColumnStatistics = computedStatistics.getColumnStatistics() + .entrySet() + 
.stream() + .filter((entry) -> supportedColumnStatistics.contains(entry.getKey())) + .collect(toImmutableMap(Map.Entry::getKey, Map.Entry::getValue)); Block rowCountBlock = Optional.ofNullable(computedStatistics.getTableStatistics().get(ROW_COUNT)) .orElseThrow(() -> new VerifyException("rowCount not present")); @@ -73,6 +80,15 @@ public static PartitionStatistics createPartitionStatistics( return createPartitionStatistics(session, rowCountOnlyBasicStatistics, columnTypes, computedColumnStatistics, timeZone); } + public static PartitionStatistics createPartitionStatistics( + ConnectorSession session, + Map columnTypes, + ComputedStatistics computedStatistics, + DateTimeZone timeZone) + { + return createPartitionStatistics(session, columnTypes, computedStatistics, computedStatistics.getColumnStatistics().keySet(), timeZone); + } + public static Map getColumnStatistics(Map, ComputedStatistics> statistics, List partitionValues) { return Optional.ofNullable(statistics.get(partitionValues)) @@ -81,10 +97,11 @@ public static Map getColumnStatistics(Mappresto-cache compile - com.facebook.presto presto-main test - - com.facebook.presto - presto-parser - test + com.facebook.presto + presto-parser + test - com.facebook.presto presto-analyzer @@ -599,7 +596,7 @@ org.apache.iceberg iceberg-core - 1.5.0 + ${dep.iceberg.version} tests test @@ -629,6 +626,10 @@ + + org.apache.commons + commons-math3 + diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java index c920f22e6f6b..a7fed4261a9b 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergAbstractMetadata.java @@ -610,7 +610,7 @@ public TableStatisticsMetadata getStatisticsCollectionMetadata(ConnectorSession MetricsConfig metricsConfig = MetricsConfig.forTable(table); Set 
columnStatistics = tableMetadata.getColumns().stream() .filter(column -> !column.isHidden() && metricsConfig.columnMode(column.getName()) != None.get()) - .flatMap(meta -> getSupportedColumnStatistics(meta.getName(), meta.getType()).stream()) + .flatMap(meta -> getSupportedColumnStatistics(session, meta.getName(), meta.getType()).stream()) .collect(toImmutableSet()); Set tableStatistics = ImmutableSet.of(ROW_COUNT); diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java index fe11f4b2fefb..42328ddb4e7f 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergConfig.java @@ -24,6 +24,7 @@ import javax.validation.constraints.DecimalMax; import javax.validation.constraints.DecimalMin; +import javax.validation.constraints.Max; import javax.validation.constraints.Min; import javax.validation.constraints.NotNull; @@ -62,6 +63,7 @@ public class IcebergConfig private int metadataPreviousVersionsMax = METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT; private boolean metadataDeleteAfterCommit = METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT; private int metricsMaxInferredColumn = METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT; + private int statisticsKllSketchKParameter = 1024; private EnumSet hiveStatisticsMergeFlags = EnumSet.noneOf(ColumnStatisticType.class); private String fileIOImpl = HadoopFileIO.class.getName(); @@ -412,4 +414,19 @@ public IcebergConfig setMaxStatisticsFileCacheSize(DataSize maxStatisticsFileCac this.maxStatisticsFileCacheSize = maxStatisticsFileCacheSize; return this; } + + public int getStatisticsKllSketchKParameter() + { + return this.statisticsKllSketchKParameter; + } + + @Config("iceberg.statistics-kll-sketch-k-parameter") + @Min(8) + @Max(65535) + @ConfigDescription("K parameter for KLL sketches when generating histogram statistics") + public 
IcebergConfig setStatisticsKllSketchKParameter(int kllSketchKParameter) + { + this.statisticsKllSketchKParameter = kllSketchKParameter; + return this; + } } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java index ea610a503708..1f8680635c0f 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergHiveMetadata.java @@ -471,10 +471,23 @@ public IcebergTableHandle getTableHandleForStatisticsCollection(ConnectorSession @Override public TableStatisticsMetadata getStatisticsCollectionMetadata(ConnectorSession session, ConnectorTableMetadata tableMetadata) { - org.apache.iceberg.Table table = getIcebergTable(session, tableMetadata.getTable()); + org.apache.iceberg.Table icebergTable = getIcebergTable(session, tableMetadata.getTable()); + Set hiveColumnStatistics = getHiveSupportedColumnStatistics(session, icebergTable, tableMetadata); + Set supportedStatistics = ImmutableSet.builder() + .addAll(hiveColumnStatistics) + // iceberg table-supported statistics + .addAll(super.getStatisticsCollectionMetadata(session, tableMetadata).getColumnStatistics()) + .build(); + Set tableStatistics = ImmutableSet.of(ROW_COUNT); + return new TableStatisticsMetadata(supportedStatistics, tableStatistics, emptyList()); + } + + private Set getHiveSupportedColumnStatistics(ConnectorSession session, org.apache.iceberg.Table table, ConnectorTableMetadata tableMetadata) + { MetricsConfig metricsConfig = MetricsConfig.forTable(table); - Set columnStatistics = tableMetadata.getColumns().stream() - .filter(column -> !column.isHidden() && metricsConfig.columnMode(column.getName()) != None.get()) + return tableMetadata.getColumns().stream() + .filter(column -> !column.isHidden()) + .filter(column -> metricsConfig.columnMode(column.getName()) != None.get()) .flatMap(meta 
-> { try { return metastore.getSupportedColumnStatistics(getMetastoreContext(session), meta.getType()) @@ -487,9 +500,6 @@ public TableStatisticsMetadata getStatisticsCollectionMetadata(ConnectorSession } }) .collect(toImmutableSet()); - - Set tableStatistics = ImmutableSet.of(ROW_COUNT); - return new TableStatisticsMetadata(columnStatistics, tableStatistics, emptyList()); } @Override @@ -518,11 +528,31 @@ public void finishStatisticsCollection(ConnectorSession session, ConnectorTableH Map, ComputedStatistics> computedStatisticsMap = createComputedStatisticsToPartitionMap(computedStatistics, partitionColumnNames, columnTypes); // commit analyze to unpartitioned table - PartitionStatistics tableStatistics = createPartitionStatistics(session, columnTypes, computedStatisticsMap.get(ImmutableList.of()), timeZone); + ConnectorTableMetadata metadata = getTableMetadata(session, tableHandle); + org.apache.iceberg.Table icebergTable = getIcebergTable(session, icebergTableHandle.getSchemaTableName()); + Set hiveSupportedStatistics = getHiveSupportedColumnStatistics(session, icebergTable, metadata); + PartitionStatistics tableStatistics = createPartitionStatistics( + session, + columnTypes, + computedStatisticsMap.get(ImmutableList.of()), + hiveSupportedStatistics, + timeZone); metastore.updateTableStatistics(metastoreContext, table.getDatabaseName(), table.getTableName(), oldStats -> updatePartitionStatistics(oldStats, tableStatistics)); + + Set icebergSupportedStatistics = super.getStatisticsCollectionMetadata(session, metadata).getColumnStatistics(); + Collection icebergComputedStatistics = computedStatistics.stream().map(stat -> { + ComputedStatistics.Builder builder = ComputedStatistics.builder(stat.getGroupingColumns(), stat.getGroupingValues()); + stat.getTableStatistics() + .forEach(builder::addTableStatistic); + stat.getColumnStatistics().entrySet().stream() + .filter(entry -> icebergSupportedStatistics.contains(entry.getKey())) + .forEach(entry -> 
builder.addColumnStatistic(entry.getKey(), entry.getValue())); + return builder.build(); + }).collect(toImmutableList()); + super.finishStatisticsCollection(session, tableHandle, icebergComputedStatistics); } @Override diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java index 5a597d97051b..57f954801f2f 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/IcebergSessionProperties.java @@ -64,6 +64,7 @@ public final class IcebergSessionProperties public static final String HIVE_METASTORE_STATISTICS_MERGE_STRATEGY = "hive_statistics_merge_strategy"; public static final String STATISTIC_SNAPSHOT_RECORD_DIFFERENCE_WEIGHT = "statistic_snapshot_record_difference_weight"; public static final String ROWS_FOR_METADATA_OPTIMIZATION_THRESHOLD = "rows_for_metadata_optimization_threshold"; + public static final String STATISTICS_KLL_SKETCH_K_PARAMETER = "statistics_kll_sketch_k_parameter"; private final List> sessionProperties; @@ -184,6 +185,10 @@ public IcebergSessionProperties( "of an Iceberg table exceeds this threshold, metadata optimization would be skipped for " + "the table. 
A value of 0 means skip metadata optimization directly.", icebergConfig.getRowsForMetadataOptimizationThreshold(), + false)) + .add(integerProperty(STATISTICS_KLL_SKETCH_K_PARAMETER, + "The K parameter for the Apache DataSketches KLL sketch when computing histogram statistics", + icebergConfig.getStatisticsKllSketchKParameter(), false)); nessieConfig.ifPresent((config) -> propertiesBuilder @@ -313,4 +318,9 @@ public static String getNessieReferenceHash(ConnectorSession session) { return session.getProperty(NESSIE_REFERENCE_HASH, String.class); } + + public static int getStatisticsKllSketchKParameter(ConnectorSession session) + { + return session.getProperty(STATISTICS_KLL_SKETCH_K_PARAMETER, Integer.class); + } } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java index b86a7e9c87cb..307a27dddf77 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/TableStatisticsMaker.java @@ -16,10 +16,14 @@ import com.facebook.airlift.log.Logger; import com.facebook.presto.common.RuntimeUnit; import com.facebook.presto.common.block.Block; +import com.facebook.presto.common.predicate.Range; import com.facebook.presto.common.predicate.TupleDomain; +import com.facebook.presto.common.type.DecimalType; import com.facebook.presto.common.type.FixedWidthType; +import com.facebook.presto.common.type.KllSketchType; import com.facebook.presto.common.type.TypeManager; import com.facebook.presto.hive.NodeVersion; +import com.facebook.presto.iceberg.statistics.KllHistogram; import com.facebook.presto.iceberg.statistics.StatisticsFileCache; import com.facebook.presto.iceberg.statistics.StatisticsFileCacheKey; import com.facebook.presto.spi.ConnectorSession; @@ -29,12 +33,15 @@ import com.facebook.presto.spi.statistics.ColumnStatisticType; import 
com.facebook.presto.spi.statistics.ColumnStatistics; import com.facebook.presto.spi.statistics.ComputedStatistics; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.theta.CompactSketch; import org.apache.iceberg.ContentFile; @@ -62,6 +69,8 @@ import org.apache.iceberg.types.Types; import org.apache.iceberg.util.Pair; +import javax.annotation.Nullable; + import java.io.IOException; import java.io.UncheckedIOException; import java.nio.ByteBuffer; @@ -80,6 +89,7 @@ import static com.facebook.presto.common.type.BigintType.BIGINT; import static com.facebook.presto.common.type.DateType.DATE; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static com.facebook.presto.common.type.TimestampType.TIMESTAMP; import static com.facebook.presto.common.type.TimestampWithTimeZoneType.TIMESTAMP_WITH_TIME_ZONE; import static com.facebook.presto.common.type.TypeUtils.isNumericType; @@ -89,10 +99,14 @@ import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_FILESYSTEM_ERROR; import static com.facebook.presto.iceberg.IcebergErrorCode.ICEBERG_INVALID_METADATA; import static com.facebook.presto.iceberg.IcebergSessionProperties.getStatisticSnapshotRecordDifferenceWeight; +import static com.facebook.presto.iceberg.IcebergSessionProperties.getStatisticsKllSketchKParameter; import static com.facebook.presto.iceberg.IcebergUtil.getIdentityPartitions; import static com.facebook.presto.iceberg.Partition.toMap; +import static com.facebook.presto.iceberg.TypeConverter.toPrestoType; +import static 
com.facebook.presto.iceberg.statistics.KllHistogram.isKllHistogramSupportedType; import static com.facebook.presto.iceberg.util.StatisticsUtil.calculateAndSetTableSize; import static com.facebook.presto.iceberg.util.StatisticsUtil.formatIdentifier; +import static com.facebook.presto.spi.statistics.ColumnStatisticType.HISTOGRAM; import static com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; import static com.facebook.presto.spi.statistics.SourceInfo.ConfidenceLevel.HIGH; @@ -100,6 +114,7 @@ import static com.google.common.collect.ImmutableMap.toImmutableMap; import static com.google.common.collect.ImmutableSet.toImmutableSet; import static com.google.common.collect.Iterables.getOnlyElement; +import static com.google.common.collect.Iterators.getOnlyElement; import static java.lang.Long.parseLong; import static java.lang.Math.abs; import static java.lang.String.format; @@ -112,6 +127,7 @@ public class TableStatisticsMaker private static final Logger log = Logger.get(TableStatisticsMaker.class); private static final String ICEBERG_THETA_SKETCH_BLOB_TYPE_ID = "apache-datasketches-theta-v1"; private static final String ICEBERG_DATA_SIZE_BLOB_TYPE_ID = "presto-sum-data-size-bytes-v1"; + private static final String ICEBERG_KLL_SKETCH_BLOB_TYPE_ID = "presto-kll-sketch-bytes-v1"; private static final String ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY = "ndv"; private static final String ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY = "data_size"; private final Table icebergTable; @@ -132,11 +148,13 @@ private TableStatisticsMaker(Table icebergTable, ConnectorSession session, TypeM private static final Map puffinStatWriters = ImmutableMap.builder() .put(NUMBER_OF_DISTINCT_VALUES, TableStatisticsMaker::generateNDVBlob) .put(TOTAL_SIZE_IN_BYTES, TableStatisticsMaker::generateStatSizeBlob) + .put(HISTOGRAM, TableStatisticsMaker::generateKllSketchBlob) .build(); private 
static final Map puffinStatReaders = ImmutableMap.builder() .put(ICEBERG_THETA_SKETCH_BLOB_TYPE_ID, TableStatisticsMaker::readNDVBlob) .put(ICEBERG_DATA_SIZE_BLOB_TYPE_ID, TableStatisticsMaker::readDataSizeBlob) + .put(ICEBERG_KLL_SKETCH_BLOB_TYPE_ID, TableStatisticsMaker::readKllSketchBlob) .build(); public static TableStatistics getTableStatistics( @@ -237,7 +255,18 @@ private TableStatistics makeTableStatistics(StatisticsFileCache statisticsFileCa Object min = summary.getMinValues().get(fieldId); Object max = summary.getMaxValues().get(fieldId); if (min instanceof Number && max instanceof Number) { - columnBuilder.setRange(Optional.of(new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue()))); + DoubleRange range = new DoubleRange(((Number) min).doubleValue(), ((Number) max).doubleValue()); + columnBuilder.setRange(Optional.of(range)); + + // the histogram is generated by scanning the entire dataset. It is possible that + // the constraint prevents scanning portions of the table. Given that we know the + // range that the scan provides for a particular column, bound the histogram to the + // scanned range. 
+ + final DoubleRange histRange = range; + columnBuilder.setHistogram(columnBuilder.getHistogram() + .map(histogram -> DisjointRangeDomainHistogram + .addConjunction(histogram, Range.range(DOUBLE, histRange.getMin(), true, histRange.getMax(), true)))); } result.setColumnStatistics(columnHandle, columnBuilder.build()); } @@ -337,9 +366,8 @@ private void writeTableStatistics(NodeVersion nodeVersion, IcebergTableHandle ta .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)) .forEach((key, value) -> { Optional.ofNullable(puffinStatWriters.get(key.getStatisticType())) - .ifPresent(generator -> { - writer.add(generator.generate(key, value, icebergTable, snapshot)); - }); + .flatMap(generator -> Optional.ofNullable(generator.generate(key, value, icebergTable, snapshot, typeManager))) + .ifPresent(writer::add); }); writer.finish(); icebergTable.updateStatistics().setStatistics( @@ -364,7 +392,8 @@ private void writeTableStatistics(NodeVersion nodeVersion, IcebergTableHandle ta @FunctionalInterface private interface PuffinBlobGenerator { - Blob generate(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot); + @Nullable + Blob generate(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager); } @FunctionalInterface @@ -373,12 +402,12 @@ private interface PuffinBlobReader /** * Reads the stats from the blob and then updates the stats builder argument. 
*/ - void read(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder stats); + void read(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder stats, Table icebergTable, TypeManager typeManager); } - private static Blob generateNDVBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot) + private static Blob generateNDVBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager) { - int id = getFieldId(metadata, icebergTable); + int id = getField(metadata, icebergTable, snapshot).fieldId(); ByteBuffer raw = VARBINARY.getSlice(value, 0).toByteBuffer(); CompactSketch sketch = CompactSketch.wrap(Memory.wrap(raw, ByteOrder.nativeOrder())); return new Blob( @@ -391,9 +420,9 @@ private static Blob generateNDVBlob(ColumnStatisticMetadata metadata, Block valu ImmutableMap.of(ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY, Long.toString((long) sketch.getEstimate()))); } - private static Blob generateStatSizeBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot) + private static Blob generateStatSizeBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager) { - int id = getFieldId(metadata, icebergTable); + int id = getField(metadata, icebergTable, snapshot).fieldId(); long size = BIGINT.getLong(value, 0); return new Blob( ICEBERG_DATA_SIZE_BLOB_TYPE_ID, @@ -405,7 +434,26 @@ private static Blob generateStatSizeBlob(ColumnStatisticMetadata metadata, Block ImmutableMap.of(ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY, Long.toString(size))); } - private static void readNDVBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics) + private static Blob generateKllSketchBlob(ColumnStatisticMetadata metadata, Block value, Table icebergTable, Snapshot snapshot, TypeManager typeManager) + { + Types.NestedField field = getField(metadata, icebergTable, snapshot); + 
KllSketchType sketchType = new KllSketchType(toPrestoType(field.type(), typeManager)); + Slice sketchSlice = sketchType.getSlice(value, 0); + if (value.isNull(0)) { + // this can occur when all inputs to the sketch are null + return null; + } + return new Blob( + ICEBERG_KLL_SKETCH_BLOB_TYPE_ID, + ImmutableList.of(field.fieldId()), + snapshot.snapshotId(), + snapshot.sequenceNumber(), + sketchSlice.toByteBuffer(), + null, + ImmutableMap.of()); + } + + private static void readNDVBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics, Table icebergTable, TypeManager typeManager) { Optional.ofNullable(metadata.properties().get(ICEBERG_THETA_SKETCH_BLOB_PROPERTY_NDV_KEY)) .ifPresent(ndvProp -> { @@ -420,7 +468,7 @@ private static void readNDVBlob(BlobMetadata metadata, ByteBuffer blob, ColumnSt }); } - private static void readDataSizeBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics) + private static void readDataSizeBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics, Table icebergTable, TypeManager typeManager) { Optional.ofNullable(metadata.properties().get(ICEBERG_DATA_SIZE_BLOB_PROPERTY_KEY)) .ifPresent(sizeProp -> { @@ -435,9 +483,17 @@ private static void readDataSizeBlob(BlobMetadata metadata, ByteBuffer blob, Col }); } - private static int getFieldId(ColumnStatisticMetadata metadata, Table icebergTable) + private static void readKllSketchBlob(BlobMetadata metadata, ByteBuffer blob, ColumnStatistics.Builder statistics, Table icebergTable, TypeManager typeManager) + { + statistics.setHistogram(Optional.ofNullable(icebergTable.schemas().get(icebergTable.snapshot(metadata.snapshotId()).schemaId())) + .map(schema -> toPrestoType(schema.findType(getOnlyElement(metadata.inputFields().iterator())), typeManager)) + .map(prestoType -> new KllHistogram(Slices.wrappedBuffer(blob), prestoType))); + } + + private static Types.NestedField getField(ColumnStatisticMetadata metadata, Table 
icebergTable, Snapshot snapshot) { - return Optional.ofNullable(icebergTable.schema().findField(metadata.getColumnName())).map(Types.NestedField::fieldId) + return Optional.ofNullable(icebergTable.schemas().get(snapshot.schemaId())) + .map(schema -> schema.findField(metadata.getColumnName())) .orElseThrow(() -> { log.warn("failed to find column name %s in schema of table %s", metadata.getColumnName(), icebergTable.name()); return new PrestoException(ICEBERG_INVALID_METADATA, format("failed to find column name %s in schema of table %s", metadata.getColumnName(), icebergTable.name())); @@ -579,7 +635,7 @@ private Map loadStatisticsFile(IcebergTableHandle tab if (value == null) { value = ColumnStatistics.builder(); } - statReader.read(metadata, blob, value); + statReader.read(metadata, blob, value, icebergTable, typeManager); return value; }); }); @@ -604,7 +660,7 @@ private Map loadStatisticsFile(IcebergTableHandle tab return finalResult.build(); } - public static List getSupportedColumnStatistics(String columnName, com.facebook.presto.common.type.Type type) + public static List getSupportedColumnStatistics(ConnectorSession session, String columnName, com.facebook.presto.common.type.Type type) { ImmutableList.Builder supportedStatistics = ImmutableList.builder(); // all types which support being passed to the sketch_theta function @@ -615,6 +671,16 @@ public static List getSupportedColumnStatistics(String columnName, format("RETURN sketch_theta(%s)", formatIdentifier(columnName)), ImmutableList.of(columnName))); } + if (isKllHistogramSupportedType(type)) { + String histogramFunctionFmt = "RETURN sketch_kll_with_k(%s, CAST(%s as bigint))"; + if (type instanceof DecimalType) { + histogramFunctionFmt = "RETURN sketch_kll_with_k(CAST(%s as double), CAST(%s as bigint))"; + } + supportedStatistics.add(HISTOGRAM.getColumnStatisticMetadataWithCustomFunction(columnName, + format(histogramFunctionFmt, formatIdentifier(columnName), getStatisticsKllSketchKParameter(session)), + 
ImmutableList.of(columnName))); + } + if (!(type instanceof FixedWidthType)) { supportedStatistics.add(TOTAL_SIZE_IN_BYTES.getColumnStatisticMetadata(columnName)); } diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/KllHistogram.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/KllHistogram.java new file mode 100644 index 000000000000..0aab4cc0d175 --- /dev/null +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/statistics/KllHistogram.java @@ -0,0 +1,210 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.facebook.presto.iceberg.statistics; + +import com.facebook.presto.common.type.AbstractIntType; +import com.facebook.presto.common.type.AbstractLongType; +import com.facebook.presto.common.type.AbstractVarcharType; +import com.facebook.presto.common.type.Type; +import com.facebook.presto.spi.PrestoException; +import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.Estimate; +import com.fasterxml.jackson.annotation.JsonCreator; +import com.fasterxml.jackson.annotation.JsonProperty; +import com.google.common.annotations.VisibleForTesting; +import io.airlift.slice.Slice; +import io.airlift.slice.Slices; +import org.apache.datasketches.common.ArrayOfBooleansSerDe; +import org.apache.datasketches.common.ArrayOfDoublesSerDe; +import org.apache.datasketches.common.ArrayOfItemsSerDe; +import org.apache.datasketches.common.ArrayOfLongsSerDe; +import org.apache.datasketches.common.ArrayOfStringsSerDe; +import org.apache.datasketches.kll.KllItemsSketch; +import org.apache.datasketches.memory.Memory; +import org.openjdk.jol.info.ClassLayout; + +import java.util.Comparator; +import java.util.function.Function; + +import static com.facebook.presto.common.type.BooleanType.BOOLEAN; +import static com.facebook.presto.common.type.Decimals.isLongDecimal; +import static com.facebook.presto.common.type.Decimals.isShortDecimal; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.common.type.RealType.REAL; +import static com.facebook.presto.common.type.SmallintType.SMALLINT; +import static com.facebook.presto.common.type.TinyintType.TINYINT; +import static com.facebook.presto.common.type.TypeUtils.isNumericType; +import static com.facebook.presto.spi.StandardErrorCode.INVALID_ARGUMENTS; +import static com.google.common.base.MoreObjects.toStringHelper; +import static com.google.common.base.Verify.verify; +import static java.nio.ByteOrder.LITTLE_ENDIAN; +import static 
java.util.Objects.requireNonNull; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.EXCLUSIVE; +import static org.apache.datasketches.quantilescommon.QuantileSearchCriteria.INCLUSIVE; + +public class KllHistogram + implements ConnectorHistogram +{ + private static final long INSTANCE_SIZE = ClassLayout.parseClass(KllHistogram.class).instanceSize(); + // since the actual type parameter is only known at runtime, we can't concretely specify it + private final KllItemsSketch sketch; + private final Type type; + private final Function toDouble; + private final Function fromDouble; + + @SuppressWarnings({"unchecked", "rawtypes"}) + @JsonCreator + public KllHistogram(@JsonProperty("sketch") Slice bytes, @JsonProperty("type") Type type) + { + verify(isKllHistogramSupportedType(type), "histograms do not currently support type " + type.getDisplayName()); + this.type = requireNonNull(type, "type is null"); + SketchParameters parameters = getSketchParameters(type); + // the actual sketch can only accept the same object types which generated it + // however, the API can only accept or generate double types. We cast the inputs + // and results to/from double to satisfy the underlying sketch type. 
+ if (parameters.getSerde().getClassOfT().equals(Double.class)) { + toDouble = x -> (double) x; + fromDouble = x -> x; + } + else if (parameters.getSerde().getClassOfT().equals(Long.class)) { + // dual cast to auto-box/unbox from Double/Long for sketch + toDouble = x -> (double) (long) x; + fromDouble = x -> (long) (double) x; + } + else { + throw new PrestoException(INVALID_ARGUMENTS, "can't create kll sketch from type: " + type); + } + sketch = KllItemsSketch.wrap(Memory.wrap(bytes.toByteBuffer(), LITTLE_ENDIAN), parameters.getComparator(), parameters.getSerde()); + } + + public static boolean isKllHistogramSupportedType(Type type) + { + try { + return isNumericType(type) || + type instanceof AbstractIntType; + } + catch (PrestoException e) { + return false; + } + } + + @JsonProperty + public Slice getSketch() + { + return Slices.wrappedBuffer(sketch.toByteArray()); + } + + @JsonProperty + public Type getType() + { + return type; + } + + @VisibleForTesting + @SuppressWarnings("rawtypes") + public KllItemsSketch getKllSketch() + { + return sketch; + } + + @Override + public Estimate cumulativeProbability(double value, boolean inclusive) + { + return Estimate.of(sketch.getRank(fromDouble.apply(value), inclusive ? INCLUSIVE : EXCLUSIVE)); + } + + @Override + public Estimate inverseCumulativeProbability(double percentile) + { + return Estimate.of(toDouble.apply(sketch.getQuantile(percentile))); + } + + /** + * The memory utilization is dominated by the size of the sketch. This estimate + * doesn't account for the other fields in the class. 
+ */ + @Override + public long getEstimatedSize() + { + return INSTANCE_SIZE + sketch.getSerializedSizeBytes(); + } + + @Override + public String toString() + { + return toStringHelper(this) + .add("type", type) + .add("k", this.sketch.getK()) + .add("N", this.sketch.getN()) + .add("retained", this.sketch.getNumRetained()) + .add("mingetSerialized", this.sketch.getMinItem()) + .add("max", this.sketch.getMaxItem()) + .add("p50", sketch.getQuantile(0.5)) + .add("p75", sketch.getQuantile(0.75)) + .add("p90", sketch.getQuantile(0.90)) + .add("p99", sketch.getQuantile(0.99)) + .add("p99.9", sketch.getQuantile(0.999)) + .toString(); + } + + private static class SketchParameters + { + private final Comparator comparator; + private final ArrayOfItemsSerDe serde; + + public SketchParameters(Comparator comparator, ArrayOfItemsSerDe serde) + { + this.comparator = comparator; + this.serde = serde; + } + + public Comparator getComparator() + { + return comparator; + } + + public ArrayOfItemsSerDe getSerde() + { + return serde; + } + } + + private static SketchParameters getSketchParameters(Type type) + { + if (type.equals(REAL)) { + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe()); + } + else if (isShortDecimal(type)) { + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe()); + } + else if (isLongDecimal(type)) { + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe()); + } + else if (type.equals(DOUBLE)) { + return new SketchParameters<>(Double::compareTo, new ArrayOfDoublesSerDe()); + } + else if (type.equals(BOOLEAN)) { + return new SketchParameters<>(Boolean::compareTo, new ArrayOfBooleansSerDe()); + } + else if (type instanceof AbstractIntType || type instanceof AbstractLongType || type.equals(SMALLINT) || type.equals(TINYINT)) { + return new SketchParameters<>(Long::compareTo, new ArrayOfLongsSerDe()); + } + else if (type instanceof AbstractVarcharType) { + return new 
SketchParameters<>(String::compareTo, new ArrayOfStringsSerDe()); + } + else { + throw new PrestoException(INVALID_ARGUMENTS, "Unsupported type for KLL sketch: " + type); + } + } +} diff --git a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java index a0587ccd8b06..f19ae11b0795 100644 --- a/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java +++ b/presto-iceberg/src/main/java/com/facebook/presto/iceberg/util/StatisticsUtil.java @@ -103,6 +103,7 @@ public static TableStatistics mergeHiveStatistics(TableStatistics icebergStatist .setRange(icebergColumnStats.getRange()) .setNullsFraction(icebergColumnStats.getNullsFraction()) .setDistinctValuesCount(icebergColumnStats.getDistinctValuesCount()) + .setHistogram(icebergColumnStats.getHistogram()) .setRange(icebergColumnStats.getRange()); if (hiveColumnStats != null) { // NDVs diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java index a9f17f3a0f16..accc1bfbc3c3 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergDistributedTestBase.java @@ -21,6 +21,9 @@ import com.facebook.presto.common.transaction.TransactionId; import com.facebook.presto.common.type.FixedWidthType; import com.facebook.presto.common.type.TimeZoneKey; +import com.facebook.presto.common.type.Type; +import com.facebook.presto.common.type.TypeParameter; +import com.facebook.presto.hive.BaseHiveColumnHandle; import com.facebook.presto.hive.HdfsConfiguration; import com.facebook.presto.hive.HdfsConfigurationInitializer; import com.facebook.presto.hive.HdfsContext; @@ -41,6 +44,8 @@ import 
com.facebook.presto.spi.connector.classloader.ClassLoaderSafeConnectorMetadata; import com.facebook.presto.spi.security.AllowAllAccessControl; import com.facebook.presto.spi.statistics.ColumnStatistics; +import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.DoubleRange; import com.facebook.presto.spi.statistics.Estimate; import com.facebook.presto.spi.statistics.TableStatistics; import com.facebook.presto.testing.MaterializedResult; @@ -54,6 +59,7 @@ import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; import com.google.common.collect.Maps; +import org.apache.commons.math3.distribution.NormalDistribution; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -98,17 +104,23 @@ import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.UUID; import java.util.function.BiConsumer; +import java.util.function.Consumer; import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; import static com.facebook.presto.SystemSessionProperties.LEGACY_TIMESTAMP; +import static com.facebook.presto.SystemSessionProperties.OPTIMIZER_USE_HISTOGRAMS; import static com.facebook.presto.common.type.BigintType.BIGINT; +import static com.facebook.presto.common.type.DateType.DATE; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static com.facebook.presto.common.type.IntegerType.INTEGER; +import static com.facebook.presto.common.type.RealType.REAL; import static com.facebook.presto.common.type.TimeZoneKey.UTC_KEY; import static com.facebook.presto.common.type.VarcharType.VARCHAR; import static com.facebook.presto.hive.BaseHiveColumnHandle.ColumnType.SYNTHESIZED; @@ -132,13 +144,18 @@ import static 
com.facebook.presto.testing.TestingConnectorSession.SESSION; import static com.facebook.presto.testing.assertions.Assert.assertEquals; import static com.facebook.presto.tests.sql.TestTable.randomTableSuffix; +import static com.facebook.presto.type.DecimalParametricType.DECIMAL; +import static com.google.common.collect.ImmutableMap.toImmutableMap; import static java.lang.String.format; import static java.util.Objects.requireNonNull; +import static java.util.function.Function.identity; import static org.apache.iceberg.SnapshotSummary.TOTAL_DATA_FILES_PROP; import static org.apache.iceberg.SnapshotSummary.TOTAL_DELETE_FILES_PROP; import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_1_0; import static org.apache.parquet.column.ParquetProperties.WriterVersion.PARQUET_2_0; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotEquals; +import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertTrue; @Test(singleThreaded = true) @@ -1003,6 +1020,11 @@ private TableStatistics getTableStats(String name, Optional snapshot) return getTableStats(name, snapshot, getSession(), Optional.empty()); } + private TableStatistics getTableStats(String name, Optional snapshot, Session session) + { + return getTableStats(name, snapshot, session, Optional.empty()); + } + private TableStatistics getTableStats(String name, Optional snapshot, Session session, Optional> columns) { TransactionId transactionId = getQueryRunner().getTransactionManager().beginTransaction(false); @@ -1430,6 +1452,59 @@ public void testMetadataDeleteOnUnPartitionedTableWithDeleteFiles() } } + @DataProvider(name = "validHistogramTypes") + public Object[][] validHistogramTypesDataProvider() + { + return new Object[][] { + // types not supported in Iceberg connector, but that histogram could support + // {TINYINT, new String[]{"1", "2", "10"}}, + // {SMALLINT, new String[]{"1", "2", "10"}}, + // {TIMESTAMP_WITH_TIME_ZONE, new 
String[]{"now() + interval '1' hour", "now() + interval '2' hour"}}, + // iceberg stores microsecond precision but presto calculates on millisecond precision + // need a fix to properly convert for the optimizer. + // {TIMESTAMP, new String[] {"localtimestamp + interval '1' hour", "localtimestamp + interval '2' hour"}}, + // {TIME, new String[] {"localtime", "localtime + interval '1' hour"}}, + // supported types + {INTEGER, new String[] {"1", "5", "9"}}, + {BIGINT, new String[] {"2", "4", "6"}}, + {DOUBLE, new String[] {"1.0", "3.1", "4.6"}}, + // short decimal + {DECIMAL.createType(ImmutableList.of(TypeParameter.of(2L), TypeParameter.of(1L))), new String[] {"0.0", "3.0", "4.0"}}, + // long decimal + {DECIMAL.createType(ImmutableList.of(TypeParameter.of(38L), TypeParameter.of(1L))), new String[] {"0.0", "3.0", "4.0"}}, + {DATE, new String[] {"date '2024-01-01'", "date '2024-03-30'", "date '2024-05-30'"}}, + {REAL, new String[] {"1.0", "2.0", "3.0"}}, + }; + } + + /** + * Verifies that the histogram is returned after ANALYZE for a variety of types + */ + @Test(dataProvider = "validHistogramTypes") + public void testHistogramStorage(Type type, Object[] values) + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS create_histograms"); + assertQuerySucceeds(String.format("CREATE TABLE create_histograms (c %s)", type.getDisplayName())); + assertQuerySucceeds(String.format("INSERT INTO create_histograms VALUES %s", Joiner.on(", ").join(values))); + assertQuerySucceeds(session, "ANALYZE create_histograms"); + TableStatistics tableStatistics = getTableStats("create_histograms"); + Map nameToHandle = tableStatistics.getColumnStatistics().keySet() + .stream().map(IcebergColumnHandle.class::cast) + .collect(Collectors.toMap(BaseHiveColumnHandle::getName, identity())); + assertNotNull(nameToHandle.get("c")); + IcebergColumnHandle handle = 
nameToHandle.get("c"); + ColumnStatistics statistics = tableStatistics.getColumnStatistics().get(handle); + assertTrue(statistics.getHistogram().isPresent()); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS create_histograms"); + } + } + @Test public void testMetadataDeleteOnPartitionedTableWithDeleteFiles() { @@ -1916,6 +1991,151 @@ public void testBatchReadOnTimeType(WriterVersion writerVersion) assertQuerySucceeds("DROP TABLE time_batch_read"); } + public void testAllNullHistogramColumn() + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_all_nulls"); + assertQuerySucceeds("CREATE TABLE histogram_all_nulls (c bigint)"); + TableStatistics stats = getTableStats("histogram_all_nulls", Optional.empty(), session); + assertFalse(stats.getColumnStatistics().values().stream().findFirst().isPresent()); + assertUpdate("INSERT INTO histogram_all_nulls VALUES NULL, NULL, NULL, NULL, NULL", 5); + stats = getTableStats("histogram_all_nulls", Optional.empty(), session); + assertFalse(stats.getColumnStatistics().values().stream().findFirst() + .get().getHistogram().isPresent()); + assertQuerySucceeds(session, "ANALYZE histogram_all_nulls"); + stats = getTableStats("histogram_all_nulls", Optional.empty(), session); + assertFalse(stats.getColumnStatistics().values().stream().findFirst() + .get().getHistogram().isPresent()); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_all_nulls"); + } + } + + @Test(dataProvider = "validHistogramTypes") + public void testHistogramShowStats(Type type, Object[] values) + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS show_histograms"); + assertQuerySucceeds(String.format("CREATE TABLE show_histograms (c %s)", type.getDisplayName())); +
assertQuerySucceeds(String.format("INSERT INTO show_histograms VALUES %s", Joiner.on(", ").join(values))); + assertQuerySucceeds(session, "ANALYZE show_histograms"); + TableStatistics tableStatistics = getTableStats("show_histograms", Optional.empty(), session, Optional.empty()); + Map> histogramByColumnName = tableStatistics.getColumnStatistics() + .entrySet() + .stream() + .collect(toImmutableMap( + entry -> ((IcebergColumnHandle) entry.getKey()).getName(), + entry -> entry.getValue().getHistogram())); + MaterializedResult stats = getQueryRunner().execute("SHOW STATS for show_histograms"); + stats.getMaterializedRows() + .forEach(row -> { + String name = (String) row.getField(0); + String histogram = (String) row.getField(7); + assertEquals(Optional.ofNullable(histogramByColumnName.get(name)) + .flatMap(identity()) + .map(Objects::toString).orElse(null), + histogram); + }); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS show_histograms"); + } + } + + /** + * Verifies that when the users opts-in to using histograms that the + * optimizer estimates reflect the actual dataset for a variety of filter + * types (LTE, GT, EQ, NE) on a non-uniform data distribution + */ + @Test + public void testHistogramsUsedInOptimization() + { + Session histogramSession = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + // standard-normal distribution should have vastly different estimates than uniform at the tails (e.g. 
-3, +3) + NormalDistribution dist = new NormalDistribution(0, 1); + double[] values = dist.sample(1000); + Arrays.sort(values); + + try { + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_validation"); + assertQuerySucceeds("CREATE TABLE histogram_validation (c double)"); + assertQuerySucceeds(String.format("INSERT INTO histogram_validation VALUES %s", Joiner.on(", ").join(Arrays.stream(values).iterator()))); + assertQuerySucceeds(histogramSession, "ANALYZE histogram_validation"); + Consumer assertFilters = (value) -> { + // use Math.abs because if the value isn't found, the returned value of binary + // search is (-(insertion point) - 1). The absolute value tells us roughly how + // many records would have been returned regardless of if the actual value is in the + // dataset + double estimatedRowCount = Math.abs(Arrays.binarySearch(values, value)); + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c <= " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(estimatedRowCount, 25)); + // check that inverse filter equals roughly the inverse number of rows + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c > " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(Math.max(0.0, values.length - estimatedRowCount), 25)); + // having an exact random double value from the distribution exist more than once is exceedingly rare.
+ // the histogram calculation should return 1 (and the inverse) in both situations + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c = " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(1.0, 25)); + assertPlan(histogramSession, "SELECT * FROM histogram_validation WHERE c != " + value, + output(anyTree(tableScan("histogram_validation"))).withApproximateOutputRowCount(values.length - 1, 25)); + }; + + assertFilters.accept(values[1]); // choose 1 greater than the min value + assertFilters.accept(-2.0); // should be very unlikely to generate a distribution where all values > -2.0 + assertFilters.accept(-1.0); + assertFilters.accept(0.0); + assertFilters.accept(1.0); + assertFilters.accept(2.0); // should be very unlikely to generate a distribution where all values < 2.0 + assertFilters.accept(values[values.length - 2]); // choose 1 less than the max value + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS histogram_validation"); + } + } + + /** + * Verifies that the data in the histogram matches the mins/maxs of the values + * in the table when created + */ + @Test(dataProvider = "validHistogramTypes") + public void testHistogramReconstruction(Type type, Object[] values) + { + try { + Session session = Session.builder(getSession()) + .setSystemProperty(OPTIMIZER_USE_HISTOGRAMS, "true") + .build(); + assertQuerySucceeds("DROP TABLE IF EXISTS verify_histograms"); + assertQuerySucceeds(String.format("CREATE TABLE verify_histograms (c %s)", type.getDisplayName())); + assertQuerySucceeds(String.format("INSERT INTO verify_histograms VALUES %s", Joiner.on(", ").join(values))); + assertQuerySucceeds(session, "ANALYZE verify_histograms"); + TableStatistics tableStatistics = getTableStats("verify_histograms", Optional.empty(), session, Optional.empty()); + Map nameToHandle = tableStatistics.getColumnStatistics().keySet() + .stream().map(IcebergColumnHandle.class::cast) + 
.collect(Collectors.toMap(BaseHiveColumnHandle::getName, identity())); + assertNotNull(nameToHandle.get("c")); + IcebergColumnHandle handle = nameToHandle.get("c"); + ColumnStatistics statistics = tableStatistics.getColumnStatistics().get(handle); + ConnectorHistogram histogram = statistics.getHistogram().get(); + DoubleRange range = statistics.getRange().get(); + double min = range.getMin(); + double max = range.getMax(); + assertEquals(histogram.inverseCumulativeProbability(0.0).getValue(), min); + assertEquals(histogram.inverseCumulativeProbability(1.0).getValue(), max); + } + finally { + assertQuerySucceeds("DROP TABLE IF EXISTS verify_histograms"); + } + } + private void testCheckDeleteFiles(Table icebergTable, int expectedSize, List expectedFileContent) { // check delete file list diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergQueryRunner.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergQueryRunner.java index e1163b801c5a..67733fa514de 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergQueryRunner.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/IcebergQueryRunner.java @@ -200,7 +200,7 @@ public static DistributedQueryRunner createIcebergQueryRunner( queryRunner.createCatalog("jmx", "jmx"); } - if (catalogType == HIVE.name()) { + if (catalogType.equals(HIVE.name())) { ExtendedHiveMetastore metastore = getFileHiveMetastore(icebergDataDirectory); if (!metastore.getDatabase(METASTORE_CONTEXT, "tpch").isPresent()) { queryRunner.execute("CREATE SCHEMA tpch"); diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java index d28503a27d31..588b7273d44c 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/TestIcebergConfig.java @@ -69,7 +69,8 @@ public void 
testDefaults() .setMetadataPreviousVersionsMax(METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT) .setMetadataDeleteAfterCommit(METADATA_DELETE_AFTER_COMMIT_ENABLED_DEFAULT) .setMetricsMaxInferredColumn(METRICS_MAX_INFERRED_COLUMN_DEFAULTS_DEFAULT) - .setMaxStatisticsFileCacheSize(succinctDataSize(256, MEGABYTE))); + .setMaxStatisticsFileCacheSize(succinctDataSize(256, MEGABYTE)) + .setStatisticsKllSketchKParameter(1024)); } @Test @@ -101,6 +102,7 @@ public void testExplicitPropertyMappings() .put("iceberg.metadata-delete-after-commit", "true") .put("iceberg.metrics-max-inferred-column", "16") .put("iceberg.max-statistics-file-cache-size", "512MB") + .put("iceberg.statistics-kll-sketch-k-parameter", "4096") .build(); IcebergConfig expected = new IcebergConfig() @@ -128,7 +130,8 @@ public void testExplicitPropertyMappings() .setMetadataPreviousVersionsMax(1) .setMetadataDeleteAfterCommit(true) .setMetricsMaxInferredColumn(16) - .setMaxStatisticsFileCacheSize(succinctDataSize(512, MEGABYTE)); + .setMaxStatisticsFileCacheSize(succinctDataSize(512, MEGABYTE)) + .setStatisticsKllSketchKParameter(4096); assertFullMapping(properties, expected); } diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergHiveStatistics.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergHiveStatistics.java index 98fd3e94c2f4..be90d5fc0dd0 100644 --- a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergHiveStatistics.java +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/hive/TestIcebergHiveStatistics.java @@ -20,11 +20,27 @@ import com.facebook.presto.common.predicate.TupleDomain; import com.facebook.presto.common.predicate.ValueSet; import com.facebook.presto.common.transaction.TransactionId; +import com.facebook.presto.hive.HdfsConfiguration; +import com.facebook.presto.hive.HdfsConfigurationInitializer; +import com.facebook.presto.hive.HdfsEnvironment; +import com.facebook.presto.hive.HiveClientConfig; 
+import com.facebook.presto.hive.HiveHdfsConfiguration; +import com.facebook.presto.hive.MetastoreClientConfig; +import com.facebook.presto.hive.authentication.NoHdfsAuthentication; +import com.facebook.presto.hive.metastore.ExtendedHiveMetastore; +import com.facebook.presto.hive.metastore.file.FileHiveMetastore; +import com.facebook.presto.iceberg.CatalogType; import com.facebook.presto.iceberg.IcebergColumnHandle; +import com.facebook.presto.iceberg.IcebergHiveTableOperationsConfig; import com.facebook.presto.iceberg.IcebergMetadataColumn; +import com.facebook.presto.iceberg.IcebergUtil; +import com.facebook.presto.metadata.CatalogManager; import com.facebook.presto.metadata.Metadata; import com.facebook.presto.spi.ColumnHandle; +import com.facebook.presto.spi.ConnectorId; import com.facebook.presto.spi.Constraint; +import com.facebook.presto.spi.PrestoException; +import com.facebook.presto.spi.SchemaTableName; import com.facebook.presto.spi.TableHandle; import com.facebook.presto.spi.analyzer.MetadataResolver; import com.facebook.presto.spi.plan.TableScanNode; @@ -39,16 +55,22 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; +import org.apache.iceberg.StatisticsFile; +import org.apache.iceberg.Table; +import org.apache.iceberg.UpdateStatistics; import org.intellij.lang.annotations.Language; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.io.File; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; @@ -56,9 +78,14 @@ import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static com.facebook.presto.hive.BaseHiveColumnHandle.ColumnType.PARTITION_KEY; import static 
com.facebook.presto.hive.BaseHiveColumnHandle.ColumnType.REGULAR; +import static com.facebook.presto.hive.metastore.InMemoryCachingHiveMetastore.memoizeMetastore; +import static com.facebook.presto.iceberg.CatalogType.HIVE; +import static com.facebook.presto.iceberg.IcebergQueryRunner.ICEBERG_CATALOG; +import static com.facebook.presto.iceberg.IcebergQueryRunner.TEST_DATA_DIRECTORY; import static com.facebook.presto.iceberg.IcebergQueryRunner.createIcebergQueryRunner; import static com.facebook.presto.iceberg.IcebergSessionProperties.HIVE_METASTORE_STATISTICS_MERGE_STRATEGY; import static com.facebook.presto.iceberg.IcebergSessionProperties.PUSHDOWN_FILTER_ENABLED; +import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED; import static com.facebook.presto.spi.statistics.ColumnStatisticType.NUMBER_OF_DISTINCT_VALUES; import static com.facebook.presto.spi.statistics.ColumnStatisticType.TOTAL_SIZE_IN_BYTES; import static com.facebook.presto.testing.assertions.Assert.assertEquals; @@ -159,6 +186,7 @@ public void testStatsWithPartitionedTableAnalyzed() assertQuerySucceeds("CREATE TABLE statsWithPartitionAnalyze WITH (partitioning = ARRAY['orderdate']) as SELECT * FROM statsNoPartitionAnalyze"); assertQuerySucceeds("ANALYZE statsNoPartitionAnalyze"); assertQuerySucceeds("ANALYZE statsWithPartitionAnalyze"); + deleteTableStatistics("statsWithPartitionAnalyze"); Metadata meta = getQueryRunner().getMetadata(); TransactionId txid = getQueryRunner().getTransactionManager().beginTransaction(false); Session session = getSession().beginTransactionId(txid, getQueryRunner().getTransactionManager(), new AllowAllAccessControl()); @@ -295,12 +323,17 @@ public void testHiveStatisticsMergeFlags() { assertQuerySucceeds("CREATE TABLE mergeFlagsStats (i int, v varchar)"); assertQuerySucceeds("INSERT INTO mergeFlagsStats VALUES (0, '1'), (1, '22'), (2, '333'), (NULL, 'aaaaa'), (4, NULL)"); - assertQuerySucceeds("ANALYZE mergeFlagsStats"); // stats stored in + 
assertQuerySucceeds("ANALYZE mergeFlagsStats"); + + // invalidate puffin files so only hive stats can be returned + deleteTableStatistics("mergeFlagsStats"); + // Test stats without merging doesn't return NDVs or data size Session session = Session.builder(getSession()) .setCatalogSessionProperty("iceberg", HIVE_METASTORE_STATISTICS_MERGE_STRATEGY, "") .build(); TableStatistics stats = getTableStatistics(session, "mergeFlagsStats"); + Map columnStatistics = getColumnNameMap(stats); assertEquals(columnStatistics.get("i").getDistinctValuesCount(), Estimate.unknown()); assertEquals(columnStatistics.get("i").getDataSize(), Estimate.unknown()); @@ -468,6 +501,64 @@ static void assertStatValue(StatsSchema column, MaterializedResult result, Set new RuntimeException("Catalog directory does not exist: " + getCatalogDirectory(HIVE))), + "test"); + return memoizeMetastore(fileHiveMetastore, false, 1000, 0); + } + + protected static HdfsEnvironment getHdfsEnvironment() + { + HiveClientConfig hiveClientConfig = new HiveClientConfig(); + MetastoreClientConfig metastoreClientConfig = new MetastoreClientConfig(); + HdfsConfiguration hdfsConfiguration = new HiveHdfsConfiguration(new HdfsConfigurationInitializer(hiveClientConfig, metastoreClientConfig), + ImmutableSet.of(), + hiveClientConfig); + return new HdfsEnvironment(hdfsConfiguration, metastoreClientConfig, new NoHdfsAuthentication()); + } + + protected File getCatalogDirectory(CatalogType catalogType) + { + Path dataDirectory = getDistributedQueryRunner().getCoordinator().getDataDirectory(); + switch (catalogType) { + case HIVE: + return dataDirectory + .resolve(TEST_DATA_DIRECTORY) + .resolve(HIVE.name()) + .toFile(); + case HADOOP: + case NESSIE: + return dataDirectory.toFile(); + } + + throw new PrestoException(NOT_SUPPORTED, "Unsupported Presto Iceberg catalog type " + catalogType); + } + private static Map getColumnNameMap(TableStatistics statistics) { return 
statistics.getColumnStatistics().entrySet().stream().collect(Collectors.toMap(e -> diff --git a/presto-iceberg/src/test/java/com/facebook/presto/iceberg/statistics/TestKllHistogram.java b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/statistics/TestKllHistogram.java new file mode 100644 index 000000000000..2db83a1b8f0a --- /dev/null +++ b/presto-iceberg/src/test/java/com/facebook/presto/iceberg/statistics/TestKllHistogram.java @@ -0,0 +1,166 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package com.facebook.presto.iceberg.statistics; + +import com.facebook.presto.common.type.CharType; +import com.facebook.presto.common.type.Type; +import com.facebook.presto.common.type.VarcharType; +import com.google.common.base.VerifyException; +import io.airlift.slice.Slices; +import org.apache.datasketches.common.ArrayOfDoublesSerDe; +import org.apache.datasketches.common.ArrayOfLongsSerDe; +import org.apache.datasketches.kll.KllItemsSketch; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.nio.ByteBuffer; +import java.util.stream.DoubleStream; +import java.util.stream.LongStream; + +import static com.facebook.presto.common.type.BigintType.BIGINT; +import static com.facebook.presto.common.type.BooleanType.BOOLEAN; +import static com.facebook.presto.common.type.DateType.DATE; +import static com.facebook.presto.common.type.DecimalType.createDecimalType; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.common.type.IntegerType.INTEGER; +import static com.facebook.presto.common.type.RealType.REAL; +import static com.facebook.presto.common.type.TimeType.TIME; +import static com.facebook.presto.common.type.TimestampType.TIMESTAMP; +import static com.facebook.presto.common.type.TimestampWithTimeZoneType.TIMESTAMP_WITH_TIME_ZONE; +import static com.facebook.presto.common.type.VarcharType.VARCHAR; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertThrows; + +public class TestKllHistogram +{ + @SuppressWarnings("unchecked") + @Test + public void testSimpleCreation() + { + KllItemsSketch sketch = KllItemsSketch.newHeapInstance(Double::compareTo, new ArrayOfDoublesSerDe()); + DoubleStream.iterate(0.0, i -> i + 1).limit(100).forEach(sketch::update); + KllHistogram histogram = new KllHistogram(Slices.wrappedBuffer(ByteBuffer.wrap(sketch.toByteArray())), DOUBLE); + assertSketchesEqual(histogram.getKllSketch(), sketch); + } + + @Test + 
public void smokeTestHistogram() + { + // a histogram with a uniform distribution from 0.0 to 99.0 + KllHistogram basicHistogram = generateDoublesHistogram(); + // inverse cumulative probability + assertEquals(basicHistogram.inverseCumulativeProbability(0.0).getValue(), 0.0, 1E-8); + assertEquals(basicHistogram.inverseCumulativeProbability(1.0).getValue(), 99.0, 1E-8); + assertEquals(basicHistogram.inverseCumulativeProbability(0.5).getValue(), 49.0, 1E-8); + + // cumulative probability w/ inclusivities + assertEquals(basicHistogram.cumulativeProbability(0.0, true).getValue(), 0.01, 1E-8); + assertEquals(basicHistogram.cumulativeProbability(0.0, false).getValue(), 0.0, 1E-8); + + assertEquals(basicHistogram.cumulativeProbability(49.0, false).getValue(), 0.49, 1E-8); + assertEquals(basicHistogram.cumulativeProbability(49.0, true).getValue(), 0.5, 1E-8); + + assertEquals(basicHistogram.cumulativeProbability(99.0, false).getValue(), 0.99, 1E-8); + assertEquals(basicHistogram.cumulativeProbability(99.0, true).getValue(), 1.0, 1E-8); + } + + @DataProvider(name = "kllSupportedTypes") + public static Object[][] kllHistogramTypeDataProvider() + { + return new Object[][] { + // long decimal (represented by Slice.class), currently not supported + // {createDecimalType(), TestKllHistogram.generateLongSketch()}, + // time and timestamp types need additional changes because iceberg stores them in + // microsecond format but Presto always processes in milliseconds + // {TIMESTAMP_WITH_TIME_ZONE, generateLongSketch()}, + // {TIMESTAMP_MICROSECONDS, generateLongSketch()}, + // {TIMESTAMP, generateLongSketch()}, + // {TIME, generateLongSketch()}, + {INTEGER, TestKllHistogram.generateLongSketch()}, + {BIGINT, TestKllHistogram.generateLongSketch()}, + {DOUBLE, TestKllHistogram.generateDoubleSketch()}, + {createDecimalType(3, 1), TestKllHistogram.generateDoubleSketch()}, + {DATE, TestKllHistogram.generateLongSketch()}, + {createDecimalType(38, 0), 
TestKllHistogram.generateDoubleSketch()}, + {REAL, generateDoubleSketch()}, + }; + } + + @DataProvider(name = "kllUnsupportedTypes") + public static Object[][] unsupportedKllHistogramTypes() + { + return new Object[][] { + // types the KllHistogram constructor is expected to reject (see testUnsupportedKllTypes) + {CharType.createCharType(0)}, + {CharType.createCharType(100)}, + {BOOLEAN}, + {VARCHAR}, + {VarcharType.createVarcharType(10)}, + {TIMESTAMP}, + {TIMESTAMP_WITH_TIME_ZONE}, + {TIME}, + }; + } + + @SuppressWarnings("rawtypes") + @Test(dataProvider = "kllSupportedTypes") + public void testTypeCreation(Type type, KllItemsSketch sketch) + { + KllHistogram histogram = new KllHistogram(Slices.wrappedBuffer(sketch.toByteArray()), type); + double value = histogram.inverseCumulativeProbability(0.5).getValue(); + double probability = histogram.cumulativeProbability(49.0, true).getValue(); + assertEquals(probability, 0.5); + assertEquals(value, 49.0); + } + + @Test(dataProvider = "kllUnsupportedTypes") + public void testUnsupportedKllTypes(Type type) + { + assertThrows(VerifyException.class, () -> { + new KllHistogram(null, type); + }); + } + + /** + * @return a histogram built from the 100 doubles 0.0, 1.0, ..., 99.0 + */ + private static KllHistogram generateDoublesHistogram() + { + return new KllHistogram(Slices.wrappedBuffer(ByteBuffer.wrap(generateDoubleSketch().toByteArray())), DOUBLE); + } + + private static KllItemsSketch generateLongSketch() + { + KllItemsSketch sketch = KllItemsSketch.newHeapInstance(Long::compareTo, new ArrayOfLongsSerDe()); + LongStream.iterate(0, i -> i + 1).limit(100).forEach(sketch::update); + return sketch; + } + + private static KllItemsSketch generateDoubleSketch() + { + KllItemsSketch sketch = KllItemsSketch.newHeapInstance(Double::compareTo, new ArrayOfDoublesSerDe()); + DoubleStream.iterate(0.0, i -> i + 1).limit(100).forEach(sketch::update); + return sketch; + } + + private static void assertSketchesEqual(KllItemsSketch sketch,
KllItemsSketch other) + { + assertEquals(other.getK(), sketch.getK()); + assertEquals(other.getN(), sketch.getN()); + assertEquals(other.getMinItem(), sketch.getMinItem()); + assertEquals(other.getMaxItem(), sketch.getMaxItem()); + assertEquals(other.getSortedView().getCumulativeWeights(), sketch.getSortedView().getCumulativeWeights()); + assertEquals(other.getSortedView().getQuantiles(), sketch.getSortedView().getQuantiles()); + } +} diff --git a/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java b/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java index 306e00cfb907..efc05bf4a20a 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/ComparisonStatsCalculator.java @@ -17,7 +17,10 @@ import com.facebook.presto.Session; import com.facebook.presto.SystemSessionProperties; import com.facebook.presto.spi.relation.VariableReferenceExpression; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; import com.facebook.presto.spi.statistics.Estimate; +import com.facebook.presto.spi.statistics.HistogramCalculator; +import com.facebook.presto.spi.statistics.UniformDistributionHistogram; import com.facebook.presto.sql.tree.ComparisonExpression; import java.util.Optional; @@ -156,7 +159,7 @@ private PlanNodeStatsEstimate estimateFilterRange( .setStatisticsRange(intersectRange) .setNullsFraction(0.0); if (useHistograms) { - symbolNewEstimate.setHistogram(expressionStatistics.getHistogram().map(expressionHistogram -> DisjointRangeDomainHistogram.addConjunction(expressionHistogram, intersectRange))); + symbolNewEstimate.setHistogram(expressionStatistics.getHistogram().map(expressionHistogram -> DisjointRangeDomainHistogram.addConjunction(expressionHistogram, intersectRange.toPrestoRange()))); } estimate = estimate.mapVariableColumnStatistics(expressionVariable.get(), oldStats -> 
symbolNewEstimate.build()); @@ -171,7 +174,7 @@ private double calculateFilterFactor(VariableStatsEstimate variableStatistics, S Estimate filterEstimate; if (useHistograms) { Estimate distinctEstimate = isNaN(variableStatistics.getDistinctValuesCount()) ? Estimate.unknown() : Estimate.of(variableRange.getDistinctValuesCount()); - filterEstimate = HistogramCalculator.calculateFilterFactor(intersectRange, variableStatistics.getHistogram().orElse(new UniformDistributionHistogram(variableStatistics.getLowValue(), variableStatistics.getHighValue())), distinctEstimate, true); + filterEstimate = HistogramCalculator.calculateFilterFactor(intersectRange.toPrestoRange(), intersectRange.getDistinctValuesCount(), variableStatistics.getHistogram().orElse(new UniformDistributionHistogram(variableStatistics.getLowValue(), variableStatistics.getHighValue())), distinctEstimate, true); if (log.isDebugEnabled()) { double expressionFilter = variableRange.overlapPercentWith(intersectRange); if (!Double.isNaN(expressionFilter) && diff --git a/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java b/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java index 7a2e804aa7e1..85a51c2b6bb5 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/ConnectorFilterStatsCalculatorService.java @@ -141,6 +141,11 @@ private static ColumnStatistics toColumnStatistics(VariableStatsEstimate variabl if (!Double.isNaN(variableStatsEstimate.getLowValue()) && !Double.isNaN(variableStatsEstimate.getHighValue())) { builder.setRange(new DoubleRange(variableStatsEstimate.getLowValue(), variableStatsEstimate.getHighValue())); } + + if (variableStatsEstimate.getHistogram().isPresent()) { + builder.setHistogram(variableStatsEstimate.getHistogram()); + } + return builder.build(); } } diff --git 
a/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java b/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java index 6547c0b463e1..fdf4cf1b4bce 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/JoinStatsRule.java @@ -34,10 +34,10 @@ import static com.facebook.presto.SystemSessionProperties.getDefaultJoinSelectivityCoefficient; import static com.facebook.presto.SystemSessionProperties.shouldOptimizerUseHistograms; -import static com.facebook.presto.cost.DisjointRangeDomainHistogram.addConjunction; import static com.facebook.presto.cost.FilterStatsCalculator.UNKNOWN_FILTER_COEFFICIENT; import static com.facebook.presto.cost.VariableStatsEstimate.buildFrom; import static com.facebook.presto.expressions.LogicalRowExpressions.extractConjuncts; +import static com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram.addConjunction; import static com.facebook.presto.sql.analyzer.ExpressionTreeUtils.getNodeLocation; import static com.facebook.presto.sql.planner.plan.Patterns.join; import static com.facebook.presto.sql.tree.ComparisonExpression.Operator.EQUAL; @@ -250,7 +250,7 @@ private PlanNodeStatsEstimate filterByAuxiliaryClause(PlanNodeStatsEstimate stat .setStatisticsRange(intersect) .setDistinctValuesCount(retainedNdv); if (useHistograms) { - newLeftStats.setHistogram(leftStats.getHistogram().map(leftHistogram -> addConjunction(leftHistogram, intersect))); + newLeftStats.setHistogram(leftStats.getHistogram().map(leftHistogram -> addConjunction(leftHistogram, intersect.toPrestoRange()))); } VariableStatsEstimate.Builder newRightStats = buildFrom(rightStats) @@ -258,7 +258,7 @@ private PlanNodeStatsEstimate filterByAuxiliaryClause(PlanNodeStatsEstimate stat .setStatisticsRange(intersect) .setDistinctValuesCount(retainedNdv); if (useHistograms) { - newRightStats.setHistogram(rightStats.getHistogram().map(rightHistogram -> 
addConjunction(rightHistogram, intersect))); + newRightStats.setHistogram(rightStats.getHistogram().map(rightHistogram -> addConjunction(rightHistogram, intersect.toPrestoRange()))); } PlanNodeStatsEstimate.Builder result = PlanNodeStatsEstimate.buildFrom(stats) diff --git a/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java b/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java index 1b2797e18a8a..2a280ae2524b 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/PlanNodeStatsEstimateMath.java @@ -14,10 +14,11 @@ package com.facebook.presto.cost; import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; import java.util.Optional; -import static com.facebook.presto.cost.DisjointRangeDomainHistogram.addConjunction; +import static com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram.addConjunction; import static com.google.common.base.Preconditions.checkArgument; import static java.lang.Double.NaN; import static java.lang.Double.isNaN; @@ -139,7 +140,7 @@ public PlanNodeStatsEstimate capStats(PlanNodeStatsEstimate stats, PlanNodeStats double cappedNullsFraction = cappedRowCount == 0 ? 
1 : cappedNumberOfNulls / cappedRowCount; newSymbolStats.setNullsFraction(cappedNullsFraction); if (shouldUseHistograms) { - newSymbolStats.setHistogram(symbolStats.getHistogram().map(symbolHistogram -> addConjunction(symbolHistogram, new StatisticRange(newLow, newHigh, 0)))); + newSymbolStats.setHistogram(symbolStats.getHistogram().map(symbolHistogram -> addConjunction(symbolHistogram, new StatisticRange(newLow, newHigh, 0).toPrestoRange()))); } result.addVariableStatistics(symbol, newSymbolStats.build()); @@ -296,8 +297,8 @@ private VariableStatsEstimate addColumnStats( .setNullsFraction(newNullsFraction); if (shouldUseHistograms) { Optional newHistogram = RangeAdditionStrategy.INTERSECT == strategy ? - leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addConjunction(leftHistogram, rightRange)) : - leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addDisjunction(leftHistogram, rightRange)); + leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addConjunction(leftHistogram, rightRange.toPrestoRange())) : + leftStats.getHistogram().map(leftHistogram -> DisjointRangeDomainHistogram.addDisjunction(leftHistogram, rightRange.toPrestoRange())); statistics.setHistogram(newHistogram); } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java b/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java index 4d80e13cb92c..060e02bc8b07 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java +++ b/presto-main/src/main/java/com/facebook/presto/cost/StatisticRange.java @@ -13,19 +13,19 @@ */ package com.facebook.presto.cost; +import com.facebook.presto.common.predicate.Range; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.common.collect.BoundType; -import com.google.common.collect.Range; import java.util.Objects; +import static 
com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; import static com.facebook.presto.util.MoreMath.nearlyEqual; import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkArgument; -import static java.lang.Double.NEGATIVE_INFINITY; import static java.lang.Double.NaN; -import static java.lang.Double.POSITIVE_INFINITY; import static java.lang.Double.isFinite; import static java.lang.Double.isInfinite; import static java.lang.Double.isNaN; @@ -36,9 +36,6 @@ public class StatisticRange { - protected static final double INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.25; - protected static final double INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.5; - // TODO unify field and method names with SymbolStatsEstimate /** * {@code NaN} represents empty range ({@code high} must be {@code NaN} too) @@ -222,19 +219,12 @@ public StatisticRange addAndCollapseDistinctValues(StatisticRange other) return expandRangeWithNewDistinct(newDistinctValues, other); } - public Range toRange() + public Range toPrestoRange() { - return Range.range(low, openLow ? BoundType.OPEN : BoundType.CLOSED, high, openHigh ? BoundType.OPEN : BoundType.CLOSED); - } - - public static StatisticRange fromRange(Range range) - { - return new StatisticRange( - range.hasLowerBound() ? range.lowerEndpoint() : NEGATIVE_INFINITY, - !range.hasLowerBound() || range.lowerBoundType() == BoundType.OPEN, - range.hasUpperBound() ? 
range.upperEndpoint() : POSITIVE_INFINITY, - !range.hasUpperBound() || range.upperBoundType() == BoundType.OPEN, - NaN); + if (low == high) { + return Range.equal(DOUBLE, low); + } + return Range.range(DOUBLE, low, !openLow, high, !openHigh); } private StatisticRange expandRangeWithNewDistinct(double newDistinctValues, StatisticRange other) diff --git a/presto-main/src/main/java/com/facebook/presto/sql/planner/LogicalPlanner.java b/presto-main/src/main/java/com/facebook/presto/sql/planner/LogicalPlanner.java index 8c5a5c7a52a0..7cf36c6c9f77 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/planner/LogicalPlanner.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/planner/LogicalPlanner.java @@ -589,13 +589,13 @@ private ConnectorTableMetadata createTableMetadata(QualifiedObjectName table, Li private RowExpression rowExpression(Expression expression, SqlPlannerContext context, Analysis analysis) { return toRowExpression( - expression, - metadata, - session, - sqlParser, - variableAllocator, - analysis, - context.getTranslatorContext()); + expression, + metadata, + session, + sqlParser, + variableAllocator, + analysis, + context.getTranslatorContext()); } private static List getOutputTableColumns(RelationPlan plan, Optional> columnAliases) diff --git a/presto-main/src/main/java/com/facebook/presto/sql/planner/StatisticsAggregationPlanner.java b/presto-main/src/main/java/com/facebook/presto/sql/planner/StatisticsAggregationPlanner.java index 305ad85f77b0..43e67318b34b 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/planner/StatisticsAggregationPlanner.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/planner/StatisticsAggregationPlanner.java @@ -43,6 +43,7 @@ import java.util.Optional; import java.util.stream.Collectors; +import static com.facebook.presto.SystemSessionProperties.shouldOptimizerUseHistograms; import static com.facebook.presto.common.type.BigintType.BIGINT; import static 
com.facebook.presto.common.type.UnknownType.UNKNOWN; import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED; @@ -58,6 +59,7 @@ public class StatisticsAggregationPlanner { private final VariableAllocator variableAllocator; private final FunctionAndTypeResolver functionAndTypeResolver; + private final boolean useHistograms; private final Session session; private final FunctionAndTypeManager functionAndTypeManager; @@ -67,6 +69,7 @@ public StatisticsAggregationPlanner(VariableAllocator variableAllocator, Functio this.session = requireNonNull(session, "session is null"); this.functionAndTypeManager = requireNonNull(functionAndTypeManager, "functionAndTypeManager is null"); this.functionAndTypeResolver = functionAndTypeManager.getFunctionAndTypeResolver(); + this.useHistograms = shouldOptimizerUseHistograms(session); } public TableStatisticAggregation createStatisticsAggregation(TableStatisticsMetadata statisticsMetadata, Map columnToVariableMap) @@ -105,6 +108,9 @@ public TableStatisticAggregation createStatisticsAggregation(TableStatisticsMeta } for (ColumnStatisticMetadata columnStatisticMetadata : statisticsMetadata.getColumnStatistics()) { + if (!useHistograms && columnStatisticMetadata.getStatisticType() == ColumnStatisticType.HISTOGRAM) { + continue; + } String columnName = columnStatisticMetadata.getColumnName(); ColumnStatisticType statisticType = columnStatisticMetadata.getStatisticType(); VariableReferenceExpression inputVariable = columnToVariableMap.get(columnName); diff --git a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java index 7a7396824e09..801cb6f33773 100644 --- a/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java +++ b/presto-main/src/main/java/com/facebook/presto/sql/rewrite/ShowStatsRewrite.java @@ -21,6 +21,7 @@ import com.facebook.presto.common.type.IntegerType; import 
com.facebook.presto.common.type.RealType; import com.facebook.presto.common.type.SmallintType; +import com.facebook.presto.common.type.SqlTime; import com.facebook.presto.common.type.SqlTimestamp; import com.facebook.presto.common.type.TinyintType; import com.facebook.presto.common.type.Type; @@ -80,6 +81,7 @@ import static com.facebook.presto.common.type.SqlTimestamp.MICROSECONDS_PER_MILLISECOND; import static com.facebook.presto.common.type.StandardTypes.DOUBLE; import static com.facebook.presto.common.type.StandardTypes.VARCHAR; +import static com.facebook.presto.common.type.TimeType.TIME; import static com.facebook.presto.common.type.TimestampType.TIMESTAMP; import static com.facebook.presto.metadata.MetadataUtil.createQualifiedObjectName; import static com.facebook.presto.sql.QueryUtil.aliased; @@ -373,6 +375,9 @@ private Expression toStringLiteral(Type type, double value) if (type.equals(TIMESTAMP)) { return new StringLiteral(new SqlTimestamp(round(value) / MICROSECONDS_PER_MILLISECOND, session.getSqlFunctionProperties().getTimeZoneKey(), MILLISECONDS).toString()); } + if (type.equals(TIME)) { + return new StringLiteral(new SqlTime(round(value)).toString()); + } throw new IllegalArgumentException("Unexpected type: " + type); } } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java b/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java deleted file mode 100644 index ddccfdfe3c06..000000000000 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogramCalculator.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package com.facebook.presto.cost; - -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import org.testng.annotations.Test; - -import static com.facebook.presto.cost.HistogramCalculator.calculateFilterFactor; -import static java.lang.Double.NEGATIVE_INFINITY; -import static java.lang.Double.NaN; -import static java.lang.Double.POSITIVE_INFINITY; -import static org.testng.Assert.assertEquals; - -public class TestHistogramCalculator -{ - @Test - public void testCalculateFilterFactor() - { - StatisticRange zeroToTen = range(0, 10, 10); - StatisticRange empty = StatisticRange.empty(); - - // Equal ranges - assertFilterFactor(Estimate.of(1.0), zeroToTen, uniformHist(0, 10), 5); - assertFilterFactor(Estimate.of(1.0), zeroToTen, uniformHist(0, 10), 20); - - // Some overlap - assertFilterFactor(Estimate.of(0.5), range(5, 3000, 5), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // Single value overlap - assertFilterFactor(Estimate.of(1.0 / zeroToTen.getDistinctValuesCount()), range(3, 3, 1), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - assertFilterFactor(Estimate.of(1.0 / zeroToTen.getDistinctValuesCount()), range(10, 100, 357), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // No overlap - assertFilterFactor(Estimate.zero(), range(20, 30, 10), uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // Empty ranges - assertFilterFactor(Estimate.zero(), zeroToTen, uniformHist(empty), empty.getDistinctValuesCount()); - 
assertFilterFactor(Estimate.zero(), empty, uniformHist(zeroToTen), zeroToTen.getDistinctValuesCount()); - - // no test for (empty, empty) since any return value is correct - assertFilterFactor(Estimate.zero(), unboundedRange(10), uniformHist(empty), empty.getDistinctValuesCount()); - assertFilterFactor(Estimate.zero(), empty, uniformHist(unboundedRange(10)), 10); - - // Unbounded (infinite), NDV-based - assertFilterFactor(Estimate.of(0.5), unboundedRange(10), uniformHist(unboundedRange(20)), 20); - assertFilterFactor(Estimate.of(1.0), unboundedRange(20), uniformHist(unboundedRange(10)), 10); - - // NEW TESTS (TPC-H Q2) - // unbounded ranges - assertFilterFactor(Estimate.of(.5), unboundedRange(0.5), uniformHist(unboundedRange(NaN)), NaN); - // unbounded ranges with limited distinct values - assertFilterFactor(Estimate.of(0.2), unboundedRange(1.0), - domainConstrained(unboundedRange(5.0), uniformHist(unboundedRange(7.0))), 5.0); - } - - private static StatisticRange range(double low, double high, double distinctValues) - { - return new StatisticRange(low, high, distinctValues); - } - - private static StatisticRange unboundedRange(double distinctValues) - { - return new StatisticRange(NEGATIVE_INFINITY, POSITIVE_INFINITY, distinctValues); - } - - private static void assertFilterFactor(Estimate expected, StatisticRange range, ConnectorHistogram histogram, double totalDistinctValues) - { - assertEquals( - calculateFilterFactor(range, histogram, Estimate.estimateFromDouble(totalDistinctValues), true), - expected); - } - - private static ConnectorHistogram uniformHist(StatisticRange range) - { - return uniformHist(range.getLow(), range.getHigh()); - } - - private static ConnectorHistogram uniformHist(double low, double high) - { - return new UniformDistributionHistogram(low, high); - } - - private static ConnectorHistogram domainConstrained(StatisticRange range, ConnectorHistogram source) - { - return DisjointRangeDomainHistogram.addDisjunction(source, range); - } -} diff 
--git a/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java b/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java index dc2e60bb46e8..79e47477bc12 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestPlanNodeStatsEstimateMath.java @@ -15,6 +15,8 @@ import com.facebook.presto.spi.relation.VariableReferenceExpression; import com.facebook.presto.spi.statistics.ConnectorHistogram; +import com.facebook.presto.spi.statistics.DisjointRangeDomainHistogram; +import com.facebook.presto.spi.statistics.UniformDistributionHistogram; import org.testng.annotations.Test; import java.util.Optional; @@ -374,31 +376,31 @@ public void testAddHistograms() assertEquals(calculator.addStatsAndCollapseDistinctValues(unknownRowCount, unknownRowCount).getVariableStatistics(VARIABLE).getHistogram(), Optional.empty()); // check when rows are available histograms are added properly. 
- ConnectorHistogram addedSameRange = DisjointRangeDomainHistogram.addDisjunction(unknownNullsFraction.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen); + ConnectorHistogram addedSameRange = DisjointRangeDomainHistogram.addDisjunction(unknownNullsFraction.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen.toPrestoRange()); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndSumDistinctValues, addedSameRange); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndCollapseDistinctValues, addedSameRange); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndMaxDistinctValues, addedSameRange); assertAddStatsHistogram(unknownNullsFraction, unknownNullsFraction, calculator::addStatsAndIntersect, addedSameRange); // check when only a sub-range is added, that the histogram still represents the full range - ConnectorHistogram fullRangeFirst = DisjointRangeDomainHistogram.addDisjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen); - ConnectorHistogram intersectedRangeSecond = DisjointRangeDomainHistogram.addConjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToFive); + ConnectorHistogram fullRangeFirst = DisjointRangeDomainHistogram.addDisjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToTen.toPrestoRange()); + ConnectorHistogram intersectedRangeSecond = DisjointRangeDomainHistogram.addConjunction(first.getVariableStatistics(VARIABLE).getHistogram().get(), zeroToFive.toPrestoRange()); assertAddStatsHistogram(first, second, calculator::addStatsAndSumDistinctValues, fullRangeFirst); assertAddStatsHistogram(first, second, calculator::addStatsAndCollapseDistinctValues, fullRangeFirst); assertAddStatsHistogram(first, second, calculator::addStatsAndMaxDistinctValues, fullRangeFirst); assertAddStatsHistogram(first, second, calculator::addStatsAndIntersect, 
intersectedRangeSecond); // check when two ranges overlap, the new stats span both ranges - ConnectorHistogram fullRangeSecondThird = DisjointRangeDomainHistogram.addDisjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen); - ConnectorHistogram intersectedRangeSecondThird = DisjointRangeDomainHistogram.addConjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen); + ConnectorHistogram fullRangeSecondThird = DisjointRangeDomainHistogram.addDisjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen.toPrestoRange()); + ConnectorHistogram intersectedRangeSecondThird = DisjointRangeDomainHistogram.addConjunction(second.getVariableStatistics(VARIABLE).getHistogram().get(), fiveToTen.toPrestoRange()); assertAddStatsHistogram(second, third, calculator::addStatsAndSumDistinctValues, fullRangeSecondThird); assertAddStatsHistogram(second, third, calculator::addStatsAndCollapseDistinctValues, fullRangeSecondThird); assertAddStatsHistogram(second, third, calculator::addStatsAndMaxDistinctValues, fullRangeSecondThird); assertAddStatsHistogram(second, third, calculator::addStatsAndIntersect, intersectedRangeSecondThird); // check when two ranges partially overlap, the addition/intersection is applied correctly - ConnectorHistogram fullRangeThirdFourth = DisjointRangeDomainHistogram.addDisjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven); - ConnectorHistogram intersectedRangeThirdFourth = DisjointRangeDomainHistogram.addConjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven); + ConnectorHistogram fullRangeThirdFourth = DisjointRangeDomainHistogram.addDisjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven.toPrestoRange()); + ConnectorHistogram intersectedRangeThirdFourth = DisjointRangeDomainHistogram.addConjunction(third.getVariableStatistics(VARIABLE).getHistogram().get(), threeToSeven.toPrestoRange()); 
assertAddStatsHistogram(third, fourth, calculator::addStatsAndSumDistinctValues, fullRangeThirdFourth); assertAddStatsHistogram(third, fourth, calculator::addStatsAndCollapseDistinctValues, fullRangeThirdFourth); assertAddStatsHistogram(third, fourth, calculator::addStatsAndMaxDistinctValues, fullRangeThirdFourth); @@ -419,7 +421,7 @@ private static PlanNodeStatsEstimate statistics(double rowCount, double totalSiz .setNullsFraction(nullsFraction) .setAverageRowSize(averageRowSize) .setStatisticsRange(range) - .setHistogram(Optional.of(DisjointRangeDomainHistogram.addConjunction(new UniformDistributionHistogram(range.getLow(), range.getHigh()), range))) + .setHistogram(Optional.of(DisjointRangeDomainHistogram.addConjunction(new UniformDistributionHistogram(range.getLow(), range.getHigh()), range.toPrestoRange()))) .build()) .build(); } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java b/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java index b26665eb30af..b0f364d1f34e 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java +++ b/presto-main/src/test/java/com/facebook/presto/cost/TestVariableStatsEstimate.java @@ -15,7 +15,7 @@ package com.facebook.presto.cost; import com.facebook.airlift.json.JsonCodec; -import com.google.common.collect.Range; +import com.facebook.presto.spi.statistics.UniformDistributionHistogram; import org.testng.annotations.Test; import java.util.Optional; @@ -33,7 +33,7 @@ public void testSkipHistogramSerialization() VariableStatsEstimate estimate = VariableStatsEstimate.builder() .setAverageRowSize(100) .setDistinctValuesCount(100) - .setStatisticsRange(StatisticRange.fromRange(Range.open(1.0d, 2.0d))) + .setStatisticsRange(new StatisticRange(55, 65, 100)) .setHistogram(Optional.of(new UniformDistributionHistogram(55, 65))) .setNullsFraction(0.1) .build(); diff --git 
a/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/ApproximateStatsOutputRowCountMatcher.java b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/ApproximateStatsOutputRowCountMatcher.java new file mode 100644 index 000000000000..720adc0a0bf5 --- /dev/null +++ b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/ApproximateStatsOutputRowCountMatcher.java @@ -0,0 +1,54 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.facebook.presto.sql.planner.assertions; + +import com.facebook.presto.Session; +import com.facebook.presto.cost.StatsProvider; +import com.facebook.presto.metadata.Metadata; +import com.facebook.presto.spi.plan.PlanNode; + +import static com.google.common.base.Verify.verify; + +public class ApproximateStatsOutputRowCountMatcher + implements Matcher +{ + private final double expectedOutputRowCount; + private final double error; + + ApproximateStatsOutputRowCountMatcher(double expectedOutputRowCount, double error) + { + verify(error >= 0.0, "error must be >= 0.0"); + verify(expectedOutputRowCount >= 0.0, "expectedOutputRowCount must be >= 0.0"); + this.expectedOutputRowCount = expectedOutputRowCount; + this.error = error; + } + + @Override + public boolean shapeMatches(PlanNode node) + { + return true; + } + + @Override + public MatchResult detailMatches(PlanNode node, StatsProvider stats, Session session, Metadata metadata, SymbolAliases symbolAliases) + { + return new 
MatchResult(Math.abs(stats.getStats(node).getOutputRowCount() - expectedOutputRowCount) < error); + } + + @Override + public String toString() + { + return "approximateExpectedOutputRowCount(" + expectedOutputRowCount + ", " + error + ")"; + } +} diff --git a/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java index 320c24100f96..88bd7bc53b7c 100644 --- a/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java +++ b/presto-main/src/test/java/com/facebook/presto/sql/planner/assertions/PlanMatchPattern.java @@ -815,6 +815,12 @@ public PlanMatchPattern withOutputRowCount(boolean exactMatch, String expectedSo return this; } + public PlanMatchPattern withApproximateOutputRowCount(double expectedOutputRowCount, double error) + { + matchers.add(new ApproximateStatsOutputRowCountMatcher(expectedOutputRowCount, error)); + return this; + } + public PlanMatchPattern withOutputSize(double expectedOutputSize) { matchers.add(new StatsOutputSizeMatcher(expectedOutputSize)); diff --git a/presto-spi/pom.xml b/presto-spi/pom.xml index 2454397e48c7..50e975b0ef95 100644 --- a/presto-spi/pom.xml +++ b/presto-spi/pom.xml @@ -126,5 +126,11 @@ assertj-core test + + + org.apache.commons + commons-math3 + test + diff --git a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java index 97b1ac7fd4c1..255c72835099 100644 --- a/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/ColumnStatistics.java @@ -29,6 +29,8 @@ public final class ColumnStatistics private static final long COLUMN_STATISTICS_SIZE = ClassLayout.parseClass(ColumnStatistics.class).instanceSize(); private static final long OPTION_SIZE = 
ClassLayout.parseClass(Optional.class).instanceSize(); + public static final double INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.25; + public static final double INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR = 0.5; private static final ColumnStatistics EMPTY = new ColumnStatistics(Estimate.unknown(), Estimate.unknown(), Estimate.unknown(), Optional.empty(), Optional.empty()); private final Estimate nullsFraction; @@ -224,6 +226,11 @@ public Builder setHistogram(Optional histogram) return this; } + public Optional getHistogram() + { + return histogram; + } + public Builder mergeWith(Builder other) { if (nullsFraction.isUnknown()) { diff --git a/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DisjointRangeDomainHistogram.java similarity index 75% rename from presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java rename to presto-spi/src/main/java/com/facebook/presto/spi/statistics/DisjointRangeDomainHistogram.java index 553cf84d07bf..0a8df7ea13fe 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/DisjointRangeDomainHistogram.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/DisjointRangeDomainHistogram.java @@ -12,36 +12,34 @@ * limitations under the License. 
*/ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; +import com.facebook.presto.common.predicate.Marker; +import com.facebook.presto.common.predicate.Range; +import com.facebook.presto.common.predicate.SortedRangeSet; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; -import com.google.common.base.Suppliers; -import com.google.common.collect.BoundType; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Range; -import com.google.common.collect.RangeSet; -import com.google.common.collect.TreeRangeSet; import org.openjdk.jol.info.ClassLayout; -import java.util.Collection; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; import java.util.NoSuchElementException; import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.function.Supplier; -import static com.facebook.presto.cost.HistogramCalculator.calculateFilterFactor; -import static com.facebook.presto.util.MoreMath.max; -import static com.facebook.presto.util.MoreMath.min; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.collect.ImmutableSet.toImmutableSet; +import static com.facebook.presto.common.Utils.checkArgument; +import static com.facebook.presto.common.Utils.memoizedSupplier; +import static com.facebook.presto.common.Utils.toStringHelper; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static java.lang.Double.NEGATIVE_INFINITY; +import static java.lang.Double.NaN; import static java.lang.Double.POSITIVE_INFINITY; import static java.lang.Double.isFinite; +import static java.lang.Math.max; +import static java.lang.Math.min; import static java.util.Objects.hash; import 
static java.util.Objects.requireNonNull; @@ -75,31 +73,27 @@ public class DisjointRangeDomainHistogram private final ConnectorHistogram source; // use RangeSet as the internal representation of the ranges, but the constructor arguments // use StatisticRange to support serialization and deserialization. - private final Supplier> rangeSet; - private final Set> ranges; + private final Supplier rangeSet; + private final Set ranges; @JsonCreator - public DisjointRangeDomainHistogram(@JsonProperty("source") ConnectorHistogram source, @JsonProperty("ranges") Collection ranges) - { - this(source, ranges.stream().map(StatisticRange::toRange).collect(toImmutableSet())); - } - - public DisjointRangeDomainHistogram(ConnectorHistogram source, Set> ranges) + public DisjointRangeDomainHistogram(ConnectorHistogram source, Set ranges) { this.source = requireNonNull(source, "source is null"); this.ranges = requireNonNull(ranges, "ranges is null"); - this.rangeSet = Suppliers.memoize(() -> { - RangeSet rangeSet = TreeRangeSet.create(); - rangeSet.addAll(ranges); + this.rangeSet = memoizedSupplier(() -> { + SortedRangeSet rangeSet = SortedRangeSet.copyOf(DOUBLE, new ArrayList<>(ranges)); return rangeSet.subRangeSet(getSourceSpan(this.source)); }); } - private static Range getSourceSpan(ConnectorHistogram source) + private static Range getSourceSpan(ConnectorHistogram source) { - return Range.closed( + return Range.range(DOUBLE, source.inverseCumulativeProbability(0.0).orElse(() -> NEGATIVE_INFINITY), - source.inverseCumulativeProbability(1.0).orElse(() -> POSITIVE_INFINITY)); + true, + source.inverseCumulativeProbability(1.0).orElse(() -> POSITIVE_INFINITY), + true); } @JsonProperty @@ -109,14 +103,14 @@ public ConnectorHistogram getSource() } @JsonProperty - public Set getRanges() + public SortedRangeSet getRanges() { - return rangeSet.get().asRanges().stream().map(StatisticRange::fromRange).collect(toImmutableSet()); + return rangeSet.get(); } public 
DisjointRangeDomainHistogram(ConnectorHistogram source) { - this(source, ImmutableSet.>of()); + this(source, Collections.emptySet()); } @Override @@ -130,17 +124,22 @@ public Estimate cumulativeProbability(double value, boolean inclusive) if (Double.isNaN(value)) { return Estimate.unknown(); } - Optional> optionalSpan = getSpan(); + Optional optionalSpan = getSpan(); if (!optionalSpan.isPresent()) { return Estimate.of(0.0); } - Range span = optionalSpan.get(); - if (value <= span.lowerEndpoint()) { + Range span = optionalSpan.get(); + if (value <= span.getLowValue().map(Double.class::cast) + .orElse(NEGATIVE_INFINITY)) { return Estimate.of(0.0); } - Range input = Range.range(span.lowerEndpoint(), span.lowerBoundType(), value, inclusive ? BoundType.CLOSED : BoundType.OPEN); + Range input = Range.range(DOUBLE, + span.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY), + span.getLow().getBound() == Marker.Bound.EXACTLY, + value, + inclusive); Estimate fullSetOverlap = calculateRangeSetOverlap(rangeSet.get()); - RangeSet spanned = rangeSet.get().subRangeSet(input); + SortedRangeSet spanned = rangeSet.get().subRangeSet(input); Estimate spannedOverlap = calculateRangeSetOverlap(spanned); return spannedOverlap.flatMap(spannedProbability -> @@ -152,11 +151,11 @@ public Estimate cumulativeProbability(double value, boolean inclusive) })); } - private Estimate calculateRangeSetOverlap(RangeSet ranges) + private Estimate calculateRangeSetOverlap(SortedRangeSet ranges) { // we require knowing bounds on all ranges double cumulativeTotal = 0.0; - for (Range range : ranges.asRanges()) { + for (Range range : ranges.getOrderedRanges()) { Estimate rangeProbability = getRangeProbability(range); if (rangeProbability.isUnknown()) { return Estimate.unknown(); @@ -173,9 +172,9 @@ private Estimate calculateRangeSetOverlap(RangeSet ranges) * @param range the range over the source domain * @return estimate of the total probability the range covers in the source */ - private 
Estimate getRangeProbability(Range range) + private Estimate getRangeProbability(Range range) { - return calculateFilterFactor(StatisticRange.fromRange(range), source, Estimate.unknown(), false); + return HistogramCalculator.calculateFilterFactor(range, NaN, source, Estimate.unknown(), false); } @Override @@ -189,17 +188,19 @@ public Estimate inverseCumulativeProbability(double percentile) // rangedPercentile = percentile - percentileLow // // percentileLow + (rangedPercentile * rangePercentileLength) - Optional> optionalSpan = getSpan(); + Optional optionalSpan = getSpan(); if (!optionalSpan.isPresent()) { return Estimate.unknown(); } - Range span = optionalSpan.get(); - if (percentile == 0.0 && isFinite(span.lowerEndpoint())) { - return source.inverseCumulativeProbability(0.0).map(sourceMin -> max(span.lowerEndpoint(), sourceMin)); + Range span = optionalSpan.get(); + double lower = span.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY); + double upper = span.getHighValue().map(Double.class::cast).orElse(POSITIVE_INFINITY); + if (percentile == 0.0 && isFinite(lower)) { + return source.inverseCumulativeProbability(0.0).map(sourceMin -> max(lower, sourceMin)); } - if (percentile == 1.0 && isFinite(span.upperEndpoint())) { - return source.inverseCumulativeProbability(1.0).map(sourceMax -> min(span.upperEndpoint(), sourceMax)); + if (percentile == 1.0 && isFinite(upper)) { + return source.inverseCumulativeProbability(1.0).map(sourceMax -> min(upper, sourceMax)); } Estimate totalCumulativeEstimate = calculateRangeSetOverlap(rangeSet.get()); @@ -213,9 +214,9 @@ public Estimate inverseCumulativeProbability(double percentile) } double cumulativeProbabilityNewDomain = 0.0; double lastRangeEstimateSourceDomain = 0.0; - Range currentRange = null; + Range currentRange = null; // find the range where the percentile falls - for (Range range : rangeSet.get().asRanges()) { + for (Range range : rangeSet.get().getOrderedRanges()) { Estimate rangeEstimate = 
getRangeProbability(range); if (rangeEstimate.isUnknown()) { return Estimate.unknown(); @@ -231,7 +232,8 @@ public Estimate inverseCumulativeProbability(double percentile) // no ranges to iterate over. Did a constraint cut the entire domain of values? return Estimate.unknown(); } - Estimate rangeLeftSourceEstimate = source.cumulativeProbability(currentRange.lowerEndpoint(), currentRange.lowerBoundType() == BoundType.OPEN); + Double currentLow = currentRange.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY); + Estimate rangeLeftSourceEstimate = source.cumulativeProbability(currentLow, !currentRange.isLowInclusive()); if (rangeLeftSourceEstimate.isUnknown()) { return Estimate.unknown(); } @@ -250,12 +252,10 @@ public Estimate inverseCumulativeProbability(double percentile) * @param other the new range to add to the set. * @return a new {@link DisjointRangeDomainHistogram} */ - public DisjointRangeDomainHistogram addDisjunction(StatisticRange other) + public DisjointRangeDomainHistogram addDisjunction(Range other) { - Set> ranges = ImmutableSet.>builder() - .addAll(this.ranges) - .add(other.toRange()) - .build(); + Set ranges = new HashSet<>(this.ranges); + ranges.add(other); return new DisjointRangeDomainHistogram(source, ranges); } @@ -266,9 +266,9 @@ public DisjointRangeDomainHistogram addDisjunction(StatisticRange other) * @param other the range that should enclose the set. 
* @return a new {@link DisjointRangeDomainHistogram} where */ - public DisjointRangeDomainHistogram addConjunction(StatisticRange other) + public DisjointRangeDomainHistogram addConjunction(Range other) { - return new DisjointRangeDomainHistogram(source, rangeSet.get().subRangeSet(other.toRange()).asRanges()); + return new DisjointRangeDomainHistogram(source, new HashSet<>(rangeSet.get().subRangeSet(other).getOrderedRanges())); } /** @@ -287,17 +287,17 @@ public DisjointRangeDomainHistogram addConjunction(StatisticRange other) * @param range the range representing the conjunction to add * @return a new histogram with the conjunction applied. */ - public static ConnectorHistogram addDisjunction(ConnectorHistogram histogram, StatisticRange range) + public static ConnectorHistogram addDisjunction(ConnectorHistogram histogram, Range range) { if (histogram instanceof DisjointRangeDomainHistogram) { return ((DisjointRangeDomainHistogram) histogram).addDisjunction(range); } - return new DisjointRangeDomainHistogram(histogram, ImmutableSet.of(range.toRange())); + return new DisjointRangeDomainHistogram(histogram, Collections.singleton(range)); } /** - * Similar to {@link #addDisjunction(ConnectorHistogram, StatisticRange)} this method constrains + * Similar to {@link #addDisjunction(ConnectorHistogram, Range)} this method constrains * the entire domain such that all ranges in the set intersect with the given range * argument to this method. *
@@ -308,22 +308,24 @@ public static ConnectorHistogram addDisjunction(ConnectorHistogram histogram, St * @param range the range of values that the entire histogram's domain must fall within * @return a histogram with the new range constraint */ - public static ConnectorHistogram addConjunction(ConnectorHistogram histogram, StatisticRange range) + public static ConnectorHistogram addConjunction(ConnectorHistogram histogram, Range range) { if (histogram instanceof DisjointRangeDomainHistogram) { return ((DisjointRangeDomainHistogram) histogram).addConjunction(range); } - return new DisjointRangeDomainHistogram(histogram, ImmutableSet.of(range.toRange())); + return new DisjointRangeDomainHistogram(histogram, Collections.singleton(range)); } /** * @return the span if it exists, empty otherwise */ - private Optional> getSpan() + private Optional getSpan() { try { - return Optional.of(rangeSet.get().span()); + return Optional.of(rangeSet.get()) + .filter(set -> !set.isNone()) // prevent exception + .map(SortedRangeSet::getSpan); } catch (NoSuchElementException e) { return Optional.empty(); @@ -335,7 +337,7 @@ public String toString() { return toStringHelper(this) .add("source", this.source) - .add("rangeSet", this.rangeSet) + .add("rangeSet", this.rangeSet.get()) .toString(); } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/HistogramCalculator.java similarity index 68% rename from presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java rename to presto-spi/src/main/java/com/facebook/presto/spi/statistics/HistogramCalculator.java index 12525b6120cc..8db65dbc69ff 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/HistogramCalculator.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/HistogramCalculator.java @@ -12,12 +12,14 @@ * limitations under the License. 
*/ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import com.google.common.math.DoubleMath; +import com.facebook.presto.common.predicate.Range; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; +import static com.facebook.presto.spi.statistics.ColumnStatistics.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR; +import static java.lang.Double.NEGATIVE_INFINITY; +import static java.lang.Double.POSITIVE_INFINITY; import static java.lang.Double.isFinite; import static java.lang.Double.isNaN; import static java.lang.Math.min; @@ -43,16 +45,19 @@ private HistogramCalculator() * heuristic would have been used * @return an estimate, x, where 0.0 <= x <= 1.0. */ - public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHistogram histogram, Estimate totalDistinctValues, boolean useHeuristics) + public static Estimate calculateFilterFactor(Range range, double rangeDistinctValues, ConnectorHistogram histogram, Estimate totalDistinctValues, boolean useHeuristics) { - boolean openHigh = range.getOpenHigh(); - boolean openLow = range.getOpenLow(); + boolean openHigh = !range.isHighInclusive(); + boolean openLow = !range.isLowInclusive(); Estimate min = histogram.inverseCumulativeProbability(0.0); Estimate max = histogram.inverseCumulativeProbability(1.0); + double rangeLow = range.getLowValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY); + double rangeHigh = range.getHighValue().map(Double.class::cast).orElse(POSITIVE_INFINITY); + double rangeLength = rangeHigh - rangeLow; // range is either above or below histogram - if ((!max.isUnknown() && (openHigh ? max.getValue() <= range.getLow() : max.getValue() < range.getLow())) - || (!min.isUnknown() && (openLow ? 
min.getValue() >= range.getHigh() : min.getValue() > range.getHigh()))) { + if ((!max.isUnknown() && (openHigh ? max.getValue() <= rangeLow : max.getValue() < rangeLow)) + || (!min.isUnknown() && (openLow ? min.getValue() >= rangeHigh : min.getValue() > rangeHigh))) { return Estimate.of(0.0); } @@ -63,14 +68,14 @@ public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHist return Estimate.unknown(); } - if (range.length() == 0.0) { + if (rangeLength == 0.0) { return totalDistinctValues.map(distinct -> 1.0 / distinct); } - if (isFinite(range.length())) { - return Estimate.of(StatisticRange.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + if (isFinite(rangeLength)) { + return Estimate.of(INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } - return Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + return Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } // we know the bounds are both known, so calculate the percentile for each bound @@ -82,8 +87,8 @@ public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHist // thus for the "lowPercentile" calculation we should pass "false" to be non-inclusive // (same as openness) however, on the high-end we want the inclusivity to be the opposite // of the openness since if it's open, we _don't_ want to include the bound. 
- Estimate lowPercentile = histogram.cumulativeProbability(range.getLow(), openLow); - Estimate highPercentile = histogram.cumulativeProbability(range.getHigh(), !openHigh); + Estimate lowPercentile = histogram.cumulativeProbability(rangeLow, openLow); + Estimate highPercentile = histogram.cumulativeProbability(rangeHigh, !openHigh); // both bounds are probably infinity, use the infinite-infinite heuristic if (lowPercentile.isUnknown() || highPercentile.isUnknown()) { @@ -91,26 +96,26 @@ public static Estimate calculateFilterFactor(StatisticRange range, ConnectorHist return Estimate.unknown(); } // in the case the histogram has no values - if (totalDistinctValues.equals(Estimate.zero()) || range.getDistinctValuesCount() == 0.0) { + if (totalDistinctValues.equals(Estimate.zero()) || rangeDistinctValues == 0.0) { return Estimate.of(0.0); } // in the case only one is unknown if (((lowPercentile.isUnknown() && !highPercentile.isUnknown()) || (!lowPercentile.isUnknown() && highPercentile.isUnknown())) && - isFinite(range.length())) { - return Estimate.of(StatisticRange.INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + isFinite(rangeLength)) { + return Estimate.of(INFINITE_TO_FINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } - if (range.length() == 0.0) { + if (rangeLength == 0.0) { return totalDistinctValues.map(distinct -> 1.0 / distinct); } - if (!isNaN(range.getDistinctValuesCount())) { - return totalDistinctValues.map(distinct -> min(1.0, range.getDistinctValuesCount() / distinct)); + if (!isNaN(rangeDistinctValues)) { + return totalDistinctValues.map(distinct -> min(1.0, rangeDistinctValues / distinct)); } - return Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); + return Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR); } // in the case the range is a single value, this can occur if the input @@ -134,15 +139,23 @@ public static Estimate calculateFilterFactor(StatisticRange range, 
ConnectorHist } return totalDistinctValues.flatMap(totalDistinct -> { - if (DoubleMath.fuzzyEquals(totalDistinct, 0.0, 1E-6)) { + if (fuzzyEquals(totalDistinct, 0.0, 1E-6)) { return Estimate.of(1.0); } - return Estimate.of(min(1.0, range.getDistinctValuesCount() / totalDistinct)); + return Estimate.of(min(1.0, rangeDistinctValues / totalDistinct)); }) - // in the case totalDistinct is NaN or 0 - .or(() -> Estimate.of(StatisticRange.INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR)); + // in the case totalDistinct is NaN or 0 + .or(() -> Estimate.of(INFINITE_TO_INFINITE_RANGE_INTERSECT_OVERLAP_HEURISTIC_FACTOR)); } return lowPercentile.flatMap(lowPercent -> highPercentile.map(highPercent -> highPercent - lowPercent)); } + + private static boolean fuzzyEquals(double a, double b, double tolerance) + { + return Math.copySign(a - b, 1.0) <= tolerance + // copySign(x, 1.0) is a branch-free version of abs(x), but with different NaN semantics + || (a == b) // needed to ensure that infinities equal themselves + || (Double.isNaN(a) && Double.isNaN(b)); + } } diff --git a/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/UniformDistributionHistogram.java similarity index 90% rename from presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java rename to presto-spi/src/main/java/com/facebook/presto/spi/statistics/UniformDistributionHistogram.java index c9cb6bfd1952..8bc17d9e5c81 100644 --- a/presto-main/src/main/java/com/facebook/presto/cost/UniformDistributionHistogram.java +++ b/presto-spi/src/main/java/com/facebook/presto/spi/statistics/UniformDistributionHistogram.java @@ -12,17 +12,14 @@ * limitations under the License. 
*/ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; import com.fasterxml.jackson.annotation.JsonCreator; import com.fasterxml.jackson.annotation.JsonProperty; import org.openjdk.jol.info.ClassLayout; -import static com.google.common.base.MoreObjects.toStringHelper; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Verify.verify; +import static com.facebook.presto.common.Utils.checkArgument; +import static com.facebook.presto.common.Utils.toStringHelper; import static java.lang.Double.isInfinite; import static java.lang.Double.isNaN; import static java.lang.Math.max; @@ -48,7 +45,7 @@ public UniformDistributionHistogram( @JsonProperty("lowValue") double lowValue, @JsonProperty("highValue") double highValue) { - verify(isNaN(lowValue) || isNaN(highValue) || (lowValue <= highValue), "lowValue must be <= highValue"); + checkArgument(isNaN(lowValue) || isNaN(highValue) || (lowValue <= highValue), "lowValue must be <= highValue"); this.lowValue = lowValue; this.highValue = highValue; } diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDisjointRangeDomainHistogram.java similarity index 81% rename from presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java rename to presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDisjointRangeDomainHistogram.java index 0c11e729b1fe..6939becf267c 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestDisjointRangeDomainHistogram.java +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestDisjointRangeDomainHistogram.java @@ -12,20 +12,20 @@ * limitations under the License. 
*/ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Range; import org.apache.commons.math3.distribution.NormalDistribution; import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.distribution.UniformRealDistribution; import org.testng.annotations.Test; import java.util.List; -import java.util.stream.Collectors; +import static com.facebook.presto.common.predicate.Range.greaterThanOrEqual; +import static com.facebook.presto.common.predicate.Range.lessThanOrEqual; +import static com.facebook.presto.common.predicate.Range.range; +import static com.facebook.presto.common.type.DoubleType.DOUBLE; import static org.testng.Assert.assertEquals; public class TestDisjointRangeDomainHistogram @@ -39,9 +39,9 @@ public void testBasicDisjointRanges() { ConnectorHistogram source = new UniformDistributionHistogram(0, 100); ConnectorHistogram constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(0d, 25d))); + .addDisjunction(source, rangeOpen(0d, 25d)); constrained = DisjointRangeDomainHistogram - .addDisjunction(constrained, StatisticRange.fromRange(Range.open(75d, 100d))); + .addDisjunction(constrained, rangeOpen(75d, 100d)); assertEquals(constrained.inverseCumulativeProbability(0.75).getValue(), 87.5); assertEquals(constrained.inverseCumulativeProbability(0.0).getValue(), 0.0); assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 100); @@ -59,7 +59,7 @@ public void testSingleDisjointRange() // no overlap, left bound ConnectorHistogram constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(-10d, -5d))); + .addDisjunction(source, rangeOpen(-10d, -5d)); for (int i = -11; i < 12; i++) { 
assertEquals(constrained.cumulativeProbability(i, true).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(i, false).getValue(), 0.0, 1E-8); @@ -68,7 +68,7 @@ public void testSingleDisjointRange() assertEquals(constrained.inverseCumulativeProbability(1.0), Estimate.unknown()); // partial overlap left bound - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(-2d, 2d))); + constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(-2d, 2d))); assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(-1, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 1E-8); @@ -82,7 +82,7 @@ public void testSingleDisjointRange() assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 2d, 1E-8); //full overlap - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(3d, 4d))); + constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(3d, 4d))); assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(1, false).getValue(), 0.0, 1E-8); @@ -96,7 +96,7 @@ public void testSingleDisjointRange() assertEquals(constrained.inverseCumulativeProbability(1.0).getValue(), 4d, 1E-8); //right side overlap - constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(8d, 12d))); + constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(8d, 12d))); assertEquals(constrained.cumulativeProbability(-3, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0, false).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(5, false).getValue(), 0.0, 1E-8); @@ -114,7 +114,7 @@ public void 
testSingleDisjointRange() // no overlap, right bound constrained = DisjointRangeDomainHistogram - .addDisjunction(source, StatisticRange.fromRange(Range.open(15d, 20d))); + .addDisjunction(source, rangeOpen(15d, 20d)); for (int i = 15; i < 20; i++) { assertEquals(constrained.cumulativeProbability(i, true).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(i, false).getValue(), 0.0, 1E-8); @@ -132,8 +132,8 @@ public void testMultipleDisjunction() { StandardNormalHistogram source = new StandardNormalHistogram(); RealDistribution dist = source.getDistribution(); - ConnectorHistogram constrained = disjunction(source, Range.closed(-2d, -1d)); - constrained = disjunction(constrained, Range.closed(1d, 2d)); + ConnectorHistogram constrained = disjunction(source, rangeClosed(-2d, -1d)); + constrained = disjunction(constrained, rangeClosed(1d, 2d)); double rangeLeftProb = dist.cumulativeProbability(-1) - dist.cumulativeProbability(-2); double rangeRightProb = dist.cumulativeProbability(2) - dist.cumulativeProbability(1); double sumRangeProb = rangeLeftProb + rangeRightProb; @@ -156,7 +156,7 @@ public void testNormalDistribution() // standard normal StandardNormalHistogram source = new StandardNormalHistogram(); RealDistribution dist = source.getDistribution(); - ConnectorHistogram constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(Range.open(-1d, 1d))); + ConnectorHistogram constrained = new DisjointRangeDomainHistogram(source, ImmutableSet.of(rangeOpen(-1d, 1d))); assertEquals(constrained.cumulativeProbability(-1.0, true).getValue(), 0.0, 1E-8); assertEquals(constrained.cumulativeProbability(0.0, true).getValue(), 0.5, 1E-8); assertEquals(constrained.cumulativeProbability(1.0, true).getValue(), 1.0, 1E-8); @@ -179,16 +179,16 @@ public void testNormalDistribution() public void testAddDisjunction() { ConnectorHistogram source = new UniformDistributionHistogram(0, 100); - DisjointRangeDomainHistogram constrained = disjunction(source, 
Range.open(-1d, 2d)); - assertEquals(constrained.getRanges().size(), 1); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 2d)); - constrained = disjunction(constrained, Range.open(1d, 10d)); + DisjointRangeDomainHistogram constrained = disjunction(source, rangeOpen(-1d, 2d)); + assertEquals(constrained.getRanges().getOrderedRanges().size(), 1); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 0d, true, 2d, false)); + constrained = disjunction(constrained, rangeOpen(1d, 10d)); assertEquals(ranges(constrained).size(), 1); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 10d)); - constrained = disjunction(constrained, Range.closedOpen(50d, 100d)); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 0d, true, 10d, false)); + constrained = disjunction(constrained, range(DOUBLE, 50d, true, 100d, false)); assertEquals(ranges(constrained).size(), 2); - assertEquals(ranges(constrained).get(0), Range.closedOpen(0d, 10d)); - assertEquals(ranges(constrained).get(1), Range.closedOpen(50d, 100d)); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 0d, true, 10d, false)); + assertEquals(ranges(constrained).get(1), range(DOUBLE, 50d, true, 100d, false)); } /** @@ -198,30 +198,40 @@ public void testAddDisjunction() public void testAddConjunction() { ConnectorHistogram source = new UniformDistributionHistogram(0, 100); - DisjointRangeDomainHistogram constrained = disjunction(source, Range.open(10d, 90d)); - assertEquals(constrained.getRanges().size(), 1); - assertEquals(ranges(constrained).get(0), Range.open(10d, 90d)); - constrained = conjunction(constrained, Range.atMost(50d)); + DisjointRangeDomainHistogram constrained = disjunction(source, rangeOpen(10d, 90d)); + assertEquals(constrained.getRanges().getOrderedRanges().size(), 1); + assertEquals(ranges(constrained).get(0), rangeOpen(10d, 90d)); + constrained = conjunction(constrained, lessThanOrEqual(DOUBLE, 50d)); assertEquals(ranges(constrained).size(), 1); - 
assertEquals(ranges(constrained).get(0), Range.openClosed(10d, 50d)); - constrained = conjunction(constrained, Range.atLeast(25d)); + assertEquals(ranges(constrained).get(0), range(DOUBLE, 10d, false, 50d, true)); + constrained = conjunction(constrained, greaterThanOrEqual(DOUBLE, 25d)); assertEquals(ranges(constrained).size(), 1); - assertEquals(ranges(constrained).get(0), Range.closed(25d, 50d)); + assertEquals(ranges(constrained).get(0), rangeClosed(25d, 50d)); } - private static DisjointRangeDomainHistogram disjunction(ConnectorHistogram source, Range range) + private static DisjointRangeDomainHistogram disjunction(ConnectorHistogram source, com.facebook.presto.common.predicate.Range range) { - return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addDisjunction(source, StatisticRange.fromRange(range)); + return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addDisjunction(source, range); } - private static DisjointRangeDomainHistogram conjunction(ConnectorHistogram source, Range range) + private static DisjointRangeDomainHistogram conjunction(ConnectorHistogram source, com.facebook.presto.common.predicate.Range range) { - return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addConjunction(source, StatisticRange.fromRange(range)); + return (DisjointRangeDomainHistogram) DisjointRangeDomainHistogram.addConjunction(source, range); } - private static List> ranges(DisjointRangeDomainHistogram hist) + private static List ranges(DisjointRangeDomainHistogram hist) { - return hist.getRanges().stream().map(StatisticRange::toRange).collect(Collectors.toList()); + return hist.getRanges().getOrderedRanges(); + } + + private static com.facebook.presto.common.predicate.Range rangeOpen(double low, double high) + { + return range(DOUBLE, low, false, high, false); + } + + private static com.facebook.presto.common.predicate.Range rangeClosed(double low, double high) + { + return range(DOUBLE, low, true, high, true); } private static class 
StandardNormalHistogram @@ -269,7 +279,7 @@ ConnectorHistogram createHistogram() return new DisjointRangeDomainHistogram( new UniformDistributionHistogram( distribution.getSupportLowerBound(), distribution.getSupportUpperBound())) - .addDisjunction(new StatisticRange(0.0, 100.0, 0.0)); + .addDisjunction(rangeClosed(0.0, 100.0)); } @Override diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogram.java similarity index 97% rename from presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java rename to presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogram.java index 26c68b7e5730..341870d138bd 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestHistogram.java +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogram.java @@ -12,9 +12,8 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; import org.apache.commons.math3.distribution.RealDistribution; import org.testng.annotations.Test; diff --git a/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogramCalculator.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogramCalculator.java new file mode 100644 index 000000000000..0632e14247b1 --- /dev/null +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestHistogramCalculator.java @@ -0,0 +1,101 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.facebook.presto.spi.statistics; + +import com.facebook.presto.common.predicate.Range; +import org.testng.annotations.Test; + +import static com.facebook.presto.common.type.DoubleType.DOUBLE; +import static com.facebook.presto.spi.statistics.HistogramCalculator.calculateFilterFactor; +import static java.lang.Double.NEGATIVE_INFINITY; +import static java.lang.Double.NaN; +import static java.lang.Double.POSITIVE_INFINITY; +import static org.testng.Assert.assertEquals; + +public class TestHistogramCalculator +{ + @Test + public void testCalculateFilterFactor() + { + Range zeroToTen = range(0, 10); + Range empty = Range.range(DOUBLE, NaN, true, NaN, true); + + // Equal ranges + assertFilterFactor(Estimate.of(1.0), zeroToTen, 10, uniformHist(0, 10), 5); + assertFilterFactor(Estimate.of(1.0), zeroToTen, 10, uniformHist(0, 10), 20); + + // Some overlap + assertFilterFactor(Estimate.of(0.5), range(5, 3000), 5, uniformHist(zeroToTen), 10); + + // Single value overlap + assertFilterFactor(Estimate.of(1.0 / 10), range(3, 3), 1, uniformHist(zeroToTen), 10); + assertFilterFactor(Estimate.of(1.0 / 10), range(10, 100), 357, uniformHist(zeroToTen), 10); + + // No overlap + assertFilterFactor(Estimate.zero(), range(20, 30), 10, uniformHist(zeroToTen), 10); + + // Empty ranges + assertFilterFactor(Estimate.zero(), zeroToTen, 10, uniformHist(empty), 0); + assertFilterFactor(Estimate.zero(), empty, 0, uniformHist(zeroToTen), 10); + + // no test for (empty, empty) since any return value is correct + assertFilterFactor(Estimate.zero(), 
unboundedRange(), 10, uniformHist(empty), 0); + assertFilterFactor(Estimate.zero(), empty, 0, uniformHist(unboundedRange()), 10); + + // Unbounded (infinite), NDV-based + assertFilterFactor(Estimate.of(0.5), unboundedRange(), 10, uniformHist(unboundedRange()), 20); + assertFilterFactor(Estimate.of(1.0), unboundedRange(), 20, uniformHist(unboundedRange()), 10); + + // NEW TESTS (TPC-H Q2) + // unbounded ranges + assertFilterFactor(Estimate.of(.5), unboundedRange(), 0.5, uniformHist(unboundedRange()), NaN); + // unbounded ranges with limited distinct values + assertFilterFactor(Estimate.of(0.2), unboundedRange(), 1.0, + domainConstrained(unboundedRange(), uniformHist(unboundedRange())), 5.0); + } + + private static Range range(double low, double high) + { + return Range.range(DOUBLE, low, true, high, true); + } + + private static Range unboundedRange() + { + return Range.all(DOUBLE); + } + + private static void assertFilterFactor(Estimate expected, Range range, double distinctValues, ConnectorHistogram histogram, double totalDistinctValues) + { + assertEquals( + calculateFilterFactor(range, distinctValues, histogram, Estimate.estimateFromDouble(totalDistinctValues), true), + expected); + } + + private static ConnectorHistogram uniformHist(Range range) + { + return uniformHist(range.getLow().getObjectValue().map(Double.class::cast).orElse(NEGATIVE_INFINITY), + range.getHigh().getObjectValue().map(Double.class::cast).orElse(POSITIVE_INFINITY)); + } + + private static ConnectorHistogram uniformHist(double low, double high) + { + return new UniformDistributionHistogram(low, high); + } + + private static ConnectorHistogram domainConstrained(Range range, ConnectorHistogram source) + { + return DisjointRangeDomainHistogram.addDisjunction(source, range); + } +} diff --git a/presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestUniformHistogram.java similarity index 93% rename from 
presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java rename to presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestUniformHistogram.java index 395bc3f6e751..e1d3dc0b6f16 100644 --- a/presto-main/src/test/java/com/facebook/presto/cost/TestUniformHistogram.java +++ b/presto-spi/src/test/java/com/facebook/presto/spi/statistics/TestUniformHistogram.java @@ -12,11 +12,8 @@ * limitations under the License. */ -package com.facebook.presto.cost; +package com.facebook.presto.spi.statistics; -import com.facebook.presto.spi.statistics.ConnectorHistogram; -import com.facebook.presto.spi.statistics.Estimate; -import com.google.common.base.VerifyException; import org.apache.commons.math3.distribution.RealDistribution; import org.apache.commons.math3.distribution.UniformRealDistribution; import org.testng.annotations.Test; @@ -48,7 +45,7 @@ RealDistribution getDistribution() @Test public void testInvalidConstruction() { - assertThrows(VerifyException.class, () -> new UniformDistributionHistogram(2.0, 1.0)); + assertThrows(IllegalArgumentException.class, () -> new UniformDistributionHistogram(2.0, 1.0)); } @Test