Skip to content

Commit

Permalink
Add null support to PrefixSortEncoder (facebookincubator#8350)
Browse files Browse the repository at this point in the history
Summary:
When processing normalization-encoding in the PrefixSort, we cannot simply equate null to max-value or min-value. Because in this way, we cannot distinguish between null and max/min value in a null-first scenario, and we have to add a nullbyte in normalize encoding.
The changes :
1. Add CompareFlag: ascending nullsFirst and isNull to the encode() method
2. Add new method encodeNoNulls to handle cases where there are no nulls, suports: uint64_t/int64_t/uint32_t/int32_t/float/double/Timestamp

Design doc: https://docs.google.com/document/d/1wa1lbbR-bhf0eg1mSaH7JUzeG7vhwz94a6ElUTK0J8k/edit?usp=sharing

Part of facebookincubator#6766

Pull Request resolved: facebookincubator#8350

Reviewed By: pedroerp

Differential Revision: D53348718

Pulled By: mbasmanova

fbshipit-source-id: 6f7887fb9d09b17af6a786de82fc00e116156a62
  • Loading branch information
skadilover authored and root committed Feb 12, 2024
1 parent af27dfc commit 862c7a6
Show file tree
Hide file tree
Showing 4 changed files with 531 additions and 18 deletions.
181 changes: 166 additions & 15 deletions velox/exec/prefixsort/PrefixSortEncoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,33 +22,184 @@
#include "velox/common/base/BitUtil.h"
#include "velox/common/base/Exceptions.h"
#include "velox/common/base/SimdUtil.h"
#include "velox/type/Timestamp.h"

namespace facebook::velox::exec::prefixsort {

/// Provides encode/decode methods for PrefixSort.
class PrefixSortEncoder {
public:
/// 1. Only int64_t is supported now.
/// 2. Encoding is compatible with sorting ascending with no nulls.
PrefixSortEncoder(bool ascending, bool nullsFirst)
: ascending_(ascending), nullsFirst_(nullsFirst){};

/// Encode native primitive types(such as uint64_t, int64_t, uint32_t,
/// int32_t, float, double, Timestamp). TODO: Add support for strings.
/// 1. The first byte of the encoded result is null byte. The value is 0 if
/// (nulls first and value is null) or (nulls last and value is not null).
/// Otherwise, the value is 1.
/// 2. The remaining bytes are the encoding result of value:
/// -If value is null, we set the remaining sizeof(T) bytes to '0', they
/// do not affect the comparison results at all.
/// -If value is not null, the result is set by calling encodeNoNulls.
template <typename T>
static FOLLY_ALWAYS_INLINE void encode(T value, char* row);
FOLLY_ALWAYS_INLINE void encode(std::optional<T> value, char* dest) const {
if (value.has_value()) {
dest[0] = nullsFirst_ ? 1 : 0;
encodeNoNulls(value.value(), dest + 1);
} else {
dest[0] = nullsFirst_ ? 0 : 1;
simd::memset(dest + 1, 0, sizeof(T));
}
}

private:
FOLLY_ALWAYS_INLINE static uint8_t flipSignBit(uint8_t byte) {
return byte ^ 128;
/// @tparam T Type of value. Supported type are: uint64_t, int64_t, uint32_t,
/// int32_t, float, double, Timestamp. TODO Add support for int16_t, uint16_t.
template <typename T>
FOLLY_ALWAYS_INLINE void encodeNoNulls(T value, char* dest) const;

bool isAscending() const {
return ascending_;
}

bool isNullsFirst() const {
return nullsFirst_;
}

private:
const bool ascending_;
const bool nullsFirst_;
};

/// Assuming that value is little-endian encoded, we encode it as follows to
/// make sure encoding is compatible with sorting ascending:
/// 1 Reverse each byte, for example, 0xaabbccdd becomes 0xddccbbaa.
/// 2 Flip the sign bit.
/// The decode logic is exactly the opposite of the above approach.
/// Assuming that value is little-endian encoded, means:
/// for an unsigned integer '0x aa bb cc dd', The content of bytes,
/// starting at the address of it, would be '0xdd 0xcc 0xbb 0xaa'. If we store
/// them into a buffer, and reverse the bytes of the buffer : [0xaa, 0xbb,
/// 0xcc, 0xdd], and then we can compare two buffers from the first byte to
/// last byte, the compare result is equal to value-compare. For any two
/// unsigned integers, a < b <==> ~a > ~b so we invert bits when descending
/// order.
template <>
FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
uint32_t value,
char* dest) const {
auto& v = *reinterpret_cast<uint32_t*>(dest);
v = __builtin_bswap32(value);
if (!ascending_) {
v = ~v;
}
}

/// Compare two positive signed integers: storage layout is as same as
/// unsigned integer, their sign-bit are same, flip sign-bit do not change
/// result. Compare two negative signed integers: -n = ~n + 1, we can treat ~n
/// + 1 as an unsigned integer, so the logic is as same as unsigned integer,
/// also flip sign-bit do not change result. Compare positive vs negative:
/// flip sign-bit to promise that positive always bigger than negative.
template <>
FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
int32_t value,
char* dest) const {
encodeNoNulls((uint32_t)(value ^ (1u << 31)), dest);
}

/// Logic is as same as int32_t.
template <>
FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
uint64_t value,
char* dest) const {
auto& v = *reinterpret_cast<uint64_t*>(dest);
v = __builtin_bswap64(value);
if (!ascending_) {
v = ~v;
}
}

template <>
FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
int64_t value,
char* dest) const {
encodeNoNulls((uint64_t)(value ^ (1ull << 63)), dest);
}

namespace detail {
/// Convert double to a uint64_t, their value comparison semantics remain
/// consistent.
static FOLLY_ALWAYS_INLINE uint64_t encodeDouble(double value) {
// Zero is the smallest positive value.
if (value == 0) {
return 1ull << 63;
}
// Nan is max value.
if (std::isnan(value)) {
return std::numeric_limits<uint64_t>::max();
}
// Infinity is the second max value.
if (value > std::numeric_limits<double>::max()) {
return std::numeric_limits<uint64_t>::max() - 1;
}
// -Infinity is the smallest value.
if (value < -std::numeric_limits<double>::max()) {
return 0;
}
auto encoded = *reinterpret_cast<uint64_t*>(&value);
if ((encoded & (1ull << 63)) == 0) {
// For positive numbers, set sign bit to 1.
encoded |= (1ull << 63);
} else {
// For negative numbers, invert bits to get the opposite order.
encoded = ~encoded;
}
return encoded;
}

// Logic is as same as double.
static FOLLY_ALWAYS_INLINE uint32_t encodeFloat(float value) {
if (value == 0) {
return 1u << 31;
}
if (std::isnan(value)) {
return std::numeric_limits<uint32_t>::max();
}
if (value > std::numeric_limits<float>::max()) {
return std::numeric_limits<uint32_t>::max() - 1;
}
if (value < -std::numeric_limits<float>::max()) {
return 0;
}
auto encoded = *reinterpret_cast<uint32_t*>(&value);
if ((encoded & (1u << 31)) == 0) {
encoded |= (1u << 31);
} else {
encoded = ~encoded;
}
return encoded;
}
} // namespace detail

/// The result of encodeDouble() keeps value comparison semantics, then we
/// can treat it as an unsigned-integer.
template <>
FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
double value,
char* dest) const {
encodeNoNulls(detail::encodeDouble(value), dest);
}

template <>
FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
float value,
char* dest) const {
encodeNoNulls(detail::encodeFloat(value), dest);
}

/// When comparing Timestamp, first compare seconds and then compare nanos, so
/// when encoding, just encode seconds and nanos in sequence.
template <>
FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encode(int64_t value, char* row) {
const auto v = __builtin_bswap64(static_cast<uint64_t>(value));
simd::memcpy(row, &v, sizeof(int64_t));
row[0] = flipSignBit(row[0]);
FOLLY_ALWAYS_INLINE void PrefixSortEncoder::encodeNoNulls(
Timestamp value,
char* dest) const {
encodeNoNulls(value.getSeconds(), dest);
encodeNoNulls(value.getNanos(), dest + 8);
}

} // namespace facebook::velox::exec::prefixsort
3 changes: 2 additions & 1 deletion velox/exec/prefixsort/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
# limitations under the License.
add_subdirectory(utils)

add_executable(velox_exec_prefixsort_test PrefixSortAlgorithmTest.cpp)
add_executable(velox_exec_prefixsort_test PrefixSortAlgorithmTest.cpp
PrefixEncoderTest.cpp)

add_test(
NAME velox_exec_prefixsort_test
Expand Down
Loading

0 comments on commit 862c7a6

Please sign in to comment.