Skip to content

Commit

Permalink
Add various Cuda decoders (facebookincubator#9038)
Browse files Browse the repository at this point in the history
Summary:
Adds decoders for a set of columnar encodings and test. Hooks these up to Wave stream. A DecodePrograms is a set of thread block level programs eahc running on its own TB and consisting of sequential decode steps.

Adds a copy of TypeKind type enum to Operand.h.  The original definition has includes that are incompatible with Cuda.

Used in subsequent Wave table scan.

Pull Request resolved: facebookincubator#9038

Reviewed By: Yuhta

Differential Revision: D54775533

Pulled By: oerling

fbshipit-source-id: 768354aa3ed14213da9256937d2fd2dd973b1abe
  • Loading branch information
Ubuntu authored and facebook-github-bot committed Mar 13, 2024
1 parent 5bdb787 commit 17fc58b
Show file tree
Hide file tree
Showing 9 changed files with 1,754 additions and 0 deletions.
1 change: 1 addition & 0 deletions velox/experimental/wave/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@
add_subdirectory(common)
add_subdirectory(exec)
add_subdirectory(vector)
add_subdirectory(dwio/decode)
21 changes: 21 additions & 0 deletions velox/experimental/wave/dwio/decode/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

add_subdirectory(tests)

add_library(velox_wave_decode GpuDecoder.cu)

set_target_properties(velox_wave_decode PROPERTIES CUDA_ARCHITECTURES native)

target_link_libraries(velox_wave_decode velox_wave_common)
236 changes: 236 additions & 0 deletions velox/experimental/wave/dwio/decode/DecodeStep.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include "velox/experimental/wave/common/Buffer.h"
#include "velox/experimental/wave/common/GpuArena.h"
#include "velox/experimental/wave/vector/Operand.h"

namespace facebook::velox::wave {

/// Instructions for GPU decode. This can be decoding,
/// or pre/post processing other than decoding.
enum class DecodeStep {
kConstant32,
kConstant64,
kConstantChar,
kConstantBool,
kConstantBytes,
kTrivial,
kTrivialNoOp,
kMainlyConstant,
kBitpack32,
kBitpack64,
kRleTotalLength,
kRleBool,
kRle,
kDictionary,
kDictionaryOnBitpack,
kVarint,
kNullable,
kSentinel,
kSparseBool,
kMakeScatterIndices,
kScatter32,
kScatter64,
kLengthToOffset,
kMissing,
kStruct,
kArray,
kMap,
kFlatMap,
kFlatMapNode,
kUnsupported,
};

/// Describes a decoding loop's input and result disposition.
struct GpuDecode {
// The operation to perform. Decides which branch of the union to use.
DecodeStep step;

struct Trivial {
// Type of the input and result data.
WaveTypeKind dataType;
// Input data.
const void* input;
// Begin position for input, scatter and result.
int begin;
// End position (exclusive) for input, scatter and result.
int end;
// If not null, contains the output position relative to result pointer.
const int32_t* scatter;
// Starting address of the result.
void* result;
};

struct MainlyConstant {
// Type of the values and result.
WaveTypeKind dataType;
// Number of total values that should be written to result.
int count;
// Common value that is repeated.
const void* commonValue;
// Sparse values that will be picked up when isCommon is false.
const void* otherValues;
// Bitmask indicating whether a values is common or not.
const uint8_t* isCommon;
// Temporary storage to keep non-common value indices. Should be
// preallocated to at least count large.
int32_t* otherIndices;
// Starting address of the result.
void* result;
// If non-null, the count of non-common elements will be written to this
// address.
int32_t* otherCount;
};

struct DictionaryOnBitpack {
// Type of the alphabet and result.
WaveTypeKind dataType;
// Dictionary alphabet.
const void* alphabet;
// Indices into the alphabet.
const uint64_t* indices;
// Begin position for indices, scatter and result.
int begin;
// End position (exclusive) for indices, scatter and result.
int end;
// Bit width of each index.
int bitWidth;
// If not null, contains the output position relative to result pointer.
const int32_t* scatter;
// All indices should be offseted by this baseline.
int64_t baseline;
// Starting address of the result.
void* result;
};

struct Varint {
// Address of the input data.
const char* input;
// Byte size of the input data.
int size;
// Temporary storage to keep whether each byte is a terminal for a number.
// Should be allocated at least "size" large.
bool* ends;
// Temporary storage to keep the location of end position of each number.
// Should be allocated at least "size" large.
int32_t* endPos;
// Type of the result number.
WaveTypeKind resultType;
// Starting address of the result.
void* result;
// Count of the numbers in the result.
int resultSize;
};

struct SparseBool {
// Number of bits in the result.
int totalCount;
// Sparse value; common value is the opposite of this.
bool sparseValue;
// Bit indices where sparse value should be stored.
const int32_t* sparseIndices;
// Number of sparse values.
int sparseCount;
// Temporary storage to keep bool representation of bits. Should be at
// least totalCount large.
bool* bools;
// Address of the result bits. We do not support unaligned result because
// different blocks could access the same byte and cause racing condition.
uint8_t* result;
};

struct RleTotalLength {
// Input data to be summed.
const int32_t* input;
// Number of input data.
int count;
// The sum of input data.
int64_t* result;
};

struct Rle {
// Type of values and result.
WaveTypeKind valueType;
// Values that will be repeated.
const void* values;
// Length of each value.
const int32_t* lengths;
// Number of values and lengths.
int count;
// Starting address of the result.
const void* result;
};

struct MakeScatterIndices {
// Input bits.
const uint8_t* bits;
// Whether set or unset bits should be found in the input.
bool findSetBits;
// Begin bit position of the input.
int begin;
// End bit position (exclusive) of the input.
int end;
// Address of the result indices.
int32_t* indices;
// If non-null, store the number of indices being written.
int32_t* indicesCount;
};

union {
Trivial trivial;
MainlyConstant mainlyConstant;
DictionaryOnBitpack dictionaryOnBitpack;
Varint varint;
SparseBool sparseBool;
RleTotalLength rleTotalLength;
Rle rle;
MakeScatterIndices makeScatterIndices;
} data;

/// Returns the amount f shared memory for standard size thread block for
/// 'step'.
int32_t sharedMemorySize() const;
};

struct DecodePrograms {
// Set of decode programs submitted as a unit. Each vector<DecodeStep> is run
// on its own thread block. The consecutive DecodeSteps in the same program
// are consecutive and the next one can depend on a previous one.
std::vector<std::vector<std::unique_ptr<GpuDecode>>> programs;

/// Unified or device memory buffer where steps in 'programs' write results
/// for the host. Decode results stay on device, only control information like
/// filter result counts or length sums come to the host via this buffer. If
/// nullptr, no data transfer is scheduled. 'result' should be nullptr if all
/// steps are unconditional, like simple decoding.
WaveBufferPtr result;
/// Host addressable copy of 'result'. If result is unified memory
/// this can be nullptr and we just enqueue a prefetch. to host. If
/// 'result' is device memory, this should be pinned host memory
/// with the same size.
WaveBufferPtr hostResult;
};

void launchDecode(
const DecodePrograms& programs,
GpuArena* arena,
WaveBufferPtr& extra,
Stream* stream);

} // namespace facebook::velox::wave
Loading

0 comments on commit 17fc58b

Please sign in to comment.