Skip to content

Commit

Permalink
Add run time compilation
Browse files Browse the repository at this point in the history
- Adds a CompiledModule abstraction on top of Cuda run time compilation.

- Adds a cache of run time compiled kernels. The cache returns a
  kernel immediately and leaves the kernel compiling in the
  background. The kernel's methods wait for the compilation to be
  ready.

- Tests that runtime API and driver API streams are interchangeable
  when running a dynamically generated kernel.

- Adds proper use of contexts, one per device. The contexts are needed
  because the driver API is used to handle run time compilation.

- Add device properties to the Device* struct.
  • Loading branch information
Orri Erling committed Oct 10, 2024
1 parent 7356542 commit 0480a22
Show file tree
Hide file tree
Showing 25 changed files with 765 additions and 82 deletions.
2 changes: 1 addition & 1 deletion scripts/setup-centos9.sh
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ function install_protobuf {
wget_and_untar https://github.com/protocolbuffers/protobuf/releases/download/v21.8/protobuf-all-21.8.tar.gz protobuf
(
cd ${DEPENDENCY_DIR}/protobuf
./configure --prefix=${INSTALL_PREFIX}
./configure CXXFLAGS="-fPIC" --prefix=${INSTALL_PREFIX}
make "-j${NPROC}"
make install
ldconfig
Expand Down
12 changes: 12 additions & 0 deletions scripts/setup-ubuntu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,17 @@ function install_boost {
)
}

# Downloads protobuf 21.8, builds it from source with -fPIC and installs it
# under INSTALL_PREFIX. The build runs in a subshell so that the caller's
# working directory is left untouched.
function install_protobuf {
  wget_and_untar https://github.com/protocolbuffers/protobuf/releases/download/v21.8/protobuf-all-21.8.tar.gz protobuf
  (
    # Stop here if the source directory is missing; otherwise configure and
    # make would run in whatever directory the caller happened to be in.
    cd "${DEPENDENCY_DIR}/protobuf" || exit 1
    ./configure CXXFLAGS="-fPIC" --prefix="${INSTALL_PREFIX}"
    make "-j${NPROC}"
    make install
    ldconfig
  )
}

function install_folly {
wget_and_untar https://github.com/facebook/folly/archive/refs/tags/${FB_OS_VERSION}.tar.gz folly
cmake_install_dir folly -DBUILD_TESTS=OFF -DFOLLY_HAVE_INT128_T=ON
Expand Down Expand Up @@ -238,6 +249,7 @@ function install_cuda {
function install_velox_deps {
run_and_time install_velox_deps_from_apt
run_and_time install_fmt
run_and_time install_protobuf
run_and_time install_boost
run_and_time install_folly
run_and_time install_fizz
Expand Down
2 changes: 0 additions & 2 deletions velox/exec/fuzzer/AggregationFuzzerOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@ struct AggregationFuzzerOptions {

std::unordered_set<std::string> orderDependentFunctions;

std::unordered_map<std::string, DataSpec> functionDataSpec;

/// Timestamp precision to use when generating inputs of type TIMESTAMP.
VectorFuzzer::Options::TimestampPrecision timestampPrecision{
VectorFuzzer::Options::TimestampPrecision::kMilliSeconds};
Expand Down
5 changes: 4 additions & 1 deletion velox/exec/fuzzer/AggregationFuzzerRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,12 +107,15 @@ class AggregationFuzzerRunner {
registerVectorSerde();
facebook::velox::filesystems::registerLocalFileSystem();

auto& aggregationFunctionDataSpecs =
referenceQueryRunner->aggregationFunctionDataSpecs();

facebook::velox::exec::test::aggregateFuzzer(
filteredSignatures,
seed,
options.customVerificationFunctions,
options.customInputGenerators,
options.functionDataSpec,
aggregationFunctionDataSpecs,
options.timestampPrecision,
options.queryConfigs,
options.hiveConfigs,
Expand Down
23 changes: 23 additions & 0 deletions velox/exec/fuzzer/DuckQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,29 @@ const std::vector<TypePtr>& DuckQueryRunner::supportedScalarTypes() const {
return kScalarTypes;
}

const std::unordered_map<std::string, DataSpec>&
DuckQueryRunner::aggregationFunctionDataSpecs() const {
// There are some functions for which DuckDB and Velox have inconsistent
// behavior with Nan and Infinity, so we exclude those.
static const std::unordered_map<std::string, DataSpec>
kAggregationFunctionDataSpecs{
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
{"histogram", DataSpec{false, false}},
{"regr_avgx", DataSpec{true, false}},
{"regr_avgy", DataSpec{true, false}},
{"regr_intercept", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"var_pop", DataSpec{false, false}}};

return kAggregationFunctionDataSpecs;
}

std::multiset<std::vector<velox::variant>> DuckQueryRunner::execute(
const std::string& sql,
const std::vector<RowVectorPtr>& input,
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/fuzzer/DuckQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ class DuckQueryRunner : public ReferenceQueryRunner {
/// TODO Investigate mismatches reported when comparing Varbinary.
const std::vector<TypePtr>& supportedScalarTypes() const override;

const std::unordered_map<std::string, DataSpec>&
aggregationFunctionDataSpecs() const override;

/// Specify names of aggregate functions to exclude from the list of supported
/// functions. Used to exclude functions that are non-deterministic, have bugs
/// or whose semantics differ from Velox.
Expand Down
23 changes: 23 additions & 0 deletions velox/exec/fuzzer/PrestoQueryRunner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,29 @@ const std::vector<TypePtr>& PrestoQueryRunner::supportedScalarTypes() const {
return kScalarTypes;
}

const std::unordered_map<std::string, DataSpec>&
PrestoQueryRunner::aggregationFunctionDataSpecs() const {
// For some functions, velox supports NaN, Infinity better than presto query
// runner, which makes the comparison impossible.
// Add data constraint in vector fuzzer to enforce to not generate such data
// for those functions before they are fixed in presto query runner
static const std::unordered_map<std::string, DataSpec>
kAggregationFunctionDataSpecs{
{"regr_avgx", DataSpec{false, false}},
{"regr_avgy", DataSpec{false, false}},
{"regr_r2", DataSpec{false, false}},
{"regr_sxx", DataSpec{false, false}},
{"regr_syy", DataSpec{false, false}},
{"regr_sxy", DataSpec{false, false}},
{"regr_slope", DataSpec{false, false}},
{"regr_replacement", DataSpec{false, false}},
{"covar_pop", DataSpec{true, false}},
{"covar_samp", DataSpec{true, false}},
};

return kAggregationFunctionDataSpecs;
}

std::optional<std::string> PrestoQueryRunner::toSql(
const std::shared_ptr<const core::AggregationNode>& aggregationNode) {
// Assume plan is Aggregation over Values.
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/fuzzer/PrestoQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ class PrestoQueryRunner : public velox::exec::test::ReferenceQueryRunner {

const std::vector<TypePtr>& supportedScalarTypes() const override;

const std::unordered_map<std::string, DataSpec>&
aggregationFunctionDataSpecs() const override;

/// Converts Velox query plan to Presto SQL. Supports Values -> Aggregation or
/// Window with an optional Project on top.
///
Expand Down
3 changes: 3 additions & 0 deletions velox/exec/fuzzer/ReferenceQueryRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ class ReferenceQueryRunner {
return defaultScalarTypes();
}

/// Returns the DataSpec to use for individual aggregate functions, keyed by
/// function name. Pure virtual: every reference query runner supplies its own
/// map.
virtual const std::unordered_map<std::string, DataSpec>&
aggregationFunctionDataSpecs() const = 0;

/// Converts Velox plan into SQL accepted by the reference database.
/// @return std::nullopt if the plan uses features not supported by the
/// reference database.
Expand Down
5 changes: 4 additions & 1 deletion velox/exec/fuzzer/WindowFuzzerRunner.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,14 +79,17 @@ class WindowFuzzerRunner {
registerVectorSerde();
facebook::velox::filesystems::registerLocalFileSystem();

auto& aggregationFunctionDataSpecs =
referenceQueryRunner->aggregationFunctionDataSpecs();

facebook::velox::exec::test::windowFuzzer(
filteredAggregationSignatures,
filteredWindowSignatures,
seed,
options.customVerificationFunctions,
options.customInputGenerators,
options.orderDependentFunctions,
options.functionDataSpec,
aggregationFunctionDataSpecs,
options.timestampPrecision,
options.queryConfigs,
options.hiveConfigs,
Expand Down
2 changes: 2 additions & 0 deletions velox/experimental/wave/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@ velox_add_library(
velox_wave_common
GpuArena.cpp
Buffer.cpp
Compile.cu
Cuda.cu
Exception.cpp
KernelCache.cpp
Type.cpp
ResultStaging.cpp)

Expand Down
165 changes: 165 additions & 0 deletions velox/experimental/wave/common/Compile.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <fmt/format.h>
#include <gflags/gflags.h>
#include <nvrtc.h>
#include "velox/experimental/wave/common/Cuda.h"
#include "velox/experimental/wave/common/CudaUtil.cuh"
#include "velox/experimental/wave/common/Exception.h"

// Command line flag whose value is passed to NVRTC as --gpu-architecture when
// compiling generated kernels. Defaults to "compute_80".
DEFINE_string(
wavegen_architecture,
"compute_80",
"--gpu-architecture flag for generated code");

namespace facebook::velox::wave {

/// Passes the NVRTC error string for 'result' to waveError unless 'result' is
/// NVRTC_SUCCESS, in which case this is a no-op.
void nvrtcCheck(nvrtcResult result) {
  if (result == NVRTC_SUCCESS) {
    return;
  }
  waveError(nvrtcGetErrorString(result));
}

/// CompiledModule backed by a CUDA driver API module. The instance owns
/// 'module_' and unloads it on destruction.
class CompiledModuleImpl : public CompiledModule {
 public:
  /// @param module Loaded module; ownership passes to this object.
  /// @param kernels Function handles from 'module'. launch() and info()
  /// address them by their index in this vector.
  CompiledModuleImpl(CUmodule module, std::vector<CUfunction> kernels)
      : module_(module), kernels_(std::move(kernels)) {}

  // The destructor unloads 'module_', so a copy would unload the same module
  // twice. Instances are only meant to be held through a shared_ptr.
  CompiledModuleImpl(const CompiledModuleImpl&) = delete;
  CompiledModuleImpl& operator=(const CompiledModuleImpl&) = delete;

  ~CompiledModuleImpl() {
    // Destructors must not throw: a failure to unload is logged only.
    auto result = cuModuleUnload(module_);
    if (result != CUDA_SUCCESS) {
      LOG(ERROR) << "Error in unloading module " << result;
    }
  }

  /// Launches kernel number 'kernelIdx' on 'stream' with a one-dimensional
  /// grid of 'numBlocks' blocks of 'numThreads' threads and 'shared' bytes of
  /// dynamic shared memory. 'args' is the kernel parameter array handed to
  /// cuLaunchKernel.
  void launch(
      int32_t kernelIdx,
      int32_t numBlocks,
      int32_t numThreads,
      int32_t shared,
      Stream* stream,
      void** args) override;

  /// Returns register, shared memory and occupancy figures for kernel number
  /// 'kernelIdx'.
  KernelInfo info(int32_t kernelIdx) override;

 private:
  // Driver API module holding the compiled kernels. Owned.
  CUmodule module_;

  // Function handles inside 'module_', indexed by kernel number.
  std::vector<CUfunction> kernels_;
};

std::shared_ptr<CompiledModule> CompiledModule::create(const KernelSpec& spec) {
nvrtcProgram prog;
nvrtcCreateProgram(
&prog,
spec.code.c_str(), // buffer
spec.filePath.c_str(), // name
spec.numHeaders, // numHeaders
spec.headers, // headers
spec.headerNames); // includeNames
for (auto& name : spec.entryPoints) {
nvrtcCheck(nvrtcAddNameExpression(prog, name.c_str()));
}
auto architecture =
fmt::format("--gpu-architecture={}", FLAGS_wavegen_architecture);
const char* opts[] = {
architecture.c_str(),
#ifndef NDEBUG
"-G"
#else
"-O3"
#endif
};
auto compileResult = nvrtcCompileProgram(
prog, // prog
sizeof(opts) / sizeof(char*), // numOptions
opts); // options

size_t logSize;

nvrtcGetProgramLogSize(prog, &logSize);
std::string log;
log.resize(logSize);
nvrtcGetProgramLog(prog, log.data());

if (compileResult != NVRTC_SUCCESS) {
nvrtcDestroyProgram(&prog);
waveError(std::string("Cuda compilation error: ") + log);
}
// Obtain PTX from the program.
size_t ptxSize;
nvrtcCheck(nvrtcGetPTXSize(prog, &ptxSize));
std::string ptx;
ptx.resize(ptxSize);
nvrtcCheck(nvrtcGetPTX(prog, ptx.data()));
std::vector<std::string> loweredNames;
for (auto& entry : spec.entryPoints) {
const char* temp;
nvrtcCheck(nvrtcGetLoweredName(prog, entry.c_str(), &temp));
loweredNames.push_back(std::string(temp));
}

nvrtcDestroyProgram(&prog);

CUmodule module;
CU_CHECK(cuModuleLoadDataEx(&module, ptx.data(), 0, 0, 0));
std::vector<CUfunction> funcs;
for (auto& name : loweredNames) {
funcs.emplace_back();
CU_CHECK(cuModuleGetFunction(&funcs.back(), module, name.c_str()));
}
return std::make_shared<CompiledModuleImpl>(module, std::move(funcs));
}

/// Launches kernel number 'kernelIdx' of this module on 'stream'.
///
/// @param kernelIdx Index of the kernel in the module's kernel list. An
/// out-of-range index throws std::out_of_range.
/// @param numBlocks Grid size in x; the y and z grid dimensions are 1.
/// @param numThreads Block size in x; the y and z block dimensions are 1.
/// @param shared Bytes of dynamic shared memory per block.
/// @param stream Stream to launch on. Its underlying handle is reinterpreted
/// as a driver API CUstream.
/// @param args Kernel parameter array passed through to cuLaunchKernel.
void CompiledModuleImpl::launch(
    int32_t kernelIdx,
    int32_t numBlocks,
    int32_t numThreads,
    int32_t shared,
    Stream* stream,
    void** args) {
  // at() turns a bad index into an exception instead of reading past the end
  // of 'kernels_'.
  auto function = kernels_.at(kernelIdx);
  auto cuStream = reinterpret_cast<CUstream>(stream->stream()->stream);
  CU_CHECK(cuLaunchKernel(
      function,
      numBlocks,
      1,
      1, // grid dim
      numThreads,
      1,
      1, // block dim
      shared,
      cuStream,
      args,
      nullptr)); // extra
}

/// Returns resource usage and occupancy figures for kernel number 'kernelIdx'.
///
/// Fills in the register count, the shared memory size and the maximum
/// threads per block reported by cuFuncGetAttribute, plus the maximum number
/// of active blocks per multiprocessor for a block size of 256 with 0 bytes
/// and with 256 * 32 bytes of dynamic shared memory.
///
/// An out-of-range 'kernelIdx' throws std::out_of_range. A failing driver API
/// call is reported through CU_CHECK instead of leaving fields of the result
/// unset.
KernelInfo CompiledModuleImpl::info(int32_t kernelIdx) {
  // Block size used for both occupancy queries.
  constexpr int32_t kBlockSize = 256;
  // Dynamic shared memory for the second occupancy query, i.e. 32 bytes for
  // each thread of the block.
  constexpr size_t kSharedBytes = kBlockSize * 32;

  KernelInfo info;
  auto function = kernels_.at(kernelIdx);
  CU_CHECK(
      cuFuncGetAttribute(&info.numRegs, CU_FUNC_ATTRIBUTE_NUM_REGS, function));
  CU_CHECK(cuFuncGetAttribute(
      &info.sharedMemory, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, function));
  CU_CHECK(cuFuncGetAttribute(
      &info.maxThreadsPerBlock,
      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
      function));

  int32_t maxBlocks = 0;
  CU_CHECK(cuOccupancyMaxActiveBlocksPerMultiprocessor(
      &maxBlocks, function, kBlockSize, 0));
  info.maxOccupancy0 = maxBlocks;
  CU_CHECK(cuOccupancyMaxActiveBlocksPerMultiprocessor(
      &maxBlocks, function, kBlockSize, kSharedBytes));
  info.maxOccupancy32 = maxBlocks;
  return info;
}

} // namespace facebook::velox::wave
Loading

0 comments on commit 0480a22

Please sign in to comment.