Skip to content

Commit

Permalink
Add Cuda roundtrip meter (facebookincubator#9037)
Browse files Browse the repository at this point in the history
Summary:
Adds a test that runs Cuda programs consisting of data transfers and memory operations. See the code comments for the language used to customize the programs. Generates a test matrix of throughputs and latencies for different numbers of threads, data sizes, and operation mixes. Measures the same for both GPU and CPU.

Makes GpuArena.h free of folly so it can be included together with Cuda headers.

Pull Request resolved: facebookincubator#9037

Reviewed By: Yuhta

Differential Revision: D54772459

Pulled By: oerling

fbshipit-source-id: 56709533780f01d59794dc98c2ab7290e0cb7fae
  • Loading branch information
Ubuntu authored and facebook-github-bot committed Mar 13, 2024
1 parent 9b6935b commit 5bdb787
Show file tree
Hide file tree
Showing 7 changed files with 538 additions and 13 deletions.
7 changes: 4 additions & 3 deletions velox/experimental/wave/common/Buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@

#pragma once
#include <boost/intrusive_ptr.hpp>
#include "velox/common/base/Exceptions.h"
#include <atomic>
#include <cstdint>

namespace facebook::velox::wave {

Expand Down Expand Up @@ -45,7 +46,7 @@ class Buffer {
}

void setSize(size_t newSize) {
VELOX_DCHECK_LE(newSize, capacity_);
assert(newSize <= capacity_);
size_ = newSize;
}

Expand All @@ -54,7 +55,7 @@ class Buffer {
}

bool unpin() {
VELOX_DCHECK_LT(0, pinCount_);
assert(0 < pinCount_);
return --pinCount_ == 0;
}

Expand Down
2 changes: 1 addition & 1 deletion velox/experimental/wave/common/CudaUtil.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ void cudaCheck(cudaError_t err, const char* file, int line);
#define CUDA_CHECK(e) ::facebook::velox::wave::cudaCheck(e, __FILE__, __LINE__)

template <typename T, typename U>
constexpr inline T roundUp(T value, U factor) {
__host__ __device__ constexpr inline T roundUp(T value, U factor) {
return (value + (factor - 1)) / factor * factor;
}

Expand Down
2 changes: 1 addition & 1 deletion velox/experimental/wave/common/GpuArena.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ class GpuSlab {

// Returns an address for at least 'bytes' of memory inside this slab, nullptr
// if there is no contiguous run of at least 'bytes'.
void* FOLLY_NULLABLE allocate(uint64_t bytes);
void* allocate(uint64_t bytes);

/// Frees an area returned by allocate().
void free(void* address, uint64_t bytes);
Expand Down
1 change: 1 addition & 0 deletions velox/experimental/wave/common/tests/BlockTest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ void BlockTestStream::testBoolToIndices(
int32_t** indices,
int32_t* sizes,
int64_t* times) {
CUDA_CHECK(cudaGetLastError());
auto tempBytes = sizeof(typename ScanAlgorithm::TempStorage);
boolToIndices<<<numBlocks, 256, tempBytes, stream_->stream>>>(
flags, indices, sizes, times);
Expand Down
Loading

0 comments on commit 5bdb787

Please sign in to comment.