force TE/PyTorch to always let Userbuffers manually allocate its buffer for comm+GEMM overlap

Signed-off-by: Alp Dener <adener@nvidia.com>
denera committed Jul 25, 2024
1 parent ee34110 commit ffb9010
Showing 1 changed file with 6 additions and 25 deletions.
transformer_engine/pytorch/csrc/extensions/comm_gemm_overlap.cpp: 6 additions & 25 deletions
@@ -117,17 +117,8 @@ te_torch::CommGemmOverlap::CommGemmOverlap(torch::Tensor sample, int world_rank,
   _ubuf_dtype = (sample.element_size() == 1) ? te::DType::kFloat8E4M3
                                              : GetTransformerEngineDType(sample.scalar_type());
   void *ubuf_ptr;
-  if (te::getenv<bool>("UB_SKIPMC")) {
-    // Multicast is disabled so we have to pre-allocate the buffer here.
-    _ubuf = torch::empty({sample.size(0), sample.size(1)}, sample.options());
-    ubuf_ptr = _ubuf.data_ptr();
-    this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, false);
-  } else {
-    // Multicast requires UB to allocate the buffer with specific memory options
-    // that PyTorch allocator does not support.
-    this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, true);
-    _ubuf = torch::from_blob(ubuf_ptr, {sample.size(0), sample.size(1)}, sample.options());
-  }
+  this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, true);
+  _ubuf = torch::from_blob(ubuf_ptr, {sample.size(0), sample.size(1)}, sample.options());
 
   if (_atomic_gemm) {
     auto counter_options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
@@ -406,20 +397,10 @@ te_torch::CommGemmOverlapP2P::CommGemmOverlapP2P(
                                              : GetTransformerEngineDType(sample.scalar_type());
 
   void *ubuf_ptr;
-  if (te::getenv<bool>("UB_SKIPMC")) {
-    // Multicast is disabled so we have to pre-allocate the buffer here.
-    _ubuf = torch::empty({(sample.size(0) / _tp_size) * _num_ubuf_chunks, sample.size(1)},
-                         sample.options());
-    ubuf_ptr = _ubuf.data_ptr();
-    this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, false);
-  } else {
-    // Multicast requires UB to allocate the buffer with specific memory options
-    // that PyTorch allocator does not support.
-    this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, true);
-    _ubuf =
-        torch::from_blob(ubuf_ptr, {(sample.size(0) / _tp_size) * _num_ubuf_chunks, sample.size(1)},
-                         sample.options());
-  }
+  this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, true);
+  _ubuf =
+      torch::from_blob(ubuf_ptr, {(sample.size(0) / _tp_size) * _num_ubuf_chunks, sample.size(1)},
+                       sample.options());
 
   // Create tensor chunks for easy management
   char *ubuf_byte_ptr = reinterpret_cast<char *>(ubuf_ptr);
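For context, the pattern this commit standardizes on is easy to see in isolation: an external allocator owns the device memory, and PyTorch merely views it through torch::from_blob. The sketch below is illustrative rather than TE code: plain cudaMalloc stands in for the Userbuffers allocation done inside register_gpu_buffer(), and the sizes, chunk count, and no-op deleter are assumptions chosen for the demo (it requires a CUDA-enabled libtorch build).

#include <cuda_runtime.h>
#include <torch/torch.h>

#include <vector>

int main() {
  const int64_t rows = 1024, cols = 1024, num_chunks = 4;
  const size_t bytes = rows * cols * sizeof(float);

  // External allocation: in TE this happens inside register_gpu_buffer(),
  // which can request the multicast-capable memory options that the PyTorch
  // caching allocator cannot provide. cudaMalloc stands in for it here.
  void *ubuf_ptr = nullptr;
  cudaMalloc(&ubuf_ptr, bytes);

  // Wrap the raw pointer in a tensor view. The no-op deleter means PyTorch
  // never frees this memory; the external allocator keeps ownership.
  auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
  torch::Tensor ubuf =
      torch::from_blob(ubuf_ptr, {rows, cols}, [](void *) {}, options);
  ubuf.zero_();  // ordinary tensor ops now run on the externally owned memory

  // Slice the flat buffer into per-chunk views via pointer arithmetic,
  // mirroring the "Create tensor chunks for easy management" step that
  // follows the from_blob call in the P2P constructor.
  std::vector<torch::Tensor> chunks;
  char *ubuf_byte_ptr = reinterpret_cast<char *>(ubuf_ptr);
  const size_t chunk_bytes = bytes / num_chunks;
  for (int64_t i = 0; i < num_chunks; ++i) {
    chunks.push_back(
        torch::from_blob(ubuf_byte_ptr, {rows / num_chunks, cols}, [](void *) {}, options));
    ubuf_byte_ptr += chunk_bytes;
  }

  cudaFree(ubuf_ptr);  // the external owner releases the buffer at the end
  return 0;
}

Because every view carries a no-op deleter, tensor destruction never touches the buffer, which is exactly why the commit can drop the UB_SKIPMC branch: ownership always stays with Userbuffers regardless of whether multicast is in use.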
