force TE/PyTorch to always let Userbuffers manually allocate its buffer for comm+GEMM overlap

Signed-off-by: Alp Dener <adener@nvidia.com>
denera committed Jul 25, 2024
1 parent ee34110 commit ffb9010
Showing 1 changed file with 6 additions and 25 deletions.
transformer_engine/pytorch/csrc/extensions/comm_gemm_overlap.cpp: 6 additions & 25 deletions
@@ -117,17 +117,8 @@ te_torch::CommGemmOverlap::CommGemmOverlap(torch::Tensor sample, int world_rank,
   _ubuf_dtype = (sample.element_size() == 1) ? te::DType::kFloat8E4M3
                                              : GetTransformerEngineDType(sample.scalar_type());
   void *ubuf_ptr;
-  if (te::getenv<bool>("UB_SKIPMC")) {
-    // Multicast is disabled so we have to pre-allocate the buffer here.
-    _ubuf = torch::empty({sample.size(0), sample.size(1)}, sample.options());
-    ubuf_ptr = _ubuf.data_ptr();
-    this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, false);
-  } else {
-    // Multicast requires UB to allocate the buffer with specific memory options
-    // that PyTorch allocator does not support.
-    this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, true);
-    _ubuf = torch::from_blob(ubuf_ptr, {sample.size(0), sample.size(1)}, sample.options());
-  }
+  this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, true);
+  _ubuf = torch::from_blob(ubuf_ptr, {sample.size(0), sample.size(1)}, sample.options());
 
   if (_atomic_gemm) {
     auto counter_options = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA);
@@ -406,20 +397,10 @@ te_torch::CommGemmOverlapP2P::CommGemmOverlapP2P(
                                              : GetTransformerEngineDType(sample.scalar_type());
 
   void *ubuf_ptr;
-  if (te::getenv<bool>("UB_SKIPMC")) {
-    // Multicast is disabled so we have to pre-allocate the buffer here.
-    _ubuf = torch::empty({(sample.size(0) / _tp_size) * _num_ubuf_chunks, sample.size(1)},
-                         sample.options());
-    ubuf_ptr = _ubuf.data_ptr();
-    this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, false);
-  } else {
-    // Multicast requires UB to allocate the buffer with specific memory options
-    // that PyTorch allocator does not support.
-    this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, true);
-    _ubuf =
-        torch::from_blob(ubuf_ptr, {(sample.size(0) / _tp_size) * _num_ubuf_chunks, sample.size(1)},
-                         sample.options());
-  }
+  this->register_gpu_buffer(&ubuf_ptr, _ubuf_bytes, true);
+  _ubuf =
+      torch::from_blob(ubuf_ptr, {(sample.size(0) / _tp_size) * _num_ubuf_chunks, sample.size(1)},
+                       sample.options());
 
   // Create tensor chunks for easy management
   char *ubuf_byte_ptr = reinterpret_cast<char *>(ubuf_ptr);
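For context, the pattern this commit standardizes on is easy to see in isolation: an external allocator owns the device memory, and PyTorch merely views it through torch::from_blob. The sketch below is illustrative rather than TE code: plain cudaMalloc stands in for the Userbuffers allocation done inside register_gpu_buffer(), and the sizes, chunk count, and no-op deleter are assumptions chosen for the demo (it requires a CUDA-enabled libtorch build).

#include <cuda_runtime.h>
#include <torch/torch.h>

#include <vector>

int main() {
  const int64_t rows = 1024, cols = 1024, num_chunks = 4;
  const size_t bytes = rows * cols * sizeof(float);

  // External allocation: in TE this happens inside register_gpu_buffer(),
  // which can request the multicast-capable memory options that the PyTorch
  // caching allocator cannot provide. cudaMalloc stands in for it here.
  void *ubuf_ptr = nullptr;
  cudaMalloc(&ubuf_ptr, bytes);

  // Wrap the raw pointer in a tensor view. The no-op deleter means PyTorch
  // never frees this memory; the external allocator keeps ownership.
  auto options = torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
  torch::Tensor ubuf =
      torch::from_blob(ubuf_ptr, {rows, cols}, [](void *) {}, options);
  ubuf.zero_();  // ordinary tensor ops now run on the externally owned memory

  // Slice the flat buffer into per-chunk views via pointer arithmetic,
  // mirroring the "Create tensor chunks for easy management" step that
  // follows the from_blob call in the P2P constructor.
  std::vector<torch::Tensor> chunks;
  char *ubuf_byte_ptr = reinterpret_cast<char *>(ubuf_ptr);
  const size_t chunk_bytes = bytes / num_chunks;
  for (int64_t i = 0; i < num_chunks; ++i) {
    chunks.push_back(
        torch::from_blob(ubuf_byte_ptr, {rows / num_chunks, cols}, [](void *) {}, options));
    ubuf_byte_ptr += chunk_bytes;
  }

  cudaFree(ubuf_ptr);  // the external owner releases the buffer at the end
  return 0;
}

Because every view carries a no-op deleter, tensor destruction never touches the buffer, which is exactly why the commit can drop the UB_SKIPMC branch: ownership always stays with Userbuffers regardless of whether multicast is in use.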
