
Hipify source in place (#2)
This is just the output of the hipify-inplace.sh and hipify-tensoradapter.py scripts with no further modifications, so you'll probably want to just spot-check.

I think it will be easier to review changes to the actual HIP source directly, rather than trying to think about what the hipify script will do, and since there are a decent number of changes needed to get the build and tests working, that seems worth it. In the end we should have a bunch of HIP source files and the .prehip CUDA files they can be generated from. Then we can handle organization however we want: restore the originals and make hipification part of the build process, keep the HIP versions on a separate branch, etc.

I put the .prehip files in a separate commit to keep things a bit cleaner.

Note that this PR is based on #1. I didn't make that the base branch, though, because then it would all be in my fork.
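For reviewers who want a quick feel for the mechanical rewrite before opening the diff, these are the substitution patterns visible in this change (summarized by hand from the diff, not generated by the scripts); the ROCm include and the alias name below are assumptions for illustration:

// CUDA identifiers renamed to their HIP equivalents in this diff:
//   DGL_USE_CUDA         -> DGL_USE_ROCM
//   __nv_bfloat16        -> __hip_bfloat16
//   cudaHostRegister()   -> hipHostRegister()
//   cudaHostUnregister() -> hipHostUnregister()
//   cudaFreeHost         -> hipHostFree
#ifdef DGL_USE_ROCM
#include <hip/hip_bf16.h>           // assumed: ROCm header providing __hip_bfloat16
typedef __hip_bfloat16 dgl_bf16_t;  // hypothetical alias, illustration only
#endif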
GMNGeoffrey authored Jan 29, 2025
1 parent f8a99c1 commit 95243ef
Showing 174 changed files with 27,274 additions and 1,102 deletions.
20 changes: 10 additions & 10 deletions include/dgl/aten/macro.h
@@ -41,7 +41,7 @@
* We treat pinned memory as normal host memory if we don't want
* to enable CUDA UVA access for this operator
*/
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
#define ATEN_XPU_SWITCH_CUDA(val, XPU, op, ...) \
do { \
if ((val) == kDGLCPU) { \
@@ -55,9 +55,9 @@
<< dgl::runtime::DeviceTypeCode2Str(val) << " device."; \
} \
} while (0)
-#else // DGL_USE_CUDA
+#else // DGL_USE_ROCM
#define ATEN_XPU_SWITCH_CUDA ATEN_XPU_SWITCH
-#endif // DGL_USE_CUDA
+#endif // DGL_USE_ROCM

/**
* Dispatch according to integral type (either int32 or int64):
@@ -132,7 +132,7 @@
* Dispatch according to float type, including 16bits
* (float16/bfloat16/float32/float64).
*/
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
#if BF16_ENABLED
#define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \
do { \
@@ -150,7 +150,7 @@
{ __VA_ARGS__ } \
} else if ( \
XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLBfloat) { \
-typedef __nv_bfloat16 FloatType; \
+typedef __hip_bfloat16 FloatType; \
{ __VA_ARGS__ } \
} else if ( \
XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLFloat) { \
@@ -195,7 +195,7 @@
} \
} while (0)
#endif // BF16_ENABLED
-#else // DGL_USE_CUDA
+#else // DGL_USE_ROCM
#define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \
do { \
CHECK((val).code == kDGLFloat || (val.code == kDGLBfloat)) \
@@ -215,7 +215,7 @@
<< " can only be bfloat16/float32/float64 on CPU"; \
} \
} while (0)
-#endif // DGL_USE_CUDA
+#endif // DGL_USE_ROCM

/**
* Dispatch according to data type (int32, int64, float32 or float64):
@@ -361,7 +361,7 @@
} while (0)

// Macro to dispatch according to device context (allowing cuda)
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
#define ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, op, ...) \
ATEN_XPU_SWITCH_CUDA((csr).indptr->ctx.device_type, XPU, op, { \
ATEN_ID_TYPE_SWITCH((csr).indptr->dtype, IdType, {{__VA_ARGS__}}); \
@@ -372,10 +372,10 @@
ATEN_XPU_SWITCH_CUDA((coo).row->ctx.device_type, XPU, op, { \
ATEN_ID_TYPE_SWITCH((coo).row->dtype, IdType, {{__VA_ARGS__}}); \
});
-#else // DGL_USE_CUDA
+#else // DGL_USE_ROCM
#define ATEN_CSR_SWITCH_CUDA ATEN_CSR_SWITCH
#define ATEN_COO_SWITCH_CUDA ATEN_COO_SWITCH
-#endif // DGL_USE_CUDA
+#endif // DGL_USE_ROCM

///////////////////////// Array checks //////////////////////////

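All of the ATEN_*_SWITCH macros above share one shape: branch on a runtime device/dtype descriptor, bind a concrete C++ type to a caller-chosen name, and splice the caller's body in once per branch. A stripped-down, self-contained sketch of that technique (illustrative only, not DGL's actual macro bodies):

#include <cstdint>
#include <cstdio>

// Minimal stand-in for the dispatch pattern: pick a typedef at runtime and
// instantiate the user-supplied body under that typedef.
#define ID_TYPE_SWITCH(bits, IdType, ...) \
  do {                                    \
    if ((bits) == 32) {                   \
      typedef int32_t IdType;             \
      { __VA_ARGS__ }                     \
    } else {                              \
      typedef int64_t IdType;             \
      { __VA_ARGS__ }                     \
    }                                     \
  } while (0)

int main() {
  int bits = 64;  // would come from (val).bits in the real macros
  ID_TYPE_SWITCH(bits, IdType, {
    std::printf("body instantiated with sizeof(IdType) = %zu\n", sizeof(IdType));
  });
  return 0;
}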
434 changes: 434 additions & 0 deletions include/dgl/aten/macro.h.prehip

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions include/dgl/runtime/device_api.h
@@ -174,7 +174,7 @@ class DeviceAPI {
DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst);

/**
-* @brief Pin host memory using cudaHostRegister().
+* @brief Pin host memory using hipHostRegister().
*
* @param ptr The host memory pointer to be pinned.
* @param nbytes The size to be pinned.
@@ -183,7 +183,7 @@
DGL_DLL virtual bool PinData(void* ptr, size_t nbytes);

/**
-* @brief Unpin host memory using cudaHostUnregister().
+* @brief Unpin host memory using hipHostUnregister().
*
* @param ptr The host memory pointer to be unpinned.
*/
@@ -203,7 +203,7 @@

/**
* @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator.
-* @note It avoids unnecessary cudaFreeHost calls and puts the memory
+* @note It avoids unnecessary hipHostFree calls and puts the memory
* block into CachingHostAllocator's free list.
* @param deleter Pointer to the deleter function from PyTorch's
* CachingHostAllocator.
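A hedged usage sketch for the pinning entry points touched above; DeviceAPI::Get, PinData, UnpinData, DGLContext, and kDGLCUDA are names from the DGL runtime headers, while the buffer, size, and device id are invented for illustration:

#include <vector>

#include "dgl/runtime/device_api.h"

using dgl::runtime::DeviceAPI;

// Pin a host buffer so the GPU backend can do fast async copies from it;
// under ROCm this goes through hipHostRegister()/hipHostUnregister().
void PinnedScratchExample() {
  std::vector<float> buf(1 << 20);
  DGLContext gpu_ctx{kDGLCUDA, /*device_id=*/0};
  DeviceAPI* api = DeviceAPI::Get(gpu_ctx);
  if (api->PinData(buf.data(), buf.size() * sizeof(float))) {
    // ... launch copies that benefit from pinned memory ...
    api->UnpinData(buf.data());
  }
}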
268 changes: 268 additions & 0 deletions include/dgl/runtime/device_api.h.prehip
@@ -0,0 +1,268 @@
/**
* Copyright (c) 2016 by Contributors
* @file dgl/runtime/device_api.h
* @brief Abstract device memory management API
*/
#ifndef DGL_RUNTIME_DEVICE_API_H_
#define DGL_RUNTIME_DEVICE_API_H_

#include <string>

#include "c_runtime_api.h"
#include "packed_func.h"

namespace dgl {
namespace runtime {
/**
* @brief the query type into GetAttr
*/
enum DeviceAttrKind : int {
kExist = 0,
kMaxThreadsPerBlock = 1,
kWarpSize = 2,
kMaxSharedMemoryPerBlock = 3,
kComputeVersion = 4,
kDeviceName = 5,
kMaxClockRate = 6,
kMultiProcessorCount = 7,
kMaxThreadDimensions = 8
};

/** @brief Number of bytes each allocation must align to */
constexpr int kAllocAlignment = 64;

/** @brief Number of bytes each allocation must align to in temporary allocation
*/
constexpr int kTempAllocaAlignment = 64;

/** @brief Maximum size that can be allocated on stack */
constexpr int kMaxStackAlloca = 1024;

/**
* @brief DGL Runtime Device API, abstracts the device
* specific interface for memory management.
*/
class DeviceAPI {
public:
/** @brief virtual destructor */
virtual ~DeviceAPI() {}
/**
* @brief Check whether the device is available.
*/
virtual bool IsAvailable() { return true; }

/**
* @brief Set the environment device id to ctx
* @param ctx The context to be set.
*/
virtual void SetDevice(DGLContext ctx) = 0;

/**
* @brief Get attribute of specified device.
* @param ctx The device context
* @param kind The result kind
* @param rv The return value.
* @sa DeviceAttrKind
*/
virtual void GetAttr(
DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) = 0;

/**
* @brief Allocate a data space on device.
* @param ctx The device context to perform operation.
* @param nbytes The number of bytes in memory.
* @param alignment The alignment of the memory.
* @param type_hint The type of elements. Only needed by certain backends such
* as OpenGL, as nbytes & alignment are sufficient for most backends.
* @return The allocated device pointer.
*/
virtual void* AllocDataSpace(
DGLContext ctx, size_t nbytes, size_t alignment,
DGLDataType type_hint) = 0;

/**
* @brief Free a data space on device.
* @param ctx The device context to perform operation.
* @param ptr The data space.
*/
virtual void FreeDataSpace(DGLContext ctx, void* ptr) = 0;

/**
* @brief copy data from one place to another
* @param from The source array.
* @param from_offset The byte offset in the from array.
* @param to The target array.
* @param to_offset The byte offset in the to array.
* @param num_bytes The size of the memory in bytes.
* @param ctx_from The source context.
* @param ctx_to The target context.
* @param type_hint The type of elements, only needed by certain backends,
* can be useful for cross-device endian conversion.
*/
virtual void CopyDataFromTo(
const void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, DGLContext ctx_from, DGLContext ctx_to,
DGLDataType type_hint) = 0;

/**
* @brief copy data between device and CPU while recording the event.
* @param from The source array.
* @param from_offset The byte offset in the from array.
* @param to The target array.
* @param to_offset The byte offset in the to array.
* @param num_bytes The size of the memory in bytes.
* @param ctx_from The source context.
* @param ctx_to The target context.
* @param type_hint The type of elements, only needed by certain backends,
* can be useful for cross-device endian conversion.
* @param pytorch_ctx The context pointer from PyTorch's CachingHostAllocator.
* @note This function only works when PyTorch CachingHostAllocator is
* available.
*/
virtual void RecordedCopyDataFromTo(
void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, DGLContext ctx_from, DGLContext ctx_to,
DGLDataType type_hint, void* pytorch_ctx) = 0;

/**
* @brief Create a new stream of execution.
*
* @param ctx The context of allocation.
*/
DGL_DLL virtual DGLStreamHandle CreateStream(DGLContext ctx);

/**
* @brief Free a stream of execution
*
* @param ctx The context of the stream
* @param stream The pointer to be freed.
*/
DGL_DLL virtual void FreeStream(DGLContext ctx, DGLStreamHandle stream);

/**
* @brief Synchronize the stream
* @param ctx The context to perform operation.
* @param stream The stream to be synced.
*/
virtual void StreamSync(DGLContext ctx, DGLStreamHandle stream) = 0;

/**
* @brief Set the stream
* @param ctx The context to set stream.
* @param stream The stream to be set.
*/
virtual void SetStream(DGLContext ctx, DGLStreamHandle stream) {}

/**
* @brief Get the stream
*/
virtual DGLStreamHandle GetStream() const { return nullptr; }

/**
* @brief Synchronize 2 streams of execution.
*
* An event is created in the event_src stream, which the event_dst
* stream then waits on. Neither event_src nor event_dst needs to be
* of the same device ID as the context, but they must be of the same
* device type.
*
* @param ctx The context of the streams.
* @param event_src The source stream to synchronize.
* @param event_dst The destination stream to synchronize.
*/
DGL_DLL virtual void SyncStreamFromTo(
DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst);

/**
* @brief Pin host memory using cudaHostRegister().
*
* @param ptr The host memory pointer to be pinned.
* @param nbytes The size to be pinned.
* @return false when pinning an empty tensor. true otherwise.
*/
DGL_DLL virtual bool PinData(void* ptr, size_t nbytes);

/**
* @brief Unpin host memory using cudaHostUnregister().
*
* @param ptr The host memory pointer to be unpinned.
*/
DGL_DLL virtual void UnpinData(void* ptr);

/**
* @brief Allocate the pinned memory using PyTorch CachingHostAllocator.
*
* @param nbytes The size to be pinned.
* @param ctx Pointer to the context pointer from PyTorch's
* CachingHostAllocator.
* @param deleter Pointer to the deleter function from PyTorch's
* CachingHostAllocator.
*/
DGL_DLL virtual void* AllocPinnedDataSpace(
size_t nbytes, void** ctx, void** deleter);

/**
* @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator.
* @note It avoids unnecessary cudaFreeHost calls and puts the memory
* block into CachingHostAllocator's free list.
* @param deleter Pointer to the deleter function from PyTorch's
* CachingHostAllocator.
*/
DGL_DLL virtual void FreePinnedDataSpace(void** deleter);

/**
* @brief Check whether the memory is in pinned memory.
*/
DGL_DLL virtual bool IsPinned(const void* ptr) { return false; }

/**
* @brief Allocate temporary workspace for backend execution.
*
* \note We make the following assumptions about backend temporary
* workspace allocation, and backends will optimize for them:
*
* - Only a few allocations will happen, and space will be released after use.
* - The release order is usually the reverse of the allocation order (stack
*   style).
* - Repetitive patterns of the same allocations across different runs.
* - Workspaces should not overlap between different threads (i.e., they
*   should be thread-local).
*
* @param ctx The context of allocation.
* @param nbytes The size to be allocated.
* @param type_hint The type of elements. Only needed by certain backends such
* as OpenGL, as nbytes is sufficient for most backends.
*/
DGL_DLL virtual void* AllocWorkspace(
DGLContext ctx, size_t nbytes, DGLDataType type_hint = {});

/**
* @brief Free temporary workspace in backend execution.
*
* @param ctx The context of allocation.
* @param ptr The pointer to be freed.
*/
DGL_DLL virtual void FreeWorkspace(DGLContext ctx, void* ptr);

/**
* @brief Get device API based on context.
* @param ctx The context
* @param allow_missing Whether to allow a missing device API
* @return The corresponding device API.
*/
DGL_DLL static DeviceAPI* Get(DGLContext ctx, bool allow_missing = false);

/**
* @brief Get device API based on device type.
* @param dev_type The device type
* @param allow_missing Whether to allow a missing device API
* @return The corresponding device API.
*/
DGL_DLL static DeviceAPI* Get(
DGLDeviceType dev_type, bool allow_missing = false);
};

/** @brief The device type bigger than this is RPC device */
constexpr int kRPCSessMask = 128;
} // namespace runtime
} // namespace dgl
#endif // DGL_RUNTIME_DEVICE_API_H_
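To tie the rest of the interface together, here is a sketch of a host-to-device round trip through this API. The calls and kAllocAlignment come from the header above; the dtype code value and the assumption that a GPU backend is registered are mine:

#include <cstddef>

#include "dgl/runtime/device_api.h"

using dgl::runtime::DeviceAPI;
using dgl::runtime::kAllocAlignment;

// Copy nbytes from a host buffer to the device and back again.
// Error handling and explicit stream management are elided.
void RoundTrip(const void* host_src, void* host_dst, size_t nbytes) {
  DGLContext cpu{kDGLCPU, /*device_id=*/0};
  DGLContext gpu{kDGLCUDA, /*device_id=*/0};
  DeviceAPI* api = DeviceAPI::Get(gpu);

  DGLDataType u8{/*code=*/1, /*bits=*/8, /*lanes=*/1};  // assumed: code 1 == unsigned int
  void* dev = api->AllocDataSpace(gpu, nbytes, kAllocAlignment, u8);
  api->CopyDataFromTo(host_src, 0, dev, 0, nbytes, cpu, gpu, u8);
  api->CopyDataFromTo(dev, 0, host_dst, 0, nbytes, gpu, cpu, u8);
  api->StreamSync(gpu, api->GetStream());  // wait for both copies to finish
  api->FreeDataSpace(gpu, dev);
}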
