
Hipify source in place (#2)
This is just the output of the hipify-inplace.sh and hipify-tensoradapter.py scripts with no further modifications, so you'll probably want to just spot-check.

I think it will be easier to review changes to the actual HIP source directly, rather than trying to think about what the hipify script will do, and since there are a decent number of changes needed to get the build and tests working, that seems worth it. In the end we should have a bunch of HIP source files and the .prehip CUDA files they can be generated from. Then we can handle organization however we want: restore the originals and make hipification part of the build process, keep the HIP versions on a separate branch, etc.

I put the .prehip files in a separate commit to keep things a bit cleaner.

Note that this PR is based on #1. I didn't make that the base branch, though, because then it would all be in my fork.
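For reviewers who want a quick feel for the mechanical rewrite before opening the diff, these are the substitution patterns visible in this change (summarized by hand from the diff, not generated by the scripts); the ROCm include and the alias name below are assumptions for illustration:

// CUDA identifiers renamed to their HIP equivalents in this diff:
//   DGL_USE_CUDA         -> DGL_USE_ROCM
//   __nv_bfloat16        -> __hip_bfloat16
//   cudaHostRegister()   -> hipHostRegister()
//   cudaHostUnregister() -> hipHostUnregister()
//   cudaFreeHost         -> hipHostFree
#ifdef DGL_USE_ROCM
#include <hip/hip_bf16.h>           // assumed: ROCm header providing __hip_bfloat16
typedef __hip_bfloat16 dgl_bf16_t;  // hypothetical alias, illustration only
#endif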
GMNGeoffrey authored Jan 29, 2025
1 parent f8a99c1 commit 95243ef
Showing 174 changed files with 27,274 additions and 1,102 deletions.
20 changes: 10 additions & 10 deletions include/dgl/aten/macro.h
@@ -41,7 +41,7 @@
* We treat pinned memory as normal host memory if we don't want
* to enable CUDA UVA access for this operator
*/
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
#define ATEN_XPU_SWITCH_CUDA(val, XPU, op, ...) \
do { \
if ((val) == kDGLCPU) { \
@@ -55,9 +55,9 @@
<< dgl::runtime::DeviceTypeCode2Str(val) << " device."; \
} \
} while (0)
-#else // DGL_USE_CUDA
+#else // DGL_USE_ROCM
#define ATEN_XPU_SWITCH_CUDA ATEN_XPU_SWITCH
-#endif // DGL_USE_CUDA
+#endif // DGL_USE_ROCM

/**
* Dispatch according to integral type (either int32 or int64):
@@ -132,7 +132,7 @@
* Dispatch according to float type, including 16bits
* (float16/bfloat16/float32/float64).
*/
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
#if BF16_ENABLED
#define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \
do { \
@@ -150,7 +150,7 @@
{ __VA_ARGS__ } \
} else if ( \
XPU == kDGLCUDA && (val).bits == 16 && (val).code == kDGLBfloat) { \
-typedef __nv_bfloat16 FloatType; \
+typedef __hip_bfloat16 FloatType; \
{ __VA_ARGS__ } \
} else if ( \
XPU == kDGLCPU && (val).bits == 16 && (val).code == kDGLFloat) { \
@@ -195,7 +195,7 @@
} \
} while (0)
#endif // BF16_ENABLED
-#else // DGL_USE_CUDA
+#else // DGL_USE_ROCM
#define ATEN_FLOAT_TYPE_SWITCH_16BITS(val, FloatType, XPU, val_name, ...) \
do { \
CHECK((val).code == kDGLFloat || (val.code == kDGLBfloat)) \
@@ -215,7 +215,7 @@
<< " can only be bfloat16/float32/float64 on CPU"; \
} \
} while (0)
-#endif // DGL_USE_CUDA
+#endif // DGL_USE_ROCM

/**
* Dispatch according to data type (int32, int64, float32 or float64):
@@ -361,7 +361,7 @@
} while (0)

// Macro to dispatch according to device context (allowing cuda)
-#ifdef DGL_USE_CUDA
+#ifdef DGL_USE_ROCM
#define ATEN_CSR_SWITCH_CUDA(csr, XPU, IdType, op, ...) \
ATEN_XPU_SWITCH_CUDA((csr).indptr->ctx.device_type, XPU, op, { \
ATEN_ID_TYPE_SWITCH((csr).indptr->dtype, IdType, {{__VA_ARGS__}}); \
@@ -372,10 +372,10 @@
ATEN_XPU_SWITCH_CUDA((coo).row->ctx.device_type, XPU, op, { \
ATEN_ID_TYPE_SWITCH((coo).row->dtype, IdType, {{__VA_ARGS__}}); \
});
-#else // DGL_USE_CUDA
+#else // DGL_USE_ROCM
#define ATEN_CSR_SWITCH_CUDA ATEN_CSR_SWITCH
#define ATEN_COO_SWITCH_CUDA ATEN_COO_SWITCH
-#endif // DGL_USE_CUDA
+#endif // DGL_USE_ROCM

///////////////////////// Array checks //////////////////////////

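All of the ATEN_*_SWITCH macros above share one shape: branch on a runtime device/dtype descriptor, bind a concrete C++ type to a caller-chosen name, and splice the caller's body in once per branch. A stripped-down, self-contained sketch of that technique (illustrative only, not DGL's actual macro bodies):

#include <cstdint>
#include <cstdio>

// Minimal stand-in for the dispatch pattern: pick a typedef at runtime and
// instantiate the user-supplied body under that typedef.
#define ID_TYPE_SWITCH(bits, IdType, ...) \
  do {                                    \
    if ((bits) == 32) {                   \
      typedef int32_t IdType;             \
      { __VA_ARGS__ }                     \
    } else {                              \
      typedef int64_t IdType;             \
      { __VA_ARGS__ }                     \
    }                                     \
  } while (0)

int main() {
  int bits = 64;  // would come from (val).bits in the real macros
  ID_TYPE_SWITCH(bits, IdType, {
    std::printf("body instantiated with sizeof(IdType) = %zu\n", sizeof(IdType));
  });
  return 0;
}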
434 changes: 434 additions & 0 deletions include/dgl/aten/macro.h.prehip

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions include/dgl/runtime/device_api.h
@@ -174,7 +174,7 @@ class DeviceAPI {
DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst);

/**
-* @brief Pin host memory using cudaHostRegister().
+* @brief Pin host memory using hipHostRegister().
*
* @param ptr The host memory pointer to be pinned.
* @param nbytes The size to be pinned.
@@ -183,7 +183,7 @@
DGL_DLL virtual bool PinData(void* ptr, size_t nbytes);

/**
-* @brief Unpin host memory using cudaHostUnregister().
+* @brief Unpin host memory using hipHostUnregister().
*
* @param ptr The host memory pointer to be unpinned.
*/
@@ -203,7 +203,7 @@

/**
* @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator.
-* @note It avoids unnecessary cudaFreeHost calls and puts the memory
+* @note It avoids unnecessary hipHostFree calls and puts the memory
* block into CachingHostAllocator's free list.
* @param deleter Pointer to the deleter function from PyTorch's
* CachingHostAllocator.
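A hedged usage sketch for the pinning entry points touched above; DeviceAPI::Get, PinData, UnpinData, DGLContext, and kDGLCUDA are names from the DGL runtime headers, while the buffer, size, and device id are invented for illustration:

#include <vector>

#include "dgl/runtime/device_api.h"

using dgl::runtime::DeviceAPI;

// Pin a host buffer so the GPU backend can do fast async copies from it;
// under ROCm this goes through hipHostRegister()/hipHostUnregister().
void PinnedScratchExample() {
  std::vector<float> buf(1 << 20);
  DGLContext gpu_ctx{kDGLCUDA, /*device_id=*/0};
  DeviceAPI* api = DeviceAPI::Get(gpu_ctx);
  if (api->PinData(buf.data(), buf.size() * sizeof(float))) {
    // ... launch copies that benefit from pinned memory ...
    api->UnpinData(buf.data());
  }
}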
268 changes: 268 additions & 0 deletions include/dgl/runtime/device_api.h.prehip
@@ -0,0 +1,268 @@
/**
* Copyright (c) 2016 by Contributors
* @file dgl/runtime/device_api.h
* @brief Abstract device memory management API
*/
#ifndef DGL_RUNTIME_DEVICE_API_H_
#define DGL_RUNTIME_DEVICE_API_H_

#include <string>

#include "c_runtime_api.h"
#include "packed_func.h"

namespace dgl {
namespace runtime {
/**
* @brief the query type into GetAttr
*/
enum DeviceAttrKind : int {
kExist = 0,
kMaxThreadsPerBlock = 1,
kWarpSize = 2,
kMaxSharedMemoryPerBlock = 3,
kComputeVersion = 4,
kDeviceName = 5,
kMaxClockRate = 6,
kMultiProcessorCount = 7,
kMaxThreadDimensions = 8
};

/** @brief Number of bytes each allocation must align to */
constexpr int kAllocAlignment = 64;

/** @brief Number of bytes each allocation must align to in temporary allocation
*/
constexpr int kTempAllocaAlignment = 64;

/** @brief Maximum size that can be allocated on stack */
constexpr int kMaxStackAlloca = 1024;

/**
* @brief DGL Runtime Device API, abstracts the device
* specific interface for memory management.
*/
class DeviceAPI {
public:
/** @brief virtual destructor */
virtual ~DeviceAPI() {}
/**
* @brief Check whether the device is available.
*/
virtual bool IsAvailable() { return true; }

/**
* @brief Set the environment device id to ctx
* @param ctx The context to be set.
*/
virtual void SetDevice(DGLContext ctx) = 0;

/**
* @brief Get attribute of specified device.
* @param ctx The device context
* @param kind The result kind
* @param rv The return value.
* @sa DeviceAttrKind
*/
virtual void GetAttr(
DGLContext ctx, DeviceAttrKind kind, DGLRetValue* rv) = 0;

/**
* @brief Allocate a data space on device.
* @param ctx The device context to perform operation.
* @param nbytes The number of bytes in memory.
* @param alignment The alignment of the memory.
* @param type_hint The type of elements. Only needed by certain backends such
* as OpenGL, as nbytes & alignment are sufficient for most backends.
* @return The allocated device pointer.
*/
virtual void* AllocDataSpace(
DGLContext ctx, size_t nbytes, size_t alignment,
DGLDataType type_hint) = 0;

/**
* @brief Free a data space on device.
* @param ctx The device context to perform operation.
* @param ptr The data space.
*/
virtual void FreeDataSpace(DGLContext ctx, void* ptr) = 0;

/**
* @brief copy data from one place to another
* @param from The source array.
* @param from_offset The byte offset in the from array.
* @param to The target array.
* @param to_offset The byte offset in the to array.
* @param num_bytes The size of the memory in bytes.
* @param ctx_from The source context.
* @param ctx_to The target context.
* @param type_hint The type of elements, only needed by certain backends,
* can be useful for cross-device endian conversion.
*/
virtual void CopyDataFromTo(
const void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, DGLContext ctx_from, DGLContext ctx_to,
DGLDataType type_hint) = 0;

/**
* @brief copy data between device and CPU while recording the event.
* @param from The source array.
* @param from_offset The byte offset in the from array.
* @param to The target array.
* @param to_offset The byte offset in the to array.
* @param num_bytes The size of the memory in bytes.
* @param ctx_from The source context.
* @param ctx_to The target context.
* @param type_hint The type of elements, only needed by certain backends,
* can be useful for cross-device endian conversion.
* @param pytorch_ctx The context pointer from PyTorch's CachingHostAllocator.
* @note This function only works when PyTorch CachingHostAllocator is
* available.
*/
virtual void RecordedCopyDataFromTo(
void* from, size_t from_offset, void* to, size_t to_offset,
size_t num_bytes, DGLContext ctx_from, DGLContext ctx_to,
DGLDataType type_hint, void* pytorch_ctx) = 0;

/**
* @brief Create a new stream of execution.
*
* @param ctx The context of allocation.
*/
DGL_DLL virtual DGLStreamHandle CreateStream(DGLContext ctx);

/**
* @brief Free a stream of execution
*
* @param ctx The context of the stream
* @param stream The pointer to be freed.
*/
DGL_DLL virtual void FreeStream(DGLContext ctx, DGLStreamHandle stream);

/**
* @brief Synchronize the stream
* @param ctx The context to perform operation.
* @param stream The stream to be synced.
*/
virtual void StreamSync(DGLContext ctx, DGLStreamHandle stream) = 0;

/**
* @brief Set the stream
* @param ctx The context to set stream.
* @param stream The stream to be set.
*/
virtual void SetStream(DGLContext ctx, DGLStreamHandle stream) {}

/**
* @brief Get the stream
*/
virtual DGLStreamHandle GetStream() const { return nullptr; }

/**
* @brief Synchronize 2 streams of execution.
*
* An event is created in the event_src stream, which the event_dst
* stream then waits on. Neither event_src nor event_dst needs to be
* of the same device ID as the context, but they must be of the same
* device type.
*
* @param ctx The context of the streams.
* @param event_src The source stream to synchronize.
* @param event_dst The destination stream to synchronize.
*/
DGL_DLL virtual void SyncStreamFromTo(
DGLContext ctx, DGLStreamHandle event_src, DGLStreamHandle event_dst);

/**
* @brief Pin host memory using cudaHostRegister().
*
* @param ptr The host memory pointer to be pinned.
* @param nbytes The size to be pinned.
* @return false when pinning an empty tensor. true otherwise.
*/
DGL_DLL virtual bool PinData(void* ptr, size_t nbytes);

/**
* @brief Unpin host memory using cudaHostUnregister().
*
* @param ptr The host memory pointer to be unpinned.
*/
DGL_DLL virtual void UnpinData(void* ptr);

/**
* @brief Allocate the pinned memory using PyTorch CachingHostAllocator.
*
* @param nbytes The size to be pinned.
* @param ctx Pointer to the context pointer from PyTorch's
* CachingHostAllocator.
* @param deleter Pointer to the deleter function from PyTorch's
* CachingHostAllocator.
*/
DGL_DLL virtual void* AllocPinnedDataSpace(
size_t nbytes, void** ctx, void** deleter);

/**
* @brief 'Deallocate' the pinned memory from PyTorch CachingHostAllocator.
* @note It avoids unnecessary cudaFreeHost calls and puts the memory
* block into CachingHostAllocator's free list.
* @param deleter Pointer to the deleter function from PyTorch's
* CachingHostAllocator.
*/
DGL_DLL virtual void FreePinnedDataSpace(void** deleter);

/**
* @brief Check whether the memory is in pinned memory.
*/
DGL_DLL virtual bool IsPinned(const void* ptr) { return false; }

/**
* @brief Allocate temporary workspace for backend execution.
*
* \note We make the following assumptions about backend temporary
* workspace allocation, and backends will optimize for them:
*
* - Only a few allocations will happen, and space will be released after use.
* - The release order is usually the reverse of the allocation order (stack
*   style).
* - Repetitive patterns of the same allocations across different runs.
* - Workspaces should not overlap between different threads (i.e., they
*   should be thread-local).
*
* @param ctx The context of allocation.
* @param nbytes The size to be allocated.
* @param type_hint The type of elements. Only needed by certain backends such
* as OpenGL, as nbytes is sufficient for most backends.
*/
DGL_DLL virtual void* AllocWorkspace(
DGLContext ctx, size_t nbytes, DGLDataType type_hint = {});

/**
* @brief Free temporary workspace in backend execution.
*
* @param ctx The context of allocation.
* @param ptr The pointer to be freed.
*/
DGL_DLL virtual void FreeWorkspace(DGLContext ctx, void* ptr);

/**
* @brief Get device API based on context.
* @param ctx The context
* @param allow_missing Whether to allow a missing device API
* @return The corresponding device API.
*/
DGL_DLL static DeviceAPI* Get(DGLContext ctx, bool allow_missing = false);

/**
* @brief Get device API based on device type.
* @param dev_type The device type
* @param allow_missing Whether to allow a missing device API
* @return The corresponding device API.
*/
DGL_DLL static DeviceAPI* Get(
DGLDeviceType dev_type, bool allow_missing = false);
};

/** @brief The device type bigger than this is RPC device */
constexpr int kRPCSessMask = 128;
} // namespace runtime
} // namespace dgl
#endif // DGL_RUNTIME_DEVICE_API_H_
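To tie the rest of the interface together, here is a sketch of a host-to-device round trip through this API. The calls and kAllocAlignment come from the header above; the dtype code value and the assumption that a GPU backend is registered are mine:

#include <cstddef>

#include "dgl/runtime/device_api.h"

using dgl::runtime::DeviceAPI;
using dgl::runtime::kAllocAlignment;

// Copy nbytes from a host buffer to the device and back again.
// Error handling and explicit stream management are elided.
void RoundTrip(const void* host_src, void* host_dst, size_t nbytes) {
  DGLContext cpu{kDGLCPU, /*device_id=*/0};
  DGLContext gpu{kDGLCUDA, /*device_id=*/0};
  DeviceAPI* api = DeviceAPI::Get(gpu);

  DGLDataType u8{/*code=*/1, /*bits=*/8, /*lanes=*/1};  // assumed: code 1 == unsigned int
  void* dev = api->AllocDataSpace(gpu, nbytes, kAllocAlignment, u8);
  api->CopyDataFromTo(host_src, 0, dev, 0, nbytes, cpu, gpu, u8);
  api->CopyDataFromTo(dev, 0, host_dst, 0, nbytes, gpu, cpu, u8);
  api->StreamSync(gpu, api->GetStream());  // wait for both copies to finish
  api->FreeDataSpace(gpu, dev);
}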
