Commit 1727b4f
Merge branch 'awslabs:main' into sharding
Tonny-Gu authored Jun 12, 2022
2 parents f124494 + c8ddbc9 commit 1727b4f
Showing 97 changed files with 2,531 additions and 984 deletions.
2 changes: 1 addition & 1 deletion 3rdparty/tvm
Submodule tvm updated from 0b9bcf to 609d6a
17 changes: 12 additions & 5 deletions CMakeLists.txt
@@ -106,6 +106,7 @@ file(GLOB_RECURSE RAF_CXX_SOURCE_FILES
  ${CMAKE_CURRENT_LIST_DIR}/src/impl/*.cc
  ${CMAKE_CURRENT_LIST_DIR}/src/profiler/memory_profiler.cc
  ${CMAKE_CURRENT_LIST_DIR}/src/profiler/op_profiler.cc
+  ${CMAKE_CURRENT_LIST_DIR}/src/profiler/scope_timer.cc
  ${CMAKE_CURRENT_LIST_DIR}/src/profiler/base/*.cc
  ${CMAKE_CURRENT_LIST_DIR}/src/distributed/common/*.cc
)
@@ -160,17 +161,22 @@ else()
endif()

if (${RAF_USE_NCCL} STREQUAL "OFF")
-  set(RAF_DISTRIBUTED_SOURCE_FILES "")
+  set(RAF_NCCL_SOURCE_FILES "")
else ()
  set(RAF_CXX_FLAGS ${RAF_CXX_FLAGS} -DRAF_USE_NCCL)
-  file(GLOB_RECURSE RAF_DISTRIBUTED_SOURCE_FILES
-    ${CMAKE_CURRENT_LIST_DIR}/src/distributed/cuda/*.cc
+  file(GLOB_RECURSE RAF_NCCL_SOURCE_FILES
+    ${CMAKE_CURRENT_LIST_DIR}/src/distributed/cuda/nccl*.cc
    ${CMAKE_CURRENT_LIST_DIR}/src/op/dialect/nccl/*.cc
  )
endif()

-if (${RAF_USE_MPI} STREQUAL "ON")
+if (${RAF_USE_MPI} STREQUAL "OFF")
+  set(RAF_MPI_SOURCE_FILES "")
+else ()
  set(RAF_CXX_FLAGS ${RAF_CXX_FLAGS} -DRAF_USE_MPI)
+  file(GLOB_RECURSE RAF_MPI_SOURCE_FILES
+    ${CMAKE_CURRENT_LIST_DIR}/src/distributed/cuda/mpi*.cc
+  )
endif()

set(RAF_SOURCE_FILES
@@ -179,7 +185,8 @@ set(RAF_SOURCE_FILES
  ${RAF_CUDNN_SOURCE_FILES}
  ${RAF_CUBLAS_SOURCE_FILES}
  ${RAF_CUTLASS_SOURCE_FILES}
-  ${RAF_DISTRIBUTED_SOURCE_FILES}
+  ${RAF_MPI_SOURCE_FILES}
+  ${RAF_NCCL_SOURCE_FILES}
)

add_library(raf_objs OBJECT ${RAF_SOURCE_FILES})
15 changes: 8 additions & 7 deletions docs/wiki/2_user_guide/Train-Model.md
@@ -9,8 +9,8 @@ The programming model of implementing a deep learning model in RAF is basically t

```python
import raf
-from raf.model import Conv2D, BatchNorm
-from raf.testing import randn_torch, get_vm_executor, run_vm_executor
+from raf.model import Conv2d, BatchNorm, Sequential, Linear
+from raf.testing import randn_torch, get_vm_executor, run_vm_executor, one_hot_torch

class RAFBottleneck(raf.Model):
expansion = 4
@@ -125,16 +125,17 @@ RAF offers a virtual machine (VM) runtime to execute the model training process.

```python
batch_size = 8

input_shape = (batch_size, 3, 224, 224)
dy, _ = randn_torch((), std=0.0, mean=1.0, requires_grad=False, device=device) # dy = tensor(1.0)

# Training loop
num_epoch = 2
record = None
executor = None
for _ in range(num_epoch):
    # Prepare input data, use random data as example here
    r_x, _ = randn_torch(input_shape, device=device, dtype="float32")
-    r_ytrue, _ = one_hot_torch(batch_size=batch_size, num_classes=10, device=device)
+    r_ytrue, _ = one_hot_torch(size=batch_size, num_classes=10, device=device)
    args = [dy, r_x, r_ytrue]

    # Initialize the VM at the first iteration.
@@ -143,8 +144,8 @@ for _ in range(num_epoch):
        executor = get_vm_executor(record.mod, device)

    ret = run_vm_executor(executor, record, args, device)
-    loss = ret[0] # ret[0][0] for some models
-    print("Loss:", loss)
+    loss = ret[0][0] # ret[0] for some models
+    print("Loss:", loss.numpy())
```

One major difference from PyTorch is that RAF needs to initialize a virtual machine in the first iteration. The initialization involves graph-level optimization and VM bytecode compilation. In addition, when running the VM executor in the first iteration, RAF performs just-in-time (JIT) compilation to generate code for each kernel, so the first iteration may take a bit longer.
@@ -158,7 +159,7 @@ batch_size = 8

dy, _ = randn_torch((), std=0.0, mean=1.0, requires_grad=False, device=device) # dy = tensor(1.0)
r_x, _ = randn_torch(input_shape, device=device, dtype="float32")
-r_ytrue, _ = one_hot_torch(batch_size=batch_size, num_classes=10, device=device)
+r_ytrue, _ = one_hot_torch(size=batch_size, num_classes=10, device=device)
args = [dy, r_x, r_ytrue]

ret = optimizer(*args)
4 changes: 2 additions & 2 deletions docs/wiki/2_user_guide/Train-PyTorch-Model.md
@@ -75,7 +75,7 @@ from raf._op import sym

# prepare random data, they just provide shape and dtype info
r_x, _ = randn_torch(input_shape, device=device, dtype=dtype)
-r_ytrue, _ = one_hot_torch(batch_size=batch_size, num_classes=10, device=device)
+r_ytrue, _ = one_hot_torch(size=batch_size, num_classes=10, device=device)

out = r_model.record(r_x)
y_pred = sym.log_softmax(out)
@@ -130,7 +130,7 @@ executor = get_vm_executor(record.mod, device)
for _ in range(num_epoch):
    # prepare input data, use random data as example here
    r_x, _ = randn_torch(input_shape, device=device, dtype=dtype)
-    r_ytrue, _ = one_hot_torch(batch_size=batch_size, num_classes=10, device=device)
+    r_ytrue, _ = one_hot_torch(size=batch_size, num_classes=10, device=device)
    args = [dy, r_x, r_ytrue]
    ret = run_vm_executor(executor, record, args, device)
    loss = ret[0] # ret[0][0] for some models
19 changes: 2 additions & 17 deletions include/raf/cache.h
@@ -4,14 +4,15 @@
*/

/*!
- * \file src/op/cache.h
+ * \file cache.h
 * \brief The RAF cache.
 */
#pragma once

#include <chrono>
#include <dmlc/memory_io.h>
-#include <sys/stat.h>
+#include "./file.h"
#include "./op.h"
#include "./value.h"

@@ -319,22 +320,6 @@ class MetaPersistCache : public MetaCache<T>, public MetaCacheMetric {
return path_ + "/" + std::to_string(hashed_key);
}

-  inline void CreateDir(const std::string& path) {
-    if (mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) == -1) {
-      if (errno != EEXIST) {
-        LOG(FATAL) << "Failed to create directory " << path << ": " << strerror(errno);
-        throw;
-      }
-    }
-  }
-
-  inline bool DirExists(const std::string& path) {
-    std::ifstream ifs(path);
-    auto ret = ifs.good();
-    ifs.close();
-    return ret;
-  }

inline void AddMetric(const std::string name, size_t val) {
metrics_[name] += val;
}
4 changes: 2 additions & 2 deletions include/raf/communicator.h
@@ -108,8 +108,8 @@ class CommunicatorPool {
  }

  static CommunicatorPool* Get() {
-    static CommunicatorPool* instance = dmlc::ThreadLocalStore<CommunicatorPool>::Get();
-    return instance;
+    static CommunicatorPool instance;
+    return &instance;
  }

  Communicator GetCommunicator(const std::string& name, const Value rank_list) {
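The pool was previously fetched through dmlc::ThreadLocalStore, which hands every thread its own CommunicatorPool; the new function-local static yields a single process-wide pool whose initialization is thread-safe since C++11. A minimal standalone sketch of the pattern (illustrative only, not RAF code):

```cpp
// Meyers-singleton sketch: `instance` is constructed exactly once, the
// first time Get() runs, and every thread sees the same object.
class PoolSketch {
 public:
  static PoolSketch* Get() {
    static PoolSketch instance;  // thread-safe "magic static" (C++11)
    return &instance;
  }

 private:
  PoolSketch() = default;  // construction goes through Get() only
};
```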
3 changes: 2 additions & 1 deletion include/raf/device.h
@@ -11,6 +11,7 @@
#include <string>
#include "dlpack/dlpack.h"
#include "tvm/runtime/c_runtime_api.h"
+#include "tvm/runtime/device_api.h"
#include "tvm/runtime/ndarray.h"
#include "tvm/support/with.h"
#include "./enum_base.h"
@@ -20,7 +21,7 @@ namespace raf {

using namespace raf::ir;

-constexpr int64_t kDefaultMemoryAlignment = 64;
+constexpr int64_t kDefaultMemoryAlignment = tvm::runtime::kAllocAlignment;

class DTypeCode final : public EnumBase<DTypeCode, 5, int32_t, DLDataTypeCode> {
public:
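Stock TVM defines tvm::runtime::kAllocAlignment as 64 in tvm/runtime/device_api.h (the header added above), so the default value is unchanged; RAF now simply tracks TVM if that constant ever moves. A compile-time check sketching that assumption:

```cpp
#include "tvm/runtime/device_api.h"

// Sketch: documents the assumption that the new default equals the old
// hard-coded 64; fails to compile if TVM ever changes kAllocAlignment.
static_assert(tvm::runtime::kAllocAlignment == 64,
              "kDefaultMemoryAlignment no longer matches the old hard-coded value");
```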
2 changes: 2 additions & 0 deletions include/raf/dist_config.h
@@ -22,12 +22,14 @@ class DistConfigObj : public ir::Object {
  int zero_opt_level = 0;
  int auto_dp_profiling_start_iter = 2;
  int auto_dp_profiling_end_iter = 4;
+  int64_t group_bucket_size = 5000000000;

  void VisitAttrs(tvm::AttrVisitor* v) {
    v->Visit("enable_data_parallel", &enable_data_parallel);
    v->Visit("zero_opt_level", &zero_opt_level);
    v->Visit("auto_dp_profiling_start_iter", &auto_dp_profiling_start_iter);
    v->Visit("auto_dp_profiling_end_iter", &auto_dp_profiling_end_iter);
+    v->Visit("group_bucket_size", &group_bucket_size);
  }

public:
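For scale: the 5,000,000,000-byte default means each grouped allgather moves at most about 4.66 GiB. A back-of-envelope sketch (plain arithmetic, not RAF API):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  constexpr int64_t kBucketBytes = 5000000000;  // default group_bucket_size
  constexpr int64_t kFp32Bytes = 4;             // bytes per float32 element
  // 1,250,000,000 float32 parameters fit into one grouped launch.
  std::printf("fp32 elements per bucket: %lld\n",
              static_cast<long long>(kBucketBytes / kFp32Bytes));
  return 0;
}
```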
34 changes: 34 additions & 0 deletions include/raf/file.h
@@ -0,0 +1,34 @@
/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0
*/

/*!
* \file file.h
* \brief File/Directory manipulation functions.
*/
#pragma once

#include <sys/stat.h>
#include <fstream>
#include <cerrno>
#include <cstring>
#include "dmlc/logging.h"

namespace raf {
inline void CreateDir(const std::string& path) {
  if (mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH) == -1) {
    if (errno != EEXIST) {
      LOG(FATAL) << "Failed to create directory " << path << ": " << strerror(errno);
      throw;
    }
  }
}

inline bool DirExists(const std::string& path) {
  std::ifstream ifs(path);
  auto ret = ifs.good();
  ifs.close();
  return ret;
}
}  // namespace raf
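A minimal usage sketch of the relocated helpers; the cache path and function name below are made up for illustration:

```cpp
#include <string>
#include "raf/file.h"

// Hypothetical caller: make sure a persist-cache directory exists.
void EnsureCacheDir() {
  const std::string path = "/tmp/raf_persist_cache";  // illustrative path
  if (!raf::DirExists(path)) {
    raf::CreateDir(path);  // creates one level only (no mkdir -p semantics)
  }
}
```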
10 changes: 7 additions & 3 deletions include/raf/op.h
Expand Up @@ -32,8 +32,6 @@ class Requests;
namespace raf {
namespace op {

extern std::vector<std::string> dispatch_error_msgs;

class CallValuesNode : public ir::Object {
public:
mutable value::Value callee;
@@ -79,6 +77,12 @@ class OpEnv {
*/
virtual void Execute(const std::vector<value::Value>& inputs, value::Value output) = 0;

+  /*! \brief Whether this OpEnv is valid. */
+  std::vector<std::string> error_msgs;
+  bool HasError() {
+    return !error_msgs.empty();
+  }
+
void RequestWorkspace(void** dest, const Device& device, int64_t nbytes);
void RequestStream(void** dest, const Device& device, int tag_idx);
void RequestDistributed(void** dest, const std::string& name, const value::Value rank_list);
@@ -333,7 +337,7 @@ inline T GetOpAttrOrDefault(const ir::Op& op, const std::string attr_name, T def
static constexpr const char* _type_key = type_key; \
RAF_FINAL_OBJECT(class_name, ::tvm::BaseAttrsNode) \
template <typename FVisit> \
-  void __VisitAttrs__(FVisit& __fvisit__) {                           \
+  void _tvm_VisitAttrs(FVisit& _tvm_fvisit) {                         \
}

#define RAF_OP_GRAD(op_name, body) \
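Dispatch errors move from the global dispatch_error_msgs vector onto the OpEnv that failed. A hedged sketch of a consumer; only error_msgs and HasError() come from this diff, while the helper name and logging context are assumptions:

```cpp
#include "raf/op.h"

// Sketch: surface per-OpEnv dispatch messages instead of a global list.
// LOG(WARNING) is assumed available via raf/op.h's dmlc dependencies.
void WarnIfDispatchFailed(raf::op::OpEnv* env) {
  if (env != nullptr && env->HasError()) {
    for (const auto& msg : env->error_msgs) {
      LOG(WARNING) << "op dispatch: " << msg;
    }
  }
}
```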
6 changes: 6 additions & 0 deletions include/raf/op_utils.h
@@ -125,6 +125,12 @@ inline bool IsReshapeOp(const Op& op) {
  return IsInOpSet(op, reshape_ops);
}

+inline bool IsNonDeterministicOp(const Op& op) {
+  static std::unordered_set<Op, ObjectPtrHash, ObjectPtrEqual> non_deterministic_ops{
+      Op::Get("raf.op._contrib_dropout"), Op::Get("raf.op._contrib_dropout_dx")};
+  return IsInOpSet(op, non_deterministic_ops);
+}
+
inline bool IsMemcpyOp(const Expr& op) {
  static OpSet memcpy_ops = {
      Op::Get("raf.op.fuse_tensor"),
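A plausible consumer, sketched under assumptions (this diff only adds the predicate; the pass wiring is not shown): passes that merge structurally identical calls, such as Deduplicate, must not merge nondeterministic ops, because two dropout calls with equal inputs still draw different masks.

```cpp
#include "raf/op_utils.h"

// Sketch: gate for a CSE/deduplication-style rewrite. Assumes the
// predicate lives in raf::op and Op is exposed under raf::ir, as with
// the surrounding op_utils.h helpers.
bool MayMergeCalls(const raf::ir::Op& op) {
  // Never treat two dropout calls as interchangeable.
  return !raf::op::IsNonDeterministicOp(op);
}
```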
6 changes: 6 additions & 0 deletions include/raf/pass.h
@@ -415,6 +415,12 @@ Pass IOSStreamSchedule();
Pass Deduplicate(int forward_steps, bool consider_type, bool must_dominate,
                 ir::Optional<ir::String> salt);

+/*!
+ * \brief This pass works on ANF and groups allgather operators for ZeRO.
+ * \return The created pass.
+ */
+Pass GroupAllgather();
+
// Helper functions

/*!
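The grouping rule the pass presumably applies, given group_bucket_size from dist_config.h above, is greedy bucketing: keep appending allgather payloads until the next one would overflow the byte budget. A standalone sketch of that rule (not the pass implementation):

```cpp
#include <cstdint>
#include <vector>

// Greedily pack tensor byte-sizes into buckets of at most bucket_bytes.
// An oversized tensor still occupies a bucket of its own.
std::vector<std::vector<int64_t>> GroupAllgatherSizes(
    const std::vector<int64_t>& tensor_bytes, int64_t bucket_bytes) {
  std::vector<std::vector<int64_t>> buckets(1);
  int64_t used = 0;
  for (int64_t n : tensor_bytes) {
    if (!buckets.back().empty() && used + n > bucket_bytes) {
      buckets.emplace_back();  // start a new bucket
      used = 0;
    }
    buckets.back().push_back(n);
    used += n;
  }
  return buckets;
}
```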
33 changes: 0 additions & 33 deletions include/raf/profiler.h
@@ -251,38 +251,5 @@ inline void ProfilerHelper::collect() {
  Profiler::Get()->AddNewProfileStat(categories_, name_, start_time_, end_time_, args);
}

-/*!
- * \brief A helper class and macro to profile the execution time of a scope (e.g., function).
- * This is used for debugging purpose. For example:
- *   void some_func() {
- *     RAF_TIMED_SEC("some_func")
- *     // do something;
- *   }
- * The profiled time is then the life time of the created TimedSection object, and will be
- * logged to stderr.
- */
-class TimedSection {
- public:
-  explicit TimedSection(std::string name, bool in_us = false)
-      : name_(name), start_(ProfileStat::NowInMicrosec()), in_us_(in_us) {
-  }
-
-  ~TimedSection() {
-    auto timed = ProfileStat::NowInMicrosec() - start_;
-    if (in_us_) {
-      LOG(INFO) << "Timed " << name_ << ": " << timed << "us";
-    } else {
-      LOG(INFO) << "Timed " << name_ << ": " << std::setprecision(2) << timed / 1000000.0 << "s";
-    }
-  }
-
- private:
-  std::string name_;
-  uint64_t start_;
-  bool in_us_;
-};
-#define RAF_TIMED_SEC(name) raf::profiler::TimedSection timed_section(name);
-#define RAF_TIMED_US(name) raf::profiler::TimedSection timed_section(name, true);
-
} // namespace profiler
} // namespace raf