Skip to content
This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Enable Large Tensor Support: Stage 1 #18625

Merged
merged 8 commits into from
Nov 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 14 additions & 14 deletions 3rdparty/mshadow/mshadow/dot_engine-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -314,20 +314,20 @@ struct BLASEngine<cpu, float> {
#if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000)
// since same m/n/k is used for all single gemms, so we put all gemms into one group
const int GROUP_SIZE = 1;
MKL_INT p_m[GROUP_SIZE] = {m};
MKL_INT p_n[GROUP_SIZE] = {n};
MKL_INT p_k[GROUP_SIZE] = {k};
MKL_INT p_lda[GROUP_SIZE] = {lda};
MKL_INT p_ldb[GROUP_SIZE] = {ldb};
MKL_INT p_ldc[GROUP_SIZE] = {ldc};
MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)};
MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)};
MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)};
MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)};
MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)};
MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)};

float p_alpha[GROUP_SIZE] = {alpha};
float p_beta[GROUP_SIZE] = {beta};

CBLAS_TRANSPOSE cblas_a_trans = GetT(transa);
CBLAS_TRANSPOSE cblas_b_trans = GetT(transb);

MKL_INT p_group_sizeb[GROUP_SIZE] = {batch_count};
MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batch_count)};
CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans};
CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans};

Expand Down Expand Up @@ -423,20 +423,20 @@ struct BLASEngine<cpu, double> {
#if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000)
// since same m/n/k is used for all single gemms, so we put all gemms into one group
const int GROUP_SIZE = 1;
MKL_INT p_m[GROUP_SIZE] = {m};
MKL_INT p_n[GROUP_SIZE] = {n};
MKL_INT p_k[GROUP_SIZE] = {k};
MKL_INT p_lda[GROUP_SIZE] = {lda};
MKL_INT p_ldb[GROUP_SIZE] = {ldb};
MKL_INT p_ldc[GROUP_SIZE] = {ldc};
MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)};
MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)};
MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)};
MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)};
MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)};
MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)};

double p_alpha[GROUP_SIZE] = {alpha};
double p_beta[GROUP_SIZE] = {beta};

CBLAS_TRANSPOSE cblas_a_trans = GetT(transa);
CBLAS_TRANSPOSE cblas_b_trans = GetT(transb);

MKL_INT p_group_sizeb[GROUP_SIZE] = {batch_count};
MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batch_count)};
CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans};
CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans};

Expand Down
4 changes: 3 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,6 @@ option(USE_SIGNAL_HANDLER "Print stack traces on segfaults." ON)
option(USE_TENSORRT "Enable inference optimization with TensorRT." OFF)
option(USE_ASAN "Enable Clang/GCC ASAN sanitizers." OFF)
cmake_dependent_option(ENABLE_TESTCOVERAGE "Enable compilation with test coverage metric output" OFF "NOT MSVC" OFF)
option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" OFF)
option(BUILD_EXTENSION_PATH "Path to extension to build" "")
option(BUILD_CYTHON_MODULES "Build cython modules." OFF)
option(LOG_FATAL_THROW "Log exceptions but do not abort" ON)
Expand Down Expand Up @@ -306,6 +305,8 @@ endif()
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)

cmake_dependent_option(USE_INT64_TENSOR_SIZE "Use int64_t to represent the total number of elements in a tensor" ON "CMAKE_SIZEOF_VOID_P EQUAL 8" OFF)

include(cmake/ChooseBlas.cmake)

if(USE_ASAN)
Expand Down Expand Up @@ -984,3 +985,4 @@ if(BUILD_CYTHON_MODULES)
message(FATAL_ERROR "No python interpreter found to build cython modules")
endif()
endif()

7 changes: 4 additions & 3 deletions ci/docker/runtime_functions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,7 @@ build_centos7_cpu() {
-DUSE_DIST_KVSTORE=ON \
-DUSE_CUDA=OFF \
-DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
-DUSE_INT64_TENSOR_SIZE=OFF \
-G Ninja /work/mxnet
ninja
}
Expand All @@ -282,6 +283,7 @@ build_centos7_mkldnn() {
-DUSE_MKL_IF_AVAILABLE=OFF \
-DUSE_MKLDNN=ON \
-DUSE_CUDA=OFF \
-DUSE_INT64_TENSOR_SIZE=OFF \
-G Ninja /work/mxnet
ninja
}
Expand All @@ -298,8 +300,9 @@ build_centos7_gpu() {
-DUSE_MKLDNN=ON \
-DUSE_CUDA=ON \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-DUSE_DIST_KVSTORE=ON\
-DUSE_DIST_KVSTORE=ON \
-DBUILD_EXTENSION_PATH=/work/mxnet/example/extensions/lib_external_ops \
-DUSE_INT64_TENSOR_SIZE=OFF \
-G Ninja /work/mxnet
ninja
}
Expand Down Expand Up @@ -694,7 +697,6 @@ build_ubuntu_cpu_large_tensor() {
-DUSE_CUDA=OFF \
-DUSE_CUDNN=OFF \
-DUSE_MKLDNN=ON \
-DUSE_INT64_TENSOR_SIZE=ON \
-G Ninja \
/work/mxnet

Expand All @@ -714,7 +716,6 @@ build_ubuntu_gpu_large_tensor() {
-DUSE_DIST_KVSTORE=ON \
-DCMAKE_BUILD_TYPE=Release \
-DMXNET_CUDA_ARCH="$CI_CMAKE_CUDA_ARCH" \
-DUSE_INT64_TENSOR_SIZE=ON \
-G Ninja \
/work/mxnet

Expand Down
2 changes: 1 addition & 1 deletion config/darwin.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package")
# This will cause performance degradation reported in issue #14496
# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647
# Note: the size of each dimension is still bounded by INT32_MAX
set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")

# Other GPU features
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
Expand Down
2 changes: 1 addition & 1 deletion config/linux.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package")
# This will cause performance degradation reported in issue #14496
# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647
# Note: the size of each dimension is still bounded by INT32_MAX
set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")

# Other GPU features
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
Expand Down
2 changes: 1 addition & 1 deletion config/linux_gpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package")
# This will cause performance degradation reported in issue #14496
# Set to 1 for large tensor with tensor size greater than INT32_MAX i.e. 2147483647
# Note: the size of each dimension is still bounded by INT32_MAX
set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
set(USE_INT64_TENSOR_SIZE ON CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")

# Other GPU features
set(USE_NCCL "Use NVidia NCCL with CUDA" OFF)
Expand Down
14 changes: 7 additions & 7 deletions src/operator/contrib/transformer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -140,20 +140,20 @@ void strided_batch_sgemm(bool transA, bool transB,

#if (MSHADOW_USE_MKL && INTEL_MKL_VERSION >= 20160000)
const int GROUP_SIZE = 1;
MKL_INT p_m[GROUP_SIZE] = {m};
MKL_INT p_n[GROUP_SIZE] = {n};
MKL_INT p_k[GROUP_SIZE] = {k};
MKL_INT p_lda[GROUP_SIZE] = {lda};
MKL_INT p_ldb[GROUP_SIZE] = {ldb};
MKL_INT p_ldc[GROUP_SIZE] = {ldc};
MKL_INT p_m[GROUP_SIZE] = {static_cast<MKL_INT>(m)};
MKL_INT p_n[GROUP_SIZE] = {static_cast<MKL_INT>(n)};
MKL_INT p_k[GROUP_SIZE] = {static_cast<MKL_INT>(k)};
MKL_INT p_lda[GROUP_SIZE] = {static_cast<MKL_INT>(lda)};
MKL_INT p_ldb[GROUP_SIZE] = {static_cast<MKL_INT>(ldb)};
MKL_INT p_ldc[GROUP_SIZE] = {static_cast<MKL_INT>(ldc)};

float p_alpha[GROUP_SIZE] = {alpha};
float p_beta[GROUP_SIZE] = {beta};

CBLAS_TRANSPOSE cblas_a_trans = transA ? CblasTrans : CblasNoTrans;
CBLAS_TRANSPOSE cblas_b_trans = transB ? CblasTrans : CblasNoTrans;

MKL_INT p_group_sizeb[GROUP_SIZE] = {batchCount};
MKL_INT p_group_sizeb[GROUP_SIZE] = {static_cast<MKL_INT>(batchCount)};
CBLAS_TRANSPOSE p_transa[GROUP_SIZE] = {cblas_a_trans};
CBLAS_TRANSPOSE p_transb[GROUP_SIZE] = {cblas_b_trans};

Expand Down
2 changes: 1 addition & 1 deletion src/operator/numpy/np_insert_op_slice-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ void NumpyInsertSliceCompute(const nnvm::NodeAttrs& attrs,
CHECK((values.shape_[i] == 1) || (values.shape_[i] == sz));
}
size_t temp_storage_bytes, temp_mem_size;
temp_storage_bytes = SortByKeyWorkspaceSize<int64_t, int, xpu>(indices_len, false, true);
temp_storage_bytes = SortByKeyWorkspaceSize<int64_t, index_t, xpu>(indices_len, false, true);
temp_mem_size = indices_len * sizeof(int64_t) * 2 +
indices_len * sizeof(index_t) +
outshape[axis] * sizeof(index_t) * 2 +
Expand Down
3 changes: 2 additions & 1 deletion src/operator/numpy/np_polynomial_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ struct polyval_backward_p {
DType igrad_p = 0;
index_t j = x_size - 1;
while (j >= 0) {
igrad_p += pow(x_dptr[j], p_size - i - 1) * ograd_dptr[j];
igrad_p += pow(x_dptr[j], static_cast<DType>(p_size) -
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to confirm: is there unit test coverage for this change?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This prevents a Windows CI build failure.

static_cast<DType>(i + 1)) * ograd_dptr[j];
j--;
}
KERNEL_ASSIGN(igrad_p_dptr[i], req, igrad_p);
Expand Down