Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make HostDeviceVector single gpu only #4773

Merged
merged 29 commits into from
Aug 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
62efe25
make HostDeviceVector single gpu only
rongou Aug 14, 2019
23c6272
get non-tests to compile
rongou Aug 14, 2019
c9dccbb
make code compile
rongou Aug 15, 2019
9e1ea22
fixed some tests
rongou Aug 16, 2019
e89cf17
fix more tests
rongou Aug 16, 2019
ab302d2
make n_gpus private
rongou Aug 16, 2019
962155d
fix one gpu predictor test
rongou Aug 16, 2019
57be3a8
Merge remote-tracking branch 'upstream/master' into single-gpu-hdv
rongou Aug 19, 2019
ddbfc8b
fix after merge
rongou Aug 19, 2019
5959ca3
fix gpu predictor external memory code
rongou Aug 19, 2019
4a6f38f
fix cpu compile
rongou Aug 19, 2019
f51a2fa
Merge branch 'master' into single-gpu-hdv
rongou Aug 20, 2019
352fe00
fix mgpu tests
rongou Aug 20, 2019
2b0a981
fix python cpu test
rongou Aug 20, 2019
3797f57
fix
rongou Aug 20, 2019
dac2608
better initialization
rongou Aug 20, 2019
97e74fc
Merge branch 'master' into single-gpu-hdv
rongou Aug 21, 2019
888b202
fix base margin in gpu predictor
rongou Aug 21, 2019
fdda5b3
remove GPUDistribution
rongou Aug 21, 2019
3b55244
remove GPUSet
rongou Aug 21, 2019
4837b3a
fix cpu build
rongou Aug 21, 2019
7124923
remove reference to n_gpus
rongou Aug 21, 2019
f1930c4
Merge branch 'master' into single-gpu-hdv
rongou Aug 22, 2019
974aeed
remove sharding in host device vector
rongou Aug 23, 2019
f02f084
fix windows build
rongou Aug 23, 2019
ab700bd
clean up hist_util
rongou Aug 23, 2019
2ffcf54
clean up gpu coordinate updater
rongou Aug 23, 2019
3107dca
clean up gpu predictor
rongou Aug 23, 2019
d6520e4
more clean up of gpu predictor
rongou Aug 24, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions demo/c-api/c-api-demo.c
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,12 @@ int main(int argc, char** argv) {
// https://xgboost.readthedocs.io/en/latest/parameter.html
safe_xgboost(XGBoosterSetParam(booster, "tree_method", use_gpu ? "gpu_hist" : "hist"));
if (use_gpu) {
// set the number of GPUs and the first GPU to use;
// set the GPU to use;
// this is not necessary, but provided here as an illustration
safe_xgboost(XGBoosterSetParam(booster, "n_gpus", "1"));
safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "0"));
} else {
// avoid evaluating objective and metric on a GPU
safe_xgboost(XGBoosterSetParam(booster, "n_gpus", "0"));
safe_xgboost(XGBoosterSetParam(booster, "gpu_id", "-1"));
}

safe_xgboost(XGBoosterSetParam(booster, "objective", "binary:logistic"));
Expand Down
13 changes: 8 additions & 5 deletions include/xgboost/generic_parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,8 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
// number of threads to use if OpenMP is enabled
// if equals 0, use system default
int nthread;
// primary device.
// primary device, -1 means no gpu.
int gpu_id;
// number of devices to use, -1 implies using all available devices.
int n_gpus;
// declare parameters
DMLC_DECLARE_PARAMETER(GenericParameter) {
DMLC_DECLARE_FIELD(seed).set_default(0).describe(
Expand All @@ -36,15 +34,20 @@ struct GenericParameter : public dmlc::Parameter<GenericParameter> {
DMLC_DECLARE_FIELD(nthread).set_default(0).describe(
"Number of threads to use.");
DMLC_DECLARE_FIELD(gpu_id)
.set_default(0)
.set_default(-1)
.set_lower_bound(-1)
.describe("The primary GPU device ordinal.");
DMLC_DECLARE_FIELD(n_gpus)
.set_default(0)
.set_range(0, 1)
.set_range(0, 0)
.describe("Deprecated. Single process multi-GPU training is no longer supported. "
"Please switch to distributed training with one process per GPU. "
"This can be done using Dask or Spark.");
}

private:
// number of devices to use (deprecated).
int n_gpus;
};
} // namespace xgboost

Expand Down
4 changes: 2 additions & 2 deletions plugin/example/custom_obj.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ class MyLogistic : public ObjFunction {
void PredTransform(HostDeviceVector<bst_float> *io_preds) override {
// transform margin value to probability.
std::vector<bst_float> &preds = io_preds->HostVector();
for (size_t i = 0; i < preds.size(); ++i) {
preds[i] = 1.0f / (1.0f + std::exp(-preds[i]));
for (auto& pred : preds) {
pred = 1.0f / (1.0f + std::exp(-pred));
}
}
bst_float ProbToMargin(bst_float base_score) const override {
Expand Down
40 changes: 2 additions & 38 deletions src/common/common.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,48 +22,12 @@ using RandomThreadLocalStore = dmlc::ThreadLocalStore<RandomThreadLocalEntry>;
GlobalRandomEngine& GlobalRandom() {
return RandomThreadLocalStore::Get()->engine;
}
} // namespace common

#if !defined(XGBOOST_USE_CUDA)
int AllVisibleImpl::AllVisible() {
int AllVisibleGPUs() {
return 0;
}
#endif // !defined(XGBOOST_USE_CUDA)

constexpr GPUSet::GpuIdType GPUSet::kAll;

GPUSet GPUSet::All(GpuIdType gpu_id, GpuIdType n_gpus, int32_t n_rows) {
CHECK_GE(gpu_id, 0) << "gpu_id must be >= 0.";
CHECK_GE(n_gpus, -1) << "n_gpus must be >= -1.";

GpuIdType const n_devices_visible = AllVisible().Size();
CHECK_LE(n_gpus, n_devices_visible);
if (n_devices_visible == 0 || n_gpus == 0 || n_rows == 0) {
LOG(DEBUG) << "Runing on CPU.";
return Empty();
}

GpuIdType const n_available_devices = n_devices_visible - gpu_id;

if (n_gpus == kAll) { // Use all devices starting from `gpu_id'.
CHECK(gpu_id < n_devices_visible)
<< "\ngpu_id should be less than number of visible devices.\ngpu_id: "
<< gpu_id
<< ", number of visible devices: "
<< n_devices_visible;
GpuIdType n_devices =
n_available_devices < n_rows ? n_available_devices : n_rows;
LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
return Range(gpu_id, n_devices);
} else { // Use devices in ( gpu_id, gpu_id + n_gpus ).
CHECK_LE(n_gpus, n_available_devices)
<< "Starting from gpu id: " << gpu_id << ", there are only "
<< n_available_devices << " available devices, while n_gpus is set to: "
<< n_gpus;
GpuIdType n_devices = n_gpus < n_rows ? n_gpus : n_rows;
LOG(DEBUG) << "GPU ID: " << gpu_id << ", Number of GPUs: " << n_devices;
return Range(gpu_id, n_devices);
}
}

} // namespace common
} // namespace xgboost
4 changes: 3 additions & 1 deletion src/common/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,9 @@
#include "common.h"

namespace xgboost {
namespace common {

int AllVisibleImpl::AllVisible() {
int AllVisibleGPUs() {
int n_visgpus = 0;
try {
// When compiled with CUDA but running on CPU only device,
Expand All @@ -17,4 +18,5 @@ int AllVisibleImpl::AllVisible() {
return n_visgpus;
}

} // namespace common
} // namespace xgboost
84 changes: 2 additions & 82 deletions src/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,88 +140,8 @@ class Range {
Iterator begin_;
Iterator end_;
};
} // namespace common

struct AllVisibleImpl {
static int AllVisible();
};
/* \brief set of devices across which HostDeviceVector can be distributed.
*
* Currently implemented as a range, but can be changed later to something else,
* e.g. a bitset
*/
class GPUSet {
public:
using GpuIdType = int;
static constexpr GpuIdType kAll = -1;

explicit GPUSet(int start = 0, int ndevices = 0)
: devices_(start, start + ndevices) {}

static GPUSet Empty() { return GPUSet(); }

static GPUSet Range(GpuIdType start, GpuIdType n_gpus) {
return n_gpus <= 0 ? Empty() : GPUSet{start, n_gpus};
}
/*! \brief n_gpus and num_rows both are upper bounds. */
static GPUSet All(GpuIdType gpu_id, GpuIdType n_gpus,
GpuIdType num_rows = std::numeric_limits<GpuIdType>::max());

static GPUSet AllVisible() {
GpuIdType n = AllVisibleImpl::AllVisible();
return Range(0, n);
}

size_t Size() const {
GpuIdType size = *devices_.end() - *devices_.begin();
GpuIdType res = size < 0 ? 0 : size;
return static_cast<size_t>(res);
}

/*
* By default, we have two configurations of identifying device, one
* is the device id obtained from `cudaGetDevice'. But we sometimes
* store objects that allocated one for each device in a list, which
* requires a zero-based index.
*
* Hence, `DeviceId' converts a zero-based index to actual device id,
* `Index' converts a device id to a zero-based index.
*/
GpuIdType DeviceId(size_t index) const {
GpuIdType result = *devices_.begin() + static_cast<GpuIdType>(index);
CHECK(Contains(result)) << "\nDevice " << result << " is not in GPUSet."
<< "\nIndex: " << index
<< "\nGPUSet: (" << *begin() << ", " << *end() << ")"
<< std::endl;
return result;
}
size_t Index(GpuIdType device) const {
CHECK(Contains(device)) << "\nDevice " << device << " is not in GPUSet."
<< "\nGPUSet: (" << *begin() << ", " << *end() << ")"
<< std::endl;
size_t result = static_cast<size_t>(device - *devices_.begin());
return result;
}

bool IsEmpty() const { return Size() == 0; }

bool Contains(GpuIdType device) const {
return *devices_.begin() <= device && device < *devices_.end();
}

common::Range::Iterator begin() const { return devices_.begin(); } // NOLINT
common::Range::Iterator end() const { return devices_.end(); } // NOLINT

friend bool operator==(const GPUSet& lhs, const GPUSet& rhs) {
return lhs.devices_ == rhs.devices_;
}
friend bool operator!=(const GPUSet& lhs, const GPUSet& rhs) {
return !(lhs == rhs);
}

private:
common::Range devices_;
};

int AllVisibleGPUs();
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_
18 changes: 1 addition & 17 deletions src/common/device_helpers.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -72,22 +72,6 @@ const T *Raw(const thrust::device_vector<T> &v) { // NOLINT
return raw_pointer_cast(v.data());
}

// if n_devices=-1, then use all visible devices
inline void SynchronizeNDevices(xgboost::GPUSet devices) {
devices = devices.IsEmpty() ? xgboost::GPUSet::AllVisible() : devices;
for (auto const d : devices) {
safe_cuda(cudaSetDevice(d));
safe_cuda(cudaDeviceSynchronize());
}
}

inline void SynchronizeAll() {
for (int device_idx : xgboost::GPUSet::AllVisible()) {
safe_cuda(cudaSetDevice(device_idx));
safe_cuda(cudaDeviceSynchronize());
}
}

inline size_t AvailableMemory(int device_idx) {
size_t device_free = 0;
size_t device_total = 0;
Expand Down Expand Up @@ -119,7 +103,7 @@ inline size_t MaxSharedMemory(int device_idx) {
}

inline void CheckComputeCapability() {
for (int d_idx : xgboost::GPUSet::AllVisible()) {
for (int d_idx = 0; d_idx < xgboost::common::AllVisibleGPUs(); ++d_idx) {
cudaDeviceProp prop;
safe_cuda(cudaGetDeviceProperties(&prop, d_idx));
std::ostringstream oss;
Expand Down
Loading