Skip to content

Commit

Permalink
Merge simplifying the executor copy interface
Browse files Browse the repository at this point in the history
This simplifies the executor copy interface by adding an executor-local copy() and
a single-element copy_val_to_host() function.

Related PR: #516
  • Loading branch information
upsj authored Apr 30, 2020
2 parents 54c2f09 + e7608fb commit 4df4dea
Show file tree
Hide file tree
Showing 19 changed files with 118 additions and 115 deletions.
13 changes: 4 additions & 9 deletions core/factorization/ilu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);

const auto exec = this->get_executor();
const auto host_exec = exec->get_master();

// Converts the system matrix to CSR.
// Throws an exception if it is not convertible.
Expand All @@ -92,15 +91,11 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
local_system_matrix.get(), l_row_ptrs.get_data(),
u_row_ptrs.get_data()));

IndexType l_nnz_it{};
IndexType u_nnz_it{};
// Get nnz from device memory
host_exec->copy_from(exec.get(), 1, l_row_ptrs.get_data() + num_rows,
&l_nnz_it);
host_exec->copy_from(exec.get(), 1, u_row_ptrs.get_data() + num_rows,
&u_nnz_it);
auto l_nnz = static_cast<size_type>(l_nnz_it);
auto u_nnz = static_cast<size_type>(u_nnz_it);
auto l_nnz = static_cast<size_type>(
exec->copy_val_to_host(l_row_ptrs.get_data() + num_rows));
auto u_nnz = static_cast<size_type>(
exec->copy_val_to_host(u_row_ptrs.get_data() + num_rows));

// Init arrays
Array<IndexType> l_col_idxs{exec, l_nnz};
Expand Down
15 changes: 5 additions & 10 deletions core/factorization/par_ilu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,6 @@ ParIlu<ValueType, IndexType>::generate_l_u(
GKO_ASSERT_IS_SQUARE_MATRIX(system_matrix);

const auto exec = this->get_executor();
const auto host_exec = exec->get_master();

// Converts the system matrix to CSR.
// Throws an exception if it is not convertible.
Expand All @@ -104,15 +103,11 @@ ParIlu<ValueType, IndexType>::generate_l_u(
exec->run(par_ilu_factorization::make_initialize_row_ptrs_l_u(
csr_system_matrix, l_row_ptrs.get_data(), u_row_ptrs.get_data()));

IndexType l_nnz_it;
IndexType u_nnz_it;
// Since nnz is always at row_ptrs[m], it can be extracted easily
host_exec->copy_from(exec.get(), 1, l_row_ptrs.get_data() + number_rows,
&l_nnz_it);
host_exec->copy_from(exec.get(), 1, u_row_ptrs.get_data() + number_rows,
&u_nnz_it);
auto l_nnz = static_cast<size_type>(l_nnz_it);
auto u_nnz = static_cast<size_type>(u_nnz_it);
// Get nnz from device memory
auto l_nnz = static_cast<size_type>(
exec->copy_val_to_host(l_row_ptrs.get_data() + number_rows));
auto u_nnz = static_cast<size_type>(
exec->copy_val_to_host(u_row_ptrs.get_data() + number_rows));

// Since `row_ptrs` of L and U is already created, the matrix can be
// directly created with it
Expand Down
16 changes: 14 additions & 2 deletions core/test/base/executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ TEST(OmpExecutor, CopiesData)
int *copy = omp->alloc<int>(num_elems);

// user code is run on the OMP, so local variables are in OMP memory
omp->copy_from(omp.get(), num_elems, orig, copy);
omp->copy(num_elems, orig, copy);
EXPECT_EQ(3, copy[0]);
EXPECT_EQ(8, copy[1]);

Expand Down Expand Up @@ -210,14 +210,26 @@ TEST(ReferenceExecutor, CopiesData)
int *copy = ref->alloc<int>(num_elems);

// ReferenceExecutor is a type of OMP executor, so this is O.K.
ref->copy_from(ref.get(), num_elems, orig, copy);
ref->copy(num_elems, orig, copy);
EXPECT_EQ(3, copy[0]);
EXPECT_EQ(8, copy[1]);

ref->free(copy);
}


TEST(ReferenceExecutor, CopiesSingleValue)
{
exec_ptr ref = gko::ReferenceExecutor::create();
int *el = ref->alloc<int>(1);
el[0] = 83683;

EXPECT_EQ(83683, ref->copy_val_to_host(el));

ref->free(el);
}


TEST(ReferenceExecutor, CopiesDataFromOmp)
{
int orig[] = {3, 8};
Expand Down
4 changes: 1 addition & 3 deletions cuda/components/reduction.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,7 @@ __host__ ValueType reduce_add_array(std::shared_ptr<const CudaExecutor> exec,
reduce_add_array<<<1, default_block_size>>>(
grid_dim, as_cuda_type(block_results_val),
as_cuda_type(d_result.get_data()));
ValueType answer = zero<ValueType>();
exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
&answer);
auto answer = exec->copy_val_to_host(d_result.get_const_data());
return answer;
}

Expand Down
5 changes: 2 additions & 3 deletions cuda/factorization/factorization_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -113,9 +113,8 @@ void add_diagonal_elements(std::shared_ptr<const CudaExecutor> exec,
prefix_sum(exec, cuda_row_ptrs_add, row_ptrs_size);
exec->synchronize();

IndexType total_additions{};
exec->get_master()->copy_from(
exec.get(), 1, cuda_row_ptrs_add + row_ptrs_size - 1, &total_additions);
auto total_additions =
exec->copy_val_to_host(cuda_row_ptrs_add + row_ptrs_size - 1);
size_type new_num_elems = static_cast<size_type>(total_additions) +
mtx->get_num_stored_elements();

Expand Down
16 changes: 5 additions & 11 deletions cuda/matrix/csr_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -516,9 +516,7 @@ void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
auto d_descr = cusparse::create_mat_descr();
auto info = cusparse::create_spgemm_info();

ValueType valpha{};
exec->get_master()->copy_from(exec.get(), 1, alpha->get_const_values(),
&valpha);
auto valpha = exec->copy_val_to_host(alpha->get_const_values());
auto a_nnz = IndexType(a->get_num_stored_elements());
auto a_vals = a->get_const_values();
auto a_row_ptrs = a->get_const_row_ptrs();
Expand All @@ -527,9 +525,7 @@ void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
auto b_vals = b->get_const_values();
auto b_row_ptrs = b->get_const_row_ptrs();
auto b_col_idxs = b->get_const_col_idxs();
ValueType vbeta{};
exec->get_master()->copy_from(exec.get(), 1, beta->get_const_values(),
&vbeta);
auto vbeta = exec->copy_val_to_host(beta->get_const_values());
auto d_nnz = IndexType(d->get_num_stored_elements());
auto d_vals = d->get_const_values();
auto d_row_ptrs = d->get_const_row_ptrs();
Expand Down Expand Up @@ -767,8 +763,7 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
grid_dim, as_cuda_type(block_results.get_const_data()),
as_cuda_type(d_result.get_data()));

exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
result);
*result = exec->copy_val_to_host(d_result.get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
Expand Down Expand Up @@ -900,8 +895,7 @@ void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
reduce_dim, as_cuda_type(block_results.get_const_data()),
as_cuda_type(d_result.get_data()));

exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
result);
*result = exec->copy_val_to_host(d_result.get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
Expand Down Expand Up @@ -983,7 +977,7 @@ void sort_by_column_index(std::shared_ptr<const CudaExecutor> exec,

// copy values
Array<ValueType> tmp_vals_array(exec, nnz);
exec->copy_from(exec.get(), nnz, vals, tmp_vals_array.get_data());
exec->copy(nnz, vals, tmp_vals_array.get_data());
auto tmp_vals = tmp_vals_array.get_const_data();

// init identity permutation
Expand Down
6 changes: 2 additions & 4 deletions cuda/matrix/dense_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -435,8 +435,7 @@ void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
grid_dim, as_cuda_type(block_results.get_const_data()),
as_cuda_type(d_result.get_data()));

exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
result);
*result = exec->copy_val_to_host(d_result.get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
Expand Down Expand Up @@ -500,8 +499,7 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
grid_dim, as_cuda_type(block_results.get_const_data()),
as_cuda_type(d_result.get_data()));

exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
result);
*result = exec->copy_val_to_host(d_result.get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
Expand Down
14 changes: 6 additions & 8 deletions cuda/preconditioner/isai_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,10 +74,9 @@ void generate_l_inverse(std::shared_ptr<const DefaultExecutor> exec,
const auto nnz = l_csr->get_num_stored_elements();
const auto num_rows = l_csr->get_size()[0];

exec->copy_from(exec.get(), nnz, l_csr->get_const_col_idxs(),
inverse_l->get_col_idxs());
exec->copy_from(exec.get(), num_rows + 1, l_csr->get_const_row_ptrs(),
inverse_l->get_row_ptrs());
exec->copy(nnz, l_csr->get_const_col_idxs(), inverse_l->get_col_idxs());
exec->copy(num_rows + 1, l_csr->get_const_row_ptrs(),
inverse_l->get_row_ptrs());


const dim3 block(default_block_size, 1, 1);
Expand All @@ -104,10 +103,9 @@ void generate_u_inverse(std::shared_ptr<const DefaultExecutor> exec,
const auto nnz = u_csr->get_num_stored_elements();
const auto num_rows = u_csr->get_size()[0];

exec->copy_from(exec.get(), nnz, u_csr->get_const_col_idxs(),
inverse_u->get_col_idxs());
exec->copy_from(exec.get(), num_rows + 1, u_csr->get_const_row_ptrs(),
inverse_u->get_row_ptrs());
exec->copy(nnz, u_csr->get_const_col_idxs(), inverse_u->get_col_idxs());
exec->copy(num_rows + 1, u_csr->get_const_row_ptrs(),
inverse_u->get_row_ptrs());


const dim3 block(default_block_size, 1, 1);
Expand Down
7 changes: 2 additions & 5 deletions cuda/stop/residual_norm_reduction_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -112,11 +112,8 @@ void residual_norm_reduction(std::shared_ptr<const CudaExecutor> exec,
as_cuda_type(device_storage->get_data()));

/* Represents all_converged, one_changed */
bool tmp[2] = {true, false};
exec->get_master()->copy_from(exec.get(), 2,
device_storage->get_const_data(), tmp);
*all_converged = tmp[0];
*one_changed = tmp[1];
*all_converged = exec->copy_val_to_host(device_storage->get_const_data());
*one_changed = exec->copy_val_to_host(device_storage->get_const_data() + 1);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);
Expand Down
16 changes: 8 additions & 8 deletions cuda/test/factorization/par_ilu_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -169,14 +169,14 @@ class ParIlu : public ::testing::Test {
*l_cuda = Csr::create(cuda, csr_cuda->get_size(), l_nnz);
*u_cuda = Csr::create(cuda, csr_cuda->get_size(), u_nnz);
// Copy the already initialized `row_ptrs` to the new matrices
ref->copy_from(gko::lend(ref), num_row_ptrs, l_row_ptrs_ref.get_data(),
(*l_ref)->get_row_ptrs());
ref->copy_from(gko::lend(ref), num_row_ptrs, u_row_ptrs_ref.get_data(),
(*u_ref)->get_row_ptrs());
cuda->copy_from(gko::lend(cuda), num_row_ptrs,
l_row_ptrs_cuda.get_data(), (*l_cuda)->get_row_ptrs());
cuda->copy_from(gko::lend(cuda), num_row_ptrs,
u_row_ptrs_cuda.get_data(), (*u_cuda)->get_row_ptrs());
ref->copy(num_row_ptrs, l_row_ptrs_ref.get_data(),
(*l_ref)->get_row_ptrs());
ref->copy(num_row_ptrs, u_row_ptrs_ref.get_data(),
(*u_ref)->get_row_ptrs());
cuda->copy(num_row_ptrs, l_row_ptrs_cuda.get_data(),
(*l_cuda)->get_row_ptrs());
cuda->copy(num_row_ptrs, u_row_ptrs_cuda.get_data(),
(*u_cuda)->get_row_ptrs());

gko::kernels::reference::factorization::initialize_l_u(
ref, gko::lend(csr_ref), gko::lend(*l_ref), gko::lend(*u_ref));
Expand Down
4 changes: 1 addition & 3 deletions hip/components/reduction.hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,7 @@ __host__ ValueType reduce_add_array(std::shared_ptr<const HipExecutor> exec,
hipLaunchKernelGGL(reduce_add_array, dim3(1), dim3(default_block_size), 0,
0, grid_dim, as_hip_type(block_results_val),
as_hip_type(d_result.get_data()));
ValueType answer = zero<ValueType>();
exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
&answer);
auto answer = exec->copy_val_to_host(d_result.get_const_data());
return answer;
}

Expand Down
5 changes: 2 additions & 3 deletions hip/factorization/factorization_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,8 @@ void add_diagonal_elements(std::shared_ptr<const HipExecutor> exec,
prefix_sum(exec, hip_row_ptrs_add, row_ptrs_size);
exec->synchronize();

IndexType total_additions{};
exec->get_master()->copy_from(
exec.get(), 1, hip_row_ptrs_add + row_ptrs_size - 1, &total_additions);
auto total_additions =
exec->copy_val_to_host(hip_row_ptrs_add + row_ptrs_size - 1);
size_type new_num_elems = static_cast<size_type>(total_additions) +
mtx->get_num_stored_elements();

Expand Down
19 changes: 6 additions & 13 deletions hip/matrix/csr_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -556,8 +556,7 @@ void spgeam(syn::value_list<int, subwarp_size>,

// accumulate non-zeros for alpha * A + beta * B
matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
IndexType c_nnz{};
exec->get_master()->copy_from(exec.get(), 1, c_row_ptrs + m, &c_nnz);
auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);
c_builder.get_col_idx_array().resize_and_reset(c_nnz);
c_builder.get_value_array().resize_and_reset(c_nnz);
auto c_col_idxs = c->get_col_idxs();
Expand Down Expand Up @@ -650,12 +649,8 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
hipsparse::destroy(b_descr);
hipsparse::destroy(a_descr);

ValueType valpha{};
ValueType vbeta{};
exec->get_master()->copy_from(exec.get(), 1, alpha->get_const_values(),
&valpha);
exec->get_master()->copy_from(exec.get(), 1, beta->get_const_values(),
&vbeta);
auto valpha = exec->copy_val_to_host(alpha->get_const_values());
auto vbeta = exec->copy_val_to_host(beta->get_const_values());
auto total_nnz = c_nnz + d->get_num_stored_elements();
auto nnz_per_row = total_nnz / m;
select_spgeam(spgeam_kernels(),
Expand Down Expand Up @@ -870,8 +865,7 @@ void calculate_total_cols(std::shared_ptr<const HipExecutor> exec,
as_hip_type(block_results.get_const_data()),
as_hip_type(d_result.get_data()));

exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
result);
*result = exec->copy_val_to_host(d_result.get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
Expand Down Expand Up @@ -1006,8 +1000,7 @@ void calculate_max_nnz_per_row(std::shared_ptr<const HipExecutor> exec,
as_hip_type(block_results.get_const_data()),
as_hip_type(d_result.get_data()));

exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
result);
*result = exec->copy_val_to_host(d_result.get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
Expand Down Expand Up @@ -1094,7 +1087,7 @@ void sort_by_column_index(std::shared_ptr<const HipExecutor> exec,

// copy values
Array<ValueType> tmp_vals_array(exec, nnz);
exec->copy_from(exec.get(), nnz, vals, tmp_vals_array.get_data());
exec->copy(nnz, vals, tmp_vals_array.get_data());
auto tmp_vals = tmp_vals_array.get_const_data();

// init identity permutation
Expand Down
6 changes: 2 additions & 4 deletions hip/matrix/dense_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -454,8 +454,7 @@ void calculate_max_nnz_per_row(std::shared_ptr<const HipExecutor> exec,
as_hip_type(block_results.get_const_data()),
as_hip_type(d_result.get_data()));

exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
result);
*result = exec->copy_val_to_host(d_result.get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
Expand Down Expand Up @@ -522,8 +521,7 @@ void calculate_total_cols(std::shared_ptr<const HipExecutor> exec,
as_hip_type(block_results.get_const_data()),
as_hip_type(d_result.get_data()));

exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
result);
*result = exec->copy_val_to_host(d_result.get_const_data());
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
Expand Down
14 changes: 6 additions & 8 deletions hip/preconditioner/isai_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,10 +77,9 @@ void generate_l_inverse(std::shared_ptr<const DefaultExecutor> exec,
const auto nnz = l_csr->get_num_stored_elements();
const auto num_rows = l_csr->get_size()[0];

exec->copy_from(exec.get(), nnz, l_csr->get_const_col_idxs(),
inverse_l->get_col_idxs());
exec->copy_from(exec.get(), num_rows + 1, l_csr->get_const_row_ptrs(),
inverse_l->get_row_ptrs());
exec->copy(nnz, l_csr->get_const_col_idxs(), inverse_l->get_col_idxs());
exec->copy(num_rows + 1, l_csr->get_const_row_ptrs(),
inverse_l->get_row_ptrs());

const dim3 block(default_block_size, 1, 1);
const dim3 grid(ceildiv(num_rows, block.x / config::warp_size), 1, 1);
Expand All @@ -107,10 +106,9 @@ void generate_u_inverse(std::shared_ptr<const DefaultExecutor> exec,
const auto nnz = u_csr->get_num_stored_elements();
const auto num_rows = u_csr->get_size()[0];

exec->copy_from(exec.get(), nnz, u_csr->get_const_col_idxs(),
inverse_u->get_col_idxs());
exec->copy_from(exec.get(), num_rows + 1, u_csr->get_const_row_ptrs(),
inverse_u->get_row_ptrs());
exec->copy(nnz, u_csr->get_const_col_idxs(), inverse_u->get_col_idxs());
exec->copy(num_rows + 1, u_csr->get_const_row_ptrs(),
inverse_u->get_row_ptrs());

const dim3 block(default_block_size, 1, 1);
const dim3 grid(ceildiv(num_rows, block.x / config::warp_size), 1, 1);
Expand Down
7 changes: 2 additions & 5 deletions hip/stop/residual_norm_reduction_kernels.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,8 @@ void residual_norm_reduction(std::shared_ptr<const HipExecutor> exec,
as_hip_type(device_storage->get_data()));

/* Represents all_converged, one_changed */
bool tmp[2] = {true, false};
exec->get_master()->copy_from(exec.get(), 2,
device_storage->get_const_data(), tmp);
*all_converged = tmp[0];
*one_changed = tmp[1];
*all_converged = exec->copy_val_to_host(device_storage->get_const_data());
*one_changed = exec->copy_val_to_host(device_storage->get_const_data() + 1);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_RESIDUAL_NORM_REDUCTION_KERNEL);
Expand Down
Loading

0 comments on commit 4df4dea

Please sign in to comment.