-
Notifications
You must be signed in to change notification settings - Fork 91
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
finish candidate add, add cuda kernels
- Loading branch information
Showing
10 changed files
with
1,008 additions
and
213 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,237 @@ | ||
/*******************************<GINKGO LICENSE>****************************** | ||
Copyright (c) 2017-2020, the Ginkgo authors | ||
All rights reserved. | ||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions | ||
are met: | ||
1. Redistributions of source code must retain the above copyright | ||
notice, this list of conditions and the following disclaimer. | ||
2. Redistributions in binary form must reproduce the above copyright | ||
notice, this list of conditions and the following disclaimer in the | ||
documentation and/or other materials provided with the distribution. | ||
3. Neither the name of the copyright holder nor the names of its | ||
contributors may be used to endorse or promote products derived from | ||
this software without specific prior written permission. | ||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS | ||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | ||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A | ||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
******************************<GINKGO LICENSE>*******************************/ | ||
|
||
#include "core/factorization/par_ilut_kernels.hpp" | ||
|
||
|
||
#include <algorithm> | ||
|
||
|
||
#include <ginkgo/core/base/array.hpp> | ||
#include <ginkgo/core/base/math.hpp> | ||
#include <ginkgo/core/matrix/coo.hpp> | ||
#include <ginkgo/core/matrix/csr.hpp> | ||
#include <ginkgo/core/matrix/dense.hpp> | ||
|
||
|
||
#include "core/components/prefix_sum.hpp" | ||
#include "core/factorization/par_ilut_kernels.hpp" | ||
#include "core/matrix/coo_builder.hpp" | ||
#include "core/matrix/csr_builder.hpp" | ||
#include "core/matrix/csr_kernels.hpp" | ||
#include "core/synthesizer/implementation_selection.hpp" | ||
#include "cuda/base/config.hpp" | ||
#include "cuda/base/math.hpp" | ||
#include "cuda/base/types.hpp" | ||
#include "cuda/components/atomic.cuh" | ||
#include "cuda/components/cooperative_groups.cuh" | ||
#include "cuda/components/intrinsics.cuh" | ||
#include "cuda/components/prefix_sum.cuh" | ||
#include "cuda/components/sorting.cuh" | ||
|
||
|
||
namespace gko { | ||
namespace kernels { | ||
namespace cuda { | ||
/** | ||
* @brief The parallel ILUT factorization namespace. | ||
* | ||
* @ingroup factor | ||
*/ | ||
namespace par_ilut_factorization { | ||
|
||
|
||
constexpr auto default_block_size = 512; | ||
constexpr auto items_per_thread = 2; | ||
|
||
|
||
// subwarp sizes for filter kernels | ||
using compiled_kernels = | ||
syn::value_list<int, 1, 2, 4, 8, 16, 32, config::warp_size>; | ||
|
||
|
||
#include "common/factorization/par_ilut_filter_kernels.hpp.inc" | ||
#include "common/factorization/par_ilut_select_kernels.hpp.inc" | ||
|
||
|
||
template <typename ValueType, typename IndexType> | ||
void ssss_count(const ValueType *values, IndexType size, | ||
remove_complex<ValueType> *tree, unsigned char *oracles, | ||
IndexType *partial_counts, IndexType *total_counts) | ||
{ | ||
constexpr auto bucket_count = kernel::searchtree_width; | ||
auto num_threads_total = ceildiv(size, items_per_thread); | ||
auto num_blocks = | ||
static_cast<IndexType>(ceildiv(num_threads_total, default_block_size)); | ||
// pick sample, build searchtree | ||
kernel::build_searchtree<<<1, bucket_count>>>(as_cuda_type(values), size, | ||
tree); | ||
// determine bucket sizes | ||
kernel::count_buckets<<<num_blocks, default_block_size>>>( | ||
as_cuda_type(values), size, tree, partial_counts, oracles, | ||
items_per_thread); | ||
// compute prefix sum and total sum over block-local values | ||
kernel::block_prefix_sum<<<bucket_count, default_block_size>>>( | ||
partial_counts, total_counts, num_blocks); | ||
// compute prefix sum over bucket counts | ||
start_prefix_sum<bucket_count><<<1, bucket_count>>>( | ||
bucket_count, total_counts, total_counts + bucket_count); | ||
} | ||
|
||
|
||
namespace { | ||
|
||
|
||
template <int subwarp_size, typename ValueType, typename IndexType> | ||
void threshold_filter_approx(syn::value_list<int, subwarp_size>, | ||
std::shared_ptr<const CudaExecutor> exec, | ||
const matrix::Csr<ValueType, IndexType> *m, | ||
IndexType rank, Array<ValueType> *tmp, | ||
matrix::Csr<ValueType, IndexType> *m_out, | ||
matrix::Coo<ValueType, IndexType> *m_out_coo) | ||
{ | ||
auto values = m->get_const_values(); | ||
IndexType size = m->get_num_stored_elements(); | ||
using AbsType = remove_complex<ValueType>; | ||
constexpr auto bucket_count = kernel::searchtree_width; | ||
auto max_num_threads = ceildiv(size, items_per_thread); | ||
auto max_num_blocks = ceildiv(max_num_threads, default_block_size); | ||
|
||
size_type tmp_size_totals = | ||
ceildiv((bucket_count + 1) * sizeof(IndexType), sizeof(ValueType)); | ||
size_type tmp_size_partials = ceildiv( | ||
bucket_count * max_num_blocks * sizeof(IndexType), sizeof(ValueType)); | ||
size_type tmp_size_oracles = | ||
ceildiv(size * sizeof(unsigned char), sizeof(ValueType)); | ||
size_type tmp_size_tree = | ||
ceildiv(kernel::searchtree_size * sizeof(AbsType), sizeof(ValueType)); | ||
size_type tmp_size = | ||
tmp_size_totals + tmp_size_partials + tmp_size_oracles + tmp_size_tree; | ||
tmp->resize_and_reset(tmp_size); | ||
|
||
auto total_counts = reinterpret_cast<IndexType *>(tmp->get_data()); | ||
auto partial_counts = | ||
reinterpret_cast<IndexType *>(tmp->get_data() + tmp_size_totals); | ||
auto oracles = reinterpret_cast<unsigned char *>( | ||
tmp->get_data() + tmp_size_totals + tmp_size_partials); | ||
auto tree = | ||
reinterpret_cast<AbsType *>(tmp->get_data() + tmp_size_totals + | ||
tmp_size_partials + tmp_size_oracles); | ||
|
||
ssss_count(values, size, tree, oracles, partial_counts, total_counts); | ||
|
||
// determine bucket with correct rank | ||
auto total_counts_array = | ||
Array<IndexType>::view(exec, bucket_count + 1, total_counts); | ||
Array<IndexType> splitter_ranks_array(exec->get_master(), | ||
total_counts_array); | ||
auto splitter_ranks = splitter_ranks_array.get_const_data(); | ||
auto it = std::upper_bound(splitter_ranks, | ||
splitter_ranks + bucket_count + 1, rank); | ||
auto bucket = | ||
static_cast<unsigned char>(std::distance(splitter_ranks + 1, it)); | ||
|
||
// filter the elements | ||
auto old_row_ptrs = m->get_const_row_ptrs(); | ||
auto old_col_idxs = m->get_const_col_idxs(); | ||
auto old_vals = m->get_const_values(); | ||
// compute nnz for each row | ||
auto num_rows = static_cast<IndexType>(m->get_size()[0]); | ||
auto block_size = default_block_size / subwarp_size; | ||
auto num_blocks = ceildiv(num_rows, block_size); | ||
auto new_row_ptrs = m_out->get_row_ptrs(); | ||
kernel::bucket_filter_nnz<subwarp_size><<<num_blocks, default_block_size>>>( | ||
old_row_ptrs, oracles, num_rows, bucket, new_row_ptrs); | ||
|
||
// build row pointers | ||
prefix_sum(exec, new_row_ptrs, num_rows + 1); | ||
|
||
// build matrix | ||
IndexType new_nnz{}; | ||
exec->get_master()->copy_from(exec.get(), 1, new_row_ptrs + num_rows, | ||
&new_nnz); | ||
// resize arrays and update aliases | ||
matrix::CsrBuilder<ValueType, IndexType> builder{m_out}; | ||
builder.get_col_idx_array().resize_and_reset(new_nnz); | ||
builder.get_value_array().resize_and_reset(new_nnz); | ||
auto new_col_idxs = m_out->get_col_idxs(); | ||
auto new_vals = m_out->get_values(); | ||
matrix::CooBuilder<ValueType, IndexType> coo_builder{m_out_coo}; | ||
coo_builder.get_row_idx_array().resize_and_reset(new_nnz); | ||
coo_builder.get_col_idx_array() = | ||
Array<IndexType>::view(exec, new_nnz, new_col_idxs); | ||
coo_builder.get_value_array() = | ||
Array<ValueType>::view(exec, new_nnz, new_vals); | ||
auto new_row_idxs = m_out_coo->get_row_idxs(); | ||
kernel::bucket_filter<subwarp_size><<<num_blocks, default_block_size>>>( | ||
old_row_ptrs, old_col_idxs, as_cuda_type(old_vals), oracles, num_rows, | ||
bucket, new_row_ptrs, new_row_idxs, new_col_idxs, | ||
as_cuda_type(new_vals)); | ||
} | ||
|
||
|
||
GKO_ENABLE_IMPLEMENTATION_SELECTION(select_threshold_filter_approx, | ||
threshold_filter_approx); | ||
|
||
|
||
} // namespace | ||
|
||
|
||
template <typename ValueType, typename IndexType> | ||
void threshold_filter_approx(std::shared_ptr<const CudaExecutor> exec, | ||
const matrix::Csr<ValueType, IndexType> *m, | ||
IndexType rank, Array<ValueType> &tmp, | ||
matrix::Csr<ValueType, IndexType> *m_out, | ||
matrix::Coo<ValueType, IndexType> *m_out_coo) | ||
{ | ||
auto num_rows = m->get_size()[0]; | ||
auto total_nnz = m->get_num_stored_elements(); | ||
auto total_nnz_per_row = total_nnz / num_rows; | ||
select_threshold_filter_approx( | ||
compiled_kernels(), | ||
[&](int compiled_subwarp_size) { | ||
return total_nnz_per_row <= compiled_subwarp_size || | ||
compiled_subwarp_size == config::warp_size; | ||
}, | ||
syn::value_list<int>(), syn::type_list<>(), exec, m, rank, &tmp, m_out, | ||
m_out_coo); | ||
} | ||
|
||
|
||
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE( | ||
GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL); | ||
|
||
|
||
} // namespace par_ilut_factorization | ||
} // namespace cuda | ||
} // namespace kernels | ||
} // namespace gko |
Oops, something went wrong.