diff --git a/master/eigensolver_2reduction__to__band_2impl_8h_source.html b/master/eigensolver_2reduction__to__band_2impl_8h_source.html
index 6ae066a03f..8a2fa30187 100644
--- a/master/eigensolver_2reduction__to__band_2impl_8h_source.html
+++ b/master/eigensolver_2reduction__to__band_2impl_8h_source.html
@@ -394,1161 +394,1170 @@
311 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
- 314 const std::size_t nthreads = getReductionToBandPanelNWorkers();
-
- 316 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
- 317 std::vector<common::internal::vector<T>>{}),
- 318 mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
- 319 ex::when_all_vector(std::move(panel_tiles))) |
- 320 di::continues_on(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
- 321 ex::bulk(nthreads, [nthreads, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr,
- 322 auto& w, auto& taus, auto& tiles) {
- 323 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
- 324 const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
- 325 const std::size_t begin = index * batch_size;
- 326 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
- 327 const SizeType nrefls = taus.size().rows();
-
-
-
-
-
- 333 for (SizeType j = 0; j < nrefls; ++j) {
-
-
- 336 taus({j, 0}) = computeReflector(tiles, j);
-
-
- 339 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
- 342 const SizeType pt_cols = cols - (j + 1);
-
-
- 345 const bool has_head = (index == 0);
-
- 347 w[index] = common::internal::vector<T>(pt_cols, 0);
- 348 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
- 349 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
-
- 353 dlaf::eigensolver::internal::reduceColumnVectors(w);
+ 314 const std::size_t nworkers = [nrtiles = panel_tiles.size()]() {
+ 315 const std::size_t min_workers = 1;
+ 316 const std::size_t available_workers = get_red2band_panel_nworkers();
+ 317 const std::size_t ideal_workers = to_sizet(nrtiles);
+ 318 return std::clamp(ideal_workers, min_workers, available_workers);
+
+
+ 321 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nworkers),
+ 322 std::vector<common::internal::vector<T>>{}),
+ 323 mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
+ 324 ex::when_all_vector(std::move(panel_tiles))) |
+ 325 di::continues_on(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
+ 326 ex::bulk(nworkers, [nworkers, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr,
+ 327 auto& w, auto& taus, auto& tiles) {
+ 328 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
+ 329 const std::size_t batch_size = util::ceilDiv(tiles.size(), nworkers);
+ 330 const std::size_t begin = index * batch_size;
+ 331 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
+ 332 const SizeType nrefls = taus.size().rows();
+
+
+
+
+
+ 338 for (SizeType j = 0; j < nrefls; ++j) {
+
+
+ 341 taus({j, 0}) = computeReflector(tiles, j);
+
+
+ 344 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+ 347 const SizeType pt_cols = cols - (j + 1);
+
+
+ 350 const bool has_head = (index == 0);
+
+ 352 w[index] = common::internal::vector<T>(pt_cols, 0);
+ 353 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
354 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
- 357 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
- 358 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
- 361 ex::start_detached(std::move(s));
-
-
- 364template <Backend B, Device D, class T>
- 365void setupReflectorPanelV(bool has_head, const matrix::SubPanelView& panel_view, const SizeType nrefls,
- 366 matrix::Panel<Coord::Col, T, D>& v, matrix::Matrix<const T, D>& mat_a,
- 367 bool force_copy = false) {
- 368 namespace ex = pika::execution::experimental;
-
- 370 using pika::execution::thread_priority;
- 371 using pika::execution::thread_stacksize;
-
-
-
-
-
-
-
-
- 380 auto it_begin = panel_view.iteratorLocal().begin();
- 381 auto it_end = panel_view.iteratorLocal().end();
-
-
- 384 const LocalTileIndex i = *it_begin;
- 385 matrix::SubTileSpec spec = panel_view(i);
+
+
+ 358 dlaf::eigensolver::internal::reduceColumnVectors(w);
+ 359 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+ 362 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
+ 363 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+
+
+ 368template <Backend B, Device D,
class T>
+ 369void setupReflectorPanelV(
bool has_head,
const matrix::SubPanelView& panel_view,
const SizeType nrefls,
+ 370 matrix::Panel<Coord::Col, T, D>& v, matrix::Matrix<const T, D>& mat_a,
+ 371 bool force_copy =
false) {
+ 372 namespace ex = pika::execution::experimental;
+
+ 374 using pika::execution::thread_priority;
+ 375 using pika::execution::thread_stacksize;
+
+
+
+
+
+
+
+
+ 384 auto it_begin = panel_view.iteratorLocal().begin();
+ 385 auto it_end = panel_view.iteratorLocal().end();
-
-
-
- 390 spec.size = {spec.size.rows(), std::min(nrefls, spec.size.cols())};
-
-
-
-
-
- 396 ex::start_detached(dlaf::internal::whenAllLift(splitTile(mat_a.read(i), spec), v.readwrite(i)) |
-
- 398 ex::start_detached(dlaf::internal::whenAllLift(blas::Uplo::Upper, T(0), T(1), v.readwrite(i)) |
-
-
-
-
-
-
-
- 406 for (auto it = it_begin; it < it_end; ++it) {
- 407 const LocalTileIndex idx = *it;
- 408 const matrix::SubTileSpec& spec = panel_view(idx);
-
-
-
-
-
-
-
-
- 417 ex::start_detached(ex::when_all(matrix::splitTile(mat_a.read(idx), spec), v.readwrite(idx)) |
-
- 419 thread_stacksize::nostack)));
-
- 421 v.setTile(idx, matrix::splitTile(mat_a.read(idx), spec));
-
-
-
- 425template <Backend B, Device D, class T>
- 426void trmmComputeW(matrix::Panel<Coord::Col, T, D>& w, matrix::Panel<Coord::Col, T, D>& v,
- 427 matrix::ReadOnlyTileSender<T, D> tile_t) {
- 428 namespace ex = pika::execution::experimental;
-
- 430 using pika::execution::thread_priority;
- 431 using pika::execution::thread_stacksize;
- 432 using namespace blas;
+
+ 388 const LocalTileIndex i = *it_begin;
+ 389 matrix::SubTileSpec spec = panel_view(i);
+
+
+
+
+ 394 spec.size = {spec.size.rows(), std::min(nrefls, spec.size.cols())};
+
+
+
+
+
+ 400 ex::start_detached(dlaf::internal::whenAllLift(splitTile(mat_a.read(i), spec), v.readwrite(i)) |
+
+ 402 ex::start_detached(dlaf::internal::whenAllLift(blas::Uplo::Upper, T(0), T(1), v.readwrite(i)) |
+
+
+
+
+
+
+
+ 410 for (auto it = it_begin; it < it_end; ++it) {
+ 411 const LocalTileIndex idx = *it;
+ 412 const matrix::SubTileSpec& spec = panel_view(idx);
+
+
+
+
+
+
+
+
+ 421 ex::start_detached(ex::when_all(matrix::splitTile(mat_a.read(idx), spec), v.readwrite(idx)) |
+
+ 423 thread_stacksize::nostack)));
+
+ 425 v.setTile(idx, matrix::splitTile(mat_a.read(idx), spec));
+
+
+
+ 429template <Backend B, Device D, class T>
+ 430void trmmComputeW(matrix::Panel<Coord::Col, T, D>& w, matrix::Panel<Coord::Col, T, D>& v,
+ 431 matrix::ReadOnlyTileSender<T, D> tile_t) {
+ 432 namespace ex = pika::execution::experimental;
- 434 auto it = w.iteratorLocal();
-
- 436 for (const auto& index_i : it) {
- 437 ex::start_detached(dlaf::internal::whenAllLift(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
- 438 T(1), tile_t, v.read(index_i), w.readwrite(index_i)) |
-
- 440 thread_stacksize::nostack)));
-
-
-
- 444 ex::start_detached(std::move(tile_t));
+ 434 using pika::execution::thread_priority;
+ 435 using pika::execution::thread_stacksize;
+ 436 using namespace blas;
+
+ 438 auto it = w.iteratorLocal();
+
+ 440 for (const auto& index_i : it) {
+ 441 ex::start_detached(dlaf::internal::whenAllLift(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
+ 442 T(1), tile_t, v.read(index_i), w.readwrite(index_i)) |
+
+ 444 thread_stacksize::nostack)));
-
-
- 448template <Backend B, Device D, class T>
- 449void gemmUpdateX(matrix::Panel<Coord::Col, T, D>& x, matrix::Matrix<const T, D>& w2,
- 450 matrix::Panel<Coord::Col, const T, D>& v) {
- 451 namespace ex = pika::execution::experimental;
-
- 453 using pika::execution::thread_priority;
- 454 using pika::execution::thread_stacksize;
- 455 using namespace blas;
+
+
+ 448 ex::start_detached(std::move(tile_t));
+
+
+
+ 452template <Backend B, Device D, class T>
+ 453void gemmUpdateX(matrix::Panel<Coord::Col, T, D>& x, matrix::Matrix<const T, D>& w2,
+ 454 matrix::Panel<Coord::Col, const T, D>& v) {
+ 455 namespace ex = pika::execution::experimental;
-
- 458 for (const auto& index_i : v.iteratorLocal())
-
- 460 dlaf::internal::whenAllLift(Op::NoTrans, Op::NoTrans, T(-0.5), v.read(index_i),
- 461 w2.read(LocalTileIndex(0, 0)), T(1), x.readwrite(index_i)) |
- 462 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
-
-
- 465template <Backend B, Device D, class T>
- 466void hemmComputeX(matrix::Panel<Coord::Col, T, D>& x, const matrix::SubMatrixView& view,
- 467 matrix::Matrix<const T, D>& a, matrix::Panel<Coord::Col, const T, D>& w) {
- 468 namespace ex = pika::execution::experimental;
-
- 470 using pika::execution::thread_priority;
-
- 472 const auto dist = a.distribution();
+ 457 using pika::execution::thread_priority;
+ 458 using pika::execution::thread_stacksize;
+ 459 using namespace blas;
+
+
+ 462 for (const auto& index_i : v.iteratorLocal())
+
+ 464 dlaf::internal::whenAllLift(Op::NoTrans, Op::NoTrans, T(-0.5), v.read(index_i),
+ 465 w2.read(LocalTileIndex(0, 0)), T(1), x.readwrite(index_i)) |
+ 466 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
+
+
+ 469template <Backend B, Device D, class T>
+ 470void hemmComputeX(matrix::Panel<Coord::Col, T, D>& x, const matrix::SubMatrixView& view,
+ 471 matrix::Matrix<const T, D>& a, matrix::Panel<Coord::Col, const T, D>& w) {
+ 472 namespace ex = pika::execution::experimental;
-
-
-
-
- 478 matrix::util::set0<B>(thread_priority::high, x);
-
- 480 const LocalTileIndex at_offset = view.begin();
-
- 482 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
- 483 const auto limit = i + 1;
- 484 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
- 485 const LocalTileIndex ij{i, j};
-
- 487 const bool is_diagonal_tile = (ij.row() == ij.col());
-
- 489 const auto& tile_a = splitTile(a.read(ij), view(ij));
+ 474 using pika::execution::thread_priority;
+
+ 476 const auto dist = a.distribution();
+
+
+
+
+
+ 482 matrix::util::set0<B>(thread_priority::high, x);
+
+ 484 const LocalTileIndex at_offset = view.begin();
+
+ 486 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
+ 487 const auto limit = i + 1;
+ 488 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
+ 489 const LocalTileIndex ij{i, j};
- 491 if (is_diagonal_tile) {
- 492 hemmDiag<B>(thread_priority::high, tile_a, w.read(ij), x.readwrite(ij));
-
-
-
-
-
-
-
-
-
- 502 const LocalTileIndex index_x(Coord::Row, ij.row());
- 503 const LocalTileIndex index_w(Coord::Row, ij.col());
- 504 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, w.read(index_w),
- 505 x.readwrite(index_x));
-
-
-
- 509 const LocalTileIndex index_pretended = transposed(ij);
- 510 const LocalTileIndex index_x(Coord::Row, index_pretended.row());
- 511 const LocalTileIndex index_w(Coord::Row, index_pretended.col());
- 512 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, tile_a, w.read(index_w),
- 513 x.readwrite(index_x));
-
-
-
-
-
-
- 520template <Backend B, Device D, class T>
- 521void gemmComputeW2(matrix::Matrix<T, D>& w2, matrix::Panel<Coord::Col, const T, D>& w,
- 522 matrix::Panel<Coord::Col, const T, D>& x) {
- 523 using pika::execution::thread_priority;
- 524 using pika::execution::thread_stacksize;
-
- 526 namespace ex = pika::execution::experimental;
-
-
-
-
-
- 532 ex::start_detached(w2.readwrite(LocalTileIndex(0, 0)) |
-
- 534 thread_stacksize::nostack)));
-
- 536 using namespace blas;
-
- 538 for (const auto& index_tile : w.iteratorLocal())
-
- 540 dlaf::internal::whenAllLift(Op::ConjTrans, Op::NoTrans, T(1), w.read(index_tile),
- 541 x.read(index_tile), T(1), w2.readwrite(LocalTileIndex(0, 0))) |
- 542 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
-
-
- 545template <Backend B, Device D, class T>
- 546void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, matrix::Matrix<T, D>& a,
- 547 matrix::Panel<Coord::Col, const T, D>& x,
- 548 matrix::Panel<Coord::Col, const T, D>& v) {
- 549 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
-
- 551 using pika::execution::thread_priority;
-
- 553 const auto dist = a.distribution();
+ 491 const bool is_diagonal_tile = (ij.row() == ij.col());
+
+ 493 const auto& tile_a = splitTile(a.read(ij), view(ij));
+
+ 495 if (is_diagonal_tile) {
+ 496 hemmDiag<B>(thread_priority::high, tile_a, w.read(ij), x.readwrite(ij));
+
+
+
+
+
+
+
+
+
+ 506 const LocalTileIndex index_x(Coord::Row, ij.row());
+ 507 const LocalTileIndex index_w(Coord::Row, ij.col());
+ 508 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, w.read(index_w),
+ 509 x.readwrite(index_x));
+
+
+
+ 513 const LocalTileIndex index_pretended = transposed(ij);
+ 514 const LocalTileIndex index_x(Coord::Row, index_pretended.row());
+ 515 const LocalTileIndex index_w(Coord::Row, index_pretended.col());
+ 516 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, tile_a, w.read(index_w),
+ 517 x.readwrite(index_x));
+
+
+
+
+
+
+ 524template <Backend B, Device D, class T>
+ 525void gemmComputeW2(matrix::Matrix<T, D>& w2, matrix::Panel<Coord::Col, const T, D>& w,
+ 526 matrix::Panel<Coord::Col, const T, D>& x) {
+ 527 using pika::execution::thread_priority;
+ 528 using pika::execution::thread_stacksize;
+
+ 530 namespace ex = pika::execution::experimental;
+
+
+
+
+
+ 536 ex::start_detached(w2.readwrite(LocalTileIndex(0, 0)) |
+
+ 538 thread_stacksize::nostack)));
+
+ 540 using namespace blas;
+
+ 542 for (const auto& index_tile : w.iteratorLocal())
+
+ 544 dlaf::internal::whenAllLift(Op::ConjTrans, Op::NoTrans, T(1), w.read(index_tile),
+ 545 x.read(index_tile), T(1), w2.readwrite(LocalTileIndex(0, 0))) |
+ 546 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
+
+
+ 549template <Backend B, Device D, class T>
+ 550void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, matrix::Matrix<T, D>& a,
+ 551 matrix::Panel<Coord::Col, const T, D>& x,
+ 552 matrix::Panel<Coord::Col, const T, D>& v) {
+ 553 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
- 555 const LocalTileIndex at_start = view.begin();
+ 555 using pika::execution::thread_priority;
- 557 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
- 558 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
- 559 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
- 560 for (SizeType j = at_start.col(); j < limit; ++j) {
- 561 const LocalTileIndex ij_local{i, j};
- 562 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
-
- 564 const bool is_diagonal_tile = (ij.row() == ij.col());
-
- 566 auto getSubA = [&a, &view, ij_local]() {
- 567 return splitTile(a.readwrite(ij_local), view(ij_local));
-
+ 557 const auto dist = a.distribution();
+
+ 559 const LocalTileIndex at_start = view.begin();
+
+ 561 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
+ 562 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
+ 563 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
+ 564 for (SizeType j = at_start.col(); j < limit; ++j) {
+ 565 const LocalTileIndex ij_local{i, j};
+ 566 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
+ 568 const bool is_diagonal_tile = (ij.row() == ij.col());
-
-
- 572 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
+ 570 auto getSubA = [&a, &view, ij_local]() {
+ 571 return splitTile(a.readwrite(ij_local), view(ij_local));
+
- 574 if (is_diagonal_tile) {
- 575 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
-
-
-
- 579 her2kOffDiag<B>(priority, x.read(ij_local), v.read(transposed(ij_local)), getSubA());
-
-
- 582 her2kOffDiag<B>(priority, v.read(ij_local), x.read(transposed(ij_local)), getSubA());
-
-
-
-
-
-
-
- 590namespace distributed {
- 591template <Device D, class T>
- 592T computeReflector(const bool has_head, comm::Communicator& communicator,
- 593 const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
- 594 std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
-
-
-
-
-
-
-
-
+
+
+ 576 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
+
+ 578 if (is_diagonal_tile) {
+ 579 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
+
+
+
+ 583 her2kOffDiag<B>(priority, x.read(ij_local), v.read(transposed(ij_local)), getSubA());
+
+
+ 586 her2kOffDiag<B>(priority, v.read(ij_local), x.read(transposed(ij_local)), getSubA());
+
+
+
+
+
+
+
+ 594namespace distributed {
+ 595template <Device D, class T>
+ 596T computeReflector(const bool has_head, comm::Communicator& communicator,
+ 597 const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
+ 598 std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
+
+
+
+
-
-
-
-
- 608 comm::sync::allReduceInPlace(communicator, MPI_SUM,
- 609 common::make_data(x0_and_squares.data(),
-
-
- 612 auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
-
-
-
-
- 617template <class MatrixLikeA, class MatrixLikeTaus, class TriggerSender, class CommSender>
- 618void computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
- 619 CommSender&& mpi_col_chain_panel, MatrixLikeA& mat_a,
- 620 MatrixLikeTaus& mat_taus, SizeType j_sub,
- 621 const matrix::SubPanelView& panel_view) {
- 622 static Device constexpr D = MatrixLikeA::device;
- 623 using T = typename MatrixLikeA::ElementType;
- 624 namespace ex = pika::execution::experimental;
- 625 namespace di = dlaf::internal;
-
- 627 std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
- 628 panel_tiles.reserve(to_sizet(std::distance(panel_view.iteratorLocal().begin(),
- 629 panel_view.iteratorLocal().end())));
- 630 for (const auto& i : panel_view.iteratorLocal()) {
- 631 const matrix::SubTileSpec& spec = panel_view(i);
- 632 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
-
-
- 635 const std::size_t nthreads = getReductionToBandPanelNWorkers();
-
- 637 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
- 638 std::vector<common::internal::vector<T>>{}),
- 639 mat_taus.readwrite(GlobalTileIndex(j_sub, 0)),
- 640 ex::when_all_vector(std::move(panel_tiles)),
- 641 std::forward<CommSender>(mpi_col_chain_panel), std::forward<TriggerSender>(trigger)) |
- 642 di::continues_on(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
- 643 ex::bulk(nthreads, [nthreads, rank_v0,
- 644 cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr, auto& w,
- 645 auto& taus, auto& tiles, auto&& pcomm) {
- 646 const bool rankHasHead = rank_v0 == pcomm.get().rank();
-
- 648 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
- 649 const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
- 650 const std::size_t begin = index * batch_size;
- 651 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
- 652 const SizeType nrefls = taus.size().rows();
-
-
-
-
+
+
+
+
+
+
+
+
+ 612 comm::sync::allReduceInPlace(communicator, MPI_SUM,
+ 613 common::make_data(x0_and_squares.data(),
+
+
+ 616 auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
+
+
+
+
+ 621template <class MatrixLikeA, class MatrixLikeTaus, class TriggerSender, class CommSender>
+ 622void computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
+ 623 CommSender&& mpi_col_chain_panel, MatrixLikeA& mat_a,
+ 624 MatrixLikeTaus& mat_taus, SizeType j_sub,
+ 625 const matrix::SubPanelView& panel_view) {
+ 626 static Device constexpr D = MatrixLikeA::device;
+ 627 using T = typename MatrixLikeA::ElementType;
+ 628 namespace ex = pika::execution::experimental;
+ 629 namespace di = dlaf::internal;
+
+ 631 std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
+ 632 panel_tiles.reserve(to_sizet(std::distance(panel_view.iteratorLocal().begin(),
+ 633 panel_view.iteratorLocal().end())));
+ 634 for (const auto& i : panel_view.iteratorLocal()) {
+ 635 const matrix::SubTileSpec& spec = panel_view(i);
+ 636 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
+
+
+ 639 const std::size_t nworkers = [nrtiles = panel_tiles.size()]() {
+ 640 const std::size_t min_workers = 1;
+ 641 const std::size_t available_workers = get_red2band_panel_nworkers();
+ 642 const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(2));
+ 643 return std::clamp(ideal_workers, min_workers, available_workers);
+
+
+
+ 647 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nworkers),
+ 648 std::vector<common::internal::vector<T>>{}),
+ 649 mat_taus.readwrite(GlobalTileIndex(j_sub, 0)),
+ 650 ex::when_all_vector(std::move(panel_tiles)),
+ 651 std::forward<CommSender>(mpi_col_chain_panel), std::forward<TriggerSender>(trigger)) |
+ 652 di::continues_on(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
+ 653 ex::bulk(nworkers, [nworkers, rank_v0,
+ 654 cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr, auto& w,
+ 655 auto& taus, auto& tiles, auto&& pcomm) {
+ 656 const bool rankHasHead = rank_v0 == pcomm.get().rank();
- 658 for (SizeType j = 0; j < nrefls; ++j) {
-
-
- 661 const bool has_head = rankHasHead;
- 662 taus({j, 0}) = computeReflector(has_head, pcomm.get(), tiles, j);
-
- 664 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
- 667 const SizeType pt_cols = cols - (j + 1);
-
-
-
- 671 const bool has_head = rankHasHead && (index == 0);
-
- 673 w[index] = common::internal::vector<T>(pt_cols, 0);
- 674 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
- 675 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
-
- 679 dlaf::eigensolver::internal::reduceColumnVectors(w);
- 680 comm::sync::allReduceInPlace(pcomm.get(), MPI_SUM, common::make_data(w[0].data(), pt_cols));
-
- 682 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
- 685 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
- 686 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
-
- 689 ex::start_detached(std::move(s));
-
-
- 692template <Backend B, Device D, class T>
- 693void hemmComputeX(comm::IndexT_MPI reducer_col, matrix::Panel<Coord::Col, T, D>& x,
- 694 matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& xt,
- 695 const matrix::SubMatrixView& view, matrix::Matrix<const T, D>& a,
- 696 matrix::Panel<Coord::Col, const T, D>& w,
- 697 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& wt,
- 698 comm::CommunicatorPipeline<comm::CommunicatorType::Row>& mpi_row_chain,
- 699 comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_chain) {
- 700 namespace ex = pika::execution::experimental;
-
- 702 using pika::execution::thread_priority;
-
- 704 const auto dist = a.distribution();
- 705 const auto rank = dist.rankIndex();
-
-
-
-
-
- 711 matrix::util::set0<B>(thread_priority::high, x);
- 712 matrix::util::set0<B>(thread_priority::high, xt);
-
- 714 const LocalTileIndex at_offset = view.begin();
+ 658 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
+ 659 const std::size_t batch_size = util::ceilDiv(tiles.size(), nworkers);
+ 660 const std::size_t begin = index * batch_size;
+ 661 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
+ 662 const SizeType nrefls = taus.size().rows();
+
+
+
+
+
+ 668 for (SizeType j = 0; j < nrefls; ++j) {
+
+
+ 671 const bool has_head = rankHasHead;
+ 672 taus({j, 0}) = computeReflector(has_head, pcomm.get(), tiles, j);
+
+ 674 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+ 677 const SizeType pt_cols = cols - (j + 1);
+
+
+
+ 681 const bool has_head = rankHasHead && (index == 0);
+
+ 683 w[index] = common::internal::vector<T>(pt_cols, 0);
+ 684 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
+ 685 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+
+ 689 dlaf::eigensolver::internal::reduceColumnVectors(w);
+ 690 comm::sync::allReduceInPlace(pcomm.get(), MPI_SUM, common::make_data(w[0].data(), pt_cols));
+
+ 692 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+ 695 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
+ 696 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
+
+
+
+ 701template <Backend B, Device D, class T>
+ 702void hemmComputeX(comm::IndexT_MPI reducer_col, matrix::Panel<Coord::Col, T, D>& x,
+ 703 matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& xt,
+ 704 const matrix::SubMatrixView& view, matrix::Matrix<const T, D>& a,
+ 705 matrix::Panel<Coord::Col, const T, D>& w,
+ 706 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& wt,
+ 707 comm::CommunicatorPipeline<comm::CommunicatorType::Row>& mpi_row_chain,
+ 708 comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_chain) {
+ 709 namespace ex = pika::execution::experimental;
+
+ 711 using pika::execution::thread_priority;
+
+ 713 const auto dist = a.distribution();
+ 714 const auto rank = dist.rankIndex();
- 716 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
- 717 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
- 718 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
- 719 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
- 720 const LocalTileIndex ij_local{i, j};
- 721 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
+
+
+
+ 720 matrix::util::set0<B>(thread_priority::high, x);
+ 721 matrix::util::set0<B>(thread_priority::high, xt);
- 723 const bool is_diagonal_tile = (ij.row() == ij.col());
+ 723 const LocalTileIndex at_offset = view.begin();
- 725 auto tile_a = splitTile(a.read(ij), view(ij_local));
-
- 727 if (is_diagonal_tile) {
- 728 hemmDiag<B>(thread_priority::high, std::move(tile_a), w.read(ij_local), x.readwrite(ij_local));
-
-
-
-
-
-
-
-
- 737 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, wt.read(ij_local),
- 738 x.readwrite(ij_local));
-
+ 725 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
+ 726 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
+ 727 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
+ 728 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
+ 729 const LocalTileIndex ij_local{i, j};
+ 730 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
+ 732 const bool is_diagonal_tile = (ij.row() == ij.col());
+
+ 734 auto tile_a = splitTile(a.read(ij), view(ij_local));
+
+ 736 if (is_diagonal_tile) {
+ 737 hemmDiag<B>(thread_priority::high, std::move(tile_a), w.read(ij_local), x.readwrite(ij_local));
+
+
-
-
-
-
-
-
-
- 748 const auto owner = dist.template rankGlobalTile<Coord::Row>(ij.col());
-
- 750 const LocalTileIndex index_x{dist.template localTileFromGlobalTile<Coord::Row>(ij.col()), 0};
- 751 const LocalTileIndex index_xt{0, ij_local.col()};
-
- 753 auto tile_x = (dist.rankIndex().row() == owner) ? x.readwrite(index_x) : xt.readwrite(index_xt);
-
- 755 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, std::move(tile_a), w.read(ij_local),
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- 771 if (mpi_col_chain.size() > 1) {
- 772 for (const auto& index_xt : xt.iteratorLocal()) {
- 773 const auto index_k = dist.template globalTileFromLocalTile<Coord::Col>(index_xt.col());
- 774 const auto rank_owner_row = dist.template rankGlobalTile<Coord::Row>(index_k);
-
- 776 if (rank_owner_row == rank.row()) {
-
-
-
-
-
-
- 783 const auto i = dist.template localTileFromGlobalTile<Coord::Row>(index_k);
- 784 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_col_chain.exclusive(), MPI_SUM,
- 785 x.readwrite({i, 0})));
-
-
- 788 ex::start_detached(comm::schedule_reduce_send(mpi_col_chain.exclusive(), rank_owner_row, MPI_SUM,
-
-
-
-
-
-
-
-
-
- 798 if (mpi_row_chain.size() > 1) {
- 799 for (const auto& index_x : x.iteratorLocal()) {
- 800 if (reducer_col == rank.col())
- 801 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_row_chain.exclusive(), MPI_SUM,
- 802 x.readwrite(index_x)));
-
- 804 ex::start_detached(comm::schedule_reduce_send(mpi_row_chain.exclusive(), reducer_col, MPI_SUM,
-
-
-
-
-
- 810template <Backend B, Device D, class T>
- 811void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, Matrix<T, D>& a,
- 812 matrix::Panel<Coord::Col, const T, D>& x,
- 813 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& vt,
- 814 matrix::Panel<Coord::Col, const T, D>& v,
- 815 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& xt) {
- 816 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
-
- 818 using pika::execution::thread_priority;
-
- 820 const auto dist = a.distribution();
-
- 822 const LocalTileIndex at_start = view.begin();
-
- 824 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
- 825 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
- 826 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
- 827 for (SizeType j = at_start.col(); j < limit; ++j) {
- 828 const LocalTileIndex ij_local{i, j};
- 829 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
+
+
+
+
+ 746 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, wt.read(ij_local),
+ 747 x.readwrite(ij_local));
+
+
+
+
+
+
+
+
+
+ 757 const auto owner = dist.template rankGlobalTile<Coord::Row>(ij.col());
+
+ 759 const LocalTileIndex index_x{dist.template localTileFromGlobalTile<Coord::Row>(ij.col()), 0};
+ 760 const LocalTileIndex index_xt{0, ij_local.col()};
+
+ 762 auto tile_x = (dist.rankIndex().row() == owner) ? x.readwrite(index_x) : xt.readwrite(index_xt);
+
+ 764 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, std::move(tile_a), w.read(ij_local),
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 780 if (mpi_col_chain.size() > 1) {
+ 781 for (const auto& index_xt : xt.iteratorLocal()) {
+ 782 const auto index_k = dist.template globalTileFromLocalTile<Coord::Col>(index_xt.col());
+ 783 const auto rank_owner_row = dist.template rankGlobalTile<Coord::Row>(index_k);
+
+ 785 if (rank_owner_row == rank.row()) {
+
+
+
+
+
+
+ 792 const auto i = dist.template localTileFromGlobalTile<Coord::Row>(index_k);
+ 793 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_col_chain.exclusive(), MPI_SUM,
+ 794 x.readwrite({i, 0})));
+
+
+ 797 ex::start_detached(comm::schedule_reduce_send(mpi_col_chain.exclusive(), rank_owner_row, MPI_SUM,
+
+
+
+
+
+
+
+
+
+ 807 if (mpi_row_chain.size() > 1) {
+ 808 for (const auto& index_x : x.iteratorLocal()) {
+ 809 if (reducer_col == rank.col())
+ 810 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_row_chain.exclusive(), MPI_SUM,
+ 811 x.readwrite(index_x)));
+
+ 813 ex::start_detached(comm::schedule_reduce_send(mpi_row_chain.exclusive(), reducer_col, MPI_SUM,
+
+
+
+
+
+ 819template <Backend B, Device D, class T>
+ 820void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, Matrix<T, D>& a,
+ 821 matrix::Panel<Coord::Col, const T, D>& x,
+ 822 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& vt,
+ 823 matrix::Panel<Coord::Col, const T, D>& v,
+ 824 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& xt) {
+ 825 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
+
+ 827 using pika::execution::thread_priority;
+
+ 829 const auto dist = a.distribution();
- 831 const bool is_diagonal_tile = (ij.row() == ij.col());
+ 831 const LocalTileIndex at_start = view.begin();
- 833 auto getSubA = [&a, &view, ij_local]() {
- 834 return splitTile(a.readwrite(ij_local), view(ij_local));
-
-
-
-
- 839 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
-
- 841 if (is_diagonal_tile) {
- 842 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
-
-
-
- 846 her2kOffDiag<B>(priority, x.read(ij_local), vt.read(ij_local), getSubA());
-
-
- 849 her2kOffDiag<B>(priority, v.read(ij_local), xt.read(ij_local), getSubA());
-
-
-
-
-
-
- 856template <Backend B, Device D, class T>
-
-
-
-
-
-
-
-
863 void call(Matrix<T, Device::CPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
-
-
865 using red2band::local::computePanelReflectors;
-
866 computePanelReflectors(mat_a, mat_taus, j_sub, panel_view);
-
-
-
869 template <Device D, class CommSender, class TriggerSender>
-
870 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
-
871 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
-
-
873 using red2band::distributed::computePanelReflectors;
-
874 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
-
875 std::forward<CommSender>(mpi_col_chain_panel), mat_a, mat_taus, j_sub,
-
-
-
+
833 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
+
834 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
+
835 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
+
836 for (SizeType j = at_start.col(); j < limit; ++j) {
+
837 const LocalTileIndex ij_local{i, j};
+
838 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
+
840 const bool is_diagonal_tile = (ij.row() == ij.col());
+
+
842 auto getSubA = [&a, &view, ij_local]() {
+
843 return splitTile(a.readwrite(ij_local), view(ij_local));
+
+
+
+
+
848 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
+
+
850 if (is_diagonal_tile) {
+
851 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
+
+
+
+
855 her2kOffDiag<B>(priority, x.read(ij_local), vt.read(ij_local), getSubA());
+
+
+
858 her2kOffDiag<B>(priority, v.read(ij_local), xt.read(ij_local), getSubA());
+
+
+
+
+
+
+
865template <Backend B, Device D,
class T>
+
+
+
+
+
+
+
+
872 void call(Matrix<T, Device::CPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
+
+
874 using red2band::local::computePanelReflectors;
+
875 computePanelReflectors(mat_a, mat_taus, j_sub, panel_view);
+
+
+
878 template <Device D, class CommSender, class TriggerSender>
+
879 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
+
880 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
+
+
882 using red2band::distributed::computePanelReflectors;
+
883 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
+
884 std::forward<CommSender>(mpi_col_chain_panel), mat_a, mat_taus, j_sub,
+
+
+
-
-
-
-
-
-
-
884 : panels_v(n_workspaces, dist_a) {}
-
-
886 void call(Matrix<T, Device::GPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
-
-
888 using red2band::local::computePanelReflectors;
-
-
890 namespace ex = pika::execution::experimental;
-
-
-
-
-
-
-
897 auto& v = panels_v.nextResource();
+
+
+
+
+
+
+
893 : panels_v(n_workspaces, dist_a) {}
+
+
895 void call(Matrix<T, Device::GPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
+
+
897 using red2band::local::computePanelReflectors;
-
899 copyToCPU(panel_view, mat_a, v);
-
900 computePanelReflectors(v, mat_taus, j_sub, panel_view);
-
901 copyFromCPU(panel_view, v, mat_a);
-
-
-
904 template <Device D, class CommSender, class TriggerSender>
-
905 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
-
906 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, SizeType j_sub,
-
-
908 auto& v = panels_v.nextResource();
-
-
-
911 copyToCPU(panel_view, mat_a, v);
+
899 namespace ex = pika::execution::experimental;
+
+
+
+
+
+
+
906 auto& v = panels_v.nextResource();
+
+
908 copyToCPU(panel_view, mat_a, v);
+
909 computePanelReflectors(v, mat_taus, j_sub, panel_view);
+
910 copyFromCPU(panel_view, v, mat_a);
+
-
-
914 using dlaf::eigensolver::internal::red2band::distributed::computePanelReflectors;
-
915 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
-
916 std::forward<CommSender>(mpi_col_chain_panel), v, mat_taus, j_sub,
-
+
913 template <Device D, class CommSender, class TriggerSender>
+
914 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
+
915 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, SizeType j_sub,
+
+
917 auto& v = panels_v.nextResource();
-
-
920 copyFromCPU(panel_view, v, mat_a);
-
-
-
-
-
-
-
-
928 namespace ex = pika::execution::experimental;
-
-
-
931 using dlaf::matrix::internal::CopyBackend_v;
-
932 using pika::execution::thread_priority;
-
933 using pika::execution::thread_stacksize;
+
+
920 copyToCPU(panel_view, mat_a, v);
+
+
+
923 using dlaf::eigensolver::internal::red2band::distributed::computePanelReflectors;
+
924 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
+
925 std::forward<CommSender>(mpi_col_chain_panel), v, mat_taus, j_sub,
+
+
+
+
929 copyFromCPU(panel_view, v, mat_a);
+
+
+
+
-
-
936 auto spec = panel_view(i);
-
-
-
939 ex::when_all(splitTile(mat_a.read(i), spec), splitTile(std::move(tmp_tile), spec)) |
-
940 matrix::copy(Policy<CopyBackend_v<Device::GPU, Device::CPU>>(thread_priority::high,
-
941 thread_stacksize::nostack)));
-
-
-
-
-
-
947 namespace ex = pika::execution::experimental;
-
-
-
950 using dlaf::matrix::internal::CopyBackend_v;
-
951 using pika::execution::thread_priority;
-
952 using pika::execution::thread_stacksize;
+
+
+
937 namespace ex = pika::execution::experimental;
+
+
+
940 using dlaf::matrix::internal::CopyBackend_v;
+
941 using pika::execution::thread_priority;
+
942 using pika::execution::thread_stacksize;
+
+
+
945 auto spec = panel_view(i);
+
+
+
948 ex::when_all(splitTile(mat_a.read(i), spec), splitTile(std::move(tmp_tile), spec)) |
+
949 matrix::copy(Policy<CopyBackend_v<Device::GPU, Device::CPU>>(thread_priority::high,
+
950 thread_stacksize::nostack)));
+
+
-
-
955 auto spec = panel_view(i);
-
-
957 ex::start_detached(ex::when_all(splitTile(v.read(i), spec), splitTile(std::move(tile_a), spec)) |
-
958 matrix::copy(Policy<CopyBackend_v<Device::CPU, Device::GPU>>(
-
959 thread_priority::high, thread_stacksize::nostack)));
-
-
-
+
+
+
956 namespace ex = pika::execution::experimental;
+
+
+
959 using dlaf::matrix::internal::CopyBackend_v;
+
960 using pika::execution::thread_priority;
+
961 using pika::execution::thread_stacksize;
+
+
+
964 auto spec = panel_view(i);
+
+
966 ex::start_detached(ex::when_all(splitTile(v.read(i), spec), splitTile(std::move(tile_a), spec)) |
+
967 matrix::copy(Policy<CopyBackend_v<Device::CPU, Device::GPU>>(
+
968 thread_priority::high, thread_stacksize::nostack)));
+
+
+
-
-
-
-
-
-
968template <Backend B, Device D, class T>
-
-
-
-
-
973 using namespace red2band::local;
-
-
975 using common::iterate_range2d;
-
976 using factorization::internal::computeTFactor;
-
-
978 using pika::execution::experimental::any_sender;
-
-
980 const auto dist_a = mat_a.distribution();
-
-
982 {dist_a.blockSize().rows(), band_size});
+
+
+
+
+
+
977template <Backend B, Device D, class T>
+
+
+
+
+
982 using namespace red2band::local;
-
-
-
986 const SizeType nrefls = std::max<SizeType>(0, dist_a.size().rows() - band_size - 1);
-
-
-
989 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
-
-
-
-
-
-
-
-
-
-
999 Matrix<T, Device::CPU> mat_taus_retiled =
-
1000 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
-
-
1002 const SizeType ntiles = (nrefls - 1) / band_size + 1;
-
1003 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
+
984 using common::iterate_range2d;
+
985 using factorization::internal::computeTFactor;
+
+
987 using pika::execution::experimental::any_sender;
+
+
989 const auto dist_a = mat_a.distribution();
+
+
991 {dist_a.blockSize().rows(), band_size});
+
+
+
+
995 const SizeType nrefls = std::max<SizeType>(0, dist_a.size().rows() - band_size - 1);
+
+
+
998 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
+
+
+
+
+
-
1005 const bool is_full_band = (band_size == dist_a.blockSize().cols());
-
-
1007 constexpr std::size_t n_workspaces = 2;
-
-
-
-
-
-
-
-
-
-
-
-
1019 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist_a);
+
+
+
+
1008 Matrix<T, Device::CPU> mat_taus_retiled =
+
1009 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
+
+
1011 const SizeType ntiles = (nrefls - 1) / band_size + 1;
+
1012 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
+
+
1014 const bool is_full_band = (band_size == dist_a.blockSize().cols());
+
+
1016 constexpr std::size_t n_workspaces = 2;
+
+
+
-
1021 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
-
1022 const auto i_sub = j_sub + 1;
-
-
-
-
1026 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
-
-
1028 const bool isPanelIncomplete = (nrefls_tile != band_size);
+
+
+
+
+
+
+
+
1028 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist_a);
-
-
1031 DLAF_ASSERT_HEAVY(nrefls_tile != 0, nrefls_tile);
+
1030 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
+
1031 const auto i_sub = j_sub + 1;
-
-
-
+
+
+
1035 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
-
1037 Panel<Coord::Col, T, D>& v = panels_v.nextResource();
-
1038 v.setRangeStart(ij_offset);
-
1039 if (isPanelIncomplete)
-
1040 v.setWidth(nrefls_tile);
+
1037 const bool isPanelIncomplete = (nrefls_tile != band_size);
+
+
+
1040 DLAF_ASSERT_HEAVY(nrefls_tile != 0, nrefls_tile);
-
-
1043 compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view);
-
-
-
-
-
-
1049 constexpr bool has_reflector_head = true;
-
1050 setupReflectorPanelV<B, D, T>(has_reflector_head, panel_view, nrefls_tile, v, mat_a, !is_full_band);
-
-
-
-
-
1055 Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
-
-
1057 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx));
-
-
-
1060 const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size));
-
-
-
1063 if (!at_offset.isIn(mat_a.size()))
-
+
+
+
+
+
1046 Panel<Coord::Col, T, D>& v = panels_v.nextResource();
+
1047 v.setRangeStart(ij_offset);
+
1048 if (isPanelIncomplete)
+
1049 v.setWidth(nrefls_tile);
+
+
+
1052 compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view);
+
+
+
+
+
+
1058 constexpr bool has_reflector_head = true;
+
1059 setupReflectorPanelV<B, D, T>(has_reflector_head, panel_view, nrefls_tile, v, mat_a, !is_full_band);
+
+
+
+
+
1064 Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
-
1066 const matrix::SubMatrixView trailing_matrix_view(dist_a, at_offset);
+
1066 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx));
-
-
1069 Panel<Coord::Col, T, D>& w = panels_w.nextResource();
-
1070 w.setRangeStart(at_offset);
-
1071 if (isPanelIncomplete)
-
1072 w.setWidth(nrefls_tile);
-
-
1074 trmmComputeW<B>(w, v, t.read(t_idx));
-
-
-
1077 Panel<Coord::Col, T, D>& x = panels_x.nextResource();
-
1078 x.setRangeStart(at_offset);
-
1079 if (isPanelIncomplete)
-
1080 x.setWidth(nrefls_tile);
-
-
-
-
-
-
1086 hemmComputeX<B>(x, trailing_matrix_view, mat_a, w);
-
-
-
-
-
-
-
-
1094 Matrix<T, D> w2 = std::move(t);
-
-
1096 gemmComputeW2<B>(w2, w, x);
-
1097 gemmUpdateX<B>(x, w2, v);
-
-
+
+
1069 const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size));
+
+
+
1072 if (!at_offset.isIn(mat_a.size()))
+
+
+
1075 const matrix::SubMatrixView trailing_matrix_view(dist_a, at_offset);
+
+
+
1078 Panel<Coord::Col, T, D>& w = panels_w.nextResource();
+
1079 w.setRangeStart(at_offset);
+
1080 if (isPanelIncomplete)
+
1081 w.setWidth(nrefls_tile);
+
+
1083 trmmComputeW<B>(w, v, t.read(t_idx));
+
+
+
1086 Panel<Coord::Col, T, D>& x = panels_x.nextResource();
+
1087 x.setRangeStart(at_offset);
+
1088 if (isPanelIncomplete)
+
1089 x.setWidth(nrefls_tile);
+
+
+
+
+
+
1095 hemmComputeX<B>(x, trailing_matrix_view, mat_a, w);
+
+
+
+
-
-
1102 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, v);
-
-
-
-
-
-
-
-
-
-
-
1113template <Backend B, Device D, class T>
-
1114Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& grid, Matrix<T, D>& mat_a,
-
1115 const SizeType band_size) {
-
1116 using namespace red2band::distributed;
+
+
+
1103 Matrix<T, D> w2 = std::move(t);
+
+
1105 gemmComputeW2<B>(w2, w, x);
+
1106 gemmUpdateX<B>(x, w2, v);
+
+
+
+
+
1111 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, v);
+
+
+
+
+
-
1118 using common::iterate_range2d;
-
1119 using factorization::internal::computeTFactor;
+
+
-
1121 namespace ex = pika::execution::experimental;
-
-
-
-
-
-
-
-
-
-
-
-
1133 DLAF_ASSERT(grid.num_pipelines() >= 2, grid.num_pipelines());
-
1134 auto mpi_row_chain = grid.row_communicator_pipeline();
-
1135 auto mpi_col_chain = grid.col_communicator_pipeline();
-
1136 auto mpi_col_chain_panel = grid.col_communicator_pipeline();
-
-
1138#ifdef DLAF_WITH_HDF5
-
1139 static std::atomic<size_t> num_reduction_to_band_calls = 0;
-
1140 std::stringstream fname;
-
1141 fname << "reduction_to_band-" << matrix::internal::TypeToString_v<T> << "-"
-
1142 << std::to_string(num_reduction_to_band_calls) << ".h5";
-
1143 std::optional<matrix::internal::FileHDF5> file;
-
-
1145 if (getTuneParameters().debug_dump_reduction_to_band_data) {
-
1146 file = matrix::internal::FileHDF5(grid.fullCommunicator(), fname.str());
-
1147 file->write(mat_a, "/input");
-
-
-
-
1151 const auto& dist = mat_a.distribution();
-
1152 const comm::Index2D rank = dist.rankIndex();
+
+
1122template <Backend B, Device D, class T>
+
1123Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& grid, Matrix<T, D>& mat_a,
+
1124 const SizeType band_size) {
+
1125 using namespace red2band::distributed;
+
+
1127 using common::iterate_range2d;
+
1128 using factorization::internal::computeTFactor;
+
+
1130 namespace ex = pika::execution::experimental;
+
+
+
+
+
+
+
+
+
+
+
+
1142 DLAF_ASSERT(grid.num_pipelines() >= 2, grid.num_pipelines());
+
1143 auto mpi_row_chain = grid.row_communicator_pipeline();
+
1144 auto mpi_col_chain = grid.col_communicator_pipeline();
+
1145 auto mpi_col_chain_panel = grid.col_communicator_pipeline();
+
+
1147#ifdef DLAF_WITH_HDF5
+
1148 static std::atomic<size_t> num_reduction_to_band_calls = 0;
+
1149 std::stringstream fname;
+
1150 fname << "reduction_to_band-" << matrix::internal::TypeToString_v<T> << "-"
+
1151 << std::to_string(num_reduction_to_band_calls) << ".h5";
+
1152 std::optional<matrix::internal::FileHDF5> file;
-
-
-
1156 const SizeType nrefls = std::max<SizeType>(0, dist.size().rows() - band_size - 1);
-
-
-
1159 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
-
1160 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
-
1161 TileElementSize(mat_a.blockSize().cols(), 1),
-
1162 comm::Size2D(mat_a.commGridSize().cols(), 1),
-
1163 comm::Index2D(mat_a.rankIndex().col(), 0),
-
1164 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
-
-
-
1167#ifdef DLAF_WITH_HDF5
-
1168 if (getTuneParameters().debug_dump_reduction_to_band_data) {
-
1169 file->write(mat_a, "/band");
-
-
-
1172 num_reduction_to_band_calls++;
-
+
1154 if (getTuneParameters().debug_dump_reduction_to_band_data) {
+
1155 file = matrix::internal::FileHDF5(grid.fullCommunicator(), fname.str());
+
1156 file->write(mat_a, "/input");
+
+
+
+
1160 const auto& dist = mat_a.distribution();
+
1161 const comm::Index2D rank = dist.rankIndex();
+
+
+
+
1165 const SizeType nrefls = std::max<SizeType>(0, dist.size().rows() - band_size - 1);
+
+
+
1168 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
+
1169 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
+
1170 TileElementSize(mat_a.blockSize().cols(), 1),
+
1171 comm::Size2D(mat_a.commGridSize().cols(), 1),
+
1172 comm::Index2D(mat_a.rankIndex().col(), 0),
+
1173 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
-
-
-
-
1178 Matrix<T, Device::CPU> mat_taus_retiled =
-
1179 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
+
+
1176#ifdef DLAF_WITH_HDF5
+
1177 if (getTuneParameters().debug_dump_reduction_to_band_data) {
+
1178 file->write(mat_a, "/band");
+
-
1181 const SizeType ntiles = (nrefls - 1) / band_size + 1;
-
1182 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
+
1181 num_reduction_to_band_calls++;
+
-
1184 const bool is_full_band = (band_size == dist.blockSize().cols());
-
-
1186 constexpr std::size_t n_workspaces = 2;
-
1187 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
-
1188 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_vt(
-
1189 n_workspaces, dist);
-
-
1191 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
-
1192 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_wt(
-
1193 n_workspaces, dist);
+
+
+
+
1187 Matrix<T, Device::CPU> mat_taus_retiled =
+
1188 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
+
+
1190 const SizeType ntiles = (nrefls - 1) / band_size + 1;
+
1191 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
+
+
1193 const bool is_full_band = (band_size == dist.blockSize().cols());
-
1195 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
-
1196 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_xt(
-
1197 n_workspaces, dist);
-
-
1199 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist);
-
-
1201 ex::unique_any_sender<> trigger_panel{ex::just()};
-
1202 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
-
1203 const SizeType i_sub = j_sub + 1;
-
-
1205 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
-
1206 const GlobalElementIndex at_offset(i_sub * band_size, (j_sub + 1) * band_size);
+
1195 constexpr std::size_t n_workspaces = 2;
+
1196 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
+
1197 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_vt(
+
1198 n_workspaces, dist);
+
+
1200 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
+
1201 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_wt(
+
1202 n_workspaces, dist);
+
+
1204 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
+
1205 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_xt(
+
1206 n_workspaces, dist);
-
1208 const comm::Index2D rank_v0{
-
1209 dist.template rankGlobalElement<Coord::Row>(ij_offset.row()),
-
1210 dist.template rankGlobalElement<Coord::Col>(ij_offset.col()),
-
-
-
1213 const bool is_panel_rank_col = rank_v0.col() == rank.col();
-
-
1215 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
+
1208 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist);
+
+
1210 ex::unique_any_sender<> trigger_panel{ex::just()};
+
1211 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
+
1212 const SizeType i_sub = j_sub + 1;
+
+
1214 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
+
1215 const GlobalElementIndex at_offset(i_sub * band_size, (j_sub + 1) * band_size);
-
1217 if (nrefls_tile == 0)
-
-
-
1220 auto& v = panels_v.nextResource();
-
1221 auto& vt = panels_vt.nextResource();
-
-
1223 v.setRangeStart(at_offset);
-
1224 vt.setRangeStart(at_offset);
+
1217 const comm::Index2D rank_v0{
+
1218 dist.template rankGlobalElement<Coord::Row>(ij_offset.row()),
+
1219 dist.template rankGlobalElement<Coord::Col>(ij_offset.col()),
+
+
+
1222 const bool is_panel_rank_col = rank_v0.col() == rank.col();
+
+
1224 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
-
1226 v.setWidth(nrefls_tile);
-
1227 vt.setHeight(nrefls_tile);
+
1226 if (nrefls_tile == 0)
+
-
1229 const LocalTileIndex t_idx(0, 0);
-
-
-
1232 matrix::Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
-
-
-
1235 const matrix::SubPanelView panel_view(dist, ij_offset, band_size);
-
-
1237 if (is_panel_rank_col) {
-
1238 compute_panel_helper.call(std::move(trigger_panel), rank_v0.row(), mpi_col_chain_panel.exclusive(),
-
1239 mat_a, mat_taus_retiled, j_sub, panel_view);
-
-
-
-
-
-
1245 red2band::local::setupReflectorPanelV<B, D, T>(rank.row() == rank_v0.row(), panel_view,
-
1246 nrefls_tile, v, mat_a, !is_full_band);
-
1247 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx),
-
-
-
-
-
-
-
1254 if (!at_offset.isIn(mat_a.size()))
-
-
-
1257 const matrix::SubMatrixView trailing_matrix_view(dist, at_offset);
-
-
1259 comm::broadcast(rank_v0.col(), v, vt, mpi_row_chain, mpi_col_chain);
-
-
-
1262 auto& w = panels_w.nextResource();
-
1263 auto& wt = panels_wt.nextResource();
-
-
1265 w.setRangeStart(at_offset);
-
1266 wt.setRangeStart(at_offset);
+
1229 auto& v = panels_v.nextResource();
+
1230 auto& vt = panels_vt.nextResource();
+
+
1232 v.setRangeStart(at_offset);
+
1233 vt.setRangeStart(at_offset);
+
+
1235 v.setWidth(nrefls_tile);
+
1236 vt.setHeight(nrefls_tile);
+
+
1238 const LocalTileIndex t_idx(0, 0);
+
+
+
1241 matrix::Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
+
+
+
1244 const matrix::SubPanelView panel_view(dist, ij_offset, band_size);
+
+
1246 if (is_panel_rank_col) {
+
1247 compute_panel_helper.call(std::move(trigger_panel), rank_v0.row(), mpi_col_chain_panel.exclusive(),
+
1248 mat_a, mat_taus_retiled, j_sub, panel_view);
+
+
+
+
+
+
1254 red2band::local::setupReflectorPanelV<B, D, T>(rank.row() == rank_v0.row(), panel_view,
+
1255 nrefls_tile, v, mat_a, !is_full_band);
+
1256 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx),
+
+
+
+
+
+
+
1263 if (!at_offset.isIn(mat_a.size()))
+
+
+
1266 const matrix::SubMatrixView trailing_matrix_view(dist, at_offset);
-
1268 w.setWidth(nrefls_tile);
-
1269 wt.setHeight(nrefls_tile);
-
-
1271 if (is_panel_rank_col)
-
1272 red2band::local::trmmComputeW<B, D>(w, v, t.read(t_idx));
+
1268 comm::broadcast(rank_v0.col(), v, vt, mpi_row_chain, mpi_col_chain);
+
+
+
1271 auto& w = panels_w.nextResource();
+
1272 auto& wt = panels_wt.nextResource();
-
1274 comm::broadcast(rank_v0.col(), w, wt, mpi_row_chain, mpi_col_chain);
-
-
-
1277 auto& x = panels_x.nextResource();
-
1278 auto& xt = panels_xt.nextResource();
+
1274 w.setRangeStart(at_offset);
+
1275 wt.setRangeStart(at_offset);
+
+
1277 w.setWidth(nrefls_tile);
+
1278 wt.setHeight(nrefls_tile);
-
1280 x.setRangeStart(at_offset);
-
1281 xt.setRangeStart(at_offset);
+
1280 if (is_panel_rank_col)
+
1281 red2band::local::trmmComputeW<B, D>(w, v, t.read(t_idx));
-
1283 x.setWidth(nrefls_tile);
-
1284 xt.setHeight(nrefls_tile);
-
-
-
-
-
-
-
-
-
-
1294 hemmComputeX<B, D>(rank_v0.col(), x, xt, trailing_matrix_view, mat_a, w, wt, mpi_row_chain,
-
-
-
-
-
-
-
-
-
-
1304 if (is_panel_rank_col) {
-
-
-
1307 matrix::Matrix<T, D> w2 = std::move(t);
-
-
1309 red2band::local::gemmComputeW2<B, D>(w2, w, x);
-
1310 if (mpi_col_chain.size() > 1) {
-
1311 ex::start_detached(comm::schedule_all_reduce_in_place(mpi_col_chain.exclusive(), MPI_SUM,
-
1312 w2.readwrite(LocalTileIndex(0, 0))));
-
-
-
1315 red2band::local::gemmUpdateX<B, D>(x, w2, v);
-
+
1283 comm::broadcast(rank_v0.col(), w, wt, mpi_row_chain, mpi_col_chain);
+
+
+
1286 auto& x = panels_x.nextResource();
+
1287 auto& xt = panels_xt.nextResource();
+
+
1289 x.setRangeStart(at_offset);
+
1290 xt.setRangeStart(at_offset);
+
+
1292 x.setWidth(nrefls_tile);
+
1293 xt.setHeight(nrefls_tile);
+
+
+
+
+
+
+
+
+
+
1303 hemmComputeX<B, D>(rank_v0.col(), x, xt, trailing_matrix_view, mat_a, w, wt, mpi_row_chain,
+
+
+
+
+
+
+
+
+
+
1313 if (is_panel_rank_col) {
+
+
+
1316 matrix::Matrix<T, D> w2 = std::move(t);
-
-
-
-
-
1322 xt.setRangeStart(at_offset);
-
1323 xt.setHeight(nrefls_tile);
-
-
1325 comm::broadcast(rank_v0.col(), x, xt, mpi_row_chain, mpi_col_chain);
+
1318 red2band::local::gemmComputeW2<B, D>(w2, w, x);
+
1319 if (mpi_col_chain.size() > 1) {
+
1320 ex::start_detached(comm::schedule_all_reduce_in_place(mpi_col_chain.exclusive(), MPI_SUM,
+
1321 w2.readwrite(LocalTileIndex(0, 0))));
+
+
+
1324 red2band::local::gemmUpdateX<B, D>(x, w2, v);
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
1370 const SizeType j_tile_current = ij_offset.col() / dist.blockSize().cols();
-
1371 const SizeType j_tile_next = at_offset.col() / dist.blockSize().cols();
-
1372 const bool isNextColumnOnSameRank = (j_tile_current == j_tile_next);
-
1373 const comm::IndexT_MPI rank_next_col =
-
1374 isNextColumnOnSameRank ? rank_v0.col() : (rank_v0.col() + 1) % dist.commGridSize().cols();
-
-
1376 if (rank.col() == rank_next_col) {
-
1377 const LocalTileIndex at{
-
1378 dist.template nextLocalTileFromGlobalElement<Coord::Row>(at_offset.row()),
-
1379 dist.template nextLocalTileFromGlobalElement<Coord::Col>(at_offset.col()),
-
-
-
-
-
-
-
-
-
1388 const SizeType at_tile_col =
-
1389 dist.template globalTileFromGlobalElement<Coord::Col>(at_offset.col());
+
+
+
+
+
1331 xt.setRangeStart(at_offset);
+
1332 xt.setHeight(nrefls_tile);
+
+
1334 comm::broadcast(rank_v0.col(), x, xt, mpi_row_chain, mpi_col_chain);
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
1379 const SizeType j_tile_current = ij_offset.col() / dist.blockSize().cols();
+
1380 const SizeType j_tile_next = at_offset.col() / dist.blockSize().cols();
+
1381 const bool isNextColumnOnSameRank = (j_tile_current == j_tile_next);
+
1382 const comm::IndexT_MPI rank_next_col =
+
1383 isNextColumnOnSameRank ? rank_v0.col() : (rank_v0.col() + 1) % dist.commGridSize().cols();
+
+
1385 if (rank.col() == rank_next_col) {
+
1386 const LocalTileIndex at{
+
1387 dist.template nextLocalTileFromGlobalElement<Coord::Row>(at_offset.row()),
+
1388 dist.template nextLocalTileFromGlobalElement<Coord::Col>(at_offset.col()),
+
-
1391 if (at_tile_col == dist.nrTiles().cols() - 1) {
-
1392 const comm::IndexT_MPI owner = rank_v0.row();
-
1393 if (rank.row() == owner) {
-
1394 xt.setTile(at, x.read(at));
-
-
1396 if (dist.commGridSize().rows() > 1)
-
1397 ex::start_detached(comm::schedule_bcast_send(mpi_col_chain.exclusive(), xt.read(at)));
-
-
-
1400 if (dist.commGridSize().rows() > 1)
-
1401 ex::start_detached(comm::schedule_bcast_recv(mpi_col_chain.exclusive(), owner,
-
-
-
-
-
1406 if constexpr (dlaf::comm::CommunicationDevice_v<D> == D) {
-
-
-
-
1410 if (rank.row() == rank_v0.row()) {
-
1411 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
+
+
+
+
+
+
+
1397 const SizeType at_tile_col =
+
1398 dist.template globalTileFromGlobalElement<Coord::Col>(at_offset.col());
+
+
1400 if (at_tile_col == dist.nrTiles().cols() - 1) {
+
1401 const comm::IndexT_MPI owner = rank_v0.row();
+
1402 if (rank.row() == owner) {
+
1403 xt.setTile(at, x.read(at));
+
+
1405 if (dist.commGridSize().rows() > 1)
+
1406 ex::start_detached(comm::schedule_bcast_send(mpi_col_chain.exclusive(), xt.read(at)));
+
+
+
1409 if (dist.commGridSize().rows() > 1)
+
1410 ex::start_detached(comm::schedule_bcast_recv(mpi_col_chain.exclusive(), owner,
+
-
-
-
-
-
-
1418 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
-
-
-
-
1422 if (rank.row() == rank_v0.row()) {
+
+
+
1415 if constexpr (dlaf::comm::CommunicationDevice_v<D> == D) {
+
+
+
+
1419 if (rank.row() == rank_v0.row()) {
+
1420 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
+
+
-
-
-
-
-
-
-
-
1431 trigger_panel = x.readwrite(at) | ex::drop_value() | ex::ensure_started();
-
-
-
-
-
-
-
1438 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
-
-
-
-
-
-
1444 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, vt, v, xt);
-
-
-
-
-
-
-
-
-
-
1454#ifdef DLAF_WITH_HDF5
-
1455 if (getTuneParameters().debug_dump_reduction_to_band_data) {
-
1456 file->write(mat_a, "/band");
-
-
-
1459 num_reduction_to_band_calls++;
-
-
-
-
-
+
+
+
+
1427 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
+
+
+
+
1431 if (rank.row() == rank_v0.row()) {
+
+
+
+
+
+
+
+
+
1440 trigger_panel = x.readwrite(at) | ex::drop_value() | ex::ensure_started();
+
+
+
+
+
+
+
1447 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
+
+
+
+
+
+
1453 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, vt, v, xt);
+
+
+
+
+
+
+
+
+
+
1463#ifdef DLAF_WITH_HDF5
+
1464 if (getTuneParameters().debug_dump_reduction_to_band_data) {
+
1465 file->write(mat_a, "/band");
+
+
+
1468 num_reduction_to_band_calls++;
+
+
+
+
+
void gemm(const blas::Op op_a, const blas::Op op_b, const T alpha, const Tile< const T, D > &a, const Tile< const T, D > &b, const T beta, const Tile< T, D > &c)
@@ -1585,7 +1594,7 @@
-
+
auto iteratorLocal() const noexcept
Return a Range2D that gives access to all local tiles part of the View.
Definition views.h:70
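The substantive change in the hunk above replaces the fixed worker count returned by getReductionToBandPanelNWorkers() with a value clamped to the amount of panel work: the ideal count is the number of panel tiles in the local variant (new lines 314-318) and half of it, rounded up, in the distributed variant (new lines 639-643). A minimal standalone sketch of that selection logic, assuming hypothetical names (select_panel_workers, nrtiles, available_workers, distributed are illustration only, not the library API):

#include <algorithm>
#include <cstddef>

// Sketch of the new worker-count selection: at least one worker, at most what
// the runtime makes available, and never more workers than there is work to
// split (one tile per worker locally, two tiles per worker via ceil-division
// in the distributed variant).
std::size_t select_panel_workers(std::size_t nrtiles, std::size_t available_workers,
                                 bool distributed) {
  const std::size_t min_workers = 1;
  const std::size_t ideal_workers = distributed ? (nrtiles + 1) / 2 : nrtiles;
  return std::clamp(ideal_workers, min_workers, available_workers);
}

For example, with 3 panel tiles and 8 available workers this yields 3 workers in the local case and 2 in the distributed case, so small panels no longer spawn idle bulk workers.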
diff --git a/master/get__red2band__panel__nworkers_8h_source.html b/master/get__red2band__panel__nworkers_8h_source.html
index 9199dd87e9..bf06866e51 100644
--- a/master/get__red2band__panel__nworkers_8h_source.html
+++ b/master/get__red2band__panel__nworkers_8h_source.html
@@ -102,7 +102,7 @@
20namespace dlaf::eigensolver::internal {
-
22inline size_t getReductionToBandPanelNWorkers() noexcept {
+
22inline size_t get_red2band_panel_nworkers() noexcept {
24 const std::size_t available_workers = pika::resource::get_thread_pool("default").get_os_thread_count();
25 const std::size_t min_workers = 1;
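This header's helper was renamed from getReductionToBandPanelNWorkers() to get_red2band_panel_nworkers(); it reports how many OS threads of pika's "default" pool may be used for the panel, and the reduction-to-band code above now clamps its ideal per-panel worker count against that value rather than using it directly. A hedged caller-side sketch of that pattern (ideal_workers stands in for the per-panel value computed above; this is not the library's own code):

// Hypothetical usage mirroring the new hunks in reduction_to_band/impl.h.
const std::size_t available_workers = dlaf::eigensolver::internal::get_red2band_panel_nworkers();
const std::size_t nworkers = std::clamp(ideal_workers, std::size_t{1}, available_workers);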