diff --git a/master/eigensolver_2reduction__to__band_2impl_8h_source.html b/master/eigensolver_2reduction__to__band_2impl_8h_source.html index 6ae066a03f..8a2fa30187 100644 --- a/master/eigensolver_2reduction__to__band_2impl_8h_source.html +++ b/master/eigensolver_2reduction__to__band_2impl_8h_source.html @@ -394,1161 +394,1170 @@
311 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
312 }
314 const std::size_t nthreads = getReductionToBandPanelNWorkers();
315 auto s =
316 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
317 std::vector<common::internal::vector<T>>{}), // w (internally required)
318 mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
319 ex::when_all_vector(std::move(panel_tiles))) |
320 di::continues_on(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
321 ex::bulk(nthreads, [nthreads, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr,
322 auto& w, auto& taus, auto& tiles) {
323 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
324 const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
325 const std::size_t begin = index * batch_size;
326 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
327 const SizeType nrefls = taus.size().rows();
329 if (index == 0) {
330 w.resize(nthreads);
331 }
333 for (SizeType j = 0; j < nrefls; ++j) {
334 // STEP1: compute tau and reflector (single-thread)
335 if (index == 0) {
336 taus({j, 0}) = computeReflector(tiles, j);
337 }
339 barrier_ptr->arrive_and_wait(barrier_busy_wait);
341 // STEP2a: compute w (multi-threaded)
342 const SizeType pt_cols = cols - (j + 1);
343 if (pt_cols == 0)
344 break;
345 const bool has_head = (index == 0);
347 w[index] = common::internal::vector<T>(pt_cols, 0);
348 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
349 barrier_ptr->arrive_and_wait(barrier_busy_wait);
351 // STEP2b: reduce w results (single-threaded)
352 if (index == 0)
353 dlaf::eigensolver::internal::reduceColumnVectors(w);
314 const std::size_t nworkers = [nrtiles = panel_tiles.size()]() {
315 const std::size_t min_workers = 1;
316 const std::size_t available_workers = get_red2band_panel_nworkers();
317 const std::size_t ideal_workers = to_sizet(nrtiles);
318 return std::clamp(ideal_workers, min_workers, available_workers);
319 }();
320 ex::start_detached(
321 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nworkers),
322 std::vector<common::internal::vector<T>>{}), // w (internally required)
323 mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
324 ex::when_all_vector(std::move(panel_tiles))) |
325 di::continues_on(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
326 ex::bulk(nworkers, [nworkers, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr,
327 auto& w, auto& taus, auto& tiles) {
328 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
329 const std::size_t batch_size = util::ceilDiv(tiles.size(), nworkers);
330 const std::size_t begin = index * batch_size;
331 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
332 const SizeType nrefls = taus.size().rows();
334 if (index == 0) {
335 w.resize(nworkers);
336 }
338 for (SizeType j = 0; j < nrefls; ++j) {
339 // STEP1: compute tau and reflector (single-thread)
340 if (index == 0) {
341 taus({j, 0}) = computeReflector(tiles, j);
342 }
344 barrier_ptr->arrive_and_wait(barrier_busy_wait);
346 // STEP2a: compute w (multi-threaded)
347 const SizeType pt_cols = cols - (j + 1);
348 if (pt_cols == 0)
349 break;
350 const bool has_head = (index == 0);
352 w[index] = common::internal::vector<T>(pt_cols, 0);
353 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
354 barrier_ptr->arrive_and_wait(barrier_busy_wait);
356 // STEP3: update trailing panel (multi-threaded)
357 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
358 barrier_ptr->arrive_and_wait(barrier_busy_wait);
359 }
360 });
361 ex::start_detached(std::move(s));
364template <Backend B, Device D, class T>
365void setupReflectorPanelV(bool has_head, const matrix::SubPanelView& panel_view, const SizeType nrefls,
366 matrix::Panel<Coord::Col, T, D>& v, matrix::Matrix<const T, D>& mat_a,
367 bool force_copy = false) {
368 namespace ex = pika::execution::experimental;
370 using pika::execution::thread_priority;
371 using pika::execution::thread_stacksize;
373 // Note:
374 // Reflectors are stored in the lower triangular part of the A matrix leading to sharing memory
375 // between reflectors and results, which are in the upper triangular part. The problem exists only
376 // for the first tile (of the V, i.e. band excluded). Since refelectors will be used in next
377 // computations, they should be well-formed, i.e. a unit lower trapezoidal matrix. For this reason,
378 // a support tile is used, where just the reflectors values are copied, the diagonal is set to 1
379 // and the rest is zeroed out.
380 auto it_begin = panel_view.iteratorLocal().begin();
381 auto it_end = panel_view.iteratorLocal().end();
383 if (has_head) {
384 const LocalTileIndex i = *it_begin;
385 matrix::SubTileSpec spec = panel_view(i);
356 // STEP2b: reduce w results (single-threaded)
357 if (index == 0)
358 dlaf::eigensolver::internal::reduceColumnVectors(w);
359 barrier_ptr->arrive_and_wait(barrier_busy_wait);
361 // STEP3: update trailing panel (multi-threaded)
362 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
363 barrier_ptr->arrive_and_wait(barrier_busy_wait);
364 }
365 }));
368template <Backend B, Device D, class T>
369void setupReflectorPanelV(bool has_head, const matrix::SubPanelView& panel_view, const SizeType nrefls,
370 matrix::Panel<Coord::Col, T, D>& v, matrix::Matrix<const T, D>& mat_a,
371 bool force_copy = false) {
372 namespace ex = pika::execution::experimental;
374 using pika::execution::thread_priority;
375 using pika::execution::thread_stacksize;
377 // Note:
378 // Reflectors are stored in the lower triangular part of the A matrix leading to sharing memory
379 // between reflectors and results, which are in the upper triangular part. The problem exists only
380 // for the first tile (of the V, i.e. band excluded). Since refelectors will be used in next
381 // computations, they should be well-formed, i.e. a unit lower trapezoidal matrix. For this reason,
382 // a support tile is used, where just the reflectors values are copied, the diagonal is set to 1
383 // and the rest is zeroed out.
384 auto it_begin = panel_view.iteratorLocal().begin();
385 auto it_end = panel_view.iteratorLocal().end();
387 // Note:
388 // If the number of reflectors are limited by height (|reflector| > 1), the panel is narrower than
389 // the blocksize, leading to just using a part of A (first full nrefls columns)
390 spec.size = {spec.size.rows(), std::min(nrefls, spec.size.cols())};
392 // Note:
393 // copy + laset is done in two independent tasks, but it could be theoretically merged to into a
394 // single task doing both.
395 const auto p = dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack);
396 ex::start_detached(dlaf::internal::whenAllLift(splitTile(mat_a.read(i), spec), v.readwrite(i)) |
397 matrix::copy(p));
398 ex::start_detached(dlaf::internal::whenAllLift(blas::Uplo::Upper, T(0), T(1), v.readwrite(i)) |
399 tile::laset(p));
401 ++it_begin;
402 }
404 // The rest of the V panel of reflectors can just point to the values in A, since they are
405 // well formed in-place.
406 for (auto it = it_begin; it < it_end; ++it) {
407 const LocalTileIndex idx = *it;
408 const matrix::SubTileSpec& spec = panel_view(idx);
410 // Note: This is a workaround for the deadlock problem with sub-tiles.
411 // Without this copy, during matrix update the same tile would get accessed at the same
412 // time both in readonly mode (for reflectors) and in readwrite mode (for updating the
413 // matrix). This would result in a deadlock, so instead of linking the panel to an external
414 // tile, memory provided internally by the panel is used as support. In this way, the two
415 // subtiles used in the operation belong to different tiles.
416 if (force_copy)
417 ex::start_detached(ex::when_all(matrix::splitTile(mat_a.read(idx), spec), v.readwrite(idx)) |
418 matrix::copy(dlaf::internal::Policy<B>(thread_priority::high,
419 thread_stacksize::nostack)));
420 else
421 v.setTile(idx, matrix::splitTile(mat_a.read(idx), spec));
422 }
425template <Backend B, Device D, class T>
426void trmmComputeW(matrix::Panel<Coord::Col, T, D>& w, matrix::Panel<Coord::Col, T, D>& v,
427 matrix::ReadOnlyTileSender<T, D> tile_t) {
428 namespace ex = pika::execution::experimental;
430 using pika::execution::thread_priority;
431 using pika::execution::thread_stacksize;
432 using namespace blas;
387 if (has_head) {
388 const LocalTileIndex i = *it_begin;
389 matrix::SubTileSpec spec = panel_view(i);
391 // Note:
392 // If the number of reflectors are limited by height (|reflector| > 1), the panel is narrower than
393 // the blocksize, leading to just using a part of A (first full nrefls columns)
394 spec.size = {spec.size.rows(), std::min(nrefls, spec.size.cols())};
396 // Note:
397 // copy + laset is done in two independent tasks, but it could be theoretically merged to into a
398 // single task doing both.
399 const auto p = dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack);
400 ex::start_detached(dlaf::internal::whenAllLift(splitTile(mat_a.read(i), spec), v.readwrite(i)) |
401 matrix::copy(p));
402 ex::start_detached(dlaf::internal::whenAllLift(blas::Uplo::Upper, T(0), T(1), v.readwrite(i)) |
403 tile::laset(p));
405 ++it_begin;
406 }
408 // The rest of the V panel of reflectors can just point to the values in A, since they are
409 // well formed in-place.
410 for (auto it = it_begin; it < it_end; ++it) {
411 const LocalTileIndex idx = *it;
412 const matrix::SubTileSpec& spec = panel_view(idx);
414 // Note: This is a workaround for the deadlock problem with sub-tiles.
415 // Without this copy, during matrix update the same tile would get accessed at the same
416 // time both in readonly mode (for reflectors) and in readwrite mode (for updating the
417 // matrix). This would result in a deadlock, so instead of linking the panel to an external
418 // tile, memory provided internally by the panel is used as support. In this way, the two
419 // subtiles used in the operation belong to different tiles.
420 if (force_copy)
421 ex::start_detached(ex::when_all(matrix::splitTile(mat_a.read(idx), spec), v.readwrite(idx)) |
422 matrix::copy(dlaf::internal::Policy<B>(thread_priority::high,
423 thread_stacksize::nostack)));
424 else
425 v.setTile(idx, matrix::splitTile(mat_a.read(idx), spec));
426 }
429template <Backend B, Device D, class T>
430void trmmComputeW(matrix::Panel<Coord::Col, T, D>& w, matrix::Panel<Coord::Col, T, D>& v,
431 matrix::ReadOnlyTileSender<T, D> tile_t) {
432 namespace ex = pika::execution::experimental;
434 auto it = w.iteratorLocal();
436 for (const auto& index_i : it) {
437 ex::start_detached(dlaf::internal::whenAllLift(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
438 T(1), tile_t, v.read(index_i), w.readwrite(index_i)) |
439 tile::trmm3(dlaf::internal::Policy<B>(thread_priority::high,
440 thread_stacksize::nostack)));
441 }
443 if (it.empty()) {
444 ex::start_detached(std::move(tile_t));
434 using pika::execution::thread_priority;
435 using pika::execution::thread_stacksize;
436 using namespace blas;
438 auto it = w.iteratorLocal();
440 for (const auto& index_i : it) {
441 ex::start_detached(dlaf::internal::whenAllLift(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
442 T(1), tile_t, v.read(index_i), w.readwrite(index_i)) |
443 tile::trmm3(dlaf::internal::Policy<B>(thread_priority::high,
444 thread_stacksize::nostack)));
445 }
448template <Backend B, Device D, class T>
449void gemmUpdateX(matrix::Panel<Coord::Col, T, D>& x, matrix::Matrix<const T, D>& w2,
450 matrix::Panel<Coord::Col, const T, D>& v) {
451 namespace ex = pika::execution::experimental;
453 using pika::execution::thread_priority;
454 using pika::execution::thread_stacksize;
455 using namespace blas;
447 if (it.empty()) {
448 ex::start_detached(std::move(tile_t));
449 }
452template <Backend B, Device D, class T>
453void gemmUpdateX(matrix::Panel<Coord::Col, T, D>& x, matrix::Matrix<const T, D>& w2,
454 matrix::Panel<Coord::Col, const T, D>& v) {
455 namespace ex = pika::execution::experimental;
457 // GEMM X = X - 0.5 . V . W2
458 for (const auto& index_i : v.iteratorLocal())
459 ex::start_detached(
460 dlaf::internal::whenAllLift(Op::NoTrans, Op::NoTrans, T(-0.5), v.read(index_i),
461 w2.read(LocalTileIndex(0, 0)), T(1), x.readwrite(index_i)) |
462 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
465template <Backend B, Device D, class T>
466void hemmComputeX(matrix::Panel<Coord::Col, T, D>& x, const matrix::SubMatrixView& view,
467 matrix::Matrix<const T, D>& a, matrix::Panel<Coord::Col, const T, D>& w) {
468 namespace ex = pika::execution::experimental;
470 using pika::execution::thread_priority;
472 const auto dist = a.distribution();
457 using pika::execution::thread_priority;
458 using pika::execution::thread_stacksize;
459 using namespace blas;
461 // GEMM X = X - 0.5 . V . W2
462 for (const auto& index_i : v.iteratorLocal())
463 ex::start_detached(
464 dlaf::internal::whenAllLift(Op::NoTrans, Op::NoTrans, T(-0.5), v.read(index_i),
465 w2.read(LocalTileIndex(0, 0)), T(1), x.readwrite(index_i)) |
466 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
469template <Backend B, Device D, class T>
470void hemmComputeX(matrix::Panel<Coord::Col, T, D>& x, const matrix::SubMatrixView& view,
471 matrix::Matrix<const T, D>& a, matrix::Panel<Coord::Col, const T, D>& w) {
472 namespace ex = pika::execution::experimental;
474 // Note:
475 // They have to be set to zero, because all tiles are going to be reduced, and some tiles may not get
476 // "initialized" during computation, so they should not contribute with any spurious value to the final
477 // result.
478 matrix::util::set0<B>(thread_priority::high, x);
480 const LocalTileIndex at_offset = view.begin();
482 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
483 const auto limit = i + 1;
484 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
485 const LocalTileIndex ij{i, j};
487 const bool is_diagonal_tile = (ij.row() == ij.col());
489 const auto& tile_a = splitTile(a.read(ij), view(ij));
474 using pika::execution::thread_priority;
476 const auto dist = a.distribution();
478 // Note:
479 // They have to be set to zero, because all tiles are going to be reduced, and some tiles may not get
480 // "initialized" during computation, so they should not contribute with any spurious value to the final
481 // result.
482 matrix::util::set0<B>(thread_priority::high, x);
484 const LocalTileIndex at_offset = view.begin();
486 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
487 const auto limit = i + 1;
488 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
489 const LocalTileIndex ij{i, j};
491 if (is_diagonal_tile) {
492 hemmDiag<B>(thread_priority::high, tile_a, w.read(ij), x.readwrite(ij));
493 }
494 else {
495 // Note:
496 // Because A is hermitian and just the lower part contains the data, for each a(ij) not
497 // on the diagonal, two computations are done:
498 // - using a(ij) in its position;
499 // - using a(ij) in its "transposed" position (applying the ConjTrans to its data)
501 {
502 const LocalTileIndex index_x(Coord::Row, ij.row());
503 const LocalTileIndex index_w(Coord::Row, ij.col());
504 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, w.read(index_w),
505 x.readwrite(index_x));
506 }
508 {
509 const LocalTileIndex index_pretended = transposed(ij);
510 const LocalTileIndex index_x(Coord::Row, index_pretended.row());
511 const LocalTileIndex index_w(Coord::Row, index_pretended.col());
512 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, tile_a, w.read(index_w),
513 x.readwrite(index_x));
514 }
515 }
516 }
517 }
520template <Backend B, Device D, class T>
521void gemmComputeW2(matrix::Matrix<T, D>& w2, matrix::Panel<Coord::Col, const T, D>& w,
522 matrix::Panel<Coord::Col, const T, D>& x) {
523 using pika::execution::thread_priority;
524 using pika::execution::thread_stacksize;
526 namespace ex = pika::execution::experimental;
528 // Note:
529 // Not all ranks in the column always hold at least a tile in the panel Ai, but all ranks in
530 // the column are going to participate to the reduce. For them, it is important to set the
531 // partial result W2 to zero.
532 ex::start_detached(w2.readwrite(LocalTileIndex(0, 0)) |
533 tile::set0(dlaf::internal::Policy<B>(thread_priority::high,
534 thread_stacksize::nostack)));
536 using namespace blas;
537 // GEMM W2 = W* . X
538 for (const auto& index_tile : w.iteratorLocal())
539 ex::start_detached(
540 dlaf::internal::whenAllLift(Op::ConjTrans, Op::NoTrans, T(1), w.read(index_tile),
541 x.read(index_tile), T(1), w2.readwrite(LocalTileIndex(0, 0))) |
542 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
545template <Backend B, Device D, class T>
546void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, matrix::Matrix<T, D>& a,
547 matrix::Panel<Coord::Col, const T, D>& x,
548 matrix::Panel<Coord::Col, const T, D>& v) {
549 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
551 using pika::execution::thread_priority;
553 const auto dist = a.distribution();
491 const bool is_diagonal_tile = (ij.row() == ij.col());
493 const auto& tile_a = splitTile(a.read(ij), view(ij));
495 if (is_diagonal_tile) {
496 hemmDiag<B>(thread_priority::high, tile_a, w.read(ij), x.readwrite(ij));
497 }
498 else {
499 // Note:
500 // Because A is hermitian and just the lower part contains the data, for each a(ij) not
501 // on the diagonal, two computations are done:
502 // - using a(ij) in its position;
503 // - using a(ij) in its "transposed" position (applying the ConjTrans to its data)
505 {
506 const LocalTileIndex index_x(Coord::Row, ij.row());
507 const LocalTileIndex index_w(Coord::Row, ij.col());
508 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, w.read(index_w),
509 x.readwrite(index_x));
510 }
512 {
513 const LocalTileIndex index_pretended = transposed(ij);
514 const LocalTileIndex index_x(Coord::Row, index_pretended.row());
515 const LocalTileIndex index_w(Coord::Row, index_pretended.col());
516 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, tile_a, w.read(index_w),
517 x.readwrite(index_x));
518 }
519 }
520 }
521 }
524template <Backend B, Device D, class T>
525void gemmComputeW2(matrix::Matrix<T, D>& w2, matrix::Panel<Coord::Col, const T, D>& w,
526 matrix::Panel<Coord::Col, const T, D>& x) {
527 using pika::execution::thread_priority;
528 using pika::execution::thread_stacksize;
530 namespace ex = pika::execution::experimental;
532 // Note:
533 // Not all ranks in the column always hold at least a tile in the panel Ai, but all ranks in
534 // the column are going to participate to the reduce. For them, it is important to set the
535 // partial result W2 to zero.
536 ex::start_detached(w2.readwrite(LocalTileIndex(0, 0)) |
537 tile::set0(dlaf::internal::Policy<B>(thread_priority::high,
538 thread_stacksize::nostack)));
540 using namespace blas;
541 // GEMM W2 = W* . X
542 for (const auto& index_tile : w.iteratorLocal())
543 ex::start_detached(
544 dlaf::internal::whenAllLift(Op::ConjTrans, Op::NoTrans, T(1), w.read(index_tile),
545 x.read(index_tile), T(1), w2.readwrite(LocalTileIndex(0, 0))) |
546 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
549template <Backend B, Device D, class T>
550void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, matrix::Matrix<T, D>& a,
551 matrix::Panel<Coord::Col, const T, D>& x,
552 matrix::Panel<Coord::Col, const T, D>& v) {
553 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
555 const LocalTileIndex at_start = view.begin();
555 using pika::execution::thread_priority;
557 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
558 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
559 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
560 for (SizeType j = at_start.col(); j < limit; ++j) {
561 const LocalTileIndex ij_local{i, j};
562 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
564 const bool is_diagonal_tile = (ij.row() == ij.col());
566 auto getSubA = [&a, &view, ij_local]() {
567 return splitTile(a.readwrite(ij_local), view(ij_local));
568 };
557 const auto dist = a.distribution();
559 const LocalTileIndex at_start = view.begin();
561 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
562 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
563 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
564 for (SizeType j = at_start.col(); j < limit; ++j) {
565 const LocalTileIndex ij_local{i, j};
566 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
568 const bool is_diagonal_tile = (ij.row() == ij.col());
570 // The first column of the trailing matrix (except for the very first global tile) has to be
571 // updated first, in order to unlock the next iteration as soon as possible.
572 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
570 auto getSubA = [&a, &view, ij_local]() {
571 return splitTile(a.readwrite(ij_local), view(ij_local));
572 };
574 if (is_diagonal_tile) {
575 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
576 }
577 else {
578 // A -= X . V*
579 her2kOffDiag<B>(priority, x.read(ij_local), v.read(transposed(ij_local)), getSubA());
581 // A -= V . X*
582 her2kOffDiag<B>(priority, v.read(ij_local), x.read(transposed(ij_local)), getSubA());
583 }
584 }
585 }
590namespace distributed {
591template <Device D, class T>
592T computeReflector(const bool has_head, comm::Communicator& communicator,
593 const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
594 std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
596 // Note:
597 // This is an optimization for grouping two separate low bandwidth communications, respectively
598 // bcast(x0) and reduce(norm), where the latency was degrading performances.
599 //
600 // In particular this allReduce allows to:
601 // - bcast x0, since for all ranks is 0 and just the root rank has the real value;
602 // - allReduce squares for the norm computation.
574 // The first column of the trailing matrix (except for the very first global tile) has to be
575 // updated first, in order to unlock the next iteration as soon as possible.
576 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
578 if (is_diagonal_tile) {
579 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
580 }
581 else {
582 // A -= X . V*
583 her2kOffDiag<B>(priority, x.read(ij_local), v.read(transposed(ij_local)), getSubA());
585 // A -= V . X*
586 her2kOffDiag<B>(priority, v.read(ij_local), x.read(transposed(ij_local)), getSubA());
587 }
588 }
589 }
594namespace distributed {
595template <Device D, class T>
596T computeReflector(const bool has_head, comm::Communicator& communicator,
597 const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
598 std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
600 // Note:
601 // This is an optimization for grouping two separate low bandwidth communications, respectively
602 // bcast(x0) and reduce(norm), where the latency was degrading performances.
603 //
604 // Moreover, by all-reducing squares and broadcasting x0, all ranks have all the information to
605 // update locally the reflectors (section they have). This is more efficient than computing params
606 // (e.g. norm, y, tau) just on the root rank and then having to broadcast them (i.e. additional
607 // communication).
608 comm::sync::allReduceInPlace(communicator, MPI_SUM,
609 common::make_data(x0_and_squares.data(),
610 to_SizeType(x0_and_squares.size())));
612 auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
614 return tau;
617template <class MatrixLikeA, class MatrixLikeTaus, class TriggerSender, class CommSender>
618void computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
619 CommSender&& mpi_col_chain_panel, MatrixLikeA& mat_a,
620 MatrixLikeTaus& mat_taus, SizeType j_sub,
621 const matrix::SubPanelView& panel_view) {
622 static Device constexpr D = MatrixLikeA::device;
623 using T = typename MatrixLikeA::ElementType;
624 namespace ex = pika::execution::experimental;
625 namespace di = dlaf::internal;
627 std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
628 panel_tiles.reserve(to_sizet(std::distance(panel_view.iteratorLocal().begin(),
629 panel_view.iteratorLocal().end())));
630 for (const auto& i : panel_view.iteratorLocal()) {
631 const matrix::SubTileSpec& spec = panel_view(i);
632 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
633 }
635 const std::size_t nthreads = getReductionToBandPanelNWorkers();
636 auto s =
637 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
638 std::vector<common::internal::vector<T>>{}), // w (internally required)
639 mat_taus.readwrite(GlobalTileIndex(j_sub, 0)),
640 ex::when_all_vector(std::move(panel_tiles)),
641 std::forward<CommSender>(mpi_col_chain_panel), std::forward<TriggerSender>(trigger)) |
642 di::continues_on(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
643 ex::bulk(nthreads, [nthreads, rank_v0,
644 cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr, auto& w,
645 auto& taus, auto& tiles, auto&& pcomm) {
646 const bool rankHasHead = rank_v0 == pcomm.get().rank();
648 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
649 const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
650 const std::size_t begin = index * batch_size;
651 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
652 const SizeType nrefls = taus.size().rows();
654 if (index == 0) {
655 w.resize(nthreads);
656 }
604 // In particular this allReduce allows to:
605 // - bcast x0, since for all ranks is 0 and just the root rank has the real value;
606 // - allReduce squares for the norm computation.
607 //
608 // Moreover, by all-reducing squares and broadcasting x0, all ranks have all the information to
609 // update locally the reflectors (section they have). This is more efficient than computing params
610 // (e.g. norm, y, tau) just on the root rank and then having to broadcast them (i.e. additional
611 // communication).
612 comm::sync::allReduceInPlace(communicator, MPI_SUM,
613 common::make_data(x0_and_squares.data(),
614 to_SizeType(x0_and_squares.size())));
616 auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
618 return tau;
621template <class MatrixLikeA, class MatrixLikeTaus, class TriggerSender, class CommSender>
622void computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
623 CommSender&& mpi_col_chain_panel, MatrixLikeA& mat_a,
624 MatrixLikeTaus& mat_taus, SizeType j_sub,
625 const matrix::SubPanelView& panel_view) {
626 static Device constexpr D = MatrixLikeA::device;
627 using T = typename MatrixLikeA::ElementType;
628 namespace ex = pika::execution::experimental;
629 namespace di = dlaf::internal;
631 std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
632 panel_tiles.reserve(to_sizet(std::distance(panel_view.iteratorLocal().begin(),
633 panel_view.iteratorLocal().end())));
634 for (const auto& i : panel_view.iteratorLocal()) {
635 const matrix::SubTileSpec& spec = panel_view(i);
636 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
637 }
639 const std::size_t nworkers = [nrtiles = panel_tiles.size()]() {
640 const std::size_t min_workers = 1;
641 const std::size_t available_workers = get_red2band_panel_nworkers();
642 const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(2));
643 return std::clamp(ideal_workers, min_workers, available_workers);
644 }();
646 ex::start_detached(
647 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nworkers),
648 std::vector<common::internal::vector<T>>{}), // w (internally required)
649 mat_taus.readwrite(GlobalTileIndex(j_sub, 0)),
650 ex::when_all_vector(std::move(panel_tiles)),
651 std::forward<CommSender>(mpi_col_chain_panel), std::forward<TriggerSender>(trigger)) |
652 di::continues_on(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
653 ex::bulk(nworkers, [nworkers, rank_v0,
654 cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr, auto& w,
655 auto& taus, auto& tiles, auto&& pcomm) {
656 const bool rankHasHead = rank_v0 == pcomm.get().rank();
658 for (SizeType j = 0; j < nrefls; ++j) {
659 // STEP1: compute tau and reflector (single-thread)
660 if (index == 0) {
661 const bool has_head = rankHasHead;
662 taus({j, 0}) = computeReflector(has_head, pcomm.get(), tiles, j);
663 }
664 barrier_ptr->arrive_and_wait(barrier_busy_wait);
666 // STEP2a: compute w (multi-threaded)
667 const SizeType pt_cols = cols - (j + 1);
668 if (pt_cols == 0)
669 break;
671 const bool has_head = rankHasHead && (index == 0);
673 w[index] = common::internal::vector<T>(pt_cols, 0);
674 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
675 barrier_ptr->arrive_and_wait(barrier_busy_wait);
677 // STEP2b: reduce w results (single-threaded)
678 if (index == 0) {
679 dlaf::eigensolver::internal::reduceColumnVectors(w);
680 comm::sync::allReduceInPlace(pcomm.get(), MPI_SUM, common::make_data(w[0].data(), pt_cols));
681 }
682 barrier_ptr->arrive_and_wait(barrier_busy_wait);
684 // STEP3: update trailing panel (multi-threaded)
685 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
686 barrier_ptr->arrive_and_wait(barrier_busy_wait);
687 }
688 });
689 ex::start_detached(std::move(s));
692template <Backend B, Device D, class T>
693void hemmComputeX(comm::IndexT_MPI reducer_col, matrix::Panel<Coord::Col, T, D>& x,
694 matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& xt,
695 const matrix::SubMatrixView& view, matrix::Matrix<const T, D>& a,
696 matrix::Panel<Coord::Col, const T, D>& w,
697 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& wt,
698 comm::CommunicatorPipeline<comm::CommunicatorType::Row>& mpi_row_chain,
699 comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_chain) {
700 namespace ex = pika::execution::experimental;
702 using pika::execution::thread_priority;
704 const auto dist = a.distribution();
705 const auto rank = dist.rankIndex();
707 // Note:
708 // They have to be set to zero, because all tiles are going to be reduced, and some tiles may not get
709 // "initialized" during computation, so they should not contribute with any spurious value to the final
710 // result.
711 matrix::util::set0<B>(thread_priority::high, x);
712 matrix::util::set0<B>(thread_priority::high, xt);
714 const LocalTileIndex at_offset = view.begin();
658 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
659 const std::size_t batch_size = util::ceilDiv(tiles.size(), nworkers);
660 const std::size_t begin = index * batch_size;
661 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
662 const SizeType nrefls = taus.size().rows();
664 if (index == 0) {
665 w.resize(nworkers);
666 }
668 for (SizeType j = 0; j < nrefls; ++j) {
669 // STEP1: compute tau and reflector (single-thread)
670 if (index == 0) {
671 const bool has_head = rankHasHead;
672 taus({j, 0}) = computeReflector(has_head, pcomm.get(), tiles, j);
673 }
674 barrier_ptr->arrive_and_wait(barrier_busy_wait);
676 // STEP2a: compute w (multi-threaded)
677 const SizeType pt_cols = cols - (j + 1);
678 if (pt_cols == 0)
679 break;
681 const bool has_head = rankHasHead && (index == 0);
683 w[index] = common::internal::vector<T>(pt_cols, 0);
684 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
685 barrier_ptr->arrive_and_wait(barrier_busy_wait);
687 // STEP2b: reduce w results (single-threaded)
688 if (index == 0) {
689 dlaf::eigensolver::internal::reduceColumnVectors(w);
690 comm::sync::allReduceInPlace(pcomm.get(), MPI_SUM, common::make_data(w[0].data(), pt_cols));
691 }
692 barrier_ptr->arrive_and_wait(barrier_busy_wait);
694 // STEP3: update trailing panel (multi-threaded)
695 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
696 barrier_ptr->arrive_and_wait(barrier_busy_wait);
697 }
698 }));
701template <Backend B, Device D, class T>
702void hemmComputeX(comm::IndexT_MPI reducer_col, matrix::Panel<Coord::Col, T, D>& x,
703 matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& xt,
704 const matrix::SubMatrixView& view, matrix::Matrix<const T, D>& a,
705 matrix::Panel<Coord::Col, const T, D>& w,
706 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& wt,
707 comm::CommunicatorPipeline<comm::CommunicatorType::Row>& mpi_row_chain,
708 comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_chain) {
709 namespace ex = pika::execution::experimental;
711 using pika::execution::thread_priority;
713 const auto dist = a.distribution();
714 const auto rank = dist.rankIndex();
716 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
717 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
718 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
719 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
720 const LocalTileIndex ij_local{i, j};
721 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
716 // Note:
717 // They have to be set to zero, because all tiles are going to be reduced, and some tiles may not get
718 // "initialized" during computation, so they should not contribute with any spurious value to the final
719 // result.
720 matrix::util::set0<B>(thread_priority::high, x);
721 matrix::util::set0<B>(thread_priority::high, xt);
723 const bool is_diagonal_tile = (ij.row() == ij.col());
723 const LocalTileIndex at_offset = view.begin();
725 auto tile_a = splitTile(a.read(ij), view(ij_local));
727 if (is_diagonal_tile) {
728 hemmDiag<B>(thread_priority::high, std::move(tile_a), w.read(ij_local), x.readwrite(ij_local));
729 }
730 else {
731 // Note:
732 // Since it is not a diagonal tile, otherwise it would have been managed in the previous
733 // branch, the second operand is not available in W but it is accessible through the
734 // support panel Wt.
735 // However, since we are still computing the "straight" part, the result can be stored
736 // in the "local" panel X.
737 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, wt.read(ij_local),
738 x.readwrite(ij_local));
725 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
726 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
727 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
728 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
729 const LocalTileIndex ij_local{i, j};
730 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
732 const bool is_diagonal_tile = (ij.row() == ij.col());
734 auto tile_a = splitTile(a.read(ij), view(ij_local));
736 if (is_diagonal_tile) {
737 hemmDiag<B>(thread_priority::high, std::move(tile_a), w.read(ij_local), x.readwrite(ij_local));
738 }
739 else {
740 // Note:
741 // Here we are considering the hermitian part of A, so coordinates have to be "mirrored".
742 // So, first step is identifying the mirrored cell coordinate, i.e. swap row/col, together
743 // with determining whether the new coord lies on an owned row or not.
744 // If yes, the result can be stored in the X, otherwise Xt support panel will be used.
745 // For what concerns the second operand, it can be found for sure in W. In fact, the
746 // multiplication requires matching col(A) == row(W), but since coordinates are mirrored,
747 // we are matching row(A) == row(W), so it is local by construction.
748 const auto owner = dist.template rankGlobalTile<Coord::Row>(ij.col());
750 const LocalTileIndex index_x{dist.template localTileFromGlobalTile<Coord::Row>(ij.col()), 0};
751 const LocalTileIndex index_xt{0, ij_local.col()};
753 auto tile_x = (dist.rankIndex().row() == owner) ? x.readwrite(index_x) : xt.readwrite(index_xt);
755 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, std::move(tile_a), w.read(ij_local),
756 std::move(tile_x));
757 }
758 }
759 }
761 // Note:
762 // At this point, partial results of X and Xt are available in the panels, and they have to be reduced,
763 // both row-wise and col-wise.
764 // The final X result will be available just on Ai panel column.
766 // Note:
767 // The first step in reducing the partial results distributed over X and Xt is to reduce the row
768 // panel Xt col-wise, by collecting all Xt results on the rank which can "mirror" the result on its
769 // rows (i.e. diagonal). So, for each tile of the row panel, select who is the "diagonal" rank that can
770 // mirror and reduce on it.
771 if (mpi_col_chain.size() > 1) {
772 for (const auto& index_xt : xt.iteratorLocal()) {
773 const auto index_k = dist.template globalTileFromLocalTile<Coord::Col>(index_xt.col());
774 const auto rank_owner_row = dist.template rankGlobalTile<Coord::Row>(index_k);
776 if (rank_owner_row == rank.row()) {
777 // Note:
778 // Since it is the owner, it has to perform the "mirroring" of the results from columns to
779 // rows.
780 //
781 // Moreover, it reduces in place because the owner of the diagonal stores the partial result
782 // directly in x (without using xt)
783 const auto i = dist.template localTileFromGlobalTile<Coord::Row>(index_k);
784 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_col_chain.exclusive(), MPI_SUM,
785 x.readwrite({i, 0})));
786 }
787 else {
788 ex::start_detached(comm::schedule_reduce_send(mpi_col_chain.exclusive(), rank_owner_row, MPI_SUM,
789 xt.read(index_xt)));
790 }
791 }
792 }
794 // Note:
795 // At this point partial results are all collected in X (Xt has been embedded in previous step),
796 // so the last step needed is to reduce these last partial results in the final results.
797 // The result is needed just on the column with reflectors.
798 if (mpi_row_chain.size() > 1) {
799 for (const auto& index_x : x.iteratorLocal()) {
800 if (reducer_col == rank.col())
801 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_row_chain.exclusive(), MPI_SUM,
802 x.readwrite(index_x)));
803 else
804 ex::start_detached(comm::schedule_reduce_send(mpi_row_chain.exclusive(), reducer_col, MPI_SUM,
805 x.read(index_x)));
806 }
807 }
810template <Backend B, Device D, class T>
811void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, Matrix<T, D>& a,
812 matrix::Panel<Coord::Col, const T, D>& x,
813 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& vt,
814 matrix::Panel<Coord::Col, const T, D>& v,
815 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& xt) {
816 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
818 using pika::execution::thread_priority;
820 const auto dist = a.distribution();
822 const LocalTileIndex at_start = view.begin();
824 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
825 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
826 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
827 for (SizeType j = at_start.col(); j < limit; ++j) {
828 const LocalTileIndex ij_local{i, j};
829 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
741 // Since it is not a diagonal tile, otherwise it would have been managed in the previous
742 // branch, the second operand is not available in W but it is accessible through the
743 // support panel Wt.
744 // However, since we are still computing the "straight" part, the result can be stored
745 // in the "local" panel X.
746 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, wt.read(ij_local),
747 x.readwrite(ij_local));
749 // Note:
750 // Here we are considering the hermitian part of A, so coordinates have to be "mirrored".
751 // So, first step is identifying the mirrored cell coordinate, i.e. swap row/col, together
752 // with determining whether the new coord lies on an owned row or not.
753 // If yes, the result can be stored in the X, otherwise Xt support panel will be used.
754 // For what concerns the second operand, it can be found for sure in W. In fact, the
755 // multiplication requires matching col(A) == row(W), but since coordinates are mirrored,
756 // we are matching row(A) == row(W), so it is local by construction.
757 const auto owner = dist.template rankGlobalTile<Coord::Row>(ij.col());
759 const LocalTileIndex index_x{dist.template localTileFromGlobalTile<Coord::Row>(ij.col()), 0};
760 const LocalTileIndex index_xt{0, ij_local.col()};
762 auto tile_x = (dist.rankIndex().row() == owner) ? x.readwrite(index_x) : xt.readwrite(index_xt);
764 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, std::move(tile_a), w.read(ij_local),
765 std::move(tile_x));
766 }
767 }
768 }
770 // Note:
771 // At this point, partial results of X and Xt are available in the panels, and they have to be reduced,
772 // both row-wise and col-wise.
773 // The final X result will be available just on Ai panel column.
775 // Note:
776 // The first step in reducing the partial results distributed over X and Xt is to reduce the row
777 // panel Xt col-wise, by collecting all Xt results on the rank which can "mirror" the result on its
778 // rows (i.e. diagonal). So, for each tile of the row panel, select who is the "diagonal" rank that can
779 // mirror and reduce on it.
780 if (mpi_col_chain.size() > 1) {
781 for (const auto& index_xt : xt.iteratorLocal()) {
782 const auto index_k = dist.template globalTileFromLocalTile<Coord::Col>(index_xt.col());
783 const auto rank_owner_row = dist.template rankGlobalTile<Coord::Row>(index_k);
785 if (rank_owner_row == rank.row()) {
786 // Note:
787 // Since it is the owner, it has to perform the "mirroring" of the results from columns to
788 // rows.
789 //
790 // Moreover, it reduces in place because the owner of the diagonal stores the partial result
791 // directly in x (without using xt)
792 const auto i = dist.template localTileFromGlobalTile<Coord::Row>(index_k);
793 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_col_chain.exclusive(), MPI_SUM,
794 x.readwrite({i, 0})));
795 }
796 else {
797 ex::start_detached(comm::schedule_reduce_send(mpi_col_chain.exclusive(), rank_owner_row, MPI_SUM,
798 xt.read(index_xt)));
799 }
800 }
801 }
803 // Note:
804 // At this point partial results are all collected in X (Xt has been embedded in previous step),
805 // so the last step needed is to reduce these last partial results in the final results.
806 // The result is needed just on the column with reflectors.
807 if (mpi_row_chain.size() > 1) {
808 for (const auto& index_x : x.iteratorLocal()) {
809 if (reducer_col == rank.col())
810 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_row_chain.exclusive(), MPI_SUM,
811 x.readwrite(index_x)));
812 else
813 ex::start_detached(comm::schedule_reduce_send(mpi_row_chain.exclusive(), reducer_col, MPI_SUM,
814 x.read(index_x)));
815 }
816 }
819template <Backend B, Device D, class T>
820void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, Matrix<T, D>& a,
821 matrix::Panel<Coord::Col, const T, D>& x,
822 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& vt,
823 matrix::Panel<Coord::Col, const T, D>& v,
824 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& xt) {
825 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
827 using pika::execution::thread_priority;
829 const auto dist = a.distribution();
831 const bool is_diagonal_tile = (ij.row() == ij.col());
831 const LocalTileIndex at_start = view.begin();
833 auto getSubA = [&a, &view, ij_local]() {
834 return splitTile(a.readwrite(ij_local), view(ij_local));
835 };
837 // The first column of the trailing matrix (except for the very first global tile) has to be
838 // updated first, in order to unlock the next iteration as soon as possible.
839 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
841 if (is_diagonal_tile) {
842 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
843 }
844 else {
845 // A -= X . V*
846 her2kOffDiag<B>(priority, x.read(ij_local), vt.read(ij_local), getSubA());
848 // A -= V . X*
849 her2kOffDiag<B>(priority, v.read(ij_local), xt.read(ij_local), getSubA());
850 }
851 }
852 }
// Primary template: declared here without a definition. Specializations for specific
// Backend/Device pairs follow this declaration.
856template <Backend B, Device D, class T>
857struct ComputePanelHelper;
859template <class T>
860struct ComputePanelHelper<Backend::MC, Device::CPU, T> {
861 ComputePanelHelper(const std::size_t, matrix::Distribution) {}
863 void call(Matrix<T, Device::CPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
864 const matrix::SubPanelView& panel_view) {
865 using red2band::local::computePanelReflectors;
866 computePanelReflectors(mat_a, mat_taus, j_sub, panel_view);
867 }
869 template <Device D, class CommSender, class TriggerSender>
870 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
871 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
872 const matrix::SubPanelView& panel_view) {
873 using red2band::distributed::computePanelReflectors;
874 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
875 std::forward<CommSender>(mpi_col_chain_panel), mat_a, mat_taus, j_sub,
876 panel_view);
877 }
833 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
834 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
835 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
836 for (SizeType j = at_start.col(); j < limit; ++j) {
837 const LocalTileIndex ij_local{i, j};
838 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
840 const bool is_diagonal_tile = (ij.row() == ij.col());
842 auto getSubA = [&a, &view, ij_local]() {
843 return splitTile(a.readwrite(ij_local), view(ij_local));
844 };
846 // The first column of the trailing matrix (except for the very first global tile) has to be
847 // updated first, in order to unlock the next iteration as soon as possible.
848 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
850 if (is_diagonal_tile) {
851 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
852 }
853 else {
854 // A -= X . V*
855 her2kOffDiag<B>(priority, x.read(ij_local), vt.read(ij_local), getSubA());
857 // A -= V . X*
858 her2kOffDiag<B>(priority, v.read(ij_local), xt.read(ij_local), getSubA());
859 }
860 }
861 }
// Primary template: declared here without a definition. Specializations for specific
// Backend/Device pairs follow this declaration.
865template <Backend B, Device D, class T>
866struct ComputePanelHelper;
868template <class T>
869struct ComputePanelHelper<Backend::MC, Device::CPU, T> {
870 ComputePanelHelper(const std::size_t, matrix::Distribution) {}
872 void call(Matrix<T, Device::CPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
873 const matrix::SubPanelView& panel_view) {
874 using red2band::local::computePanelReflectors;
875 computePanelReflectors(mat_a, mat_taus, j_sub, panel_view);
876 }
878 template <Device D, class CommSender, class TriggerSender>
879 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
880 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
881 const matrix::SubPanelView& panel_view) {
882 using red2band::distributed::computePanelReflectors;
883 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
884 std::forward<CommSender>(mpi_col_chain_panel), mat_a, mat_taus, j_sub,
885 panel_view);
886 }
880#ifdef DLAF_WITH_GPU
881template <class T>
882struct ComputePanelHelper<Backend::GPU, Device::GPU, T> {
883 ComputePanelHelper(const std::size_t n_workspaces, matrix::Distribution dist_a)
884 : panels_v(n_workspaces, dist_a) {}
886 void call(Matrix<T, Device::GPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
887 const matrix::SubPanelView& panel_view) {
888 using red2band::local::computePanelReflectors;
890 namespace ex = pika::execution::experimental;
892 // Note:
893 // - copy panel_view from GPU to CPU
894 // - computePanelReflectors on CPU (on a matrix like, with just a panel)
895 // - copy back matrix "panel" from CPU to GPU
897 auto& v = panels_v.nextResource();
889#ifdef DLAF_WITH_GPU
890template <class T>
891struct ComputePanelHelper<Backend::GPU, Device::GPU, T> {
892 ComputePanelHelper(const std::size_t n_workspaces, matrix::Distribution dist_a)
893 : panels_v(n_workspaces, dist_a) {}
895 void call(Matrix<T, Device::GPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
896 const matrix::SubPanelView& panel_view) {
897 using red2band::local::computePanelReflectors;
899 copyToCPU(panel_view, mat_a, v);
900 computePanelReflectors(v, mat_taus, j_sub, panel_view);
901 copyFromCPU(panel_view, v, mat_a);
902 }
904 template <Device D, class CommSender, class TriggerSender>
905 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
906 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, SizeType j_sub,
907 const matrix::SubPanelView& panel_view) {
908 auto& v = panels_v.nextResource();
910 // copy to CPU
911 copyToCPU(panel_view, mat_a, v);
899 namespace ex = pika::execution::experimental;
901 // Note:
902 // - copy panel_view from GPU to CPU
903 // - computePanelReflectors on CPU (on a matrix like, with just a panel)
904 // - copy back matrix "panel" from CPU to GPU
906 auto& v = panels_v.nextResource();
908 copyToCPU(panel_view, mat_a, v);
909 computePanelReflectors(v, mat_taus, j_sub, panel_view);
910 copyFromCPU(panel_view, v, mat_a);
911 }
913 // compute on CPU
914 using dlaf::eigensolver::internal::red2band::distributed::computePanelReflectors;
915 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
916 std::forward<CommSender>(mpi_col_chain_panel), v, mat_taus, j_sub,
917 panel_view);
913 template <Device D, class CommSender, class TriggerSender>
914 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
915 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, SizeType j_sub,
916 const matrix::SubPanelView& panel_view) {
917 auto& v = panels_v.nextResource();
919 // copy back to GPU
920 copyFromCPU(panel_view, v, mat_a);
921 }
- -
926 void copyToCPU(const matrix::SubPanelView panel_view, matrix::Matrix<T, Device::GPU>& mat_a,
- -
928 namespace ex = pika::execution::experimental;
- -
931 using dlaf::matrix::internal::CopyBackend_v;
932 using pika::execution::thread_priority;
933 using pika::execution::thread_stacksize;
919 // copy to CPU
920 copyToCPU(panel_view, mat_a, v);
922 // compute on CPU
923 using dlaf::eigensolver::internal::red2band::distributed::computePanelReflectors;
924 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
925 std::forward<CommSender>(mpi_col_chain_panel), v, mat_taus, j_sub,
926 panel_view);
928 // copy back to GPU
929 copyFromCPU(panel_view, v, mat_a);
930 }
935 for (const auto& i : panel_view.iteratorLocal()) {
936 auto spec = panel_view(i);
937 auto tmp_tile = v.readwrite(i);
938 ex::start_detached(
939 ex::when_all(splitTile(mat_a.read(i), spec), splitTile(std::move(tmp_tile), spec)) |
940 matrix::copy(Policy<CopyBackend_v<Device::GPU, Device::CPU>>(thread_priority::high,
941 thread_stacksize::nostack)));
942 }
943 }
945 void copyFromCPU(const matrix::SubPanelView panel_view, matrix::Panel<Coord::Col, T, Device::CPU>& v,
- -
947 namespace ex = pika::execution::experimental;
- -
950 using dlaf::matrix::internal::CopyBackend_v;
951 using pika::execution::thread_priority;
952 using pika::execution::thread_stacksize;
935 void copyToCPU(const matrix::SubPanelView panel_view, matrix::Matrix<T, Device::GPU>& mat_a,
+ +
937 namespace ex = pika::execution::experimental;
+ +
940 using dlaf::matrix::internal::CopyBackend_v;
941 using pika::execution::thread_priority;
942 using pika::execution::thread_stacksize;
944 for (const auto& i : panel_view.iteratorLocal()) {
945 auto spec = panel_view(i);
946 auto tmp_tile = v.readwrite(i);
947 ex::start_detached(
948 ex::when_all(splitTile(mat_a.read(i), spec), splitTile(std::move(tmp_tile), spec)) |
949 matrix::copy(Policy<CopyBackend_v<Device::GPU, Device::CPU>>(thread_priority::high,
950 thread_stacksize::nostack)));
951 }
952 }
954 for (const auto& i : panel_view.iteratorLocal()) {
955 auto spec = panel_view(i);
956 auto tile_a = mat_a.readwrite(i);
957 ex::start_detached(ex::when_all(splitTile(v.read(i), spec), splitTile(std::move(tile_a), spec)) |
958 matrix::copy(Policy<CopyBackend_v<Device::CPU, Device::GPU>>(
959 thread_priority::high, thread_stacksize::nostack)));
960 }
961 }
954 void copyFromCPU(const matrix::SubPanelView panel_view, matrix::Panel<Coord::Col, T, Device::CPU>& v,
+ +
956 namespace ex = pika::execution::experimental;
+ +
959 using dlaf::matrix::internal::CopyBackend_v;
960 using pika::execution::thread_priority;
961 using pika::execution::thread_stacksize;
963 for (const auto& i : panel_view.iteratorLocal()) {
964 auto spec = panel_view(i);
965 auto tile_a = mat_a.readwrite(i);
966 ex::start_detached(ex::when_all(splitTile(v.read(i), spec), splitTile(std::move(tile_a), spec)) |
967 matrix::copy(Policy<CopyBackend_v<Device::CPU, Device::GPU>>(
968 thread_priority::high, thread_stacksize::nostack)));
969 }
970 }
967// Local implementation of reduction to band
968template <Backend B, Device D, class T>
969Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const SizeType band_size) {
- - -
973 using namespace red2band::local;
975 using common::iterate_range2d;
976 using factorization::internal::computeTFactor;
978 using pika::execution::experimental::any_sender;
980 const auto dist_a = mat_a.distribution();
981 const matrix::Distribution dist({mat_a.size().rows(), band_size},
982 {dist_a.blockSize().rows(), band_size});
976// Local implementation of reduction to band
977template <Backend B, Device D, class T>
978Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const SizeType band_size) {
+ + +
982 using namespace red2band::local;
984 // Note:
985 // Reflector of size = 1 is not considered whatever T is (i.e. neither real nor complex)
986 const SizeType nrefls = std::max<SizeType>(0, dist_a.size().rows() - band_size - 1);
988 // Row-vector that is distributed over columns, but exists locally on all rows of the grid
989 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
990 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
991 TileElementSize(mat_a.blockSize().cols(), 1),
992 comm::Size2D(mat_a.commGridSize().cols(), 1),
993 comm::Index2D(mat_a.rankIndex().col(), 0),
994 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
996 if (nrefls == 0)
997 return mat_taus;
999 Matrix<T, Device::CPU> mat_taus_retiled =
1000 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
1002 const SizeType ntiles = (nrefls - 1) / band_size + 1;
1003 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
984 using common::iterate_range2d;
985 using factorization::internal::computeTFactor;
987 using pika::execution::experimental::any_sender;
989 const auto dist_a = mat_a.distribution();
990 const matrix::Distribution dist({mat_a.size().rows(), band_size},
991 {dist_a.blockSize().rows(), band_size});
993 // Note:
994 // Reflector of size = 1 is not considered whatever T is (i.e. neither real nor complex)
995 const SizeType nrefls = std::max<SizeType>(0, dist_a.size().rows() - band_size - 1);
997 // Row-vector that is distributed over columns, but exists locally on all rows of the grid
998 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
999 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
1000 TileElementSize(mat_a.blockSize().cols(), 1),
1001 comm::Size2D(mat_a.commGridSize().cols(), 1),
1002 comm::Index2D(mat_a.rankIndex().col(), 0),
1003 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
1005 const bool is_full_band = (band_size == dist_a.blockSize().cols());
1007 constexpr std::size_t n_workspaces = 2;
1008 common::RoundRobin<Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
1009 common::RoundRobin<Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
1010 common::RoundRobin<Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
1012 // Note:
1013 // Here dist_a is given with full panel size instead of dist with just the part actually needed,
1014 // because the GPU Helper internally exploits Panel data-structure. Indeed, the full size panel is
1015 // needed in order to mimic Matrix with Panel, so it is possible to apply a SubPanelView to it.
1016 //
1017 // It is a bit hacky usage, because SubPanelView is not meant to be used with Panel, but just with
1018 // Matrix. This results in a variable waste of memory, depending on the ratio band_size/nb.
1019 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist_a);
1005 if (nrefls == 0)
1006 return mat_taus;
1008 Matrix<T, Device::CPU> mat_taus_retiled =
1009 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
1011 const SizeType ntiles = (nrefls - 1) / band_size + 1;
1012 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
1014 const bool is_full_band = (band_size == dist_a.blockSize().cols());
1016 constexpr std::size_t n_workspaces = 2;
1017 common::RoundRobin<Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
1018 common::RoundRobin<Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
1019 common::RoundRobin<Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
1021 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
1022 const auto i_sub = j_sub + 1;
1024 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
1026 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
1028 const bool isPanelIncomplete = (nrefls_tile != band_size);
1021 // Note:
1022 // Here dist_a is given with full panel size instead of dist with just the part actually needed,
1023 // because the GPU Helper internally exploits Panel data-structure. Indeed, the full size panel is
1024 // needed in order to mimic Matrix with Panel, so it is possible to apply a SubPanelView to it.
1025 //
1026 // It is a bit hacky usage, because SubPanelView is not meant to be used with Panel, but just with
1027 // Matrix. This results in a variable waste of memory, depending on the ratio band_size/nb.
1028 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist_a);
1030 // Note: if this is running, it must have at least one valid reflector (i.e. with size > 1)
1031 DLAF_ASSERT_HEAVY(nrefls_tile != 0, nrefls_tile);
1030 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
1031 const auto i_sub = j_sub + 1;
1033 // Note: SubPanelView is (at most) band_size wide, but it may contain a smaller number of
1034 // reflectors (i.e. at the end when last reflector size is 1)
1035 const matrix::SubPanelView panel_view(dist_a, ij_offset, band_size);
1033 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
1035 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
1037 Panel<Coord::Col, T, D>& v = panels_v.nextResource();
1038 v.setRangeStart(ij_offset);
1039 if (isPanelIncomplete)
1040 v.setWidth(nrefls_tile);
1037 const bool isPanelIncomplete = (nrefls_tile != band_size);
1039 // Note: if this is running, it must have at least one valid reflector (i.e. with size > 1)
1040 DLAF_ASSERT_HEAVY(nrefls_tile != 0, nrefls_tile);
1042 // PANEL
1043 compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view);
1045 // Note:
1046 // - has_reflector_head tells if this rank owns the first tile of the panel (being local, always true)
1047 // - if !is_full_band it has to force copy as a workaround, otherwise in update matrix it would deadlock
1048 // due to tile shared between panel and trailing matrix
1049 constexpr bool has_reflector_head = true;
1050 setupReflectorPanelV<B, D, T>(has_reflector_head, panel_view, nrefls_tile, v, mat_a, !is_full_band);
1052 const LocalTileIndex t_idx(0, 0);
1053 // TODO used just by the column, maybe we can re-use a panel tile?
1054 // TODO probably the first one in any panel is ok?
1055 Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
1057 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx));
1060 const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size));
1062 // Note: if there is no trailing matrix, the algorithm has finished
1063 if (!at_offset.isIn(mat_a.size()))
1064 break;
1042 // Note: SubPanelView is (at most) band_size wide, but it may contain a smaller number of
1043 // reflectors (i.e. at the end when last reflector size is 1)
1044 const matrix::SubPanelView panel_view(dist_a, ij_offset, band_size);
1046 Panel<Coord::Col, T, D>& v = panels_v.nextResource();
1047 v.setRangeStart(ij_offset);
1048 if (isPanelIncomplete)
1049 v.setWidth(nrefls_tile);
1051 // PANEL
1052 compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view);
1054 // Note:
1055 // - has_reflector_head tells if this rank owns the first tile of the panel (being local, always true)
1056 // - if !is_full_band it has to force copy as a workaround, otherwise in update matrix it would deadlock
1057 // due to tile shared between panel and trailing matrix
1058 constexpr bool has_reflector_head = true;
1059 setupReflectorPanelV<B, D, T>(has_reflector_head, panel_view, nrefls_tile, v, mat_a, !is_full_band);
1061 const LocalTileIndex t_idx(0, 0);
1062 // TODO used just by the column, maybe we can re-use a panel tile?
1063 // TODO probably the first one in any panel is ok?
1064 Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
1066 const matrix::SubMatrixView trailing_matrix_view(dist_a, at_offset);
1066 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx));
1068 // W = V . T
1069 Panel<Coord::Col, T, D>& w = panels_w.nextResource();
1070 w.setRangeStart(at_offset);
1071 if (isPanelIncomplete)
1072 w.setWidth(nrefls_tile);
1074 trmmComputeW<B>(w, v, t.read(t_idx));
1076 // X = At . W
1077 Panel<Coord::Col, T, D>& x = panels_x.nextResource();
1078 x.setRangeStart(at_offset);
1079 if (isPanelIncomplete)
1080 x.setWidth(nrefls_tile);
1082 // Note:
1083 // Since At is hermitian, just the lower part is referenced.
1084 // When the tile is not part of the main diagonal, the same tile has to be used for two computations
1085 // that will contribute to two different rows of X: the ones indexed with row and col.
1086 hemmComputeX<B>(x, trailing_matrix_view, mat_a, w);
1088 // In the next section the next two operations are performed
1089 // A) W2 = W* . X
1090 // B) X -= 1/2 . V . W2
1092 // Note:
1093 // T can be re-used because it is not needed anymore in this step and it has the same shape
1094 Matrix<T, D> w2 = std::move(t);
1096 gemmComputeW2<B>(w2, w, x);
1097 gemmUpdateX<B>(x, w2, v);
1069 const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size));
1071 // Note: if there is no trailing matrix, algorithm has finished
1072 if (!at_offset.isIn(mat_a.size()))
1073 break;
1075 const matrix::SubMatrixView trailing_matrix_view(dist_a, at_offset);
1077 // W = V . T
1078 Panel<Coord::Col, T, D>& w = panels_w.nextResource();
1079 w.setRangeStart(at_offset);
1080 if (isPanelIncomplete)
1081 w.setWidth(nrefls_tile);
1083 trmmComputeW<B>(w, v, t.read(t_idx));
1085 // X = At . W
1086 Panel<Coord::Col, T, D>& x = panels_x.nextResource();
1087 x.setRangeStart(at_offset);
1088 if (isPanelIncomplete)
1089 x.setWidth(nrefls_tile);
1091 // Note:
1092 // Since At is hermitian, just the lower part is referenced.
1093 // When the tile is not part of the main diagonal, the same tile has to be used for two computations
1094 // that will contribute to two different rows of X: the ones indexed with row and col.
1095 hemmComputeX<B>(x, trailing_matrix_view, mat_a, w);
1097 // In the next section the next two operations are performed
1098 // A) W2 = W* . X
1099 // B) X -= 1/2 . V . W2
1101 // At -= X . V* + V . X*
1102 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, v);
1104 x.reset();
1105 w.reset();
1106 v.reset();
1107 }
1109 return mat_taus;
1112// Distributed implementation of reduction to band
1113template <Backend B, Device D, class T>
1114Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& grid, Matrix<T, D>& mat_a,
1115 const SizeType band_size) {
1116 using namespace red2band::distributed;
1101 // Note:
1102 // T can be re-used because it is not needed anymore in this step and it has the same shape
1103 Matrix<T, D> w2 = std::move(t);
1105 gemmComputeW2<B>(w2, w, x);
1106 gemmUpdateX<B>(x, w2, v);
1110 // At -= X . V* + V . X*
1111 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, v);
1113 x.reset();
1114 w.reset();
1115 v.reset();
1116 }
1118 using common::iterate_range2d;
1119 using factorization::internal::computeTFactor;
1118 return mat_taus;
1121 namespace ex = pika::execution::experimental;
1123 // Note:
1124 // This is a temporary workaround.
1125 // See issue https://github.com/eth-cscs/DLA-Future/issues/729
1126 pika::wait();
1128 // This algorithm requires the grid to have at least 2 independent column communicators in the round
1129 // robin array. If there is only one communicator mpi_col_chain and mpi_col_chain_panel will be
1130 // separate pipelines to the same communicator, but since communication is interleaved between the
1131 // pipelines this algorithm will deadlock (separate subpipelines means that all work on the previous
1132 // subpipeline has to complete before the next subpipeline can even start scheduling work).
1133 DLAF_ASSERT(grid.num_pipelines() >= 2, grid.num_pipelines());
1134 auto mpi_row_chain = grid.row_communicator_pipeline();
1135 auto mpi_col_chain = grid.col_communicator_pipeline();
1136 auto mpi_col_chain_panel = grid.col_communicator_pipeline();
1138#ifdef DLAF_WITH_HDF5
1139 static std::atomic<size_t> num_reduction_to_band_calls = 0;
1140 std::stringstream fname;
1141 fname << "reduction_to_band-" << matrix::internal::TypeToString_v<T> << "-"
1142 << std::to_string(num_reduction_to_band_calls) << ".h5";
1143 std::optional<matrix::internal::FileHDF5> file;
1145 if (getTuneParameters().debug_dump_reduction_to_band_data) {
1146 file = matrix::internal::FileHDF5(grid.fullCommunicator(), fname.str());
1147 file->write(mat_a, "/input");
1148 }
1151 const auto& dist = mat_a.distribution();
1152 const comm::Index2D rank = dist.rankIndex();
1121// Distributed implementation of reduction to band
1122template <Backend B, Device D, class T>
1123Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& grid, Matrix<T, D>& mat_a,
1124 const SizeType band_size) {
1125 using namespace red2band::distributed;
1127 using common::iterate_range2d;
1128 using factorization::internal::computeTFactor;
1130 namespace ex = pika::execution::experimental;
1132 // Note:
1133 // This is a temporary workaround.
1134 // See issue https://github.com/eth-cscs/DLA-Future/issues/729
1135 pika::wait();
1137 // This algorithm requires the grid to have at least 2 independent column communicators in the round
1138 // robin array. If there is only one communicator mpi_col_chain and mpi_col_chain_panel will be
1139 // separate pipelines to the same communicator, but since communication is interleaved between the
1140 // pipelines this algorithm will deadlock (separate subpipelines means that all work on the previous
1141 // subpipeline has to complete before the next subpipeline can even start scheduling work).
1142 DLAF_ASSERT(grid.num_pipelines() >= 2, grid.num_pipelines());
1143 auto mpi_row_chain = grid.row_communicator_pipeline();
1144 auto mpi_col_chain = grid.col_communicator_pipeline();
1145 auto mpi_col_chain_panel = grid.col_communicator_pipeline();
1147#ifdef DLAF_WITH_HDF5
1148 static std::atomic<size_t> num_reduction_to_band_calls = 0;
1149 std::stringstream fname;
1150 fname << "reduction_to_band-" << matrix::internal::TypeToString_v<T> << "-"
1151 << std::to_string(num_reduction_to_band_calls) << ".h5";
1152 std::optional<matrix::internal::FileHDF5> file;
1154 // Note:
1155 // Reflector of size = 1 is not considered whatever T is (i.e. neither real nor complex)
1156 const SizeType nrefls = std::max<SizeType>(0, dist.size().rows() - band_size - 1);
1158 // Row-vector that is distributed over columns, but exists locally on all rows of the grid
1159 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
1160 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
1161 TileElementSize(mat_a.blockSize().cols(), 1),
1162 comm::Size2D(mat_a.commGridSize().cols(), 1),
1163 comm::Index2D(mat_a.rankIndex().col(), 0),
1164 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
1166 if (nrefls == 0) {
1167#ifdef DLAF_WITH_HDF5
1168 if (getTuneParameters().debug_dump_reduction_to_band_data) {
1169 file->write(mat_a, "/band");
1170 }
1172 num_reduction_to_band_calls++;
1154 if (getTuneParameters().debug_dump_reduction_to_band_data) {
1155 file = matrix::internal::FileHDF5(grid.fullCommunicator(), fname.str());
1156 file->write(mat_a, "/input");
1157 }
1160 const auto& dist = mat_a.distribution();
1161 const comm::Index2D rank = dist.rankIndex();
1163 // Note:
1164 // Reflector of size = 1 is not considered whatever T is (i.e. neither real nor complex)
1165 const SizeType nrefls = std::max<SizeType>(0, dist.size().rows() - band_size - 1);
1167 // Row-vector that is distributed over columns, but exists locally on all rows of the grid
1168 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
1169 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
1170 TileElementSize(mat_a.blockSize().cols(), 1),
1171 comm::Size2D(mat_a.commGridSize().cols(), 1),
1172 comm::Index2D(mat_a.rankIndex().col(), 0),
1173 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
1175 return mat_taus;
1176 }
1178 Matrix<T, Device::CPU> mat_taus_retiled =
1179 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
1175 if (nrefls == 0) {
1176#ifdef DLAF_WITH_HDF5
1177 if (getTuneParameters().debug_dump_reduction_to_band_data) {
1178 file->write(mat_a, "/band");
1179 }
1181 const SizeType ntiles = (nrefls - 1) / band_size + 1;
1182 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
1181 num_reduction_to_band_calls++;
1184 const bool is_full_band = (band_size == dist.blockSize().cols());
1186 constexpr std::size_t n_workspaces = 2;
1187 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
1188 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_vt(
1189 n_workspaces, dist);
1191 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
1192 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_wt(
1193 n_workspaces, dist);
1184 return mat_taus;
1185 }
1187 Matrix<T, Device::CPU> mat_taus_retiled =
1188 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
1190 const SizeType ntiles = (nrefls - 1) / band_size + 1;
1191 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
1193 const bool is_full_band = (band_size == dist.blockSize().cols());
1195 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
1196 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_xt(
1197 n_workspaces, dist);
1199 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist);
1201 ex::unique_any_sender<> trigger_panel{ex::just()};
1202 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
1203 const SizeType i_sub = j_sub + 1;
1205 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
1206 const GlobalElementIndex at_offset(i_sub * band_size, (j_sub + 1) * band_size);
1195 constexpr std::size_t n_workspaces = 2;
1196 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
1197 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_vt(
1198 n_workspaces, dist);
1200 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
1201 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_wt(
1202 n_workspaces, dist);
1204 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
1205 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_xt(
1206 n_workspaces, dist);
1208 const comm::Index2D rank_v0{
1209 dist.template rankGlobalElement<Coord::Row>(ij_offset.row()),
1210 dist.template rankGlobalElement<Coord::Col>(ij_offset.col()),
1211 };
1213 const bool is_panel_rank_col = rank_v0.col() == rank.col();
1215 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
1208 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist);
1210 ex::unique_any_sender<> trigger_panel{ex::just()};
1211 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
1212 const SizeType i_sub = j_sub + 1;
1214 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
1215 const GlobalElementIndex at_offset(i_sub * band_size, (j_sub + 1) * band_size);
1217 if (nrefls_tile == 0)
1218 break;
1220 auto& v = panels_v.nextResource();
1221 auto& vt = panels_vt.nextResource();
1223 v.setRangeStart(at_offset);
1224 vt.setRangeStart(at_offset);
1217 const comm::Index2D rank_v0{
1218 dist.template rankGlobalElement<Coord::Row>(ij_offset.row()),
1219 dist.template rankGlobalElement<Coord::Col>(ij_offset.col()),
1220 };
1222 const bool is_panel_rank_col = rank_v0.col() == rank.col();
1224 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
1226 v.setWidth(nrefls_tile);
1227 vt.setHeight(nrefls_tile);
1226 if (nrefls_tile == 0)
1227 break;
1229 const LocalTileIndex t_idx(0, 0);
1230 // TODO used just by the column, maybe we can re-use a panel tile?
1231 // TODO or we can keep just the sh_future and allocate just inside if (is_panel_rank_col)
1232 matrix::Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
1234 // PANEL
1235 const matrix::SubPanelView panel_view(dist, ij_offset, band_size);
1237 if (is_panel_rank_col) {
1238 compute_panel_helper.call(std::move(trigger_panel), rank_v0.row(), mpi_col_chain_panel.exclusive(),
1239 mat_a, mat_taus_retiled, j_sub, panel_view);
1241 // Note:
1242 // - has_reflector_head tells if this rank owns the first tile of the panel
1243 // - if !is_full_band it has to force copy as a workaround, otherwise in update matrix it would
1244 // deadlock due to tile shared between panel and trailing matrix
1245 red2band::local::setupReflectorPanelV<B, D, T>(rank.row() == rank_v0.row(), panel_view,
1246 nrefls_tile, v, mat_a, !is_full_band);
1247 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx),
1248 mpi_col_chain);
1249 }
1253 // Note: if there is no trailing matrix, algorithm has finished
1254 if (!at_offset.isIn(mat_a.size()))
1255 break;
1257 const matrix::SubMatrixView trailing_matrix_view(dist, at_offset);
1259 comm::broadcast(rank_v0.col(), v, vt, mpi_row_chain, mpi_col_chain);
1261 // W = V . T
1262 auto& w = panels_w.nextResource();
1263 auto& wt = panels_wt.nextResource();
1265 w.setRangeStart(at_offset);
1266 wt.setRangeStart(at_offset);
1229 auto& v = panels_v.nextResource();
1230 auto& vt = panels_vt.nextResource();
1232 v.setRangeStart(at_offset);
1233 vt.setRangeStart(at_offset);
1235 v.setWidth(nrefls_tile);
1236 vt.setHeight(nrefls_tile);
1238 const LocalTileIndex t_idx(0, 0);
1239 // TODO used just by the column, maybe we can re-use a panel tile?
1240 // TODO or we can keep just the sh_future and allocate just inside if (is_panel_rank_col)
1241 matrix::Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
1243 // PANEL
1244 const matrix::SubPanelView panel_view(dist, ij_offset, band_size);
1246 if (is_panel_rank_col) {
1247 compute_panel_helper.call(std::move(trigger_panel), rank_v0.row(), mpi_col_chain_panel.exclusive(),
1248 mat_a, mat_taus_retiled, j_sub, panel_view);
1250 // Note:
1251 // - has_reflector_head tells if this rank owns the first tile of the panel
1252 // - if !is_full_band it has to force copy as a workaround, otherwise in update matrix it would
1253 // deadlock due to tile shared between panel and trailing matrix
1254 red2band::local::setupReflectorPanelV<B, D, T>(rank.row() == rank_v0.row(), panel_view,
1255 nrefls_tile, v, mat_a, !is_full_band);
1256 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx),
1257 mpi_col_chain);
1258 }
1262 // Note: if there is no trailing matrix, algorithm has finished
1263 if (!at_offset.isIn(mat_a.size()))
1264 break;
1266 const matrix::SubMatrixView trailing_matrix_view(dist, at_offset);
1268 w.setWidth(nrefls_tile);
1269 wt.setHeight(nrefls_tile);
1271 if (is_panel_rank_col)
1272 red2band::local::trmmComputeW<B, D>(w, v, t.read(t_idx));
1268 comm::broadcast(rank_v0.col(), v, vt, mpi_row_chain, mpi_col_chain);
1270 // W = V . T
1271 auto& w = panels_w.nextResource();
1272 auto& wt = panels_wt.nextResource();
1274 comm::broadcast(rank_v0.col(), w, wt, mpi_row_chain, mpi_col_chain);
1276 // X = At . W
1277 auto& x = panels_x.nextResource();
1278 auto& xt = panels_xt.nextResource();
1274 w.setRangeStart(at_offset);
1275 wt.setRangeStart(at_offset);
1277 w.setWidth(nrefls_tile);
1278 wt.setHeight(nrefls_tile);
1280 x.setRangeStart(at_offset);
1281 xt.setRangeStart(at_offset);
1280 if (is_panel_rank_col)
1281 red2band::local::trmmComputeW<B, D>(w, v, t.read(t_idx));
1283 x.setWidth(nrefls_tile);
1284 xt.setHeight(nrefls_tile);
1286 // Note:
1287 // Since At is hermitian, just the lower part is referenced.
1288 // When the tile is not part of the main diagonal, the same tile has to be used for two computations
1289 // that will contribute to two different rows of X: the ones indexed with row and col.
1290 // This is achieved by storing the two results in two different workspaces: X and X_conj respectively.
1291 //
1292 // On exit, x will contain a valid result just on ranks belonging to the column panel.
1293 // For what concerns xt, it is just used as support and it contains junk data on all ranks.
1294 hemmComputeX<B, D>(rank_v0.col(), x, xt, trailing_matrix_view, mat_a, w, wt, mpi_row_chain,
1295 mpi_col_chain);
1297 // In the next section the next two operations are performed
1298 // A) W2 = W* . X
1299 // B) X -= 1/2 . V . W2
1301 // Note:
1302 // Now the intermediate result for X is available on the panel column ranks,
1303 // which have locally all the needed stuff for updating X and finalize the result
1304 if (is_panel_rank_col) {
1305 // Note:
1306 // T can be re-used because it is not needed anymore in this step and it has the same shape
1307 matrix::Matrix<T, D> w2 = std::move(t);
1309 red2band::local::gemmComputeW2<B, D>(w2, w, x);
1310 if (mpi_col_chain.size() > 1) {
1311 ex::start_detached(comm::schedule_all_reduce_in_place(mpi_col_chain.exclusive(), MPI_SUM,
1312 w2.readwrite(LocalTileIndex(0, 0))));
1313 }
1315 red2band::local::gemmUpdateX<B, D>(x, w2, v);
1316 }
1283 comm::broadcast(rank_v0.col(), w, wt, mpi_row_chain, mpi_col_chain);
1285 // X = At . W
1286 auto& x = panels_x.nextResource();
1287 auto& xt = panels_xt.nextResource();
1289 x.setRangeStart(at_offset);
1290 xt.setRangeStart(at_offset);
1292 x.setWidth(nrefls_tile);
1293 xt.setHeight(nrefls_tile);
1295 // Note:
1296 // Since At is hermitian, just the lower part is referenced.
1297 // When the tile is not part of the main diagonal, the same tile has to be used for two computations
1298 // that will contribute to two different rows of X: the ones indexed with row and col.
1299 // This is achieved by storing the two results in two different workspaces: X and X_conj respectively.
1300 //
1301 // On exit, x will contain a valid result just on ranks belonging to the column panel.
1302 // For what concerns xt, it is just used as support and it contains junk data on all ranks.
1303 hemmComputeX<B, D>(rank_v0.col(), x, xt, trailing_matrix_view, mat_a, w, wt, mpi_row_chain,
1304 mpi_col_chain);
1306 // In the next section the next two operations are performed
1307 // A) W2 = W* . X
1308 // B) X -= 1/2 . V . W2
1310 // Note:
1311 // Now the intermediate result for X is available on the panel column ranks,
1312 // which have locally all the needed stuff for updating X and finalize the result
1313 if (is_panel_rank_col) {
1314 // Note:
1315 // T can be re-used because it is not needed anymore in this step and it has the same shape
1316 matrix::Matrix<T, D> w2 = std::move(t);
1318 // Note:
1319 // xt has been used previously as workspace for hemmComputeX, so it has to be reset, because now it
1320 // will be used for accessing the broadcasted version of x
1321 xt.reset();
1322 xt.setRangeStart(at_offset);
1323 xt.setHeight(nrefls_tile);
1325 comm::broadcast(rank_v0.col(), x, xt, mpi_row_chain, mpi_col_chain);
1318 red2band::local::gemmComputeW2<B, D>(w2, w, x);
1319 if (mpi_col_chain.size() > 1) {
1320 ex::start_detached(comm::schedule_all_reduce_in_place(mpi_col_chain.exclusive(), MPI_SUM,
1321 w2.readwrite(LocalTileIndex(0, 0))));
1322 }
1324 red2band::local::gemmUpdateX<B, D>(x, w2, v);
1325 }
1329 // Note:
1330 // This trigger mechanism allows to control when the next iteration of compute panel will start.
1331 //
1332 // * What?
1333 // Compute panel uses MPI blocking communication that might block the only computing thread
1334 // available (since blocking communication are scheduled on normal queues and not on the MPI
1335 // dedicated one).
1336 //
1337 // * How?
1338 // If pika runtime has only 2 threads, one is dedicated to MPI and there is just one for
1339 // computation, that might get blocked by blocking MPI communication, without the chance to do
1340 // anything else. (TODO this might happen even with more reductions happening in parallel)
1341 //
1342 // * Why?
1343 // Panel computation at step i is done on the first column of the trailing matrix computed
1344 // at step i-1.
1345 // The rank owning the top-left tile of the trailing matrix, can update it as soon as it
1346 // receives X[0], which due to the pivot position is also the Xt[0]. Once it can go to the next
1347 // iteration, it ends up stuck in an MPI blocking communication, waiting for the others to join
1348 // before being able to advance.
1349 //
1350 // But at the same time, other ranks in the same column (needed for the next panel update), cannot
1351 // complete the trailing matrix update. Indeed, they are waiting for the pivot rank to communicate
1352 // column-wise Xt[0] (during x -> xt panel transpose broadcast), but it is not going to schedule
1353 // anything because the only normal thread which can do that is stuck in an MPI blocking
1354 // communication that is not going to advance... and so it's a DEADLOCK!
1355 //
1356 // * Solution:
1357 // The idea is to make the next panel depending not only on tiles stored locally, but also to
1358 // ensure that others have received Xt[0], which is needed to advance the computation and let
1359 // others arrive at the next iteration where the pivot will wait for them to complete the MPI
1360 // blocking communication.
1361 //
1362 // * Why is it different between MC and GPU?
1363 // As said above, the problem is related to the communication. But the communication is not said
1364 // to be an atomic operation happening in a single task. It might have to create a copy to
1365 // a buffer more suitable for the communication (e.g. GPU -> CPU if GPU-aware MPI is not
1366 // available).
1367 //
1368 // And in order to not be blocked, it must be ensured that the actual communication task has
1369 // been scheduled.
1370 const SizeType j_tile_current = ij_offset.col() / dist.blockSize().cols();
1371 const SizeType j_tile_next = at_offset.col() / dist.blockSize().cols();
1372 const bool isNextColumnOnSameRank = (j_tile_current == j_tile_next);
1373 const comm::IndexT_MPI rank_next_col =
1374 isNextColumnOnSameRank ? rank_v0.col() : (rank_v0.col() + 1) % dist.commGridSize().cols();
1376 if (rank.col() == rank_next_col) {
1377 const LocalTileIndex at{
1378 dist.template nextLocalTileFromGlobalElement<Coord::Row>(at_offset.row()),
1379 dist.template nextLocalTileFromGlobalElement<Coord::Col>(at_offset.col()),
1380 };
1382 // Note:
1383 // This additional communication of the last tile is a workaround for supporting following trigger
1384 // when b < mb.
1385 // Indeed, if b < mb the last column has (at least) a panel to compute, but differently from
1386 // other columns, broadcast transposed doesn't communicate the last tile, which is an assumption
1387 // needed to make the following trigger work correctly.
1388 const SizeType at_tile_col =
1389 dist.template globalTileFromGlobalElement<Coord::Col>(at_offset.col());
1327 // Note:
1328 // xt has been used previously as workspace for hemmComputeX, so it has to be reset, because now it
1329 // will be used for accessing the broadcasted version of x
1330 xt.reset();
1331 xt.setRangeStart(at_offset);
1332 xt.setHeight(nrefls_tile);
1334 comm::broadcast(rank_v0.col(), x, xt, mpi_row_chain, mpi_col_chain);
1338 // Note:
1339 // This trigger mechanism allows to control when the next iteration of compute panel will start.
1340 //
1341 // * What?
1342 // Compute panel uses MPI blocking communication that might block the only computing thread
1343 // available (since blocking communication are scheduled on normal queues and not on the MPI
1344 // dedicated one).
1345 //
1346 // * How?
1347 // If pika runtime has only 2 threads, one is dedicated to MPI and there is just one for
1348 // computation, that might get blocked by blocking MPI communication, without the chance to do
1349 // anything else. (TODO this might happen even with more reductions happening in parallel)
1350 //
1351 // * Why?
1352 // Panel computation at step i is done on the first column of the trailing matrix computed
1353 // at step i-1.
1354 // The rank owning the top-left tile of the trailing matrix, can update it as soon as it
1355 // receives X[0], which due to the pivot position is also the Xt[0]. Once it can go to the next
1356 // iteration, it ends up stuck in an MPI blocking communication, waiting for the others to join
1357 // before being able to advance.
1358 //
1359 // But at the same time, other ranks in the same column (needed for the next panel update), cannot
1360 // complete the trailing matrix update. Indeed, they are waiting for the pivot rank to communicate
1361 // column-wise Xt[0] (during x -> xt panel transpose broadcast), but it is not going to schedule
1362 // anything because the only normal thread which can do that is stuck in an MPI blocking
1363 // communication that is not going to advance... and so it's a DEADLOCK!
1364 //
1365 // * Solution:
1366 // The idea is to make the next panel depending not only on tiles stored locally, but also to
1367 // ensure that others have received Xt[0], which is needed to advance the computation and let
1368 // others arrive at the next iteration where the pivot will wait for them to complete the MPI
1369 // blocking communication.
1370 //
1371 // * Why is it different between MC and GPU?
1372 // As said above, the problem is related to the communication. But the communication is not said
1373 // to be an atomic operation happening in a single task. It might have to create a copy to
1374 // a buffer more suitable for the communication (e.g. GPU -> CPU if GPU-aware MPI is not
1375 // available).
1376 //
1377 // And in order to not be blocked, it must be ensured that the actual communication task has
1378 // been scheduled.
1379 const SizeType j_tile_current = ij_offset.col() / dist.blockSize().cols();
1380 const SizeType j_tile_next = at_offset.col() / dist.blockSize().cols();
1381 const bool isNextColumnOnSameRank = (j_tile_current == j_tile_next);
1382 const comm::IndexT_MPI rank_next_col =
1383 isNextColumnOnSameRank ? rank_v0.col() : (rank_v0.col() + 1) % dist.commGridSize().cols();
1385 if (rank.col() == rank_next_col) {
1386 const LocalTileIndex at{
1387 dist.template nextLocalTileFromGlobalElement<Coord::Row>(at_offset.row()),
1388 dist.template nextLocalTileFromGlobalElement<Coord::Col>(at_offset.col()),
1389 };
1391 if (at_tile_col == dist.nrTiles().cols() - 1) {
1392 const comm::IndexT_MPI owner = rank_v0.row();
1393 if (rank.row() == owner) {
1394 xt.setTile(at, x.read(at));
1396 if (dist.commGridSize().rows() > 1)
1397 ex::start_detached(comm::schedule_bcast_send(mpi_col_chain.exclusive(), xt.read(at)));
1398 }
1399 else {
1400 if (dist.commGridSize().rows() > 1)
1401 ex::start_detached(comm::schedule_bcast_recv(mpi_col_chain.exclusive(), owner,
1402 xt.readwrite(at)));
1403 }
1404 }
1406 if constexpr (dlaf::comm::CommunicationDevice_v<D> == D) {
1407 // Note:
1408 // if there is no need for additional buffers, we can just wait that xt[0] is ready for
1409 // reading.
1410 if (rank.row() == rank_v0.row()) {
1411 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
1391 // Note:
1392 // This additional communication of the last tile is a workaround for supporting following trigger
1393 // when b < mb.
1394 // Indeed, if b < mb the last column has (at least) a panel to compute, but differently from
1395 // other columns, broadcast transposed doesn't communicate the last tile, which is an assumption
1396 // needed to make the following trigger work correctly.
1397 const SizeType at_tile_col =
1398 dist.template globalTileFromGlobalElement<Coord::Col>(at_offset.col());
1400 if (at_tile_col == dist.nrTiles().cols() - 1) {
1401 const comm::IndexT_MPI owner = rank_v0.row();
1402 if (rank.row() == owner) {
1403 xt.setTile(at, x.read(at));
1405 if (dist.commGridSize().rows() > 1)
1406 ex::start_detached(comm::schedule_bcast_send(mpi_col_chain.exclusive(), xt.read(at)));
1407 }
1408 else {
1409 if (dist.commGridSize().rows() > 1)
1410 ex::start_detached(comm::schedule_bcast_recv(mpi_col_chain.exclusive(), owner,
1411 xt.readwrite(at)));
1412 }
1413 else {
1414 // Note:
1415 // Conservatively ensure that xt[0] needed for updating the first column has been
1416 // received. Just wait for xt because communication of x happens over rows, while the
1417 // pivot rank can just block rank in the same column.
1418 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
1419 }
1420 }
1421 else {
1422 if (rank.row() == rank_v0.row()) {
1413 }
1415 if constexpr (dlaf::comm::CommunicationDevice_v<D> == D) {
1416 // Note:
1417 // if there is no need for additional buffers, we can just wait that xt[0] is ready for
1418 // reading.
1419 if (rank.row() == rank_v0.row()) {
1420 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
1421 }
1422 else {
1423 // Note:
1424 // on the pivot rank, i.e. the one that would quickly go to the next panel and block, from
1425 // implementation we know that xt[0] is set as an external tile pointing to x[0].
1426 // We cannot wait on xt readwrite (because it is an external tile in a panel, which constrains
1427 // it to be just readable), but we can wait on its source x[0]. This has a subtle implication,
1428 // since we will wait not just for the communication to be complete (which is already more
1429 // than what needed), but we will also wait till xt[0] will be released, so after all local
1430 // communication and computation on the first column of the trailing matrix will be completed.
1431 trigger_panel = x.readwrite(at) | ex::drop_value() | ex::ensure_started();
1432 }
1433 else {
1434 // Note:
1435 // Conservatively ensure that xt[0] needed for updating the first column has been
1436 // received. Just wait for xt because communication of x happens over rows, while the
1437 // pivot rank can just block rank in the same column.
1438 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
1439 }
1440 }
1441 }
1443 // At -= X . V* + V . X*
1444 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, vt, v, xt);
1446 xt.reset();
1447 x.reset();
1448 wt.reset();
1449 w.reset();
1450 vt.reset();
1451 v.reset();
1452 }
1454#ifdef DLAF_WITH_HDF5
1455 if (getTuneParameters().debug_dump_reduction_to_band_data) {
1456 file->write(mat_a, "/band");
1457 }
1459 num_reduction_to_band_calls++;
1462 return mat_taus;
1424 // Conservatively ensure that xt[0] needed for updating the first column has been
1425 // received. Just wait for xt because communication of x happens over rows, while the
1426 // pivot rank can just block rank in the same column.
1427 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
1428 }
1429 }
1430 else {
1431 if (rank.row() == rank_v0.row()) {
1432 // Note:
1433 // on the pivot rank, i.e. the one that would quickly go to the next panel and block, from
1434 // implementation we know that xt[0] is set as an external tile pointing to x[0].
1435 // We cannot wait on xt readwrite (because it is an external tile in a panel, which constrains
1436 // it to be just readable), but we can wait on its source x[0]. This has a subtle implication,
1437 // since we will wait not just for the communication to be complete (which is already more
1438 // than what needed), but we will also wait till xt[0] will be released, so after all local
1439 // communication and computation on the first column of the trailing matrix will be completed.
1440 trigger_panel = x.readwrite(at) | ex::drop_value() | ex::ensure_started();
1441 }
1442 else {
1443 // Note:
1444 // Conservatively ensure that xt[0] needed for updating the first column has been
1445 // received. Just wait for xt because communication of x happens over rows, while the
1446 // pivot rank can just block rank in the same column.
1447 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
1448 }
1449 }
1450 }
1452 // At -= X . V* + V . X*
1453 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, vt, v, xt);
1455 xt.reset();
1456 x.reset();
1457 wt.reset();
1458 w.reset();
1459 vt.reset();
1460 v.reset();
1461 }
1463#ifdef DLAF_WITH_HDF5
1464 if (getTuneParameters().debug_dump_reduction_to_band_data) {
1465 file->write(mat_a, "/band");
1466 }
1468 num_reduction_to_band_calls++;
1471 return mat_taus;
void gemm(const blas::Op op_a, const blas::Op op_b, const T alpha, const Tile< const T, D > &a, const Tile< const T, D > &b, const T beta, const Tile< T, D > &c)
@@ -1585,7 +1594,7 @@ - +
Definition panel.h:589
Definition views.h:132
auto iteratorLocal() const noexcept
Return a Range2D that gives access to all local tiles part of the View.
Definition views.h:70
diff --git a/master/get__red2band__panel__nworkers_8h_source.html b/master/get__red2band__panel__nworkers_8h_source.html index 9199dd87e9..bf06866e51 100644 --- a/master/get__red2band__panel__nworkers_8h_source.html +++ b/master/get__red2band__panel__nworkers_8h_source.html @@ -102,7 +102,7 @@
20namespace dlaf::eigensolver::internal {
22inline size_t getReductionToBandPanelNWorkers() noexcept {
22inline size_t get_red2band_panel_nworkers() noexcept {
23 // Note: precautionarily we leave at least 1 thread "free" to do other stuff (if possible)
24 const std::size_t available_workers = pika::resource::get_thread_pool("default").get_os_thread_count();
25 const std::size_t min_workers = 1;