diff --git a/master/eigensolver_2reduction__to__band_2impl_8h_source.html b/master/eigensolver_2reduction__to__band_2impl_8h_source.html
index 6ae066a03f..8a2fa30187 100644
--- a/master/eigensolver_2reduction__to__band_2impl_8h_source.html
+++ b/master/eigensolver_2reduction__to__band_2impl_8h_source.html
@@ -394,1161 +394,1170 @@
311 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
312 }
313
-
314 const std::size_t nthreads = getReductionToBandPanelNWorkers();
-
315 auto s =
-
316 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
-
317 std::vector<common::internal::vector<T>>{}), // w (internally required)
-
318 mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
-
319 ex::when_all_vector(std::move(panel_tiles))) |
-
320 di::continues_on(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
-
321 ex::bulk(nthreads, [nthreads, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr,
-
322 auto& w, auto& taus, auto& tiles) {
-
323 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
-
324 const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
-
325 const std::size_t begin = index * batch_size;
-
326 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
-
327 const SizeType nrefls = taus.size().rows();
-
328
-
329 if (index == 0) {
-
330 w.resize(nthreads);
-
331 }
-
332
-
333 for (SizeType j = 0; j < nrefls; ++j) {
-
334 // STEP1: compute tau and reflector (single-thread)
-
335 if (index == 0) {
-
336 taus({j, 0}) = computeReflector(tiles, j);
-
337 }
-
338
-
339 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
340
-
341 // STEP2a: compute w (multi-threaded)
-
342 const SizeType pt_cols = cols - (j + 1);
-
343 if (pt_cols == 0)
-
344 break;
-
345 const bool has_head = (index == 0);
-
346
-
347 w[index] = common::internal::vector<T>(pt_cols, 0);
-
348 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
-
349 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
350
-
351 // STEP2b: reduce w results (single-threaded)
-
352 if (index == 0)
-
353 dlaf::eigensolver::internal::reduceColumnVectors(w);
+
314 const std::size_t nworkers = [nrtiles = panel_tiles.size()]() {
+
315 const std::size_t min_workers = 1;
+
316 const std::size_t available_workers = get_red2band_panel_nworkers();
+
317 const std::size_t ideal_workers = to_sizet(nrtiles);
+
318 return std::clamp(ideal_workers, min_workers, available_workers);
+
319 }();
+
320 ex::start_detached(
+
321 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nworkers),
+
322 std::vector<common::internal::vector<T>>{}), // w (internally required)
+
323 mat_taus.readwrite(LocalTileIndex(j_sub, 0)),
+
324 ex::when_all_vector(std::move(panel_tiles))) |
+
325 di::continues_on(di::getBackendScheduler<Backend::MC>(thread_priority::high)) |
+
326 ex::bulk(nworkers, [nworkers, cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr,
+
327 auto& w, auto& taus, auto& tiles) {
+
328 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
+
329 const std::size_t batch_size = util::ceilDiv(tiles.size(), nworkers);
+
330 const std::size_t begin = index * batch_size;
+
331 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
+
332 const SizeType nrefls = taus.size().rows();
+
333
+
334 if (index == 0) {
+
335 w.resize(nworkers);
+
336 }
+
337
+
338 for (SizeType j = 0; j < nrefls; ++j) {
+
339 // STEP1: compute tau and reflector (single-thread)
+
340 if (index == 0) {
+
341 taus({j, 0}) = computeReflector(tiles, j);
+
342 }
+
343
+
344 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
345
+
346 // STEP2a: compute w (multi-threaded)
+
347 const SizeType pt_cols = cols - (j + 1);
+
348 if (pt_cols == 0)
+
349 break;
+
350 const bool has_head = (index == 0);
+
351
+
352 w[index] = common::internal::vector<T>(pt_cols, 0);
+
353 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
354 barrier_ptr->arrive_and_wait(barrier_busy_wait);
355
-
356 // STEP3: update trailing panel (multi-threaded)
-
357 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
-
358 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
359 }
-
360 });
-
361 ex::start_detached(std::move(s));
-
362}
-
363
-
364template <Backend B, Device D, class T>
-
365void setupReflectorPanelV(bool has_head, const matrix::SubPanelView& panel_view, const SizeType nrefls,
-
366 matrix::Panel<Coord::Col, T, D>& v, matrix::Matrix<const T, D>& mat_a,
-
367 bool force_copy = false) {
-
368 namespace ex = pika::execution::experimental;
-
369
-
370 using pika::execution::thread_priority;
-
371 using pika::execution::thread_stacksize;
-
372
-
373 // Note:
-
374 // Reflectors are stored in the lower triangular part of the A matrix leading to sharing memory
-
375 // between reflectors and results, which are in the upper triangular part. The problem exists only
-
376 // for the first tile (of the V, i.e. band excluded). Since reflectors will be used in next
-
377 // computations, they should be well-formed, i.e. a unit lower trapezoidal matrix. For this reason,
-
378 // a support tile is used, where just the reflector values are copied, the diagonal is set to 1
-
379 // and the rest is zeroed out.
-
380 auto it_begin = panel_view.iteratorLocal().begin();
-
381 auto it_end = panel_view.iteratorLocal().end();
-
382
-
383 if (has_head) {
-
384 const LocalTileIndex i = *it_begin;
-
385 matrix::SubTileSpec spec = panel_view(i);
+
356 // STEP2b: reduce w results (single-threaded)
+
357 if (index == 0)
+
358 dlaf::eigensolver::internal::reduceColumnVectors(w);
+
359 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
360
+
361 // STEP3: update trailing panel (multi-threaded)
+
362 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
+
363 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
364 }
+
365 }));
+
366}
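Note: the bulk region above is a fork-join scheme: all nworkers execute the same per-reflector loop, the single-threaded sub-steps are guarded by index == 0, and barrier waits separate the phases. The following is a minimal, self-contained analogue of just this synchronization pattern, using std::barrier and std::thread instead of pika's ex::bulk and pika::barrier; the STEP bodies are placeholders, not the DLAF calls.

#include <barrier>
#include <cstddef>
#include <thread>
#include <vector>

// Sketch: nworkers threads cooperate on nrefls sequential steps.
// Worker 0 performs the serial sub-steps; all workers share the parallel ones.
void panel_fork_join(std::size_t nworkers, int nrefls) {
  std::barrier sync(static_cast<std::ptrdiff_t>(nworkers));
  std::vector<std::thread> workers;

  for (std::size_t index = 0; index < nworkers; ++index) {
    workers.emplace_back([&sync, index, nrefls] {
      for (int j = 0; j < nrefls; ++j) {
        if (index == 0) {
          // STEP1 (single-thread): compute tau and reflector for column j.
        }
        sync.arrive_and_wait();  // every worker now sees the reflector

        // STEP2a (multi-threaded): each worker processes its batch of tiles.
        sync.arrive_and_wait();

        if (index == 0) {
          // STEP2b (single-thread): reduce the per-worker partial w vectors.
        }
        sync.arrive_and_wait();

        // STEP3 (multi-threaded): update the trailing panel.
        sync.arrive_and_wait();  // next iteration starts from a consistent state
      }
    });
  }
  for (auto& t : workers)
    t.join();
}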
+
367
+
368template <Backend B, Device D, class T>
+
369void setupReflectorPanelV(bool has_head, const matrix::SubPanelView& panel_view, const SizeType nrefls,
+
370 matrix::Panel<Coord::Col, T, D>& v, matrix::Matrix<const T, D>& mat_a,
+
371 bool force_copy = false) {
+
372 namespace ex = pika::execution::experimental;
+
373
+
374 using pika::execution::thread_priority;
+
375 using pika::execution::thread_stacksize;
+
376
+
377 // Note:
+
378 // Reflectors are stored in the lower triangular part of the A matrix leading to sharing memory
+
379 // between reflectors and results, which are in the upper triangular part. The problem exists only
+
380 // for the first tile (of the V, i.e. band excluded). Since reflectors will be used in next
+
381 // computations, they should be well-formed, i.e. a unit lower trapezoidal matrix. For this reason,
+
382 // a support tile is used, where just the reflector values are copied, the diagonal is set to 1
+
383 // and the rest is zeroed out.
+
384 auto it_begin = panel_view.iteratorLocal().begin();
+
385 auto it_end = panel_view.iteratorLocal().end();
386
-
387 // Note:
-
388 // If the number of reflectors is limited by height (|reflector| > 1), the panel is narrower than
-
389 // the blocksize, leading to just using a part of A (first full nrefls columns)
-
390 spec.size = {spec.size.rows(), std::min(nrefls, spec.size.cols())};
-
391
-
392 // Note:
-
393 // copy + laset is done in two independent tasks, but it could be theoretically merged into a
-
394 // single task doing both.
-
395 const auto p = dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack);
-
396 ex::start_detached(dlaf::internal::whenAllLift(splitTile(mat_a.read(i), spec), v.readwrite(i)) |
-
397 matrix::copy(p));
-
398 ex::start_detached(dlaf::internal::whenAllLift(blas::Uplo::Upper, T(0), T(1), v.readwrite(i)) |
-
399 tile::laset(p));
-
400
-
401 ++it_begin;
-
402 }
-
403
-
404 // The rest of the V panel of reflectors can just point to the values in A, since they are
-
405 // well-formed in place.
-
406 for (auto it = it_begin; it < it_end; ++it) {
-
407 const LocalTileIndex idx = *it;
-
408 const matrix::SubTileSpec& spec = panel_view(idx);
-
409
-
410 // Note: This is a workaround for the deadlock problem with sub-tiles.
-
411 // Without this copy, during matrix update the same tile would get accessed at the same
-
412 // time both in readonly mode (for reflectors) and in readwrite mode (for updating the
-
413 // matrix). This would result in a deadlock, so instead of linking the panel to an external
-
414 // tile, memory provided internally by the panel is used as support. In this way, the two
-
415 // subtiles used in the operation belong to different tiles.
-
416 if (force_copy)
-
417 ex::start_detached(ex::when_all(matrix::splitTile(mat_a.read(idx), spec), v.readwrite(idx)) |
-
418 matrix::copy(dlaf::internal::Policy<B>(thread_priority::high,
-
419 thread_stacksize::nostack)));
-
420 else
-
421 v.setTile(idx, matrix::splitTile(mat_a.read(idx), spec));
-
422 }
-
423}
-
424
-
425template <Backend B, Device D, class T>
-
426void trmmComputeW(matrix::Panel<Coord::Col, T, D>& w, matrix::Panel<Coord::Col, T, D>& v,
-
427 matrix::ReadOnlyTileSender<T, D> tile_t) {
-
428 namespace ex = pika::execution::experimental;
-
429
-
430 using pika::execution::thread_priority;
-
431 using pika::execution::thread_stacksize;
-
432 using namespace blas;
+
387 if (has_head) {
+
388 const LocalTileIndex i = *it_begin;
+
389 matrix::SubTileSpec spec = panel_view(i);
+
390
+
391 // Note:
+
392 // If the number of reflectors is limited by height (|reflector| > 1), the panel is narrower than
+
393 // the blocksize, leading to just using a part of A (first full nrefls columns)
+
394 spec.size = {spec.size.rows(), std::min(nrefls, spec.size.cols())};
+
395
+
396 // Note:
+
397 // copy + laset is done in two independent tasks, but it could be theoretically merged into a
+
398 // single task doing both.
+
399 const auto p = dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack);
+
400 ex::start_detached(dlaf::internal::whenAllLift(splitTile(mat_a.read(i), spec), v.readwrite(i)) |
+
401 matrix::copy(p));
+
402 ex::start_detached(dlaf::internal::whenAllLift(blas::Uplo::Upper, T(0), T(1), v.readwrite(i)) |
+
403 tile::laset(p));
+
404
+
405 ++it_begin;
+
406 }
+
407
+
408 // The rest of the V panel of reflectors can just point to the values in A, since they are
+
409 // well-formed in place.
+
410 for (auto it = it_begin; it < it_end; ++it) {
+
411 const LocalTileIndex idx = *it;
+
412 const matrix::SubTileSpec& spec = panel_view(idx);
+
413
+
414 // Note: This is a workaround for the deadlock problem with sub-tiles.
+
415 // Without this copy, during matrix update the same tile would get accessed at the same
+
416 // time both in readonly mode (for reflectors) and in readwrite mode (for updating the
+
417 // matrix). This would result in a deadlock, so instead of linking the panel to an external
+
418 // tile, memory provided internally by the panel is used as support. In this way, the two
+
419 // subtiles used in the operation belong to different tiles.
+
420 if (force_copy)
+
421 ex::start_detached(ex::when_all(matrix::splitTile(mat_a.read(idx), spec), v.readwrite(idx)) |
+
422 matrix::copy(dlaf::internal::Policy<B>(thread_priority::high,
+
423 thread_stacksize::nostack)));
+
424 else
+
425 v.setTile(idx, matrix::splitTile(mat_a.read(idx), spec));
+
426 }
+
427}
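Note: as the comments above explain, the head tile of V must be made unit lower trapezoidal, because the first tile of A mixes reflectors (lower part) with results (upper part). A freestanding sketch of what the copy + laset(Uplo::Upper, 0, 1) pair produces on a column-major m x n tile, with plain loops standing in for the DLAF tile operations:

#include <vector>

// Sketch: build a well-formed head tile of V from tile a (m x n,
// column-major, leading dimension m): copy A, then set the strictly
// upper triangle to 0 and the diagonal to 1, as laset(Upper, 0, 1) does.
std::vector<double> make_unit_lower_trapezoidal(const std::vector<double>& a,
                                                int m, int n) {
  std::vector<double> v = a;  // the copy task
  for (int j = 0; j < n; ++j)
    for (int i = 0; i <= j && i < m; ++i)
      v[i + j * m] = (i == j) ? 1.0 : 0.0;  // the laset task
  return v;
}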
+
428
+
429template <Backend B, Device D, class T>
+
430void trmmComputeW(matrix::Panel<Coord::Col, T, D>& w, matrix::Panel<Coord::Col, T, D>& v,
+
431 matrix::ReadOnlyTileSender<T, D> tile_t) {
+
432 namespace ex = pika::execution::experimental;
433
-
434 auto it = w.iteratorLocal();
-
435
-
436 for (const auto& index_i : it) {
-
437 ex::start_detached(dlaf::internal::whenAllLift(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
-
438 T(1), tile_t, v.read(index_i), w.readwrite(index_i)) |
-
439 tile::trmm3(dlaf::internal::Policy<B>(thread_priority::high,
-
440 thread_stacksize::nostack)));
-
441 }
-
442
-
443 if (it.empty()) {
-
444 ex::start_detached(std::move(tile_t));
+
434 using pika::execution::thread_priority;
+
435 using pika::execution::thread_stacksize;
+
436 using namespace blas;
+
437
+
438 auto it = w.iteratorLocal();
+
439
+
440 for (const auto& index_i : it) {
+
441 ex::start_detached(dlaf::internal::whenAllLift(Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit,
+
442 T(1), tile_t, v.read(index_i), w.readwrite(index_i)) |
+
443 tile::trmm3(dlaf::internal::Policy<B>(thread_priority::high,
+
444 thread_stacksize::nostack)));
445 }
-
446}
-
447
-
448template <Backend B, Device D, class T>
-
449void gemmUpdateX(matrix::Panel<Coord::Col, T, D>& x, matrix::Matrix<const T, D>& w2,
-
450 matrix::Panel<Coord::Col, const T, D>& v) {
-
451 namespace ex = pika::execution::experimental;
-
452
-
453 using pika::execution::thread_priority;
-
454 using pika::execution::thread_stacksize;
-
455 using namespace blas;
+
446
+
447 if (it.empty()) {
+
448 ex::start_detached(std::move(tile_t));
+
449 }
+
450}
+
451
+
452template <Backend B, Device D, class T>
+
453void gemmUpdateX(matrix::Panel<Coord::Col, T, D>& x, matrix::Matrix<const T, D>& w2,
+
454 matrix::Panel<Coord::Col, const T, D>& v) {
+
455 namespace ex = pika::execution::experimental;
456
-
457 // GEMM X = X - 0.5 . V . W2
-
458 for (const auto& index_i : v.iteratorLocal())
-
459 ex::start_detached(
-
460 dlaf::internal::whenAllLift(Op::NoTrans, Op::NoTrans, T(-0.5), v.read(index_i),
-
461 w2.read(LocalTileIndex(0, 0)), T(1), x.readwrite(index_i)) |
-
462 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
-
463}
-
464
-
465template <Backend B, Device D, class T>
-
466void hemmComputeX(matrix::Panel<Coord::Col, T, D>& x, const matrix::SubMatrixView& view,
-
467 matrix::Matrix<const T, D>& a, matrix::Panel<Coord::Col, const T, D>& w) {
-
468 namespace ex = pika::execution::experimental;
-
469
-
470 using pika::execution::thread_priority;
-
471
-
472 const auto dist = a.distribution();
+
457 using pika::execution::thread_priority;
+
458 using pika::execution::thread_stacksize;
+
459 using namespace blas;
+
460
+
461 // GEMM X = X - 0.5 . V . W2
+
462 for (const auto& index_i : v.iteratorLocal())
+
463 ex::start_detached(
+
464 dlaf::internal::whenAllLift(Op::NoTrans, Op::NoTrans, T(-0.5), v.read(index_i),
+
465 w2.read(LocalTileIndex(0, 0)), T(1), x.readwrite(index_i)) |
+
466 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
+
467}
+
468
+
469template <Backend B, Device D, class T>
+
470void hemmComputeX(matrix::Panel<Coord::Col, T, D>& x, const matrix::SubMatrixView& view,
+
471 matrix::Matrix<const T, D>& a, matrix::Panel<Coord::Col, const T, D>& w) {
+
472 namespace ex = pika::execution::experimental;
473
-
474 // Note:
-
475 // X has to be set to zero, because all its tiles are going to be reduced, and some tiles may not get
-
476 // "initialized" during computation, so they should not contribute with any spurious value to the final
-
477 // result.
-
478 matrix::util::set0<B>(thread_priority::high, x);
-
479
-
480 const LocalTileIndex at_offset = view.begin();
-
481
-
482 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
-
483 const auto limit = i + 1;
-
484 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
-
485 const LocalTileIndex ij{i, j};
-
486
-
487 const bool is_diagonal_tile = (ij.row() == ij.col());
-
488
-
489 const auto& tile_a = splitTile(a.read(ij), view(ij));
+
474 using pika::execution::thread_priority;
+
475
+
476 const auto dist = a.distribution();
+
477
+
478 // Note:
+
479 // X has to be set to zero, because all its tiles are going to be reduced, and some tiles may not get
+
480 // "initialized" during computation, so they should not contribute with any spurious value to the final
+
481 // result.
+
482 matrix::util::set0<B>(thread_priority::high, x);
+
483
+
484 const LocalTileIndex at_offset = view.begin();
+
485
+
486 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
+
487 const auto limit = i + 1;
+
488 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
+
489 const LocalTileIndex ij{i, j};
490
-
491 if (is_diagonal_tile) {
-
492 hemmDiag<B>(thread_priority::high, tile_a, w.read(ij), x.readwrite(ij));
-
493 }
-
494 else {
-
495 // Note:
-
496 // Because A is Hermitian and just the lower part contains the data, for each a(ij) not
-
497 // on the diagonal, two computations are done:
-
498 // - using a(ij) in its position;
-
499 // - using a(ij) in its "transposed" position (applying the ConjTrans to its data)
-
500
-
501 {
-
502 const LocalTileIndex index_x(Coord::Row, ij.row());
-
503 const LocalTileIndex index_w(Coord::Row, ij.col());
-
504 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, w.read(index_w),
-
505 x.readwrite(index_x));
-
506 }
-
507
-
508 {
-
509 const LocalTileIndex index_pretended = transposed(ij);
-
510 const LocalTileIndex index_x(Coord::Row, index_pretended.row());
-
511 const LocalTileIndex index_w(Coord::Row, index_pretended.col());
-
512 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, tile_a, w.read(index_w),
-
513 x.readwrite(index_x));
-
514 }
-
515 }
-
516 }
-
517 }
-
518}
-
519
-
520template <Backend B, Device D, class T>
-
521void gemmComputeW2(matrix::Matrix<T, D>& w2, matrix::Panel<Coord::Col, const T, D>& w,
-
522 matrix::Panel<Coord::Col, const T, D>& x) {
-
523 using pika::execution::thread_priority;
-
524 using pika::execution::thread_stacksize;
-
525
-
526 namespace ex = pika::execution::experimental;
-
527
-
528 // Note:
-
529 // Not all ranks in the column always hold at least a tile in the panel Ai, but all ranks in
-
530 // the column are going to participate in the reduce. For them, it is important to set the
-
531 // partial result W2 to zero.
-
532 ex::start_detached(w2.readwrite(LocalTileIndex(0, 0)) |
-
533 tile::set0(dlaf::internal::Policy<B>(thread_priority::high,
-
534 thread_stacksize::nostack)));
-
535
-
536 using namespace blas;
-
537 // GEMM W2 = W* . X
-
538 for (const auto& index_tile : w.iteratorLocal())
-
539 ex::start_detached(
-
540 dlaf::internal::whenAllLift(Op::ConjTrans, Op::NoTrans, T(1), w.read(index_tile),
-
541 x.read(index_tile), T(1), w2.readwrite(LocalTileIndex(0, 0))) |
-
542 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
-
543}
-
544
-
545template <Backend B, Device D, class T>
-
546void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, matrix::Matrix<T, D>& a,
-
547 matrix::Panel<Coord::Col, const T, D>& x,
-
548 matrix::Panel<Coord::Col, const T, D>& v) {
-
549 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
-
550
-
551 using pika::execution::thread_priority;
-
552
-
553 const auto dist = a.distribution();
+
491 const bool is_diagonal_tile = (ij.row() == ij.col());
+
492
+
493 const auto& tile_a = splitTile(a.read(ij), view(ij));
+
494
+
495 if (is_diagonal_tile) {
+
496 hemmDiag<B>(thread_priority::high, tile_a, w.read(ij), x.readwrite(ij));
+
497 }
+
498 else {
+
499 // Note:
+
500 // Because A is Hermitian and just the lower part contains the data, for each a(ij) not
+
501 // on the diagonal, two computations are done:
+
502 // - using a(ij) in its position;
+
503 // - using a(ij) in its "transposed" position (applying the ConjTrans to its data)
+
504
+
505 {
+
506 const LocalTileIndex index_x(Coord::Row, ij.row());
+
507 const LocalTileIndex index_w(Coord::Row, ij.col());
+
508 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, w.read(index_w),
+
509 x.readwrite(index_x));
+
510 }
+
511
+
512 {
+
513 const LocalTileIndex index_pretended = transposed(ij);
+
514 const LocalTileIndex index_x(Coord::Row, index_pretended.row());
+
515 const LocalTileIndex index_w(Coord::Row, index_pretended.col());
+
516 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, tile_a, w.read(index_w),
+
517 x.readwrite(index_x));
+
518 }
+
519 }
+
520 }
+
521 }
+
522}
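Note: the loop above realizes X = A . W with A Hermitian and only its lower triangle stored: each stored off-diagonal tile contributes once as itself and once conjugate-transposed, which is what the hemmDiag/hemmOffDiag(NoTrans/ConjTrans) calls do tile-wise. A scalar-level sketch of the same idea on one column-major matrix (hemm_like is an illustrative name, not a DLAF function):

#include <complex>
#include <cstddef>
#include <vector>

using Z = std::complex<double>;

// Sketch: y = A * w where A (n x n, column-major) is Hermitian and only
// entries with i >= j are valid. Every stored entry contributes twice.
std::vector<Z> hemm_like(const std::vector<Z>& a, const std::vector<Z>& w, int n) {
  std::vector<Z> y(static_cast<std::size_t>(n), Z(0));  // must start from zero
  for (int j = 0; j < n; ++j) {
    y[j] += a[j + j * n].real() * w[j];  // diagonal (real for Hermitian A)
    for (int i = j + 1; i < n; ++i) {
      y[i] += a[i + j * n] * w[j];             // a(i,j) used in its position
      y[j] += std::conj(a[i + j * n]) * w[i];  // a(i,j) used as a(j,i)
    }
  }
  return y;
}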
+
523
+
524template <Backend B, Device D, class T>
+
525void gemmComputeW2(matrix::Matrix<T, D>& w2, matrix::Panel<Coord::Col, const T, D>& w,
+
526 matrix::Panel<Coord::Col, const T, D>& x) {
+
527 using pika::execution::thread_priority;
+
528 using pika::execution::thread_stacksize;
+
529
+
530 namespace ex = pika::execution::experimental;
+
531
+
532 // Note:
+
533 // Not all ranks in the column always hold at least a tile in the panel Ai, but all ranks in
+
534 // the column are going to participate in the reduce. For them, it is important to set the
+
535 // partial result W2 to zero.
+
536 ex::start_detached(w2.readwrite(LocalTileIndex(0, 0)) |
+
537 tile::set0(dlaf::internal::Policy<B>(thread_priority::high,
+
538 thread_stacksize::nostack)));
+
539
+
540 using namespace blas;
+
541 // GEMM W2 = W* . X
+
542 for (const auto& index_tile : w.iteratorLocal())
+
543 ex::start_detached(
+
544 dlaf::internal::whenAllLift(Op::ConjTrans, Op::NoTrans, T(1), w.read(index_tile),
+
545 x.read(index_tile), T(1), w2.readwrite(LocalTileIndex(0, 0))) |
+
546 tile::gemm(dlaf::internal::Policy<B>(thread_priority::high, thread_stacksize::nostack)));
+
547}
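Note: the zero-initialization above matters because every rank in the column joins the later reduction, even ranks holding no tile of the panel. A freestanding sketch of the set0 + accumulate pattern (plain loops instead of tile::set0 and tile::gemm; all names are illustrative):

#include <complex>
#include <cstddef>
#include <vector>

using Z = std::complex<double>;

// Sketch: W2 = sum over local tiles of W_t^H * X_t. W2 (k x k) is zeroed
// first, so a rank whose tile range is empty still contributes a
// well-defined zero to the subsequent reduce. Tiles are m x k, column-major.
void accumulate_w2(std::vector<Z>& w2, const std::vector<std::vector<Z>>& w_tiles,
                   const std::vector<std::vector<Z>>& x_tiles, int m, int k) {
  w2.assign(static_cast<std::size_t>(k) * k, Z(0));  // the tile::set0 step
  for (std::size_t t = 0; t < w_tiles.size(); ++t)   // possibly an empty range
    for (int j = 0; j < k; ++j)
      for (int i = 0; i < k; ++i)
        for (int l = 0; l < m; ++l)  // (W^H X)(i,j) += conj(W(l,i)) * X(l,j)
          w2[i + j * k] += std::conj(w_tiles[t][l + i * m]) * x_tiles[t][l + j * m];
}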
+
548
+
549template <Backend B, Device D, class T>
+
550void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, matrix::Matrix<T, D>& a,
+
551 matrix::Panel<Coord::Col, const T, D>& x,
+
552 matrix::Panel<Coord::Col, const T, D>& v) {
+
553 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
554
-
555 const LocalTileIndex at_start = view.begin();
+
555 using pika::execution::thread_priority;
556
-
557 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
-
558 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
-
559 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
-
560 for (SizeType j = at_start.col(); j < limit; ++j) {
-
561 const LocalTileIndex ij_local{i, j};
-
562 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
-
563
-
564 const bool is_diagonal_tile = (ij.row() == ij.col());
-
565
-
566 auto getSubA = [&a, &view, ij_local]() {
-
567 return splitTile(a.readwrite(ij_local), view(ij_local));
-
568 };
+
557 const auto dist = a.distribution();
+
558
+
559 const LocalTileIndex at_start = view.begin();
+
560
+
561 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
+
562 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
+
563 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
+
564 for (SizeType j = at_start.col(); j < limit; ++j) {
+
565 const LocalTileIndex ij_local{i, j};
+
566 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
567
+
568 const bool is_diagonal_tile = (ij.row() == ij.col());
569
-
570 // The first column of the trailing matrix (except for the very first global tile) has to be
-
571 // updated first, in order to unlock the next iteration as soon as possible.
-
572 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
+
570 auto getSubA = [&a, &view, ij_local]() {
+
571 return splitTile(a.readwrite(ij_local), view(ij_local));
+
572 };
573
-
574 if (is_diagonal_tile) {
-
575 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
-
576 }
-
577 else {
-
578 // A -= X . V*
-
579 her2kOffDiag<B>(priority, x.read(ij_local), v.read(transposed(ij_local)), getSubA());
-
580
-
581 // A -= V . X*
-
582 her2kOffDiag<B>(priority, v.read(ij_local), x.read(transposed(ij_local)), getSubA());
-
583 }
-
584 }
-
585 }
-
586}
-
587
-
588}
-
589
-
590namespace distributed {
-
591template <Device D, class T>
-
592T computeReflector(const bool has_head, comm::Communicator& communicator,
-
593 const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
-
594 std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
-
595
-
596 // Note:
-
597 // This is an optimization for grouping two separate low bandwidth communications, respectively
-
598 // bcast(x0) and reduce(norm), where the latency was degrading performance.
-
599 //
-
600 // In particular this allReduce makes it possible to:
-
601 // - bcast x0, since on all ranks it is 0 and just the root rank has the real value;
-
602 // - allReduce squares for the norm computation.
+
574 // The first column of the trailing matrix (except for the very first global tile) has to be
+
575 // updated first, in order to unlock the next iteration as soon as possible.
+
576 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
+
577
+
578 if (is_diagonal_tile) {
+
579 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
+
580 }
+
581 else {
+
582 // A -= X . V*
+
583 her2kOffDiag<B>(priority, x.read(ij_local), v.read(transposed(ij_local)), getSubA());
+
584
+
585 // A -= V . X*
+
586 her2kOffDiag<B>(priority, v.read(ij_local), x.read(transposed(ij_local)), getSubA());
+
587 }
+
588 }
+
589 }
+
590}
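Note: tile-by-tile, the function above applies the Hermitian rank-2k update A -= X . V^H + V . X^H to the stored lower triangle (her2k with alpha = -1, hence the signedness static_assert). A scalar-level sketch of that update on one column-major matrix, assuming n x n A and n x k V, X:

#include <complex>
#include <vector>

using Z = std::complex<double>;

// Sketch: A -= X * V^H + V * X^H restricted to the lower triangle,
// i.e. zher2k semantics with alpha = -1 and beta = 1.
void her2k_lower(std::vector<Z>& a, const std::vector<Z>& v,
                 const std::vector<Z>& x, int n, int k) {
  for (int j = 0; j < n; ++j)
    for (int i = j; i < n; ++i)  // only the stored (lower) part is touched
      for (int l = 0; l < k; ++l)
        a[i + j * n] -= x[i + l * n] * std::conj(v[j + l * n]) +
                        v[i + l * n] * std::conj(x[j + l * n]);
}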
+
591
+
592}
+
593
+
594namespace distributed {
+
595template <Device D, class T>
+
596T computeReflector(const bool has_head, comm::Communicator& communicator,
+
597 const std::vector<matrix::Tile<T, D>>& panel, SizeType j) {
+
598 std::array<T, 2> x0_and_squares = computeX0AndSquares(has_head, panel, j);
+
599
+
600 // Note:
+
601 // This is an optimization for grouping two separate low bandwidth communications, respectively
+
602 // bcast(x0) and reduce(norm), where the latency was degrading performance.
603 //
-
604 // Moreover, by all-reducing squares and broadcasting x0, all ranks have all the information to
-
605 // locally update the reflectors (the section they own). This is more efficient than computing params
-
606 // (e.g. norm, y, tau) just on the root rank and then having to broadcast them (i.e. additional
-
607 // communication).
-
608 comm::sync::allReduceInPlace(communicator, MPI_SUM,
-
609 common::make_data(x0_and_squares.data(),
-
610 to_SizeType(x0_and_squares.size())));
-
611
-
612 auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
-
613
-
614 return tau;
-
615}
-
616
-
617template <class MatrixLikeA, class MatrixLikeTaus, class TriggerSender, class CommSender>
-
618void computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
-
619 CommSender&& mpi_col_chain_panel, MatrixLikeA& mat_a,
-
620 MatrixLikeTaus& mat_taus, SizeType j_sub,
-
621 const matrix::SubPanelView& panel_view) {
-
622 static Device constexpr D = MatrixLikeA::device;
-
623 using T = typename MatrixLikeA::ElementType;
-
624 namespace ex = pika::execution::experimental;
-
625 namespace di = dlaf::internal;
-
626
-
627 std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
-
628 panel_tiles.reserve(to_sizet(std::distance(panel_view.iteratorLocal().begin(),
-
629 panel_view.iteratorLocal().end())));
-
630 for (const auto& i : panel_view.iteratorLocal()) {
-
631 const matrix::SubTileSpec& spec = panel_view(i);
-
632 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
-
633 }
-
634
-
635 const std::size_t nthreads = getReductionToBandPanelNWorkers();
-
636 auto s =
-
637 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nthreads),
-
638 std::vector<common::internal::vector<T>>{}), // w (internally required)
-
639 mat_taus.readwrite(GlobalTileIndex(j_sub, 0)),
-
640 ex::when_all_vector(std::move(panel_tiles)),
-
641 std::forward<CommSender>(mpi_col_chain_panel), std::forward<TriggerSender>(trigger)) |
-
642 di::continues_on(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
-
643 ex::bulk(nthreads, [nthreads, rank_v0,
-
644 cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr, auto& w,
-
645 auto& taus, auto& tiles, auto&& pcomm) {
-
646 const bool rankHasHead = rank_v0 == pcomm.get().rank();
-
647
-
648 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
-
649 const std::size_t batch_size = util::ceilDiv(tiles.size(), nthreads);
-
650 const std::size_t begin = index * batch_size;
-
651 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
-
652 const SizeType nrefls = taus.size().rows();
-
653
-
654 if (index == 0) {
-
655 w.resize(nthreads);
-
656 }
+
604 // In particular this allReduce makes it possible to:
+
605 // - bcast x0, since on all ranks it is 0 and just the root rank has the real value;
+
606 // - allReduce squares for the norm computation.
+
607 //
+
608 // Moreover, by all-reducing squares and broadcasting x0, all ranks have all the information to
+
609 // locally update the reflectors (the section they own). This is more efficient than computing params
+
610 // (e.g. norm, y, tau) just on the root rank and then having to broadcast them (i.e. additional
+
611 // communication).
+
612 comm::sync::allReduceInPlace(communicator, MPI_SUM,
+
613 common::make_data(x0_and_squares.data(),
+
614 to_SizeType(x0_and_squares.size())));
+
615
+
616 auto tau = computeReflectorAndTau(has_head, panel, j, std::move(x0_and_squares));
+
617
+
618 return tau;
+
619}
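Note: the trick described above, packing x0 and the partial sum of squares into one message, works because x0 is zero on every rank except the head rank, so an MPI_SUM allreduce simultaneously broadcasts x0 and reduces the squares. A minimal MPI sketch with placeholder scalars instead of the panel data:

#include <array>
#include <mpi.h>

// Sketch: one allreduce serves as bcast(x0) + allreduce(sum of squares).
std::array<double, 2> reduce_x0_and_squares(MPI_Comm comm, bool has_head,
                                            double x0_local, double squares_local) {
  std::array<double, 2> buf{has_head ? x0_local : 0.0, squares_local};
  // buf[0]: x0, nonzero only on the head rank, so the sum reproduces it everywhere.
  // buf[1]: partial sum of squares, a genuine reduction.
  MPI_Allreduce(MPI_IN_PLACE, buf.data(), 2, MPI_DOUBLE, MPI_SUM, comm);
  return buf;
}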
+
620
+
621template <class MatrixLikeA, class MatrixLikeTaus, class TriggerSender, class CommSender>
+
622void computePanelReflectors(TriggerSender&& trigger, comm::IndexT_MPI rank_v0,
+
623 CommSender&& mpi_col_chain_panel, MatrixLikeA& mat_a,
+
624 MatrixLikeTaus& mat_taus, SizeType j_sub,
+
625 const matrix::SubPanelView& panel_view) {
+
626 static Device constexpr D = MatrixLikeA::device;
+
627 using T = typename MatrixLikeA::ElementType;
+
628 namespace ex = pika::execution::experimental;
+
629 namespace di = dlaf::internal;
+
630
+
631 std::vector<matrix::ReadWriteTileSender<T, D>> panel_tiles;
+
632 panel_tiles.reserve(to_sizet(std::distance(panel_view.iteratorLocal().begin(),
+
633 panel_view.iteratorLocal().end())));
+
634 for (const auto& i : panel_view.iteratorLocal()) {
+
635 const matrix::SubTileSpec& spec = panel_view(i);
+
636 panel_tiles.emplace_back(matrix::splitTile(mat_a.readwrite(i), spec));
+
637 }
+
638
+
639 const std::size_t nworkers = [nrtiles = panel_tiles.size()]() {
+
640 const std::size_t min_workers = 1;
+
641 const std::size_t available_workers = get_red2band_panel_nworkers();
+
642 const std::size_t ideal_workers = util::ceilDiv(to_sizet(nrtiles), to_sizet(2));
+
643 return std::clamp(ideal_workers, min_workers, available_workers);
+
644 }();
+
645
+
646 ex::start_detached(
+
647 ex::when_all(ex::just(std::make_unique<pika::barrier<>>(nworkers),
+
648 std::vector<common::internal::vector<T>>{}), // w (internally required)
+
649 mat_taus.readwrite(GlobalTileIndex(j_sub, 0)),
+
650 ex::when_all_vector(std::move(panel_tiles)),
+
651 std::forward<CommSender>(mpi_col_chain_panel), std::forward<TriggerSender>(trigger)) |
+
652 di::continues_on(di::getBackendScheduler<Backend::MC>(pika::execution::thread_priority::high)) |
+
653 ex::bulk(nworkers, [nworkers, rank_v0,
+
654 cols = panel_view.cols()](const std::size_t index, auto& barrier_ptr, auto& w,
+
655 auto& taus, auto& tiles, auto&& pcomm) {
+
656 const bool rankHasHead = rank_v0 == pcomm.get().rank();
657
-
658 for (SizeType j = 0; j < nrefls; ++j) {
-
659 // STEP1: compute tau and reflector (single-thread)
-
660 if (index == 0) {
-
661 const bool has_head = rankHasHead;
-
662 taus({j, 0}) = computeReflector(has_head, pcomm.get(), tiles, j);
-
663 }
-
664 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
665
-
666 // STEP2a: compute w (multi-threaded)
-
667 const SizeType pt_cols = cols - (j + 1);
-
668 if (pt_cols == 0)
-
669 break;
-
670
-
671 const bool has_head = rankHasHead && (index == 0);
-
672
-
673 w[index] = common::internal::vector<T>(pt_cols, 0);
-
674 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
-
675 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
676
-
677 // STEP2b: reduce w results (single-threaded)
-
678 if (index == 0) {
-
679 dlaf::eigensolver::internal::reduceColumnVectors(w);
-
680 comm::sync::allReduceInPlace(pcomm.get(), MPI_SUM, common::make_data(w[0].data(), pt_cols));
-
681 }
-
682 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
683
-
684 // STEP3: update trailing panel (multi-threaded)
-
685 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
-
686 barrier_ptr->arrive_and_wait(barrier_busy_wait);
-
687 }
-
688 });
-
689 ex::start_detached(std::move(s));
-
690}
-
691
-
692template <Backend B, Device D, class T>
-
693void hemmComputeX(comm::IndexT_MPI reducer_col, matrix::Panel<Coord::Col, T, D>& x,
-
694 matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& xt,
-
695 const matrix::SubMatrixView& view, matrix::Matrix<const T, D>& a,
-
696 matrix::Panel<Coord::Col, const T, D>& w,
-
697 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& wt,
-
698 comm::CommunicatorPipeline<comm::CommunicatorType::Row>& mpi_row_chain,
-
699 comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_chain) {
-
700 namespace ex = pika::execution::experimental;
-
701
-
702 using pika::execution::thread_priority;
-
703
-
704 const auto dist = a.distribution();
-
705 const auto rank = dist.rankIndex();
-
706
-
707 // Note:
-
708 // They have to be set to zero, because all tiles are going to be reduced, and some tiles may not get
-
709 // "initialized" during computation, so they should not contribute with any spurious value to the final
-
710 // result.
-
711 matrix::util::set0<B>(thread_priority::high, x);
-
712 matrix::util::set0<B>(thread_priority::high, xt);
-
713
-
714 const LocalTileIndex at_offset = view.begin();
+
658 const auto barrier_busy_wait = getReductionToBandBarrierBusyWait();
+
659 const std::size_t batch_size = util::ceilDiv(tiles.size(), nworkers);
+
660 const std::size_t begin = index * batch_size;
+
661 const std::size_t end = std::min(index * batch_size + batch_size, tiles.size());
+
662 const SizeType nrefls = taus.size().rows();
+
663
+
664 if (index == 0) {
+
665 w.resize(nworkers);
+
666 }
+
667
+
668 for (SizeType j = 0; j < nrefls; ++j) {
+
669 // STEP1: compute tau and reflector (single-thread)
+
670 if (index == 0) {
+
671 const bool has_head = rankHasHead;
+
672 taus({j, 0}) = computeReflector(has_head, pcomm.get(), tiles, j);
+
673 }
+
674 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
675
+
676 // STEP2a: compute w (multi-threaded)
+
677 const SizeType pt_cols = cols - (j + 1);
+
678 if (pt_cols == 0)
+
679 break;
+
680
+
681 const bool has_head = rankHasHead && (index == 0);
+
682
+
683 w[index] = common::internal::vector<T>(pt_cols, 0);
+
684 computeWTrailingPanel(has_head, tiles, w[index], j, pt_cols, begin, end);
+
685 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
686
+
687 // STEP2b: reduce w results (single-threaded)
+
688 if (index == 0) {
+
689 dlaf::eigensolver::internal::reduceColumnVectors(w);
+
690 comm::sync::allReduceInPlace(pcomm.get(), MPI_SUM, common::make_data(w[0].data(), pt_cols));
+
691 }
+
692 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
693
+
694 // STEP3: update trailing panel (multi-threaded)
+
695 updateTrailingPanel(has_head, tiles, j, w[0], taus({j, 0}), begin, end);
+
696 barrier_ptr->arrive_and_wait(barrier_busy_wait);
+
697 }
+
698 }));
+
699}
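Note: both versions now derive the worker count by clamping an "ideal" value (one worker per tile in the local path, one per two tiles here) between 1 and the configured pool size. The policy in isolation, as a small sketch (the available-workers argument stands in for get_red2band_panel_nworkers() and is assumed to be at least 1, since std::clamp requires lo <= hi):

#include <algorithm>
#include <cstddef>

// Sketch: choose how many bulk workers to use for a panel of nrtiles tiles.
// tiles_per_worker = 1 matches the local path above, 2 the distributed one.
std::size_t pick_nworkers(std::size_t nrtiles, std::size_t available_workers,
                          std::size_t tiles_per_worker) {
  const std::size_t min_workers = 1;
  const std::size_t ideal_workers =
      (nrtiles + tiles_per_worker - 1) / tiles_per_worker;  // ceilDiv
  return std::clamp(ideal_workers, min_workers, available_workers);
}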
+
700
+
701template <Backend B, Device D, class T>
+
702void hemmComputeX(comm::IndexT_MPI reducer_col, matrix::Panel<Coord::Col, T, D>& x,
+
703 matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>& xt,
+
704 const matrix::SubMatrixView& view, matrix::Matrix<const T, D>& a,
+
705 matrix::Panel<Coord::Col, const T, D>& w,
+
706 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& wt,
+
707 comm::CommunicatorPipeline<comm::CommunicatorType::Row>& mpi_row_chain,
+
708 comm::CommunicatorPipeline<comm::CommunicatorType::Col>& mpi_col_chain) {
+
709 namespace ex = pika::execution::experimental;
+
710
+
711 using pika::execution::thread_priority;
+
712
+
713 const auto dist = a.distribution();
+
714 const auto rank = dist.rankIndex();
715
-
716 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
-
717 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
-
718 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
-
719 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
-
720 const LocalTileIndex ij_local{i, j};
-
721 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
716 // Note:
+
717 // They have to be set to zero, because all tiles are going to be reduced, and some tiles may not get
+
718 // "initialized" during computation, so they should not contribute with any spurious value to the final
+
719 // result.
+
720 matrix::util::set0<B>(thread_priority::high, x);
+
721 matrix::util::set0<B>(thread_priority::high, xt);
722
-
723 const bool is_diagonal_tile = (ij.row() == ij.col());
+
723 const LocalTileIndex at_offset = view.begin();
724
-
725 auto tile_a = splitTile(a.read(ij), view(ij_local));
-
726
-
727 if (is_diagonal_tile) {
-
728 hemmDiag<B>(thread_priority::high, std::move(tile_a), w.read(ij_local), x.readwrite(ij_local));
-
729 }
-
730 else {
-
731 // Note:
-
732 // Since it is not a diagonal tile (otherwise it would have been managed in the previous
-
733 // branch), the second operand is not available in W but it is accessible through the
-
734 // support panel Wt.
-
735 // However, since we are still computing the "straight" part, the result can be stored
-
736 // in the "local" panel X.
-
737 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, wt.read(ij_local),
-
738 x.readwrite(ij_local));
-
739
+
725 for (SizeType i = at_offset.row(); i < dist.localNrTiles().rows(); ++i) {
+
726 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
+
727 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
+
728 for (SizeType j = limit - 1; j >= at_offset.col(); --j) {
+
729 const LocalTileIndex ij_local{i, j};
+
730 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
731
+
732 const bool is_diagonal_tile = (ij.row() == ij.col());
+
733
+
734 auto tile_a = splitTile(a.read(ij), view(ij_local));
+
735
+
736 if (is_diagonal_tile) {
+
737 hemmDiag<B>(thread_priority::high, std::move(tile_a), w.read(ij_local), x.readwrite(ij_local));
+
738 }
+
739 else {
740 // Note:
-
741 // Here we are considering the Hermitian part of A, so coordinates have to be "mirrored".
-
742 // So, the first step is identifying the mirrored cell coordinate, i.e. swapping row/col, together
-
743 // with realizing if the new coord lies on an owned row or not.
-
744 // If yes, the result can be stored in the X, otherwise Xt support panel will be used.
-
745 // For what concerns the second operand, it can be found for sure in W. In fact, the
-
746 // multiplication requires matching col(A) == row(W), but since coordinates are mirrored,
-
747 // we are matching row(A) == row(W), so it is local by construction.
-
748 const auto owner = dist.template rankGlobalTile<Coord::Row>(ij.col());
-
749
-
750 const LocalTileIndex index_x{dist.template localTileFromGlobalTile<Coord::Row>(ij.col()), 0};
-
751 const LocalTileIndex index_xt{0, ij_local.col()};
-
752
-
753 auto tile_x = (dist.rankIndex().row() == owner) ? x.readwrite(index_x) : xt.readwrite(index_xt);
-
754
-
755 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, std::move(tile_a), w.read(ij_local),
-
756 std::move(tile_x));
-
757 }
-
758 }
-
759 }
-
760
-
761 // Note:
-
762 // At this point, partial results of X and Xt are available in the panels, and they have to be reduced,
-
763 // both row-wise and col-wise.
-
764 // The final X result will be available just on Ai panel column.
-
765
-
766 // Note:
-
767 // The first step in reducing partial results distributed over X and Xt is to reduce the row
-
768 // panel Xt col-wise, by collecting all Xt results on the rank which can "mirror" the result on its
-
769 // rows (i.e. diagonal). So, for each tile of the row panel, select who is the "diagonal" rank that can
-
770 // mirror and reduce on it.
-
771 if (mpi_col_chain.size() > 1) {
-
772 for (const auto& index_xt : xt.iteratorLocal()) {
-
773 const auto index_k = dist.template globalTileFromLocalTile<Coord::Col>(index_xt.col());
-
774 const auto rank_owner_row = dist.template rankGlobalTile<Coord::Row>(index_k);
-
775
-
776 if (rank_owner_row == rank.row()) {
-
777 // Note:
-
778 // Since it is the owner, it has to perform the "mirroring" of the results from columns to
-
779 // rows.
-
780 //
-
781 // Moreover, it reduces in place because the owner of the diagonal stores the partial result
-
782 // directly in x (without using xt)
-
783 const auto i = dist.template localTileFromGlobalTile<Coord::Row>(index_k);
-
784 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_col_chain.exclusive(), MPI_SUM,
-
785 x.readwrite({i, 0})));
-
786 }
-
787 else {
-
788 ex::start_detached(comm::schedule_reduce_send(mpi_col_chain.exclusive(), rank_owner_row, MPI_SUM,
-
789 xt.read(index_xt)));
-
790 }
-
791 }
-
792 }
-
793
-
794 // Note:
-
795 // At this point partial results are all collected in X (Xt has been embedded in previous step),
-
796 // so the last step needed is to reduce these last partial results into the final results.
-
797 // The result is needed just on the column with reflectors.
-
798 if (mpi_row_chain.size() > 1) {
-
799 for (const auto& index_x : x.iteratorLocal()) {
-
800 if (reducer_col == rank.col())
-
801 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_row_chain.exclusive(), MPI_SUM,
-
802 x.readwrite(index_x)));
-
803 else
-
804 ex::start_detached(comm::schedule_reduce_send(mpi_row_chain.exclusive(), reducer_col, MPI_SUM,
-
805 x.read(index_x)));
-
806 }
-
807 }
-
808}
-
809
-
810template <Backend B, Device D, class T>
-
811void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, Matrix<T, D>& a,
-
812 matrix::Panel<Coord::Col, const T, D>& x,
-
813 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& vt,
-
814 matrix::Panel<Coord::Col, const T, D>& v,
-
815 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& xt) {
-
816 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
-
817
-
818 using pika::execution::thread_priority;
-
819
-
820 const auto dist = a.distribution();
-
821
-
822 const LocalTileIndex at_start = view.begin();
-
823
-
824 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
-
825 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
-
826 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
-
827 for (SizeType j = at_start.col(); j < limit; ++j) {
-
828 const LocalTileIndex ij_local{i, j};
-
829 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
741 // Since it is not a diagonal tile (otherwise it would have been managed in the previous
+
742 // branch), the second operand is not available in W but it is accessible through the
+
743 // support panel Wt.
+
744 // However, since we are still computing the "straight" part, the result can be stored
+
745 // in the "local" panel X.
+
746 hemmOffDiag<B>(thread_priority::high, blas::Op::NoTrans, tile_a, wt.read(ij_local),
+
747 x.readwrite(ij_local));
+
748
+
749 // Note:
+
750 // Here we are considering the Hermitian part of A, so coordinates have to be "mirrored".
+
751 // So, the first step is identifying the mirrored cell coordinate, i.e. swapping row/col, together
+
752 // with realizing if the new coord lies on an owned row or not.
+
753 // If yes, the result can be stored in the X, otherwise Xt support panel will be used.
+
754 // For what concerns the second operand, it can be found for sure in W. In fact, the
+
755 // multiplication requires matching col(A) == row(W), but since coordinates are mirrored,
+
756 // we are matching row(A) == row(W), so it is local by construction.
+
757 const auto owner = dist.template rankGlobalTile<Coord::Row>(ij.col());
+
758
+
759 const LocalTileIndex index_x{dist.template localTileFromGlobalTile<Coord::Row>(ij.col()), 0};
+
760 const LocalTileIndex index_xt{0, ij_local.col()};
+
761
+
762 auto tile_x = (dist.rankIndex().row() == owner) ? x.readwrite(index_x) : xt.readwrite(index_xt);
+
763
+
764 hemmOffDiag<B>(thread_priority::high, blas::Op::ConjTrans, std::move(tile_a), w.read(ij_local),
+
765 std::move(tile_x));
+
766 }
+
767 }
+
768 }
+
769
+
770 // Note:
+
771 // At this point, partial results of X and Xt are available in the panels, and they have to be reduced,
+
772 // both row-wise and col-wise.
+
773 // The final X result will be available just on Ai panel column.
+
774
+
775 // Note:
+
776 // The first step in reducing partial results distributed over X and Xt is to reduce the row
+
777 // panel Xt col-wise, by collecting all Xt results on the rank which can "mirror" the result on its
+
778 // rows (i.e. diagonal). So, for each tile of the row panel, select who is the "diagonal" rank that can
+
779 // mirror and reduce on it.
+
780 if (mpi_col_chain.size() > 1) {
+
781 for (const auto& index_xt : xt.iteratorLocal()) {
+
782 const auto index_k = dist.template globalTileFromLocalTile<Coord::Col>(index_xt.col());
+
783 const auto rank_owner_row = dist.template rankGlobalTile<Coord::Row>(index_k);
+
784
+
785 if (rank_owner_row == rank.row()) {
+
786 // Note:
+
787 // Since it is the owner, it has to perform the "mirroring" of the results from columns to
+
788 // rows.
+
789 //
+
790 // Moreover, it reduces in place because the owner of the diagonal stores the partial result
+
791 // directly in x (without using xt)
+
792 const auto i = dist.template localTileFromGlobalTile<Coord::Row>(index_k);
+
793 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_col_chain.exclusive(), MPI_SUM,
+
794 x.readwrite({i, 0})));
+
795 }
+
796 else {
+
797 ex::start_detached(comm::schedule_reduce_send(mpi_col_chain.exclusive(), rank_owner_row, MPI_SUM,
+
798 xt.read(index_xt)));
+
799 }
+
800 }
+
801 }
+
802
+
803 // Note:
+
804 // At this point partial results are all collected in X (Xt has been embedded in previous step),
+
805 // so the last step needed is to reduce these last partial results into the final results.
+
806 // The result is needed just on the column with reflectors.
+
807 if (mpi_row_chain.size() > 1) {
+
808 for (const auto& index_x : x.iteratorLocal()) {
+
809 if (reducer_col == rank.col())
+
810 ex::start_detached(comm::schedule_reduce_recv_in_place(mpi_row_chain.exclusive(), MPI_SUM,
+
811 x.readwrite(index_x)));
+
812 else
+
813 ex::start_detached(comm::schedule_reduce_send(mpi_row_chain.exclusive(), reducer_col, MPI_SUM,
+
814 x.read(index_x)));
+
815 }
+
816 }
+
817}
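Note: the two reduction stages above (Xt folded col-wise onto the diagonal owners, then X reduced row-wise onto the reflector column) can be pictured with plain MPI reductions on a 2D grid. A much-simplified sketch with one double per rank standing in for the panel tiles; communicators and rank roles are illustrative, not DLAF's pipelines:

#include <mpi.h>

// Sketch: fold a partial value col-wise onto owner_row, then row-wise onto
// reducer_col, mirroring schedule_reduce_send / schedule_reduce_recv_in_place.
double reduce_two_stage(MPI_Comm col_comm, MPI_Comm row_comm, int my_row,
                        int my_col, int owner_row, int reducer_col, double xt) {
  double x = 0.0;
  // Stage 1 (col-wise): the column's xt contributions land on the diagonal owner.
  MPI_Reduce(&xt, &x, 1, MPI_DOUBLE, MPI_SUM, owner_row, col_comm);
  if (my_row != owner_row)
    x = 0.0;  // non-owners have nothing mirrored into x

  double result = 0.0;
  // Stage 2 (row-wise): the per-row partials are combined on the reflector column.
  MPI_Reduce(&x, &result, 1, MPI_DOUBLE, MPI_SUM, reducer_col, row_comm);
  return (my_col == reducer_col) ? result : 0.0;
}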
+
818
+
819template <Backend B, Device D, class T>
+
820void her2kUpdateTrailingMatrix(const matrix::SubMatrixView& view, Matrix<T, D>& a,
+
821 matrix::Panel<Coord::Col, const T, D>& x,
+
822 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& vt,
+
823 matrix::Panel<Coord::Col, const T, D>& v,
+
824 matrix::Panel<Coord::Row, const T, D, matrix::StoreTransposed::Yes>& xt) {
+
825 static_assert(std::is_signed_v<BaseType<T>>, "alpha in computations requires to be -1");
+
826
+
827 using pika::execution::thread_priority;
+
828
+
829 const auto dist = a.distribution();
830
-
831 const bool is_diagonal_tile = (ij.row() == ij.col());
+
831 const LocalTileIndex at_start = view.begin();
832
-
833 auto getSubA = [&a, &view, ij_local]() {
-
834 return splitTile(a.readwrite(ij_local), view(ij_local));
-
835 };
-
836
-
837 // The first column of the trailing matrix (except for the very first global tile) has to be
-
838 // updated first, in order to unlock the next iteration as soon as possible.
-
839 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
-
840
-
841 if (is_diagonal_tile) {
-
842 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
-
843 }
-
844 else {
-
845 // A -= X . V*
-
846 her2kOffDiag<B>(priority, x.read(ij_local), vt.read(ij_local), getSubA());
-
847
-
848 // A -= V . X*
-
849 her2kOffDiag<B>(priority, v.read(ij_local), xt.read(ij_local), getSubA());
-
850 }
-
851 }
-
852 }
-
853}
-
854}
-
855
-
856template <Backend B, Device D, class T>
-
857struct ComputePanelHelper;
-
858
-
859template <class T>
-
-
860struct ComputePanelHelper<Backend::MC, Device::CPU, T> {
-
861 ComputePanelHelper(const std::size_t, matrix::Distribution) {}
-
862
-
863 void call(Matrix<T, Device::CPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
-
864 const matrix::SubPanelView& panel_view) {
-
865 using red2band::local::computePanelReflectors;
-
866 computePanelReflectors(mat_a, mat_taus, j_sub, panel_view);
-
867 }
-
868
-
869 template <Device D, class CommSender, class TriggerSender>
-
870 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
-
871 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
-
872 const matrix::SubPanelView& panel_view) {
-
873 using red2band::distributed::computePanelReflectors;
-
874 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
-
875 std::forward<CommSender>(mpi_col_chain_panel), mat_a, mat_taus, j_sub,
-
876 panel_view);
-
877 }
-
878};
+
833 for (SizeType i = at_start.row(); i < dist.localNrTiles().rows(); ++i) {
+
834 const auto limit = dist.template nextLocalTileFromGlobalTile<Coord::Col>(
+
835 dist.template globalTileFromLocalTile<Coord::Row>(i) + 1);
+
836 for (SizeType j = at_start.col(); j < limit; ++j) {
+
837 const LocalTileIndex ij_local{i, j};
+
838 const GlobalTileIndex ij = dist.globalTileIndex(ij_local);
+
839
+
840 const bool is_diagonal_tile = (ij.row() == ij.col());
+
841
+
842 auto getSubA = [&a, &view, ij_local]() {
+
843 return splitTile(a.readwrite(ij_local), view(ij_local));
+
844 };
+
845
+
846 // The first column of the trailing matrix (except for the very first global tile) has to be
+
847 // updated first, in order to unlock the next iteration as soon as possible.
+
848 const auto priority = (j == at_start.col()) ? thread_priority::high : thread_priority::normal;
+
849
+
850 if (is_diagonal_tile) {
+
851 her2kDiag<B>(priority, v.read(ij_local), x.read(ij_local), getSubA());
+
852 }
+
853 else {
+
854 // A -= X . V*
+
855 her2kOffDiag<B>(priority, x.read(ij_local), vt.read(ij_local), getSubA());
+
856
+
857 // A -= V . X*
+
858 her2kOffDiag<B>(priority, v.read(ij_local), xt.read(ij_local), getSubA());
+
859 }
+
860 }
+
861 }
+
862}
+
863}
+
864
+
865template <Backend B, Device D, class T>
+
866struct ComputePanelHelper;
+
867
+
868template <class T>
+
+
869struct ComputePanelHelper<Backend::MC, Device::CPU, T> {
+
870 ComputePanelHelper(const std::size_t, matrix::Distribution) {}
+
871
+
872 void call(Matrix<T, Device::CPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
+
873 const matrix::SubPanelView& panel_view) {
+
874 using red2band::local::computePanelReflectors;
+
875 computePanelReflectors(mat_a, mat_taus, j_sub, panel_view);
+
876 }
+
877
+
878 template <Device D, class CommSender, class TriggerSender>
+
879 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
+
880 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
+
881 const matrix::SubPanelView& panel_view) {
+
882 using red2band::distributed::computePanelReflectors;
+
883 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
+
884 std::forward<CommSender>(mpi_col_chain_panel), mat_a, mat_taus, j_sub,
+
885 panel_view);
+
886 }
+
887};
-
879
-
880#ifdef DLAF_WITH_GPU
-
881template <class T>
-
-
882struct ComputePanelHelper<Backend::GPU, Device::GPU, T> {
-
883 ComputePanelHelper(const std::size_t n_workspaces, matrix::Distribution dist_a)
-
884 : panels_v(n_workspaces, dist_a) {}
-
885
-
886 void call(Matrix<T, Device::GPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
-
887 const matrix::SubPanelView& panel_view) {
-
888 using red2band::local::computePanelReflectors;
-
889
-
890 namespace ex = pika::execution::experimental;
-
891
-
892 // Note:
-
893 // - copy panel_view from GPU to CPU
-
894 // - computePanelReflectors on CPU (on a matrix-like object holding just the panel)
-
895 // - copy back matrix "panel" from CPU to GPU
-
896
-
897 auto& v = panels_v.nextResource();
+
888
+
889#ifdef DLAF_WITH_GPU
+
890template <class T>
+
+
891struct ComputePanelHelper<Backend::GPU, Device::GPU, T> {
+
892 ComputePanelHelper(const std::size_t n_workspaces, matrix::Distribution dist_a)
+
893 : panels_v(n_workspaces, dist_a) {}
+
894
+
895 void call(Matrix<T, Device::GPU>& mat_a, Matrix<T, Device::CPU>& mat_taus, const SizeType j_sub,
+
896 const matrix::SubPanelView& panel_view) {
+
897 using red2band::local::computePanelReflectors;
898
-
899 copyToCPU(panel_view, mat_a, v);
-
900 computePanelReflectors(v, mat_taus, j_sub, panel_view);
-
901 copyFromCPU(panel_view, v, mat_a);
-
902 }
-
903
-
904 template <Device D, class CommSender, class TriggerSender>
-
905 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
-
906 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, SizeType j_sub,
-
907 const matrix::SubPanelView& panel_view) {
-
908 auto& v = panels_v.nextResource();
-
909
-
910 // copy to CPU
-
911 copyToCPU(panel_view, mat_a, v);
+
899 namespace ex = pika::execution::experimental;
+
900
+
901 // Note:
+
902 // - copy panel_view from GPU to CPU
+
903 // - computePanelReflectors on CPU (on a matrix-like object holding just the panel)
+
904 // - copy back matrix "panel" from CPU to GPU
+
905
+
906 auto& v = panels_v.nextResource();
+
907
+
908 copyToCPU(panel_view, mat_a, v);
+
909 computePanelReflectors(v, mat_taus, j_sub, panel_view);
+
910 copyFromCPU(panel_view, v, mat_a);
+
911 }
912
-
913 // compute on CPU
-
914 using dlaf::eigensolver::internal::red2band::distributed::computePanelReflectors;
-
915 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
-
916 std::forward<CommSender>(mpi_col_chain_panel), v, mat_taus, j_sub,
-
917 panel_view);
+
913 template <Device D, class CommSender, class TriggerSender>
+
914 void call(TriggerSender&& trigger, comm::IndexT_MPI rank_v0, CommSender&& mpi_col_chain_panel,
+
915 Matrix<T, D>& mat_a, Matrix<T, Device::CPU>& mat_taus, SizeType j_sub,
+
916 const matrix::SubPanelView& panel_view) {
+
917 auto& v = panels_v.nextResource();
918
-
919 // copy back to GPU
-
920 copyFromCPU(panel_view, v, mat_a);
-
921 }
-
922
-
923protected:
- -
925
-
926 void copyToCPU(const matrix::SubPanelView panel_view, matrix::Matrix<T, Device::GPU>& mat_a,
-
927                  matrix::Panel<Coord::Col, T, Device::CPU>& v) {
-
928 namespace ex = pika::execution::experimental;
-
929
-
930 using dlaf::internal::Policy;
-
931 using dlaf::matrix::internal::CopyBackend_v;
-
932 using pika::execution::thread_priority;
-
933 using pika::execution::thread_stacksize;
+
919 // copy to CPU
+
920 copyToCPU(panel_view, mat_a, v);
+
921
+
922 // compute on CPU
+
923 using dlaf::eigensolver::internal::red2band::distributed::computePanelReflectors;
+
924 computePanelReflectors(std::forward<TriggerSender>(trigger), rank_v0,
+
925 std::forward<CommSender>(mpi_col_chain_panel), v, mat_taus, j_sub,
+
926 panel_view);
+
927
+
928 // copy back to GPU
+
929 copyFromCPU(panel_view, v, mat_a);
+
930 }
+
931
+
932protected:
+
934
-
935 for (const auto& i : panel_view.iteratorLocal()) {
-
936 auto spec = panel_view(i);
-
937 auto tmp_tile = v.readwrite(i);
-
938 ex::start_detached(
-
939 ex::when_all(splitTile(mat_a.read(i), spec), splitTile(std::move(tmp_tile), spec)) |
-
940 matrix::copy(Policy<CopyBackend_v<Device::GPU, Device::CPU>>(thread_priority::high,
-
941 thread_stacksize::nostack)));
-
942 }
-
943 }
-
944
-
945 void copyFromCPU(const matrix::SubPanelView panel_view, matrix::Panel<Coord::Col, T, Device::CPU>& v,
-
946                    matrix::Matrix<T, Device::GPU>& mat_a) {
-
947 namespace ex = pika::execution::experimental;
-
948
-
949 using dlaf::internal::Policy;
-
950 using dlaf::matrix::internal::CopyBackend_v;
-
951 using pika::execution::thread_priority;
-
952 using pika::execution::thread_stacksize;
+
935 void copyToCPU(const matrix::SubPanelView panel_view, matrix::Matrix<T, Device::GPU>& mat_a,
+
936                  matrix::Panel<Coord::Col, T, Device::CPU>& v) {
+
937 namespace ex = pika::execution::experimental;
+
938
+
939 using dlaf::internal::Policy;
+
940 using dlaf::matrix::internal::CopyBackend_v;
+
941 using pika::execution::thread_priority;
+
942 using pika::execution::thread_stacksize;
+
943
+
944 for (const auto& i : panel_view.iteratorLocal()) {
+
945 auto spec = panel_view(i);
+
946 auto tmp_tile = v.readwrite(i);
+
947 ex::start_detached(
+
948 ex::when_all(splitTile(mat_a.read(i), spec), splitTile(std::move(tmp_tile), spec)) |
+
949 matrix::copy(Policy<CopyBackend_v<Device::GPU, Device::CPU>>(thread_priority::high,
+
950 thread_stacksize::nostack)));
+
951 }
+
952 }
953
-
954 for (const auto& i : panel_view.iteratorLocal()) {
-
955 auto spec = panel_view(i);
-
956 auto tile_a = mat_a.readwrite(i);
-
957 ex::start_detached(ex::when_all(splitTile(v.read(i), spec), splitTile(std::move(tile_a), spec)) |
-
958 matrix::copy(Policy<CopyBackend_v<Device::CPU, Device::GPU>>(
-
959 thread_priority::high, thread_stacksize::nostack)));
-
960 }
-
961 }
-
962};
+
954 void copyFromCPU(const matrix::SubPanelView panel_view, matrix::Panel<Coord::Col, T, Device::CPU>& v,
+
955 matrix::Matrix<T, Device::GPU>& mat_a) {
+
956 namespace ex = pika::execution::experimental;
+
957
+
958 using dlaf::internal::Policy;
+
959 using dlaf::matrix::internal::CopyBackend_v;
+
960 using pika::execution::thread_priority;
+
961 using pika::execution::thread_stacksize;
+
962
+
963 for (const auto& i : panel_view.iteratorLocal()) {
+
964 auto spec = panel_view(i);
+
965 auto tile_a = mat_a.readwrite(i);
+
966 ex::start_detached(ex::when_all(splitTile(v.read(i), spec), splitTile(std::move(tile_a), spec)) |
+
967 matrix::copy(Policy<CopyBackend_v<Device::CPU, Device::GPU>>(
+
968 thread_priority::high, thread_stacksize::nostack)));
+
969 }
+
970 }
+
971};
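The copy helpers in this class all share one sender-pipeline shape: join the source and destination tile senders with when_all, pipe the pair into a copy stage, and detach the resulting work. A reduced sketch of that shape with plain integers in place of tile senders (illustrative only; it uses only the public pika adaptors that appear in the listing):

#include <pika/execution.hpp>

// Join two input senders, transform the pair, and fire-and-forget the result,
// mirroring the when_all(...) | matrix::copy(...) | start_detached pattern above.
void detached_pair_copy_sketch() {
  namespace ex = pika::execution::experimental;
  ex::start_detached(ex::when_all(ex::just(1), ex::just(2)) |
                     ex::then([](int src, int dst) {
                       // in the real code, matrix::copy moves src's tile data into dst
                       (void) src;
                       (void) dst;
                     }));
}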
-
963#endif
-
964
-
965}
-
966
-
967// Local implementation of reduction to band
-
968template <Backend B, Device D, class T>
-
969Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const SizeType band_size) {
-
970 using dlaf::matrix::Matrix;
-
971 using dlaf::matrix::Panel;
-
972
-
973 using namespace red2band::local;
-
974
-
975 using common::iterate_range2d;
-
976 using factorization::internal::computeTFactor;
-
977
-
978 using pika::execution::experimental::any_sender;
-
979
-
980 const auto dist_a = mat_a.distribution();
-
981 const matrix::Distribution dist({mat_a.size().rows(), band_size},
-
982 {dist_a.blockSize().rows(), band_size});
+
972#endif
+
973
+
974}
+
975
+
976// Local implementation of reduction to band
+
977template <Backend B, Device D, class T>
+
978Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(Matrix<T, D>& mat_a, const SizeType band_size) {
+
979 using dlaf::matrix::Matrix;
+
980 using dlaf::matrix::Panel;
+
981
+
982 using namespace red2band::local;
983
-
984 // Note:
-
985 // A reflector of size = 1 is not considered, whatever T is (i.e. neither real nor complex)
-
986 const SizeType nrefls = std::max<SizeType>(0, dist_a.size().rows() - band_size - 1);
-
987
-
988 // Row-vector that is distributed over columns, but exists locally on all rows of the grid
-
989 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
-
990 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
-
991 TileElementSize(mat_a.blockSize().cols(), 1),
-
992 comm::Size2D(mat_a.commGridSize().cols(), 1),
-
993 comm::Index2D(mat_a.rankIndex().col(), 0),
-
994 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
-
995
-
996 if (nrefls == 0)
-
997 return mat_taus;
-
998
-
999 Matrix<T, Device::CPU> mat_taus_retiled =
-
1000 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
-
1001
-
1002 const SizeType ntiles = (nrefls - 1) / band_size + 1;
-
1003 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
+
984 using common::iterate_range2d;
+
985 using factorization::internal::computeTFactor;
+
986
+
987 using pika::execution::experimental::any_sender;
+
988
+
989 const auto dist_a = mat_a.distribution();
+
990 const matrix::Distribution dist({mat_a.size().rows(), band_size},
+
991 {dist_a.blockSize().rows(), band_size});
+
992
+
993 // Note:
+
994 // A reflector of size = 1 is not considered, whatever T is (i.e. neither real nor complex)
+
995 const SizeType nrefls = std::max<SizeType>(0, dist_a.size().rows() - band_size - 1);
+
996
+
997 // Row-vector that is distributed over columns, but exists locally on all rows of the grid
+
998 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
+
999 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
+
1000 TileElementSize(mat_a.blockSize().cols(), 1),
+
1001 comm::Size2D(mat_a.commGridSize().cols(), 1),
+
1002 comm::Index2D(mat_a.rankIndex().col(), 0),
+
1003 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
1004
-
1005 const bool is_full_band = (band_size == dist_a.blockSize().cols());
-
1006
-
1007 constexpr std::size_t n_workspaces = 2;
-
1008 common::RoundRobin<Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
-
1009 common::RoundRobin<Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
-
1010 common::RoundRobin<Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
-
1011
-
1012 // Note:
-
1013 // Here dist_a is given with the full panel size instead of dist with just the part actually needed,
-
1014 // because the GPU Helper internally exploits Panel data-structure. Indeed, the full size panel is
-
1015 // needed in order to mimic Matrix with Panel, so it is possible to apply a SubPanelView to it.
-
1016 //
-
1017 // It is a bit of a hacky usage, because SubPanelView is not meant to be used with Panel, but just with
-
1018 // Matrix. This results in a variable waste of memory, depending on the ratio band_size/nb.
-
1019 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist_a);
+
1005 if (nrefls == 0)
+
1006 return mat_taus;
+
1007
+
1008 Matrix<T, Device::CPU> mat_taus_retiled =
+
1009 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
+
1010
+
1011 const SizeType ntiles = (nrefls - 1) / band_size + 1;
+
1012 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
+
1013
+
1014 const bool is_full_band = (band_size == dist_a.blockSize().cols());
+
1015
+
1016 constexpr std::size_t n_workspaces = 2;
+
1017 common::RoundRobin<Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
+
1018 common::RoundRobin<Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
+
1019 common::RoundRobin<Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
1020
-
1021 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
-
1022 const auto i_sub = j_sub + 1;
-
1023
-
1024 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
-
1025
-
1026 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
-
1027
-
1028 const bool isPanelIncomplete = (nrefls_tile != band_size);
+
1021 // Note:
+
1022 // Here dist_a is given with the full panel size instead of dist with just the part actually needed,
+
1023 // because the GPU Helper internally exploits Panel data-structure. Indeed, the full size panel is
+
1024 // needed in order to mimic Matrix with Panel, so it is possible to apply a SubPanelView to it.
+
1025 //
+
1026 // It is a bit of a hacky usage, because SubPanelView is not meant to be used with Panel, but just with
+
1027 // Matrix. This results in a variable waste of memory, depending on the ratio band_size/nb.
+
1028 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist_a);
1029
-
1030 // Note: if this is running, it must have at least one valid reflector (i.e. with size > 1)
-
1031 DLAF_ASSERT_HEAVY(nrefls_tile != 0, nrefls_tile);
+
1030 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
+
1031 const auto i_sub = j_sub + 1;
1032
-
1033 // Note: SubPanelView is (at most) band_size wide, but it may contain a smaller number of
-
1034 // reflectors (i.e. at the end, when the last reflector size is 1)
-
1035 const matrix::SubPanelView panel_view(dist_a, ij_offset, band_size);
+
1033 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
+
1034
+
1035 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
1036
-
1037 Panel<Coord::Col, T, D>& v = panels_v.nextResource();
-
1038 v.setRangeStart(ij_offset);
-
1039 if (isPanelIncomplete)
-
1040 v.setWidth(nrefls_tile);
+
1037 const bool isPanelIncomplete = (nrefls_tile != band_size);
+
1038
+
1039 // Note: if this is running, it must have at least one valid reflector (i.e. with size > 1)
+
1040 DLAF_ASSERT_HEAVY(nrefls_tile != 0, nrefls_tile);
1041
-
1042 // PANEL
-
1043 compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view);
-
1044
-
1045 // Note:
-
1046 // - has_reflector_head tells if this rank owns the first tile of the panel (being local, always true)
-
1047 // - if !is_full_band it has to force a copy as a workaround, otherwise the trailing matrix update would deadlock
-
1048 // due to the tile shared between panel and trailing matrix
-
1049 constexpr bool has_reflector_head = true;
-
1050 setupReflectorPanelV<B, D, T>(has_reflector_head, panel_view, nrefls_tile, v, mat_a, !is_full_band);
-
1051
-
1052 const LocalTileIndex t_idx(0, 0);
-
1053 // TODO used just by the column, maybe we can re-use a panel tile?
-
1054 // TODO probably the first one in any panel is ok?
-
1055 Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
-
1056
-
1057 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx));
-
1058
-
1059 // PREPARATION FOR TRAILING MATRIX UPDATE
-
1060 const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size));
-
1061
-
1062 // Note: if there is no trailing matrix, the algorithm has finished
-
1063 if (!at_offset.isIn(mat_a.size()))
-
1064 break;
+
1042 // Note: SubPanelView is (at most) band_size wide, but it may contain a smaller number of
+
1043 // reflectors (i.e. at the end, when the last reflector size is 1)
+
1044 const matrix::SubPanelView panel_view(dist_a, ij_offset, band_size);
+
1045
+
1046 Panel<Coord::Col, T, D>& v = panels_v.nextResource();
+
1047 v.setRangeStart(ij_offset);
+
1048 if (isPanelIncomplete)
+
1049 v.setWidth(nrefls_tile);
+
1050
+
1051 // PANEL
+
1052 compute_panel_helper.call(mat_a, mat_taus_retiled, j_sub, panel_view);
+
1053
+
1054 // Note:
+
1055 // - has_reflector_head tells if this rank owns the first tile of the panel (being local, always true)
+
1056 // - if !is_full_band it has to force a copy as a workaround, otherwise the trailing matrix update would deadlock
+
1057 // due to the tile shared between panel and trailing matrix
+
1058 constexpr bool has_reflector_head = true;
+
1059 setupReflectorPanelV<B, D, T>(has_reflector_head, panel_view, nrefls_tile, v, mat_a, !is_full_band);
+
1060
+
1061 const LocalTileIndex t_idx(0, 0);
+
1062 // TODO used just by the column, maybe we can re-use a panel tile?
+
1063 // TODO probably the first one in any panel is ok?
+
1064 Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
1065
-
1066 const matrix::SubMatrixView trailing_matrix_view(dist_a, at_offset);
+
1066 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx));
1067
-
1068 // W = V . T
-
1069 Panel<Coord::Col, T, D>& w = panels_w.nextResource();
-
1070 w.setRangeStart(at_offset);
-
1071 if (isPanelIncomplete)
-
1072 w.setWidth(nrefls_tile);
-
1073
-
1074 trmmComputeW<B>(w, v, t.read(t_idx));
-
1075
-
1076 // X = At . W
-
1077 Panel<Coord::Col, T, D>& x = panels_x.nextResource();
-
1078 x.setRangeStart(at_offset);
-
1079 if (isPanelIncomplete)
-
1080 x.setWidth(nrefls_tile);
-
1081
-
1082 // Note:
-
1083 // Since At is hermitian, just the lower part is referenced.
-
1084 // When the tile is not part of the main diagonal, the same tile has to be used for two computations
-
1085 // that will contribute to two different rows of X: the ones indexed with row and col.
-
1086 hemmComputeX<B>(x, trailing_matrix_view, mat_a, w);
-
1087
-
1088 // In the next section the following two operations are performed
-
1089 // A) W2 = W* . X
-
1090 // B) X -= 1/2 . V . W2
-
1091
-
1092 // Note:
-
1093 // T can be re-used because it is not needed anymore in this step and it has the same shape
-
1094 Matrix<T, D> w2 = std::move(t);
-
1095
-
1096 gemmComputeW2<B>(w2, w, x);
-
1097 gemmUpdateX<B>(x, w2, v);
-
1098
-
1099 // TRAILING MATRIX UPDATE
+
1068 // PREPARATION FOR TRAILING MATRIX UPDATE
+
1069 const GlobalElementIndex at_offset(ij_offset + GlobalElementSize(0, band_size));
+
1070
+
1071 // Note: if there is no trailing matrix, the algorithm has finished
+
1072 if (!at_offset.isIn(mat_a.size()))
+
1073 break;
+
1074
+
1075 const matrix::SubMatrixView trailing_matrix_view(dist_a, at_offset);
+
1076
+
1077 // W = V . T
+
1078 Panel<Coord::Col, T, D>& w = panels_w.nextResource();
+
1079 w.setRangeStart(at_offset);
+
1080 if (isPanelIncomplete)
+
1081 w.setWidth(nrefls_tile);
+
1082
+
1083 trmmComputeW<B>(w, v, t.read(t_idx));
+
1084
+
1085 // X = At . W
+
1086 Panel<Coord::Col, T, D>& x = panels_x.nextResource();
+
1087 x.setRangeStart(at_offset);
+
1088 if (isPanelIncomplete)
+
1089 x.setWidth(nrefls_tile);
+
1090
+
1091 // Note:
+
1092 // Since At is hermitian, just the lower part is referenced.
+
1093 // When the tile is not part of the main diagonal, the same tile has to be used for two computations
+
1094 // that will contribute to two different rows of X: the ones indexed with row and col.
+
1095 hemmComputeX<B>(x, trailing_matrix_view, mat_a, w);
+
1096
+
1097 // In the next section the following two operations are performed
+
1098 // A) W2 = W* . X
+
1099 // B) X -= 1/2 . V . W2
1100
-
1101 // At -= X . V* + V . X*
-
1102 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, v);
-
1103
-
1104 x.reset();
-
1105 w.reset();
-
1106 v.reset();
-
1107 }
-
1108
-
1109 return mat_taus;
-
1110}
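For reference, the per-panel update performed by the call() above can be written compactly, following the comments in the listing (V holds the panel reflectors, T the factor from computeTFactor, A_t the trailing matrix):

\begin{align*}
W   &= V\,T                                  && \text{(trmmComputeW)}\\
X   &= A_t\,W                                && \text{(hemmComputeX)}\\
W_2 &= W^H X                                 && \text{(gemmComputeW2)}\\
X   &\leftarrow X - \tfrac{1}{2}\,V\,W_2     && \text{(gemmUpdateX)}\\
A_t &\leftarrow A_t - \left(X\,V^H + V\,X^H\right) && \text{(her2kUpdateTrailingMatrix)}
\end{align*}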
-
1111
-
1112// Distributed implementation of reduction to band
-
1113template <Backend B, Device D, class T>
-
1114Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& grid, Matrix<T, D>& mat_a,
-
1115 const SizeType band_size) {
-
1116 using namespace red2band::distributed;
+
1101 // Note:
+
1102 // T can be re-used because it is not needed anymore in this step and it has the same shape
+
1103 Matrix<T, D> w2 = std::move(t);
+
1104
+
1105 gemmComputeW2<B>(w2, w, x);
+
1106 gemmUpdateX<B>(x, w2, v);
+
1107
+
1108 // TRAILING MATRIX UPDATE
+
1109
+
1110 // At -= X . V* + V . X*
+
1111 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, v);
+
1112
+
1113 x.reset();
+
1114 w.reset();
+
1115 v.reset();
+
1116 }
1117
-
1118 using common::iterate_range2d;
-
1119 using factorization::internal::computeTFactor;
+
1118 return mat_taus;
+
1119}
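The hemmComputeX note above (only the lower part of the Hermitian A_t is referenced, and each off-diagonal tile contributes to two rows of X) reduces to the following scalar sketch, with double values standing in for tiles (an illustrative reduction, not the DLA-Future kernel):

#include <cstddef>
#include <vector>

// Symmetric matrix-vector product using only the stored lower triangle:
// entry a[i][j] (i > j) contributes to both x[i] (direct) and x[j] (mirrored).
void hemm_lower_sketch(const std::vector<std::vector<double>>& a,
                       const std::vector<double>& w, std::vector<double>& x) {
  const std::size_t n = w.size();
  for (std::size_t i = 0; i < n; ++i) {
    for (std::size_t j = 0; j <= i; ++j) {
      x[i] += a[i][j] * w[j];    // contribution to row i
      if (i != j)
        x[j] += a[i][j] * w[i];  // mirrored contribution to row j (conjugated for complex T)
    }
  }
}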
1120
-
1121 namespace ex = pika::execution::experimental;
-
1122
-
1123 // Note:
-
1124 // This is a temporary workaround.
-
1125 // See issue https://github.com/eth-cscs/DLA-Future/issues/729
-
1126 pika::wait();
-
1127
-
1128 // This algorithm requires the grid to have at least 2 independent column communicators in the round
-
1129 // robin array. If there is only one communicator, mpi_col_chain and mpi_col_chain_panel will be
-
1130 // separate pipelines to the same communicator, but since communication is interleaved between the
-
1131 // pipelines this algorithm will deadlock (separate subpipelines means that all work on the previous
-
1132 // subpipeline has to complete before the next subpipeline can even start scheduling work).
-
1133 DLAF_ASSERT(grid.num_pipelines() >= 2, grid.num_pipelines());
-
1134 auto mpi_row_chain = grid.row_communicator_pipeline();
-
1135 auto mpi_col_chain = grid.col_communicator_pipeline();
-
1136 auto mpi_col_chain_panel = grid.col_communicator_pipeline();
-
1137
-
1138#ifdef DLAF_WITH_HDF5
-
1139 static std::atomic<size_t> num_reduction_to_band_calls = 0;
-
1140 std::stringstream fname;
-
1141 fname << "reduction_to_band-" << matrix::internal::TypeToString_v<T> << "-"
-
1142 << std::to_string(num_reduction_to_band_calls) << ".h5";
-
1143 std::optional<matrix::internal::FileHDF5> file;
-
1144
-
1145 if (getTuneParameters().debug_dump_reduction_to_band_data) {
-
1146 file = matrix::internal::FileHDF5(grid.fullCommunicator(), fname.str());
-
1147 file->write(mat_a, "/input");
-
1148 }
-
1149#endif
-
1150
-
1151 const auto& dist = mat_a.distribution();
-
1152 const comm::Index2D rank = dist.rankIndex();
+
1121// Distributed implementation of reduction to band
+
1122template <Backend B, Device D, class T>
+
1123Matrix<T, Device::CPU> ReductionToBand<B, D, T>::call(comm::CommunicatorGrid& grid, Matrix<T, D>& mat_a,
+
1124 const SizeType band_size) {
+
1125 using namespace red2band::distributed;
+
1126
+
1127 using common::iterate_range2d;
+
1128 using factorization::internal::computeTFactor;
+
1129
+
1130 namespace ex = pika::execution::experimental;
+
1131
+
1132 // Note:
+
1133 // This is a temporary workaround.
+
1134 // See issue https://github.com/eth-cscs/DLA-Future/issues/729
+
1135 pika::wait();
+
1136
+
1137 // This algorithm requires the grid to have at least 2 independent column communicators in the round
+
1138 // robin array. If there is only one communicator, mpi_col_chain and mpi_col_chain_panel will be
+
1139 // separate pipelines to the same communicator, but since communication is interleaved between the
+
1140 // pipelines this algorithm will deadlock (separate subpipelines means that all work on the previous
+
1141 // subpipeline has to complete before the next subpipeline can even start scheduling work).
+
1142 DLAF_ASSERT(grid.num_pipelines() >= 2, grid.num_pipelines());
+
1143 auto mpi_row_chain = grid.row_communicator_pipeline();
+
1144 auto mpi_col_chain = grid.col_communicator_pipeline();
+
1145 auto mpi_col_chain_panel = grid.col_communicator_pipeline();
+
1146
+
1147#ifdef DLAF_WITH_HDF5
+
1148 static std::atomic<size_t> num_reduction_to_band_calls = 0;
+
1149 std::stringstream fname;
+
1150 fname << "reduction_to_band-" << matrix::internal::TypeToString_v<T> << "-"
+
1151 << std::to_string(num_reduction_to_band_calls) << ".h5";
+
1152 std::optional<matrix::internal::FileHDF5> file;
1153
-
1154 // Note:
-
1155 // A reflector of size = 1 is not considered, whatever T is (i.e. neither real nor complex)
-
1156 const SizeType nrefls = std::max<SizeType>(0, dist.size().rows() - band_size - 1);
-
1157
-
1158 // Row-vector that is distributed over columns, but exists locally on all rows of the grid
-
1159 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
-
1160 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
-
1161 TileElementSize(mat_a.blockSize().cols(), 1),
-
1162 comm::Size2D(mat_a.commGridSize().cols(), 1),
-
1163 comm::Index2D(mat_a.rankIndex().col(), 0),
-
1164 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
-
1165
-
1166 if (nrefls == 0) {
-
1167#ifdef DLAF_WITH_HDF5
-
1168 if (getTuneParameters().debug_dump_reduction_to_band_data) {
-
1169 file->write(mat_a, "/band");
-
1170 }
-
1171
-
1172 num_reduction_to_band_calls++;
-
1173#endif
+
1154 if (getTuneParameters().debug_dump_reduction_to_band_data) {
+
1155 file = matrix::internal::FileHDF5(grid.fullCommunicator(), fname.str());
+
1156 file->write(mat_a, "/input");
+
1157 }
+
1158#endif
+
1159
+
1160 const auto& dist = mat_a.distribution();
+
1161 const comm::Index2D rank = dist.rankIndex();
+
1162
+
1163 // Note:
+
1164 // A reflector of size = 1 is not considered, whatever T is (i.e. neither real nor complex)
+
1165 const SizeType nrefls = std::max<SizeType>(0, dist.size().rows() - band_size - 1);
+
1166
+
1167 // Row-vector that is distributed over columns, but exists locally on all rows of the grid
+
1168 DLAF_ASSERT(mat_a.blockSize().cols() % band_size == 0, mat_a.blockSize().cols(), band_size);
+
1169 Matrix<T, Device::CPU> mat_taus(matrix::Distribution(GlobalElementSize(nrefls, 1),
+
1170 TileElementSize(mat_a.blockSize().cols(), 1),
+
1171 comm::Size2D(mat_a.commGridSize().cols(), 1),
+
1172 comm::Index2D(mat_a.rankIndex().col(), 0),
+
1173 comm::Index2D(mat_a.sourceRankIndex().col(), 0)));
1174
-
1175 return mat_taus;
-
1176 }
-
1177
-
1178 Matrix<T, Device::CPU> mat_taus_retiled =
-
1179 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
+
1175 if (nrefls == 0) {
+
1176#ifdef DLAF_WITH_HDF5
+
1177 if (getTuneParameters().debug_dump_reduction_to_band_data) {
+
1178 file->write(mat_a, "/band");
+
1179 }
1180
-
1181 const SizeType ntiles = (nrefls - 1) / band_size + 1;
-
1182 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
+
1181 num_reduction_to_band_calls++;
+
1182#endif
1183
-
1184 const bool is_full_band = (band_size == dist.blockSize().cols());
-
1185
-
1186 constexpr std::size_t n_workspaces = 2;
-
1187 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
-
1188 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_vt(
-
1189 n_workspaces, dist);
-
1190
-
1191 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
-
1192 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_wt(
-
1193 n_workspaces, dist);
+
1184 return mat_taus;
+
1185 }
+
1186
+
1187 Matrix<T, Device::CPU> mat_taus_retiled =
+
1188 mat_taus.retiledSubPipeline(LocalTileSize(mat_a.blockSize().cols() / band_size, 1));
+
1189
+
1190 const SizeType ntiles = (nrefls - 1) / band_size + 1;
+
1191 DLAF_ASSERT(ntiles == mat_taus_retiled.nrTiles().rows(), ntiles, mat_taus_retiled.nrTiles().rows());
+
1192
+
1193 const bool is_full_band = (band_size == dist.blockSize().cols());
1194
-
1195 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
-
1196 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_xt(
-
1197 n_workspaces, dist);
-
1198
-
1199 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist);
-
1200
-
1201 ex::unique_any_sender<> trigger_panel{ex::just()};
-
1202 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
-
1203 const SizeType i_sub = j_sub + 1;
-
1204
-
1205 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
-
1206 const GlobalElementIndex at_offset(i_sub * band_size, (j_sub + 1) * band_size);
+
1195 constexpr std::size_t n_workspaces = 2;
+
1196 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_v(n_workspaces, dist);
+
1197 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_vt(
+
1198 n_workspaces, dist);
+
1199
+
1200 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_w(n_workspaces, dist);
+
1201 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_wt(
+
1202 n_workspaces, dist);
+
1203
+
1204 common::RoundRobin<matrix::Panel<Coord::Col, T, D>> panels_x(n_workspaces, dist);
+
1205 common::RoundRobin<matrix::Panel<Coord::Row, T, D, matrix::StoreTransposed::Yes>> panels_xt(
+
1206 n_workspaces, dist);
1207
-
1208 const comm::Index2D rank_v0{
-
1209 dist.template rankGlobalElement<Coord::Row>(ij_offset.row()),
-
1210 dist.template rankGlobalElement<Coord::Col>(ij_offset.col()),
-
1211 };
-
1212
-
1213 const bool is_panel_rank_col = rank_v0.col() == rank.col();
-
1214
-
1215 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
+
1208 red2band::ComputePanelHelper<B, D, T> compute_panel_helper(n_workspaces, dist);
+
1209
+
1210 ex::unique_any_sender<> trigger_panel{ex::just()};
+
1211 for (SizeType j_sub = 0; j_sub < ntiles; ++j_sub) {
+
1212 const SizeType i_sub = j_sub + 1;
+
1213
+
1214 const GlobalElementIndex ij_offset(i_sub * band_size, j_sub * band_size);
+
1215 const GlobalElementIndex at_offset(i_sub * band_size, (j_sub + 1) * band_size);
1216
-
1217 if (nrefls_tile == 0)
-
1218 break;
-
1219
-
1220 auto& v = panels_v.nextResource();
-
1221 auto& vt = panels_vt.nextResource();
-
1222
-
1223 v.setRangeStart(at_offset);
-
1224 vt.setRangeStart(at_offset);
+
1217 const comm::Index2D rank_v0{
+
1218 dist.template rankGlobalElement<Coord::Row>(ij_offset.row()),
+
1219 dist.template rankGlobalElement<Coord::Col>(ij_offset.col()),
+
1220 };
+
1221
+
1222 const bool is_panel_rank_col = rank_v0.col() == rank.col();
+
1223
+
1224 const SizeType nrefls_tile = mat_taus_retiled.tileSize(GlobalTileIndex(j_sub, 0)).rows();
1225
-
1226 v.setWidth(nrefls_tile);
-
1227 vt.setHeight(nrefls_tile);
+
1226 if (nrefls_tile == 0)
+
1227 break;
1228
-
1229 const LocalTileIndex t_idx(0, 0);
-
1230 // TODO used just by the column, maybe we can re-use a panel tile?
-
1231 // TODO or we can keep just the sh_future and allocate just inside if (is_panel_rank_col)
-
1232 matrix::Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
-
1233
-
1234 // PANEL
-
1235 const matrix::SubPanelView panel_view(dist, ij_offset, band_size);
-
1236
-
1237 if (is_panel_rank_col) {
-
1238 compute_panel_helper.call(std::move(trigger_panel), rank_v0.row(), mpi_col_chain_panel.exclusive(),
-
1239 mat_a, mat_taus_retiled, j_sub, panel_view);
-
1240
-
1241 // Note:
-
1242 // - has_reflector_head tells if this rank owns the first tile of the panel
-
1243 // - if !is_full_band it has to force a copy as a workaround, otherwise the trailing matrix update would
-
1244 // deadlock due to the tile shared between panel and trailing matrix
-
1245 red2band::local::setupReflectorPanelV<B, D, T>(rank.row() == rank_v0.row(), panel_view,
-
1246 nrefls_tile, v, mat_a, !is_full_band);
-
1247 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx),
-
1248 mpi_col_chain);
-
1249 }
-
1250
-
1251 // PREPARATION FOR TRAILING MATRIX UPDATE
-
1252
-
1253 // Note: if there is no trailing matrix, the algorithm has finished
-
1254 if (!at_offset.isIn(mat_a.size()))
-
1255 break;
-
1256
-
1257 const matrix::SubMatrixView trailing_matrix_view(dist, at_offset);
-
1258
-
1259 comm::broadcast(rank_v0.col(), v, vt, mpi_row_chain, mpi_col_chain);
-
1260
-
1261 // W = V . T
-
1262 auto& w = panels_w.nextResource();
-
1263 auto& wt = panels_wt.nextResource();
-
1264
-
1265 w.setRangeStart(at_offset);
-
1266 wt.setRangeStart(at_offset);
+
1229 auto& v = panels_v.nextResource();
+
1230 auto& vt = panels_vt.nextResource();
+
1231
+
1232 v.setRangeStart(at_offset);
+
1233 vt.setRangeStart(at_offset);
+
1234
+
1235 v.setWidth(nrefls_tile);
+
1236 vt.setHeight(nrefls_tile);
+
1237
+
1238 const LocalTileIndex t_idx(0, 0);
+
1239 // TODO used just by the column, maybe we can re-use a panel tile?
+
1240 // TODO or we can keep just the sh_future and allocate just inside if (is_panel_rank_col)
+
1241 matrix::Matrix<T, D> t({nrefls_tile, nrefls_tile}, dist.blockSize());
+
1242
+
1243 // PANEL
+
1244 const matrix::SubPanelView panel_view(dist, ij_offset, band_size);
+
1245
+
1246 if (is_panel_rank_col) {
+
1247 compute_panel_helper.call(std::move(trigger_panel), rank_v0.row(), mpi_col_chain_panel.exclusive(),
+
1248 mat_a, mat_taus_retiled, j_sub, panel_view);
+
1249
+
1250 // Note:
+
1251 // - has_reflector_head tells if this rank owns the first tile of the panel
+
1252 // - if !is_full_band it has to force a copy as a workaround, otherwise the trailing matrix update would
+
1253 // deadlock due to the tile shared between panel and trailing matrix
+
1254 red2band::local::setupReflectorPanelV<B, D, T>(rank.row() == rank_v0.row(), panel_view,
+
1255 nrefls_tile, v, mat_a, !is_full_band);
+
1256 computeTFactor<B>(v, mat_taus_retiled.read(GlobalTileIndex(j_sub, 0)), t.readwrite(t_idx),
+
1257 mpi_col_chain);
+
1258 }
+
1259
+
1260 // PREPARATION FOR TRAILING MATRIX UPDATE
+
1261
+
1262 // Note: if there is no trailing matrix, the algorithm has finished
+
1263 if (!at_offset.isIn(mat_a.size()))
+
1264 break;
+
1265
+
1266 const matrix::SubMatrixView trailing_matrix_view(dist, at_offset);
1267
-
1268 w.setWidth(nrefls_tile);
-
1269 wt.setHeight(nrefls_tile);
-
1270
-
1271 if (is_panel_rank_col)
-
1272 red2band::local::trmmComputeW<B, D>(w, v, t.read(t_idx));
+
1268 comm::broadcast(rank_v0.col(), v, vt, mpi_row_chain, mpi_col_chain);
+
1269
+
1270 // W = V . T
+
1271 auto& w = panels_w.nextResource();
+
1272 auto& wt = panels_wt.nextResource();
1273
-
1274 comm::broadcast(rank_v0.col(), w, wt, mpi_row_chain, mpi_col_chain);
-
1275
-
1276 // X = At . W
-
1277 auto& x = panels_x.nextResource();
-
1278 auto& xt = panels_xt.nextResource();
+
1274 w.setRangeStart(at_offset);
+
1275 wt.setRangeStart(at_offset);
+
1276
+
1277 w.setWidth(nrefls_tile);
+
1278 wt.setHeight(nrefls_tile);
1279
-
1280 x.setRangeStart(at_offset);
-
1281 xt.setRangeStart(at_offset);
+
1280 if (is_panel_rank_col)
+
1281 red2band::local::trmmComputeW<B, D>(w, v, t.read(t_idx));
1282
-
1283 x.setWidth(nrefls_tile);
-
1284 xt.setHeight(nrefls_tile);
-
1285
-
1286 // Note:
-
1287 // Since At is hermitian, just the lower part is referenced.
-
1288 // When the tile is not part of the main diagonal, the same tile has to be used for two computations
-
1289 // that will contribute to two different rows of X: the ones indexed with row and col.
-
1290 // This is achieved by storing the two results in two different workspaces: X and X_conj respectively.
-
1291 //
-
1292 // On exit, x will contain a valid result just on ranks belonging to the column panel.
-
1293 // As for xt, it is just used as support and it contains junk data on all ranks.
-
1294 hemmComputeX<B, D>(rank_v0.col(), x, xt, trailing_matrix_view, mat_a, w, wt, mpi_row_chain,
-
1295 mpi_col_chain);
-
1296
-
1297 // In the next section the following two operations are performed
-
1298 // A) W2 = W* . X
-
1299 // B) X -= 1/2 . V . W2
-
1300
-
1301 // Note:
-
1302 // Now the intermediate result for X is available on the panel column ranks,
-
1303 // which locally have everything needed for updating X and finalizing the result
-
1304 if (is_panel_rank_col) {
-
1305 // Note:
-
1306 // T can be re-used because it is not needed anymore in this step and it has the same shape
-
1307 matrix::Matrix<T, D> w2 = std::move(t);
-
1308
-
1309 red2band::local::gemmComputeW2<B, D>(w2, w, x);
-
1310 if (mpi_col_chain.size() > 1) {
-
1311 ex::start_detached(comm::schedule_all_reduce_in_place(mpi_col_chain.exclusive(), MPI_SUM,
-
1312 w2.readwrite(LocalTileIndex(0, 0))));
-
1313 }
-
1314
-
1315 red2band::local::gemmUpdateX<B, D>(x, w2, v);
-
1316 }
+
1283 comm::broadcast(rank_v0.col(), w, wt, mpi_row_chain, mpi_col_chain);
+
1284
+
1285 // X = At . W
+
1286 auto& x = panels_x.nextResource();
+
1287 auto& xt = panels_xt.nextResource();
+
1288
+
1289 x.setRangeStart(at_offset);
+
1290 xt.setRangeStart(at_offset);
+
1291
+
1292 x.setWidth(nrefls_tile);
+
1293 xt.setHeight(nrefls_tile);
+
1294
+
1295 // Note:
+
1296 // Since At is hermitian, just the lower part is referenced.
+
1297 // When the tile is not part of the main diagonal, the same tile has to be used for two computations
+
1298 // that will contribute to two different rows of X: the ones indexed with row and col.
+
1299 // This is achieved by storing the two results in two different workspaces: X and X_conj respectively.
+
1300 //
+
1301 // On exit, x will contain a valid result just on ranks belonging to the column panel.
+
1302 // As for xt, it is just used as support and it contains junk data on all ranks.
+
1303 hemmComputeX<B, D>(rank_v0.col(), x, xt, trailing_matrix_view, mat_a, w, wt, mpi_row_chain,
+
1304 mpi_col_chain);
+
1305
+
1306 // In the next section the following two operations are performed
+
1307 // A) W2 = W* . X
+
1308 // B) X -= 1/2 . V . W2
+
1309
+
1310 // Note:
+
1311 // Now the intermediate result for X is available on the panel column ranks,
+
1312 // which locally have everything needed for updating X and finalizing the result
+
1313 if (is_panel_rank_col) {
+
1314 // Note:
+
1315 // T can be re-used because it is not needed anymore in this step and it has the same shape
+
1316 matrix::Matrix<T, D> w2 = std::move(t);
1317
-
1318 // Note:
-
1319 // xt has been used previously as workspace for hemmComputeX, so it has to be reset, because now it
-
1320 // will be used for accessing the broadcast version of x
-
1321 xt.reset();
-
1322 xt.setRangeStart(at_offset);
-
1323 xt.setHeight(nrefls_tile);
-
1324
-
1325 comm::broadcast(rank_v0.col(), x, xt, mpi_row_chain, mpi_col_chain);
+
1318 red2band::local::gemmComputeW2<B, D>(w2, w, x);
+
1319 if (mpi_col_chain.size() > 1) {
+
1320 ex::start_detached(comm::schedule_all_reduce_in_place(mpi_col_chain.exclusive(), MPI_SUM,
+
1321 w2.readwrite(LocalTileIndex(0, 0))));
+
1322 }
+
1323
+
1324 red2band::local::gemmUpdateX<B, D>(x, w2, v);
+
1325 }
1326
-
1327 // TRAILING MATRIX UPDATE
-
1328
-
1329 // Note:
-
1330 // This trigger mechanism allows controlling when the next iteration of compute panel will start.
-
1331 //
-
1332 // * What?
-
1333 // Compute panel uses MPI blocking communication that might block the only computing thread
-
1334 // available (since blocking communications are scheduled on normal queues and not on the MPI
-
1335 // dedicated one).
-
1336 //
-
1337 // * How?
-
1338 // If pika runtime has only 2 threads, one is dedicated to MPI and there is just one for
-
1339 // computation, that might get blocked by blocking MPI communication, without the chance to do
-
1340 // anything else. (TODO this might happen even with more reductions happening in parallel)
-
1341 //
-
1342 // * Why?
-
1343 // Panel computation at step i is done on the first column of the trailing matrix computed
-
1344 // at step i-1.
-
1345 // The rank owning the top-left tile of the trailing matrix can update it as soon as it
-
1346 // receives X[0], which due to the pivot position is also the Xt[0]. Once it can go to the next
-
1347 // iteration, it ends up stuck in an MPI blocking communication, waiting for the others to join
-
1348 // before being able to advance.
-
1349 //
-
1350 // But at the same time, other ranks in the same column (needed for the next panel update) cannot
-
1351 // complete the trailing matrix update. Indeed, they are waiting for the pivot rank to communicate
-
1352 // column-wise Xt[0] (during x -> xt panel transpose broadcast), but it is not going to schedule
-
1353 // anything because the only normal thread which can do that is stuck in an MPI blocking
-
1354 // communication that is not going to advance... and so it's a DEADLOCK!
-
1355 //
-
1356 // * Solution:
-
1357 // The idea is to make the next panel depend not only on tiles stored locally, but also on
-
1358 // ensuring that others have received Xt[0], which is needed to advance the computation and let
-
1359 // others arrive at the next iteration where the pivot will wait for them to complete the MPI
-
1360 // blocking communication.
-
1361 //
-
1362 // * Why is it different between MC and GPU?
-
1363 // As said above, the problem is related to the communication. But the communication is not
-
1364 // guaranteed to be an atomic operation happening in a single task. It might have to create a copy to
-
1365 // a buffer more suitable for the communication (e.g. GPU -> CPU if GPU-aware MPI is not
-
1366 // available).
-
1367 //
-
1368 // And in order to not be blocked, it must be ensured that the actual communication task has
-
1369 // been scheduled.
-
1370 const SizeType j_tile_current = ij_offset.col() / dist.blockSize().cols();
-
1371 const SizeType j_tile_next = at_offset.col() / dist.blockSize().cols();
-
1372 const bool isNextColumnOnSameRank = (j_tile_current == j_tile_next);
-
1373 const comm::IndexT_MPI rank_next_col =
-
1374 isNextColumnOnSameRank ? rank_v0.col() : (rank_v0.col() + 1) % dist.commGridSize().cols();
-
1375
-
1376 if (rank.col() == rank_next_col) {
-
1377 const LocalTileIndex at{
-
1378 dist.template nextLocalTileFromGlobalElement<Coord::Row>(at_offset.row()),
-
1379 dist.template nextLocalTileFromGlobalElement<Coord::Col>(at_offset.col()),
-
1380 };
-
1381
-
1382 // Note:
-
1383 // This additional communication of the last tile is a workaround to support the following trigger
-
1384 // when b < mb.
-
1385 // Indeed, if b < mb the last column has (at least) a panel to compute, but unlike
-
1386 // other columns, the transposed broadcast doesn't communicate the last tile, which is an assumption
-
1387 // needed to make the following trigger work correctly.
-
1388 const SizeType at_tile_col =
-
1389 dist.template globalTileFromGlobalElement<Coord::Col>(at_offset.col());
+
1327 // Note:
+
1328 // xt has been used previously as workspace for hemmComputeX, so it has to be reset, because now it
+
1329 // will be used for accessing the broadcast version of x
+
1330 xt.reset();
+
1331 xt.setRangeStart(at_offset);
+
1332 xt.setHeight(nrefls_tile);
+
1333
+
1334 comm::broadcast(rank_v0.col(), x, xt, mpi_row_chain, mpi_col_chain);
+
1335
+
1336 // TRAILING MATRIX UPDATE
+
1337
+
1338 // Note:
+
1339 // This trigger mechanism allows controlling when the next iteration of compute panel will start.
+
1340 //
+
1341 // * What?
+
1342 // Compute panel uses MPI blocking communication that might block the only computing thread
+
1343 // available (since blocking communications are scheduled on normal queues and not on the MPI
+
1344 // dedicated one).
+
1345 //
+
1346 // * How?
+
1347 // If pika runtime has only 2 threads, one is dedicated to MPI and there is just one for
+
1348 // computation, that might get blocked by blocking MPI communication, without the chance to do
+
1349 // anything else. (TODO this might happen even with more reductions happening in parallel)
+
1350 //
+
1351 // * Why?
+
1352 // Panel computation at step i is done on the first column of the trailing matrix computed
+
1353 // at step i-1.
+
1354 // The rank owning the top-left tile of the trailing matrix can update it as soon as it
+
1355 // receives X[0], which due to the pivot position is also the Xt[0]. Once it can go to the next
+
1356 // iteration, it ends up stuck in an MPI blocking communication, waiting for the others to join
+
1357 // before being able to advance.
+
1358 //
+
1359 // But at the same time, other ranks in the same column (needed for the next panel update) cannot
+
1360 // complete the trailing matrix update. Indeed, they are waiting for the pivot rank to communicate
+
1361 // column-wise Xt[0] (during x -> xt panel transpose broadcast), but it is not going to schedule
+
1362 // anything because the only normal thread which can do that is stuck in an MPI blocking
+
1363 // communication that is not going to advance... and so it's a DEADLOCK!
+
1364 //
+
1365 // * Solution:
+
1366 // The idea is to make the next panel depend not only on tiles stored locally, but also on
+
1367 // ensuring that others have received Xt[0], which is needed to advance the computation and let
+
1368 // others arrive at the next iteration where the pivot will wait for them to complete the MPI
+
1369 // blocking communication.
+
1370 //
+
1371 // * Why is it different between MC and GPU?
+
1372 // As said above, the problem is related to the communication. But the communication is not
+
1373 // guaranteed to be an atomic operation happening in a single task. It might have to create a copy to
+
1374 // a buffer more suitable for the communication (e.g. GPU -> CPU if GPU-aware MPI is not
+
1375 // available).
+
1376 //
+
1377 // And in order to not be blocked, it must be ensured that the actual communication task has
+
1378 // been scheduled.
+
1379 const SizeType j_tile_current = ij_offset.col() / dist.blockSize().cols();
+
1380 const SizeType j_tile_next = at_offset.col() / dist.blockSize().cols();
+
1381 const bool isNextColumnOnSameRank = (j_tile_current == j_tile_next);
+
1382 const comm::IndexT_MPI rank_next_col =
+
1383 isNextColumnOnSameRank ? rank_v0.col() : (rank_v0.col() + 1) % dist.commGridSize().cols();
+
1384
+
1385 if (rank.col() == rank_next_col) {
+
1386 const LocalTileIndex at{
+
1387 dist.template nextLocalTileFromGlobalElement<Coord::Row>(at_offset.row()),
+
1388 dist.template nextLocalTileFromGlobalElement<Coord::Col>(at_offset.col()),
+
1389 };
1390
-
1391 if (at_tile_col == dist.nrTiles().cols() - 1) {
-
1392 const comm::IndexT_MPI owner = rank_v0.row();
-
1393 if (rank.row() == owner) {
-
1394 xt.setTile(at, x.read(at));
-
1395
-
1396 if (dist.commGridSize().rows() > 1)
-
1397 ex::start_detached(comm::schedule_bcast_send(mpi_col_chain.exclusive(), xt.read(at)));
-
1398 }
-
1399 else {
-
1400 if (dist.commGridSize().rows() > 1)
-
1401 ex::start_detached(comm::schedule_bcast_recv(mpi_col_chain.exclusive(), owner,
-
1402 xt.readwrite(at)));
-
1403 }
-
1404 }
-
1405
-
1406 if constexpr (dlaf::comm::CommunicationDevice_v<D> == D) {
-
1407 // Note:
-
1408 // if there is no need for additional buffers, we can just wait until xt[0] is ready for
-
1409 // reading.
-
1410 if (rank.row() == rank_v0.row()) {
-
1411 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
+
1391 // Note:
+
1392 // This additional communication of the last tile is a workaround to support the following trigger
+
1393 // when b < mb.
+
1394 // Indeed, if b < mb the last column has (at least) a panel to compute, but unlike
+
1395 // other columns, the transposed broadcast doesn't communicate the last tile, which is an assumption
+
1396 // needed to make the following trigger work correctly.
+
1397 const SizeType at_tile_col =
+
1398 dist.template globalTileFromGlobalElement<Coord::Col>(at_offset.col());
+
1399
+
1400 if (at_tile_col == dist.nrTiles().cols() - 1) {
+
1401 const comm::IndexT_MPI owner = rank_v0.row();
+
1402 if (rank.row() == owner) {
+
1403 xt.setTile(at, x.read(at));
+
1404
+
1405 if (dist.commGridSize().rows() > 1)
+
1406 ex::start_detached(comm::schedule_bcast_send(mpi_col_chain.exclusive(), xt.read(at)));
+
1407 }
+
1408 else {
+
1409 if (dist.commGridSize().rows() > 1)
+
1410 ex::start_detached(comm::schedule_bcast_recv(mpi_col_chain.exclusive(), owner,
+
1411 xt.readwrite(at)));
1412 }
-
1413 else {
-
1414 // Note:
-
1415 // Conservatively ensure that xt[0] needed for updating the first column has been
-
1416 // received. Just wait for xt because communication of x happens over rows, while the
-
1417 // pivot rank can just block ranks in the same column.
-
1418 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
-
1419 }
-
1420 }
-
1421 else {
-
1422 if (rank.row() == rank_v0.row()) {
+
1413 }
+
1414
+
1415 if constexpr (dlaf::comm::CommunicationDevice_v<D> == D) {
+
1416 // Note:
+
1417 // if there is no need for additional buffers, we can just wait until xt[0] is ready for
+
1418 // reading.
+
1419 if (rank.row() == rank_v0.row()) {
+
1420 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
+
1421 }
+
1422 else {
1423 // Note:
-
1424 // on the pivot rank, i.e. the one that would quickly go to the next panel and block, from
-
1425 // the implementation we know that xt[0] is set as an external tile pointing to x[0].
-
1426 // We cannot wait on xt readwrite (because it is an external tile in a panel, which constrains
-
1427 // it to be just readable), but we can wait on its source x[0]. This has a subtle implication,
-
1428 // since we will wait not just for the communication to be complete (which is already more
-
1429 // than what is needed), but we will also wait until xt[0] is released, i.e. after all local
-
1430 // communication and computation on the first column of the trailing matrix have completed.
-
1431 trigger_panel = x.readwrite(at) | ex::drop_value() | ex::ensure_started();
-
1432 }
-
1433 else {
-
1434 // Note:
-
1435 // Conservatively ensure that xt[0] needed for updating the first column has been
-
1436 // received. Just wait for xt because communication of x happens over rows, while the
-
1437 // pivot rank can just block ranks in the same column.
-
1438 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
-
1439 }
-
1440 }
-
1441 }
-
1442
-
1443 // At -= X . V* + V . X*
-
1444 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, vt, v, xt);
-
1445
-
1446 xt.reset();
-
1447 x.reset();
-
1448 wt.reset();
-
1449 w.reset();
-
1450 vt.reset();
-
1451 v.reset();
-
1452 }
-
1453
-
1454#ifdef DLAF_WITH_HDF5
-
1455 if (getTuneParameters().debug_dump_reduction_to_band_data) {
-
1456 file->write(mat_a, "/band");
-
1457 }
-
1458
-
1459 num_reduction_to_band_calls++;
-
1460#endif
-
1461
-
1462 return mat_taus;
-
1463}
-
1464}
+
1424 // Conservatively ensure that xt[0] needed for updating the first column has been
+
1425 // received. Just wait for xt because communication of x happens over rows, while the
+
1426 // pivot rank can just block ranks in the same column.
+
1427 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
+
1428 }
+
1429 }
+
1430 else {
+
1431 if (rank.row() == rank_v0.row()) {
+
1432 // Note:
+
1433 // on the pivot rank, i.e. the one that would quickly go to the next panel and block, from
+
1434 // the implementation we know that xt[0] is set as an external tile pointing to x[0].
+
1435 // We cannot wait on xt readwrite (because it is an external tile in a panel, which constrains
+
1436 // it to be just readable), but we can wait on its source x[0]. This has a subtle implication,
+
1437 // since we will wait not just for the communication to be complete (which is already more
+
1438 // than what is needed), but we will also wait until xt[0] is released, i.e. after all local
+
1439 // communication and computation on the first column of the trailing matrix have completed.
+
1440 trigger_panel = x.readwrite(at) | ex::drop_value() | ex::ensure_started();
+
1441 }
+
1442 else {
+
1443 // Note:
+
1444 // Conservatively ensure that xt[0] needed for updating the first column has been
+
1445 // received. Just wait for xt because communication of x happens over rows, while the
+
1446 // pivot rank can just block ranks in the same column.
+
1447 trigger_panel = xt.read(at) | ex::drop_value() | ex::ensure_started();
+
1448 }
+
1449 }
+
1450 }
+
1451
+
1452 // At -= X . V* + V . X*
+
1453 her2kUpdateTrailingMatrix<B>(trailing_matrix_view, mat_a, x, vt, v, xt);
+
1454
+
1455 xt.reset();
+
1456 x.reset();
+
1457 wt.reset();
+
1458 w.reset();
+
1459 vt.reset();
+
1460 v.reset();
+
1461 }
+
1462
+
1463#ifdef DLAF_WITH_HDF5
+
1464 if (getTuneParameters().debug_dump_reduction_to_band_data) {
+
1465 file->write(mat_a, "/band");
+
1466 }
+
1467
+
1468 num_reduction_to_band_calls++;
+
1469#endif
+
1470
+
1471 return mat_taus;
+
1472}
+
1473}
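The trigger_panel mechanism explained in the long note above boils down to one sender-adaptor idiom: eagerly start the dependency with ensure_started() so the communication task is guaranteed to be scheduled, and strip its value with drop_value() so the next panel computation can be chained on a void sender. A reduced sketch of just that idiom, with a ready just() sender standing in for xt.read(at) (illustrative, not the DLA-Future API):

#include <pika/execution.hpp>

namespace ex = pika::execution::experimental;

// Build a void "trigger" sender whose upstream work has already been started.
ex::unique_any_sender<> make_trigger_sketch() {
  return ex::just(42)            // stand-in for xt.read(at)
         | ex::drop_value()      // discard the tile, keep only the dependency
         | ex::ensure_started(); // eagerly schedule the upstream work now
}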
void gemm(const blas::Op op_a, const blas::Op op_b, const T alpha, const Tile< const T, D > &a, const Tile< const T, D > &b, const T beta, const Tile< T, D > &c)
@@ -1585,7 +1594,7 @@
-
+
Definition panel.h:589
Definition views.h:132
auto iteratorLocal() const noexcept
Return a Range2D that gives access to all local tiles part of the View.
Definition views.h:70
diff --git a/master/get__red2band__panel__nworkers_8h_source.html b/master/get__red2band__panel__nworkers_8h_source.html
index 9199dd87e9..bf06866e51 100644
--- a/master/get__red2band__panel__nworkers_8h_source.html
+++ b/master/get__red2band__panel__nworkers_8h_source.html
@@ -102,7 +102,7 @@
19
20namespace dlaf::eigensolver::internal {
21
-
22inline size_t getReductionToBandPanelNWorkers() noexcept {
+
22inline size_t get_red2band_panel_nworkers() noexcept {
23 // Note: as a precaution we leave at least 1 thread "free" to do other stuff (if possible)
24 const std::size_t available_workers = pika::resource::get_thread_pool("default").get_os_thread_count();
25 const std::size_t min_workers = 1;
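The listing is cut off here. Based on the comment and the names above, the helper presumably returns all OS threads but one, clamped from below by min_workers. A hedged sketch of that logic (an assumption, not the verbatim source):

#include <algorithm>
#include <cstddef>

// Assumed completion of get_red2band_panel_nworkers(): leave one OS thread
// free when possible, but never report fewer than min_workers.
inline std::size_t nworkers_sketch(std::size_t available_workers) noexcept {
  const std::size_t min_workers = 1;
  // available_workers >= 1 on any thread pool, so the subtraction cannot underflow
  return std::max(min_workers, available_workers - 1);
}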