From b31dcad62cd4adce57cc0f3927860e9581cc5f86 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 9 Jun 2023 20:15:02 +0200 Subject: [PATCH 01/48] Refactor HiCFile ctor Move MatrixType and MatrixUnit params from MatrixSelector to HiCFile ctor. --- src/hic/hic_file_impl.hpp | 82 +++++++-------- src/hic/include/hictk/hic.hpp | 31 +++--- test/units/hic/hic_file_test.cpp | 58 +++++------ test/units/hic/matrix_zoom_data_test.cpp | 123 +++++++++++------------ 4 files changed, 140 insertions(+), 154 deletions(-) diff --git a/src/hic/hic_file_impl.hpp b/src/hic/hic_file_impl.hpp index 32b2aa6f..7204ae64 100644 --- a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -16,8 +16,8 @@ namespace hictk { -inline HiCFile::HiCFile(std::string url_) - : _fs(std::make_shared(std::move(url_))) {} +inline HiCFile::HiCFile(std::string url_, MatrixType type_, MatrixUnit unit_) + : _fs(std::make_shared(std::move(url_))), _type(type_), _unit(unit_) {} inline const std::string& HiCFile::url() const noexcept { return _fs->url(); } @@ -55,45 +55,45 @@ inline std::shared_ptr HiCFile::get_footer( return node.first->second; } -inline internal::MatrixSelector HiCFile::get_matrix_selector( - const Chromosome& chrom, MatrixType matrix_type, NormalizationMethod norm, MatrixUnit unit, - std::uint32_t resolution, std::size_t block_cache_capacity) { - return get_matrix_selector(chrom, chrom, matrix_type, norm, unit, resolution, - block_cache_capacity); +inline internal::MatrixSelector HiCFile::get_matrix_selector(const Chromosome& chrom, + NormalizationMethod norm, + std::uint32_t resolution, + std::size_t block_cache_capacity) { + return get_matrix_selector(chrom, chrom, norm, resolution, block_cache_capacity); } -inline internal::MatrixSelector HiCFile::get_matrix_selector( - const std::string& chromName, MatrixType matrix_type, NormalizationMethod norm, MatrixUnit unit, - std::uint32_t resolution, std::size_t block_cache_capacity) { - 
return get_matrix_selector(chromName, chromName, matrix_type, norm, unit, resolution, - block_cache_capacity); +inline internal::MatrixSelector HiCFile::get_matrix_selector(const std::string& chromName, + NormalizationMethod norm, + std::uint32_t resolution, + std::size_t block_cache_capacity) { + return get_matrix_selector(chromName, chromName, norm, resolution, block_cache_capacity); } -inline internal::MatrixSelector HiCFile::get_matrix_selector( - std::uint32_t chrom_id, MatrixType matrix_type, NormalizationMethod norm, MatrixUnit unit, - std::uint32_t resolution, std::size_t block_cache_capacity) { - return get_matrix_selector(chrom_id, chrom_id, matrix_type, norm, unit, resolution, - block_cache_capacity); +inline internal::MatrixSelector HiCFile::get_matrix_selector(std::uint32_t chrom_id, + NormalizationMethod norm, + std::uint32_t resolution, + std::size_t block_cache_capacity) { + return get_matrix_selector(chrom_id, chrom_id, norm, resolution, block_cache_capacity); } -inline internal::MatrixSelector HiCFile::get_matrix_selector( - const Chromosome& chrom1, const Chromosome& chrom2, MatrixType matrix_type, - NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution, - std::size_t block_cache_capacity) { - return get_matrix_selector(chrom1.id(), chrom2.id(), matrix_type, norm, unit, resolution, - block_cache_capacity); +inline internal::MatrixSelector HiCFile::get_matrix_selector(const Chromosome& chrom1, + const Chromosome& chrom2, + NormalizationMethod norm, + std::uint32_t resolution, + std::size_t block_cache_capacity) { + return get_matrix_selector(chrom1.id(), chrom2.id(), norm, resolution, block_cache_capacity); } -inline internal::MatrixSelector HiCFile::get_matrix_selector( - const std::string& chrom1_name, const std::string& chrom2_name, MatrixType matrix_type, - NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution, - std::size_t block_cache_capacity) { +inline internal::MatrixSelector HiCFile::get_matrix_selector(const 
std::string& chrom1_name, + const std::string& chrom2_name, + NormalizationMethod norm, + std::uint32_t resolution, + std::size_t block_cache_capacity) { const auto it1 = chromosomes().find(chrom1_name); if (it1 == chromosomes().end()) { throw std::runtime_error( fmt::format(FMT_STRING("unable to find chromosome named {}"), chrom1_name)); } if (chrom1_name == chrom2_name) { - return get_matrix_selector(*it1, *it1, matrix_type, norm, unit, resolution, - block_cache_capacity); + return get_matrix_selector(*it1, *it1, norm, resolution, block_cache_capacity); } const auto it2 = chromosomes().find(chrom2_name); @@ -102,13 +102,14 @@ inline internal::MatrixSelector HiCFile::get_matrix_selector( fmt::format(FMT_STRING("unable to find chromosome named {}"), chrom2_name)); } - return get_matrix_selector(*it1, *it2, matrix_type, norm, unit, resolution, block_cache_capacity); + return get_matrix_selector(*it1, *it2, norm, resolution, block_cache_capacity); } -inline internal::MatrixSelector HiCFile::get_matrix_selector( - std::uint32_t chrom1_id, std::uint32_t chrom2_id, MatrixType matrix_type, - NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution, - std::size_t block_cache_capacity) { +inline internal::MatrixSelector HiCFile::get_matrix_selector(std::uint32_t chrom1_id, + std::uint32_t chrom2_id, + NormalizationMethod norm, + std::uint32_t resolution, + std::size_t block_cache_capacity) { if (chrom1_id >= std::int64_t(chromosomes().size())) { throw std::runtime_error( fmt::format(FMT_STRING("unable to find chromosome corresponding to ID {}"), chrom1_id)); @@ -123,10 +124,9 @@ inline internal::MatrixSelector HiCFile::get_matrix_selector( "Query overlaps the lower-triangle of the matrix. 
This is currently not supported."); } - if (matrix_type == MatrixType::expected && norm != NormalizationMethod::NONE) { - throw std::logic_error( - fmt::format(FMT_STRING("matrix type {} is incompatible with normalization method {}"), - matrix_type, norm)); + if (_type == MatrixType::expected && norm != NormalizationMethod::NONE) { + throw std::logic_error(fmt::format( + FMT_STRING("matrix type {} is incompatible with normalization method {}"), _type, norm)); } const auto it = std::find(resolutions().begin(), resolutions().end(), resolution); @@ -139,7 +139,7 @@ inline internal::MatrixSelector HiCFile::get_matrix_selector( try { return internal::MatrixSelector( - _fs, get_footer(chrom1_id, chrom2_id, matrix_type, norm, unit, resolution), + _fs, get_footer(chrom1_id, chrom2_id, _type, norm, _unit, resolution), block_cache_capacity); } catch (const std::exception& e) { // Check whether query is valid but there are no interactions for the given chromosome pair @@ -150,9 +150,9 @@ inline internal::MatrixSelector HiCFile::get_matrix_selector( } internal::HiCFooterMetadata metadata{url(), - matrix_type, + _type, norm, - unit, + _unit, resolution, _fs->header().chromosomes.at(chrom1_id), _fs->header().chromosomes.at(chrom2_id), diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index 3b80a78b..2129a07c 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -28,10 +28,13 @@ class HiCFile { // clang-format on std::shared_ptr _fs{}; FooterCacheT _footers{}; + MatrixType _type{MatrixType::observed}; + MatrixUnit _unit{MatrixUnit::BP}; public: static constexpr std::size_t DEFAULT_BLOCK_CACHE_CAPACITY = 500UL << 20U; // ~500MB - explicit HiCFile(std::string url_); + explicit HiCFile(std::string url_, MatrixType type_ = MatrixType::observed, + MatrixUnit unit_ = MatrixUnit::BP); [[nodiscard]] const std::string &url() const noexcept; [[nodiscard]] const std::string &name() const noexcept; @@ -41,28 +44,24 @@ class HiCFile { 
[[nodiscard]] const std::vector &resolutions() const noexcept; [[nodiscard]] internal::MatrixSelector get_matrix_selector( - const Chromosome &chrom, MatrixType matrix_type, NormalizationMethod norm, MatrixUnit unit, - std::uint32_t resolution, std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); + const Chromosome &chrom, NormalizationMethod norm, std::uint32_t resolution, + std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); [[nodiscard]] internal::MatrixSelector get_matrix_selector( - const std::string &chromName, MatrixType matrix_type, NormalizationMethod norm, - MatrixUnit unit, std::uint32_t resolution, + const std::string &chromName, NormalizationMethod norm, std::uint32_t resolution, std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); [[nodiscard]] internal::MatrixSelector get_matrix_selector( - std::uint32_t chrom_id, MatrixType matrix_type, NormalizationMethod norm, MatrixUnit unit, - std::uint32_t resolution, std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); + std::uint32_t chrom_id, NormalizationMethod norm, std::uint32_t resolution, + std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); [[nodiscard]] internal::MatrixSelector get_matrix_selector( - const Chromosome &chrom1, const Chromosome &chrom2, MatrixType matrix_type, - NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution, - std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); + const Chromosome &chrom1, const Chromosome &chrom2, NormalizationMethod norm, + std::uint32_t resolution, std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); [[nodiscard]] internal::MatrixSelector get_matrix_selector( - const std::string &chrom1_name, const std::string &chrom2_name, MatrixType matrix_type, - NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution, - std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); + const std::string &chrom1_name, const std::string &chrom2_name, 
NormalizationMethod norm, + std::uint32_t resolution, std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); [[nodiscard]] internal::MatrixSelector get_matrix_selector( - std::uint32_t chrom1_id, std::uint32_t chrom2_id, MatrixType matrix_type, - NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution, - std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); + std::uint32_t chrom1_id, std::uint32_t chrom2_id, NormalizationMethod norm, + std::uint32_t resolution, std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); [[nodiscard]] std::size_t num_cached_footers() const noexcept; void purge_footer_cache(); diff --git a/test/units/hic/hic_file_test.cpp b/test/units/hic/hic_file_test.cpp index 3a8484ac..f0e7ebf6 100644 --- a/test/units/hic/hic_file_test.cpp +++ b/test/units/hic/hic_file_test.cpp @@ -47,22 +47,19 @@ TEST_CASE("HiCFile accessors", "[hic][short]") { // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("HiCFile footer cache", "[hic][short]") { - HiCFile f(pathV8); + HiCFile f(pathV8, MatrixType::observed, MatrixUnit::BP); REQUIRE(f.resolutions().size() == 10); CHECK(f.num_cached_footers() == 0); for (const auto res : f.resolutions()) { - std::ignore = f.get_matrix_selector("chr2L", MatrixType::observed, NormalizationMethod::NONE, - MatrixUnit::BP, res); + std::ignore = f.get_matrix_selector("chr2L", NormalizationMethod::NONE, res); } CHECK(f.num_cached_footers() == f.resolutions().size()); - const auto sel1 = f.get_matrix_selector("chr2L", MatrixType::observed, NormalizationMethod::NONE, - MatrixUnit::BP, 2500000); - const auto sel2 = f.get_matrix_selector("chr2L", MatrixType::observed, NormalizationMethod::NONE, - MatrixUnit::BP, 2500000); + const auto sel1 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE, 2500000); + const auto sel2 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE, 2500000); // this check relies on the fact that chrom1Norm are stored in the footer, and 
that footers are // looked up in the cache when creating matrix selectors @@ -71,8 +68,7 @@ TEST_CASE("HiCFile footer cache", "[hic][short]") { f.purge_footer_cache(); CHECK(f.num_cached_footers() == 0); - const auto sel3 = f.get_matrix_selector("chr2L", MatrixType::observed, NormalizationMethod::NONE, - MatrixUnit::BP, 2500000); + const auto sel3 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE, 2500000); CHECK(f.num_cached_footers() == 1); CHECK(&sel1.chrom1Norm() != &sel3.chrom1Norm()); @@ -80,42 +76,40 @@ TEST_CASE("HiCFile footer cache", "[hic][short]") { // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("HiCFile get_matrix_selector", "[hic][short]") { - HiCFile f(pathV8); - - REQUIRE(f.chromosomes().size() == 9); - - constexpr auto mt = MatrixType::observed; constexpr auto norm = NormalizationMethod::NONE; - constexpr auto unit = MatrixUnit::BP; constexpr std::int32_t res = 2500000; + HiCFile f(pathV8, MatrixType::observed, MatrixUnit::BP); + + REQUIRE(f.chromosomes().size() == 9); + const auto chrom1 = f.chromosomes().at("chr2L"); const auto chrom2 = f.chromosomes().at("chr2R"); SECTION("intra-chromosomal") { - auto sel = f.get_matrix_selector(chrom1, mt, norm, unit, res); + auto sel = f.get_matrix_selector(chrom1, norm, res); CHECK(sel.chrom1() == chrom1); CHECK(sel.isIntra()); - sel = f.get_matrix_selector(chrom1.id(), mt, norm, unit, res); + sel = f.get_matrix_selector(chrom1.id(), norm, res); CHECK(sel.chrom1() == chrom1); CHECK(sel.isIntra()); - sel = f.get_matrix_selector(chrom1, mt, norm, unit, res); + sel = f.get_matrix_selector(chrom1, norm, res); CHECK(sel.chrom1() == chrom1); CHECK(sel.isIntra()); } SECTION("inter-chromosomal") { - auto sel = f.get_matrix_selector(chrom1, chrom2, mt, norm, unit, res); + auto sel = f.get_matrix_selector(chrom1, chrom2, norm, res); CHECK(sel.chrom1() == chrom1); CHECK(sel.chrom2() == chrom2); - sel = f.get_matrix_selector(chrom1.id(), chrom2.id(), mt, norm, unit, res); + sel = 
f.get_matrix_selector(chrom1.id(), chrom2.id(), norm, res); CHECK(sel.chrom1() == chrom1); CHECK(sel.chrom2() == chrom2); - sel = f.get_matrix_selector(chrom1, chrom2, mt, norm, unit, res); + sel = f.get_matrix_selector(chrom1, chrom2, norm, res); CHECK(sel.chrom1() == chrom1); CHECK(sel.chrom2() == chrom2); @@ -124,27 +118,27 @@ TEST_CASE("HiCFile get_matrix_selector", "[hic][short]") { } SECTION("valid, but empty matrix") { - auto sel = f.get_matrix_selector("chrM", mt, norm, unit, res); + auto sel = f.get_matrix_selector("chrM", norm, res); std::vector> buff{}; sel.fetch(buff); CHECK(buff.empty()); } SECTION("invalid chromosome") { - CHECK_THROWS(f.get_matrix_selector("not-a-chromosome", mt, norm, unit, res)); - CHECK_THROWS( - f.get_matrix_selector(std::string{chrom1.name()}, "not-a-chromosome", mt, norm, unit, res)); - CHECK_THROWS(f.get_matrix_selector(999, mt, norm, unit, res)); - CHECK_THROWS(f.get_matrix_selector(chrom1.id(), 999, mt, norm, unit, res)); + CHECK_THROWS(f.get_matrix_selector("not-a-chromosome", norm, res)); + CHECK_THROWS(f.get_matrix_selector(std::string{chrom1.name()}, "not-a-chromosome", norm, res)); + CHECK_THROWS(f.get_matrix_selector(999, norm, res)); + CHECK_THROWS(f.get_matrix_selector(chrom1.id(), 999, norm, res)); } SECTION("malformed") { - CHECK_THROWS(f.get_matrix_selector(chrom2, chrom1, mt, norm, unit, res)); // NOLINT - CHECK_THROWS(f.get_matrix_selector(chrom1, mt, norm, unit, 123)); - CHECK_THROWS( - f.get_matrix_selector(chrom1, MatrixType::expected, NormalizationMethod::VC, unit, res)); + CHECK_THROWS(f.get_matrix_selector(chrom2, chrom1, norm, res)); // NOLINT + CHECK_THROWS(f.get_matrix_selector(chrom1, norm, 123)); + CHECK_THROWS(HiCFile(pathV8, MatrixType::expected, MatrixUnit::BP) + .get_matrix_selector(chrom1, NormalizationMethod::VC, res)); // Matrix does not have contacts for fragments - CHECK_THROWS(f.get_matrix_selector(chrom1, mt, norm, MatrixUnit::FRAG, res)); + CHECK_THROWS(HiCFile(pathV8, 
MatrixType::observed, MatrixUnit::FRAG) + .get_matrix_selector(chrom1, norm, res)); } } diff --git a/test/units/hic/matrix_zoom_data_test.cpp b/test/units/hic/matrix_zoom_data_test.cpp index eeed7385..2a2a3364 100644 --- a/test/units/hic/matrix_zoom_data_test.cpp +++ b/test/units/hic/matrix_zoom_data_test.cpp @@ -72,8 +72,8 @@ static void compareContactRecord(const Pixel& r1, const SerializedPixel& // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("MatrixSelector accessors", "[hic][short]") { - const auto sel = HiCFile(pathV8).get_matrix_selector( - "chr2L", MatrixType::observed, NormalizationMethod::NONE, MatrixUnit::BP, 2500000); + const auto sel = HiCFile(pathV8, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE, 2500000); CHECK(sel.chrom1().name() == "chr2L"); CHECK(sel.chrom2().name() == "chr2L"); @@ -90,10 +90,9 @@ TEST_CASE("MatrixSelector accessors", "[hic][short]") { // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("MatrixSelector LRU cache", "[hic][short]") { std::vector> buffer; - HiCFile f(pathV8); + HiCFile f(pathV8, MatrixType::observed, MatrixUnit::BP); - auto sel = f.get_matrix_selector("chr2L", MatrixType::observed, NormalizationMethod::NONE, - MatrixUnit::BP, 10000); + auto sel = f.get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); CHECK(sel.blockCacheHitRate() == 0.0); CHECK(sel.blockCacheSize() == 0); @@ -132,8 +131,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { std::make_pair(std::size_t(1229799), SerializedPixel{15770000, 15770000, 1234.0F}); SECTION("v8") { - auto sel = HiCFile(pathV8).get_matrix_selector( - "chr2L", MatrixType::observed, NormalizationMethod::NONE, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV8, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK(sumCounts(buffer) 
== expected_sum); @@ -149,8 +148,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { compareContactRecord(buffer[expected_value.first], expected_value.second); } SECTION("v9") { - auto sel = HiCFile(pathV9).get_matrix_selector( - "chr2L", MatrixType::observed, NormalizationMethod::NONE, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK(sumCounts(buffer) == expected_sum); @@ -179,8 +178,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { std::make_pair(std::size_t(3541), SerializedPixel{770000, 1300000, 13.0F}); SECTION("v8") { - auto sel = HiCFile(pathV8).get_matrix_selector( - "chr2L", "chr4", MatrixType::observed, NormalizationMethod::NONE, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV8, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK(sumCounts(buffer) == expected_sum); @@ -197,8 +196,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } SECTION("v9") { - auto sel = HiCFile(pathV9).get_matrix_selector( - "chr2L", "chr4", MatrixType::observed, NormalizationMethod::NONE, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK(sumCounts(buffer) == expected_sum); @@ -215,9 +214,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } SECTION("cover type 2 interactions") { - auto sel = - HiCFile(pathV8).get_matrix_selector("chr2L", "chr2R", MatrixType::observed, - NormalizationMethod::NONE, MatrixUnit::BP, 2500000); + auto sel = HiCFile(pathV8, 
MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr2R", NormalizationMethod::NONE, 2500000); sel.fetch(buffer, true); REQUIRE(buffer.size() == 110); CHECK(sumCounts(buffer) == 1483112); @@ -228,16 +226,16 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { SECTION("sub-queries") { const std::uint32_t resolution = 10000; SECTION("single pixel") { - auto sel = HiCFile(pathV9).get_matrix_selector( - "chr2L", MatrixType::observed, NormalizationMethod::NONE, MatrixUnit::BP, resolution); + auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE, resolution); sel.fetch(100000, 100001, 100000, 100001, buffer); REQUIRE(buffer.size() == 1); compareContactRecord(buffer.front(), SerializedPixel{100000, 100000, 13895.0F}); } SECTION("upper-triangle") { - auto sel = HiCFile(pathV9).get_matrix_selector( - "chr2L", MatrixType::observed, NormalizationMethod::NONE, MatrixUnit::BP, resolution); + auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE, resolution); sel.fetch(123456, 200000, 0, 200000, buffer, true); REQUIRE(buffer.size() == 132); CHECK(sumCounts(buffer) == 124561); @@ -247,8 +245,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } SECTION("lower-triangle") { - auto sel = HiCFile(pathV9).get_matrix_selector( - "chr2L", MatrixType::observed, NormalizationMethod::NONE, MatrixUnit::BP, resolution); + auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE, resolution); sel.fetch(0, 200000, 123456, 200000, buffer, true); REQUIRE(buffer.size() == 132); CHECK(sumCounts(buffer) == 124561); @@ -258,9 +256,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } SECTION("inter-chromosomal") { - auto sel = HiCFile(pathV9).get_matrix_selector("chr2L", "chr4", 
MatrixType::observed, - NormalizationMethod::NONE, MatrixUnit::BP, - resolution); + auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, resolution); sel.fetch(123456, 200000, 0, 200000, buffer); REQUIRE(buffer.size() == 57); CHECK(sumCounts(buffer) == 74); @@ -270,33 +267,29 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } SECTION("invalid") { - HiCFile hic(pathV9); - SECTION("invalid chromosome") { - CHECK_THROWS(hic.get_matrix_selector("chr123", MatrixType::observed, - NormalizationMethod::NONE, MatrixUnit::BP, 10000)); - CHECK_THROWS(hic.get_matrix_selector(999, MatrixType::observed, NormalizationMethod::NONE, - MatrixUnit::BP, 10000)); + HiCFile hic(pathV9, MatrixType::observed, MatrixUnit::BP); + CHECK_THROWS(hic.get_matrix_selector("chr123", NormalizationMethod::NONE, 10000)); + CHECK_THROWS(hic.get_matrix_selector(999, NormalizationMethod::NONE, 10000)); } SECTION("invalid resolution") { - CHECK_THROWS(hic.get_matrix_selector("chr2L", MatrixType::observed, - NormalizationMethod::NONE, MatrixUnit::BP, + HiCFile hic(pathV9, MatrixType::observed, MatrixUnit::BP); + CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE, (std::numeric_limits::max)())); } SECTION("invalid unit") { - CHECK_THROWS(hic.get_matrix_selector("chr2L", MatrixType::observed, - NormalizationMethod::NONE, MatrixUnit::FRAG, 10000)); + HiCFile hic(pathV9, MatrixType::observed, MatrixUnit::FRAG); + CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000)); } SECTION("expected + norm") { - CHECK_THROWS(hic.get_matrix_selector("chr2L", MatrixType::expected, NormalizationMethod::VC, - MatrixUnit::BP, 10000)); + HiCFile hic(pathV9, MatrixType::expected, MatrixUnit::BP); + CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::VC, 10000)); } SECTION("invalid range") { - CHECK_THROWS(hic.get_matrix_selector("chr2L", MatrixType::observed, 
- NormalizationMethod::NONE, MatrixUnit::BP, 10000) + HiCFile hic(pathV9, MatrixType::observed, MatrixUnit::BP); + CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000) .fetch(1000, 0, buffer)); - CHECK_THROWS(hic.get_matrix_selector("chr2L", MatrixType::observed, - NormalizationMethod::NONE, MatrixUnit::BP, 10000) + CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000) .fetch(0, 1'000'000'000, buffer)); } } @@ -310,15 +303,15 @@ TEST_CASE("MatrixSelector fetch (observed VC BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 1433133; constexpr double expected_sum = 20391277.41514; SECTION("v8") { - auto sel = HiCFile(pathV8).get_matrix_selector( - "chr2L", MatrixType::observed, NormalizationMethod::VC, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV8, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::VC, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9).get_matrix_selector( - "chr2L", MatrixType::observed, NormalizationMethod::VC, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::VC, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); @@ -328,16 +321,16 @@ TEST_CASE("MatrixSelector fetch (observed VC BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 56743; constexpr double expected_sum = 96690.056244753; SECTION("v8") { - auto sel = HiCFile(pathV8).get_matrix_selector( - "chr2L", "chr4", MatrixType::observed, NormalizationMethod::VC, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV8, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::VC, 10000); 
sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9).get_matrix_selector( - "chr2L", "chr4", MatrixType::observed, NormalizationMethod::VC, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::VC, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); @@ -352,15 +345,15 @@ TEST_CASE("MatrixSelector fetch (expected NONE BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 1433133; constexpr double expected_sum = 18314748.068024; SECTION("v8") { - auto sel = HiCFile(pathV8).get_matrix_selector( - "chr2L", MatrixType::expected, NormalizationMethod::NONE, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV8, MatrixType::expected, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9).get_matrix_selector( - "chr2L", MatrixType::expected, NormalizationMethod::NONE, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV9, MatrixType::expected, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); @@ -370,16 +363,16 @@ TEST_CASE("MatrixSelector fetch (expected NONE BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 56743; constexpr double expected_sum = 12610.80619812; SECTION("v8") { - auto sel = HiCFile(pathV8).get_matrix_selector( - "chr2L", "chr4", MatrixType::expected, NormalizationMethod::NONE, MatrixUnit::BP, 
10000); + auto sel = HiCFile(pathV8, MatrixType::expected, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9).get_matrix_selector( - "chr2L", "chr4", MatrixType::expected, NormalizationMethod::NONE, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV9, MatrixType::expected, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); @@ -394,15 +387,15 @@ TEST_CASE("MatrixSelector fetch (oe NONE BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 1433133; constexpr double expected_sum = 2785506.2274201; SECTION("v8") { - auto sel = HiCFile(pathV8).get_matrix_selector( - "chr2L", MatrixType::oe, NormalizationMethod::NONE, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV8, MatrixType::oe, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9).get_matrix_selector( - "chr2L", MatrixType::oe, NormalizationMethod::NONE, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV9, MatrixType::oe, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); @@ -412,16 +405,16 @@ TEST_CASE("MatrixSelector fetch (oe NONE BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 56743; constexpr double expected_sum = 317520.00459671; SECTION("v8") { - auto sel = 
HiCFile(pathV8).get_matrix_selector( - "chr2L", "chr4", MatrixType::oe, NormalizationMethod::NONE, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV8, MatrixType::oe, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9).get_matrix_selector( - "chr2L", "chr4", MatrixType::oe, NormalizationMethod::NONE, MatrixUnit::BP, 10000); + auto sel = HiCFile(pathV9, MatrixType::oe, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); From 77dbae6eeb0d4de89f2f56dcf067cfc1e7f7ab97 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 9 Jun 2023 21:06:04 +0200 Subject: [PATCH 02/48] Move resolution and cache_capacity params from matrix selector to hic ctor --- src/hic/hic_file_impl.hpp | 78 ++++++++------- src/hic/hic_file_stream_impl.hpp | 4 +- src/hic/include/hictk/hic.hpp | 48 ++++----- test/units/hic/hic_file_test.cpp | 75 +++++++------- test/units/hic/matrix_zoom_data_test.cpp | 121 +++++++++++------------ 5 files changed, 164 insertions(+), 162 deletions(-) diff --git a/src/hic/hic_file_impl.hpp b/src/hic/hic_file_impl.hpp index 7204ae64..d09b4cff 100644 --- a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -16,8 +16,28 @@ namespace hictk { -inline HiCFile::HiCFile(std::string url_, MatrixType type_, MatrixUnit unit_) - : _fs(std::make_shared(std::move(url_))), _type(type_), _unit(unit_) {} +inline HiCFile::HiCFile(std::string url_, std::uint32_t resolution_, MatrixType type_, + MatrixUnit unit_, std::uint64_t block_cache_capacity) + : _fs(std::make_shared(std::move(url_))), + _type(type_), + 
_unit(unit_), + _block_cache(block_cache_capacity), + _bins(chromosomes(), resolution_) { + assert(block_cache_capacity != 0); + if (!has_resolution(resolution())) { + throw std::runtime_error(fmt::format( + FMT_STRING("file {} does not have interactions for resolution {}"), url(), resolution())); + } +} + +inline HiCFile HiCFile::open_resolution(std::uint32_t resolution) const { + return HiCFile(url(), resolution, _type, _unit); +} + +inline bool HiCFile::has_resolution(std::uint32_t resolution) const { + const auto match = std::find(avail_resolutions().begin(), avail_resolutions().end(), resolution); + return match != avail_resolutions().end(); +} inline const std::string& HiCFile::url() const noexcept { return _fs->url(); } @@ -29,10 +49,12 @@ inline const Reference& HiCFile::chromosomes() const noexcept { return _fs->head inline const std::string& HiCFile::assembly() const noexcept { return _fs->header().genomeID; } -inline const std::vector& HiCFile::resolutions() const noexcept { +inline const std::vector& HiCFile::avail_resolutions() const noexcept { return _fs->header().resolutions; } +constexpr std::uint32_t HiCFile::resolution() const noexcept { return _bins.bin_size(); } + inline std::shared_ptr HiCFile::get_footer( std::uint32_t chrom1_id, std::uint32_t chrom2_id, MatrixType matrix_type, NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution) { @@ -56,44 +78,34 @@ inline std::shared_ptr HiCFile::get_footer( } inline internal::MatrixSelector HiCFile::get_matrix_selector(const Chromosome& chrom, - NormalizationMethod norm, - std::uint32_t resolution, - std::size_t block_cache_capacity) { - return get_matrix_selector(chrom, chrom, norm, resolution, block_cache_capacity); + NormalizationMethod norm) { + return get_matrix_selector(chrom, chrom, norm); } inline internal::MatrixSelector HiCFile::get_matrix_selector(const std::string& chromName, - NormalizationMethod norm, - std::uint32_t resolution, - std::size_t block_cache_capacity) { - return 
get_matrix_selector(chromName, chromName, norm, resolution, block_cache_capacity); + NormalizationMethod norm) { + return get_matrix_selector(chromName, chromName, norm); } inline internal::MatrixSelector HiCFile::get_matrix_selector(std::uint32_t chrom_id, - NormalizationMethod norm, - std::uint32_t resolution, - std::size_t block_cache_capacity) { - return get_matrix_selector(chrom_id, chrom_id, norm, resolution, block_cache_capacity); + NormalizationMethod norm) { + return get_matrix_selector(chrom_id, chrom_id, norm); } inline internal::MatrixSelector HiCFile::get_matrix_selector(const Chromosome& chrom1, const Chromosome& chrom2, - NormalizationMethod norm, - std::uint32_t resolution, - std::size_t block_cache_capacity) { - return get_matrix_selector(chrom1.id(), chrom2.id(), norm, resolution, block_cache_capacity); + NormalizationMethod norm) { + return get_matrix_selector(chrom1.id(), chrom2.id(), norm); } inline internal::MatrixSelector HiCFile::get_matrix_selector(const std::string& chrom1_name, const std::string& chrom2_name, - NormalizationMethod norm, - std::uint32_t resolution, - std::size_t block_cache_capacity) { + NormalizationMethod norm) { const auto it1 = chromosomes().find(chrom1_name); if (it1 == chromosomes().end()) { throw std::runtime_error( fmt::format(FMT_STRING("unable to find chromosome named {}"), chrom1_name)); } if (chrom1_name == chrom2_name) { - return get_matrix_selector(*it1, *it1, norm, resolution, block_cache_capacity); + return get_matrix_selector(*it1, *it1, norm); } const auto it2 = chromosomes().find(chrom2_name); @@ -102,14 +114,12 @@ inline internal::MatrixSelector HiCFile::get_matrix_selector(const std::string& fmt::format(FMT_STRING("unable to find chromosome named {}"), chrom2_name)); } - return get_matrix_selector(*it1, *it2, norm, resolution, block_cache_capacity); + return get_matrix_selector(*it1, *it2, norm); } inline internal::MatrixSelector HiCFile::get_matrix_selector(std::uint32_t chrom1_id, std::uint32_t 
chrom2_id, - NormalizationMethod norm, - std::uint32_t resolution, - std::size_t block_cache_capacity) { + NormalizationMethod norm) { if (chrom1_id >= std::int64_t(chromosomes().size())) { throw std::runtime_error( fmt::format(FMT_STRING("unable to find chromosome corresponding to ID {}"), chrom1_id)); @@ -129,18 +139,10 @@ inline internal::MatrixSelector HiCFile::get_matrix_selector(std::uint32_t chrom FMT_STRING("matrix type {} is incompatible with normalization method {}"), _type, norm)); } - const auto it = std::find(resolutions().begin(), resolutions().end(), resolution); - if (it == resolutions().end()) { - throw std::runtime_error(fmt::format( - FMT_STRING( - "matrix does not have interactions for resolution {}. Available resolutions: {}"), - resolution, fmt::join(_fs->header().resolutions, ", "))); - } - try { return internal::MatrixSelector( - _fs, get_footer(chrom1_id, chrom2_id, _type, norm, _unit, resolution), - block_cache_capacity); + _fs, get_footer(chrom1_id, chrom2_id, _type, norm, _unit, resolution()), + 1'000'000); // TODO: REMOVE CACHE CAPACITY! 
} catch (const std::exception& e) { // Check whether query is valid but there are no interactions for the given chromosome pair const auto missing_footer = @@ -153,7 +155,7 @@ inline internal::MatrixSelector HiCFile::get_matrix_selector(std::uint32_t chrom _type, norm, _unit, - resolution, + resolution(), _fs->header().chromosomes.at(chrom1_id), _fs->header().chromosomes.at(chrom2_id), -1}; diff --git a/src/hic/hic_file_stream_impl.hpp b/src/hic/hic_file_stream_impl.hpp index 3cf9f7bb..2e3bc537 100644 --- a/src/hic/hic_file_stream_impl.hpp +++ b/src/hic/hic_file_stream_impl.hpp @@ -225,8 +225,8 @@ inline void HiCFileStream::readBlockMap(std::int64_t fileOffset, } throw std::runtime_error( - fmt::format(FMT_STRING("Unable to find block map for unit {} and resolution {}"), wantedUnit, - wantedResolution)); + fmt::format(FMT_STRING("Unable to find block map for {}:{} with unit {} and resolution {}"), + chrom1.name(), chrom2.name(), wantedUnit, wantedResolution)); } inline bool HiCFileStream::checkMagicString() { return checkMagicString(*_fs); } diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index 2129a07c..5a31ad9c 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -30,38 +30,42 @@ class HiCFile { FooterCacheT _footers{}; MatrixType _type{MatrixType::observed}; MatrixUnit _unit{MatrixUnit::BP}; + internal::BlockLRUCache _block_cache{}; + BinTable _bins{}; public: - static constexpr std::size_t DEFAULT_BLOCK_CACHE_CAPACITY = 500UL << 20U; // ~500MB - explicit HiCFile(std::string url_, MatrixType type_ = MatrixType::observed, - MatrixUnit unit_ = MatrixUnit::BP); + explicit HiCFile(std::string url_, std::uint32_t resolution_, + MatrixType type_ = MatrixType::observed, MatrixUnit unit_ = MatrixUnit::BP, + // TODO consider expressing cache size in terms of number of pixels + std::uint64_t block_cache_capacity = 500ULL << 20U); + + [[nodiscard]] HiCFile open_resolution(std::uint32_t resolution) const; + 
[[nodiscard]] bool has_resolution(std::uint32_t resolution) const; [[nodiscard]] const std::string &url() const noexcept; [[nodiscard]] const std::string &name() const noexcept; [[nodiscard]] std::int32_t version() const noexcept; [[nodiscard]] const Reference &chromosomes() const noexcept; [[nodiscard]] const std::string &assembly() const noexcept; - [[nodiscard]] const std::vector &resolutions() const noexcept; + [[nodiscard]] const std::vector &avail_resolutions() const noexcept; + [[nodiscard]] constexpr std::uint32_t resolution() const noexcept; - [[nodiscard]] internal::MatrixSelector get_matrix_selector( - const Chromosome &chrom, NormalizationMethod norm, std::uint32_t resolution, - std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); - [[nodiscard]] internal::MatrixSelector get_matrix_selector( - const std::string &chromName, NormalizationMethod norm, std::uint32_t resolution, - std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); - [[nodiscard]] internal::MatrixSelector get_matrix_selector( - std::uint32_t chrom_id, NormalizationMethod norm, std::uint32_t resolution, - std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); + [[nodiscard]] internal::MatrixSelector get_matrix_selector(const Chromosome &chrom, + NormalizationMethod norm); + [[nodiscard]] internal::MatrixSelector get_matrix_selector(const std::string &chromName, + NormalizationMethod norm); + [[nodiscard]] internal::MatrixSelector get_matrix_selector(std::uint32_t chrom_id, + NormalizationMethod norm); - [[nodiscard]] internal::MatrixSelector get_matrix_selector( - const Chromosome &chrom1, const Chromosome &chrom2, NormalizationMethod norm, - std::uint32_t resolution, std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); - [[nodiscard]] internal::MatrixSelector get_matrix_selector( - const std::string &chrom1_name, const std::string &chrom2_name, NormalizationMethod norm, - std::uint32_t resolution, std::size_t block_cache_capacity = 
DEFAULT_BLOCK_CACHE_CAPACITY); - [[nodiscard]] internal::MatrixSelector get_matrix_selector( - std::uint32_t chrom1_id, std::uint32_t chrom2_id, NormalizationMethod norm, - std::uint32_t resolution, std::size_t block_cache_capacity = DEFAULT_BLOCK_CACHE_CAPACITY); + [[nodiscard]] internal::MatrixSelector get_matrix_selector(const Chromosome &chrom1, + const Chromosome &chrom2, + NormalizationMethod norm); + [[nodiscard]] internal::MatrixSelector get_matrix_selector(const std::string &chrom1_name, + const std::string &chrom2_name, + NormalizationMethod norm); + [[nodiscard]] internal::MatrixSelector get_matrix_selector(std::uint32_t chrom1_id, + std::uint32_t chrom2_id, + NormalizationMethod norm); [[nodiscard]] std::size_t num_cached_footers() const noexcept; void purge_footer_cache(); diff --git a/test/units/hic/hic_file_test.cpp b/test/units/hic/hic_file_test.cpp index f0e7ebf6..f7931d67 100644 --- a/test/units/hic/hic_file_test.cpp +++ b/test/units/hic/hic_file_test.cpp @@ -26,7 +26,7 @@ TEST_CASE("utils: is_hic_file", "[hic][short]") { // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("HiCFile accessors", "[hic][short]") { - const HiCFile f(pathV8); + const HiCFile f(pathV8, 1'000); CHECK(f.url() == pathV8); CHECK(f.name() == pathV8); @@ -34,32 +34,37 @@ TEST_CASE("HiCFile accessors", "[hic][short]") { CHECK(f.chromosomes().size() == 9); CHECK(f.assembly() == "dm6"); - CHECK(f.resolutions().size() == 10); - CHECK(f.resolutions().front() == 2500000); - CHECK(f.resolutions().back() == 1000); + CHECK(f.avail_resolutions().size() == 10); + CHECK(f.avail_resolutions().front() == 2'500'000); + CHECK(f.avail_resolutions().back() == 1000); + + CHECK(f.open_resolution(2'500'000).resolution() == 2'500'000); SECTION("invalid") { - CHECK_THROWS(HiCFile("non-existing-file")); - CHECK_THROWS(HiCFile("https://localhost:non-existing-url")); - CHECK_THROWS(HiCFile("test/CMakeLists.txt")); + CHECK_THROWS(HiCFile(pathV8, (std::numeric_limits::max)(), 
MatrixType::observed, + MatrixUnit::BP)); + CHECK_THROWS(HiCFile("non-existing-file", 1)); + CHECK_THROWS(HiCFile("https://localhost:non-existing-url", 1)); + CHECK_THROWS(HiCFile("test/CMakeLists.txt", 1)); } } // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("HiCFile footer cache", "[hic][short]") { - HiCFile f(pathV8, MatrixType::observed, MatrixUnit::BP); - - REQUIRE(f.resolutions().size() == 10); + HiCFile f(pathV8, 2'500'000); CHECK(f.num_cached_footers() == 0); - for (const auto res : f.resolutions()) { - std::ignore = f.get_matrix_selector("chr2L", NormalizationMethod::NONE, res); + for (const auto& chrom : f.chromosomes()) { + if (chrom.is_all()) { + continue; + } + std::ignore = f.get_matrix_selector(chrom, NormalizationMethod::NONE); } - CHECK(f.num_cached_footers() == f.resolutions().size()); + CHECK(f.num_cached_footers() == 7); - const auto sel1 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE, 2500000); - const auto sel2 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE, 2500000); + const auto sel1 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE); + const auto sel2 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE); // this check relies on the fact that chrom1Norm are stored in the footer, and that footers are // looked up in the cache when creating matrix selectors @@ -68,7 +73,7 @@ TEST_CASE("HiCFile footer cache", "[hic][short]") { f.purge_footer_cache(); CHECK(f.num_cached_footers() == 0); - const auto sel3 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE, 2500000); + const auto sel3 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE); CHECK(f.num_cached_footers() == 1); CHECK(&sel1.chrom1Norm() != &sel3.chrom1Norm()); @@ -77,39 +82,36 @@ TEST_CASE("HiCFile footer cache", "[hic][short]") { // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("HiCFile get_matrix_selector", "[hic][short]") { constexpr auto norm = NormalizationMethod::NONE; - constexpr 
std::int32_t res = 2500000; - - HiCFile f(pathV8, MatrixType::observed, MatrixUnit::BP); + HiCFile f(pathV8, 2'500'000, MatrixType::observed, MatrixUnit::BP); REQUIRE(f.chromosomes().size() == 9); const auto chrom1 = f.chromosomes().at("chr2L"); const auto chrom2 = f.chromosomes().at("chr2R"); - SECTION("intra-chromosomal") { - auto sel = f.get_matrix_selector(chrom1, norm, res); + auto sel = f.get_matrix_selector(chrom1, norm); CHECK(sel.chrom1() == chrom1); CHECK(sel.isIntra()); - sel = f.get_matrix_selector(chrom1.id(), norm, res); + sel = f.get_matrix_selector(chrom1.id(), norm); CHECK(sel.chrom1() == chrom1); CHECK(sel.isIntra()); - sel = f.get_matrix_selector(chrom1, norm, res); + sel = f.get_matrix_selector(chrom1, norm); CHECK(sel.chrom1() == chrom1); CHECK(sel.isIntra()); } SECTION("inter-chromosomal") { - auto sel = f.get_matrix_selector(chrom1, chrom2, norm, res); + auto sel = f.get_matrix_selector(chrom1, chrom2, norm); CHECK(sel.chrom1() == chrom1); CHECK(sel.chrom2() == chrom2); - sel = f.get_matrix_selector(chrom1.id(), chrom2.id(), norm, res); + sel = f.get_matrix_selector(chrom1.id(), chrom2.id(), norm); CHECK(sel.chrom1() == chrom1); CHECK(sel.chrom2() == chrom2); - sel = f.get_matrix_selector(chrom1, chrom2, norm, res); + sel = f.get_matrix_selector(chrom1, chrom2, norm); CHECK(sel.chrom1() == chrom1); CHECK(sel.chrom2() == chrom2); @@ -118,27 +120,26 @@ TEST_CASE("HiCFile get_matrix_selector", "[hic][short]") { } SECTION("valid, but empty matrix") { - auto sel = f.get_matrix_selector("chrM", norm, res); + auto sel = f.get_matrix_selector("chrM", norm); std::vector> buff{}; sel.fetch(buff); CHECK(buff.empty()); } SECTION("invalid chromosome") { - CHECK_THROWS(f.get_matrix_selector("not-a-chromosome", norm, res)); - CHECK_THROWS(f.get_matrix_selector(std::string{chrom1.name()}, "not-a-chromosome", norm, res)); - CHECK_THROWS(f.get_matrix_selector(999, norm, res)); - CHECK_THROWS(f.get_matrix_selector(chrom1.id(), 999, norm, res)); + 
CHECK_THROWS(f.get_matrix_selector("not-a-chromosome", norm)); + CHECK_THROWS(f.get_matrix_selector(std::string{chrom1.name()}, "not-a-chromosome", norm)); + CHECK_THROWS(f.get_matrix_selector(999, norm)); + CHECK_THROWS(f.get_matrix_selector(chrom1.id(), 999, norm)); } SECTION("malformed") { - CHECK_THROWS(f.get_matrix_selector(chrom2, chrom1, norm, res)); // NOLINT - CHECK_THROWS(f.get_matrix_selector(chrom1, norm, 123)); - CHECK_THROWS(HiCFile(pathV8, MatrixType::expected, MatrixUnit::BP) - .get_matrix_selector(chrom1, NormalizationMethod::VC, res)); + CHECK_THROWS(f.get_matrix_selector(chrom2, chrom1, norm)); // NOLINT + CHECK_THROWS(HiCFile(pathV8, f.resolution(), MatrixType::expected, MatrixUnit::BP) + .get_matrix_selector(chrom1, NormalizationMethod::VC)); // Matrix does not have contacts for fragments - CHECK_THROWS(HiCFile(pathV8, MatrixType::observed, MatrixUnit::FRAG) - .get_matrix_selector(chrom1, norm, res)); + CHECK_THROWS(HiCFile(pathV8, f.resolution(), MatrixType::observed, MatrixUnit::FRAG) + .get_matrix_selector(chrom1, norm)); } } diff --git a/test/units/hic/matrix_zoom_data_test.cpp b/test/units/hic/matrix_zoom_data_test.cpp index 2a2a3364..d8edfc79 100644 --- a/test/units/hic/matrix_zoom_data_test.cpp +++ b/test/units/hic/matrix_zoom_data_test.cpp @@ -72,8 +72,8 @@ static void compareContactRecord(const Pixel& r1, const SerializedPixel& // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("MatrixSelector accessors", "[hic][short]") { - const auto sel = HiCFile(pathV8, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE, 2500000); + const auto sel = HiCFile(pathV8, 2'500'000, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE); CHECK(sel.chrom1().name() == "chr2L"); CHECK(sel.chrom2().name() == "chr2L"); @@ -90,9 +90,9 @@ TEST_CASE("MatrixSelector accessors", "[hic][short]") { // NOLINTNEXTLINE(readability-function-cognitive-complexity) 
TEST_CASE("MatrixSelector LRU cache", "[hic][short]") { std::vector> buffer; - HiCFile f(pathV8, MatrixType::observed, MatrixUnit::BP); + HiCFile f(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP); - auto sel = f.get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); + auto sel = f.get_matrix_selector("chr2L", NormalizationMethod::NONE); CHECK(sel.blockCacheHitRate() == 0.0); CHECK(sel.blockCacheSize() == 0); @@ -131,8 +131,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { std::make_pair(std::size_t(1229799), SerializedPixel{15770000, 15770000, 1234.0F}); SECTION("v8") { - auto sel = HiCFile(pathV8, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK(sumCounts(buffer) == expected_sum); @@ -148,8 +148,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { compareContactRecord(buffer[expected_value.first], expected_value.second); } SECTION("v9") { - auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK(sumCounts(buffer) == expected_sum); @@ -178,8 +178,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { std::make_pair(std::size_t(3541), SerializedPixel{770000, 1300000, 13.0F}); SECTION("v8") { - auto sel = HiCFile(pathV8, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP) + 
.get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK(sumCounts(buffer) == expected_sum); @@ -196,8 +196,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } SECTION("v9") { - auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK(sumCounts(buffer) == expected_sum); @@ -214,8 +214,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } SECTION("cover type 2 interactions") { - auto sel = HiCFile(pathV8, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr2R", NormalizationMethod::NONE, 2500000); + auto sel = HiCFile(pathV8, 2'500'000, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr2R", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == 110); CHECK(sumCounts(buffer) == 1483112); @@ -224,18 +224,18 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } SECTION("sub-queries") { - const std::uint32_t resolution = 10000; + const std::uint32_t resolution = 10'000; SECTION("single pixel") { - auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE, resolution); + auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE); sel.fetch(100000, 100001, 100000, 100001, buffer); REQUIRE(buffer.size() == 1); compareContactRecord(buffer.front(), SerializedPixel{100000, 100000, 13895.0F}); } SECTION("upper-triangle") { - auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) - 
.get_matrix_selector("chr2L", NormalizationMethod::NONE, resolution); + auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE); sel.fetch(123456, 200000, 0, 200000, buffer, true); REQUIRE(buffer.size() == 132); CHECK(sumCounts(buffer) == 124561); @@ -245,8 +245,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } SECTION("lower-triangle") { - auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE, resolution); + auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE); sel.fetch(0, 200000, 123456, 200000, buffer, true); REQUIRE(buffer.size() == 132); CHECK(sumCounts(buffer) == 124561); @@ -256,8 +256,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } SECTION("inter-chromosomal") { - auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, resolution); + auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); sel.fetch(123456, 200000, 0, 200000, buffer); REQUIRE(buffer.size() == 57); CHECK(sumCounts(buffer) == 74); @@ -268,28 +268,23 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { SECTION("invalid") { SECTION("invalid chromosome") { - HiCFile hic(pathV9, MatrixType::observed, MatrixUnit::BP); - CHECK_THROWS(hic.get_matrix_selector("chr123", NormalizationMethod::NONE, 10000)); - CHECK_THROWS(hic.get_matrix_selector(999, NormalizationMethod::NONE, 10000)); - } - SECTION("invalid resolution") { - HiCFile hic(pathV9, MatrixType::observed, MatrixUnit::BP); - CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE, - (std::numeric_limits::max)())); + HiCFile hic(pathV9, 10'000, 
MatrixType::observed, MatrixUnit::BP); + CHECK_THROWS(hic.get_matrix_selector("chr123", NormalizationMethod::NONE)); + CHECK_THROWS(hic.get_matrix_selector(999, NormalizationMethod::NONE)); } SECTION("invalid unit") { - HiCFile hic(pathV9, MatrixType::observed, MatrixUnit::FRAG); - CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000)); + HiCFile hic(pathV9, 10'000, MatrixType::observed, MatrixUnit::FRAG); + CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE)); } SECTION("expected + norm") { - HiCFile hic(pathV9, MatrixType::expected, MatrixUnit::BP); - CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::VC, 10000)); + HiCFile hic(pathV9, 10'000, MatrixType::expected, MatrixUnit::BP); + CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::VC)); } SECTION("invalid range") { - HiCFile hic(pathV9, MatrixType::observed, MatrixUnit::BP); - CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000) - .fetch(1000, 0, buffer)); - CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000) + HiCFile hic(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP); + CHECK_THROWS( + hic.get_matrix_selector("chr2L", NormalizationMethod::NONE).fetch(1000, 0, buffer)); + CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE) .fetch(0, 1'000'000'000, buffer)); } } @@ -303,15 +298,15 @@ TEST_CASE("MatrixSelector fetch (observed VC BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 1433133; constexpr double expected_sum = 20391277.41514; SECTION("v8") { - auto sel = HiCFile(pathV8, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::VC, 10000); + auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::VC); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), 
Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::VC, 10000); + auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::VC); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); @@ -321,16 +316,16 @@ TEST_CASE("MatrixSelector fetch (observed VC BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 56743; constexpr double expected_sum = 96690.056244753; SECTION("v8") { - auto sel = HiCFile(pathV8, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::VC, 10000); + auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::VC); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::VC, 10000); + auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::VC); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); @@ -345,15 +340,15 @@ TEST_CASE("MatrixSelector fetch (expected NONE BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 1433133; constexpr double expected_sum = 18314748.068024; SECTION("v8") { - auto sel = HiCFile(pathV8, MatrixType::expected, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV8, 10'000, MatrixType::expected, MatrixUnit::BP) + 
.get_matrix_selector("chr2L", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9, MatrixType::expected, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV9, 10'000, MatrixType::expected, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); @@ -363,16 +358,16 @@ TEST_CASE("MatrixSelector fetch (expected NONE BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 56743; constexpr double expected_sum = 12610.80619812; SECTION("v8") { - auto sel = HiCFile(pathV8, MatrixType::expected, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV8, 10'000, MatrixType::expected, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9, MatrixType::expected, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV9, 10'000, MatrixType::expected, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); @@ -387,15 +382,15 @@ TEST_CASE("MatrixSelector fetch (oe NONE BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 1433133; constexpr double expected_sum = 2785506.2274201; SECTION("v8") { - auto sel = HiCFile(pathV8, MatrixType::oe, MatrixUnit::BP) - 
.get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV8, 10'000, MatrixType::oe, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9, MatrixType::oe, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV9, 10'000, MatrixType::oe, MatrixUnit::BP) + .get_matrix_selector("chr2L", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); @@ -405,16 +400,16 @@ TEST_CASE("MatrixSelector fetch (oe NONE BP 10000)", "[hic][short]") { constexpr std::size_t expected_size = 56743; constexpr double expected_sum = 317520.00459671; SECTION("v8") { - auto sel = HiCFile(pathV8, MatrixType::oe, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV8, 10'000, MatrixType::oe, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); } SECTION("v9") { - auto sel = HiCFile(pathV9, MatrixType::oe, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE, 10000); + auto sel = HiCFile(pathV9, 10'000, MatrixType::oe, MatrixUnit::BP) + .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); sel.fetch(buffer, true); REQUIRE(buffer.size() == expected_size); CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); From aaaae442da93b59530ee404dac5fa7224a32aeab Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sat, 10 Jun 2023 22:35:57 +0200 
Subject: [PATCH 03/48] Wrap cooler-related code inside ::cooler --- src/cooler/attribute_impl.hpp | 18 +++++++------- src/cooler/balancing_impl.hpp | 4 ++-- src/cooler/dataset_accessors_impl.hpp | 4 ++-- src/cooler/dataset_impl.hpp | 4 ++-- src/cooler/dataset_iterator_impl.hpp | 4 ++-- src/cooler/dataset_read_impl.hpp | 23 +++++++++--------- src/cooler/dataset_write_impl.hpp | 8 +++---- src/cooler/file_accessors_impl.hpp | 6 ++--- src/cooler/file_impl.hpp | 12 +++++----- src/cooler/file_read_impl.hpp | 11 +++++---- src/cooler/file_standard_attr_impl.hpp | 8 +++---- src/cooler/file_validation_impl.hpp | 4 ++-- src/cooler/file_write_impl.hpp | 5 ++-- src/cooler/include/hictk/cooler.hpp | 15 ++++++------ src/cooler/include/hictk/cooler/attribute.hpp | 4 ++-- src/cooler/include/hictk/cooler/balancing.hpp | 4 ++-- src/cooler/include/hictk/cooler/dataset.hpp | 24 ++++++++++--------- src/cooler/include/hictk/cooler/group.hpp | 4 ++-- src/cooler/include/hictk/cooler/index.hpp | 5 +++- .../include/hictk/cooler/pixel_selector.hpp | 9 ++++--- src/cooler/include/hictk/cooler/uri.hpp | 4 ++-- src/cooler/include/hictk/cooler/utils.hpp | 4 ++-- .../include/hictk/cooler/validation.hpp | 20 ++++++++-------- src/cooler/index_impl.hpp | 4 ++-- src/cooler/pixel_selector_impl.hpp | 7 +++--- src/cooler/uri_impl.hpp | 4 ++-- src/cooler/utils_equal_impl.hpp | 4 ++-- src/cooler/utils_merge_impl.hpp | 4 ++-- src/cooler/validation_impl.hpp | 24 +++++++++---------- test/units/cooler/attribute_test.cpp | 7 ++++-- test/units/cooler/balancing_test.cpp | 5 ++-- test/units/cooler/dataset_test.cpp | 11 +++++---- test/units/cooler/file_test.cpp | 8 +++++-- test/units/cooler/index_test.cpp | 5 ++-- test/units/cooler/pixel_selector_test.cpp | 6 +++-- test/units/cooler/utils_equal_test.cpp | 12 ++++++---- test/units/cooler/utils_merge_test.cpp | 12 ++++++---- 37 files changed, 168 insertions(+), 149 deletions(-) diff --git a/src/cooler/attribute_impl.hpp b/src/cooler/attribute_impl.hpp index 
3bbcf062..a6e03b84 100644 --- a/src/cooler/attribute_impl.hpp +++ b/src/cooler/attribute_impl.hpp @@ -26,7 +26,7 @@ #include "hictk/suppress_warnings.hpp" #include "hictk/type_pretty_printer.hpp" -namespace hictk { +namespace hictk::cooler { template inline bool Attribute::exists(ParentObj& h5obj, std::string_view key) { @@ -129,6 +129,7 @@ DISABLE_WARNING_UNREACHABLE_CODE template // NOLINTNEXTLINE(readability-function-cognitive-complexity) inline Tout Attribute::numeric_converter(T1& buff) { + using namespace hictk::internal; static_assert(!std::is_same_v); if constexpr (std::is_same_v) { @@ -143,12 +144,12 @@ inline Tout Attribute::numeric_converter(T1& buff) { if constexpr (std::is_same_v && std::is_arithmetic_v) { // Try to convert a string attribute to the appropriate numeric type try { - return internal::parse_numeric_or_throw(buff); + return parse_numeric_or_throw(buff); } catch (const std::exception& e) { throw std::runtime_error( fmt::format(FMT_STRING("Expected type {}, found std::string. An attempt to convert " "std::string to {} was made, but failed. Reason {}"), - internal::type_name(), internal::type_name(), e.what())); + type_name(), type_name(), e.what())); } } @@ -163,8 +164,7 @@ inline Tout Attribute::numeric_converter(T1& buff) { throw std::runtime_error( fmt::format(FMT_STRING("Expected type {}, found {}. Unable to represent value {} as {} " "without information loss"), - internal::type_name(), internal::type_name(), buff, - internal::type_name())); + type_name(), type_name(), buff, type_name())); } if constexpr (std::is_integral_v && std::is_integral_v) { @@ -189,16 +189,14 @@ inline Tout Attribute::numeric_converter(T1& buff) { throw std::runtime_error(fmt::format( FMT_STRING( "Expected type {}, found {}. 
Unable to represent value {} as {} without overflowing"), - internal::type_name(), internal::type_name(), buff, - internal::type_name())); + type_name(), type_name(), buff, type_name())); } // No conversion was possible throw std::runtime_error(fmt::format( FMT_STRING( "Expected type {}, found {}. Unable to safely convert value {} of type {} to type {}"), - internal::type_name(), internal::type_name(), buff, internal::type_name(), - internal::type_name())); + type_name(), type_name(), buff, type_name(), type_name())); } DISABLE_WARNING_POP -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/balancing_impl.hpp b/src/cooler/balancing_impl.hpp index a7b6dbbb..722b3fb7 100644 --- a/src/cooler/balancing_impl.hpp +++ b/src/cooler/balancing_impl.hpp @@ -12,7 +12,7 @@ #include #include -namespace hictk { +namespace hictk::cooler { inline Weights::Weights(std::vector weights, Type type) noexcept : _weights(std::move(weights)), _type(type) { @@ -252,4 +252,4 @@ inline auto Balancer::iterator::operator++(int) -> iterator { return it; } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/dataset_accessors_impl.hpp b/src/cooler/dataset_accessors_impl.hpp index 4c920a12..b8cfe720 100644 --- a/src/cooler/dataset_accessors_impl.hpp +++ b/src/cooler/dataset_accessors_impl.hpp @@ -10,7 +10,7 @@ #include #include -namespace hictk { +namespace hictk::cooler { inline HighFive::DataSet Dataset::operator()() { return this->_dataset; } @@ -95,4 +95,4 @@ inline auto Dataset::cend() const -> iterator { return this->end(); } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/dataset_impl.hpp b/src/cooler/dataset_impl.hpp index 4ac43265..698a123c 100644 --- a/src/cooler/dataset_impl.hpp +++ b/src/cooler/dataset_impl.hpp @@ -19,7 +19,7 @@ #include "hictk/common.hpp" -namespace hictk { +namespace hictk::cooler { namespace internal { @@ -180,4 +180,4 @@ inline HighFive::Selection Dataset::select(std::size_t i1, std::size_t 
i2) const return this->_dataset.select(std::vector{i1}, std::vector{i2}); } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/dataset_iterator_impl.hpp b/src/cooler/dataset_iterator_impl.hpp index e8bdca08..4589eb89 100644 --- a/src/cooler/dataset_iterator_impl.hpp +++ b/src/cooler/dataset_iterator_impl.hpp @@ -19,7 +19,7 @@ #include "hictk/common.hpp" #include "hictk/type_pretty_printer.hpp" -namespace hictk { +namespace hictk::cooler { template inline Dataset::iterator::iterator(const Dataset &dset, std::size_t h5_offset, @@ -286,4 +286,4 @@ constexpr auto Dataset::iterator::make_end_iterator(const Dataset return it; } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/dataset_read_impl.hpp b/src/cooler/dataset_read_impl.hpp index fc4edad7..b725e994 100644 --- a/src/cooler/dataset_read_impl.hpp +++ b/src/cooler/dataset_read_impl.hpp @@ -15,7 +15,7 @@ #include "hictk/common.hpp" #include "hictk/cooler/attribute.hpp" -namespace hictk { +namespace hictk::cooler { template inline std::size_t Dataset::read(std::vector &buff, std::size_t num, std::size_t offset) const { @@ -55,8 +55,7 @@ inline std::size_t Dataset::read(std::vector &buff, std::size_t num DISABLE_WARNING_PUSH DISABLE_WARNING_UNREACHABLE_CODE template -inline std::size_t Dataset::read(internal::VariantBuffer &vbuff, std::size_t num, - std::size_t offset) const { +inline std::size_t Dataset::read(VariantBuffer &vbuff, std::size_t num, std::size_t offset) const { if constexpr (i == 0) { if (offset + num > this->size()) { this->throw_out_of_range_excp(offset, num); @@ -108,8 +107,8 @@ inline std::size_t Dataset::read_all(BuffT &buff, std::size_t offset) const { return this->read(buff, num, offset); } -inline internal::VariantBuffer Dataset::read_all(std::size_t offset) const { - return this->read_all(offset); +inline hictk::internal::VariantBuffer Dataset::read_all(std::size_t offset) const { + return this->read_all(offset); } template @@ -150,14 +149,14 @@ 
inline std::size_t Dataset::read(std::string &buff, std::size_t offset) const { DISABLE_WARNING_PUSH DISABLE_WARNING_UNREACHABLE_CODE template -inline std::size_t Dataset::read(internal::GenericVariant &vbuff, std::size_t offset) const { +inline std::size_t Dataset::read(GenericVariant &vbuff, std::size_t offset) const { if constexpr (i == 0) { if (offset >= this->size()) { this->throw_out_of_range_excp(offset); } } - using VBuffT = internal::GenericVariant; + using VBuffT = GenericVariant; if constexpr (i < std::variant_size_v) { using T = std::variant_alternative_t; @@ -195,8 +194,8 @@ inline BuffT Dataset::read(std::size_t offset) const { return buff; } -inline internal::GenericVariant Dataset::read(std::size_t offset) const { - return this->read(offset); +inline hictk::internal::GenericVariant Dataset::read(std::size_t offset) const { + return this->read(offset); } template @@ -210,8 +209,8 @@ inline BuffT Dataset::read_last() const { return buff; } -inline internal::GenericVariant Dataset::read_last() const { - return this->read_last(); +inline hictk::internal::GenericVariant Dataset::read_last() const { + return this->read_last(); } template @@ -233,4 +232,4 @@ inline auto Dataset::read_attribute(std::string_view key, bool missing_ok) const return Attribute::read(this->_dataset, key, missing_ok); } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/dataset_write_impl.hpp b/src/cooler/dataset_write_impl.hpp index dfaa2b4d..a5689f76 100644 --- a/src/cooler/dataset_write_impl.hpp +++ b/src/cooler/dataset_write_impl.hpp @@ -22,7 +22,7 @@ #include "hictk/common.hpp" -namespace hictk { +namespace hictk::cooler { inline std::size_t Dataset::write(const std::vector &buff, std::size_t offset, bool allow_dataset_resize) { @@ -68,7 +68,7 @@ inline std::size_t Dataset::write(const std::vector &buff, std::size_t offset return offset + buff.size(); } -inline std::size_t Dataset::write(const internal::VariantBuffer &vbuff, std::size_t offset, 
+inline std::size_t Dataset::write(const VariantBuffer &vbuff, std::size_t offset, bool allow_dataset_resize) { std::size_t new_offset{}; std::visit( @@ -156,7 +156,7 @@ inline std::size_t Dataset::write(std::string buff, std::size_t offset, bool all return offset + 1; } -inline std::size_t Dataset::write(const internal::GenericVariant &vbuff, std::size_t offset, +inline std::size_t Dataset::write(const GenericVariant &vbuff, std::size_t offset, bool allow_dataset_resize) { std::size_t new_offset{}; std::visit( @@ -209,4 +209,4 @@ inline HighFive::DataSet Dataset::create_fixed_str_dataset( return group.getDataSet(dataset_name); } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/file_accessors_impl.hpp b/src/cooler/file_accessors_impl.hpp index 156f98d1..9e0ed47e 100644 --- a/src/cooler/file_accessors_impl.hpp +++ b/src/cooler/file_accessors_impl.hpp @@ -14,7 +14,7 @@ #include #include -namespace hictk { +namespace hictk::cooler { inline std::string File::uri() const { if (this->hdf5_path() == "/") { @@ -88,7 +88,7 @@ inline auto File::dataset(std::string_view dataset_name) const -> const Dataset } } -inline const internal::NumericVariant &File::pixel_variant() const noexcept { +inline const hictk::internal::NumericVariant &File::pixel_variant() const noexcept { return this->_pixel_variant; } @@ -169,4 +169,4 @@ inline auto File::index() const noexcept -> const Index & { return *this->_index; } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/file_impl.hpp b/src/cooler/file_impl.hpp index a3359556..a4908097 100644 --- a/src/cooler/file_impl.hpp +++ b/src/cooler/file_impl.hpp @@ -30,7 +30,7 @@ #include "hictk/type_pretty_printer.hpp" #include "hictk/variant_buff.hpp" -namespace hictk { +namespace hictk::cooler { template inline void init_mcool(std::string_view file_path, InputIt first_resolution, @@ -212,7 +212,7 @@ inline void File::open(std::string_view uri, bool validate) { template inline void 
File::create(std::string_view uri, const hictk::Reference &chroms, std::uint32_t bin_size, bool overwrite_if_exists, - hictk::StandardAttributes attributes) { + StandardAttributes attributes) { *this = File::create_new_cooler(uri, chroms, bin_size, overwrite_if_exists, attributes); } @@ -300,11 +300,11 @@ template DISABLE_WARNING_POP } // namespace internal -inline internal::NumericVariant File::detect_pixel_type(const RootGroup &root_grp, - std::string_view path) { +inline hictk::internal::NumericVariant File::detect_pixel_type(const RootGroup &root_grp, + std::string_view path) { [[maybe_unused]] HighFive::SilenceHDF5 silencer{}; // NOLINT auto dset = root_grp().getDataSet(std::string{path}); - return internal::read_pixel_variant(dset); + return internal::read_pixel_variant(dset); } template @@ -321,4 +321,4 @@ inline void File::update_pixel_sum(N partial_sum) { } } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/file_read_impl.hpp b/src/cooler/file_read_impl.hpp index 42344049..f7f4c56e 100644 --- a/src/cooler/file_read_impl.hpp +++ b/src/cooler/file_read_impl.hpp @@ -31,7 +31,7 @@ #include "hictk/suppress_warnings.hpp" #include "hictk/type_pretty_printer.hpp" -namespace hictk { +namespace hictk::cooler { inline bool File::check_sentinel_attr(const HighFive::Group &grp) { const auto generated_by_v = Attribute::read(grp, "generated-by", true); @@ -253,8 +253,9 @@ inline auto File::open_datasets(const RootGroup &root_grp, std::size_t cache_siz auto open_dataset = [&](const auto dataset_uri) { return std::make_pair( std::string{dataset_uri}, - Dataset{root_grp, dataset_uri, - internal::starts_with(dataset_uri, "pixels") ? pixels_aprop : default_aprop}); + Dataset{ + root_grp, dataset_uri, + hictk::internal::starts_with(dataset_uri, "pixels") ? 
pixels_aprop : default_aprop}); }; std::transform(MANDATORY_DATASET_NAMES.begin(), MANDATORY_DATASET_NAMES.end(), @@ -318,7 +319,7 @@ inline auto File::read_standard_attributes(const RootGroup &root_grp, bool initi throw std::runtime_error( fmt::format(FMT_STRING("Attribute \"{}{}\" as an unexpected type. Expected a " "numeric type, found {}"), - root_grp().getPath(), key, internal::type_name())); + root_grp().getPath(), key, hictk::internal::type_name())); }, sumv); return true; @@ -452,4 +453,4 @@ inline Bin File::get_last_bin_written() const { return this->bins().at(bin1_id); } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/file_standard_attr_impl.hpp b/src/cooler/file_standard_attr_impl.hpp index 61353971..f9e59988 100644 --- a/src/cooler/file_standard_attr_impl.hpp +++ b/src/cooler/file_standard_attr_impl.hpp @@ -8,7 +8,7 @@ #include #include -namespace hictk { +namespace hictk::cooler { template inline StandardAttributes StandardAttributes::init(std::uint32_t bin_size_) { @@ -43,7 +43,7 @@ inline StandardAttributes StandardAttributes::init_empty() noexcept { return attrs; } -inline bool StandardAttributes::operator==(const hictk::StandardAttributes& other) const noexcept { +inline bool StandardAttributes::operator==(const StandardAttributes& other) const noexcept { // clang-format off return bin_size == other.bin_size && bin_type == other.bin_type && @@ -63,7 +63,7 @@ inline bool StandardAttributes::operator==(const hictk::StandardAttributes& othe // clang-format on } -inline bool StandardAttributes::operator!=(const hictk::StandardAttributes& other) const noexcept { +inline bool StandardAttributes::operator!=(const StandardAttributes& other) const noexcept { return !(*this == other); } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/file_validation_impl.hpp b/src/cooler/file_validation_impl.hpp index 090e8f9e..535c750c 100644 --- a/src/cooler/file_validation_impl.hpp +++ 
b/src/cooler/file_validation_impl.hpp @@ -21,7 +21,7 @@ #include "hictk/cooler/validation.hpp" #include "hictk/fmt.hpp" -namespace hictk { +namespace hictk::cooler { inline void File::validate_bins() const { try { @@ -163,4 +163,4 @@ inline void File::validate_pixel_type() const noexcept { } } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/file_write_impl.hpp b/src/cooler/file_write_impl.hpp index cef5fcc8..c2226194 100644 --- a/src/cooler/file_write_impl.hpp +++ b/src/cooler/file_write_impl.hpp @@ -28,8 +28,7 @@ #include "hictk/cooler/group.hpp" #include "hictk/cooler/uri.hpp" -namespace hictk { - +namespace hictk::cooler { template inline void File::append_pixels(PixelIt first_pixel, PixelIt last_pixel, bool validate) { using PixelT = typename std::iterator_traits::value_type; @@ -350,4 +349,4 @@ inline void File::write_sentinel_attr(HighFive::Group grp) { inline void File::write_sentinel_attr() { File::write_sentinel_attr(this->_root_group()); } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/include/hictk/cooler.hpp b/src/cooler/include/hictk/cooler.hpp index 37ae6c4e..37c91837 100644 --- a/src/cooler/include/hictk/cooler.hpp +++ b/src/cooler/include/hictk/cooler.hpp @@ -34,7 +34,7 @@ DISABLE_WARNING_POP #include "hictk/numeric_variant.hpp" #include "hictk/pixel.hpp" -namespace hictk { +namespace hictk::cooler { using DefaultPixelT = std::int32_t; @@ -88,6 +88,7 @@ void init_mcool(std::string_view file_path, bool force_overwrite = false); // bool force_overwrite = false); class File { + using NumericVariant = hictk::internal::NumericVariant; unsigned int _mode{HighFive::File::ReadOnly}; std::unique_ptr _fp{}; RootGroup _root_group{}; @@ -95,7 +96,7 @@ class File { DatasetMap _datasets{}; mutable WeightMap _weights{}; StandardAttributes _attrs{StandardAttributes::init(0)}; - internal::NumericVariant _pixel_variant{}; + NumericVariant _pixel_variant{}; std::shared_ptr _bins{}; std::shared_ptr _index{}; bool 
_finalize{false}; @@ -111,7 +112,7 @@ class File { double w0 = DEFAULT_HDF5_CACHE_W0); public: - using QUERY_TYPE = GenomicInterval::Type; + using QUERY_TYPE = hictk::GenomicInterval::Type; File() = default; File(const File &other) = delete; @@ -178,7 +179,7 @@ class File { [[nodiscard]] auto group(std::string_view group_name) const -> const Group &; [[nodiscard]] auto dataset(std::string_view dataset_name) const -> const Dataset &; - [[nodiscard]] const internal::NumericVariant &pixel_variant() const noexcept; + [[nodiscard]] const NumericVariant &pixel_variant() const noexcept; template [[nodiscard]] bool has_pixel_of_type() const noexcept; @@ -275,8 +276,8 @@ class File { template void validate_pixels_before_append(PixelIt first_pixel, PixelIt last_pixel) const; - [[nodiscard]] static internal::NumericVariant detect_pixel_type( - const RootGroup &root_grp, std::string_view path = "pixels/count"); + [[nodiscard]] static NumericVariant detect_pixel_type(const RootGroup &root_grp, + std::string_view path = "pixels/count"); void write_attributes(bool skip_sentinel_attr = true); void write_chromosomes(); @@ -317,7 +318,7 @@ class File { PixelCoordinates coord2) const; }; -} // namespace hictk +} // namespace hictk::cooler #include "../../file_accessors_impl.hpp" #include "../../file_impl.hpp" diff --git a/src/cooler/include/hictk/cooler/attribute.hpp b/src/cooler/include/hictk/cooler/attribute.hpp index 0bf5d28b..a856da4a 100644 --- a/src/cooler/include/hictk/cooler/attribute.hpp +++ b/src/cooler/include/hictk/cooler/attribute.hpp @@ -13,7 +13,7 @@ #include "hictk/common.hpp" #include "hictk/type_traits.hpp" -namespace hictk { +namespace hictk::cooler { struct Attribute { // Variants are listed in order from the most common to the least common for perf. 
reasons @@ -54,6 +54,6 @@ struct Attribute { template > [[nodiscard]] static Tout numeric_converter(T1& buff); }; -} // namespace hictk +} // namespace hictk::cooler #include "../../../attribute_impl.hpp" diff --git a/src/cooler/include/hictk/cooler/balancing.hpp b/src/cooler/include/hictk/cooler/balancing.hpp index ebe3f70e..a227f5c3 100644 --- a/src/cooler/include/hictk/cooler/balancing.hpp +++ b/src/cooler/include/hictk/cooler/balancing.hpp @@ -13,7 +13,7 @@ #include "hictk/cooler/dataset.hpp" #include "hictk/cooler/pixel_selector.hpp" -namespace hictk { +namespace hictk::cooler { class Weights { public: @@ -104,6 +104,6 @@ class Balancer { using WeightMap = phmap::flat_hash_map>; -} // namespace hictk +} // namespace hictk::cooler #include "../../../balancing_impl.hpp" diff --git a/src/cooler/include/hictk/cooler/dataset.hpp b/src/cooler/include/hictk/cooler/dataset.hpp index c7432ec0..afddacd8 100644 --- a/src/cooler/include/hictk/cooler/dataset.hpp +++ b/src/cooler/include/hictk/cooler/dataset.hpp @@ -25,14 +25,14 @@ DISABLE_WARNING_POP #include "hictk/generic_variant.hpp" #include "hictk/variant_buff.hpp" -namespace hictk { +namespace hictk::cooler { struct RootGroup; namespace internal { template struct is_atomic_buffer - : public std::disjunction>, + : public std::disjunction>, std::is_same>, std::is_arithmetic>> {}; @@ -43,9 +43,11 @@ inline constexpr bool is_atomic_buffer_v = is_atomic_buffer::value; DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS class Dataset { + using VariantBuffer = hictk::internal::VariantBuffer; + using GenericVariant = hictk::internal::GenericVariant; RootGroup _root_group{}; HighFive::DataSet _dataset{}; - mutable internal::VariantBuffer _buff{}; + mutable VariantBuffer _buff{}; public: template @@ -102,7 +104,7 @@ class Dataset { std::size_t read(std::vector &buff, std::size_t num, std::size_t offset = 0) const; std::size_t read(std::vector &buff, std::size_t num, std::size_t offset = 0) const; template - 
std::size_t read(internal::VariantBuffer &vbuff, std::size_t num, std::size_t offset = 0) const; + std::size_t read(VariantBuffer &vbuff, std::size_t num, std::size_t offset = 0) const; template , typename = std::enable_if_t>> @@ -117,23 +119,23 @@ class Dataset { typename = std::enable_if_t>> BuffT read_all(std::size_t offset = 0) const; - internal::VariantBuffer read_all(std::size_t offset = 0) const; + VariantBuffer read_all(std::size_t offset = 0) const; // Read single values template >> std::size_t read(N &buff, std::size_t offset) const; std::size_t read(std::string &buff, std::size_t offset) const; template - std::size_t read(internal::GenericVariant &vbuff, std::size_t offset) const; + std::size_t read(GenericVariant &vbuff, std::size_t offset) const; template , typename = std::enable_if_t>> BuffT read(std::size_t offset) const; - internal::GenericVariant read(std::size_t offset) const; + GenericVariant read(std::size_t offset) const; template [[nodiscard]] BuffT read_last() const; - [[nodiscard]] internal::GenericVariant read_last() const; + [[nodiscard]] GenericVariant read_last() const; // Write N values template >> @@ -141,7 +143,7 @@ class Dataset { bool allow_dataset_resize = false); std::size_t write(const std::vector &buff, std::size_t offset = 0, bool allow_dataset_resize = false); - std::size_t write(const internal::VariantBuffer &vbuff, std::size_t offset = 0, + std::size_t write(const VariantBuffer &vbuff, std::size_t offset = 0, bool allow_dataset_resize = false); template >> std::size_t write(N buff, std::size_t offset = 0, bool allow_dataset_resize = false); std::size_t write(std::string buff, std::size_t offset = 0, bool allow_dataset_resize = false); - std::size_t write(const internal::GenericVariant &vbuff, std::size_t offset = 0, + std::size_t write(const GenericVariant &vbuff, std::size_t offset = 0, bool allow_dataset_resize = false); template @@ -282,7 +284,7 @@ DISABLE_WARNING_POP using DatasetMap = phmap::flat_hash_map; -} // 
namespace hictk +} // namespace hictk::cooler #include "../../../dataset_accessors_impl.hpp" #include "../../../dataset_impl.hpp" diff --git a/src/cooler/include/hictk/cooler/group.hpp b/src/cooler/include/hictk/cooler/group.hpp index 5220b117..6fb5bdff 100644 --- a/src/cooler/include/hictk/cooler/group.hpp +++ b/src/cooler/include/hictk/cooler/group.hpp @@ -19,7 +19,7 @@ DISABLE_WARNING_POP #include "hictk/suppress_warnings.hpp" -namespace hictk { +namespace hictk::cooler { DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS @@ -51,4 +51,4 @@ DISABLE_WARNING_POP using GroupMap = phmap::flat_hash_map; -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/include/hictk/cooler/index.hpp b/src/cooler/include/hictk/cooler/index.hpp index 8b7ff270..be762c9f 100644 --- a/src/cooler/include/hictk/cooler/index.hpp +++ b/src/cooler/include/hictk/cooler/index.hpp @@ -13,12 +13,13 @@ #include namespace hictk { - class GenomicInterval; class BinTable; class Chromosome; class Reference; +namespace cooler { + class Index { using ChromID = std::uint32_t; using OffsetVect = std::vector; @@ -145,6 +146,8 @@ class Index { }; }; +} // namespace cooler + } // namespace hictk #include "../../../index_impl.hpp" diff --git a/src/cooler/include/hictk/cooler/pixel_selector.hpp b/src/cooler/include/hictk/cooler/pixel_selector.hpp index a6193792..5b3f94a9 100644 --- a/src/cooler/include/hictk/cooler/pixel_selector.hpp +++ b/src/cooler/include/hictk/cooler/pixel_selector.hpp @@ -8,14 +8,13 @@ #include #include +#include "hictk/bin_table.hpp" #include "hictk/common.hpp" #include "hictk/cooler/dataset.hpp" +#include "hictk/cooler/index.hpp" #include "hictk/pixel.hpp" -namespace hictk { - -class BinTable; -class Index; +namespace hictk::cooler { template class PixelSelector { @@ -129,6 +128,6 @@ class PixelSelector { }; }; -} // namespace hictk +} // namespace hictk::cooler #include "../../../pixel_selector_impl.hpp" diff --git 
a/src/cooler/include/hictk/cooler/uri.hpp b/src/cooler/include/hictk/cooler/uri.hpp index 77c3a512..971dddce 100644 --- a/src/cooler/include/hictk/cooler/uri.hpp +++ b/src/cooler/include/hictk/cooler/uri.hpp @@ -8,7 +8,7 @@ #include #include -namespace hictk { +namespace hictk::cooler { struct CoolerURI { std::string file_path; @@ -23,6 +23,6 @@ struct CoolerURI { }; [[nodiscard]] CoolerURI parse_cooler_uri(std::string_view uri); -} // namespace hictk +} // namespace hictk::cooler #include "../../../uri_impl.hpp" diff --git a/src/cooler/include/hictk/cooler/utils.hpp b/src/cooler/include/hictk/cooler/utils.hpp index 9441bc54..b964fe49 100644 --- a/src/cooler/include/hictk/cooler/utils.hpp +++ b/src/cooler/include/hictk/cooler/utils.hpp @@ -12,7 +12,7 @@ #include "hictk/cooler.hpp" -namespace hictk::utils { +namespace hictk::cooler::utils { enum class MergeStrategy { IN_MEMORY, PQUEUE }; @@ -62,7 +62,7 @@ class PixelMerger { [[nodiscard]] Pixel next(); }; } // namespace internal -} // namespace hictk::utils +} // namespace hictk::cooler::utils #include "../../../utils_equal_impl.hpp" #include "../../../utils_merge_impl.hpp" diff --git a/src/cooler/include/hictk/cooler/validation.hpp b/src/cooler/include/hictk/cooler/validation.hpp index ddff02b9..8c41679d 100644 --- a/src/cooler/include/hictk/cooler/validation.hpp +++ b/src/cooler/include/hictk/cooler/validation.hpp @@ -17,7 +17,7 @@ DISABLE_WARNING_POP #include #include -namespace hictk::utils { +namespace hictk::cooler::utils { namespace internal { struct ValidationStatusBase { @@ -70,31 +70,31 @@ struct ValidationStatusScool : public internal::ValidationStatusBase { [[nodiscard]] ValidationStatusScool is_scool_file(std::string_view uri, bool validate_cells = true); [[nodiscard]] ValidationStatusScool is_scool_file(const HighFive::File& fp, bool validate_cells = true); -} // namespace hictk::utils +} // namespace hictk::cooler::utils namespace fmt { template <> -struct formatter { +struct formatter { constexpr 
auto parse(format_parse_context& ctx) const -> format_parse_context::iterator; - inline auto format(const hictk::utils::ValidationStatusCooler& s, format_context& ctx) const - -> format_context::iterator; + inline auto format(const hictk::cooler::utils::ValidationStatusCooler& s, + format_context& ctx) const -> format_context::iterator; }; template <> -struct formatter { +struct formatter { constexpr auto parse(format_parse_context& ctx) const -> format_parse_context::iterator; - inline auto format(const hictk::utils::ValidationStatusMultiresCooler& s, + inline auto format(const hictk::cooler::utils::ValidationStatusMultiresCooler& s, format_context& ctx) const -> format_context::iterator; }; template <> -struct formatter { +struct formatter { constexpr auto parse(format_parse_context& ctx) const -> format_parse_context::iterator; - inline auto format(const hictk::utils::ValidationStatusScool& s, format_context& ctx) const - -> format_context::iterator; + inline auto format(const hictk::cooler::utils::ValidationStatusScool& s, + format_context& ctx) const -> format_context::iterator; }; } // namespace fmt diff --git a/src/cooler/index_impl.hpp b/src/cooler/index_impl.hpp index 7b42ab8d..2075d42d 100644 --- a/src/cooler/index_impl.hpp +++ b/src/cooler/index_impl.hpp @@ -18,7 +18,7 @@ #include "hictk/chromosome.hpp" #include "hictk/fmt.hpp" -namespace hictk { +namespace hictk::cooler { inline Index::Index(std::shared_ptr bins, std::uint64_t nnz) : _bins(std::move(bins)), @@ -332,4 +332,4 @@ inline auto Index::iterator::get_offsets() const noexcept -> const OffsetVect & return this->_idx->_idx[static_cast(this->_chrom_id)]; } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/pixel_selector_impl.hpp b/src/cooler/pixel_selector_impl.hpp index b27dc9bf..a08cb014 100644 --- a/src/cooler/pixel_selector_impl.hpp +++ b/src/cooler/pixel_selector_impl.hpp @@ -6,13 +6,12 @@ #include #include +#include #include -#include "hictk/bin_table.hpp" -#include 
"hictk/cooler/index.hpp" #include "hictk/numeric_utils.hpp" -namespace hictk { +namespace hictk::cooler { template inline PixelSelector::PixelSelector(std::shared_ptr index, @@ -428,4 +427,4 @@ constexpr bool PixelSelector::iterator::is_at_end() const noexcep return !this->overlaps_coord1() && !this->overlaps_coord2(); } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/uri_impl.hpp b/src/cooler/uri_impl.hpp index de1f1ce2..da607d9e 100644 --- a/src/cooler/uri_impl.hpp +++ b/src/cooler/uri_impl.hpp @@ -10,7 +10,7 @@ #include #include -namespace hictk { +namespace hictk::cooler { inline CoolerURI::CoolerURI(std::string_view p1, std::string_view p2) : CoolerURI(std::string{p1}, std::string{p2}) {} @@ -61,4 +61,4 @@ inline CoolerURI parse_cooler_uri(std::string_view uri) { return CoolerURI{std::string{tok1}, "/" + std::string{tok2}}; } -} // namespace hictk +} // namespace hictk::cooler diff --git a/src/cooler/utils_equal_impl.hpp b/src/cooler/utils_equal_impl.hpp index f03a2daa..533116bf 100644 --- a/src/cooler/utils_equal_impl.hpp +++ b/src/cooler/utils_equal_impl.hpp @@ -9,7 +9,7 @@ #include "hictk/cooler.hpp" -namespace hictk::utils { +namespace hictk::cooler::utils { inline bool equal(std::string_view uri1, std::string_view uri2, bool ignore_attributes) { if (uri1 == uri2) { @@ -74,4 +74,4 @@ inline bool equal(const File& clr1, const File& clr2, bool ignore_attributes) { return true; } -} // namespace hictk::utils +} // namespace hictk::cooler::utils diff --git a/src/cooler/utils_merge_impl.hpp b/src/cooler/utils_merge_impl.hpp index 0443d585..b14b2331 100644 --- a/src/cooler/utils_merge_impl.hpp +++ b/src/cooler/utils_merge_impl.hpp @@ -11,7 +11,7 @@ #include "hictk/cooler.hpp" -namespace hictk::utils { +namespace hictk::cooler::utils { namespace internal { template @@ -188,4 +188,4 @@ inline void merge(Str first_file, Str last_file, std::string_view dest_uri, fmt::format(FMT_STRING("failed to merge {} cooler files: {}"), clrs.size(), 
e.what())); } } -} // namespace hictk::utils +} // namespace hictk::cooler::utils diff --git a/src/cooler/validation_impl.hpp b/src/cooler/validation_impl.hpp index 85099ffc..43e2be5b 100644 --- a/src/cooler/validation_impl.hpp +++ b/src/cooler/validation_impl.hpp @@ -19,7 +19,7 @@ #include "hictk/cooler/uri.hpp" #include "hictk/numeric_utils.hpp" -namespace hictk::utils { +namespace hictk::cooler::utils { constexpr ValidationStatusCooler::operator bool() const noexcept { return this->is_cooler; } @@ -83,7 +83,7 @@ inline ValidationStatusCooler is_cooler(const HighFive::Group &root_group) { if (Attribute::exists(root_group, "format-version")) { const auto version = Attribute::read(root_group, "format-version"); - status.file_was_properly_closed = version != hictk::internal::SENTINEL_ATTR_VALUE; + status.file_was_properly_closed = version != cooler::internal::SENTINEL_ATTR_VALUE; status.missing_or_invalid_format_attr |= version == 0 || version > 3; } @@ -315,9 +315,9 @@ inline std::vector list_resolutions(std::string_view uri, bool so } } -} // namespace hictk::utils +} // namespace hictk::cooler::utils -constexpr auto fmt::formatter::parse( +constexpr auto fmt::formatter::parse( format_parse_context &ctx) const -> format_parse_context::iterator { if (ctx.begin() != ctx.end() && *ctx.begin() != '}') { throw fmt::format_error("invalid format"); @@ -325,8 +325,8 @@ constexpr auto fmt::formatter::parse( return ctx.end(); } -auto fmt::formatter::format( - const hictk::utils::ValidationStatusCooler &s, format_context &ctx) const +auto fmt::formatter::format( + const hictk::cooler::utils::ValidationStatusCooler &s, format_context &ctx) const -> decltype(ctx.out()) { // clang-format off return fmt::format_to( @@ -348,7 +348,7 @@ auto fmt::formatter::format( // clang-format on } -constexpr auto fmt::formatter::parse( +constexpr auto fmt::formatter::parse( format_parse_context &ctx) const -> format_parse_context::iterator { if (ctx.begin() != ctx.end() && *ctx.begin() != 
'}') { throw fmt::format_error("invalid format"); @@ -356,8 +356,8 @@ constexpr auto fmt::formatter::par return ctx.end(); } -auto fmt::formatter::format( - const hictk::utils::ValidationStatusMultiresCooler &s, format_context &ctx) const +auto fmt::formatter::format( + const hictk::cooler::utils::ValidationStatusMultiresCooler &s, format_context &ctx) const -> decltype(ctx.out()) { // clang-format off return fmt::format_to( @@ -382,7 +382,7 @@ auto fmt::formatter::format( // clang-format on } -constexpr auto fmt::formatter::parse( +constexpr auto fmt::formatter::parse( format_parse_context &ctx) const -> format_parse_context::iterator { if (ctx.begin() != ctx.end() && *ctx.begin() != '}') { throw fmt::format_error("invalid format"); @@ -390,8 +390,8 @@ constexpr auto fmt::formatter::parse( return ctx.end(); } -auto fmt::formatter::format( - const hictk::utils::ValidationStatusScool &s, format_context &ctx) const +auto fmt::formatter::format( + const hictk::cooler::utils::ValidationStatusScool &s, format_context &ctx) const -> decltype(ctx.out()) { // clang-format off return fmt::format_to( diff --git a/test/units/cooler/attribute_test.cpp b/test/units/cooler/attribute_test.cpp index 1a96c7ca..2fab5b26 100644 --- a/test/units/cooler/attribute_test.cpp +++ b/test/units/cooler/attribute_test.cpp @@ -19,7 +19,10 @@ inline const SelfDeletingFolder testdir{true}; // NOLIN static inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test -namespace hictk::test::attribute { +namespace hictk::cooler::test::attribute { + +const auto& testdir = hictk::test::testdir; +const auto& datadir = hictk::test::datadir; template static void compare_attribute(H5Obj& obj, std::string_view key, const T& expected) { @@ -486,4 +489,4 @@ TEST_CASE("Attribute: read - test numeric conversions", "[cooler][short]") { } } -} // namespace hictk::test::attribute +} // namespace hictk::cooler::test::attribute diff --git 
a/test/units/cooler/balancing_test.cpp b/test/units/cooler/balancing_test.cpp index 88688c1b..adbd14f3 100644 --- a/test/units/cooler/balancing_test.cpp +++ b/test/units/cooler/balancing_test.cpp @@ -16,7 +16,8 @@ namespace hictk::test { inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test -namespace hictk::test::hictk { +namespace hictk::cooler::test::balancing { +const auto& datadir = hictk::test::datadir; template static void balancer_test_helper(const Balancer& sel, @@ -112,4 +113,4 @@ TEST_CASE("Cooler: Balancer", "[cooler][short]") { } } } -} // namespace hictk::test::hictk +} // namespace hictk::cooler::test::balancing diff --git a/test/units/cooler/dataset_test.cpp b/test/units/cooler/dataset_test.cpp index 8c2ece35..c19d9b9c 100644 --- a/test/units/cooler/dataset_test.cpp +++ b/test/units/cooler/dataset_test.cpp @@ -18,7 +18,10 @@ inline const SelfDeletingFolder testdir{true}; // NOLINT(cert- inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test -namespace hictk::test::dataset { +namespace hictk::cooler::test::dataset { +const auto& testdir = hictk::test::testdir; +const auto& datadir = hictk::test::datadir; + // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("Dataset: read", "[dataset][short]") { const auto path = datadir / "cooler_test_file.cool"; @@ -70,7 +73,7 @@ TEST_CASE("Dataset: read", "[dataset][short]") { } SECTION("variant buff") { - internal::VariantBuffer vbuff{std::size_t(0), 0.0}; + hictk::internal::VariantBuffer vbuff{std::size_t(0), 0.0}; std::ignore = Dataset{grp, "bins/start"}.read(vbuff, expected.size()); const auto& buff = vbuff.get(); REQUIRE(buff.size() == expected.size()); @@ -166,7 +169,7 @@ TEST_CASE("Dataset: write", "[dataset][short]") { } SECTION("variant buff") { - const internal::VariantBuffer vexpected{expected}; + const hictk::internal::VariantBuffer vexpected{expected}; 
Dataset{grp, "num", T{}}.write(vexpected, 0, true); const auto vbuff = Dataset{grp, "num"}.read_all(); @@ -382,4 +385,4 @@ TEST_CASE("Dataset: attributes", "[dataset][short]") { } } // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) -} // namespace hictk::test::dataset +} // namespace hictk::cooler::test::dataset diff --git a/test/units/cooler/file_test.cpp b/test/units/cooler/file_test.cpp index 177bd162..d07a25f6 100644 --- a/test/units/cooler/file_test.cpp +++ b/test/units/cooler/file_test.cpp @@ -18,7 +18,11 @@ inline const SelfDeletingFolder testdir{true}; // NOLINT(cert- inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test -namespace hictk::test::hictk { +namespace hictk::cooler::test::cooler_file { + +const auto& testdir = hictk::test::testdir; +const auto& datadir = hictk::test::datadir; + // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("Cooler: version", "[cooler][short]") { // clang-format off @@ -593,4 +597,4 @@ TEST_CASE("Cooler: write weights", "[cooler][short]") { } } -} // namespace hictk::test::hictk +} // namespace hictk::cooler::test::cooler_file diff --git a/test/units/cooler/index_test.cpp b/test/units/cooler/index_test.cpp index 1cc4f336..45091748 100644 --- a/test/units/cooler/index_test.cpp +++ b/test/units/cooler/index_test.cpp @@ -15,7 +15,8 @@ namespace hictk::test { inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert-err58-cpp) } -namespace hictk::test::index { +namespace hictk::cooler::test::index { +const auto& datadir = hictk::test::datadir; // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("Index: ctor", "[index][short]") { @@ -194,4 +195,4 @@ TEST_CASE("Index: compute chromosome offsets", "[index][short]") { CHECK(chrom_offsets[2] == chr1_offsets.size() + chr2_offsets.size()); } -} // namespace hictk::test::index +} // namespace hictk::cooler::test::index diff --git 
a/test/units/cooler/pixel_selector_test.cpp b/test/units/cooler/pixel_selector_test.cpp index eae2f3bf..203f3995 100644 --- a/test/units/cooler/pixel_selector_test.cpp +++ b/test/units/cooler/pixel_selector_test.cpp @@ -18,7 +18,9 @@ inline const SelfDeletingFolder testdir{true}; // NOLINT(cert- inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test -namespace hictk::test::pixel_selector { +namespace hictk::cooler::test::pixel_selector { +const auto& testdir = hictk::test::testdir; +const auto& datadir = hictk::test::datadir; template static std::ptrdiff_t generate_test_data(const std::filesystem::path& path, const Reference& chroms, @@ -267,4 +269,4 @@ TEST_CASE("Pixel selector: 2D queries", "[pixel_selector][short]") { } } -} // namespace hictk::test::pixel_selector +} // namespace hictk::cooler::test::pixel_selector diff --git a/test/units/cooler/utils_equal_test.cpp b/test/units/cooler/utils_equal_test.cpp index 9b018962..348ee680 100644 --- a/test/units/cooler/utils_equal_test.cpp +++ b/test/units/cooler/utils_equal_test.cpp @@ -14,7 +14,9 @@ inline const SelfDeletingFolder testdir{true}; // NOLINT(cert- inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test -namespace hictk::test::index { +namespace hictk::cooler::test::utils { +inline const auto& testdir = hictk::test::testdir; +inline const auto& datadir = hictk::test::datadir; // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("utils: equal", "[equal][utils][short]") { @@ -27,10 +29,10 @@ TEST_CASE("utils: equal", "[equal][utils][short]") { std::filesystem::copy(path1, path3); SECTION("equal") { - CHECK(utils::equal(path1.string(), path1.string())); - CHECK(utils::equal(path1.string(), path3.string())); + CHECK(cooler::utils::equal(path1.string(), path1.string())); + CHECK(cooler::utils::equal(path1.string(), path3.string())); } - SECTION("not equal") { 
CHECK_FALSE(utils::equal(path1.string(), path2.string())); } + SECTION("not equal") { CHECK_FALSE(cooler::utils::equal(path1.string(), path2.string())); } } -} // namespace hictk::test::index +} // namespace hictk::cooler::test::utils diff --git a/test/units/cooler/utils_merge_test.cpp b/test/units/cooler/utils_merge_test.cpp index 5e0ed340..5d241d39 100644 --- a/test/units/cooler/utils_merge_test.cpp +++ b/test/units/cooler/utils_merge_test.cpp @@ -14,7 +14,9 @@ inline const SelfDeletingFolder testdir{true}; // NOLINT(cert- inline const std::filesystem::path datadir{"test/data/cooler"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test -namespace hictk::test::index { +namespace hictk::cooler::test::utils { +inline const auto& testdir = hictk::test::testdir; +inline const auto& datadir = hictk::test::datadir; // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("utils: merge", "[merge][utils][short]") { @@ -24,7 +26,7 @@ TEST_CASE("utils: merge", "[merge][utils][short]") { const std::array sources{src.string(), src.string()}; SECTION("merge") { - utils::merge(sources.begin(), sources.end(), dest.string(), true); + cooler::utils::merge(sources.begin(), sources.end(), dest.string(), true); const auto clr1 = File::open_read_only_read_once(src.string()); const auto clr2 = File::open_read_only_read_once(dest.string()); @@ -52,7 +54,7 @@ TEST_CASE("utils: merge", "[merge][utils][short]") { fmt::format(FMT_STRING("{}::/resolutions/100000"), mclr.string()), fmt::format(FMT_STRING("{}::/resolutions/200000"), mclr.string())}; - CHECK_THROWS_WITH(utils::merge(sources1.begin(), sources1.end(), dest1.string(), true), + CHECK_THROWS_WITH(cooler::utils::merge(sources1.begin(), sources1.end(), dest1.string(), true), Catch::Matchers::ContainsSubstring("have different resolutions")); } @@ -63,8 +65,8 @@ TEST_CASE("utils: merge", "[merge][utils][short]") { const std::array sources2{clr1.string(), clr2.string()}; - CHECK_THROWS_WITH(utils::merge(sources2.begin(), 
sources2.end(), dest2.string(), true), + CHECK_THROWS_WITH(cooler::utils::merge(sources2.begin(), sources2.end(), dest2.string(), true), Catch::Matchers::ContainsSubstring("use different reference genomes")); } } -} // namespace hictk::test::index +} // namespace hictk::cooler::test::utils From fd14b4a1c5a05089e3e2c032568ad03135e60bf8 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 12 Jun 2023 11:44:36 +0200 Subject: [PATCH 04/48] Work in progress Split MatrixSelector into several smaller classes. --- src/bin_table/bin_table_impl.hpp | 18 +- src/bin_table/include/hictk/bin_table.hpp | 7 +- src/common/include/hictk/common.hpp | 3 + src/hic/block_reader_impl.hpp | 339 ++++++++++++ src/hic/cache_impl.hpp | 101 +++- src/hic/filestream_impl.hpp | 4 +- src/hic/hic_file_impl.hpp | 124 +++-- src/hic/hic_file_stream_impl.hpp | 55 +- src/hic/hic_file_utils_impl.hpp | 4 +- src/hic/hic_footer_impl.hpp | 8 +- src/hic/hic_header_impl.hpp | 8 +- src/hic/hic_matrix_selector_impl.hpp | 520 ------------------ src/hic/include/hictk/hic.hpp | 56 +- src/hic/include/hictk/hic/block_reader.hpp | 130 +++++ src/hic/include/hictk/hic/cache.hpp | 53 +- src/hic/include/hictk/hic/common.hpp | 64 +-- src/hic/include/hictk/hic/filestream.hpp | 4 +- src/hic/include/hictk/hic/hic_file_stream.hpp | 28 +- src/hic/include/hictk/hic/hic_footer.hpp | 4 +- src/hic/include/hictk/hic/hic_header.hpp | 4 +- .../include/hictk/hic/hic_matrix_selector.hpp | 108 ---- src/hic/include/hictk/hic/index.hpp | 94 ++++ src/hic/include/hictk/hic/pixel_selector.hpp | 139 +++++ .../hictk/hic/suppress_compiler_warnings.hpp | 38 -- src/hic/index_impl.hpp | 215 ++++++++ src/hic/pixel_selector_impl.hpp | 401 ++++++++++++++ src/numeric/include/hictk/numeric_utils.hpp | 3 + src/numeric/numeric_utils_impl.hpp | 6 + src/pixel/include/hictk/pixel.hpp | 2 + src/pixel/pixel_impl.hpp | 4 + test/units/bin_table/bin_table_test.cpp | 17 +- test/units/hic/CMakeLists.txt | 2 +- 
test/units/hic/filestream_test.cpp | 4 +- test/units/hic/hic_file_stream_test.cpp | 8 +- test/units/hic/hic_file_test.cpp | 70 +-- test/units/hic/matrix_zoom_data_test.cpp | 418 -------------- test/units/hic/pixel_selector_test.cpp | 109 ++++ 37 files changed, 1814 insertions(+), 1358 deletions(-) create mode 100644 src/hic/block_reader_impl.hpp delete mode 100644 src/hic/hic_matrix_selector_impl.hpp create mode 100644 src/hic/include/hictk/hic/block_reader.hpp delete mode 100644 src/hic/include/hictk/hic/hic_matrix_selector.hpp create mode 100644 src/hic/include/hictk/hic/index.hpp create mode 100644 src/hic/include/hictk/hic/pixel_selector.hpp delete mode 100644 src/hic/include/hictk/hic/suppress_compiler_warnings.hpp create mode 100644 src/hic/index_impl.hpp create mode 100644 src/hic/pixel_selector_impl.hpp delete mode 100644 test/units/hic/matrix_zoom_data_test.cpp create mode 100644 test/units/hic/pixel_selector_test.cpp diff --git a/src/bin_table/bin_table_impl.hpp b/src/bin_table/bin_table_impl.hpp index 1d3458e8..7475b9a9 100644 --- a/src/bin_table/bin_table_impl.hpp +++ b/src/bin_table/bin_table_impl.hpp @@ -22,16 +22,17 @@ namespace hictk { // NOLINT inline Bin::Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end_) noexcept - : Bin(Bin::null_id, chrom_, start_, end_) {} + : Bin(Bin::null_id, Bin::rel_null_id, chrom_, start_, end_) {} -inline Bin::Bin(std::uint64_t id_, const Chromosome &chrom_, std::uint32_t start_, - std::uint32_t end_) noexcept - : _id(id_), _interval(chrom_, start_, end_) {} +inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, + std::uint32_t start_, std::uint32_t end_) noexcept + : _id(id_), _rel_id(rel_id_), _interval(chrom_, start_, end_) {} -inline Bin::Bin(GenomicInterval interval) noexcept : Bin(Bin::null_id, std::move(interval)) {} +inline Bin::Bin(GenomicInterval interval) noexcept + : Bin(Bin::null_id, Bin::rel_null_id, std::move(interval)) {} -inline Bin::Bin(std::uint64_t 
id_, GenomicInterval interval) noexcept - : _id(id_), _interval(std::move(interval)) {} +inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept + : _id(id_), _rel_id(rel_id_), _interval(std::move(interval)) {} inline Bin::operator bool() const noexcept { return !!this->chrom(); } @@ -72,6 +73,7 @@ inline bool Bin::operator>=(const Bin &other) const noexcept { } constexpr std::uint64_t Bin::id() const noexcept { return this->_id; } +constexpr std::uint32_t Bin::rel_id() const noexcept { return this->_rel_id; } inline const GenomicInterval &Bin::interval() const noexcept { return this->_interval; } inline const Chromosome &Bin::chrom() const noexcept { return this->interval().chrom(); } constexpr std::uint32_t Bin::start() const noexcept { return this->_interval.start(); } @@ -214,7 +216,7 @@ inline Bin BinTable::at_hint(std::uint64_t bin_id, const Chromosome &chrom) cons assert(start < chrom.size()); const auto end = (std::min)(start + this->bin_size(), chrom.size()); - return {bin_id, chrom, start, end}; + return {bin_id, static_cast(relative_bin_id), chrom, start, end}; } inline std::pair BinTable::at(const GenomicInterval &gi) const { diff --git a/src/bin_table/include/hictk/bin_table.hpp b/src/bin_table/include/hictk/bin_table.hpp index 1b98a39d..410b5a55 100644 --- a/src/bin_table/include/hictk/bin_table.hpp +++ b/src/bin_table/include/hictk/bin_table.hpp @@ -19,18 +19,20 @@ namespace hictk { class Bin { public: static constexpr std::uint64_t null_id{(std::numeric_limits::max)()}; + static constexpr std::uint32_t rel_null_id{(std::numeric_limits::max)()}; private: std::uint64_t _id{null_id}; + std::uint32_t _rel_id{rel_null_id}; GenomicInterval _interval{}; public: constexpr Bin() = default; Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end) noexcept; - Bin(std::uint64_t id_, const Chromosome &chrom_, std::uint32_t start_, + Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, 
std::uint32_t start_, std::uint32_t end_) noexcept; explicit Bin(GenomicInterval interval) noexcept; - Bin(std::uint64_t id, GenomicInterval interval) noexcept; + Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept; [[nodiscard]] explicit operator bool() const noexcept; @@ -44,6 +46,7 @@ class Bin { [[nodiscard]] bool operator>=(const Bin &other) const noexcept; [[nodiscard]] constexpr std::uint64_t id() const noexcept; + [[nodiscard]] constexpr std::uint32_t rel_id() const noexcept; [[nodiscard]] const GenomicInterval &interval() const noexcept; [[nodiscard]] const Chromosome &chrom() const noexcept; [[nodiscard]] constexpr std::uint32_t start() const noexcept; diff --git a/src/common/include/hictk/common.hpp b/src/common/include/hictk/common.hpp index 8226434e..86e92cfe 100644 --- a/src/common/include/hictk/common.hpp +++ b/src/common/include/hictk/common.hpp @@ -15,6 +15,7 @@ namespace hictk { inline const std::string_view HICTK_VERSION_STRING{hictk::config::version::str()}; +namespace cooler { // Magic values inline constexpr std::string_view COOL_MAGIC{"HDF5::Cooler"}; inline constexpr std::string_view MCOOL_MAGIC{"HDF5::MCOOL"}; @@ -58,6 +59,8 @@ inline constexpr std::string_view SENTINEL_ATTR_NAME{"format-version"}; inline constexpr std::uint8_t SENTINEL_ATTR_VALUE{255}; } // namespace internal +} // namespace cooler + [[nodiscard]] constexpr bool ndebug_defined() noexcept { #ifdef NDEBUG return true; diff --git a/src/hic/block_reader_impl.hpp b/src/hic/block_reader_impl.hpp new file mode 100644 index 00000000..bb948924 --- /dev/null +++ b/src/hic/block_reader_impl.hpp @@ -0,0 +1,339 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +namespace hictk::hic::internal { + +inline BlockGrid::BlockGrid(const std::vector &index, std::size_t block_bin_count) { + _grid.resize(index.size()); + std::transform(index.begin(), index.end(), 
_grid.begin(), [&](const BlockIndex &idx) { + const auto row = idx.id / block_bin_count; + const auto col = idx.id % block_bin_count; + + return Node{std::make_shared(idx), nullptr, nullptr, row, col}; + }); + + assert(std::is_sorted(_grid.begin(), _grid.end(), [](const auto &n1, const auto &n2) { + return n1.block->id < n2.block->id; + })); + + auto head = _grid.begin(); + auto tail = _grid.end(); + while (head != tail) { + const Node *next_right{}; + const Node *next_down{}; + + const auto head_row = head->row; + + std::for_each(head + 1, tail, [&](const Node &n) { + if (!next_right || (n.row == head_row && n.col < next_right->col)) { + next_right = &n; + } + + if (!next_down || (n.row == head_row + 1 && n.col < next_down->col)) { + next_down = &n; + } + }); + + head->next_right = next_right; + head->next_down = next_down; + ++head; + } +} + +inline auto BlockGrid::begin() const noexcept -> iterator { return iterator{_grid.front()}; } +inline auto BlockGrid::end() const noexcept -> iterator { return {}; } +inline std::size_t BlockGrid::size() const noexcept { return _grid.size(); } + +inline BlockGrid::iterator::iterator(const BlockGrid::Node &head) + : _node(&head), _current_row(head.block->first_row) {} +inline bool BlockGrid::iterator::operator==(const BlockGrid::iterator &other) const noexcept { + return _node == other._node; +} + +inline bool BlockGrid::iterator::operator!=(const BlockGrid::iterator &other) const noexcept { + return !(*this == other); +} + +inline auto BlockGrid::iterator::operator++() noexcept -> iterator & { + assert(!!_node); + const auto move_right = _current_row != _node->block->last_row; + if (move_right) { + _node = _node->next_right; + } else { + _node = _node->next_down; + _current_row = !!_node ? 
_node->block->first_row : (std::numeric_limits::max)(); + } + + return *this; +} + +inline auto BlockGrid::iterator::operator++(int) noexcept -> iterator { + auto it = *this; + std::ignore = ++*this; + return it; +} + +inline auto BlockGrid::iterator::operator*() noexcept -> const_reference { + assert(_node); + return *_node; +} + +inline auto BlockGrid::iterator::operator->() noexcept -> const_pointer { + assert(_node); + return _node; +} + +template ::value>::type *> +inline T BinaryBuffer::read() { + static_assert(sizeof(char) == 1, ""); + assert(_i < _buffer.size()); + T x{}; + + std::memcpy(static_cast(&x), _buffer.data() + _i, sizeof(T)); + _i += sizeof(T); + return x; +} + +inline std::size_t BinaryBuffer::operator()() const noexcept { return _i; } + +inline std::string &BinaryBuffer::reset() noexcept { + _buffer.clear(); + _i = 0; + return _buffer; +} + +inline HiCBlockReader::HiCBlockReader(std::shared_ptr hfs, const HiCFooter &footer, + std::shared_ptr bins_, + std::shared_ptr block_cache_, + const PixelCoordinates &coords1, + const PixelCoordinates &coords2) + : _hfs(std::move(hfs)), + _index(read_index(*_hfs, footer)), + _blk_cache(std::move(block_cache_)), + _bins(std::move(bins_)) { + find_overlapping_blocks(coords1, coords2); +} + +inline HiCBlockReader::operator bool() const noexcept { return !!_hfs; } + +inline const Chromosome &HiCBlockReader::chrom1() const noexcept { return _index.chrom1(); } +inline const Chromosome &HiCBlockReader::chrom2() const noexcept { return _index.chrom2(); } + +inline const BinTable &HiCBlockReader::bins() const noexcept { return *_bins; } + +inline const BlockGrid &HiCBlockReader::grid() const { return _block_grid; } + +inline double HiCBlockReader::sum() const noexcept { return _index.matrix_sum(); } + +inline double HiCBlockReader::avg() const noexcept { + const auto num_bins1 = bins().subset(chrom1()).size(); + const auto num_bins2 = bins().subset(chrom2()).size(); + + return sum() / double(num_bins1 * num_bins2); 
+} + +inline void HiCBlockReader::find_overlapping_blocks(const hictk::PixelCoordinates &coords1, + const hictk::PixelCoordinates &coords2) { + std::vector _blocks_idx; + _index.map_2d_query_to_blocks(coords1, coords2, _blocks_idx); + _block_grid = BlockGrid(_blocks_idx, _index.block_bin_count()); +} + +inline Index HiCBlockReader::read_index(HiCFileStream &hfs, const HiCFooter &footer) { + if (footer.fileOffset() == -1) { + // Footer does not exist. However, query may be valid + return {}; + } + + return hfs.readBlockMap(footer.fileOffset(), footer.chrom1(), footer.chrom2(), footer.unit(), + footer.resolution()); +} + +inline std::shared_ptr HiCBlockReader::read(const BlockIndex &idx) { + if (!idx) { + return {nullptr}; + } + + assert(_blk_cache); + assert(_bins); + if (auto it = _blk_cache->find(idx.id); it != _blk_cache->end()) { + return it->second; + } + + _hfs->readAndInflate(idx, _bbuffer.reset()); + + const auto nRecords = static_cast(_bbuffer.read()); + _tmp_buffer.resize(nRecords); + + // if (_fs->version() == 6) { + // readBlockOfInteractionsV6(_bbuffer, buffer); + // auto it = + // _blockCache.emplace(idx.position), + // InteractionBlock(buffer)); + // return it.first->second; + // } + + const auto bin1Offset = _bbuffer.read(); + const auto bin2Offset = _bbuffer.read(); + + const auto i16Counts = _bbuffer.read() == 0; + + auto readUseShortBinFlag = [&]() { + if (_hfs->version() > 8) { + return _bbuffer.read() == 0; + } + return true; + }; + + const auto i16Bin1 = readUseShortBinFlag(); + const auto i16Bin2 = readUseShortBinFlag(); + + const auto type = static_cast(_bbuffer.read()); + if (type != 1 && type != 2) { + throw std::runtime_error( + fmt::format(FMT_STRING("uknown interaction type \"{}\". 
Supported types: 1, 2"), type)); + } + + switch (type) { + case 1: + read_dispatcher_type1_block(i16Bin1, i16Bin2, i16Counts, bin1Offset, bin2Offset, _bbuffer, + _tmp_buffer); + break; + case 2: + if (i16Counts) { + read_type2_block(bin1Offset, bin2Offset, _bbuffer, _tmp_buffer); + break; + } + read_type2_block(bin1Offset, bin2Offset, _bbuffer, _tmp_buffer); + break; + default: + HICTK_UNREACHABLE_CODE; + } + + auto it = _blk_cache->emplace(idx.id, InteractionBlock{idx.id, _tmp_buffer}); + return it.first->second; +} + +inline void HiCBlockReader::read_dispatcher_type1_block( + bool i16Bin1, bool i16Bin2, bool i16Counts, std::int32_t bin1Offset, std::int32_t bin2Offset, + BinaryBuffer &src, std::vector &dest) noexcept { + using BS = std::int16_t; // Short type for bins + using CS = std::int16_t; // Short type for count + + using BL = std::int32_t; // Long type for bins + using CL = float; // Long type for count + + if (i16Bin1 && i16Bin2 && i16Counts) { + read_type1_block(bin1Offset, bin2Offset, src, dest); + return; + } + if (!i16Bin1 && i16Bin2 && i16Counts) { + read_type1_block(bin1Offset, bin2Offset, src, dest); + return; + } + if (i16Bin1 && !i16Bin2 && i16Counts) { + read_type1_block(bin1Offset, bin2Offset, src, dest); + return; + } + if (i16Bin1 && i16Bin2 && !i16Counts) { + read_type1_block(bin1Offset, bin2Offset, src, dest); + return; + } + if (!i16Bin1 && !i16Bin2 && i16Counts) { + read_type1_block(bin1Offset, bin2Offset, src, dest); + return; + } + if (!i16Bin1 && i16Bin2 && !i16Counts) { + read_type1_block(bin1Offset, bin2Offset, src, dest); + return; + } + if (i16Bin1 && !i16Bin2 && !i16Counts) { + read_type1_block(bin1Offset, bin2Offset, src, dest); + return; + } + assert(!i16Bin1 && !i16Bin2 && !i16Counts); + read_type1_block(bin1Offset, bin2Offset, src, dest); +} + +template +inline void HiCBlockReader::read_type1_block(std::int32_t bin1Offset, std::int32_t bin2Offset, + BinaryBuffer &src, + std::vector &dest) noexcept { + using i16 = std::int16_t; 
+ using i32 = std::int32_t; + using f32 = float; + static_assert(std::is_same::value || std::is_same::value, ""); + static_assert(std::is_same::value || std::is_same::value, ""); + static_assert(std::is_same::value || std::is_same::value, ""); + + constexpr auto expectedOffsetV7 = (3 * sizeof(i32)) + (2 * sizeof(char)); + constexpr auto expectedOffsetV8plus = expectedOffsetV7 + (2 * sizeof(char)); + std::ignore = expectedOffsetV7; + std::ignore = expectedOffsetV8plus; + assert(src() == expectedOffsetV7 || src() == expectedOffsetV8plus); + + const auto expectedNumRecords = dest.size(); + dest.clear(); + const auto numRows = static_cast(src.read()); + for (i32 i = 0; i < numRows; ++i) { + const auto bin2 = bin2Offset + static_cast(src.read()); + + const auto numCols = static_cast(src.read()); + for (i32 j = 0; j < numCols; ++j) { + const auto bin1 = bin1Offset + static_cast(src.read()); + + const auto counts = static_cast(src.read()); + dest.push_back(SerializedPixel{bin1, bin2, counts}); + } + } + + std::ignore = expectedNumRecords; + assert(expectedNumRecords == dest.size()); +} + +template +inline void HiCBlockReader::read_type2_block(std::int32_t bin1Offset, std::int32_t bin2Offset, + BinaryBuffer &src, + std::vector &dest) noexcept { + using i16 = std::int16_t; + using i32 = std::int32_t; + using f32 = float; + static_assert(std::is_same::value || std::is_same::value, ""); + + const auto nPts = src.read(); + const auto w = static_cast(src.read()); + + constexpr auto i16Sentinel = (std::numeric_limits::lowest)(); + constexpr auto i16Counts = std::is_same::value; + + auto isValid = [&](CountType n) { + return (i16Counts && static_cast(n) != i16Sentinel) || + (!i16Counts && !std::isnan(static_cast(n))); + }; + + dest.reserve(static_cast(nPts)); + dest.clear(); + for (i32 i = 0; i < nPts; ++i) { + const auto count = src.read(); + if (!isValid(count)) { + continue; + } + const auto row = i / w; + const auto col = i - row * w; + const auto bin1 = bin1Offset + col; + 
const auto bin2 = bin2Offset + row; + + dest.emplace_back(SerializedPixel{bin1, bin2, static_cast(count)}); + } +} + +} // namespace hictk::hic::internal diff --git a/src/hic/cache_impl.hpp b/src/hic/cache_impl.hpp index 006049cd..d5f9117e 100644 --- a/src/hic/cache_impl.hpp +++ b/src/hic/cache_impl.hpp @@ -9,24 +9,79 @@ #include #include -namespace hictk::internal { +namespace hictk::hic::internal { + +constexpr bool operator<(const InteractionBlock &a, const InteractionBlock &b) noexcept { + return a < b._id; +} +constexpr bool operator==(const InteractionBlock &a, const InteractionBlock &b) noexcept { + return a == b._id; +} +constexpr bool operator!=(const InteractionBlock &a, const InteractionBlock &b) noexcept { + return !(a == b); +} + +constexpr bool operator<(const InteractionBlock &a, std::size_t b_id) noexcept { + return a._id < b_id; +} +constexpr bool operator==(const InteractionBlock &a, std::size_t b_id) noexcept { + return a._id == b_id; +} +constexpr bool operator!=(const InteractionBlock &a, std::size_t b_id) noexcept { + return !(a == b_id); +} + +constexpr bool operator<(std::size_t a_id, const InteractionBlock &b) noexcept { + return a_id < b._id; +} +constexpr bool operator==(std::size_t a_id, const InteractionBlock &b) noexcept { + return a_id == b._id; +} +constexpr bool operator!=(std::size_t a_id, const InteractionBlock &b) noexcept { + return !(a_id == b); +} + +constexpr bool InteractionBlockCmp::operator()(const InteractionBlock &a, + const InteractionBlock &b) const noexcept { + return a < b; +} +constexpr bool InteractionBlockCmp::operator()(const InteractionBlock &a, + std::size_t b_id) const noexcept { + return a < b_id; +} +constexpr bool InteractionBlockCmp::operator()(std::size_t a_id, + const InteractionBlock &b) const noexcept { + return a_id < b; +} inline auto InteractionBlock::Overlap::begin() const noexcept { return first; } inline auto InteractionBlock::Overlap::end() const noexcept { return last; } inline auto 
InteractionBlock::Overlap::cbegin() const noexcept { return begin(); } inline auto InteractionBlock::Overlap::cend() const noexcept { return end(); } -inline InteractionBlock::InteractionBlock(const std::vector &pixels) { +inline std::size_t InteractionBlock::id() const noexcept { return _id; } +inline const Chromosome &InteractionBlock::chrom1() const noexcept { + assert(_chrom1); + return *_chrom1; +} +inline const Chromosome &InteractionBlock::chrom2() const noexcept { + assert(_chrom2); + return *_chrom2; +} + +inline InteractionBlock::InteractionBlock(std::size_t id_, + const std::vector &pixels) + : _id(id_) { if (pixels.empty()) { return; } for (const SerializedPixel &p : pixels) { - _first_col = (std::min)(_first_col, p.bin2_id); - _last_col = (std::max)(_last_col, p.bin2_id); - auto [node, inserted] = this->_interactions.try_emplace(p.bin1_id, Row{{p.bin2_id, p.count}}); + const auto b1 = static_cast(p.bin1_id); + const auto b2 = static_cast(p.bin2_id); + auto [node, inserted] = this->_interactions.try_emplace(b1, Row{{b2, p.count}}); if (!inserted) { - node->second.emplace_back(ThinPixel{p.bin2_id, p.count}); + node->second.emplace_back(ThinPixel{b2, p.count}); } } if constexpr (ndebug_not_defined()) { @@ -56,10 +111,21 @@ inline auto InteractionBlock::end() const noexcept -> const_iterator { return _i inline auto InteractionBlock::cend() const noexcept -> const_iterator { return end(); } -inline auto InteractionBlock::find_overlap(std::int64_t first_row, - std::int64_t last_row) const noexcept -> Overlap { +inline auto InteractionBlock::at(std::uint64_t row) const noexcept -> const_iterator { + return _interactions.lower_bound(row); +} + +inline auto InteractionBlock::find_overlap(std::uint64_t first_row, + std::uint64_t last_row) const noexcept -> Overlap { assert(first_row <= last_row); - return {_interactions.lower_bound(first_row), _interactions.upper_bound(last_row)}; + return {at(first_row), _interactions.upper_bound(last_row)}; +} + +inline bool 
InteractionBlock::has_overlap(std::uint64_t first_row, + std::uint64_t last_row) const noexcept { + auto overlap = find_overlap(first_row, last_row); + + return overlap.begin() != this->_interactions.end(); } inline std::size_t InteractionBlock::size() const noexcept { return _interactions.size(); } @@ -68,21 +134,6 @@ inline std::size_t InteractionBlock::size_in_bytes() const noexcept { return sizeof(Pixel) * size(); } -inline std::int64_t InteractionBlock::first_row() const noexcept { - if (_interactions.empty()) { - return 0; - } - return _interactions.begin()->first; -} -inline std::int64_t InteractionBlock::last_row() const noexcept { - if (_interactions.empty()) { - return 0; - } - return (--_interactions.end())->first; -} -inline std::int64_t InteractionBlock::first_col() const noexcept { return _first_col; } -inline std::int64_t InteractionBlock::last_col() const noexcept { return _last_col; } - inline BlockLRUCache::BlockLRUCache(std::size_t max_size_in_bytes) : _max_size_bytes(max_size_in_bytes) { if (_max_size_bytes == 0) { @@ -165,4 +216,4 @@ constexpr std::size_t BlockLRUCache::misses() const noexcept { return _misses; } inline std::size_t BlockLRUCache::size() const noexcept { return _cache.size(); } -} // namespace hictk::internal +} // namespace hictk::hic::internal diff --git a/src/hic/filestream_impl.hpp b/src/hic/filestream_impl.hpp index c5ae2b54..56457e39 100644 --- a/src/hic/filestream_impl.hpp +++ b/src/hic/filestream_impl.hpp @@ -15,7 +15,7 @@ #include "hictk/common.hpp" -namespace hictk::internal::filestream { +namespace hictk::hic::internal::filestream { inline FileStream::FileStream(std::string path) : path_(std::move(path)), @@ -151,4 +151,4 @@ inline std::ifstream FileStream::open_file(const std::string &path, std::ifstrea return ifs; } -} // namespace hictk::internal::filestream +} // namespace hictk::hic::internal::filestream diff --git a/src/hic/hic_file_impl.hpp b/src/hic/hic_file_impl.hpp index d09b4cff..1155987d 100644 --- 
a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -14,15 +14,15 @@ #include "hictk/hic/common.hpp" -namespace hictk { +namespace hictk::hic { inline HiCFile::HiCFile(std::string url_, std::uint32_t resolution_, MatrixType type_, MatrixUnit unit_, std::uint64_t block_cache_capacity) : _fs(std::make_shared(std::move(url_))), _type(type_), _unit(unit_), - _block_cache(block_cache_capacity), - _bins(chromosomes(), resolution_) { + _block_cache(std::make_shared(block_cache_capacity)), + _bins(std::make_shared(chromosomes(), resolution_)) { assert(block_cache_capacity != 0); if (!has_resolution(resolution())) { throw std::runtime_error(fmt::format( @@ -53,11 +53,11 @@ inline const std::vector& HiCFile::avail_resolutions() const noex return _fs->header().resolutions; } -constexpr std::uint32_t HiCFile::resolution() const noexcept { return _bins.bin_size(); } +inline std::uint32_t HiCFile::resolution() const noexcept { return _bins->bin_size(); } inline std::shared_ptr HiCFile::get_footer( std::uint32_t chrom1_id, std::uint32_t chrom2_id, MatrixType matrix_type, - NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution) { + NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution) const { const internal::HiCFooterMetadata metadata{url(), matrix_type, norm, @@ -76,7 +76,7 @@ inline std::shared_ptr HiCFile::get_footer( assert(node.second); return node.first->second; } - +/* inline internal::MatrixSelector HiCFile::get_matrix_selector(const Chromosome& chrom, NormalizationMethod norm) { return get_matrix_selector(chrom, chrom, norm); @@ -96,40 +96,49 @@ inline internal::MatrixSelector HiCFile::get_matrix_selector(const Chromosome& c return get_matrix_selector(chrom1.id(), chrom2.id(), norm); } -inline internal::MatrixSelector HiCFile::get_matrix_selector(const std::string& chrom1_name, - const std::string& chrom2_name, - NormalizationMethod norm) { - const auto it1 = chromosomes().find(chrom1_name); - if (it1 == chromosomes().end()) { - 
throw std::runtime_error( - fmt::format(FMT_STRING("unable to find chromosome named {}"), chrom1_name)); - } - if (chrom1_name == chrom2_name) { - return get_matrix_selector(*it1, *it1, norm); - } + */ - const auto it2 = chromosomes().find(chrom2_name); - if (it2 == chromosomes().end()) { - throw std::runtime_error( - fmt::format(FMT_STRING("unable to find chromosome named {}"), chrom2_name)); - } +inline PixelSelector HiCFile::fetch(std::string_view query, NormalizationMethod norm, + QUERY_TYPE query_type) const { + const auto gi = query_type == QUERY_TYPE::BED + ? GenomicInterval::parse_bed(this->chromosomes(), query) + : GenomicInterval::parse_ucsc(this->chromosomes(), std::string{query}); - return get_matrix_selector(*it1, *it2, norm); + return this->fetch(gi.chrom(), gi.start(), gi.end(), gi.chrom(), gi.start(), gi.end(), norm); } -inline internal::MatrixSelector HiCFile::get_matrix_selector(std::uint32_t chrom1_id, - std::uint32_t chrom2_id, - NormalizationMethod norm) { - if (chrom1_id >= std::int64_t(chromosomes().size())) { - throw std::runtime_error( - fmt::format(FMT_STRING("unable to find chromosome corresponding to ID {}"), chrom1_id)); - } - if (chrom2_id >= std::int64_t(chromosomes().size())) { - throw std::runtime_error( - fmt::format(FMT_STRING("unable to find chromosome corresponding to ID {}"), chrom2_id)); - } +inline PixelSelector HiCFile::fetch(std::string_view chrom_name, std::uint32_t start, + std::uint32_t end, NormalizationMethod norm) const { + return this->fetch(chrom_name, start, end, chrom_name, start, end, norm); +} + +inline PixelSelector HiCFile::fetch(std::string_view range1, std::string_view range2, + NormalizationMethod norm, QUERY_TYPE query_type) const { + const auto gi1 = query_type == QUERY_TYPE::BED + ? GenomicInterval::parse_bed(this->chromosomes(), range1) + : GenomicInterval::parse_ucsc(this->chromosomes(), std::string{range1}); + + const auto gi2 = query_type == QUERY_TYPE::BED + ? 
GenomicInterval::parse_bed(this->chromosomes(), range2) + : GenomicInterval::parse_ucsc(this->chromosomes(), std::string{range2}); - if (chrom1_id > chrom2_id) { + return this->fetch(gi1.chrom(), gi1.start(), gi1.end(), gi2.chrom(), gi2.start(), gi2.end(), + norm); +} + +inline PixelSelector HiCFile::fetch(std::string_view chrom1_name, std::uint32_t start1, + std::uint32_t end1, std::string_view chrom2_name, + std::uint32_t start2, std::uint32_t end2, + NormalizationMethod norm) const { + return this->fetch(chromosomes().at(chrom1_name), start1, end1, chromosomes().at(chrom2_name), + start2, end2, norm); +} + +inline PixelSelector HiCFile::fetch(const Chromosome& chrom1, std::uint32_t start1, + std::uint32_t end1, const Chromosome& chrom2, + std::uint32_t start2, std::uint32_t end2, + NormalizationMethod norm) const { + if (chrom1 > chrom2) { throw std::runtime_error( "Query overlaps the lower-triangle of the matrix. This is currently not supported."); } @@ -139,34 +148,33 @@ inline internal::MatrixSelector HiCFile::get_matrix_selector(std::uint32_t chrom FMT_STRING("matrix type {} is incompatible with normalization method {}"), _type, norm)); } - try { - return internal::MatrixSelector( - _fs, get_footer(chrom1_id, chrom2_id, _type, norm, _unit, resolution()), - 1'000'000); // TODO: REMOVE CACHE CAPACITY! 
- } catch (const std::exception& e) { - // Check whether query is valid but there are no interactions for the given chromosome pair - const auto missing_footer = - std::string_view{e.what()}.find("unable to read file offset") == std::string_view::npos; - if (missing_footer) { - throw; + const PixelCoordinates coord1 = {_bins->at(chrom1, start1), _bins->at(chrom1, end1)}; + const PixelCoordinates coord2 = {_bins->at(chrom2, start2), _bins->at(chrom2, end2)}; + + auto footer = [&]() { + try { + return get_footer(chrom1.id(), chrom2.id(), _type, norm, _unit, resolution()); + } catch (const std::exception& e) { + // Check whether query is valid but there are no interactions for the given chromosome + // pair + const auto missing_footer = + std::string_view{e.what()}.find("unable to read file offset") == std::string_view::npos; + if (missing_footer) { + throw; + } + + internal::HiCFooterMetadata metadata{url(), _type, norm, _unit, + resolution(), chrom1, chrom2, -1}; + + return std::make_shared(std::move(metadata)); } + }(); - internal::HiCFooterMetadata metadata{url(), - _type, - norm, - _unit, - resolution(), - _fs->header().chromosomes.at(chrom1_id), - _fs->header().chromosomes.at(chrom2_id), - -1}; - - return internal::MatrixSelector( - _fs, std::make_shared(std::move(metadata)), 1); - } + return PixelSelector{_fs, footer, _block_cache, _bins, coord1, coord2}; } inline std::size_t HiCFile::num_cached_footers() const noexcept { return _footers.size(); } inline void HiCFile::purge_footer_cache() { _footers.clear(); } -} // namespace hictk +} // namespace hictk::hic diff --git a/src/hic/hic_file_stream_impl.hpp b/src/hic/hic_file_stream_impl.hpp index 2e3bc537..e296cc82 100644 --- a/src/hic/hic_file_stream_impl.hpp +++ b/src/hic/hic_file_stream_impl.hpp @@ -18,8 +18,9 @@ #include "hictk/hic/common.hpp" #include "hictk/hic/filestream.hpp" +#include "hictk/hic/index.hpp" -namespace hictk::internal { +namespace hictk::hic::internal { inline 
HiCFileStream::HiCFileStream(std::string url) : _fs(std::make_shared(HiCFileStream::openStream(std::move(url)))), @@ -177,14 +178,10 @@ inline auto HiCFileStream::init_decompressor() -> Decompressor { return zs; } -inline void HiCFileStream::readBlockMap(std::int64_t fileOffset, - [[maybe_unused]] const Chromosome &chrom1, - [[maybe_unused]] const Chromosome &chrom2, - MatrixUnit wantedUnit, std::int64_t wantedResolution, - BlockMap &buffer) { +inline Index HiCFileStream::readBlockMap(std::int64_t fileOffset, const Chromosome &chrom1, + const Chromosome &chrom2, MatrixUnit wantedUnit, + std::int64_t wantedResolution) { _fs->seekg(fileOffset); - auto &blockMap = buffer.blocks; - blockMap.clear(); [[maybe_unused]] const auto c1i = _fs->read(); [[maybe_unused]] const auto c2i = _fs->read(); @@ -205,23 +202,31 @@ inline void HiCFileStream::readBlockMap(std::int64_t fileOffset, const auto blockBinCount = _fs->read(); const auto blockColumnCount = _fs->read(); - const auto nBlocks = static_cast(_fs->read()); + const auto nBlocks = static_cast(_fs->read()); + phmap::btree_set buffer; if (wantedUnit == foundUnit && wantedResolution == foundResolution) { - for (std::int64_t j = 0; j < nBlocks; ++j) { - const auto key = _fs->read(); - indexEntry index{_fs->read(), _fs->read()}; - assert(index.position + index.size < static_cast(_fs->size())); - blockMap.emplace(key, std::move(index)); + for (std::size_t j = 0; j < nBlocks; ++j) { + const auto block_id = static_cast(_fs->read()); + const auto position = static_cast(_fs->read()); + const auto size = static_cast(_fs->read()); + assert(position + size < _fs->size()); + if (size > 0) { + buffer.emplace(BlockIndex{block_id, position, size, 0, 0, 0, 0}); + } } - buffer.blockBinCount = blockBinCount; - buffer.blockColumnCount = blockColumnCount; - buffer.sumCount = static_cast(sumCount); - return; + + return {chrom1, + chrom2, + std::move(buffer), + version(), + static_cast(blockBinCount), + static_cast(blockColumnCount), + 
static_cast(sumCount)}; } constexpr std::int64_t blockSize = sizeof(int32_t) + sizeof(int64_t) + sizeof(int32_t); - _fs->seekg(nBlocks * blockSize, std::ios::cur); + _fs->seekg(static_cast(nBlocks) * blockSize, std::ios::cur); } throw std::runtime_error( @@ -306,18 +311,18 @@ inline HiCHeader HiCFileStream::readHeader(filestream::FileStream &fs) { return header; } -inline void HiCFileStream::readAndInflate(indexEntry idx, std::string &plainTextBuffer) { +inline void HiCFileStream::readAndInflate(const BlockIndex &idx, std::string &plainTextBuffer) { try { // _strbuff is used to store compressed data // plainTextBuffer is used to store decompressed data assert(_decompressor); - assert(idx.size > 0); - const auto buffSize = static_cast(idx.size); + assert(idx.compressed_size_bytes > 0); + const auto buffSize = static_cast(idx.compressed_size_bytes); plainTextBuffer.reserve(buffSize * 3); plainTextBuffer.resize(plainTextBuffer.capacity()); - _fs->seekg(idx.position); + _fs->seekg(static_cast(idx.file_offset)); _fs->read(_strbuff, buffSize); std::size_t bytes_decompressed{}; @@ -341,7 +346,7 @@ inline void HiCFileStream::readAndInflate(indexEntry idx, std::string &plainText } } catch (const std::exception &e) { throw std::runtime_error(fmt::format(FMT_STRING("failed to decompress block at pos {}: {}"), - idx.position, e.what())); + idx.file_offset, e.what())); } } @@ -523,4 +528,4 @@ inline HiCFooter HiCFileStream::readFooter(const std::uint32_t chrom1_id, return footer; } -} // namespace hictk::internal +} // namespace hictk::hic::internal diff --git a/src/hic/hic_file_utils_impl.hpp b/src/hic/hic_file_utils_impl.hpp index fbad5573..48bfa238 100644 --- a/src/hic/hic_file_utils_impl.hpp +++ b/src/hic/hic_file_utils_impl.hpp @@ -7,8 +7,8 @@ #include #include -namespace hictk::utils { +namespace hictk::hic::utils { inline bool is_hic_file(const std::filesystem::path& path) { return internal::HiCFileStream::checkMagicString(path.string()); } -} // namespace hictk::utils 
+} // namespace hictk::hic::utils diff --git a/src/hic/hic_footer_impl.hpp b/src/hic/hic_footer_impl.hpp index a86e641f..14e7bffb 100644 --- a/src/hic/hic_footer_impl.hpp +++ b/src/hic/hic_footer_impl.hpp @@ -10,7 +10,7 @@ #include "hictk/hic/common.hpp" -namespace hictk::internal { +namespace hictk::hic::internal { constexpr HiCFooterMetadata::operator bool() const noexcept { return fileOffset >= 0; } @@ -71,11 +71,11 @@ constexpr std::vector &HiCFooter::c2Norm() noexcept { return _c2Norm; } -} // namespace hictk::internal +} // namespace hictk::hic::internal template <> -struct std::hash { - inline std::size_t operator()(hictk::internal::HiCFooterMetadata const &m) const noexcept { +struct std::hash { + inline std::size_t operator()(hictk::hic::internal::HiCFooterMetadata const &m) const noexcept { return hictk::internal::hash_combine(0, m.url, m.matrix_type, m.normalization, m.unit, m.resolution, m.chrom1, m.chrom2); } diff --git a/src/hic/hic_header_impl.hpp b/src/hic/hic_header_impl.hpp index 28a8afb0..88b26d3b 100644 --- a/src/hic/hic_header_impl.hpp +++ b/src/hic/hic_header_impl.hpp @@ -9,7 +9,7 @@ #include #include -namespace hictk::internal { +namespace hictk::hic::internal { constexpr HiCHeader::operator bool() const noexcept { return masterIndexOffset >= 0; } @@ -21,11 +21,11 @@ inline bool HiCHeader::operator!=(const HiCHeader &other) const noexcept { return !(*this == other); } -} // namespace hictk::internal +} // namespace hictk::hic::internal template <> -struct std::hash { - inline std::size_t operator()(hictk::internal::HiCHeader const &h) const noexcept { +struct std::hash { + inline std::size_t operator()(hictk::hic::internal::HiCHeader const &h) const noexcept { return hictk::internal::hash_combine(0, h.url, h.masterIndexOffset); } }; diff --git a/src/hic/hic_matrix_selector_impl.hpp b/src/hic/hic_matrix_selector_impl.hpp deleted file mode 100644 index f71b77a8..00000000 --- a/src/hic/hic_matrix_selector_impl.hpp +++ /dev/null @@ -1,520 +0,0 @@ 
-// Copyright (C) 2023 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#pragma once - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hictk/hic/common.hpp" - -namespace hictk::internal { - -template ::value>::type *> -inline T MatrixSelector::BinaryBuffer::read() { - static_assert(sizeof(char) == 1, ""); - assert(i < buffer.size()); - T x{}; - - std::memcpy(static_cast(&x), buffer.data() + i, sizeof(T)); - i += sizeof(T); - return x; -} - -inline MatrixSelector::MatrixSelector(std::shared_ptr fs, - std::shared_ptr footer, - std::size_t block_cache_capacity) - : _fs(std::move(fs)), - _footer(std::move(footer)), - _bins(_fs->header().chromosomes, _footer->resolution()), - _blockMap(readBlockMap(*_fs, *_footer)), - _blockCache(block_cache_capacity) { - assert(_footer); -} - -inline const Chromosome &MatrixSelector::chrom1() const noexcept { return _footer->chrom1(); } - -inline const Chromosome &MatrixSelector::chrom2() const noexcept { return _footer->chrom2(); } - -inline std::uint32_t MatrixSelector::resolution() const noexcept { return _footer->resolution(); } - -inline MatrixType MatrixSelector::matrix_type() const noexcept { return _footer->matrix_type(); } - -inline NormalizationMethod MatrixSelector::normalizationMethod() const noexcept { - return _footer->normalization(); -} - -inline MatrixUnit MatrixSelector::matrixUnit() const noexcept { return _footer->unit(); } - -inline std::int64_t MatrixSelector::numBins1() const noexcept { - return (chrom1().size() + resolution() - 1) / resolution(); -} - -inline std::int64_t MatrixSelector::numBins2() const noexcept { - return (chrom2().size() + resolution() - 1) / resolution(); -} - -inline bool MatrixSelector::isIntra() const noexcept { return chrom1() == chrom2(); } - -inline bool MatrixSelector::isInter() const noexcept { return !isIntra(); } - -inline const std::vector &MatrixSelector::chrom1Norm() const noexcept { - return 
_footer->c1Norm(); -} - -inline const std::vector &MatrixSelector::chrom2Norm() const noexcept { - return _footer->c2Norm(); -} - -inline double MatrixSelector::avgCount() const { - if (isInter()) { - return _blockMap.sumCount / static_cast(numBins1() * numBins2()); - } - throw std::domain_error( - "MatrixSelector::avgCount is not implemented for intra-chromosomal matrices"); -} - -inline void MatrixSelector::fetch(std::vector> &buffer, bool sorted) { - return fetch(0, chrom1().size(), 0, chrom2().size(), buffer, sorted); -} - -inline void MatrixSelector::fetch(std::int64_t start, std::int64_t end, - std::vector> &buffer, bool sorted) { - return fetch(start, end, start, end, buffer, sorted); -} - -inline void MatrixSelector::fetch(std::int64_t start1, std::int64_t end1, std::int64_t start2, - std::int64_t end2, std::vector> &buffer, - bool sorted) { - buffer.clear(); - if (start1 > end1) { - throw std::invalid_argument(fmt::format(FMT_STRING("start1 > end1: {} > {}"), start1, end1)); - } - if (start2 > end2) { - throw std::invalid_argument(fmt::format(FMT_STRING("start2 > end2: {} > {}"), start2, end2)); - } - - if (start1 < 0 || end1 > chrom1().size()) { - throw std::runtime_error(fmt::format( - FMT_STRING("query extends past chromosome {}: interval {}-{} lies outside of 0-{}"), - chrom1().name(), start1, end1, chrom1().size())); - } - - if (start2 < 0 || end2 > chrom2().size()) { - throw std::runtime_error(fmt::format( - FMT_STRING("query extends past chromosome {}: interval {}-{} lies outside of 0-{}"), - chrom2().name(), start2, end2, chrom2().size())); - } - - // Query is valid but returns no pixels - if (this->_footer->fileOffset() == -1) { - assert(this->_blockMap.blocks.empty()); - return; - } - - const auto is_intra = isIntra(); - if (is_intra && start1 > start2) { - std::swap(start1, start2); - std::swap(end1, end2); - } - - const auto bin1 = start1 / resolution(); - const auto bin2 = (end1 + resolution() - 1) / resolution(); - const auto bin3 = start2 / 
resolution(); - const auto bin4 = (end2 + resolution() - 1) / resolution(); - - if (_fs->version() > 8 && isIntra()) { - readBlockNumbersV9Intra(bin1, bin2, bin3, bin4, _blockNumberBuff); - } else { - readBlockNumbers(bin1, bin2, bin3, bin4, _blockNumberBuff); - } - - std::size_t empty_blocks = 0; - for (auto blockNumber : _blockNumberBuff) { - const auto block = readBlockOfInteractions(_blockMap.blocks[blockNumber], _contactRecordBuff); - if (!block) { - empty_blocks++; - continue; - } - - // Obs we use open-closed interval instead of open-open like is done in straw - for (const auto &[b1, row] : block->find_overlap(bin1, bin2)) { - if (b1 >= bin2) { - // We're past the last row overlapping the query - break; - } - for (const auto &tp : row) { - const auto &b2 = tp.bin2_id; - if (b1 < bin1 || b2 < bin3) { - // We're upstream of the first column overlapping the query (if any) - continue; - } - - if (b2 >= bin4) { - // We're past the last column overlapping the query for the current row - break; - } - - auto record = processInteraction(SerializedPixel{b1, b2, tp.count}); - if (std::isfinite(record.count)) { - buffer.emplace_back( - PixelCoordinates{ - _bins.at(_footer->chrom1(), static_cast(record.bin1_id)), - _bins.at(_footer->chrom2(), static_cast(record.bin2_id))}, - record.count); - } - } - }; - } - if (sorted && _blockNumberBuff.size() - empty_blocks > 1) { - // Only interactions from the same block are guaranteed to already be sorted - std::sort(buffer.begin(), buffer.end()); - } -} - -inline SerializedPixel MatrixSelector::processInteraction(SerializedPixel record) { - const auto &c1Norm = _footer->c1Norm(); - const auto &c2Norm = _footer->c2Norm(); - const auto &expected = _footer->expectedValues(); - - assert(isInter() || record.bin1_id <= record.bin2_id); - - const auto skipNormalization = - normalizationMethod() == NormalizationMethod::NONE || matrix_type() == MatrixType::expected; - - if (!skipNormalization) { - const auto bin1 = 
static_cast(record.bin1_id); - const auto bin2 = static_cast(record.bin2_id); - assert(bin1 < c1Norm.size()); - assert(bin2 < c2Norm.size()); - record.count /= static_cast(c1Norm[bin1] * c2Norm[bin2]); - } - - record.bin1_id *= resolution(); - record.bin2_id *= resolution(); - - if (matrix_type() == MatrixType::observed) { - return record; - } - - const auto expectedCount = [&]() { - if (isInter()) { - return float(avgCount()); - } - - const auto i = static_cast((record.bin2_id - record.bin1_id) / resolution()); - assert(i < expected.size()); - return float(expected[i]); - }(); - - if (matrix_type() == MatrixType::expected) { - record.count = expectedCount; - return record; - } - - assert(matrix_type() == MatrixType::oe); - record.count /= expectedCount; - - return record; -} - -/* -inline void MatrixSelector::readBlockOfInteractionsV6(BinaryBuffer &src, - std::vector &dest) { - assert(src.i == sizeof(std::int32_t)); - - constexpr auto recordSize = sizeof(std::int32_t) + sizeof(std::int32_t) + sizeof(float); - - const auto srcSize = sizeof(std::int32_t) + (src.buffer.size() * sizeof(char)); - const auto destSize = recordSize * dest.size(); - - if (srcSize != destSize) { - throw std::runtime_error(fmt::format( - FMT_STRING("binary buffer appears to be corrupted: expected {}B, found {}B"), destSize, - srcSize)); - } - - std::generate(dest.begin(), dest.end(), [&]() { - // clang-format off - return SerializedPixel{src.read(), - src.read(), - src.read()}; - // clang-format on - }); - return; -} -*/ - -inline std::shared_ptr MatrixSelector::readBlockOfInteractions( - indexEntry idx, std::vector &buffer) { - buffer.clear(); - if (idx.size <= 0) { - return {nullptr}; - } - - if (auto it = _blockCache.find(static_cast(idx.position)); it != _blockCache.end()) { - return it->second; - } - - _fs->readAndInflate(idx, _buffer.buffer); - _buffer.i = 0; - - const auto nRecords = static_cast(_buffer.read()); - buffer.resize(nRecords); - - // if (_fs->version() == 6) { - // 
readBlockOfInteractionsV6(_buffer, buffer); - // auto it = - // _blockCache.emplace(static_cast(idx.position), - // InteractionBlock(buffer)); - // return it.first->second; - // } - - const auto bin1Offset = _buffer.read(); - const auto bin2Offset = _buffer.read(); - - const auto i16Counts = _buffer.read() == 0; - - auto readUseShortBinFlag = [&]() { - if (_fs->version() > 8) { - return _buffer.read() == 0; - } - return true; - }; - - const auto i16Bin1 = readUseShortBinFlag(); - const auto i16Bin2 = readUseShortBinFlag(); - - const auto type = static_cast(_buffer.read()); - if (type != 1 && type != 2) { - throw std::runtime_error( - fmt::format(FMT_STRING("uknown interaction type \"{}\". Supported types: 1, 2"), type)); - } - - switch (type) { - case 1: - readBlockOfInteractionsType1Dispatcher(i16Bin1, i16Bin2, i16Counts, bin1Offset, bin2Offset, - _buffer, buffer); - break; - case 2: - if (i16Counts) { - readBlockOfInteractionsType2(bin1Offset, bin2Offset, _buffer, buffer); - break; - } - readBlockOfInteractionsType2(bin1Offset, bin2Offset, _buffer, buffer); - break; - default: - assert(false); - std::abort(); - } - - auto it = _blockCache.emplace(static_cast(idx.position), InteractionBlock(buffer)); - return it.first->second; -} - -inline void MatrixSelector::readBlockOfInteractionsType1Dispatcher( - bool i16Bin1, bool i16Bin2, bool i16Counts, std::int32_t bin1Offset, std::int32_t bin2Offset, - BinaryBuffer &src, std::vector &dest) noexcept { - using BS = std::int16_t; // Short type for bins - using CS = std::int16_t; // Short type for count - - using BL = std::int32_t; // Long type for bins - using CL = float; // Long type for count - - if (i16Bin1 && i16Bin2 && i16Counts) { - readBlockOfInteractionsType1(bin1Offset, bin2Offset, src, dest); - return; - } - if (!i16Bin1 && i16Bin2 && i16Counts) { - readBlockOfInteractionsType1(bin1Offset, bin2Offset, src, dest); - return; - } - if (i16Bin1 && !i16Bin2 && i16Counts) { - readBlockOfInteractionsType1(bin1Offset, 
bin2Offset, src, dest); - return; - } - if (i16Bin1 && i16Bin2 && !i16Counts) { - readBlockOfInteractionsType1(bin1Offset, bin2Offset, src, dest); - return; - } - if (!i16Bin1 && !i16Bin2 && i16Counts) { - readBlockOfInteractionsType1(bin1Offset, bin2Offset, src, dest); - return; - } - if (!i16Bin1 && i16Bin2 && !i16Counts) { - readBlockOfInteractionsType1(bin1Offset, bin2Offset, src, dest); - return; - } - if (i16Bin1 && !i16Bin2 && !i16Counts) { - readBlockOfInteractionsType1(bin1Offset, bin2Offset, src, dest); - return; - } - assert(!i16Bin1 && !i16Bin2 && !i16Counts); - readBlockOfInteractionsType1(bin1Offset, bin2Offset, src, dest); -} - -template -inline void MatrixSelector::readBlockOfInteractionsType1( - std::int32_t bin1Offset, std::int32_t bin2Offset, BinaryBuffer &src, - std::vector &dest) noexcept { - using i16 = std::int16_t; - using i32 = std::int32_t; - using f32 = float; - static_assert(std::is_same::value || std::is_same::value, ""); - static_assert(std::is_same::value || std::is_same::value, ""); - static_assert(std::is_same::value || std::is_same::value, ""); - - constexpr auto expectedOffsetV7 = (3 * sizeof(i32)) + (2 * sizeof(char)); - constexpr auto expectedOffsetV8plus = expectedOffsetV7 + (2 * sizeof(char)); - std::ignore = expectedOffsetV7; - std::ignore = expectedOffsetV8plus; - assert(src.i == expectedOffsetV7 || src.i == expectedOffsetV8plus); - - const auto expectedNumRecords = dest.size(); - dest.clear(); - const auto numRows = static_cast(src.read()); - for (i32 i = 0; i < numRows; ++i) { - const auto bin2 = bin2Offset + static_cast(src.read()); - - const auto numCols = static_cast(src.read()); - for (i32 j = 0; j < numCols; ++j) { - const auto bin1 = bin1Offset + static_cast(src.read()); - - const auto counts = static_cast(src.read()); - dest.push_back(SerializedPixel{bin1, bin2, counts}); - } - } - - std::ignore = expectedNumRecords; - assert(expectedNumRecords == dest.size()); -} - -template -inline void 
MatrixSelector::readBlockOfInteractionsType2( - std::int32_t bin1Offset, std::int32_t bin2Offset, BinaryBuffer &src, - std::vector &dest) noexcept { - using i16 = std::int16_t; - using i32 = std::int32_t; - using f32 = float; - static_assert(std::is_same::value || std::is_same::value, ""); - - const auto nPts = src.read(); - const auto w = static_cast(src.read()); - - constexpr auto i16Sentinel = (std::numeric_limits::lowest)(); - constexpr auto i16Counts = std::is_same::value; - - auto isValid = [&](CountType n) { - return (i16Counts && static_cast(n) != i16Sentinel) || - (!i16Counts && !std::isnan(static_cast(n))); - }; - - dest.reserve(static_cast(nPts)); - dest.clear(); - for (i32 i = 0; i < nPts; ++i) { - const auto count = src.read(); - if (!isValid(count)) { - continue; - } - const auto row = i / w; - const auto col = i - row * w; - const auto bin1 = bin1Offset + col; - const auto bin2 = bin2Offset + row; - - dest.emplace_back(SerializedPixel{bin1, bin2, static_cast(count)}); - } -} - -inline BlockMap MatrixSelector::readBlockMap(HiCFileStream &fs, const HiCFooter &footer) { - if (footer.fileOffset() == -1) { - // Footer does not exist. 
However, query validity is assessed elswehere - return {}; - } - - BlockMap buffer{}; - fs.readBlockMap(footer.fileOffset(), footer.chrom1(), footer.chrom2(), footer.unit(), - footer.resolution(), buffer); - return buffer; -} - -inline void MatrixSelector::readBlockNumbers(std::int64_t bin1, std::int64_t bin2, - std::int64_t bin3, std::int64_t bin4, - std::set &buffer) const { - const auto blockBinCount = _blockMap.blockBinCount; - const auto blockColumnCount = _blockMap.blockColumnCount; - - const auto col1 = bin1 / blockBinCount; - const auto col2 = (bin2 + 1) / blockBinCount; - const auto row1 = bin3 / blockBinCount; - const auto row2 = (bin4 + 1) / blockBinCount; - - // check region part that overlaps with lower left triangle but only if intrachromosomal - const auto checkLowerLeftTri = isIntra(); - buffer.clear(); - // first check the upper triangular matrix_type - for (auto row = row1; row <= row2; ++row) { - for (auto col = col1; col <= col2; ++col) { - buffer.insert(static_cast(row * blockColumnCount + col)); - if (checkLowerLeftTri) { - buffer.insert(static_cast(col * blockColumnCount + row)); - } - } - } -} - -inline void MatrixSelector::readBlockNumbersV9Intra(std::int64_t bin1, std::int64_t bin2, - std::int64_t bin3, std::int64_t bin4, - std::set &buffer) const { - const auto blockBinCount = _blockMap.blockBinCount; - const auto blockColumnCount = _blockMap.blockColumnCount; - - const auto translatedLowerPAD = (bin1 + bin3) / 2 / blockBinCount; - const auto translatedHigherPAD = (bin2 + bin4) / 2 / blockBinCount + 1; - const auto translatedNearerDepth = static_cast( - std::log2(1.0 + double(std::abs(bin1 - bin4)) / std::sqrt(2.0) / blockBinCount)); - const auto translatedFurtherDepth = static_cast( - std::log2(1.0 + double(std::abs(bin2 - bin3)) / std::sqrt(2.0) / blockBinCount)); - - // because code above assumes above diagonal; but we could be below diagonal - const auto nearerDepth = [&]() -> std::int64_t { - if ((bin1 > bin4 && bin2 < bin3) || (bin2 
> bin3 && bin1 < bin4)) { - return 0; - } - return std::min(translatedNearerDepth, translatedFurtherDepth); - }(); - - // +1; integer divide rounds down - const auto furtherDepth = std::max(translatedNearerDepth, translatedFurtherDepth) + 1; - - buffer.clear(); - for (auto depth = nearerDepth; depth <= furtherDepth; ++depth) { - for (auto pad = translatedLowerPAD; pad <= translatedHigherPAD; ++pad) { - buffer.insert(static_cast(depth * blockColumnCount + pad)); - } - } -} - -inline void MatrixSelector::clearBlockCache() noexcept { _blockCache.reset(); } -constexpr double MatrixSelector::blockCacheHitRate() const noexcept { - return _blockCache.hit_rate(); -} -inline std::size_t MatrixSelector::blockCacheSize() const noexcept { return _blockCache.size(); } -constexpr std::size_t MatrixSelector::blockCacheSizeBytes() const noexcept { - return _blockCache.size_in_bytes(); -} -constexpr std::size_t MatrixSelector::blockCacheHits() const noexcept { return _blockCache.hits(); } -constexpr std::size_t MatrixSelector::blockCacheMisses() const noexcept { - return _blockCache.misses(); -} - -} // namespace hictk::internal diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index 5a31ad9c..c2926b5d 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -12,28 +12,31 @@ #include #include +#include "hictk/hic/block_reader.hpp" #include "hictk/hic/common.hpp" #include "hictk/hic/filestream.hpp" #include "hictk/hic/hic_file_stream.hpp" #include "hictk/hic/hic_footer.hpp" #include "hictk/hic/hic_header.hpp" -#include "hictk/hic/hic_matrix_selector.hpp" +// #include "hictk/hic/hic_matrix_selector.hpp" +#include "hictk/hic/pixel_selector.hpp" -namespace hictk { +namespace hictk::hic { class HiCFile { // clang-format off using FooterCacheT = std::unordered_map>; // clang-format on - std::shared_ptr _fs{}; - FooterCacheT _footers{}; + mutable std::shared_ptr _fs{}; + mutable FooterCacheT _footers{}; MatrixType 
_type{MatrixType::observed}; MatrixUnit _unit{MatrixUnit::BP}; - internal::BlockLRUCache _block_cache{}; - BinTable _bins{}; + mutable std::shared_ptr _block_cache{}; + std::shared_ptr _bins{}; public: + using QUERY_TYPE = GenomicInterval::Type; explicit HiCFile(std::string url_, std::uint32_t resolution_, MatrixType type_ = MatrixType::observed, MatrixUnit unit_ = MatrixUnit::BP, // TODO consider expressing cache size in terms of number of pixels @@ -48,39 +51,40 @@ class HiCFile { [[nodiscard]] const Reference &chromosomes() const noexcept; [[nodiscard]] const std::string &assembly() const noexcept; [[nodiscard]] const std::vector &avail_resolutions() const noexcept; - [[nodiscard]] constexpr std::uint32_t resolution() const noexcept; - - [[nodiscard]] internal::MatrixSelector get_matrix_selector(const Chromosome &chrom, - NormalizationMethod norm); - [[nodiscard]] internal::MatrixSelector get_matrix_selector(const std::string &chromName, - NormalizationMethod norm); - [[nodiscard]] internal::MatrixSelector get_matrix_selector(std::uint32_t chrom_id, - NormalizationMethod norm); - - [[nodiscard]] internal::MatrixSelector get_matrix_selector(const Chromosome &chrom1, - const Chromosome &chrom2, - NormalizationMethod norm); - [[nodiscard]] internal::MatrixSelector get_matrix_selector(const std::string &chrom1_name, - const std::string &chrom2_name, - NormalizationMethod norm); - [[nodiscard]] internal::MatrixSelector get_matrix_selector(std::uint32_t chrom1_id, - std::uint32_t chrom2_id, - NormalizationMethod norm); + [[nodiscard]] std::uint32_t resolution() const noexcept; + [[nodiscard]] PixelSelector fetch(std::string_view query, + NormalizationMethod norm = NormalizationMethod::NONE, + QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + [[nodiscard]] PixelSelector fetch(std::string_view chrom_name, std::uint32_t start, + std::uint32_t end, + NormalizationMethod norm = NormalizationMethod::NONE) const; + [[nodiscard]] PixelSelector fetch(std::string_view range1, 
std::string_view range2, + NormalizationMethod norm = NormalizationMethod::NONE, + QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + [[nodiscard]] PixelSelector fetch(std::string_view chrom1_name, std::uint32_t start1, + std::uint32_t end1, std::string_view chrom2_name, + std::uint32_t start2, std::uint32_t end2, + NormalizationMethod norm = NormalizationMethod::NONE) const; [[nodiscard]] std::size_t num_cached_footers() const noexcept; void purge_footer_cache(); private: [[nodiscard]] std::shared_ptr get_footer( std::uint32_t chrom1_id, std::uint32_t chrom2_id, MatrixType matrix_type, - NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution); + NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution) const; + + [[nodiscard]] PixelSelector fetch(const Chromosome &chrom1, std::uint32_t start1, + std::uint32_t end1, const Chromosome &chrom2, + std::uint32_t start2, std::uint32_t end2, + NormalizationMethod norm = NormalizationMethod::NONE) const; }; namespace utils { [[nodiscard]] bool is_hic_file(const std::filesystem::path &path); } // namespace utils -} // namespace hictk +} // namespace hictk::hic #include "../../hic_file_impl.hpp" #include "../../hic_file_utils_impl.hpp" diff --git a/src/hic/include/hictk/hic/block_reader.hpp b/src/hic/include/hictk/hic/block_reader.hpp new file mode 100644 index 00000000..d7202b71 --- /dev/null +++ b/src/hic/include/hictk/hic/block_reader.hpp @@ -0,0 +1,130 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include + +#include "hictk/chromosome.hpp" +#include "hictk/hic/cache.hpp" +#include "hictk/hic/filestream.hpp" +#include "hictk/hic/hic_file_stream.hpp" +#include "hictk/hic/index.hpp" + +namespace hictk::hic::internal { + +class BlockGrid { + struct Node { + std::shared_ptr block{}; + const Node* next_right{}; + const Node* next_down{}; + std::size_t + row{}; // These should be in absolute term (i.e. 
first/last row and col mapping to block) + std::size_t + col{}; // These should be in absolute term (i.e. first/last row and col mapping to block) + }; + + std::vector _grid{}; + + public: + class iterator; + BlockGrid() = default; + BlockGrid(const std::vector& index, std::size_t block_bin_count); + + [[nodiscard]] auto begin() const noexcept -> iterator; + [[nodiscard]] auto end() const noexcept -> iterator; + [[nodiscard]] std::size_t size() const noexcept; + + class iterator { + const Node* _node{}; + std::size_t _current_row{(std::numeric_limits::max)()}; + + public: + using difference_type = std::ptrdiff_t; + using value_type = Node; + using pointer = Node*; + using const_pointer = const Node*; + using reference = Node&; + using const_reference = const Node&; + using iterator_category = std::input_iterator_tag; + + iterator() = default; + explicit iterator(const Node& head); + [[nodiscard]] bool operator==(const iterator& other) const noexcept; + [[nodiscard]] bool operator!=(const iterator& other) const noexcept; + + auto operator++() noexcept -> iterator&; + auto operator++(int) noexcept -> iterator; + + [[nodiscard]] auto operator*() noexcept -> const_reference; + [[nodiscard]] auto operator->() noexcept -> const_pointer; + }; +}; + +class BinaryBuffer { + std::string _buffer{}; + std::size_t _i{}; + + public: + BinaryBuffer() = default; + template ::value>::type* = nullptr> + T read(); + + // Return the offset of the underlying buffer. Useful for error checking + [[nodiscard]] std::size_t operator()() const noexcept; + + // Reset and return ref to underlying buffer so that buff can be refilled + std::string& reset() noexcept; +}; + +class HiCBlockReader { + std::shared_ptr _hfs{}; + Index _index{}; + std::shared_ptr _blk_cache{}; // This should be passed in by file. 
Key should be + // changed from size_t to {chrom1, chrom2, size_t} + // We need the entire bin table in order to map pixels to abs bin ids + std::shared_ptr _bins{}; + BlockGrid _block_grid{}; + + BinaryBuffer _bbuffer{}; + std::vector _tmp_buffer{}; + + public: + HiCBlockReader() = default; + HiCBlockReader(std::shared_ptr hfs, const HiCFooter& footer, + std::shared_ptr bins_, std::shared_ptr block_cache_, + const PixelCoordinates& coords1, const PixelCoordinates& coords2); + + [[nodiscard]] explicit operator bool() const noexcept; + + [[nodiscard]] const Chromosome& chrom1() const noexcept; + [[nodiscard]] const Chromosome& chrom2() const noexcept; + [[nodiscard]] const BinTable& bins() const noexcept; + [[nodiscard]] const BlockGrid& grid() const; + + [[nodiscard]] double sum() const noexcept; + [[nodiscard]] double avg() const noexcept; + + [[nodiscard]] std::shared_ptr read(const BlockIndex& idx); + + private: + void find_overlapping_blocks(const PixelCoordinates& coords1, const PixelCoordinates& coords2); + [[nodiscard]] static Index read_index(HiCFileStream& hfs, const HiCFooter& footer); + static void read_dispatcher_type1_block(bool i16Bin1, bool i16Bin2, bool i16Counts, + std::int32_t bin1Offset, std::int32_t bin2Offset, + BinaryBuffer& src, + std::vector& dest) noexcept; + template + static void read_type1_block(std::int32_t bin1Offset, std::int32_t bin2Offset, BinaryBuffer& src, + std::vector& dest) noexcept; + + template + static void read_type2_block(std::int32_t bin1Offset, std::int32_t bin2Offset, BinaryBuffer& src, + std::vector& dest) noexcept; +}; + +} // namespace hictk::hic::internal + +#include "../../../block_reader_impl.hpp" diff --git a/src/hic/include/hictk/hic/cache.hpp b/src/hic/include/hictk/hic/cache.hpp index a70509a8..a6c66e00 100644 --- a/src/hic/include/hictk/hic/cache.hpp +++ b/src/hic/include/hictk/hic/cache.hpp @@ -11,22 +11,27 @@ #include #include +#include "hictk/chromosome.hpp" #include "hictk/hic/common.hpp" #include 
"hictk/pixel.hpp" -namespace hictk::internal { +namespace hictk::hic::internal { class InteractionBlock { + public: struct ThinPixel { - std::int64_t bin2_id{}; + std::uint64_t bin2_id{}; float count{}; }; using Row = std::vector; - using BuffT = phmap::btree_map; + + private: + using BuffT = phmap::btree_map; + std::size_t _id{}; BuffT _interactions{}; - std::int64_t _first_col{}; - std::int64_t _last_col{}; + const Chromosome* _chrom1{}; + const Chromosome* _chrom2{}; public: using iterator = BuffT::iterator; @@ -43,7 +48,19 @@ class InteractionBlock { }; InteractionBlock() = default; - explicit InteractionBlock(const std::vector& pixels); + InteractionBlock(std::size_t id_, const std::vector& pixels); + + friend constexpr bool operator<(const InteractionBlock& a, const InteractionBlock& b) noexcept; + friend constexpr bool operator==(const InteractionBlock& a, const InteractionBlock& b) noexcept; + friend constexpr bool operator!=(const InteractionBlock& a, const InteractionBlock& b) noexcept; + + friend constexpr bool operator<(const InteractionBlock& a, std::size_t b_id) noexcept; + friend constexpr bool operator==(const InteractionBlock& a, std::size_t b_id) noexcept; + friend constexpr bool operator!=(const InteractionBlock& a, std::size_t b_id) noexcept; + + friend constexpr bool operator<(std::size_t a_id, const InteractionBlock& b) noexcept; + friend constexpr bool operator==(std::size_t a_id, const InteractionBlock& b) noexcept; + friend constexpr bool operator!=(std::size_t a_id, const InteractionBlock& b) noexcept; [[nodiscard]] auto operator()() const noexcept -> const BuffT&; @@ -55,20 +72,30 @@ class InteractionBlock { [[nodiscard]] auto end() const noexcept -> const_iterator; [[nodiscard]] auto cend() const noexcept -> const_iterator; - [[nodiscard]] auto find_overlap(std::int64_t first_row, std::int64_t last_row) const noexcept + [[nodiscard]] std::size_t id() const noexcept; + [[nodiscard]] const Chromosome& chrom1() const noexcept; + 
[[nodiscard]] const Chromosome& chrom2() const noexcept; + + [[nodiscard]] auto at(std::uint64_t row) const noexcept -> const_iterator; + [[nodiscard]] auto find_overlap(std::uint64_t first_row, std::uint64_t last_row) const noexcept -> Overlap; + [[nodiscard]] bool has_overlap(std::uint64_t first_row, std::uint64_t last_row) const noexcept; + [[nodiscard]] std::size_t size() const noexcept; [[nodiscard]] std::size_t size_in_bytes() const noexcept; +}; + +struct InteractionBlockCmp { + using is_transparent = void; - [[nodiscard]] std::int64_t first_row() const noexcept; - [[nodiscard]] std::int64_t last_row() const noexcept; - [[nodiscard]] std::int64_t first_col() const noexcept; - [[nodiscard]] std::int64_t last_col() const noexcept; + constexpr bool operator()(const InteractionBlock& a, const InteractionBlock& b) const noexcept; + constexpr bool operator()(const InteractionBlock& a, std::size_t b_id) const noexcept; + constexpr bool operator()(std::size_t a_id, const InteractionBlock& b) const noexcept; }; class BlockLRUCache { - using MapT = tsl::ordered_map>; + using MapT = tsl::ordered_map>; using key_t = MapT::key_type; using mapped_type = MapT::mapped_type; using iterator = MapT::iterator; @@ -112,6 +139,6 @@ class BlockLRUCache { void erase(iterator it); }; -} // namespace hictk::internal +} // namespace hictk::hic::internal #include "../../../cache_impl.hpp" diff --git a/src/hic/include/hictk/hic/common.hpp b/src/hic/include/hictk/hic/common.hpp index 32519529..8bc8221d 100644 --- a/src/hic/include/hictk/hic/common.hpp +++ b/src/hic/include/hictk/hic/common.hpp @@ -16,7 +16,9 @@ #include #include -namespace hictk { +#include "hictk/common.hpp" + +namespace hictk::hic { struct SerializedPixel { std::int64_t bin1_id{}; @@ -52,9 +54,9 @@ struct indexEntry { constexpr bool operator!=(const indexEntry &other) const noexcept { return !(*this == other); } }; -} // namespace hictk +} // namespace hictk::hic -namespace hictk { +namespace hictk::hic { enum class 
NormalizationMethod { NONE, @@ -144,10 +146,10 @@ enum class MatrixUnit { BP, FRAG }; throw std::runtime_error("Invalid unit \"" + s + "\""); } -} // namespace hictk +} // namespace hictk::hic template <> -struct fmt::formatter { +struct fmt::formatter { static constexpr auto parse(format_parse_context &ctx) -> decltype(ctx.begin()) { if (ctx.begin() != ctx.end() && *ctx.begin() != '}') { throw fmt::format_error("invalid format"); @@ -156,39 +158,39 @@ struct fmt::formatter { } template - static auto format(const hictk::NormalizationMethod n, FormatContext &ctx) + static auto format(const hictk::hic::NormalizationMethod n, FormatContext &ctx) -> decltype(ctx.out()) { + using NM = hictk::hic::NormalizationMethod; switch (n) { - case hictk::NormalizationMethod::NONE: + case NM::NONE: return fmt::format_to(ctx.out(), FMT_STRING("NONE")); - case hictk::NormalizationMethod::VC: + case NM::VC: return fmt::format_to(ctx.out(), FMT_STRING("VC")); - case hictk::NormalizationMethod::VC_SQRT: + case NM::VC_SQRT: return fmt::format_to(ctx.out(), FMT_STRING("VC_SQRT")); - case hictk::NormalizationMethod::KR: + case NM::KR: return fmt::format_to(ctx.out(), FMT_STRING("KR")); - case hictk::NormalizationMethod::SCALE: + case NM::SCALE: return fmt::format_to(ctx.out(), FMT_STRING("SCALE")); - case hictk::NormalizationMethod::INTER_VC: + case NM::INTER_VC: return fmt::format_to(ctx.out(), FMT_STRING("INTER_VC")); - case hictk::NormalizationMethod::INTER_KR: + case NM::INTER_KR: return fmt::format_to(ctx.out(), FMT_STRING("INTER_KR")); - case hictk::NormalizationMethod::INTER_SCALE: + case NM::INTER_SCALE: return fmt::format_to(ctx.out(), FMT_STRING("INTER_SCALE")); - case hictk::NormalizationMethod::GW_VC: + case NM::GW_VC: return fmt::format_to(ctx.out(), FMT_STRING("GW_VC")); - case hictk::NormalizationMethod::GW_KR: + case NM::GW_KR: return fmt::format_to(ctx.out(), FMT_STRING("GW_KR")); - case hictk::NormalizationMethod::GW_SCALE: + case NM::GW_SCALE: return 
fmt::format_to(ctx.out(), FMT_STRING("GW_SCALE")); } - assert(false); - std::abort(); + HICTK_UNREACHABLE_CODE; } }; template <> -struct fmt::formatter { +struct fmt::formatter { static constexpr auto parse(format_parse_context &ctx) -> decltype(ctx.begin()) { if (ctx.begin() != ctx.end() && *ctx.begin() != '}') { throw fmt::format_error("invalid format"); @@ -197,22 +199,21 @@ struct fmt::formatter { } template - static auto format(const hictk::MatrixType t, FormatContext &ctx) -> decltype(ctx.out()) { + static auto format(const hictk::hic::MatrixType t, FormatContext &ctx) -> decltype(ctx.out()) { switch (t) { - case hictk::MatrixType::observed: + case hictk::hic::MatrixType::observed: return fmt::format_to(ctx.out(), FMT_STRING("observed")); - case hictk::MatrixType::oe: + case hictk::hic::MatrixType::oe: return fmt::format_to(ctx.out(), FMT_STRING("oe")); - case hictk::MatrixType::expected: + case hictk::hic::MatrixType::expected: return fmt::format_to(ctx.out(), FMT_STRING("expected")); } - assert(false); - std::abort(); + HICTK_UNREACHABLE_CODE; } }; template <> -struct fmt::formatter { +struct fmt::formatter { static constexpr auto parse(format_parse_context &ctx) -> decltype(ctx.begin()) { if (ctx.begin() != ctx.end() && *ctx.begin() != '}') { throw fmt::format_error("invalid format"); @@ -221,15 +222,14 @@ struct fmt::formatter { } template - static auto format(const hictk::MatrixUnit u, FormatContext &ctx) -> decltype(ctx.out()) { + static auto format(const hictk::hic::MatrixUnit u, FormatContext &ctx) -> decltype(ctx.out()) { switch (u) { - case hictk::MatrixUnit::BP: + case hictk::hic::MatrixUnit::BP: return fmt::format_to(ctx.out(), FMT_STRING("BP")); - case hictk::MatrixUnit::FRAG: + case hictk::hic::MatrixUnit::FRAG: return fmt::format_to(ctx.out(), FMT_STRING("FRAG")); } - assert(false); - std::abort(); + HICTK_UNREACHABLE_CODE; } }; diff --git a/src/hic/include/hictk/hic/filestream.hpp b/src/hic/include/hictk/hic/filestream.hpp index 
21f45366..20dcb011 100644 --- a/src/hic/include/hictk/hic/filestream.hpp +++ b/src/hic/include/hictk/hic/filestream.hpp @@ -14,7 +14,7 @@ #include "hictk/hic/common.hpp" -namespace hictk::internal::filestream { +namespace hictk::hic::internal::filestream { class FileStream { std::string path_{}; @@ -65,6 +65,6 @@ class FileStream { [[nodiscard]] static std::ifstream open_file(const std::string &path, std::ifstream::openmode mode); }; -} // namespace hictk::internal::filestream +} // namespace hictk::hic::internal::filestream #include "../../../filestream_impl.hpp" diff --git a/src/hic/include/hictk/hic/hic_file_stream.hpp b/src/hic/include/hictk/hic/hic_file_stream.hpp index c846994b..30a8304e 100644 --- a/src/hic/include/hictk/hic/hic_file_stream.hpp +++ b/src/hic/include/hictk/hic/hic_file_stream.hpp @@ -18,15 +18,20 @@ #include "hictk/hic/filestream.hpp" #include "hictk/hic/hic_footer.hpp" #include "hictk/hic/hic_header.hpp" +#include "hictk/hic/index.hpp" -namespace hictk::internal { +namespace hictk::hic::internal { -struct BlockMap { - phmap::btree_map blocks{}; - std::int32_t blockBinCount{}; - std::int32_t blockColumnCount{}; - double sumCount{}; -}; +// TODO REMOVE +// struct BlockIndex { +// phmap::btree_map blocks{}; +// std::int32_t blockBinCount{}; +// std::int32_t blockColumnCount{}; +// double sumCount{}; +// +// [[nodiscard]] indexEntry at(std::size_t id) const noexcept; +// [[nodiscard]] indexEntry at(std::size_t row, std::size_t col) const noexcept; +// }; class HiCFileStream { using Decompressor = UniquePtrWithDeleter; @@ -54,9 +59,10 @@ class HiCFileStream { std::string &buff); [[nodiscard]] static MatrixUnit readMatrixUnit(filestream::FileStream &fs, std::string &buff); - void readBlockMap(std::int64_t fileOffset, const Chromosome &chrom1, const Chromosome &chrom2, - MatrixUnit wantedUnit, std::int64_t wantedResolution, BlockMap &buffer); - void readAndInflate(indexEntry idx, std::string &plainTextBuffer); + [[nodiscard]] Index 
readBlockMap(std::int64_t fileOffset, const Chromosome &chrom1, + const Chromosome &chrom2, MatrixUnit wantedUnit, + std::int64_t wantedResolution); + void readAndInflate(const BlockIndex &idx, std::string &plainTextBuffer); [[nodiscard]] static bool checkMagicString(std::string url) noexcept; @@ -87,6 +93,6 @@ class HiCFileStream { [[nodiscard]] static auto init_decompressor() -> Decompressor; }; -} // namespace hictk::internal +} // namespace hictk::hic::internal #include "../../../hic_file_stream_impl.hpp" diff --git a/src/hic/include/hictk/hic/hic_footer.hpp b/src/hic/include/hictk/hic/hic_footer.hpp index 5fa01a03..e70fa6c0 100644 --- a/src/hic/include/hictk/hic/hic_footer.hpp +++ b/src/hic/include/hictk/hic/hic_footer.hpp @@ -11,7 +11,7 @@ #include "hictk/chromosome.hpp" #include "hictk/hic/common.hpp" -namespace hictk::internal { +namespace hictk::hic::internal { struct HiCFooterMetadata { std::string url{}; MatrixType matrix_type{MatrixType::observed}; @@ -61,6 +61,6 @@ class HiCFooter { [[nodiscard]] constexpr std::vector &c1Norm() noexcept; [[nodiscard]] constexpr std::vector &c2Norm() noexcept; }; -} // namespace hictk::internal +} // namespace hictk::hic::internal #include "../../../hic_footer_impl.hpp" diff --git a/src/hic/include/hictk/hic/hic_header.hpp b/src/hic/include/hictk/hic/hic_header.hpp index 56b44375..5bc75570 100644 --- a/src/hic/include/hictk/hic/hic_header.hpp +++ b/src/hic/include/hictk/hic/hic_header.hpp @@ -12,7 +12,7 @@ #include "hictk/hic/common.hpp" #include "hictk/reference.hpp" -namespace hictk::internal { +namespace hictk::hic::internal { struct HiCHeader { std::string url{}; @@ -29,6 +29,6 @@ struct HiCHeader { bool operator!=(const HiCHeader &other) const noexcept; }; -} // namespace hictk::internal +} // namespace hictk::hic::internal #include "../../../hic_header_impl.hpp" diff --git a/src/hic/include/hictk/hic/hic_matrix_selector.hpp b/src/hic/include/hictk/hic/hic_matrix_selector.hpp deleted file mode 100644 index 
b454ec32..00000000 --- a/src/hic/include/hictk/hic/hic_matrix_selector.hpp +++ /dev/null @@ -1,108 +0,0 @@ -// Copyright (C) 2023 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "hictk/hic/cache.hpp" -#include "hictk/hic/common.hpp" -#include "hictk/hic/hic_file_stream.hpp" - -namespace hictk::internal { - -class MatrixSelector { - struct BinaryBuffer { - std::string buffer{}; - std::size_t i{}; - - template ::value>::type * = nullptr> - T read(); - }; - - std::shared_ptr _fs; - std::shared_ptr _footer; - BinTable _bins{}; - BlockMap _blockMap{}; - BlockLRUCache _blockCache{}; - std::set _blockNumberBuff{}; - std::vector _contactRecordBuff{}; - BinaryBuffer _buffer{}; - - public: - MatrixSelector() = delete; - MatrixSelector(std::shared_ptr fs, std::shared_ptr footer, - std::size_t block_cache_capacity); - - [[nodiscard]] const Chromosome &chrom1() const noexcept; - [[nodiscard]] const Chromosome &chrom2() const noexcept; - - [[nodiscard]] std::uint32_t resolution() const noexcept; - [[nodiscard]] MatrixType matrix_type() const noexcept; - [[nodiscard]] NormalizationMethod normalizationMethod() const noexcept; - [[nodiscard]] MatrixUnit matrixUnit() const noexcept; - - [[nodiscard]] std::int64_t numBins1() const noexcept; - [[nodiscard]] std::int64_t numBins2() const noexcept; - - [[nodiscard]] bool isIntra() const noexcept; - [[nodiscard]] bool isInter() const noexcept; - - [[nodiscard]] const std::vector &chrom1Norm() const noexcept; - [[nodiscard]] const std::vector &chrom2Norm() const noexcept; - - [[nodiscard]] inline double avgCount() const; - - void fetch(std::vector> &buffer, bool sorted = false); - // void fetch(const std::string &coord, std::vector> &buffer, bool sorted = false); - void fetch(std::int64_t start, std::int64_t end, std::vector> &buffer, - bool sorted = false); - - // void fetch(const std::string &coord1, const std::string &coord2, - // 
std::vector> &buffer, bool sorted = false); - void fetch(std::int64_t start1, std::int64_t end1, std::int64_t start2, std::int64_t end2, - std::vector> &buffer, bool sorted = false); - - void clearBlockCache() noexcept; - [[nodiscard]] constexpr double blockCacheHitRate() const noexcept; - [[nodiscard]] std::size_t blockCacheSize() const noexcept; - [[nodiscard]] constexpr std::size_t blockCacheSizeBytes() const noexcept; - [[nodiscard]] constexpr std::size_t blockCacheHits() const noexcept; - [[nodiscard]] constexpr std::size_t blockCacheMisses() const noexcept; - - private: - [[nodiscard]] static BlockMap readBlockMap(HiCFileStream &fs, const HiCFooter &footer); - - void readBlockNumbers(std::int64_t bin1, std::int64_t bin2, std::int64_t bin3, std::int64_t bin4, - std::set &buffer) const; - void readBlockNumbersV9Intra(std::int64_t bin1, std::int64_t bin2, std::int64_t bin3, - std::int64_t bin4, std::set &buffer) const; - [[nodiscard]] std::shared_ptr readBlockOfInteractions( - indexEntry idx, std::vector &buffer); - [[nodiscard]] SerializedPixel processInteraction(SerializedPixel record); - // static void readBlockOfInteractionsV6(BinaryBuffer &src, std::vector &dest); - - static void readBlockOfInteractionsType1Dispatcher(bool i16Bin1, bool i16Bin2, bool i16Counts, - std::int32_t bin1Offset, - std::int32_t bin2Offset, BinaryBuffer &src, - std::vector &dest) noexcept; - template - static void readBlockOfInteractionsType1(std::int32_t bin1Offset, std::int32_t bin2Offset, - BinaryBuffer &src, - std::vector &dest) noexcept; - - template - static void readBlockOfInteractionsType2(std::int32_t bin1Offset, std::int32_t bin2Offset, - BinaryBuffer &src, - std::vector &dest) noexcept; -}; - -} // namespace hictk::internal - -#include "../../../hic_matrix_selector_impl.hpp" diff --git a/src/hic/include/hictk/hic/index.hpp b/src/hic/include/hictk/hic/index.hpp new file mode 100644 index 00000000..6a401e5c --- /dev/null +++ b/src/hic/include/hictk/hic/index.hpp @@ -0,0 
+1,94 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include + +#include "hictk/chromosome.hpp" +#include "hictk/hic/cache.hpp" +#include "hictk/hic/filestream.hpp" +#include "hictk/hic/hic_file_stream.hpp" + +namespace hictk::hic::internal { + +struct BlockIndex { + std::size_t id{null_id}; // NOLINT + std::size_t file_offset{}; // NOLINT + std::size_t compressed_size_bytes{}; // NOLINT + + std::size_t first_row{}; + std::size_t last_row{}; + std::size_t first_col{}; + std::size_t last_col{}; + + static constexpr auto null_id = (std::numeric_limits::max)(); + + constexpr explicit operator bool() const noexcept; + friend constexpr bool operator<(const BlockIndex& a, const BlockIndex& b) noexcept; + friend constexpr bool operator==(const BlockIndex& a, const BlockIndex& b) noexcept; + friend constexpr bool operator!=(const BlockIndex& a, const BlockIndex& b) noexcept; + + friend constexpr bool operator<(const BlockIndex& a, std::size_t b_id) noexcept; + friend constexpr bool operator==(const BlockIndex& a, std::size_t b_id) noexcept; + friend constexpr bool operator!=(const BlockIndex& a, std::size_t b_id) noexcept; + + friend constexpr bool operator<(std::size_t a_id, const BlockIndex& b) noexcept; + friend constexpr bool operator==(std::size_t a_id, const BlockIndex& b) noexcept; + friend constexpr bool operator!=(std::size_t a_id, const BlockIndex& b) noexcept; +}; + +struct BlockIndexCmp { + using is_transparent = void; + + constexpr bool operator()(const BlockIndex& a, const BlockIndex& b) const noexcept; + constexpr bool operator()(const BlockIndex& a, std::size_t b_id) const noexcept; + constexpr bool operator()(std::size_t a_id, const BlockIndex& b) const noexcept; +}; + +// Map coordinates (bp) to block IDs +class Index { + // map block_ids to file offsets + const phmap::btree_set _block_map{}; + std::int32_t _version{}; + std::size_t _block_bin_count{}; + std::size_t _block_column_count{}; // columns of 
blocks per matrix? + double _sum_count{}; // sum + + Chromosome _chrom1{}; + Chromosome _chrom2{}; + + public: + static constexpr auto npos = (std::numeric_limits::max)(); + + Index() = default; + Index(Chromosome chrom1_, Chromosome chrom2_, phmap::btree_set blocks_, + std::int32_t version_, std::size_t block_bin_count_, std::size_t block_column_count_, + double sum_count_); + + [[nodiscard]] const Chromosome& chrom1() const noexcept; + [[nodiscard]] const Chromosome& chrom2() const noexcept; + [[nodiscard]] bool is_intra() const noexcept; + [[nodiscard]] constexpr double matrix_sum() const noexcept; + [[nodiscard]] constexpr std::size_t block_bin_count() const noexcept; + [[nodiscard]] constexpr std::size_t block_column_count() const noexcept; + + std::vector map_2d_query_to_blocks(const PixelCoordinates& coords1, + const PixelCoordinates& coords2); + void map_2d_query_to_blocks(const PixelCoordinates& coords1, const PixelCoordinates& coords2, + std::vector& buffer); + + const BlockIndex& at(std::size_t id) const; + + private: + void _map_2d_query_to_blocks(const PixelCoordinates& coords1, const PixelCoordinates& coords2, + std::vector& buffer); + void _map_2d_query_to_blocks_intra_v9plus(const PixelCoordinates& coords1, + const PixelCoordinates& coords2, + std::vector& buffer); +}; + +} // namespace hictk::hic::internal + +#include "../../../index_impl.hpp" diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp new file mode 100644 index 00000000..13f1c841 --- /dev/null +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -0,0 +1,139 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include "hictk/bin_table.hpp" +#include "hictk/hic/cache.hpp" +#include "hictk/hic/common.hpp" +#include "hictk/hic/hic_file_stream.hpp" +#include "hictk/hic/index.hpp" +#include "hictk/pixel.hpp" + +namespace hictk::hic { + +class PixelSelector { + mutable internal::HiCBlockReader 
_reader{}; + std::shared_ptr _footer{}; + + PixelCoordinates _coord1{}; + PixelCoordinates _coord2{}; + + public: + template + class iterator; + + PixelSelector() = delete; + PixelSelector(std::shared_ptr hfs_, + std::shared_ptr footer_, + std::shared_ptr cache_, + std::shared_ptr bins_, PixelCoordinates coords) noexcept; + + PixelSelector(std::shared_ptr hfs_, + std::shared_ptr footer_, + std::shared_ptr cache_, + std::shared_ptr bins_, PixelCoordinates coord1_, + PixelCoordinates coord2_) noexcept; + + [[nodiscard]] bool operator==(const PixelSelector &other) const noexcept; + [[nodiscard]] bool operator!=(const PixelSelector &other) const noexcept; + /* + template + [[nodiscard]] auto begin() const -> iterator; + template + [[nodiscard]] auto end() const -> iterator; + + template + [[nodiscard]] auto cbegin() const -> iterator; + template + [[nodiscard]] auto cend() const -> iterator; + */ + + template + [[nodiscard]] std::vector> read_all() const; + + [[nodiscard]] const PixelCoordinates &coord1() const noexcept; + [[nodiscard]] const PixelCoordinates &coord2() const noexcept; + + [[nodiscard]] const Chromosome &chrom1() const noexcept; + [[nodiscard]] const Chromosome &chrom2() const noexcept; + + [[nodiscard]] const BinTable &bins() const noexcept; + [[nodiscard]] const internal::HiCFooterMetadata &metadata() const noexcept; + + [[nodiscard]] bool is_inter() const noexcept; + [[nodiscard]] bool is_intra() const noexcept; + template + [[nodiscard]] N sum() const noexcept; + [[nodiscard]] double avg() const noexcept; + + private: + [[nodiscard]] SerializedPixel process_interaction(SerializedPixel record) const; + /* + public: + template + class iterator { + static_assert(std::is_arithmetic_v); + friend PixelSelector; + const PixelSelector *_sel{}; + + std::uint32_t _pos1{}; + std::uint32_t _pos2{}; + + std::size_t _col_i{}; + + std::shared_ptr _blk{}; + internal::InteractionBlock::const_iterator _row{}; + + mutable Pixel _value{}; + + public: + using 
difference_type = std::ptrdiff_t; + using value_type = Pixel; + using pointer = value_type *; + using const_pointer = const value_type *; + using reference = value_type &; + using const_reference = const value_type &; + using iterator_category = std::forward_iterator_tag; + + iterator() = default; + explicit iterator(const PixelSelector &sel); + [[nodiscard]] static auto at_end(const PixelSelector &sel) -> iterator; + + [[nodiscard]] bool operator==(const iterator &other) const noexcept; + [[nodiscard]] bool operator!=(const iterator &other) const noexcept; + + [[nodiscard]] bool operator<(const iterator &other) const noexcept; + [[nodiscard]] bool operator<=(const iterator &other) const noexcept; + + [[nodiscard]] bool operator>(const iterator &other) const noexcept; + [[nodiscard]] bool operator>=(const iterator &other) const noexcept; + + [[nodiscard]] auto operator*() const -> const_reference; + // [[nodiscard]] auto operator->() const -> const_pointer; + + auto operator++() -> iterator &; + auto operator++(int) -> iterator; + + private: + [[nodiscard]] bool discard() const noexcept; + [[nodiscard]] bool is_at_end() const noexcept; + [[nodiscard]] const BinTable &bins() const noexcept; + [[nodiscard]] const PixelCoordinates &coord1() const noexcept; + [[nodiscard]] const PixelCoordinates &coord2() const noexcept; + + [[nodiscard]] const internal::InteractionBlock::Row &row() const noexcept; + [[nodiscard]] std::uint32_t pos1() const noexcept; + [[nodiscard]] std::uint32_t pos2() const noexcept; + [[nodiscard]] N count() const noexcept; + + void seek_to_next_block(); + void seek_to_next_overlap() noexcept; + }; + */ +}; + +} // namespace hictk::hic + +#include "../../../pixel_selector_impl.hpp" diff --git a/src/hic/include/hictk/hic/suppress_compiler_warnings.hpp b/src/hic/include/hictk/hic/suppress_compiler_warnings.hpp deleted file mode 100644 index e87a5c8b..00000000 --- a/src/hic/include/hictk/hic/suppress_compiler_warnings.hpp +++ /dev/null @@ -1,38 +0,0 @@ 
-// Copyright (C) 2023 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#pragma once - -// Source: https://www.fluentcpp.com/2019/08/30/how-to-disable-a-warning-in-cpp/ - -// clang-format off - -// Defines for GCC and Clang -#if defined(__GNUC__) || defined(__clang__) - #define DO_PRAGMA(X) _Pragma(#X) // NOLINT(cppcoreguidelines-macro-usage) - #define DISABLE_WARNING_PUSH DO_PRAGMA(GCC diagnostic push) - #define DISABLE_WARNING_POP DO_PRAGMA(GCC diagnostic pop) - #define DISABLE_WARNING(warningName) DO_PRAGMA(GCC diagnostic ignored warningName) // NOLINT(cppcoreguidelines-macro-usage) -#endif - -// Defines specific to Clang -#ifdef __clang__ - #define DISABLE_WARNING_USELESS_CAST -#endif - -// Defines specific to GCC -#if defined(__GNUC__) && !defined(__clang__) - #define DISABLE_WARNING_USELESS_CAST DISABLE_WARNING("-Wuseless-cast") -#endif - -// Defines for unknown/unsupported compilers -#if !defined(__GNUC__) && !defined(__clang__) - #define DISABLE_WARNING - #define DISABLE_WARNING_PUSH - #define DISABLE_WARNING_POP - - #define DISABLE_WARNING_USELESS_CAST -#endif - -// clang-format on diff --git a/src/hic/index_impl.hpp b/src/hic/index_impl.hpp new file mode 100644 index 00000000..3391cf63 --- /dev/null +++ b/src/hic/index_impl.hpp @@ -0,0 +1,215 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include +#include + +namespace hictk::hic::internal { + +constexpr BlockIndex::operator bool() const noexcept { + return id != null_id && compressed_size_bytes != 0; +} + +constexpr bool operator<(const BlockIndex &a, const BlockIndex &b) noexcept { return a < b.id; } +constexpr bool operator==(const BlockIndex &a, const BlockIndex &b) noexcept { return a == b.id; } +constexpr bool operator!=(const BlockIndex &a, const BlockIndex &b) noexcept { return !(a == b); } + +constexpr bool operator<(const BlockIndex &a, std::size_t b_id) noexcept { return a.id < b_id; } +constexpr bool 
operator==(const BlockIndex &a, std::size_t b_id) noexcept { return a.id == b_id; } +constexpr bool operator!=(const BlockIndex &a, std::size_t b_id) noexcept { return !(a == b_id); } + +constexpr bool operator<(std::size_t a_id, const BlockIndex &b) noexcept { return a_id < b.id; } +constexpr bool operator==(std::size_t a_id, const BlockIndex &b) noexcept { return a_id == b.id; } +constexpr bool operator!=(std::size_t a_id, const BlockIndex &b) noexcept { return !(a_id == b); } + +constexpr bool BlockIndexCmp::operator()(const BlockIndex &a, const BlockIndex &b) const noexcept { + return a < b; +} +constexpr bool BlockIndexCmp::operator()(const BlockIndex &a, std::size_t b_id) const noexcept { + return a < b_id; +} +constexpr bool BlockIndexCmp::operator()(std::size_t a_id, const BlockIndex &b) const noexcept { + return a_id < b; +} + +inline Index::Index(Chromosome chrom1_, Chromosome chrom2_, + phmap::btree_set blocks_, std::int32_t version_, + std::size_t block_bin_count_, std::size_t block_column_count_, + double sum_count_) + : _block_map(std::move(blocks_)), + _version(version_), + _block_bin_count(block_bin_count_), + _block_column_count(block_column_count_), + _sum_count(sum_count_), + _chrom1(std::move(chrom1_)), + _chrom2(std::move(chrom2_)) { + if (_block_bin_count == 0) { + throw std::runtime_error("index is corrupted: blockBinCount=0."); + } + if (_block_column_count == 0) { + throw std::runtime_error("index is corrupted: blockColumnCount=0."); + } +} + +inline const Chromosome &Index::chrom1() const noexcept { return _chrom1; } +inline const Chromosome &Index::chrom2() const noexcept { return _chrom2; } +inline bool Index::is_intra() const noexcept { return _chrom1 == _chrom2; } + +constexpr double Index::matrix_sum() const noexcept { return _sum_count; } + +constexpr std::size_t Index::block_bin_count() const noexcept { return _block_bin_count; } + +constexpr std::size_t Index::block_column_count() const noexcept { return _block_column_count; } + 
+inline std::vector Index::map_2d_query_to_blocks(const PixelCoordinates &coords1, + const PixelCoordinates &coords2) { + std::vector buffer{}; + map_2d_query_to_blocks(coords1, coords2, buffer); + return buffer; +} + +inline const BlockIndex &Index::at(std::size_t id) const { + auto match = _block_map.find(id); + if (match == _block_map.end()) { + throw std::out_of_range(fmt::format(FMT_STRING("unable to find block #{}: out of range"), id)); + } + return *match; +} + +inline void Index::map_2d_query_to_blocks(const PixelCoordinates &coords1, + const PixelCoordinates &coords2, + std::vector &buffer) { + assert(coords1.is_intra()); + assert(coords2.is_intra()); + + const auto is_intra = coords1.bin1.chrom() == coords2.bin1.chrom(); + if (_version < 9 || is_intra) { + return _map_2d_query_to_blocks(coords1, coords2, buffer); + } + return _map_2d_query_to_blocks_intra_v9plus(coords1, coords2, buffer); +} + +inline void Index::_map_2d_query_to_blocks(const hictk::PixelCoordinates &coords1, + const hictk::PixelCoordinates &coords2, + std::vector &buffer) { + assert(coords1.bin1.chrom() == _chrom1 || coords1.bin1.chrom() == _chrom2); + assert(coords2.bin1.chrom() == _chrom1 || coords2.bin1.chrom() == _chrom2); + + auto bin1 = coords1.bin1.rel_id(); + auto bin2 = coords1.bin2.rel_id() + 1; + auto bin3 = coords2.bin1.rel_id(); + auto bin4 = coords2.bin2.rel_id() + 1; + + const auto is_intra = coords1.bin1.chrom() == coords2.bin1.chrom(); + + if (is_intra && bin1 > bin3) { + std::swap(bin1, bin3); + std::swap(bin2, bin4); + } + + const auto col1 = bin1 / _block_bin_count; + const auto col2 = (bin2 + 1) / _block_bin_count; + const auto row1 = bin3 / _block_bin_count; + const auto row2 = (bin4 + 1) / _block_bin_count; + + // check region part that overlaps with lower left triangle but only if intrachromosomal + const auto checkLowerLeftTri = is_intra; + phmap::btree_set tmp_buffer{}; + // first check the upper triangular matrix_type + for (auto row = row1; row <= row2; ++row) 
{ + for (auto col = col1; col <= col2; ++col) { + const auto id1 = row * _block_column_count + col; + auto match = _block_map.find(id1); + if (match != _block_map.end()) { + auto block = *match; + block.first_row = bin1; + block.last_row = bin2; + block.first_col = bin3; + block.last_col = bin4; + + tmp_buffer.emplace(block); + } + + if (checkLowerLeftTri) { + const auto id2 = col * _block_column_count + row; + match = _block_map.find(id2); + if (match != _block_map.end()) { + auto block = *match; + block.first_row = bin1; + block.last_row = bin2; + block.first_col = bin3; + block.last_col = bin4; + tmp_buffer.emplace(block); + } + } + } + } + + buffer.resize(tmp_buffer.size()); + std::copy(tmp_buffer.begin(), tmp_buffer.end(), buffer.begin()); +} + +inline void Index::_map_2d_query_to_blocks_intra_v9plus(const hictk::PixelCoordinates &coords1, + const hictk::PixelCoordinates &coords2, + std::vector &buffer) { + // https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#grid-structure + assert(coords1.bin1.chrom() == _chrom1 || coords1.bin1.chrom() == _chrom2); + assert(coords2.bin1.chrom() == _chrom1 || coords2.bin1.chrom() == _chrom2); + + auto bin1 = coords1.bin1.rel_id(); + auto bin2 = coords1.bin2.rel_id() + 1; + auto bin3 = coords2.bin1.rel_id(); + auto bin4 = coords2.bin2.rel_id() + 1; + + const auto is_intra = coords1.bin1.chrom() == coords2.bin1.chrom(); + + if (is_intra && bin1 > bin3) { + std::swap(bin1, bin3); + std::swap(bin2, bin4); + } + + const auto translatedLowerPAD = (bin1 + bin3) / 2 / _block_bin_count; + const auto translatedHigherPAD = (bin2 + bin4) / 2 / _block_column_count + 1; + const auto translatedNearerDepth = + static_cast(std::log2(1.0 + double(hictk::internal::abs_diff(bin1, bin4)) / + std::sqrt(2.0) / double(_block_bin_count))); + const auto translatedFurtherDepth = + static_cast(std::log2(1.0 + double(hictk::internal::abs_diff(bin2, bin3)) / + std::sqrt(2.0) / double(_block_bin_count))); + + // code above assumes diagonal; 
but we could be below diagonal + const auto nearerDepth = [&]() -> std::size_t { + if ((bin1 > bin4 && bin2 < bin3) || (bin2 > bin3 && bin1 < bin4)) { + return 0; + } + return std::min(translatedNearerDepth, translatedFurtherDepth); + }(); + + // +1; integer divide rounds down + const auto furtherDepth = std::max(translatedNearerDepth, translatedFurtherDepth) + 1; + phmap::btree_set block_ids{}; + for (auto depth = nearerDepth; depth <= furtherDepth; ++depth) { + for (auto pad = translatedLowerPAD; pad <= translatedHigherPAD; ++pad) { + const auto id = depth * _block_column_count + pad; + auto match = _block_map.find(id); + if (match != _block_map.end()) { + auto block = *match; + block.first_row = bin1; + block.last_row = bin2; + block.first_col = bin3; + block.last_col = bin4; // upper column bound is bin4 (bin3 is the lower bound) + block_ids.emplace(block); + } + } + } + buffer.resize(block_ids.size()); + std::copy(block_ids.begin(), block_ids.end(), buffer.begin()); +} + +} // namespace hictk::hic::internal diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp new file mode 100644 index 00000000..895c550e --- /dev/null +++ b/src/hic/pixel_selector_impl.hpp @@ -0,0 +1,401 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include "hictk/common.hpp" +#include "hictk/fmt.hpp" // TODO: remove me +#include "hictk/hic/hic_file_stream.hpp" + +namespace hictk::hic { + +inline PixelSelector::PixelSelector(std::shared_ptr hfs_, + std::shared_ptr footer_, + std::shared_ptr cache_, + std::shared_ptr bins_, + PixelCoordinates coords) noexcept + : PixelSelector(std::move(hfs_), std::move(footer_), std::move(cache_), std::move(bins_), + coords, coords) {} // copy both: a move here would be unsequenced relative to the copy of the first argument + +inline PixelSelector::PixelSelector(std::shared_ptr hfs_, + std::shared_ptr footer_, + std::shared_ptr cache_, + std::shared_ptr bins_, PixelCoordinates coord1_, + PixelCoordinates coord2_) noexcept + : _reader(std::move(hfs_), *footer_, std::move(bins_), std::move(cache_), coord1_, coord2_), 
+ _footer(std::move(footer_)), + _coord1(std::move(coord1_)), + _coord2(std::move(coord2_)) {} + +inline bool PixelSelector::operator==(const PixelSelector &other) const noexcept { + return _footer == other._footer && _coord1 == other._coord1 && _coord2 == other._coord2; +} + +inline bool PixelSelector::operator!=(const PixelSelector &other) const noexcept { + return !(*this == other); +} +/* +template +inline auto PixelSelector::cbegin() const -> iterator { + return iterator(*this); +} + +template +inline auto PixelSelector::cend() const -> iterator { + return iterator::at_end(*this); +} + +template +inline auto PixelSelector::begin() const -> iterator { + return this->cbegin(); +} + +template +inline auto PixelSelector::end() const -> iterator { + return this->cend(); +} +*/ + +inline SerializedPixel PixelSelector::process_interaction(SerializedPixel record) const { + const auto &c1Norm = _footer->c1Norm(); + const auto &c2Norm = _footer->c2Norm(); + const auto &expected = _footer->expectedValues(); + + assert(is_inter() || record.bin1_id <= record.bin2_id); + + const auto skipNormalization = _footer->normalization() == NormalizationMethod::NONE || + _footer->matrix_type() == MatrixType::expected; + + if (!skipNormalization) { + const auto bin1 = static_cast(record.bin1_id); + const auto bin2 = static_cast(record.bin2_id); + assert(bin1 < c1Norm.size()); + assert(bin2 < c2Norm.size()); + record.count /= static_cast(c1Norm[bin1] * c2Norm[bin2]); + } + + record.bin1_id *= _footer->resolution(); + record.bin2_id *= _footer->resolution(); + + if (_footer->matrix_type() == MatrixType::observed) { + return record; + } + + const auto expectedCount = [&]() { + if (is_inter()) { + return float(_reader.avg()); + } + + const auto i = + static_cast((record.bin2_id - record.bin1_id) / _footer->resolution()); + assert(i < expected.size()); + return float(expected[i]); + }(); + + if (_footer->matrix_type() == MatrixType::expected) { + record.count = expectedCount; + return 
record; + } + + assert(_footer->matrix_type() == MatrixType::oe); + record.count /= expectedCount; + + return record; +} + +template +inline std::vector> PixelSelector::read_all() const { + // return {begin(), end()}; + std::vector> buffer{}; + std::size_t empty_blocks = 0; + + auto bin1 = coord1().bin1.rel_id(); + auto bin2 = coord1().bin2.rel_id() + 1; + auto bin3 = coord2().bin1.rel_id(); + auto bin4 = coord2().bin2.rel_id() + 1; + + for (const auto &block_idx : _reader.grid()) { + auto blk = _reader.read(*block_idx.block); + if (!blk) { + empty_blocks++; + continue; + } + + // Obs we use open-closed interval instead of open-open like is done in straw + for (const auto &[b1, row] : blk->find_overlap(bin1, bin2)) { + if (b1 >= bin2) { + // We're past the last row overlapping the query + break; + } + for (const auto &tp : row) { + const auto &b2 = tp.bin2_id; + if (b1 < bin1 || b2 < bin3) { + // We're upstream of the first column overlapping the query (if any) + continue; + } + + if (b2 >= bin4) { + // We're past the last column overlapping the query for the current row + break; + } + + auto record = process_interaction(SerializedPixel{static_cast(b1), + static_cast(b2), tp.count}); + if (std::isfinite(record.count)) { + buffer.emplace_back( + PixelCoordinates{ + bins().at(_footer->chrom1(), static_cast(record.bin1_id)), + bins().at(_footer->chrom2(), static_cast(record.bin2_id))}, + record.count); + } + } + } + } + // Only interactions from the same block are guaranteed to already be sorted + std::sort(buffer.begin(), buffer.end()); + return buffer; +} + +inline const PixelCoordinates &PixelSelector::coord1() const noexcept { return _coord1; } +inline const PixelCoordinates &PixelSelector::coord2() const noexcept { return _coord2; } + +inline const Chromosome &PixelSelector::chrom1() const noexcept { return _coord1.bin1.chrom(); } +inline const Chromosome &PixelSelector::chrom2() const noexcept { return _coord2.bin1.chrom(); } + +inline const BinTable 
&PixelSelector::bins() const noexcept { return _reader.bins(); } + +inline const internal::HiCFooterMetadata &PixelSelector::metadata() const noexcept { + assert(!!this->_footer); + return this->_footer->metadata(); +} + +inline bool PixelSelector::is_intra() const noexcept { return chrom1() == chrom2(); } + +inline bool PixelSelector::is_inter() const noexcept { return !is_intra(); } + +template +inline N PixelSelector::sum() const noexcept { + return _reader.sum(); +} +inline double PixelSelector::avg() const noexcept { return _reader.avg(); } +/* +template +inline PixelSelector::iterator::iterator(const PixelSelector &sel) + : _sel(&sel), _blk(_sel->_reader.read(this->coord1(), coord2())) { + if (!_blk) { + *this = at_end(sel); + return; + } + + _row = _blk->begin(); + seek_to_next_overlap(); + _value = *(*this); +} + +template +inline auto PixelSelector::iterator::at_end(const PixelSelector &sel) -> iterator { + iterator it{}; + + it._sel = &sel; + it._blk = nullptr; // This signals we're at end + + return it; +} + +template +inline bool PixelSelector::iterator::operator==(const iterator &other) const noexcept { + return _sel == other._sel && _blk == other._blk && _row == other._row && _col_i == other._col_i; +} + +template +inline bool PixelSelector::iterator::operator!=(const iterator &other) const noexcept { + return !(*this == other); +} + +template +inline bool PixelSelector::iterator::operator<(const iterator &other) const noexcept { + assert(!!_sel); + assert(_sel->coord1() == other._sel->coord1()); + assert(_sel->coord2() == other._sel->coord2()); + return _value < other._value; +} + +template +inline bool PixelSelector::iterator::operator<=(const iterator &other) const noexcept { + assert(!!_sel); + assert(_sel->coord1() == other._sel->coord1()); + assert(_sel->coord2() == other._sel->coord2()); + return _value <= other._value; +} + +template +inline bool PixelSelector::iterator::operator>(const iterator &other) const noexcept { + assert(!!_sel); + 
assert(_sel->coord1() == other._sel->coord1()); + assert(_sel->coord2() == other._sel->coord2()); + return _value > other._value; +} + +template +inline bool PixelSelector::iterator::operator>=(const iterator &other) const noexcept { + assert(!!_sel); + assert(_sel->coord1() == other._sel->coord1()); + assert(_sel->coord2() == other._sel->coord2()); + return _value >= other._value; +} + +template +inline auto PixelSelector::iterator::operator*() const -> const_reference { + assert(!!_sel); + + _value.coords = {bins().at(coord1().bin1.chrom(), pos1()), + bins().at(coord2().bin1.chrom(), pos2())}; + _value.count = count(); + return _value; +} + +template +inline auto PixelSelector::iterator::operator++() -> iterator & { + assert(!!_sel); + assert(!is_at_end()); + + if (discard()) { + this->_col_i--; + fmt::print(FMT_STRING("operator++={}\n"), *(*this)); + this->_col_i++; + seek_to_next_block(); + } else { + fmt::print(FMT_STRING("operator++{}\n"), *(*this)); + assert(_col_i < row().size()); + _col_i++; + } + + return *this; +} + +template +inline auto PixelSelector::iterator::operator++(int) -> iterator { + auto it = *this; + std::ignore = ++(*this); + return it; +} + +template +inline bool PixelSelector::iterator::discard() const noexcept { + if (is_at_end()) { + return false; + } + // clang-format off + return _col_i >= row().size() || + pos1() < coord1().bin1.start() || + pos1() > coord1().bin2.start() || + pos2() < coord2().bin1.start() || + pos2() > coord2().bin2.start(); + // clang-format on +} + +template +inline bool PixelSelector::iterator::is_at_end() const noexcept { + return !_blk; +} + +template +inline const BinTable &PixelSelector::iterator::bins() const noexcept { + assert(!!_sel); + return _sel->bins(); +} +template +inline const PixelCoordinates &PixelSelector::iterator::coord1() const noexcept { + assert(!!_sel); + return _sel->coord1(); +} +template +inline const PixelCoordinates &PixelSelector::iterator::coord2() const noexcept { + 
assert(!!_sel); + return _sel->coord2(); +} + +template +inline const internal::InteractionBlock::Row &PixelSelector::iterator::row() const noexcept { + assert(!is_at_end()); + return _row->second; +} + +template +inline std::uint32_t PixelSelector::iterator::pos1() const noexcept { + assert(!is_at_end()); + return static_cast(_row->first) * bins().bin_size(); +} + +template +inline std::uint32_t PixelSelector::iterator::pos2() const noexcept { + assert(!is_at_end()); + assert(_col_i < row().size()); + const auto &thin_pixel = row()[_col_i]; + return static_cast(thin_pixel.bin2_id) * bins().bin_size(); +} + +template +inline N PixelSelector::iterator::count() const noexcept { + assert(!is_at_end()); + assert(_col_i < row().size()); + const auto &thin_pixel = row()[_col_i]; + if constexpr (std::is_integral_v) { + return static_cast(std::round(thin_pixel.count)); + } else { + return conditional_static_cast(thin_pixel.count); + } +} + +template +inline void PixelSelector::iterator::seek_to_next_block() { + assert(!is_at_end()); + + // Figure out whether we should move right or down + + if (_value.coords.bin1.rel_id() == (--_blk->end())->first) { + // Advance row and reset col + // TODO + assert(false); + // const auto pos1 = (static_cast(_blk->last_row() + 2) * bins().bin_size()) - + // 1; const PixelCoordinates coords_{bins().at(coord1().bin1.chrom(), pos1), coord1().bin2}; + // auto blk = _sel->_reader.read(coords_, coord2()); + // assert(blk != _blk); + // _blk = blk; + } else { + // Advance col and leave row alone + _blk = _sel->_reader.read_after(_blk->id()); + } + + // fmt::print(FMT_STRING("jumping to {}:{}\n"), coords1_, coord2()); + if (!_blk) { + *this = at_end(*_sel); + return; + } + _col_i = 0; + _row = _blk->begin(); + + seek_to_next_overlap(); + _value = *(*this); +} + +template +inline void PixelSelector::iterator::seek_to_next_overlap() noexcept { + assert(!is_at_end()); + + for (; _row != _blk->end(); ++_row) { + if (pos1() >= coord1().bin1.start()) { 
+ break; + } + } + + for (_col_i = 0; _col_i < row().size(); ++_col_i) { + if (pos2() >= coord2().bin1.start()) { + return; + } + } +} + */ + +} // namespace hictk::hic diff --git a/src/numeric/include/hictk/numeric_utils.hpp b/src/numeric/include/hictk/numeric_utils.hpp index 2a9690fd..e9039343 100644 --- a/src/numeric/include/hictk/numeric_utils.hpp +++ b/src/numeric/include/hictk/numeric_utils.hpp @@ -23,6 +23,9 @@ void parse_numeric_or_throw(std::string_view tok, N &field); template N parse_numeric_or_throw(std::string_view tok); + +template +constexpr N abs_diff(N n1, N n2) noexcept; } // namespace hictk::internal #include "../../numeric_utils_impl.hpp" diff --git a/src/numeric/numeric_utils_impl.hpp b/src/numeric/numeric_utils_impl.hpp index b0f1f440..d90c85e6 100644 --- a/src/numeric/numeric_utils_impl.hpp +++ b/src/numeric/numeric_utils_impl.hpp @@ -104,4 +104,10 @@ inline N parse_numeric_or_throw(std::string_view tok) { parse_numeric_or_throw(tok, field); return field; } + +template +constexpr N abs_diff(N n1, N n2) noexcept { + const auto [n3, n4] = std::minmax(n1, n2); + return n4 - n3; +} } // namespace hictk::internal diff --git a/src/pixel/include/hictk/pixel.hpp b/src/pixel/include/hictk/pixel.hpp index 80463122..bd2d956a 100644 --- a/src/pixel/include/hictk/pixel.hpp +++ b/src/pixel/include/hictk/pixel.hpp @@ -33,6 +33,8 @@ struct PixelCoordinates { [[nodiscard]] bool operator<=(const PixelCoordinates &other) const noexcept; [[nodiscard]] bool operator>(const PixelCoordinates &other) const noexcept; [[nodiscard]] bool operator>=(const PixelCoordinates &other) const noexcept; + + [[nodiscard]] bool is_intra() const noexcept; }; template diff --git a/src/pixel/pixel_impl.hpp b/src/pixel/pixel_impl.hpp index 08cfcf80..5156fa42 100644 --- a/src/pixel/pixel_impl.hpp +++ b/src/pixel/pixel_impl.hpp @@ -171,6 +171,10 @@ inline bool PixelCoordinates::operator>=(const PixelCoordinates &other) const no return this->bin1 >= other.bin1; } +inline bool 
PixelCoordinates::is_intra() const noexcept { + return this->bin1.chrom() == this->bin2.chrom(); +} + template inline Pixel::Pixel(Bin bin, N count_) noexcept : Pixel(bin, std::move(bin), count_) {} diff --git a/test/units/bin_table/bin_table_test.cpp b/test/units/bin_table/bin_table_test.cpp index da8edc17..331f879e 100644 --- a/test/units/bin_table/bin_table_test.cpp +++ b/test/units/bin_table/bin_table_test.cpp @@ -22,16 +22,17 @@ TEST_CASE("Bin", "[bin][short]") { const Chromosome chrom2{1, "chr2", 10}; SECTION("Ctors") { CHECK(Bin{chrom1, 1, 2}.has_null_id()); - CHECK_FALSE(Bin{0, chrom1, 1, 2}.has_null_id()); - CHECK_FALSE(Bin{0, GenomicInterval{chrom1, 1, 2}}.has_null_id()); + CHECK_FALSE(Bin{0, 0, chrom1, 1, 2}.has_null_id()); + CHECK_FALSE(Bin{0, 0, GenomicInterval{chrom1, 1, 2}}.has_null_id()); } SECTION("Accessors") { const Bin bin1{chrom1, 1, 2}; - const Bin bin2{10, chrom1, 1, 2}; + const Bin bin2{10, 5, chrom1, 1, 2}; CHECK(bin1.id() == Bin::null_id); CHECK(bin2.id() == 10); + CHECK(bin2.rel_id() == 5); CHECK(bin1.interval() == GenomicInterval{chrom1, 1, 2}); @@ -67,11 +68,11 @@ TEST_CASE("Bin", "[bin][short]") { } SECTION("operators (w/ id)") { - const Bin bin1{0, chrom1, 1, 2}; - const Bin bin2{1, chrom1, 2, 3}; + const Bin bin1{0, 0, chrom1, 1, 2}; + const Bin bin2{1, 1, chrom1, 2, 3}; - const Bin bin3{10, chrom2, 1, 2}; - const Bin bin4{10, chrom2, 10, 20}; + const Bin bin3{10, 10, chrom2, 1, 2}; + const Bin bin4{10, 10, chrom2, 10, 20}; CHECK(bin1 != bin2); CHECK(bin1 != bin3); @@ -95,7 +96,7 @@ TEST_CASE("Bin", "[bin][short]") { SECTION("fmt") { const Bin bin1{chrom1, 0, 100}; - const Bin bin2{123, chrom1, 0, 100}; + const Bin bin2{123, 123, chrom1, 0, 100}; CHECK(fmt::format(FMT_STRING("{}"), bin1) == std::to_string(Bin::null_id)); CHECK(fmt::format(FMT_STRING("{}"), bin2) == "123"); diff --git a/test/units/hic/CMakeLists.txt b/test/units/hic/CMakeLists.txt index 012cad36..83e9defc 100644 --- a/test/units/hic/CMakeLists.txt +++ 
b/test/units/hic/CMakeLists.txt @@ -15,7 +15,7 @@ target_sources( PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/filestream_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/hic_file_stream_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/hic_file_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/matrix_zoom_data_test.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/pixel_selector_test.cpp) target_link_libraries( hictk_hic_tests diff --git a/test/units/hic/filestream_test.cpp b/test/units/hic/filestream_test.cpp index 27867d6b..e1176bbc 100644 --- a/test/units/hic/filestream_test.cpp +++ b/test/units/hic/filestream_test.cpp @@ -14,9 +14,9 @@ #include #include "catch2/catch_test_macros.hpp" -#include "hictk/hic/suppress_compiler_warnings.hpp" +#include "hictk/suppress_warnings.hpp" -using namespace hictk::internal::filestream; +using namespace hictk::hic::internal::filestream; namespace hictk::test { inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) diff --git a/test/units/hic/hic_file_stream_test.cpp b/test/units/hic/hic_file_stream_test.cpp index aaa2f17a..e671ee02 100644 --- a/test/units/hic/hic_file_stream_test.cpp +++ b/test/units/hic/hic_file_stream_test.cpp @@ -11,14 +11,16 @@ #include #include -using namespace hictk; +using namespace hictk::hic; namespace hictk::test { inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test -const auto pathV8 = (test::datadir / "4DNFIZ1ZVXC8.hic8").string(); // NOLINT(cert-err58-cpp) -const auto pathV9 = (test::datadir / "4DNFIZ1ZVXC8.hic9").string(); // NOLINT(cert-err58-cpp) +const auto pathV8 = + (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); // NOLINT(cert-err58-cpp) +const auto pathV9 = + (hictk::test::datadir / "4DNFIZ1ZVXC8.hic9").string(); // NOLINT(cert-err58-cpp) // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("readHeader (v8)", "[hic][v8][short]") { diff --git a/test/units/hic/hic_file_test.cpp b/test/units/hic/hic_file_test.cpp index 
f7931d67..c9ca625a 100644 --- a/test/units/hic/hic_file_test.cpp +++ b/test/units/hic/hic_file_test.cpp @@ -9,14 +9,15 @@ #include "hictk/hic.hpp" -using namespace hictk; +using namespace hictk::hic; namespace hictk::test { inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test -const auto pathV8 = (test::datadir / "4DNFIZ1ZVXC8.hic8").string(); // NOLINT(cert-err58-cpp) -const auto path_binary = (test::datadir / "data.zip").string(); // NOLINT(cert-err58-cpp) +const auto pathV8 = + (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); // NOLINT(cert-err58-cpp) +const auto path_binary = (hictk::test::datadir / "data.zip").string(); // NOLINT(cert-err58-cpp) // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("utils: is_hic_file", "[hic][short]") { @@ -58,25 +59,25 @@ TEST_CASE("HiCFile footer cache", "[hic][short]") { if (chrom.is_all()) { continue; } - std::ignore = f.get_matrix_selector(chrom, NormalizationMethod::NONE); + std::ignore = f.fetch(chrom.name()); } CHECK(f.num_cached_footers() == 7); - const auto sel1 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE); - const auto sel2 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE); + const auto sel1 = f.fetch("chr2L"); + const auto sel2 = f.fetch("chr2L"); - // this check relies on the fact that chrom1Norm are stored in the footer, and that footers are + // this check relies on the fact that metadata are stored in footers, and that footers are // looked up in the cache when creating matrix selectors - CHECK(&sel1.chrom1Norm() == &sel2.chrom1Norm()); + CHECK(&sel1.metadata() == &sel2.metadata()); f.purge_footer_cache(); CHECK(f.num_cached_footers() == 0); - const auto sel3 = f.get_matrix_selector("chr2L", NormalizationMethod::NONE); + const auto sel3 = f.fetch("chr2L"); CHECK(f.num_cached_footers() == 1); - CHECK(&sel1.chrom1Norm() != &sel3.chrom1Norm()); + CHECK(&sel1.metadata() != &sel3.metadata()); } // 
NOLINTNEXTLINE(readability-function-cognitive-complexity) @@ -86,54 +87,37 @@ TEST_CASE("HiCFile get_matrix_selector", "[hic][short]") { REQUIRE(f.chromosomes().size() == 9); - const auto chrom1 = f.chromosomes().at("chr2L"); - const auto chrom2 = f.chromosomes().at("chr2R"); + const auto chrom1 = "chr2L"; + const auto chrom2 = "chr2R"; SECTION("intra-chromosomal") { - auto sel = f.get_matrix_selector(chrom1, norm); + auto sel = f.fetch(chrom1, norm); CHECK(sel.chrom1() == chrom1); - CHECK(sel.isIntra()); - - sel = f.get_matrix_selector(chrom1.id(), norm); - CHECK(sel.chrom1() == chrom1); - CHECK(sel.isIntra()); - - sel = f.get_matrix_selector(chrom1, norm); - CHECK(sel.chrom1() == chrom1); - CHECK(sel.isIntra()); } SECTION("inter-chromosomal") { - auto sel = f.get_matrix_selector(chrom1, chrom2, norm); - CHECK(sel.chrom1() == chrom1); - CHECK(sel.chrom2() == chrom2); - - sel = f.get_matrix_selector(chrom1.id(), chrom2.id(), norm); - CHECK(sel.chrom1() == chrom1); - CHECK(sel.chrom2() == chrom2); - - sel = f.get_matrix_selector(chrom1, chrom2, norm); - CHECK(sel.chrom1() == chrom1); - CHECK(sel.chrom2() == chrom2); - + auto sel = f.fetch(chrom1, chrom2, norm); CHECK(sel.chrom1() == chrom1); CHECK(sel.chrom2() == chrom2); } + /* SECTION("valid, but empty matrix") { - auto sel = f.get_matrix_selector("chrM", norm); - std::vector> buff{}; - sel.fetch(buff); - CHECK(buff.empty()); + // TODO: fixme + auto sel = f.fetch("chrM", norm); + std::vector> buff{}; + // sel.fetch(buff); + // CHECK(buff.empty()); } + */ SECTION("invalid chromosome") { - CHECK_THROWS(f.get_matrix_selector("not-a-chromosome", norm)); - CHECK_THROWS(f.get_matrix_selector(std::string{chrom1.name()}, "not-a-chromosome", norm)); - CHECK_THROWS(f.get_matrix_selector(999, norm)); - CHECK_THROWS(f.get_matrix_selector(chrom1.id(), 999, norm)); + CHECK_THROWS(f.fetch("not-a-chromosome", norm)); + CHECK_THROWS(f.fetch("chr2L", "not-a-chromosome", norm)); } + /* SECTION("malformed") { + // TODO: update 
CHECK_THROWS(f.get_matrix_selector(chrom2, chrom1, norm)); // NOLINT CHECK_THROWS(HiCFile(pathV8, f.resolution(), MatrixType::expected, MatrixUnit::BP) .get_matrix_selector(chrom1, NormalizationMethod::VC)); @@ -142,4 +126,6 @@ TEST_CASE("HiCFile get_matrix_selector", "[hic][short]") { CHECK_THROWS(HiCFile(pathV8, f.resolution(), MatrixType::observed, MatrixUnit::FRAG) .get_matrix_selector(chrom1, norm)); } + CHECK_THROWS(f.fetch(chrom1.id(), 999, norm)); + */ } diff --git a/test/units/hic/matrix_zoom_data_test.cpp b/test/units/hic/matrix_zoom_data_test.cpp deleted file mode 100644 index d8edfc79..00000000 --- a/test/units/hic/matrix_zoom_data_test.cpp +++ /dev/null @@ -1,418 +0,0 @@ -// Copyright (C) 2023 Roberto Rossini -// -// SPDX-License-Identifier: MIT - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "hictk/fmt.hpp" -#include "hictk/hic.hpp" - -using namespace hictk; - -namespace hictk::test { -inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) -} // namespace hictk::test - -const auto pathV8 = (test::datadir / "4DNFIZ1ZVXC8.hic8").string(); // NOLINT(cert-err58-cpp) -const auto pathV9 = (test::datadir / "4DNFIZ1ZVXC8.hic9").string(); // NOLINT(cert-err58-cpp) - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -static std::vector> head(const std::vector>& buffer, std::size_t n = 5) { - REQUIRE(buffer.size() >= n); - - std::vector> slice(n); - std::copy_n(buffer.begin(), n, slice.begin()); - return slice; -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -static std::vector> tail(const std::vector>& buffer, std::size_t n = 5) { - REQUIRE(buffer.size() >= n); - - std::vector> slice(n); - std::copy_n(buffer.end() - std::int32_t(n), n, slice.begin()); - return slice; -} - -template -static N sumCounts(const std::vector>& buffer) { - return std::accumulate( - buffer.begin(), buffer.end(), N(0), - [](N accumulator, const Pixel& r) { return accumulator + 
static_cast(r.count); }); -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -static void checkContactRecordsAreWithinBound(std::uint32_t start1, std::uint32_t end1, - std::uint32_t start2, std::uint32_t end2, - const std::vector>& buffer) { - assert(start1 < end1); - assert(start2 < end2); - - for (const auto& r : buffer) { - CHECK(r.coords.bin1.start() >= std::min(start1, start2)); - CHECK(r.coords.bin1.end() < std::max(end1, end2)); - CHECK(r.coords.bin2.start() >= std::min(start1, start2)); - CHECK(r.coords.bin2.end() < std::max(end1, end2)); - } -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -static void compareContactRecord(const Pixel& r1, const SerializedPixel& r2) { - CHECK(r1.coords.bin1.start() == r2.bin1_id); - CHECK(r1.coords.bin2.start() == r2.bin2_id); - CHECK_THAT(r1.count, Catch::Matchers::WithinRel(r2.count)); -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector accessors", "[hic][short]") { - const auto sel = HiCFile(pathV8, 2'500'000, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE); - - CHECK(sel.chrom1().name() == "chr2L"); - CHECK(sel.chrom2().name() == "chr2L"); - CHECK(sel.matrix_type() == MatrixType::observed); - CHECK(sel.normalizationMethod() == NormalizationMethod::NONE); - CHECK(sel.matrixUnit() == MatrixUnit::BP); - CHECK(sel.resolution() == 2500000); - - REQUIRE(sel.chrom1().size() == 23513712); - CHECK(sel.numBins1() == 10); - CHECK(sel.numBins2() == 10); -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector LRU cache", "[hic][short]") { - std::vector> buffer; - HiCFile f(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP); - - auto sel = f.get_matrix_selector("chr2L", NormalizationMethod::NONE); - - CHECK(sel.blockCacheHitRate() == 0.0); - CHECK(sel.blockCacheSize() == 0); - - // Fill cache - sel.fetch(buffer); - CHECK(sel.blockCacheHitRate() == 0.0); - - 
sel.fetch(buffer); - CHECK(sel.blockCacheHitRate() == 0.5); - CHECK(sel.blockCacheSize() == 6); - - for (auto i = 0; i < 5; ++i) { - sel.fetch(buffer); - } - CHECK(sel.blockCacheHitRate() == 6.0 / 7.0); - CHECK(sel.blockCacheSize() == 6); - - sel.clearBlockCache(); - CHECK(sel.blockCacheHitRate() == 0); - CHECK(sel.blockCacheSize() == 0); -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { - std::vector> buffer; - SECTION("intra-chromosomal") { - constexpr std::size_t expected_size = 1433133; - constexpr std::int32_t expected_sum = 19968156; - - constexpr std::size_t N = 5; - constexpr std::array head_expected{1745, 2844, 409, 195, 195}; - constexpr std::array tail_expected{119, 34, 281, 53, 193}; - - constexpr auto expected_value = - std::make_pair(std::size_t(1229799), SerializedPixel{15770000, 15770000, 1234.0F}); - - SECTION("v8") { - auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK(sumCounts(buffer) == expected_sum); - - const auto h = head(buffer, N); - const auto t = tail(buffer, N); - - for (std::size_t i = 0; i < N; ++i) { - CHECK_THAT(head_expected[i], Catch::Matchers::WithinRel(h[i].count)); - CHECK_THAT(tail_expected[i], Catch::Matchers::WithinRel(t[i].count)); - } - - compareContactRecord(buffer[expected_value.first], expected_value.second); - } - SECTION("v9") { - auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK(sumCounts(buffer) == expected_sum); - - const auto h = head(buffer, N); - const auto t = tail(buffer, N); - - for (std::size_t i = 0; i < N; ++i) { - CHECK_THAT(head_expected[i], Catch::Matchers::WithinRel(h[i].count)); - 
CHECK_THAT(tail_expected[i], Catch::Matchers::WithinRel(t[i].count)); - } - - compareContactRecord(buffer[expected_value.first], expected_value.second); - } - } - - SECTION("inter-chromosomal") { - constexpr std::size_t expected_size = 56743; - constexpr std::int32_t expected_sum = 70567; - - constexpr std::size_t N = 5; - constexpr std::array head_expected{1, 1, 1, 1, 1}; - constexpr std::array tail_expected{1, 1, 1, 1, 1}; - - constexpr auto expected_value = - std::make_pair(std::size_t(3541), SerializedPixel{770000, 1300000, 13.0F}); - - SECTION("v8") { - auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK(sumCounts(buffer) == expected_sum); - - const auto h = head(buffer, N); - const auto t = tail(buffer, N); - - for (std::size_t i = 0; i < N; ++i) { - CHECK_THAT(head_expected[i], Catch::Matchers::WithinRel(h[i].count)); - CHECK_THAT(tail_expected[i], Catch::Matchers::WithinRel(t[i].count)); - } - - compareContactRecord(buffer[expected_value.first], expected_value.second); - } - - SECTION("v9") { - auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK(sumCounts(buffer) == expected_sum); - - const auto h = head(buffer, N); - const auto t = tail(buffer, N); - - for (std::size_t i = 0; i < N; ++i) { - CHECK_THAT(head_expected[i], Catch::Matchers::WithinRel(h[i].count)); - CHECK_THAT(tail_expected[i], Catch::Matchers::WithinRel(t[i].count)); - } - - compareContactRecord(buffer[expected_value.first], expected_value.second); - } - - SECTION("cover type 2 interactions") { - auto sel = HiCFile(pathV8, 2'500'000, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr2R", NormalizationMethod::NONE); - sel.fetch(buffer, true); 
- REQUIRE(buffer.size() == 110); - CHECK(sumCounts(buffer) == 1483112); - - compareContactRecord(buffer[38], SerializedPixel{7500000, 12500000, 16512}); - } - - SECTION("sub-queries") { - const std::uint32_t resolution = 10'000; - SECTION("single pixel") { - auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE); - sel.fetch(100000, 100001, 100000, 100001, buffer); - REQUIRE(buffer.size() == 1); - compareContactRecord(buffer.front(), SerializedPixel{100000, 100000, 13895.0F}); - } - - SECTION("upper-triangle") { - auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE); - sel.fetch(123456, 200000, 0, 200000, buffer, true); - REQUIRE(buffer.size() == 132); - CHECK(sumCounts(buffer) == 124561); - compareContactRecord(buffer[33], SerializedPixel{40000, 130000, 148}); - checkContactRecordsAreWithinBound(123456, 200000 + resolution, 0, 200000 + resolution, - buffer); - } - - SECTION("lower-triangle") { - auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE); - sel.fetch(0, 200000, 123456, 200000, buffer, true); - REQUIRE(buffer.size() == 132); - CHECK(sumCounts(buffer) == 124561); - compareContactRecord(buffer[33], SerializedPixel{40000, 130000, 148}); - checkContactRecordsAreWithinBound(0, 200000 + resolution, 123456, 200000 + resolution, - buffer); - } - - SECTION("inter-chromosomal") { - auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); - sel.fetch(123456, 200000, 0, 200000, buffer); - REQUIRE(buffer.size() == 57); - CHECK(sumCounts(buffer) == 74); - checkContactRecordsAreWithinBound(123456, 200000 + resolution, 0, 200000 + resolution, - buffer); - } - } - - SECTION("invalid") { - SECTION("invalid chromosome") { - HiCFile hic(pathV9, 
10'000, MatrixType::observed, MatrixUnit::BP); - CHECK_THROWS(hic.get_matrix_selector("chr123", NormalizationMethod::NONE)); - CHECK_THROWS(hic.get_matrix_selector(999, NormalizationMethod::NONE)); - } - SECTION("invalid unit") { - HiCFile hic(pathV9, 10'000, MatrixType::observed, MatrixUnit::FRAG); - CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE)); - } - SECTION("expected + norm") { - HiCFile hic(pathV9, 10'000, MatrixType::expected, MatrixUnit::BP); - CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::VC)); - } - SECTION("invalid range") { - HiCFile hic(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP); - CHECK_THROWS( - hic.get_matrix_selector("chr2L", NormalizationMethod::NONE).fetch(1000, 0, buffer)); - CHECK_THROWS(hic.get_matrix_selector("chr2L", NormalizationMethod::NONE) - .fetch(0, 1'000'000'000, buffer)); - } - } - } -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (observed VC BP 10000)", "[hic][short]") { - std::vector> buffer; - SECTION("intra-chromosomal") { - constexpr std::size_t expected_size = 1433133; - constexpr double expected_sum = 20391277.41514; - SECTION("v8") { - auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::VC); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - SECTION("v9") { - auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::VC); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - } - SECTION("inter-chromosomal") { - constexpr std::size_t expected_size = 56743; - constexpr double expected_sum = 96690.056244753; - SECTION("v8") { - auto sel = HiCFile(pathV8, 10'000, 
MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::VC); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - - SECTION("v9") { - auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::VC); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - } -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (expected NONE BP 10000)", "[hic][short]") { - std::vector> buffer; - SECTION("intra-chromosomal") { - constexpr std::size_t expected_size = 1433133; - constexpr double expected_sum = 18314748.068024; - SECTION("v8") { - auto sel = HiCFile(pathV8, 10'000, MatrixType::expected, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - SECTION("v9") { - auto sel = HiCFile(pathV9, 10'000, MatrixType::expected, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - } - SECTION("inter-chromosomal") { - constexpr std::size_t expected_size = 56743; - constexpr double expected_sum = 12610.80619812; - SECTION("v8") { - auto sel = HiCFile(pathV8, 10'000, MatrixType::expected, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - - SECTION("v9") { - auto sel = 
HiCFile(pathV9, 10'000, MatrixType::expected, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - } -} - -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (oe NONE BP 10000)", "[hic][short]") { - std::vector> buffer; - SECTION("intra-chromosomal") { - constexpr std::size_t expected_size = 1433133; - constexpr double expected_sum = 2785506.2274201; - SECTION("v8") { - auto sel = HiCFile(pathV8, 10'000, MatrixType::oe, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - SECTION("v9") { - auto sel = HiCFile(pathV9, 10'000, MatrixType::oe, MatrixUnit::BP) - .get_matrix_selector("chr2L", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - } - SECTION("inter-chromosomal") { - constexpr std::size_t expected_size = 56743; - constexpr double expected_sum = 317520.00459671; - SECTION("v8") { - auto sel = HiCFile(pathV8, 10'000, MatrixType::oe, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - - SECTION("v9") { - auto sel = HiCFile(pathV9, 10'000, MatrixType::oe, MatrixUnit::BP) - .get_matrix_selector("chr2L", "chr4", NormalizationMethod::NONE); - sel.fetch(buffer, true); - REQUIRE(buffer.size() == expected_size); - CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); - } - } -} diff --git 
a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp new file mode 100644 index 00000000..ff158d8e --- /dev/null +++ b/test/units/hic/pixel_selector_test.cpp @@ -0,0 +1,109 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include +#include +#include +#include +#include + +#include "hictk/hic.hpp" + +using namespace hictk::hic; + +namespace hictk::test { +inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) +} // namespace hictk::test + +const auto pathV8 = + (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); // NOLINT(cert-err58-cpp) +const auto pathV9 = + (hictk::test::datadir / "4DNFIZ1ZVXC8.hic9").string(); // NOLINT(cert-err58-cpp) +const auto path_binary = (hictk::test::datadir / "data.zip").string(); // NOLINT(cert-err58-cpp) + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +static std::vector> head(const std::vector>& buffer, + std::size_t n = 5) { + REQUIRE(buffer.size() >= n); + + std::vector> slice(n); + std::copy_n(buffer.begin(), n, slice.begin()); + return slice; +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +static std::vector> tail(const std::vector>& buffer, + std::size_t n = 5) { + REQUIRE(buffer.size() >= n); + + std::vector> slice(n); + std::copy_n(buffer.end() - std::int32_t(n), n, slice.begin()); + return slice; +} + +template +static N sumCounts(const std::vector>& buffer) { + return std::accumulate(buffer.begin(), buffer.end(), N(0), + [](N accumulator, const hictk::Pixel& p) { + return accumulator + static_cast(p.count); + }); +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +static void compareContactRecord(const hictk::Pixel& r1, const SerializedPixel& r2) { + CHECK(r1.coords.bin1.start() == r2.bin1_id); + CHECK(r1.coords.bin2.start() == r2.bin2_id); + CHECK_THAT(r1.count, Catch::Matchers::WithinRel(r2.count)); +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) 
+TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { + SECTION("intra-chromosomal") { + constexpr std::size_t expected_size = 1433133; + constexpr std::int32_t expected_sum = 19968156; + + constexpr std::size_t N = 5; + constexpr std::array head_expected{1745, 2844, 409, 195, 195}; + constexpr std::array tail_expected{119, 34, 281, 53, 193}; + + constexpr auto expected_value = + std::make_pair(std::size_t(1229799), SerializedPixel{15770000, 15770000, 1234.0F}); + + SECTION("v8") { + auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP).fetch("chr2L"); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + + CHECK(sumCounts(buffer) == expected_sum); + + const auto h = head(buffer, N); + const auto t = tail(buffer, N); + + for (std::size_t i = 0; i < N; ++i) { + CHECK_THAT(head_expected[i], Catch::Matchers::WithinRel(h[i].count)); + CHECK_THAT(tail_expected[i], Catch::Matchers::WithinRel(t[i].count)); + } + + compareContactRecord(buffer[expected_value.first], expected_value.second); + } + + SECTION("v9") { + auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP).fetch("chr2L"); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + + CHECK(sumCounts(buffer) == expected_sum); + + const auto h = head(buffer, N); + const auto t = tail(buffer, N); + + for (std::size_t i = 0; i < N; ++i) { + CHECK_THAT(head_expected[i], Catch::Matchers::WithinRel(h[i].count)); + CHECK_THAT(tail_expected[i], Catch::Matchers::WithinRel(t[i].count)); + } + + compareContactRecord(buffer[expected_value.first], expected_value.second); + } + } +} From c053d6977afe30bebee32fc77dbcc300358a6c00 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 12 Jun 2023 19:18:54 +0200 Subject: [PATCH 05/48] Fix BlockGrid class In .hic files blocks are stored sorted by column, we want them sorted by row. 
--- src/hic/block_reader_impl.hpp | 17 ++++++++++------- src/hic/include/hictk/hic/block_reader.hpp | 2 +- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/hic/block_reader_impl.hpp b/src/hic/block_reader_impl.hpp index bb948924..8d3ab940 100644 --- a/src/hic/block_reader_impl.hpp +++ b/src/hic/block_reader_impl.hpp @@ -12,18 +12,21 @@ namespace hictk::hic::internal { -inline BlockGrid::BlockGrid(const std::vector &index, std::size_t block_bin_count) { +inline BlockGrid::BlockGrid(const std::vector &index, std::size_t block_column_count) { _grid.resize(index.size()); std::transform(index.begin(), index.end(), _grid.begin(), [&](const BlockIndex &idx) { - const auto row = idx.id / block_bin_count; - const auto col = idx.id % block_bin_count; + const auto row = idx.id / block_column_count; + const auto col = idx.id % block_column_count; return Node{std::make_shared(idx), nullptr, nullptr, row, col}; }); - assert(std::is_sorted(_grid.begin(), _grid.end(), [](const auto &n1, const auto &n2) { - return n1.block->id < n2.block->id; - })); + std::sort(_grid.begin(), _grid.end(), [](const Node &n1, const Node &n2) { + if (n1.col == n2.col) { + return n1.row < n2.row; + } + return n1.col < n2.col; + }); auto head = _grid.begin(); auto tail = _grid.end(); @@ -145,7 +148,7 @@ inline void HiCBlockReader::find_overlapping_blocks(const hictk::PixelCoordinate const hictk::PixelCoordinates &coords2) { std::vector _blocks_idx; _index.map_2d_query_to_blocks(coords1, coords2, _blocks_idx); - _block_grid = BlockGrid(_blocks_idx, _index.block_bin_count()); + _block_grid = BlockGrid(_blocks_idx, _index.block_column_count()); } inline Index HiCBlockReader::read_index(HiCFileStream &hfs, const HiCFooter &footer) { diff --git a/src/hic/include/hictk/hic/block_reader.hpp b/src/hic/include/hictk/hic/block_reader.hpp index d7202b71..69300ee5 100644 --- a/src/hic/include/hictk/hic/block_reader.hpp +++ b/src/hic/include/hictk/hic/block_reader.hpp @@ -31,7 +31,7 @@ class 
BlockGrid { public: class iterator; BlockGrid() = default; - BlockGrid(const std::vector& index, std::size_t block_bin_count); + BlockGrid(const std::vector& index, std::size_t block_column_count); [[nodiscard]] auto begin() const noexcept -> iterator; [[nodiscard]] auto end() const noexcept -> iterator; From 728376f16bca6aab0871be0cb29171bd0f0d3b4a Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 13 Jun 2023 13:21:06 +0200 Subject: [PATCH 06/48] Initial implementation of PixelSelector::iterator for HiC v8 files --- src/hic/block_reader_impl.hpp | 104 ++++---- src/hic/include/hictk/hic/block_reader.hpp | 53 ++--- src/hic/include/hictk/hic/pixel_selector.hpp | 145 +++++------ src/hic/pixel_selector_impl.hpp | 238 ++++++++++++------- test/units/hic/pixel_selector_test.cpp | 2 + 5 files changed, 294 insertions(+), 248 deletions(-) diff --git a/src/hic/block_reader_impl.hpp b/src/hic/block_reader_impl.hpp index 8d3ab940..60ce9256 100644 --- a/src/hic/block_reader_impl.hpp +++ b/src/hic/block_reader_impl.hpp @@ -13,86 +13,76 @@ namespace hictk::hic::internal { inline BlockGrid::BlockGrid(const std::vector &index, std::size_t block_column_count) { + if (index.empty()) { + return; + } + _grid.resize(index.size()); std::transform(index.begin(), index.end(), _grid.begin(), [&](const BlockIndex &idx) { - const auto row = idx.id / block_column_count; - const auto col = idx.id % block_column_count; + const auto col = idx.id / block_column_count; + const auto row = idx.id % block_column_count; - return Node{std::make_shared(idx), nullptr, nullptr, row, col}; + return Node{std::make_shared(idx), {}, {}, row, col}; }); std::sort(_grid.begin(), _grid.end(), [](const Node &n1, const Node &n2) { - if (n1.col == n2.col) { - return n1.row < n2.row; + if (n1.row == n2.row) { + return n1.col < n2.col; } - return n1.col < n2.col; + return n1.row < n2.row; }); - auto head = _grid.begin(); - auto tail = _grid.end(); - while (head != 
tail) { - const Node *next_right{}; - const Node *next_down{}; - - const auto head_row = head->row; - - std::for_each(head + 1, tail, [&](const Node &n) { - if (!next_right || (n.row == head_row && n.col < next_right->col)) { - next_right = &n; - } - - if (!next_down || (n.row == head_row + 1 && n.col < next_down->col)) { - next_down = &n; - } - }); - - head->next_right = next_right; - head->next_down = next_down; - ++head; - } + init_nodes(); } -inline auto BlockGrid::begin() const noexcept -> iterator { return iterator{_grid.front()}; } -inline auto BlockGrid::end() const noexcept -> iterator { return {}; } -inline std::size_t BlockGrid::size() const noexcept { return _grid.size(); } - -inline BlockGrid::iterator::iterator(const BlockGrid::Node &head) - : _node(&head), _current_row(head.block->first_row) {} -inline bool BlockGrid::iterator::operator==(const BlockGrid::iterator &other) const noexcept { - return _node == other._node; +inline BlockGrid::BlockGrid(const BlockGrid &other) : _grid(other._grid) { init_nodes(); } +inline BlockGrid::BlockGrid(BlockGrid &&other) noexcept : _grid(std::move(other._grid)) { + init_nodes(); } -inline bool BlockGrid::iterator::operator!=(const BlockGrid::iterator &other) const noexcept { - return !(*this == other); +inline BlockGrid &BlockGrid::operator=(const BlockGrid &other) { + if (this == &other) { + return *this; + } + _grid = other._grid; + init_nodes(); + + return *this; } -inline auto BlockGrid::iterator::operator++() noexcept -> iterator & { - assert(!!_node); - const auto move_right = _current_row != _node->block->last_row; - if (move_right) { - _node = _node->next_right; - } else { - _node = _node->next_down; - _current_row = !!_node ? 
_node->block->first_row : (std::numeric_limits::max)(); +inline BlockGrid &BlockGrid::operator=(BlockGrid &&other) noexcept { + if (this == &other) { + return *this; } + _grid = std::move(other._grid); + init_nodes(); return *this; } -inline auto BlockGrid::iterator::operator++(int) noexcept -> iterator { - auto it = *this; - std::ignore = ++*this; - return it; -} +inline auto BlockGrid::begin() noexcept -> std::vector::iterator { return _grid.begin(); } +inline auto BlockGrid::end() noexcept -> std::vector::iterator { return _grid.end(); } -inline auto BlockGrid::iterator::operator*() noexcept -> const_reference { - assert(_node); - return *_node; +inline auto BlockGrid::begin() const noexcept -> std::vector::const_iterator { + return _grid.begin(); } +inline auto BlockGrid::end() const noexcept -> std::vector::const_iterator { + return _grid.end(); +} +inline std::size_t BlockGrid::size() const noexcept { return _grid.size(); } -inline auto BlockGrid::iterator::operator->() noexcept -> const_pointer { - assert(_node); - return _node; +inline void BlockGrid::init_nodes() { + auto current_row = _grid.begin(); + for (auto node = _grid.begin(); node != _grid.end(); ++node) { + if (node->row != current_row->row) { + current_row = node; + } + node->current_row = current_row; + + const auto row = node->row; + node->next_row = + std::find_if(node + 1, _grid.end(), [&](const auto &node1) { return node1.row == row; }); + } } template ::value>::type *> diff --git a/src/hic/include/hictk/hic/block_reader.hpp b/src/hic/include/hictk/hic/block_reader.hpp index 69300ee5..6a5c7909 100644 --- a/src/hic/include/hictk/hic/block_reader.hpp +++ b/src/hic/include/hictk/hic/block_reader.hpp @@ -16,51 +16,38 @@ namespace hictk::hic::internal { class BlockGrid { + public: struct Node { - std::shared_ptr block{}; - const Node* next_right{}; - const Node* next_down{}; - std::size_t - row{}; // These should be in absolute term (i.e. 
first/last row and col mapping to block) - std::size_t - col{}; // These should be in absolute term (i.e. first/last row and col mapping to block) + std::shared_ptr block_idx{}; + std::vector::iterator current_row{}; // first node in row + std::vector::iterator next_row{}; // node to first node in next row + std::size_t row{}; + std::size_t col{}; }; + private: std::vector _grid{}; public: class iterator; BlockGrid() = default; BlockGrid(const std::vector& index, std::size_t block_column_count); + BlockGrid(const BlockGrid& other); + BlockGrid(BlockGrid&& other) noexcept; + + ~BlockGrid() = default; - [[nodiscard]] auto begin() const noexcept -> iterator; - [[nodiscard]] auto end() const noexcept -> iterator; + BlockGrid& operator=(const BlockGrid& other); + BlockGrid& operator=(BlockGrid&& other) noexcept; + + [[nodiscard]] auto begin() noexcept -> std::vector::iterator; + [[nodiscard]] auto end() noexcept -> std::vector::iterator; + [[nodiscard]] auto begin() const noexcept -> std::vector::const_iterator; + [[nodiscard]] auto end() const noexcept -> std::vector::const_iterator; [[nodiscard]] std::size_t size() const noexcept; - class iterator { - const Node* _node{}; - std::size_t _current_row{(std::numeric_limits::max)()}; - - public: - using difference_type = std::ptrdiff_t; - using value_type = Node; - using pointer = Node*; - using const_pointer = const Node*; - using reference = Node&; - using const_reference = const Node&; - using iterator_category = std::input_iterator_tag; - - iterator() = default; - explicit iterator(const Node& head); - [[nodiscard]] bool operator==(const iterator& other) const noexcept; - [[nodiscard]] bool operator!=(const iterator& other) const noexcept; - - auto operator++() noexcept -> iterator&; - auto operator++(int) noexcept -> iterator; - - [[nodiscard]] auto operator*() noexcept -> const_reference; - [[nodiscard]] auto operator->() noexcept -> const_pointer; - }; + private: + void init_nodes(); }; class BinaryBuffer { diff 
--git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index 13f1c841..e79597d6 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -38,21 +38,22 @@ class PixelSelector { [[nodiscard]] bool operator==(const PixelSelector &other) const noexcept; [[nodiscard]] bool operator!=(const PixelSelector &other) const noexcept; - /* - template - [[nodiscard]] auto begin() const -> iterator; - template - [[nodiscard]] auto end() const -> iterator; - - template - [[nodiscard]] auto cbegin() const -> iterator; - template - [[nodiscard]] auto cend() const -> iterator; - */ + template + [[nodiscard]] auto begin() const -> iterator; + template + [[nodiscard]] auto end() const -> iterator; + + template + [[nodiscard]] auto cbegin() const -> iterator; + template + [[nodiscard]] auto cend() const -> iterator; template [[nodiscard]] std::vector> read_all() const; + template + std::vector> read_all_dbg() const; + [[nodiscard]] const PixelCoordinates &coord1() const noexcept; [[nodiscard]] const PixelCoordinates &coord2() const noexcept; @@ -70,68 +71,68 @@ class PixelSelector { private: [[nodiscard]] SerializedPixel process_interaction(SerializedPixel record) const; - /* + + public: + template + class iterator { + static_assert(std::is_arithmetic_v); + friend PixelSelector; + const PixelSelector *_sel{}; + + std::shared_ptr _grid{}; + + decltype(_grid->begin()) _idx{}; // Index, knows where to read the next block + + std::shared_ptr + _blk{}; // block, the actual data. 
we need to keep a ptr here to make sure the underlying + // storage is not freed + std::size_t _bin1_id{}; + using PixelIt = decltype(_blk->begin()->second.begin()); + PixelIt _pixel_first{}; // iterator over pixels + PixelIt _pixel_last{}; // end iterator over pixels + + mutable Pixel _value{}; + public: - template - class iterator { - static_assert(std::is_arithmetic_v); - friend PixelSelector; - const PixelSelector *_sel{}; - - std::uint32_t _pos1{}; - std::uint32_t _pos2{}; - - std::size_t _col_i{}; - - std::shared_ptr _blk{}; - internal::InteractionBlock::const_iterator _row{}; - - mutable Pixel _value{}; - - public: - using difference_type = std::ptrdiff_t; - using value_type = Pixel; - using pointer = value_type *; - using const_pointer = const value_type *; - using reference = value_type &; - using const_reference = const value_type &; - using iterator_category = std::forward_iterator_tag; - - iterator() = default; - explicit iterator(const PixelSelector &sel); - [[nodiscard]] static auto at_end(const PixelSelector &sel) -> iterator; - - [[nodiscard]] bool operator==(const iterator &other) const noexcept; - [[nodiscard]] bool operator!=(const iterator &other) const noexcept; - - [[nodiscard]] bool operator<(const iterator &other) const noexcept; - [[nodiscard]] bool operator<=(const iterator &other) const noexcept; - - [[nodiscard]] bool operator>(const iterator &other) const noexcept; - [[nodiscard]] bool operator>=(const iterator &other) const noexcept; - - [[nodiscard]] auto operator*() const -> const_reference; - // [[nodiscard]] auto operator->() const -> const_pointer; - - auto operator++() -> iterator &; - auto operator++(int) -> iterator; - - private: - [[nodiscard]] bool discard() const noexcept; - [[nodiscard]] bool is_at_end() const noexcept; - [[nodiscard]] const BinTable &bins() const noexcept; - [[nodiscard]] const PixelCoordinates &coord1() const noexcept; - [[nodiscard]] const PixelCoordinates &coord2() const noexcept; - - [[nodiscard]] 
const internal::InteractionBlock::Row &row() const noexcept; - [[nodiscard]] std::uint32_t pos1() const noexcept; - [[nodiscard]] std::uint32_t pos2() const noexcept; - [[nodiscard]] N count() const noexcept; - - void seek_to_next_block(); - void seek_to_next_overlap() noexcept; - }; - */ + using difference_type = std::ptrdiff_t; + using value_type = Pixel; + using pointer = value_type *; + using const_pointer = const value_type *; + using reference = value_type &; + using const_reference = const value_type &; + using iterator_category = std::forward_iterator_tag; + + iterator() = default; + explicit iterator(const PixelSelector &sel); + [[nodiscard]] static auto at_end(const PixelSelector &sel) -> iterator; + + [[nodiscard]] bool operator==(const iterator &other) const noexcept; + [[nodiscard]] bool operator!=(const iterator &other) const noexcept; + + [[nodiscard]] bool operator<(const iterator &other) const noexcept; + + [[nodiscard]] auto operator*() const -> const_reference; + // [[nodiscard]] auto operator->() const -> const_pointer; + + auto operator++() -> iterator &; + auto operator++(int) -> iterator; + + private: + [[nodiscard]] bool discard() const noexcept; + [[nodiscard]] bool is_at_end() const noexcept; + [[nodiscard]] const BinTable &bins() const noexcept; + [[nodiscard]] const PixelCoordinates &coord1() const noexcept; + [[nodiscard]] const PixelCoordinates &coord2() const noexcept; + + [[nodiscard]] std::uint32_t pos1() const noexcept; + [[nodiscard]] std::uint32_t pos2() const noexcept; + [[nodiscard]] N count() const noexcept; + + void seek_to_next_block(); + void seek_to_next_overlap() noexcept; + + void mark_block_as_fully_read(); + }; }; } // namespace hictk::hic diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index 895c550e..3d1a2594 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -35,7 +35,7 @@ inline bool PixelSelector::operator==(const PixelSelector &other) const 
noexcept inline bool PixelSelector::operator!=(const PixelSelector &other) const noexcept { return !(*this == other); } -/* + template inline auto PixelSelector::cbegin() const -> iterator { return iterator(*this); @@ -55,7 +55,6 @@ template inline auto PixelSelector::end() const -> iterator { return this->cend(); } -*/ inline SerializedPixel PixelSelector::process_interaction(SerializedPixel record) const { const auto &c1Norm = _footer->c1Norm(); @@ -106,19 +105,82 @@ inline SerializedPixel PixelSelector::process_interaction(SerializedPixel record template inline std::vector> PixelSelector::read_all() const { - // return {begin(), end()}; + return {begin(), end()}; + /* + std::vector> buffer{}; + std::size_t empty_blocks = 0; + + auto bin1 = coord1().bin1.rel_id(); + auto bin2 = coord1().bin2.rel_id() + 1; + auto bin3 = coord2().bin1.rel_id(); + auto bin4 = coord2().bin2.rel_id() + 1; + + std::size_t i = 0; + + for (const auto &block_idx : _reader.grid()) { + auto blk = _reader.read(*block_idx.block); + if (!blk) { + empty_blocks++; + continue; + } + std::ofstream ofs(fmt::format(FMT_STRING("/tmp/test{:03d}.bed"), i)); + // Obs we use open-closed interval instead of open-open like is done in straw + for (const auto &[b1, row] : blk->find_overlap(bin1, bin2)) { + for (const auto &tp : row) { + const auto &b2 = tp.bin2_id; + auto tmp_record = process_interaction(SerializedPixel{ + static_cast(b1), static_cast(b2), tp.count}); + fmt::print(ofs, FMT_STRING("{}\t{}\t{}\t{}\n"), i, tmp_record.bin1_id, tmp_record.bin2_id, + tp.count); + } + + if (b1 >= bin2) { + // We're past the last row overlapping the query + break; + } + for (const auto &tp : row) { + const auto &b2 = tp.bin2_id; + if (b1 < bin1 || b2 < bin3) { + // We're upstream of the first column overlapping the query (if any) + continue; + } + + if (b2 >= bin4) { + // We're past the last column overlapping the query for the current row + break; + } + + auto record = 
process_interaction(SerializedPixel{static_cast(b1), + static_cast(b2), + tp.count}); if (std::isfinite(record.count)) { buffer.emplace_back( PixelCoordinates{ + bins().at(_footer->chrom1(), static_cast(record.bin1_id)), + bins().at(_footer->chrom2(), static_cast(record.bin2_id))}, + record.count); + } + } + } + ++i; + } + // Only interactions from the same block are guaranteed to already be sorted + std::sort(buffer.begin(), buffer.end()); + return buffer; + */ +} + +template +inline std::vector> PixelSelector::read_all_dbg() const { std::vector> buffer{}; - std::size_t empty_blocks = 0; auto bin1 = coord1().bin1.rel_id(); auto bin2 = coord1().bin2.rel_id() + 1; auto bin3 = coord2().bin1.rel_id(); auto bin4 = coord2().bin2.rel_id() + 1; + std::size_t i = 0; + for (const auto &block_idx : _reader.grid()) { - auto blk = _reader.read(*block_idx.block); + auto blk = _reader.read(*block_idx.block_idx); if (!blk) { - empty_blocks++; continue; } @@ -151,6 +213,7 @@ inline std::vector> PixelSelector::read_all() const { } } } + ++i; } // Only interactions from the same block are guaranteed to already be sorted std::sort(buffer.begin(), buffer.end()); @@ -179,18 +242,24 @@ inline N PixelSelector::sum() const noexcept { return _reader.sum(); } inline double PixelSelector::avg() const noexcept { return _reader.avg(); } -/* + template inline PixelSelector::iterator::iterator(const PixelSelector &sel) - : _sel(&sel), _blk(_sel->_reader.read(this->coord1(), coord2())) { + : _sel(&sel), + _grid(std::make_shared(_sel->_reader.grid())), + _idx(_grid->begin()), + _blk(_sel->_reader.read(*_idx->block_idx)) { if (!_blk) { *this = at_end(sel); return; } - _row = _blk->begin(); + auto row_it = _blk->at(coord1().bin1.rel_id()); + _bin1_id = row_it->first; + _pixel_first = row_it->second.begin(); + _pixel_last = row_it->second.end(); + seek_to_next_overlap(); - _value = *(*this); } template @@ -205,7 +274,11 @@ inline auto PixelSelector::iterator::at_end(const PixelSelector &sel) -> iter 
template inline bool PixelSelector::iterator::operator==(const iterator &other) const noexcept { - return _sel == other._sel && _blk == other._blk && _row == other._row && _col_i == other._col_i; + // clang-format off + return _sel == other._sel && + _idx == other._idx && + _pixel_first == other._pixel_first; + // clang-format on } template @@ -218,57 +291,37 @@ inline bool PixelSelector::iterator::operator<(const iterator &other) const n assert(!!_sel); assert(_sel->coord1() == other._sel->coord1()); assert(_sel->coord2() == other._sel->coord2()); - return _value < other._value; -} - -template -inline bool PixelSelector::iterator::operator<=(const iterator &other) const noexcept { - assert(!!_sel); - assert(_sel->coord1() == other._sel->coord1()); - assert(_sel->coord2() == other._sel->coord2()); - return _value <= other._value; -} - -template -inline bool PixelSelector::iterator::operator>(const iterator &other) const noexcept { - assert(!!_sel); - assert(_sel->coord1() == other._sel->coord1()); - assert(_sel->coord2() == other._sel->coord2()); - return _value > other._value; -} - -template -inline bool PixelSelector::iterator::operator>=(const iterator &other) const noexcept { - assert(!!_sel); - assert(_sel->coord1() == other._sel->coord1()); - assert(_sel->coord2() == other._sel->coord2()); - return _value >= other._value; + if (_idx != other._idx) { + return _idx->block_idx->id < other._idx._node->block_idx->id; + } + if (_bin1_id != other._bin1_id) { + return _bin1_id < other._bin1_id; + } + return _pixel_first < other._pixel_first; } template inline auto PixelSelector::iterator::operator*() const -> const_reference { - assert(!!_sel); + assert(!is_at_end()); + + const auto old_value = _value; _value.coords = {bins().at(coord1().bin1.chrom(), pos1()), bins().at(coord2().bin1.chrom(), pos2())}; _value.count = count(); + + if (old_value != Pixel{}) { + assert(old_value < _value); + } return _value; } template inline auto PixelSelector::iterator::operator++() 
-> iterator & { - assert(!!_sel); assert(!is_at_end()); - if (discard()) { - this->_col_i--; - fmt::print(FMT_STRING("operator++={}\n"), *(*this)); - this->_col_i++; + if (discard() || ++_pixel_first == _pixel_last) { seek_to_next_block(); - } else { - fmt::print(FMT_STRING("operator++{}\n"), *(*this)); - assert(_col_i < row().size()); - _col_i++; } return *this; @@ -287,7 +340,7 @@ inline bool PixelSelector::iterator::discard() const noexcept { return false; } // clang-format off - return _col_i >= row().size() || + return _pixel_first == _pixel_last || pos1() < coord1().bin1.start() || pos1() > coord1().bin2.start() || pos2() < coord2().bin1.start() || @@ -316,35 +369,27 @@ inline const PixelCoordinates &PixelSelector::iterator::coord2() const noexce return _sel->coord2(); } -template -inline const internal::InteractionBlock::Row &PixelSelector::iterator::row() const noexcept { - assert(!is_at_end()); - return _row->second; -} - template inline std::uint32_t PixelSelector::iterator::pos1() const noexcept { assert(!is_at_end()); - return static_cast(_row->first) * bins().bin_size(); + return static_cast(_bin1_id) * bins().bin_size(); } template inline std::uint32_t PixelSelector::iterator::pos2() const noexcept { assert(!is_at_end()); - assert(_col_i < row().size()); - const auto &thin_pixel = row()[_col_i]; - return static_cast(thin_pixel.bin2_id) * bins().bin_size(); + assert(_pixel_first != _pixel_last); + return static_cast(_pixel_first->bin2_id) * bins().bin_size(); } template inline N PixelSelector::iterator::count() const noexcept { assert(!is_at_end()); - assert(_col_i < row().size()); - const auto &thin_pixel = row()[_col_i]; + assert(_pixel_first != _pixel_last); if constexpr (std::is_integral_v) { - return static_cast(std::round(thin_pixel.count)); + return static_cast(std::round(_pixel_first->count)); } else { - return conditional_static_cast(thin_pixel.count); + return conditional_static_cast(_pixel_first->count); } } @@ -352,50 +397,71 @@ template 
inline void PixelSelector::iterator::seek_to_next_block() { assert(!is_at_end()); - // Figure out whether we should move right or down - - if (_value.coords.bin1.rel_id() == (--_blk->end())->first) { - // Advance row and reset col - // TODO - assert(false); - // const auto pos1 = (static_cast(_blk->last_row() + 2) * bins().bin_size()) - - // 1; const PixelCoordinates coords_{bins().at(coord1().bin1.chrom(), pos1), coord1().bin2}; - // auto blk = _sel->_reader.read(coords_, coord2()); - // assert(blk != _blk); - // _blk = blk; + // All pixels in block have been read + assert(_blk); + const auto block_was_fully_read = _bin1_id == (--_blk->end())->first; + if (block_was_fully_read) { + mark_block_as_fully_read(); + } + + assert(_idx < _grid->end()); + assert(_idx->current_row < _grid->end()); + const auto end_of_row = (_idx + 1 == _grid->end()) || _idx->row != (_idx + 1)->row; + if (end_of_row) { + _idx = std::find_if(_idx->current_row, _idx + 1, + [](const auto &node) { return node.block_idx != nullptr; }); + ++_bin1_id; } else { - // Advance col and leave row alone - _blk = _sel->_reader.read_after(_blk->id()); + ++_idx; } - // fmt::print(FMT_STRING("jumping to {}:{}\n"), coords1_, coord2()); + if (_idx == _grid->end()) { + *this = at_end(*_sel); + return; + } + + assert(_idx->block_idx); + _blk = _sel->_reader.read(*_idx->block_idx); if (!_blk) { *this = at_end(*_sel); return; } - _col_i = 0; - _row = _blk->begin(); + + auto row_it = _blk->at(_bin1_id); + if (row_it->first != _bin1_id) { + seek_to_next_block(); + } + assert(row_it != _blk->end()); + _pixel_first = row_it->second.begin(); + _pixel_last = row_it->second.end(); + + assert(_pixel_first != _pixel_last); seek_to_next_overlap(); - _value = *(*this); } template inline void PixelSelector::iterator::seek_to_next_overlap() noexcept { assert(!is_at_end()); - for (; _row != _blk->end(); ++_row) { - if (pos1() >= coord1().bin1.start()) { - break; - } - } - - for (_col_i = 0; _col_i < row().size(); ++_col_i) { + 
do { if (pos2() >= coord2().bin1.start()) { return; } + } while (++_pixel_first != _pixel_last); + + if (_pixel_first == _pixel_last) { + seek_to_next_block(); } } - */ + +template +inline void PixelSelector::iterator::mark_block_as_fully_read() { + // deal with calls to operator++(int) + const auto i = std::distance(_grid->begin(), _idx); + _grid = std::make_shared(*_grid); + _idx = _grid->begin() + i; + _idx->block_idx = nullptr; +} } // namespace hictk::hic diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index ff158d8e..921a0589 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -86,6 +86,7 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } compareContactRecord(buffer[expected_value.first], expected_value.second); + CHECK(std::is_sorted(buffer.begin(), buffer.end())); } SECTION("v9") { @@ -104,6 +105,7 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } compareContactRecord(buffer[expected_value.first], expected_value.second); + CHECK(std::is_sorted(buffer.begin(), buffer.end())); } } } From 49455a104f9b70767fa07ccd40162a8ef2bc438d Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 13 Jun 2023 20:58:47 +0200 Subject: [PATCH 07/48] Bugfix --- src/genomic_interval/genomic_interval_impl.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/genomic_interval/genomic_interval_impl.hpp b/src/genomic_interval/genomic_interval_impl.hpp index 2afdb96c..bf03fb0e 100644 --- a/src/genomic_interval/genomic_interval_impl.hpp +++ b/src/genomic_interval/genomic_interval_impl.hpp @@ -113,7 +113,7 @@ inline GenomicInterval GenomicInterval::parse_ucsc(const Reference &chroms, std: } const auto p1 = query.find_last_of(':'); - const auto p2 = query.find_last_of('-'); + auto p2 = query.find_last_of('-'); if (p1 == std::string::npos && p2 == 
std::string::npos) { throw std::runtime_error( @@ -125,7 +125,8 @@ inline GenomicInterval GenomicInterval::parse_ucsc(const Reference &chroms, std: } if (query.find(',', p1) != std::string::npos) { - query.erase(std::remove(query.begin() + std::ptrdiff_t(p1), query.end(), ',')); + query.erase(std::remove(query.begin() + std::ptrdiff_t(p1), query.end(), ','), query.end()); + p2 = query.find_last_of('-'); } query[p1] = '\t'; From 993288714400bfc4f729a2cd0c5a634ad9595e39 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Tue, 13 Jun 2023 20:59:08 +0200 Subject: [PATCH 08/48] wip --- src/hic/include/hictk/hic/pixel_selector.hpp | 24 +- src/hic/pixel_selector_impl.hpp | 271 +++++++++---------- 2 files changed, 136 insertions(+), 159 deletions(-) diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index e79597d6..b5ac95e9 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -4,6 +4,9 @@ #pragma once +#include +#include + #include "hictk/bin_table.hpp" #include "hictk/hic/cache.hpp" #include "hictk/hic/common.hpp" @@ -78,20 +81,14 @@ class PixelSelector { static_assert(std::is_arithmetic_v); friend PixelSelector; const PixelSelector *_sel{}; + using BufferT = std::vector>; std::shared_ptr _grid{}; - decltype(_grid->begin()) _idx{}; // Index, knows where to read the next block - std::shared_ptr - _blk{}; // block, the actual data. 
we need to keep a ptr here to make sure the underlying - // storage is not freed std::size_t _bin1_id{}; - using PixelIt = decltype(_blk->begin()->second.begin()); - PixelIt _pixel_first{}; // iterator over pixels - PixelIt _pixel_last{}; // end iterator over pixels - - mutable Pixel _value{}; + mutable std::shared_ptr _buffer{}; + mutable std::size_t _buffer_i{}; public: using difference_type = std::ptrdiff_t; @@ -123,15 +120,14 @@ class PixelSelector { [[nodiscard]] const BinTable &bins() const noexcept; [[nodiscard]] const PixelCoordinates &coord1() const noexcept; [[nodiscard]] const PixelCoordinates &coord2() const noexcept; - - [[nodiscard]] std::uint32_t pos1() const noexcept; - [[nodiscard]] std::uint32_t pos2() const noexcept; - [[nodiscard]] N count() const noexcept; + [[nodiscard]] std::size_t size() const noexcept; void seek_to_next_block(); - void seek_to_next_overlap() noexcept; void mark_block_as_fully_read(); + + [[nodiscard]] std::shared_ptr read_block() noexcept; + void read_chunk_of_pixels(const internal::InteractionBlock &blk); }; }; diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index 3d1a2594..280a891b 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -106,65 +106,6 @@ inline SerializedPixel PixelSelector::process_interaction(SerializedPixel record template inline std::vector> PixelSelector::read_all() const { return {begin(), end()}; - /* - std::vector> buffer{}; - std::size_t empty_blocks = 0; - - auto bin1 = coord1().bin1.rel_id(); - auto bin2 = coord1().bin2.rel_id() + 1; - auto bin3 = coord2().bin1.rel_id(); - auto bin4 = coord2().bin2.rel_id() + 1; - - std::size_t i = 0; - - for (const auto &block_idx : _reader.grid()) { - auto blk = _reader.read(*block_idx.block); - if (!blk) { - empty_blocks++; - continue; - } - std::ofstream ofs(fmt::format(FMT_STRING("/tmp/test{:03d}.bed"), i)); - // Obs we use open-closed interval instead of open-open like is done in straw - for 
(const auto &[b1, row] : blk->find_overlap(bin1, bin2)) { - for (const auto &tp : row) { - const auto &b2 = tp.bin2_id; - auto tmp_record = process_interaction(SerializedPixel{ - static_cast(b1), static_cast(b2), tp.count}); - fmt::print(ofs, FMT_STRING("{}\t{}\t{}\t{}\n"), i, tmp_record.bin1_id, tmp_record.bin2_id, - tp.count); - } - - if (b1 >= bin2) { - // We're past the last row overlapping the query - break; - } - for (const auto &tp : row) { - const auto &b2 = tp.bin2_id; - if (b1 < bin1 || b2 < bin3) { - // We're upstream of the first column overlapping the query (if any) - continue; - } - - if (b2 >= bin4) { - // We're past the last column overlapping the query for the current row - break; - } - - auto record = process_interaction(SerializedPixel{static_cast(b1), - static_cast(b2), - tp.count}); if (std::isfinite(record.count)) { buffer.emplace_back( PixelCoordinates{ - bins().at(_footer->chrom1(), static_cast(record.bin1_id)), - bins().at(_footer->chrom2(), static_cast(record.bin2_id))}, - record.count); - } - } - } - ++i; - } - // Only interactions from the same block are guaranteed to already be sorted - std::sort(buffer.begin(), buffer.end()); - return buffer; - */ } template @@ -183,9 +124,17 @@ inline std::vector> PixelSelector::read_all_dbg() const { if (!blk) { continue; } - + std::ofstream ofs(fmt::format(FMT_STRING("/tmp/test{}.bed"), i)); // Obs we use open-closed interval instead of open-open like is done in straw for (const auto &[b1, row] : blk->find_overlap(bin1, bin2)) { + for (const auto &tp : row) { + const auto &b2 = tp.bin2_id; + auto tmp_record = process_interaction(SerializedPixel{ + static_cast(b1), static_cast(b2), tp.count}); + fmt::print(ofs, FMT_STRING("{}\t{}\t{}\t{}\n"), i, tmp_record.bin1_id, tmp_record.bin2_id, + blk->id()); + } + if (b1 >= bin2) { // We're past the last row overlapping the query break; @@ -248,18 +197,14 @@ inline PixelSelector::iterator::iterator(const PixelSelector &sel) : _sel(&sel), 
_grid(std::make_shared(_sel->_reader.grid())), _idx(_grid->begin()), - _blk(_sel->_reader.read(*_idx->block_idx)) { - if (!_blk) { + _bin1_id(coord1().bin1.rel_id()), + _buffer(std::make_shared(1000)) { + if (_grid->size() == 0) { *this = at_end(sel); return; } - - auto row_it = _blk->at(coord1().bin1.rel_id()); - _bin1_id = row_it->first; - _pixel_first = row_it->second.begin(); - _pixel_last = row_it->second.end(); - - seek_to_next_overlap(); + auto blk = read_block(); + read_chunk_of_pixels(*blk); } template @@ -267,7 +212,7 @@ inline auto PixelSelector::iterator::at_end(const PixelSelector &sel) -> iter iterator it{}; it._sel = &sel; - it._blk = nullptr; // This signals we're at end + it._buffer = nullptr; // end of queue return it; } @@ -277,7 +222,7 @@ inline bool PixelSelector::iterator::operator==(const iterator &other) const // clang-format off return _sel == other._sel && _idx == other._idx && - _pixel_first == other._pixel_first; + size() == other.size(); // clang-format on } @@ -297,31 +242,29 @@ inline bool PixelSelector::iterator::operator<(const iterator &other) const n if (_bin1_id != other._bin1_id) { return _bin1_id < other._bin1_id; } - return _pixel_first < other._pixel_first; + return size() < other.size(); } template inline auto PixelSelector::iterator::operator*() const -> const_reference { - assert(!is_at_end()); - - const auto old_value = _value; - - _value.coords = {bins().at(coord1().bin1.chrom(), pos1()), - bins().at(coord2().bin1.chrom(), pos2())}; - _value.count = count(); - - if (old_value != Pixel{}) { - assert(old_value < _value); - } - return _value; + assert(!!_buffer); + assert(_buffer_i < _buffer->size()); + return (*_buffer)[_buffer_i]; } template inline auto PixelSelector::iterator::operator++() -> iterator & { - assert(!is_at_end()); + assert(!!_buffer); + + if (!!_buffer) { + ++_buffer_i; + } - if (discard() || ++_pixel_first == _pixel_last) { + while (_buffer_i == size()) { seek_to_next_block(); + if (is_at_end()) { + 
break; + } } return *this; @@ -337,20 +280,26 @@ inline auto PixelSelector::iterator::operator++(int) -> iterator { template inline bool PixelSelector::iterator::discard() const noexcept { if (is_at_end()) { - return false; + return true; } + + assert(!!_buffer); + if (_buffer->empty()) { + return true; + } + + const auto &pixel = _buffer->front(); // clang-format off - return _pixel_first == _pixel_last || - pos1() < coord1().bin1.start() || - pos1() > coord1().bin2.start() || - pos2() < coord2().bin1.start() || - pos2() > coord2().bin2.start(); + return pixel.coords.bin1 < coord1().bin1 || + pixel.coords.bin1 > coord1().bin2 || + pixel.coords.bin2 < coord2().bin1 || + pixel.coords.bin2 > coord2().bin2; // clang-format on } template inline bool PixelSelector::iterator::is_at_end() const noexcept { - return !_blk; + return _buffer == nullptr; } template @@ -370,36 +319,19 @@ inline const PixelCoordinates &PixelSelector::iterator::coord2() const noexce } template -inline std::uint32_t PixelSelector::iterator::pos1() const noexcept { - assert(!is_at_end()); - return static_cast(_bin1_id) * bins().bin_size(); -} - -template -inline std::uint32_t PixelSelector::iterator::pos2() const noexcept { - assert(!is_at_end()); - assert(_pixel_first != _pixel_last); - return static_cast(_pixel_first->bin2_id) * bins().bin_size(); -} - -template -inline N PixelSelector::iterator::count() const noexcept { - assert(!is_at_end()); - assert(_pixel_first != _pixel_last); - if constexpr (std::is_integral_v) { - return static_cast(std::round(_pixel_first->count)); - } else { - return conditional_static_cast(_pixel_first->count); - } +inline std::size_t PixelSelector::iterator::size() const noexcept { + return !_buffer ? 
0 : _buffer->size(); } template inline void PixelSelector::iterator::seek_to_next_block() { - assert(!is_at_end()); - - // All pixels in block have been read - assert(_blk); - const auto block_was_fully_read = _bin1_id == (--_blk->end())->first; + auto blk = read_block(); + if (!blk) { + _buffer = nullptr; + return; + } + assert(blk); + const auto block_was_fully_read = _bin1_id == (--blk->end())->first; if (block_was_fully_read) { mark_block_as_fully_read(); } @@ -408,10 +340,15 @@ inline void PixelSelector::iterator::seek_to_next_block() { assert(_idx->current_row < _grid->end()); const auto end_of_row = (_idx + 1 == _grid->end()) || _idx->row != (_idx + 1)->row; if (end_of_row) { + if (!_grid) { + *this = at_end(*_sel); + } + fmt::print(FMT_STRING("next row...\n")); _idx = std::find_if(_idx->current_row, _idx + 1, [](const auto &node) { return node.block_idx != nullptr; }); ++_bin1_id; } else { + fmt::print(FMT_STRING("next col...\n")); ++_idx; } @@ -421,38 +358,16 @@ inline void PixelSelector::iterator::seek_to_next_block() { } assert(_idx->block_idx); - _blk = _sel->_reader.read(*_idx->block_idx); - if (!_blk) { + blk = _sel->_reader.read(*_idx->block_idx); + if (!blk) { *this = at_end(*_sel); return; } - auto row_it = _blk->at(_bin1_id); - if (row_it->first != _bin1_id) { - seek_to_next_block(); - } - assert(row_it != _blk->end()); - _pixel_first = row_it->second.begin(); - _pixel_last = row_it->second.end(); - - assert(_pixel_first != _pixel_last); - - seek_to_next_overlap(); -} - -template -inline void PixelSelector::iterator::seek_to_next_overlap() noexcept { - assert(!is_at_end()); - - do { - if (pos2() >= coord2().bin1.start()) { - return; - } - } while (++_pixel_first != _pixel_last); - - if (_pixel_first == _pixel_last) { + if (_bin1_id != blk->at(_bin1_id)->first) { seek_to_next_block(); } + read_chunk_of_pixels(*blk); } template @@ -464,4 +379,70 @@ inline void PixelSelector::iterator::mark_block_as_fully_read() { _idx->block_idx = nullptr; } 
+template +inline std::shared_ptr +PixelSelector::iterator::read_block() noexcept { + if (!_grid) { + return nullptr; + } + assert(_sel); + assert(_idx != _grid->end()); + assert(_idx->block_idx); + return _sel->_reader.read(*_idx->block_idx); +} + +template +inline void PixelSelector::iterator::read_chunk_of_pixels( + const internal::InteractionBlock &blk) { + auto row_it = blk.at(_bin1_id); + if (_bin1_id != row_it->first) { + return; + } + + auto first = row_it->second.begin(); + auto last = row_it->second.end(); + + first = + std::lower_bound(first, last, coord2().bin1.start(), + [&](const internal::InteractionBlock::ThinPixel &pixel, const auto &pos) { + return pixel.bin2_id * bins().bin_size() < pos; + }); + // clang-format off + const auto buff_capacity = + std::min({!!_buffer ? _buffer->capacity() : std::size_t(1000), + static_cast(std::distance(first, last))}); + // clang-format on + // This is fine, as iterators are not thread safe anyway + if (_buffer.use_count() == 1) { + _buffer->resize(buff_capacity); + } else { + _buffer = std::make_shared(buff_capacity); + } + _buffer->clear(); + _buffer_i = 0; + + const auto pos1 = static_cast(_bin1_id) * bins().bin_size(); + const auto bin1 = bins().at(coord1().bin1.chrom(), pos1); + do { + const auto pos2 = static_cast(first->bin2_id) * bins().bin_size(); + auto bin2 = bins().at(coord2().bin1.chrom(), pos2); + + if (bin2 > coord2().bin2) { + break; + } + + if constexpr (std::is_integral_v) { + const auto count = static_cast(std::round(first->count)); + _buffer->emplace_back(bin1, std::move(bin2), count); + } else { + const auto count = conditional_static_cast(first->count); + _buffer->emplace_back(bin1, std::move(bin2), count); + } + } while (++first != last); + + if (first != last && _bin1_id + 1 > coord1().bin2.rel_id()) { + _grid = nullptr; + _idx = {}; + } +} } // namespace hictk::hic From 606a15b5be9c637c936e8bc96182f90b49c52335 Mon Sep 17 00:00:00 2001 From: Roberto Rossini 
<71787608+robomics@users.noreply.github.com> Date: Wed, 14 Jun 2023 20:23:04 +0200 Subject: [PATCH 09/48] Rework PixelSelector::iterator --- .../{cache_impl.hpp => block_cache_impl.hpp} | 8 +- src/hic/block_reader_impl.hpp | 98 +------- src/hic/footer_cache_impl.hpp | 74 ++++++ src/hic/hic_file_impl.hpp | 29 ++- src/hic/hic_file_stream_impl.hpp | 73 +++--- src/hic/hic_footer_impl.hpp | 22 +- src/hic/include/hictk/hic.hpp | 10 +- .../hictk/hic/{cache.hpp => block_cache.hpp} | 5 +- src/hic/include/hictk/hic/block_reader.hpp | 51 +--- src/hic/include/hictk/hic/footer_cache.hpp | 66 +++++ src/hic/include/hictk/hic/hic_file_stream.hpp | 23 +- src/hic/include/hictk/hic/hic_footer.hpp | 15 +- src/hic/include/hictk/hic/index.hpp | 94 ++++--- src/hic/include/hictk/hic/pixel_selector.hpp | 20 +- src/hic/index_impl.hpp | 197 +++++++++------ src/hic/pixel_selector_impl.hpp | 233 +++++++----------- test/units/hic/hic_file_stream_test.cpp | 58 ++--- 17 files changed, 569 insertions(+), 507 deletions(-) rename src/hic/{cache_impl.hpp => block_cache_impl.hpp} (96%) create mode 100644 src/hic/footer_cache_impl.hpp rename src/hic/include/hictk/hic/{cache.hpp => block_cache.hpp} (96%) create mode 100644 src/hic/include/hictk/hic/footer_cache.hpp diff --git a/src/hic/cache_impl.hpp b/src/hic/block_cache_impl.hpp similarity index 96% rename from src/hic/cache_impl.hpp rename to src/hic/block_cache_impl.hpp index d5f9117e..9eb827c0 100644 --- a/src/hic/cache_impl.hpp +++ b/src/hic/block_cache_impl.hpp @@ -9,6 +9,8 @@ #include #include +#include "hictk/hic/hic_footer.hpp" + namespace hictk::hic::internal { constexpr bool operator<(const InteractionBlock &a, const InteractionBlock &b) noexcept { @@ -111,14 +113,14 @@ inline auto InteractionBlock::end() const noexcept -> const_iterator { return _i inline auto InteractionBlock::cend() const noexcept -> const_iterator { return end(); } -inline auto InteractionBlock::at(std::uint64_t row) const noexcept -> const_iterator { - return 
_interactions.lower_bound(row); +inline auto InteractionBlock::find(std::uint64_t row) const noexcept -> const_iterator { + return _interactions.find(row); } inline auto InteractionBlock::find_overlap(std::uint64_t first_row, std::uint64_t last_row) const noexcept -> Overlap { assert(first_row <= last_row); - return {at(first_row), _interactions.upper_bound(last_row)}; + return {_interactions.lower_bound(first_row), _interactions.upper_bound(last_row)}; } inline bool InteractionBlock::has_overlap(std::uint64_t first_row, diff --git a/src/hic/block_reader_impl.hpp b/src/hic/block_reader_impl.hpp index 60ce9256..78e302cd 100644 --- a/src/hic/block_reader_impl.hpp +++ b/src/hic/block_reader_impl.hpp @@ -12,79 +12,6 @@ namespace hictk::hic::internal { -inline BlockGrid::BlockGrid(const std::vector &index, std::size_t block_column_count) { - if (index.empty()) { - return; - } - - _grid.resize(index.size()); - std::transform(index.begin(), index.end(), _grid.begin(), [&](const BlockIndex &idx) { - const auto col = idx.id / block_column_count; - const auto row = idx.id % block_column_count; - - return Node{std::make_shared(idx), {}, {}, row, col}; - }); - - std::sort(_grid.begin(), _grid.end(), [](const Node &n1, const Node &n2) { - if (n1.row == n2.row) { - return n1.col < n2.col; - } - return n1.row < n2.row; - }); - - init_nodes(); -} - -inline BlockGrid::BlockGrid(const BlockGrid &other) : _grid(other._grid) { init_nodes(); } -inline BlockGrid::BlockGrid(BlockGrid &&other) noexcept : _grid(std::move(other._grid)) { - init_nodes(); -} - -inline BlockGrid &BlockGrid::operator=(const BlockGrid &other) { - if (this == &other) { - return *this; - } - _grid = other._grid; - init_nodes(); - - return *this; -} - -inline BlockGrid &BlockGrid::operator=(BlockGrid &&other) noexcept { - if (this == &other) { - return *this; - } - _grid = std::move(other._grid); - init_nodes(); - - return *this; -} - -inline auto BlockGrid::begin() noexcept -> std::vector::iterator { return 
_grid.begin(); } -inline auto BlockGrid::end() noexcept -> std::vector::iterator { return _grid.end(); } - -inline auto BlockGrid::begin() const noexcept -> std::vector::const_iterator { - return _grid.begin(); -} -inline auto BlockGrid::end() const noexcept -> std::vector::const_iterator { - return _grid.end(); -} -inline std::size_t BlockGrid::size() const noexcept { return _grid.size(); } - -inline void BlockGrid::init_nodes() { - auto current_row = _grid.begin(); - for (auto node = _grid.begin(); node != _grid.end(); ++node) { - if (node->row != current_row->row) { - current_row = node; - } - node->current_row = current_row; - - const auto row = node->row; - node->next_row = - std::find_if(node + 1, _grid.end(), [&](const auto &node1) { return node1.row == row; }); - } -} - template ::value>::type *> inline T BinaryBuffer::read() { static_assert(sizeof(char) == 1, ""); @@ -104,17 +31,15 @@ inline std::string &BinaryBuffer::reset() noexcept { return _buffer; } -inline HiCBlockReader::HiCBlockReader(std::shared_ptr hfs, const HiCFooter &footer, +inline HiCBlockReader::HiCBlockReader(std::shared_ptr hfs, const Index &master_index, std::shared_ptr bins_, std::shared_ptr block_cache_, const PixelCoordinates &coords1, const PixelCoordinates &coords2) : _hfs(std::move(hfs)), - _index(read_index(*_hfs, footer)), _blk_cache(std::move(block_cache_)), - _bins(std::move(bins_)) { - find_overlapping_blocks(coords1, coords2); -} + _bins(std::move(bins_)), + _index(master_index.subset(coords1, coords2)) {} inline HiCBlockReader::operator bool() const noexcept { return !!_hfs; } @@ -123,7 +48,7 @@ inline const Chromosome &HiCBlockReader::chrom2() const noexcept { return _index inline const BinTable &HiCBlockReader::bins() const noexcept { return *_bins; } -inline const BlockGrid &HiCBlockReader::grid() const { return _block_grid; } +inline const Index &HiCBlockReader::index() const noexcept { return _index; } inline double HiCBlockReader::sum() const noexcept { return 
_index.matrix_sum(); } @@ -134,21 +59,14 @@ inline double HiCBlockReader::avg() const noexcept { return sum() / double(num_bins1 * num_bins2); } -inline void HiCBlockReader::find_overlapping_blocks(const hictk::PixelCoordinates &coords1, - const hictk::PixelCoordinates &coords2) { - std::vector _blocks_idx; - _index.map_2d_query_to_blocks(coords1, coords2, _blocks_idx); - _block_grid = BlockGrid(_blocks_idx, _index.block_column_count()); -} - inline Index HiCBlockReader::read_index(HiCFileStream &hfs, const HiCFooter &footer) { if (footer.fileOffset() == -1) { // Footer does not exist. However, query may be valid return {}; } - return hfs.readBlockMap(footer.fileOffset(), footer.chrom1(), footer.chrom2(), footer.unit(), - footer.resolution()); + return hfs.read_index(footer.fileOffset(), footer.chrom1(), footer.chrom2(), footer.unit(), + footer.resolution()); } inline std::shared_ptr HiCBlockReader::read(const BlockIndex &idx) { @@ -158,7 +76,7 @@ inline std::shared_ptr HiCBlockReader::read(const BlockI assert(_blk_cache); assert(_bins); - if (auto it = _blk_cache->find(idx.id); it != _blk_cache->end()) { + if (auto it = _blk_cache->find(idx.id()); it != _blk_cache->end()) { return it->second; } @@ -212,7 +130,7 @@ inline std::shared_ptr HiCBlockReader::read(const BlockI HICTK_UNREACHABLE_CODE; } - auto it = _blk_cache->emplace(idx.id, InteractionBlock{idx.id, _tmp_buffer}); + auto it = _blk_cache->emplace(idx.id(), InteractionBlock{idx.id(), _tmp_buffer}); return it.first->second; } diff --git a/src/hic/footer_cache_impl.hpp b/src/hic/footer_cache_impl.hpp new file mode 100644 index 00000000..9f5ab81c --- /dev/null +++ b/src/hic/footer_cache_impl.hpp @@ -0,0 +1,74 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include +#include + +#include "hictk/hic/hic_footer.hpp" + +namespace hictk::hic::internal { + +inline bool FooterCache::HiCFooterPtrCmp::operator()( + const std::shared_ptr &f1, + const 
std::shared_ptr &f2) const noexcept { + if (!f1 || !f2) { + return f1 == f2; + } + return *f1 == *f2; +} + +inline bool FooterCache::HiCFooterPtrCmp::operator()( + const HiCFooterMetadata &m1, const std::shared_ptr &f2) const noexcept { + if (!f2) { + return false; + } + return m1 == f2->metadata(); +} + +inline bool FooterCache::HiCFooterPtrCmp::operator()(const std::shared_ptr &f1, + const HiCFooterMetadata &m2) const noexcept { + if (!f1) { + return false; + } + return f1->metadata() == m2; +} + +inline std::size_t FooterCache::HiCFooterPtrHasher::operator()( + const std::shared_ptr &f) const noexcept { + if (!f) { + return std::hash{}(f.get()); + } + return (*this)(f->metadata()); +} +inline std::size_t FooterCache::HiCFooterPtrHasher::operator()( + const HiCFooterMetadata &m) const noexcept { + return std::hash{}(m); +} + +inline auto FooterCache::begin() const noexcept -> decltype(_cache.cbegin()) { + return _cache.begin(); +} +inline auto FooterCache::end() const noexcept -> decltype(_cache.cbegin()) { return _cache.end(); } + +inline auto FooterCache::cbegin() const noexcept -> decltype(_cache.cbegin()) { + return _cache.cbegin(); +} +inline auto FooterCache::cend() const noexcept -> decltype(_cache.cbegin()) { + return _cache.cend(); +} + +inline auto FooterCache::emplace(HiCFooter &&f) -> decltype(_cache.emplace()) { + return _cache.emplace(std::make_shared(std::move(f))); +} +inline auto FooterCache::find(const HiCFooterMetadata &m) -> const_iterator { + return _cache.find(m); +} +inline std::size_t FooterCache::size() const noexcept { return _cache.size(); } +inline void FooterCache::clear() { return _cache.clear(); } + +} // namespace hictk::hic::internal diff --git a/src/hic/hic_file_impl.hpp b/src/hic/hic_file_impl.hpp index 1155987d..4fed1f4c 100644 --- a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -13,6 +13,7 @@ #include #include "hictk/hic/common.hpp" +#include "hictk/hic/hic_footer.hpp" namespace hictk::hic { @@ -67,14 +68,13 @@ 
inline std::shared_ptr HiCFile::get_footer( _fs->header().chromosomes.at(chrom2_id)}; auto it = _footers.find(metadata); if (it != _footers.end()) { - return it->second; + return *it; } - auto footer = std::make_shared( - _fs->readFooter(chrom1_id, chrom2_id, matrix_type, norm, unit, resolution)); - auto node = _footers.emplace(std::move(metadata), std::move(footer)); - assert(node.second); - return node.first->second; + auto [node, _] = + _footers.emplace(_fs->read_footer(chrom1_id, chrom2_id, matrix_type, norm, unit, resolution)); + + return *node; } /* inline internal::MatrixSelector HiCFile::get_matrix_selector(const Chromosome& chrom, @@ -166,7 +166,22 @@ inline PixelSelector HiCFile::fetch(const Chromosome& chrom1, std::uint32_t star internal::HiCFooterMetadata metadata{url(), _type, norm, _unit, resolution(), chrom1, chrom2, -1}; - return std::make_shared(std::move(metadata)); + if (metadata.fileOffset == -1) { + return std::make_shared(internal::Index{metadata.chrom1, + metadata.chrom2, + metadata.unit, + metadata.resolution, + _fs->version(), + 1, + 1, + 0, + {}}, + std::move(metadata)); + } + return std::make_shared( + _fs->read_index(metadata.fileOffset, metadata.chrom1, metadata.chrom2, metadata.unit, + metadata.resolution), + std::move(metadata)); } }(); diff --git a/src/hic/hic_file_stream_impl.hpp b/src/hic/hic_file_stream_impl.hpp index e296cc82..bca54717 100644 --- a/src/hic/hic_file_stream_impl.hpp +++ b/src/hic/hic_file_stream_impl.hpp @@ -178,9 +178,9 @@ inline auto HiCFileStream::init_decompressor() -> Decompressor { return zs; } -inline Index HiCFileStream::readBlockMap(std::int64_t fileOffset, const Chromosome &chrom1, - const Chromosome &chrom2, MatrixUnit wantedUnit, - std::int64_t wantedResolution) { +inline Index HiCFileStream::read_index(std::int64_t fileOffset, const Chromosome &chrom1, + const Chromosome &chrom2, MatrixUnit wantedUnit, + std::int64_t wantedResolution) { _fs->seekg(fileOffset); [[maybe_unused]] const auto c1i = 
_fs->read(); @@ -199,8 +199,8 @@ inline Index HiCFileStream::readBlockMap(std::int64_t fileOffset, const Chromoso std::ignore = _fs->read(); // percent95 const auto foundResolution = static_cast(_fs->read()); - const auto blockBinCount = _fs->read(); - const auto blockColumnCount = _fs->read(); + const auto blockBinCount = static_cast(_fs->read()); + const auto blockColumnCount = static_cast(_fs->read()); const auto nBlocks = static_cast(_fs->read()); @@ -212,17 +212,15 @@ inline Index HiCFileStream::readBlockMap(std::int64_t fileOffset, const Chromoso const auto size = static_cast(_fs->read()); assert(position + size < _fs->size()); if (size > 0) { - buffer.emplace(BlockIndex{block_id, position, size, 0, 0, 0, 0}); + buffer.emplace(block_id, position, size, blockColumnCount); } } - return {chrom1, - chrom2, - std::move(buffer), - version(), - static_cast(blockBinCount), - static_cast(blockColumnCount), - static_cast(sumCount)}; + return {chrom1, chrom2, + wantedUnit, static_cast(wantedResolution), + version(), blockBinCount, + blockColumnCount, static_cast(sumCount), + std::move(buffer)}; } constexpr std::int64_t blockSize = sizeof(int32_t) + sizeof(int64_t) + sizeof(int32_t); @@ -316,13 +314,13 @@ inline void HiCFileStream::readAndInflate(const BlockIndex &idx, std::string &pl // _strbuff is used to store compressed data // plainTextBuffer is used to store decompressed data assert(_decompressor); - assert(idx.compressed_size_bytes > 0); - const auto buffSize = static_cast(idx.compressed_size_bytes); + assert(idx.compressed_size_bytes() > 0); + const auto buffSize = idx.compressed_size_bytes(); plainTextBuffer.reserve(buffSize * 3); plainTextBuffer.resize(plainTextBuffer.capacity()); - _fs->seekg(static_cast(idx.file_offset)); + _fs->seekg(static_cast(idx.file_offset())); _fs->read(_strbuff, buffSize); std::size_t bytes_decompressed{}; @@ -346,7 +344,7 @@ inline void HiCFileStream::readAndInflate(const BlockIndex &idx, std::string &pl } } catch (const 
std::exception &e) { throw std::runtime_error(fmt::format(FMT_STRING("failed to decompress block at pos {}: {}"), - idx.file_offset, e.what())); + idx.file_offset(), e.what())); } } @@ -359,12 +357,10 @@ inline bool HiCFileStream::checkMagicString(std::string url) noexcept { } } -inline HiCFooter HiCFileStream::readFooter(const std::uint32_t chrom1_id, - const std::uint32_t chrom2_id, - const MatrixType matrix_type, - const NormalizationMethod wanted_norm, - const MatrixUnit wanted_unit, - const std::uint32_t wanted_resolution) { +inline HiCFooter HiCFileStream::read_footer(std::uint32_t chrom1_id, std::uint32_t chrom2_id, + MatrixType matrix_type, NormalizationMethod wanted_norm, + MatrixUnit wanted_unit, + std::uint32_t wanted_resolution) { assert(chrom1_id <= chrom2_id); assert(std::find(_header->resolutions.begin(), _header->resolutions.end(), wanted_resolution) != _header->resolutions.end()); @@ -373,22 +369,17 @@ inline HiCFooter HiCFileStream::readFooter(const std::uint32_t chrom1_id, using NM = NormalizationMethod; // clang-format off - HiCFooter footer{ - HiCFooterMetadata{_fs->url(), - matrix_type, - wanted_norm, - wanted_unit, - wanted_resolution, - _header->chromosomes.at(chrom1_id), - _header->chromosomes.at(chrom2_id)} - }; + HiCFooterMetadata metadata{ + _fs->url(), + matrix_type, + wanted_norm, + wanted_unit, + wanted_resolution, + _header->chromosomes.at(chrom1_id), + _header->chromosomes.at(chrom2_id) + }; // clang-format on - auto &metadata = footer.metadata(); - auto &expectedValues = footer.expectedValues(); - auto &c1Norm = footer.c1Norm(); - auto &c2Norm = footer.c2Norm(); - const auto key = fmt::format(FMT_COMPILE("{}_{}"), chrom1_id, chrom2_id); _fs->seekg(masterOffset()); @@ -410,12 +401,20 @@ inline HiCFooter HiCFileStream::readFooter(const std::uint32_t chrom1_id, wanted_resolution, wanted_unit)); } + HiCFooter footer{read_index(metadata.fileOffset, metadata.chrom1, metadata.chrom2, metadata.unit, + metadata.resolution), + 
std::move(metadata)}; + if ((matrix_type == MT::observed && wanted_norm == NM::NONE) || ((matrix_type == MT::oe || matrix_type == MT::expected) && wanted_norm == NM::NONE && chrom1_id != chrom2_id)) { return footer; // no need to read wanted_norm vector index } + auto &expectedValues = footer.expectedValues(); + auto &c1Norm = footer.c1Norm(); + auto &c2Norm = footer.c2Norm(); + // read in and ignore expected value maps; don't store; reading these to // get to wanted_norm vector index auto nExpectedValues = _fs->read(); diff --git a/src/hic/hic_footer_impl.hpp b/src/hic/hic_footer_impl.hpp index 14e7bffb..6a77933c 100644 --- a/src/hic/hic_footer_impl.hpp +++ b/src/hic/hic_footer_impl.hpp @@ -24,8 +24,8 @@ inline bool HiCFooterMetadata::operator!=(const HiCFooterMetadata &other) const return !(*this == other); } -inline HiCFooter::HiCFooter(HiCFooterMetadata metadata_) noexcept - : _metadata(std::move(metadata_)) {} +inline HiCFooter::HiCFooter(Index index_, HiCFooterMetadata metadata_) noexcept + : _index(std::move(index_)), _metadata(std::move(metadata_)) {} constexpr HiCFooter::operator bool() const noexcept { return !metadata(); } inline bool HiCFooter::operator==(const HiCFooter &other) const noexcept { @@ -36,6 +36,7 @@ inline bool HiCFooter::operator!=(const HiCFooter &other) const noexcept { } constexpr const HiCFooterMetadata &HiCFooter::metadata() const noexcept { return _metadata; } constexpr HiCFooterMetadata &HiCFooter::metadata() noexcept { return _metadata; } +inline const Index &HiCFooter::index() const noexcept { return _index; } constexpr const std::string &HiCFooter::url() const noexcept { return metadata().url; } constexpr MatrixType HiCFooter::matrix_type() const noexcept { return metadata().matrix_type; } constexpr NormalizationMethod HiCFooter::normalization() const noexcept { @@ -73,10 +74,13 @@ constexpr std::vector &HiCFooter::c2Norm() noexcept { } // namespace hictk::hic::internal -template <> -struct std::hash { - inline std::size_t 
operator()(hictk::hic::internal::HiCFooterMetadata const &m) const noexcept { - return hictk::internal::hash_combine(0, m.url, m.matrix_type, m.normalization, m.unit, - m.resolution, m.chrom1, m.chrom2); - } -}; +inline std::size_t std::hash::operator()( + hictk::hic::internal::HiCFooterMetadata const &m) const noexcept { + return hictk::internal::hash_combine(0, m.url, m.matrix_type, m.normalization, m.unit, + m.resolution, m.chrom1, m.chrom2); +} + +inline std::size_t std::hash::operator()( + hictk::hic::internal::HiCFooter const &f) const noexcept { + return std::hash{}(f.metadata()); +} diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index c2926b5d..9cc583da 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -12,9 +12,11 @@ #include #include +#include "hictk/hic/block_cache.hpp" #include "hictk/hic/block_reader.hpp" #include "hictk/hic/common.hpp" #include "hictk/hic/filestream.hpp" +#include "hictk/hic/footer_cache.hpp" #include "hictk/hic/hic_file_stream.hpp" #include "hictk/hic/hic_footer.hpp" #include "hictk/hic/hic_header.hpp" @@ -22,14 +24,10 @@ #include "hictk/hic/pixel_selector.hpp" namespace hictk::hic { + class HiCFile { - // clang-format off - using FooterCacheT = - std::unordered_map>; - // clang-format on mutable std::shared_ptr _fs{}; - mutable FooterCacheT _footers{}; + mutable internal::FooterCache _footers{}; MatrixType _type{MatrixType::observed}; MatrixUnit _unit{MatrixUnit::BP}; mutable std::shared_ptr _block_cache{}; diff --git a/src/hic/include/hictk/hic/cache.hpp b/src/hic/include/hictk/hic/block_cache.hpp similarity index 96% rename from src/hic/include/hictk/hic/cache.hpp rename to src/hic/include/hictk/hic/block_cache.hpp index a6c66e00..25a9db20 100644 --- a/src/hic/include/hictk/hic/cache.hpp +++ b/src/hic/include/hictk/hic/block_cache.hpp @@ -13,6 +13,7 @@ #include "hictk/chromosome.hpp" #include "hictk/hic/common.hpp" +#include "hictk/hic/hic_footer.hpp" #include 
"hictk/pixel.hpp" namespace hictk::hic::internal { @@ -76,7 +77,7 @@ class InteractionBlock { [[nodiscard]] const Chromosome& chrom1() const noexcept; [[nodiscard]] const Chromosome& chrom2() const noexcept; - [[nodiscard]] auto at(std::uint64_t row) const noexcept -> const_iterator; + [[nodiscard]] auto find(std::uint64_t row) const noexcept -> const_iterator; [[nodiscard]] auto find_overlap(std::uint64_t first_row, std::uint64_t last_row) const noexcept -> Overlap; @@ -141,4 +142,4 @@ class BlockLRUCache { } // namespace hictk::hic::internal -#include "../../../cache_impl.hpp" +#include "../../../block_cache_impl.hpp" diff --git a/src/hic/include/hictk/hic/block_reader.hpp b/src/hic/include/hictk/hic/block_reader.hpp index 6a5c7909..a7ac961d 100644 --- a/src/hic/include/hictk/hic/block_reader.hpp +++ b/src/hic/include/hictk/hic/block_reader.hpp @@ -4,52 +4,21 @@ #pragma once +#include + +#include #include #include +#include +#include #include "hictk/chromosome.hpp" -#include "hictk/hic/cache.hpp" -#include "hictk/hic/filestream.hpp" +#include "hictk/hic/block_cache.hpp" #include "hictk/hic/hic_file_stream.hpp" #include "hictk/hic/index.hpp" namespace hictk::hic::internal { -class BlockGrid { - public: - struct Node { - std::shared_ptr block_idx{}; - std::vector::iterator current_row{}; // first node in row - std::vector::iterator next_row{}; // node to first node in next row - std::size_t row{}; - std::size_t col{}; - }; - - private: - std::vector _grid{}; - - public: - class iterator; - BlockGrid() = default; - BlockGrid(const std::vector& index, std::size_t block_column_count); - BlockGrid(const BlockGrid& other); - BlockGrid(BlockGrid&& other) noexcept; - - ~BlockGrid() = default; - - BlockGrid& operator=(const BlockGrid& other); - BlockGrid& operator=(BlockGrid&& other) noexcept; - - [[nodiscard]] auto begin() noexcept -> std::vector::iterator; - [[nodiscard]] auto end() noexcept -> std::vector::iterator; - [[nodiscard]] auto begin() const noexcept -> 
std::vector::const_iterator; - [[nodiscard]] auto end() const noexcept -> std::vector::const_iterator; - [[nodiscard]] std::size_t size() const noexcept; - - private: - void init_nodes(); -}; - class BinaryBuffer { std::string _buffer{}; std::size_t _i{}; @@ -68,19 +37,18 @@ class BinaryBuffer { class HiCBlockReader { std::shared_ptr _hfs{}; - Index _index{}; std::shared_ptr _blk_cache{}; // This should be passed in by file. Key should be // changed from size_t to {chrom1, chrom2, size_t} // We need the entire bin table in order to map pixels to abs bin ids std::shared_ptr _bins{}; - BlockGrid _block_grid{}; + Index _index{}; BinaryBuffer _bbuffer{}; std::vector _tmp_buffer{}; public: HiCBlockReader() = default; - HiCBlockReader(std::shared_ptr hfs, const HiCFooter& footer, + HiCBlockReader(std::shared_ptr hfs, const Index& master_index, std::shared_ptr bins_, std::shared_ptr block_cache_, const PixelCoordinates& coords1, const PixelCoordinates& coords2); @@ -89,7 +57,7 @@ class HiCBlockReader { [[nodiscard]] const Chromosome& chrom1() const noexcept; [[nodiscard]] const Chromosome& chrom2() const noexcept; [[nodiscard]] const BinTable& bins() const noexcept; - [[nodiscard]] const BlockGrid& grid() const; + [[nodiscard]] const Index& index() const noexcept; [[nodiscard]] double sum() const noexcept; [[nodiscard]] double avg() const noexcept; @@ -97,7 +65,6 @@ class HiCBlockReader { [[nodiscard]] std::shared_ptr read(const BlockIndex& idx); private: - void find_overlapping_blocks(const PixelCoordinates& coords1, const PixelCoordinates& coords2); [[nodiscard]] static Index read_index(HiCFileStream& hfs, const HiCFooter& footer); static void read_dispatcher_type1_block(bool i16Bin1, bool i16Bin2, bool i16Counts, std::int32_t bin1Offset, std::int32_t bin2Offset, diff --git a/src/hic/include/hictk/hic/footer_cache.hpp b/src/hic/include/hictk/hic/footer_cache.hpp new file mode 100644 index 00000000..f82bc6a3 --- /dev/null +++ b/src/hic/include/hictk/hic/footer_cache.hpp 
@@ -0,0 +1,66 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#pragma once + +#include +#include +#include + +#include +#include +#include + +#include "hictk/chromosome.hpp" +#include "hictk/hic/common.hpp" +#include "hictk/hic/hic_footer.hpp" +#include "hictk/pixel.hpp" + +namespace hictk::hic::internal { + +class FooterCache { + struct HiCFooterPtrCmp { + using is_transparent = void; + + bool operator()(const std::shared_ptr& f1, + const std::shared_ptr& f2) const noexcept; + bool operator()(const HiCFooterMetadata& m1, + const std::shared_ptr& f2) const noexcept; + bool operator()(const std::shared_ptr& f1, + const HiCFooterMetadata& m2) const noexcept; + }; + + struct HiCFooterPtrHasher { + using is_transparent = void; + + std::size_t operator()(const std::shared_ptr& f) const noexcept; + std::size_t operator()(const HiCFooterMetadata& m) const noexcept; + }; + + using MapT = + phmap::flat_hash_set, HiCFooterPtrHasher, HiCFooterPtrCmp>; + MapT _cache; + + public: + using difference_type = MapT::difference_type; + using iterator = MapT::iterator; + using const_iterator = MapT::iterator; + FooterCache() = default; + + auto begin() const noexcept -> decltype(_cache.cbegin()); + auto end() const noexcept -> decltype(_cache.cbegin()); + + auto cbegin() const noexcept -> decltype(_cache.cbegin()); + auto cend() const noexcept -> decltype(_cache.cbegin()); + + auto emplace(HiCFooter&& f) -> decltype(_cache.emplace()); + auto find(const HiCFooterMetadata& m) -> const_iterator; + + [[nodiscard]] std::size_t size() const noexcept; + void clear(); +}; + +} // namespace hictk::hic::internal + +#include "../../../footer_cache_impl.hpp" diff --git a/src/hic/include/hictk/hic/hic_file_stream.hpp b/src/hic/include/hictk/hic/hic_file_stream.hpp index 30a8304e..5fbc8c43 100644 --- a/src/hic/include/hictk/hic/hic_file_stream.hpp +++ b/src/hic/include/hictk/hic/hic_file_stream.hpp @@ -22,17 +22,6 @@ namespace hictk::hic::internal { -// TODO REMOVE 
-// struct BlockIndex { -// phmap::btree_map blocks{}; -// std::int32_t blockBinCount{}; -// std::int32_t blockColumnCount{}; -// double sumCount{}; -// -// [[nodiscard]] indexEntry at(std::size_t id) const noexcept; -// [[nodiscard]] indexEntry at(std::size_t row, std::size_t col) const noexcept; -// }; - class HiCFileStream { using Decompressor = UniquePtrWithDeleter; std::shared_ptr _fs{}; @@ -50,18 +39,18 @@ class HiCFileStream { // reads the footer given a pair of chromosomes, wanted_norm, wanted_unit (BP or FRAG) and // resolution. - [[nodiscard]] HiCFooter readFooter(std::uint32_t chrom1_id, std::uint32_t chrom2_id, - MatrixType matrix_type, NormalizationMethod wanted_norm, - MatrixUnit wanted_unit, std::uint32_t wanted_resolution); + [[nodiscard]] HiCFooter read_footer(std::uint32_t chrom1_id, std::uint32_t chrom2_id, + MatrixType matrix_type, NormalizationMethod wanted_norm, + MatrixUnit wanted_unit, std::uint32_t wanted_resolution); [[nodiscard]] static MatrixType readMatrixType(filestream::FileStream &fs, std::string &buff); [[nodiscard]] static NormalizationMethod readNormalizationMethod(filestream::FileStream &fs, std::string &buff); [[nodiscard]] static MatrixUnit readMatrixUnit(filestream::FileStream &fs, std::string &buff); - [[nodiscard]] Index readBlockMap(std::int64_t fileOffset, const Chromosome &chrom1, - const Chromosome &chrom2, MatrixUnit wantedUnit, - std::int64_t wantedResolution); + [[nodiscard]] Index read_index(std::int64_t fileOffset, const Chromosome &chrom1, + const Chromosome &chrom2, MatrixUnit wantedUnit, + std::int64_t wantedResolution); void readAndInflate(const BlockIndex &idx, std::string &plainTextBuffer); [[nodiscard]] static bool checkMagicString(std::string url) noexcept; diff --git a/src/hic/include/hictk/hic/hic_footer.hpp b/src/hic/include/hictk/hic/hic_footer.hpp index e70fa6c0..7bd6fe61 100644 --- a/src/hic/include/hictk/hic/hic_footer.hpp +++ b/src/hic/include/hictk/hic/hic_footer.hpp @@ -5,11 +5,13 @@ #pragma once 
#include +#include #include #include #include "hictk/chromosome.hpp" #include "hictk/hic/common.hpp" +#include "hictk/hic/index.hpp" namespace hictk::hic::internal { struct HiCFooterMetadata { @@ -28,6 +30,7 @@ struct HiCFooterMetadata { }; class HiCFooter { + Index _index{}; HiCFooterMetadata _metadata{}; std::vector _expectedValues{}; std::vector _c1Norm{}; @@ -35,7 +38,7 @@ class HiCFooter { public: HiCFooter() = default; - explicit HiCFooter(HiCFooterMetadata metadata_) noexcept; + explicit HiCFooter(Index index_, HiCFooterMetadata metadata_) noexcept; constexpr explicit operator bool() const noexcept; bool operator==(const HiCFooter &other) const noexcept; @@ -43,6 +46,7 @@ class HiCFooter { [[nodiscard]] constexpr const HiCFooterMetadata &metadata() const noexcept; [[nodiscard]] constexpr HiCFooterMetadata &metadata() noexcept; + [[nodiscard]] const Index &index() const noexcept; [[nodiscard]] constexpr const std::string &url() const noexcept; [[nodiscard]] constexpr MatrixType matrix_type() const noexcept; @@ -63,4 +67,13 @@ class HiCFooter { }; } // namespace hictk::hic::internal +template <> +struct std::hash { + inline std::size_t operator()(hictk::hic::internal::HiCFooterMetadata const &m) const noexcept; +}; + +template <> +struct std::hash { + inline std::size_t operator()(hictk::hic::internal::HiCFooter const &f) const noexcept; +}; #include "../../../hic_footer_impl.hpp" diff --git a/src/hic/include/hictk/hic/index.hpp b/src/hic/include/hictk/hic/index.hpp index 6a401e5c..cc25839c 100644 --- a/src/hic/include/hictk/hic/index.hpp +++ b/src/hic/include/hictk/hic/index.hpp @@ -4,58 +4,75 @@ #pragma once +#include +#include +#include #include #include "hictk/chromosome.hpp" -#include "hictk/hic/cache.hpp" -#include "hictk/hic/filestream.hpp" -#include "hictk/hic/hic_file_stream.hpp" +#include "hictk/common.hpp" +#include "hictk/pixel.hpp" namespace hictk::hic::internal { -struct BlockIndex { - std::size_t id{null_id}; // NOLINT - std::size_t 
file_offset{}; // NOLINT - std::size_t compressed_size_bytes{}; // NOLINT +class BlockIndex { + public: + struct GridCoordinates { + std::size_t row; + std::size_t col; + + constexpr bool operator==(const GridCoordinates& other) const noexcept; + constexpr bool operator!=(const GridCoordinates& other) const noexcept; + constexpr bool operator<(const GridCoordinates& other) const noexcept; + }; - std::size_t first_row{}; - std::size_t last_row{}; - std::size_t first_col{}; - std::size_t last_col{}; + std::size_t _id{null_id}; // NOLINT + std::size_t _file_offset{}; // NOLINT + std::size_t _compressed_size_bytes{}; // NOLINT + GridCoordinates _coords{}; // NOLINT static constexpr auto null_id = (std::numeric_limits::max)(); - constexpr explicit operator bool() const noexcept; - friend constexpr bool operator<(const BlockIndex& a, const BlockIndex& b) noexcept; - friend constexpr bool operator==(const BlockIndex& a, const BlockIndex& b) noexcept; - friend constexpr bool operator!=(const BlockIndex& a, const BlockIndex& b) noexcept; + constexpr BlockIndex() = default; + constexpr BlockIndex(std::size_t id_, std::size_t file_offset_, + std::size_t compressed_size_bytes_, std::size_t block_column_count) noexcept; - friend constexpr bool operator<(const BlockIndex& a, std::size_t b_id) noexcept; - friend constexpr bool operator==(const BlockIndex& a, std::size_t b_id) noexcept; - friend constexpr bool operator!=(const BlockIndex& a, std::size_t b_id) noexcept; + [[nodiscard]] constexpr std::size_t id() const noexcept; + [[nodiscard]] constexpr std::size_t file_offset() const noexcept; + [[nodiscard]] constexpr std::size_t compressed_size_bytes() const noexcept; + [[nodiscard]] constexpr auto coords() const noexcept -> const GridCoordinates&; - friend constexpr bool operator<(std::size_t a_id, const BlockIndex& b) noexcept; - friend constexpr bool operator==(std::size_t a_id, const BlockIndex& b) noexcept; - friend constexpr bool operator!=(std::size_t a_id, const 
BlockIndex& b) noexcept; + constexpr explicit operator bool() const noexcept; + constexpr bool operator==(const BlockIndex& other) const noexcept; + constexpr bool operator!=(const BlockIndex& other) const noexcept; + constexpr bool operator<(const BlockIndex& other) const noexcept; + constexpr bool operator==(const BlockIndex::GridCoordinates& coords_) const noexcept; + constexpr bool operator!=(const BlockIndex::GridCoordinates& coords_) const noexcept; + constexpr bool operator<(const BlockIndex::GridCoordinates& coords_) const noexcept; }; struct BlockIndexCmp { using is_transparent = void; constexpr bool operator()(const BlockIndex& a, const BlockIndex& b) const noexcept; - constexpr bool operator()(const BlockIndex& a, std::size_t b_id) const noexcept; - constexpr bool operator()(std::size_t a_id, const BlockIndex& b) const noexcept; + constexpr bool operator()(const BlockIndex& a, + const BlockIndex::GridCoordinates& b_coords) const noexcept; + constexpr bool operator()(const BlockIndex::GridCoordinates& a_coords, + const BlockIndex& b) const noexcept; }; // Map coordinates (bp) to block IDs class Index { + using BlockIndexMap = phmap::btree_set; // map block_ids to file offsets - const phmap::btree_set _block_map{}; + const BlockIndexMap _block_map{}; std::int32_t _version{}; std::size_t _block_bin_count{}; std::size_t _block_column_count{}; // columns of blocks per matrix? 
double _sum_count{}; // sum + MatrixUnit _unit{}; + std::uint32_t _resolution{}; Chromosome _chrom1{}; Chromosome _chrom2{}; @@ -63,10 +80,12 @@ class Index { static constexpr auto npos = (std::numeric_limits::max)(); Index() = default; - Index(Chromosome chrom1_, Chromosome chrom2_, phmap::btree_set blocks_, + Index(Chromosome chrom1_, Chromosome chrom2_, MatrixUnit unit_, std::uint32_t resolution_, std::int32_t version_, std::size_t block_bin_count_, std::size_t block_column_count_, - double sum_count_); + double sum_count_, BlockIndexMap blocks_); + [[nodiscard]] MatrixUnit unit() const noexcept; + [[nodiscard]] std::uint32_t resolution() const noexcept; [[nodiscard]] const Chromosome& chrom1() const noexcept; [[nodiscard]] const Chromosome& chrom2() const noexcept; [[nodiscard]] bool is_intra() const noexcept; @@ -74,19 +93,28 @@ class Index { [[nodiscard]] constexpr std::size_t block_bin_count() const noexcept; [[nodiscard]] constexpr std::size_t block_column_count() const noexcept; - std::vector map_2d_query_to_blocks(const PixelCoordinates& coords1, - const PixelCoordinates& coords2); - void map_2d_query_to_blocks(const PixelCoordinates& coords1, const PixelCoordinates& coords2, - std::vector& buffer); + [[nodiscard]] auto begin() const noexcept -> BlockIndexMap::const_iterator; + [[nodiscard]] auto end() const noexcept -> BlockIndexMap::const_iterator; + [[nodiscard]] auto cbegin() const noexcept -> BlockIndexMap::const_iterator; + [[nodiscard]] auto cend() const noexcept -> BlockIndexMap::const_iterator; + + [[nodiscard]] std::size_t size() const noexcept; + [[nodiscard]] bool empty() const noexcept; + + [[nodiscard]] Index subset(const PixelCoordinates& coords1, + const PixelCoordinates& coords2) const; + + [[nodiscard]] auto map_2d_query_to_blocks(const PixelCoordinates& coords1, + const PixelCoordinates& coords2) const -> BlockIndexMap; - const BlockIndex& at(std::size_t id) const; + [[nodiscard]] const BlockIndex& at(std::size_t row, std::size_t col) 
const; private: void _map_2d_query_to_blocks(const PixelCoordinates& coords1, const PixelCoordinates& coords2, - std::vector& buffer); + BlockIndexMap& buffer) const; void _map_2d_query_to_blocks_intra_v9plus(const PixelCoordinates& coords1, const PixelCoordinates& coords2, - std::vector& buffer); + BlockIndexMap& buffer) const; }; } // namespace hictk::hic::internal diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index b5ac95e9..1f74fac7 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -8,7 +8,7 @@ #include #include "hictk/bin_table.hpp" -#include "hictk/hic/cache.hpp" +#include "hictk/hic/block_cache.hpp" #include "hictk/hic/common.hpp" #include "hictk/hic/hic_file_stream.hpp" #include "hictk/hic/index.hpp" @@ -18,6 +18,7 @@ namespace hictk::hic { class PixelSelector { mutable internal::HiCBlockReader _reader{}; + std::shared_ptr _footer{}; PixelCoordinates _coord1{}; @@ -60,6 +61,11 @@ class PixelSelector { [[nodiscard]] const PixelCoordinates &coord1() const noexcept; [[nodiscard]] const PixelCoordinates &coord2() const noexcept; + [[nodiscard]] MatrixType matrix_type() const noexcept; + [[nodiscard]] NormalizationMethod normalization() const noexcept; + [[nodiscard]] MatrixUnit unit() const noexcept; + [[nodiscard]] std::uint32_t resolution() const noexcept; + [[nodiscard]] const Chromosome &chrom1() const noexcept; [[nodiscard]] const Chromosome &chrom2() const noexcept; @@ -83,12 +89,10 @@ class PixelSelector { const PixelSelector *_sel{}; using BufferT = std::vector>; - std::shared_ptr _grid{}; - decltype(_grid->begin()) _idx{}; // Index, knows where to read the next block - std::size_t _bin1_id{}; mutable std::shared_ptr _buffer{}; mutable std::size_t _buffer_i{}; + mutable std::size_t _pixels_processed{}; public: using difference_type = std::ptrdiff_t; @@ -122,12 +126,8 @@ class PixelSelector { [[nodiscard]] const PixelCoordinates 
&coord2() const noexcept; [[nodiscard]] std::size_t size() const noexcept; - void seek_to_next_block(); - - void mark_block_as_fully_read(); - - [[nodiscard]] std::shared_ptr read_block() noexcept; - void read_chunk_of_pixels(const internal::InteractionBlock &blk); + void read_next_row(); + [[nodiscard]] internal::Index find_blocks_overlapping_current_row(); }; }; diff --git a/src/hic/index_impl.hpp b/src/hic/index_impl.hpp index 3391cf63..b65de44d 100644 --- a/src/hic/index_impl.hpp +++ b/src/hic/index_impl.hpp @@ -4,59 +4,106 @@ #pragma once +#include + #include #include #include +#include #include #include namespace hictk::hic::internal { +constexpr bool BlockIndex::GridCoordinates::operator==( + const BlockIndex::GridCoordinates &other) const noexcept { + return row == other.row && col == other.col; +} + +constexpr bool BlockIndex::GridCoordinates::operator!=( + const BlockIndex::GridCoordinates &other) const noexcept { + return !(*this == other); +} + +constexpr bool BlockIndex::GridCoordinates::operator<( + const BlockIndex::GridCoordinates &other) const noexcept { + if (row == other.row) { + return col < other.col; + } + return row < other.row; +} + +constexpr BlockIndex::BlockIndex(std::size_t id_, std::size_t file_offset_, + std::size_t compressed_size_bytes_, + std::size_t block_column_count) noexcept + : _id(id_), + _file_offset(file_offset_), + _compressed_size_bytes(compressed_size_bytes_), + _coords({_id % block_column_count, _id / block_column_count}) {} + +constexpr std::size_t BlockIndex::id() const noexcept { return _id; } +constexpr std::size_t BlockIndex::file_offset() const noexcept { return _file_offset; } +constexpr std::size_t BlockIndex::compressed_size_bytes() const noexcept { + return _compressed_size_bytes; +} +constexpr auto BlockIndex::coords() const noexcept -> const GridCoordinates & { return _coords; } + constexpr BlockIndex::operator bool() const noexcept { - return id != null_id && compressed_size_bytes != 0; + return _id != 
null_id && _compressed_size_bytes != 0; +} + +constexpr bool BlockIndex::operator==(const BlockIndex &other) const noexcept { + return _id == other._id; +} + +constexpr bool BlockIndex::operator!=(const BlockIndex &other) const noexcept { + return !(*this == other); +} + +constexpr bool BlockIndex::operator<(const BlockIndex &other) const noexcept { + return _coords < other._coords; } -constexpr bool operator<(const BlockIndex &a, const BlockIndex &b) noexcept { return a < b.id; } -constexpr bool operator==(const BlockIndex &a, const BlockIndex &b) noexcept { return a == b.id; } -constexpr bool operator!=(const BlockIndex &a, const BlockIndex &b) noexcept { return !(a == b); } +constexpr bool BlockIndex::operator==(const BlockIndex::GridCoordinates &coords_) const noexcept { + return _coords == coords_; +} -constexpr bool operator<(const BlockIndex &a, std::size_t b_id) noexcept { return a.id < b_id; } -constexpr bool operator==(const BlockIndex &a, std::size_t b_id) noexcept { return a.id == b_id; } -constexpr bool operator!=(const BlockIndex &a, std::size_t b_id) noexcept { return !(a == b_id); } +constexpr bool BlockIndex::operator!=(const BlockIndex::GridCoordinates &coords_) const noexcept { + return !(*this == coords_); +} -constexpr bool operator<(std::size_t a_id, const BlockIndex &b) noexcept { return a_id < b.id; } -constexpr bool operator==(std::size_t a_id, const BlockIndex &b) noexcept { return a_id == b.id; } -constexpr bool operator!=(std::size_t a_id, const BlockIndex &b) noexcept { return !(a_id == b); } +constexpr bool BlockIndex::operator<(const BlockIndex::GridCoordinates &coords_) const noexcept { + return _coords < coords_; +} constexpr bool BlockIndexCmp::operator()(const BlockIndex &a, const BlockIndex &b) const noexcept { - return a < b; + return a.coords() < b.coords(); } -constexpr bool BlockIndexCmp::operator()(const BlockIndex &a, std::size_t b_id) const noexcept { - return a < b_id; + +constexpr bool BlockIndexCmp::operator()( + const 
BlockIndex &a, const BlockIndex::GridCoordinates &b_coords) const noexcept { + return a < b_coords; } -constexpr bool BlockIndexCmp::operator()(std::size_t a_id, const BlockIndex &b) const noexcept { - return a_id < b; +constexpr bool BlockIndexCmp::operator()(const BlockIndex::GridCoordinates &a_coords, + const BlockIndex &b) const noexcept { + return a_coords < b._coords; } -inline Index::Index(Chromosome chrom1_, Chromosome chrom2_, - phmap::btree_set blocks_, std::int32_t version_, - std::size_t block_bin_count_, std::size_t block_column_count_, - double sum_count_) +inline Index::Index(Chromosome chrom1_, Chromosome chrom2_, MatrixUnit unit_, + std::uint32_t resolution_, std::int32_t version_, std::size_t block_bin_count_, + std::size_t block_column_count_, double sum_count_, BlockIndexMap blocks_) : _block_map(std::move(blocks_)), _version(version_), _block_bin_count(block_bin_count_), _block_column_count(block_column_count_), _sum_count(sum_count_), + _unit(unit_), + _resolution(resolution_), _chrom1(std::move(chrom1_)), - _chrom2(std::move(chrom2_)) { - if (_block_bin_count == 0) { - throw std::runtime_error("index is corrupted: blockBinCount=0."); - } - if (_block_column_count == 0) { - throw std::runtime_error("index is corrupted: blockColumnCount=0."); - } -} + _chrom2(std::move(chrom2_)) {} +inline MatrixUnit Index::unit() const noexcept { return _unit; } +inline std::uint32_t Index::resolution() const noexcept { return _resolution; } inline const Chromosome &Index::chrom1() const noexcept { return _chrom1; } inline const Chromosome &Index::chrom2() const noexcept { return _chrom2; } inline bool Index::is_intra() const noexcept { return _chrom1 == _chrom2; } @@ -67,37 +114,65 @@ constexpr std::size_t Index::block_bin_count() const noexcept { return _block_bi constexpr std::size_t Index::block_column_count() const noexcept { return _block_column_count; } -inline std::vector Index::map_2d_query_to_blocks(const PixelCoordinates &coords1, - const 
PixelCoordinates &coords2) { - std::vector buffer{}; - map_2d_query_to_blocks(coords1, coords2, buffer); - return buffer; +inline auto Index::begin() const noexcept -> BlockIndexMap::const_iterator { + return _block_map.begin(); +} +inline auto Index::end() const noexcept -> BlockIndexMap::const_iterator { + return _block_map.end(); +} +inline auto Index::cbegin() const noexcept -> BlockIndexMap::const_iterator { + return _block_map.cbegin(); +} +inline auto Index::cend() const noexcept -> BlockIndexMap::const_iterator { + return _block_map.cend(); +} + +inline std::size_t Index::size() const noexcept { return _block_map.size(); } + +inline bool Index::empty() const noexcept { return size() == 0; } // NOLINT + +inline Index Index::subset(const PixelCoordinates &coords1, const PixelCoordinates &coords2) const { + return { + chrom1(), + chrom2(), + unit(), + resolution(), + _version, + block_bin_count(), + block_column_count(), + matrix_sum(), + map_2d_query_to_blocks(coords1, coords2), + }; } -inline const BlockIndex &Index::at(std::size_t id) const { - auto match = _block_map.find(id); +inline const BlockIndex &Index::at(std::size_t row, std::size_t col) const { + auto match = _block_map.find(BlockIndex::GridCoordinates{row, col}); if (match == _block_map.end()) { - throw std::out_of_range(fmt::format(FMT_STRING("unable to find block #{}: out of range"), id)); + throw std::out_of_range( + fmt::format(FMT_STRING("unable to find block {}{}: out of range"), row, col)); } return *match; } -inline void Index::map_2d_query_to_blocks(const PixelCoordinates &coords1, - const PixelCoordinates &coords2, - std::vector &buffer) { +inline auto Index::map_2d_query_to_blocks(const PixelCoordinates &coords1, + const PixelCoordinates &coords2) const -> BlockIndexMap { assert(coords1.is_intra()); assert(coords2.is_intra()); + BlockIndexMap buffer{}; + const auto is_intra = coords1.bin1.chrom() == coords2.bin1.chrom(); if (_version < 9 || is_intra) { - return 
_map_2d_query_to_blocks(coords1, coords2, buffer); + _map_2d_query_to_blocks(coords1, coords2, buffer); + } else { + _map_2d_query_to_blocks_intra_v9plus(coords1, coords2, buffer); } - return _map_2d_query_to_blocks_intra_v9plus(coords1, coords2, buffer); + return buffer; } inline void Index::_map_2d_query_to_blocks(const hictk::PixelCoordinates &coords1, const hictk::PixelCoordinates &coords2, - std::vector &buffer) { + BlockIndexMap &buffer) const { assert(coords1.bin1.chrom() == _chrom1 || coords1.bin1.chrom() == _chrom2); assert(coords2.bin1.chrom() == _chrom1 || coords2.bin1.chrom() == _chrom2); @@ -120,44 +195,28 @@ inline void Index::_map_2d_query_to_blocks(const hictk::PixelCoordinates &coords // check region part that overlaps with lower left triangle but only if intrachromosomal const auto checkLowerLeftTri = is_intra; - phmap::btree_set tmp_buffer{}; + buffer.clear(); // first check the upper triangular matrix_type for (auto row = row1; row <= row2; ++row) { for (auto col = col1; col <= col2; ++col) { - const auto id1 = row * _block_column_count + col; - auto match = _block_map.find(id1); + auto match = _block_map.find(BlockIndex::GridCoordinates{row, col}); if (match != _block_map.end()) { - auto block = *match; - block.first_row = bin1; - block.last_row = bin2; - block.first_col = bin3; - block.last_col = bin4; - - tmp_buffer.emplace(block); + buffer.emplace(*match); } if (checkLowerLeftTri) { - const auto id2 = col * _block_column_count + row; - match = _block_map.find(id2); + match = _block_map.find(BlockIndex::GridCoordinates{col, row}); if (match != _block_map.end()) { - auto block = *match; - block.first_row = bin1; - block.last_row = bin2; - block.first_col = bin3; - block.last_col = bin4; - tmp_buffer.emplace(block); + buffer.emplace(*match); } } } } - - buffer.resize(tmp_buffer.size()); - std::copy(tmp_buffer.begin(), tmp_buffer.end(), buffer.begin()); } inline void Index::_map_2d_query_to_blocks_intra_v9plus(const hictk::PixelCoordinates 
&coords1, const hictk::PixelCoordinates &coords2, - std::vector &buffer) { + BlockIndexMap &buffer) const { // https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#grid-structure assert(coords1.bin1.chrom() == _chrom1 || coords1.bin1.chrom() == _chrom2); assert(coords2.bin1.chrom() == _chrom1 || coords2.bin1.chrom() == _chrom2); @@ -193,23 +252,15 @@ inline void Index::_map_2d_query_to_blocks_intra_v9plus(const hictk::PixelCoordi // +1; integer divide rounds down const auto furtherDepth = std::max(translatedNearerDepth, translatedFurtherDepth) + 1; - phmap::btree_set block_ids{}; for (auto depth = nearerDepth; depth <= furtherDepth; ++depth) { for (auto pad = translatedLowerPAD; pad <= translatedHigherPAD; ++pad) { - const auto id = depth * _block_column_count + pad; - auto match = _block_map.find(id); + auto match = _block_map.find(BlockIndex::GridCoordinates{depth, pad}); if (match != _block_map.end()) { auto block = *match; - block.first_row = bin1; - block.last_row = bin2; - block.first_col = bin3; - block.last_col = bin3; - block_ids.emplace(block); + buffer.emplace(block); } } } - buffer.resize(block_ids.size()); - std::copy(block_ids.begin(), block_ids.end(), buffer.begin()); } } // namespace hictk::hic::internal diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index 280a891b..cd1f4b56 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -23,13 +23,15 @@ inline PixelSelector::PixelSelector(std::shared_ptr hfs std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coord1_, PixelCoordinates coord2_) noexcept - : _reader(std::move(hfs_), *footer_, std::move(bins_), std::move(cache_), coord1_, coord2_), + : _reader(std::move(hfs_), footer_->index(), std::move(bins_), std::move(cache_), coord1_, + coord2_), _footer(std::move(footer_)), _coord1(std::move(coord1_)), _coord2(std::move(coord2_)) {} inline bool PixelSelector::operator==(const PixelSelector &other) const noexcept { - 
return _footer == other._footer && _coord1 == other._coord1 && _coord2 == other._coord2; + return _reader.index().chrom1() == _reader.index().chrom2() && _coord1 == other._coord1 && + _coord2 == other._coord2; } inline bool PixelSelector::operator!=(const PixelSelector &other) const noexcept { @@ -63,8 +65,8 @@ inline SerializedPixel PixelSelector::process_interaction(SerializedPixel record assert(is_inter() || record.bin1_id <= record.bin2_id); - const auto skipNormalization = _footer->normalization() == NormalizationMethod::NONE || - _footer->matrix_type() == MatrixType::expected; + const auto skipNormalization = + normalization() == NormalizationMethod::NONE || matrix_type() == MatrixType::expected; if (!skipNormalization) { const auto bin1 = static_cast(record.bin1_id); @@ -74,10 +76,10 @@ inline SerializedPixel PixelSelector::process_interaction(SerializedPixel record record.count /= static_cast(c1Norm[bin1] * c2Norm[bin2]); } - record.bin1_id *= _footer->resolution(); - record.bin2_id *= _footer->resolution(); + record.bin1_id *= resolution(); + record.bin2_id *= resolution(); - if (_footer->matrix_type() == MatrixType::observed) { + if (matrix_type() == MatrixType::observed) { return record; } @@ -86,18 +88,17 @@ inline SerializedPixel PixelSelector::process_interaction(SerializedPixel record return float(_reader.avg()); } - const auto i = - static_cast((record.bin2_id - record.bin1_id) / _footer->resolution()); + const auto i = static_cast((record.bin2_id - record.bin1_id) / resolution()); assert(i < expected.size()); return float(expected[i]); }(); - if (_footer->matrix_type() == MatrixType::expected) { + if (matrix_type() == MatrixType::expected) { record.count = expectedCount; return record; } - assert(_footer->matrix_type() == MatrixType::oe); + assert(matrix_type() == MatrixType::oe); record.count /= expectedCount; return record; @@ -119,8 +120,8 @@ inline std::vector> PixelSelector::read_all_dbg() const { std::size_t i = 0; - for (const auto 
&block_idx : _reader.grid()) { - auto blk = _reader.read(*block_idx.block_idx); + for (const auto &block_idx : _reader.index()) { + auto blk = _reader.read(block_idx); if (!blk) { continue; } @@ -155,9 +156,8 @@ inline std::vector> PixelSelector::read_all_dbg() const { static_cast(b2), tp.count}); if (std::isfinite(record.count)) { buffer.emplace_back( - PixelCoordinates{ - bins().at(_footer->chrom1(), static_cast(record.bin1_id)), - bins().at(_footer->chrom2(), static_cast(record.bin2_id))}, + PixelCoordinates{bins().at(chrom1(), static_cast(record.bin1_id)), + bins().at(chrom2(), static_cast(record.bin2_id))}, record.count); } } @@ -171,6 +171,14 @@ inline std::vector> PixelSelector::read_all_dbg() const { inline const PixelCoordinates &PixelSelector::coord1() const noexcept { return _coord1; } inline const PixelCoordinates &PixelSelector::coord2() const noexcept { return _coord2; } +inline MatrixType PixelSelector::matrix_type() const noexcept { return metadata().matrix_type; } +inline NormalizationMethod PixelSelector::normalization() const noexcept { + return metadata().normalization; +} +inline MatrixUnit PixelSelector::unit() const noexcept { return _reader.index().unit(); } +inline std::uint32_t PixelSelector::resolution() const noexcept { + return _reader.index().resolution(); +} inline const Chromosome &PixelSelector::chrom1() const noexcept { return _coord1.bin1.chrom(); } inline const Chromosome &PixelSelector::chrom2() const noexcept { return _coord2.bin1.chrom(); } @@ -194,17 +202,14 @@ inline double PixelSelector::avg() const noexcept { return _reader.avg(); } template inline PixelSelector::iterator::iterator(const PixelSelector &sel) - : _sel(&sel), - _grid(std::make_shared(_sel->_reader.grid())), - _idx(_grid->begin()), - _bin1_id(coord1().bin1.rel_id()), - _buffer(std::make_shared(1000)) { - if (_grid->size() == 0) { + : _sel(&sel), _bin1_id(coord1().bin1.rel_id()), _buffer(std::make_shared()) { + if (_sel->_reader.index().empty()) { *this = 
at_end(sel); return; } - auto blk = read_block(); - read_chunk_of_pixels(*blk); + while (_buffer->empty()) { + read_next_row(); + } } template @@ -221,7 +226,6 @@ template inline bool PixelSelector::iterator::operator==(const iterator &other) const noexcept { // clang-format off return _sel == other._sel && - _idx == other._idx && size() == other.size(); // clang-format on } @@ -233,16 +237,9 @@ inline bool PixelSelector::iterator::operator!=(const iterator &other) const template inline bool PixelSelector::iterator::operator<(const iterator &other) const noexcept { + assert(_sel == other._sel); assert(!!_sel); - assert(_sel->coord1() == other._sel->coord1()); - assert(_sel->coord2() == other._sel->coord2()); - if (_idx != other._idx) { - return _idx->block_idx->id < other._idx._node->block_idx->id; - } - if (_bin1_id != other._bin1_id) { - return _bin1_id < other._bin1_id; - } - return size() < other.size(); + return _pixels_processed < other._pixels_processed; } template @@ -256,15 +253,10 @@ template inline auto PixelSelector::iterator::operator++() -> iterator & { assert(!!_buffer); - if (!!_buffer) { - ++_buffer_i; - } - - while (_buffer_i == size()) { - seek_to_next_block(); - if (is_at_end()) { - break; - } + ++_pixels_processed; + ++_buffer_i; + while (!is_at_end() && _buffer_i >= size()) { + read_next_row(); } return *this; @@ -322,127 +314,72 @@ template inline std::size_t PixelSelector::iterator::size() const noexcept { return !_buffer ? 
0 : _buffer->size(); } - template -inline void PixelSelector::iterator::seek_to_next_block() { - auto blk = read_block(); - if (!blk) { - _buffer = nullptr; - return; - } - assert(blk); - const auto block_was_fully_read = _bin1_id == (--blk->end())->first; - if (block_was_fully_read) { - mark_block_as_fully_read(); - } +inline internal::Index PixelSelector::iterator::find_blocks_overlapping_current_row() { + const auto end_pos = coord2().bin2.start(); + const auto pos1 = (std::min)(end_pos, static_cast(_bin1_id) * bins().bin_size()); + const auto pos2 = (std::min)(end_pos, pos1 + bins().bin_size()); - assert(_idx < _grid->end()); - assert(_idx->current_row < _grid->end()); - const auto end_of_row = (_idx + 1 == _grid->end()) || _idx->row != (_idx + 1)->row; - if (end_of_row) { - if (!_grid) { - *this = at_end(*_sel); - } - fmt::print(FMT_STRING("next row...\n")); - _idx = std::find_if(_idx->current_row, _idx + 1, - [](const auto &node) { return node.block_idx != nullptr; }); - ++_bin1_id; - } else { - fmt::print(FMT_STRING("next col...\n")); - ++_idx; - } - - if (_idx == _grid->end()) { - *this = at_end(*_sel); - return; - } + const auto coord1_ = PixelCoordinates(bins().at(coord1().bin1.chrom(), pos1), + bins().at(coord1().bin1.chrom(), pos2)); - assert(_idx->block_idx); - blk = _sel->_reader.read(*_idx->block_idx); - if (!blk) { - *this = at_end(*_sel); - return; - } - - if (_bin1_id != blk->at(_bin1_id)->first) { - seek_to_next_block(); - } - read_chunk_of_pixels(*blk); + return _sel->_reader.index().subset(coord1_, coord2()); } template -inline void PixelSelector::iterator::mark_block_as_fully_read() { - // deal with calls to operator++(int) - const auto i = std::distance(_grid->begin(), _idx); - _grid = std::make_shared(*_grid); - _idx = _grid->begin() + i; - _idx->block_idx = nullptr; -} - -template -inline std::shared_ptr -PixelSelector::iterator::read_block() noexcept { - if (!_grid) { - return nullptr; - } - assert(_sel); - assert(_idx != _grid->end()); - 
assert(_idx->block_idx); - return _sel->_reader.read(*_idx->block_idx); -} - -template -inline void PixelSelector::iterator::read_chunk_of_pixels( - const internal::InteractionBlock &blk) { - auto row_it = blk.at(_bin1_id); - if (_bin1_id != row_it->first) { +inline void PixelSelector::iterator::read_next_row() { + assert(!!_sel); + const auto blocks = find_blocks_overlapping_current_row(); + if (blocks.empty() || _bin1_id > coord1().bin2.rel_id()) { + *this = at_end(*_sel); return; } - auto first = row_it->second.begin(); - auto last = row_it->second.end(); - - first = - std::lower_bound(first, last, coord2().bin1.start(), - [&](const internal::InteractionBlock::ThinPixel &pixel, const auto &pos) { - return pixel.bin2_id * bins().bin_size() < pos; - }); - // clang-format off - const auto buff_capacity = - std::min({!!_buffer ? _buffer->capacity() : std::size_t(1000), - static_cast(std::distance(first, last))}); - // clang-format on - // This is fine, as iterators are not thread safe anyway - if (_buffer.use_count() == 1) { - _buffer->resize(buff_capacity); - } else { - _buffer = std::make_shared(buff_capacity); + if (_buffer.use_count() != 1) { + _buffer = std::make_shared(_buffer->capacity()); } + _buffer->clear(); _buffer_i = 0; + const auto bin_size = bins().bin_size(); + const auto bin1 = + bins().at(coord1().bin1.chrom(), static_cast(_bin1_id) * bin_size); + bool pixels_are_sorted = true; + for (const auto block_idx : blocks) { + const auto blk = _sel->_reader.read(block_idx); + const auto match = blk->find(_bin1_id); + if (match == blk->end()) { + continue; + } - const auto pos1 = static_cast(_bin1_id) * bins().bin_size(); - const auto bin1 = bins().at(coord1().bin1.chrom(), pos1); - do { - const auto pos2 = static_cast(first->bin2_id) * bins().bin_size(); - auto bin2 = bins().at(coord2().bin1.chrom(), pos2); + const auto &pixels = match->second; + auto first = std::lower_bound(pixels.begin(), pixels.end(), coord2().bin1.rel_id(), + [](const 
internal::InteractionBlock::ThinPixel &pixel, + std::size_t bin_id) { return pixel.bin2_id < bin_id; }); - if (bin2 > coord2().bin2) { - break; - } + while (first != pixels.end()) { + if (first->bin2_id > coord2().bin2.rel_id()) { + break; + } + const auto pos2 = static_cast(first->bin2_id) * bin_size; + if constexpr (std::is_integral_v) { + _buffer->emplace_back( + Pixel{PixelCoordinates{bin1, bins().at(coord2().bin1.chrom(), pos2)}, + static_cast(std::round(first->count))}); + } else { + _buffer->emplace_back( + Pixel{PixelCoordinates{bin1, bins().at(coord2().bin1.chrom(), pos2)}, + conditional_static_cast(first->count)}); + } - if constexpr (std::is_integral_v) { - const auto count = static_cast(std::round(first->count)); - _buffer->emplace_back(bin1, std::move(bin2), count); - } else { - const auto count = conditional_static_cast(first->count); - _buffer->emplace_back(bin1, std::move(bin2), count); + pixels_are_sorted &= _buffer->size() < 2 || *(_buffer->end() - 2) < _buffer->back(); + ++first; } - } while (++first != last); - - if (first != last && _bin1_id + 1 > coord1().bin2.rel_id()) { - _grid = nullptr; - _idx = {}; } + if (!pixels_are_sorted) { + std::sort(_buffer->begin(), _buffer->end()); + } + _bin1_id++; } + } // namespace hictk::hic diff --git a/test/units/hic/hic_file_stream_test.cpp b/test/units/hic/hic_file_stream_test.cpp index e671ee02..bccac204 100644 --- a/test/units/hic/hic_file_stream_test.cpp +++ b/test/units/hic/hic_file_stream_test.cpp @@ -68,7 +68,7 @@ TEST_CASE("readHeader (v9)", "[hic][v9][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("readFooter (v8)", "[hic][v8][short]") { +TEST_CASE("read_footer (v8)", "[hic][v8][short]") { internal::HiCFileStream s(pathV8); const auto chr2L = s.header().chromosomes.at("chr2L"); const auto chr2R = s.header().chromosomes.at("chr2R"); @@ -81,8 +81,8 @@ TEST_CASE("readFooter (v8)", "[hic][v8][short]") { 0.008417076032024847}; SECTION("observed NONE BP 5000") { - 
const auto f = s.readFooter(chr2L.id(), chr2L.id(), MatrixType::observed, - NormalizationMethod::NONE, MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2L.id(), MatrixType::observed, + NormalizationMethod::NONE, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::observed); CHECK(f.normalization() == NormalizationMethod::NONE); @@ -95,8 +95,8 @@ TEST_CASE("readFooter (v8)", "[hic][v8][short]") { } SECTION("observed VC BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2R.id(), MatrixType::observed, - NormalizationMethod::VC, MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2R.id(), MatrixType::observed, + NormalizationMethod::VC, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::observed); CHECK(f.normalization() == NormalizationMethod::VC); @@ -109,8 +109,8 @@ TEST_CASE("readFooter (v8)", "[hic][v8][short]") { } SECTION("observed VC_SQRT BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2R.id(), MatrixType::observed, - NormalizationMethod::VC_SQRT, MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2R.id(), MatrixType::observed, + NormalizationMethod::VC_SQRT, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::observed); CHECK(f.normalization() == NormalizationMethod::VC_SQRT); @@ -123,8 +123,8 @@ TEST_CASE("readFooter (v8)", "[hic][v8][short]") { } SECTION("observed KR BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2R.id(), MatrixType::observed, - NormalizationMethod::KR, MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2R.id(), MatrixType::observed, + NormalizationMethod::KR, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::observed); CHECK(f.normalization() == NormalizationMethod::KR); @@ -137,8 +137,8 @@ TEST_CASE("readFooter (v8)", "[hic][v8][short]") { } SECTION("observed SCALE BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2R.id(), MatrixType::observed, - NormalizationMethod::SCALE, MatrixUnit::BP, 5000); + 
const auto f = s.read_footer(chr2L.id(), chr2R.id(), MatrixType::observed, + NormalizationMethod::SCALE, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::observed); CHECK(f.normalization() == NormalizationMethod::SCALE); @@ -151,8 +151,8 @@ TEST_CASE("readFooter (v8)", "[hic][v8][short]") { } SECTION("oe NONE BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2L.id(), MatrixType::oe, NormalizationMethod::NONE, - MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2L.id(), MatrixType::oe, NormalizationMethod::NONE, + MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::oe); CHECK(f.normalization() == NormalizationMethod::NONE); @@ -171,8 +171,8 @@ TEST_CASE("readFooter (v8)", "[hic][v8][short]") { } SECTION("expected NONE BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2L.id(), MatrixType::expected, - NormalizationMethod::NONE, MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2L.id(), MatrixType::expected, + NormalizationMethod::NONE, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::expected); CHECK(f.normalization() == NormalizationMethod::NONE); @@ -192,7 +192,7 @@ TEST_CASE("readFooter (v8)", "[hic][v8][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("readFooter (v9)", "[hic][v9][short]") { +TEST_CASE("read_footer (v9)", "[hic][v9][short]") { internal::HiCFileStream s(pathV9); const auto chr2L = s.header().chromosomes.at("chr2L"); const auto chr2R = s.header().chromosomes.at("chr2R"); @@ -205,8 +205,8 @@ TEST_CASE("readFooter (v9)", "[hic][v9][short]") { 0.008417075820557469}; SECTION("observed NONE BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2L.id(), MatrixType::observed, - NormalizationMethod::NONE, MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2L.id(), MatrixType::observed, + NormalizationMethod::NONE, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::observed); CHECK(f.normalization() == 
NormalizationMethod::NONE); @@ -219,8 +219,8 @@ TEST_CASE("readFooter (v9)", "[hic][v9][short]") { } SECTION("observed VC BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2R.id(), MatrixType::observed, - NormalizationMethod::VC, MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2R.id(), MatrixType::observed, + NormalizationMethod::VC, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::observed); CHECK(f.normalization() == NormalizationMethod::VC); @@ -233,8 +233,8 @@ TEST_CASE("readFooter (v9)", "[hic][v9][short]") { } SECTION("observed VC_SQRT BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2R.id(), MatrixType::observed, - NormalizationMethod::VC_SQRT, MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2R.id(), MatrixType::observed, + NormalizationMethod::VC_SQRT, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::observed); CHECK(f.normalization() == NormalizationMethod::VC_SQRT); @@ -248,7 +248,7 @@ TEST_CASE("readFooter (v9)", "[hic][v9][short]") { /* TODO: for some reason KR normalization is missing SECTION("observed KR BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2R.id(), MatrixType::observed, + const auto f = s.read_footer(chr2L.id(), chr2R.id(), MatrixType::observed, NormalizationMethod::KR, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::observed); @@ -262,8 +262,8 @@ TEST_CASE("readFooter (v9)", "[hic][v9][short]") { } */ SECTION("observed SCALE BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2R.id(), MatrixType::observed, - NormalizationMethod::SCALE, MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2R.id(), MatrixType::observed, + NormalizationMethod::SCALE, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::observed); CHECK(f.normalization() == NormalizationMethod::SCALE); @@ -276,8 +276,8 @@ TEST_CASE("readFooter (v9)", "[hic][v9][short]") { } SECTION("oe NONE BP 5000") { - const auto f = s.readFooter(chr2L.id(), 
chr2L.id(), MatrixType::oe, NormalizationMethod::NONE, - MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2L.id(), MatrixType::oe, NormalizationMethod::NONE, + MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::oe); CHECK(f.normalization() == NormalizationMethod::NONE); @@ -296,8 +296,8 @@ TEST_CASE("readFooter (v9)", "[hic][v9][short]") { } SECTION("expected NONE BP 5000") { - const auto f = s.readFooter(chr2L.id(), chr2L.id(), MatrixType::expected, - NormalizationMethod::NONE, MatrixUnit::BP, 5000); + const auto f = s.read_footer(chr2L.id(), chr2L.id(), MatrixType::expected, + NormalizationMethod::NONE, MatrixUnit::BP, 5000); CHECK(f.matrix_type() == MatrixType::expected); CHECK(f.normalization() == NormalizationMethod::NONE); From 31b5847695cf8958b9c28a3c1514af587a83ae67 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 09:51:28 +0200 Subject: [PATCH 10/48] Bugfix. Simple tests now pass for both v8 and v9 .hic files --- src/hic/CMakeLists.txt | 7 +- src/hic/block_cache_impl.hpp | 47 ++---- src/hic/block_reader_impl.hpp | 7 +- src/hic/footer_cache_impl.hpp | 2 + src/hic/hic_file_stream_impl.hpp | 2 +- src/hic/include/hictk/hic/block_cache.hpp | 24 +-- src/hic/include/hictk/hic/block_reader.hpp | 6 +- src/hic/include/hictk/hic/footer_cache.hpp | 15 +- src/hic/include/hictk/hic/hic_file_stream.hpp | 1 - src/hic/include/hictk/hic/index.hpp | 48 ++++-- src/hic/include/hictk/hic/pixel_selector.hpp | 18 +- src/hic/index_impl.hpp | 120 +++++++------ src/hic/pixel_selector_impl.hpp | 157 +++++++++--------- 13 files changed, 210 insertions(+), 244 deletions(-) diff --git a/src/hic/CMakeLists.txt b/src/hic/CMakeLists.txt index d1a58a77..e1e08716 100644 --- a/src/hic/CMakeLists.txt +++ b/src/hic/CMakeLists.txt @@ -12,15 +12,16 @@ add_library(hictk::hic ALIAS hic) target_sources( hic - INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/cache_impl.hpp + INTERFACE 
${CMAKE_CURRENT_SOURCE_DIR}/block_cache_impl.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/block_reader_impl.hpp ${CMAKE_CURRENT_SOURCE_DIR}/filestream_impl.hpp ${CMAKE_CURRENT_SOURCE_DIR}/hic_file_impl.hpp ${CMAKE_CURRENT_SOURCE_DIR}/hic_file_stream_impl.hpp ${CMAKE_CURRENT_SOURCE_DIR}/hic_file_utils_impl.hpp ${CMAKE_CURRENT_SOURCE_DIR}/hic_footer_impl.hpp ${CMAKE_CURRENT_SOURCE_DIR}/hic_header_impl.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/hic_matrix_selector_impl.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/filestream_impl.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/index_impl.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/pixel_selector_impl.hpp) target_include_directories(hic INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/include) target_link_libraries( diff --git a/src/hic/block_cache_impl.hpp b/src/hic/block_cache_impl.hpp index 9eb827c0..65c9d465 100644 --- a/src/hic/block_cache_impl.hpp +++ b/src/hic/block_cache_impl.hpp @@ -4,9 +4,13 @@ #pragma once +#include #include +#include #include #include +#include +#include #include #include "hictk/hic/hic_footer.hpp" @@ -43,24 +47,6 @@ constexpr bool operator!=(std::size_t a_id, const InteractionBlock &b) noexcept return !(a_id == b); } -constexpr bool InteractionBlockCmp::operator()(const InteractionBlock &a, - const InteractionBlock &b) const noexcept { - return a < b; -} -constexpr bool InteractionBlockCmp::operator()(const InteractionBlock &a, - std::size_t b_id) const noexcept { - return a < b_id; -} -constexpr bool InteractionBlockCmp::operator()(std::size_t a_id, - const InteractionBlock &b) const noexcept { - return a_id < b; -} - -inline auto InteractionBlock::Overlap::begin() const noexcept { return first; } -inline auto InteractionBlock::Overlap::end() const noexcept { return last; } -inline auto InteractionBlock::Overlap::cbegin() const noexcept { return begin(); } -inline auto InteractionBlock::Overlap::cend() const noexcept { return end(); } - inline std::size_t InteractionBlock::id() const noexcept { return _id; } inline const Chromosome 
&InteractionBlock::chrom1() const noexcept { assert(_chrom1); @@ -73,7 +59,7 @@ inline const Chromosome &InteractionBlock::chrom2() const noexcept { inline InteractionBlock::InteractionBlock(std::size_t id_, const std::vector &pixels) - : _id(id_) { + : _id(id_), _size(pixels.size()) { if (pixels.empty()) { return; } @@ -81,9 +67,11 @@ inline InteractionBlock::InteractionBlock(std::size_t id_, for (const SerializedPixel &p : pixels) { const auto b1 = static_cast(p.bin1_id); const auto b2 = static_cast(p.bin2_id); - auto [node, inserted] = this->_interactions.try_emplace(b1, Row{{b2, p.count}}); - if (!inserted) { + auto node = this->_interactions.find(b1); + if (node != this->_interactions.end()) { node->second.emplace_back(ThinPixel{b2, p.count}); + } else { + this->_interactions.emplace(b1, Row{{b2, p.count}}); } } if constexpr (ndebug_not_defined()) { @@ -117,23 +105,10 @@ inline auto InteractionBlock::find(std::uint64_t row) const noexcept -> const_it return _interactions.find(row); } -inline auto InteractionBlock::find_overlap(std::uint64_t first_row, - std::uint64_t last_row) const noexcept -> Overlap { - assert(first_row <= last_row); - return {_interactions.lower_bound(first_row), _interactions.upper_bound(last_row)}; -} - -inline bool InteractionBlock::has_overlap(std::uint64_t first_row, - std::uint64_t last_row) const noexcept { - auto overlap = find_overlap(first_row, last_row); - - return overlap.begin() != this->_interactions.end(); -} - -inline std::size_t InteractionBlock::size() const noexcept { return _interactions.size(); } +inline std::size_t InteractionBlock::size() const noexcept { return _size; } inline std::size_t InteractionBlock::size_in_bytes() const noexcept { - return sizeof(Pixel) * size(); + return sizeof(ThinPixel) * size(); } inline BlockLRUCache::BlockLRUCache(std::size_t max_size_in_bytes) diff --git a/src/hic/block_reader_impl.hpp b/src/hic/block_reader_impl.hpp index 78e302cd..1eed4b83 100644 --- a/src/hic/block_reader_impl.hpp 
+++ b/src/hic/block_reader_impl.hpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -33,13 +34,11 @@ inline std::string &BinaryBuffer::reset() noexcept { inline HiCBlockReader::HiCBlockReader(std::shared_ptr hfs, const Index &master_index, std::shared_ptr bins_, - std::shared_ptr block_cache_, - const PixelCoordinates &coords1, - const PixelCoordinates &coords2) + std::shared_ptr block_cache_) : _hfs(std::move(hfs)), _blk_cache(std::move(block_cache_)), _bins(std::move(bins_)), - _index(master_index.subset(coords1, coords2)) {} + _index(master_index) {} inline HiCBlockReader::operator bool() const noexcept { return !!_hfs; } diff --git a/src/hic/footer_cache_impl.hpp b/src/hic/footer_cache_impl.hpp index 9f5ab81c..3fc83f45 100644 --- a/src/hic/footer_cache_impl.hpp +++ b/src/hic/footer_cache_impl.hpp @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include diff --git a/src/hic/hic_file_stream_impl.hpp b/src/hic/hic_file_stream_impl.hpp index bca54717..bccd4ba9 100644 --- a/src/hic/hic_file_stream_impl.hpp +++ b/src/hic/hic_file_stream_impl.hpp @@ -204,7 +204,7 @@ inline Index HiCFileStream::read_index(std::int64_t fileOffset, const Chromosome const auto nBlocks = static_cast(_fs->read()); - phmap::btree_set buffer; + phmap::flat_hash_set buffer; if (wantedUnit == foundUnit && wantedResolution == foundResolution) { for (std::size_t j = 0; j < nBlocks; ++j) { const auto block_id = static_cast(_fs->read()); diff --git a/src/hic/include/hictk/hic/block_cache.hpp b/src/hic/include/hictk/hic/block_cache.hpp index 25a9db20..78cf1620 100644 --- a/src/hic/include/hictk/hic/block_cache.hpp +++ b/src/hic/include/hictk/hic/block_cache.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -33,21 +34,12 @@ class InteractionBlock { BuffT _interactions{}; const Chromosome* _chrom1{}; const Chromosome* _chrom2{}; + std::size_t _size{}; public: using iterator = BuffT::iterator; using const_iterator = 
BuffT::const_iterator; - struct Overlap { - const_iterator first{}; // NOLINT - const_iterator last{}; // NOLINT - - [[nodiscard]] auto begin() const noexcept; - [[nodiscard]] auto end() const noexcept; - [[nodiscard]] auto cbegin() const noexcept; - [[nodiscard]] auto cend() const noexcept; - }; - InteractionBlock() = default; InteractionBlock(std::size_t id_, const std::vector& pixels); @@ -78,23 +70,11 @@ class InteractionBlock { [[nodiscard]] const Chromosome& chrom2() const noexcept; [[nodiscard]] auto find(std::uint64_t row) const noexcept -> const_iterator; - [[nodiscard]] auto find_overlap(std::uint64_t first_row, std::uint64_t last_row) const noexcept - -> Overlap; - - [[nodiscard]] bool has_overlap(std::uint64_t first_row, std::uint64_t last_row) const noexcept; [[nodiscard]] std::size_t size() const noexcept; [[nodiscard]] std::size_t size_in_bytes() const noexcept; }; -struct InteractionBlockCmp { - using is_transparent = void; - - constexpr bool operator()(const InteractionBlock& a, const InteractionBlock& b) const noexcept; - constexpr bool operator()(const InteractionBlock& a, std::size_t b_id) const noexcept; - constexpr bool operator()(std::size_t a_id, const InteractionBlock& b) const noexcept; -}; - class BlockLRUCache { using MapT = tsl::ordered_map>; using key_t = MapT::key_type; diff --git a/src/hic/include/hictk/hic/block_reader.hpp b/src/hic/include/hictk/hic/block_reader.hpp index a7ac961d..3095d8a7 100644 --- a/src/hic/include/hictk/hic/block_reader.hpp +++ b/src/hic/include/hictk/hic/block_reader.hpp @@ -4,8 +4,6 @@ #pragma once -#include - #include #include #include @@ -49,8 +47,8 @@ class HiCBlockReader { public: HiCBlockReader() = default; HiCBlockReader(std::shared_ptr hfs, const Index& master_index, - std::shared_ptr bins_, std::shared_ptr block_cache_, - const PixelCoordinates& coords1, const PixelCoordinates& coords2); + std::shared_ptr bins_, + std::shared_ptr block_cache_); [[nodiscard]] explicit operator bool() const noexcept; 
diff --git a/src/hic/include/hictk/hic/footer_cache.hpp b/src/hic/include/hictk/hic/footer_cache.hpp index f82bc6a3..2eb8fa40 100644 --- a/src/hic/include/hictk/hic/footer_cache.hpp +++ b/src/hic/include/hictk/hic/footer_cache.hpp @@ -4,10 +4,10 @@ #pragma once -#include #include #include +#include #include #include #include @@ -15,7 +15,6 @@ #include "hictk/chromosome.hpp" #include "hictk/hic/common.hpp" #include "hictk/hic/hic_footer.hpp" -#include "hictk/pixel.hpp" namespace hictk::hic::internal { @@ -48,14 +47,14 @@ class FooterCache { using const_iterator = MapT::iterator; FooterCache() = default; - auto begin() const noexcept -> decltype(_cache.cbegin()); - auto end() const noexcept -> decltype(_cache.cbegin()); + [[nodiscard]] auto begin() const noexcept -> decltype(_cache.cbegin()); + [[nodiscard]] auto end() const noexcept -> decltype(_cache.cbegin()); - auto cbegin() const noexcept -> decltype(_cache.cbegin()); - auto cend() const noexcept -> decltype(_cache.cbegin()); + [[nodiscard]] auto cbegin() const noexcept -> decltype(_cache.cbegin()); + [[nodiscard]] auto cend() const noexcept -> decltype(_cache.cbegin()); - auto emplace(HiCFooter&& f) -> decltype(_cache.emplace()); - auto find(const HiCFooterMetadata& m) -> const_iterator; + [[nodiscard]] auto emplace(HiCFooter&& f) -> decltype(_cache.emplace()); + [[nodiscard]] auto find(const HiCFooterMetadata& m) -> const_iterator; [[nodiscard]] std::size_t size() const noexcept; void clear(); diff --git a/src/hic/include/hictk/hic/hic_file_stream.hpp b/src/hic/include/hictk/hic/hic_file_stream.hpp index 5fbc8c43..ab14fcd1 100644 --- a/src/hic/include/hictk/hic/hic_file_stream.hpp +++ b/src/hic/include/hictk/hic/hic_file_stream.hpp @@ -5,7 +5,6 @@ #pragma once #include -#include #include #include diff --git a/src/hic/include/hictk/hic/index.hpp b/src/hic/include/hictk/hic/index.hpp index cc25839c..c5453e97 100644 --- a/src/hic/include/hictk/hic/index.hpp +++ b/src/hic/include/hictk/hic/index.hpp @@ -4,10 +4,14 
@@ #pragma once +#include + #include #include +#include #include #include +#include #include "hictk/chromosome.hpp" #include "hictk/common.hpp" @@ -18,8 +22,8 @@ namespace hictk::hic::internal { class BlockIndex { public: struct GridCoordinates { - std::size_t row; - std::size_t col; + std::size_t row; // NOLINT + std::size_t col; // NOLINT constexpr bool operator==(const GridCoordinates& other) const noexcept; constexpr bool operator!=(const GridCoordinates& other) const noexcept; @@ -46,24 +50,28 @@ class BlockIndex { constexpr bool operator==(const BlockIndex& other) const noexcept; constexpr bool operator!=(const BlockIndex& other) const noexcept; constexpr bool operator<(const BlockIndex& other) const noexcept; - constexpr bool operator==(const BlockIndex::GridCoordinates& coords_) const noexcept; - constexpr bool operator!=(const BlockIndex::GridCoordinates& coords_) const noexcept; - constexpr bool operator<(const BlockIndex::GridCoordinates& coords_) const noexcept; + constexpr bool operator==(std::size_t id_) const noexcept; + constexpr bool operator!=(std::size_t id_) const noexcept; }; -struct BlockIndexCmp { +struct BlockIndexHasher { + using is_transparent = void; + + std::size_t operator()(const BlockIndex& b) const noexcept; + std::size_t operator()(std::size_t id) const noexcept; +}; + +struct BlockIndexEq { using is_transparent = void; constexpr bool operator()(const BlockIndex& a, const BlockIndex& b) const noexcept; - constexpr bool operator()(const BlockIndex& a, - const BlockIndex::GridCoordinates& b_coords) const noexcept; - constexpr bool operator()(const BlockIndex::GridCoordinates& a_coords, - const BlockIndex& b) const noexcept; + constexpr bool operator()(const BlockIndex& a, std::size_t b_id) const noexcept; + constexpr bool operator()(std::size_t a_id, const BlockIndex& b) const noexcept; }; // Map coordinates (bp) to block IDs class Index { - using BlockIndexMap = phmap::btree_set; + using BlockIndexMap = phmap::flat_hash_set; // map 
block_ids to file offsets const BlockIndexMap _block_map{}; std::int32_t _version{}; @@ -75,6 +83,7 @@ class Index { std::uint32_t _resolution{}; Chromosome _chrom1{}; Chromosome _chrom2{}; + mutable phmap::flat_hash_set _tmp_buffer{32}; public: static constexpr auto npos = (std::numeric_limits::max)(); @@ -101,22 +110,25 @@ class Index { [[nodiscard]] std::size_t size() const noexcept; [[nodiscard]] bool empty() const noexcept; - [[nodiscard]] Index subset(const PixelCoordinates& coords1, - const PixelCoordinates& coords2) const; - - [[nodiscard]] auto map_2d_query_to_blocks(const PixelCoordinates& coords1, - const PixelCoordinates& coords2) const -> BlockIndexMap; + [[nodiscard]] std::vector find_overlaps(const PixelCoordinates& coords1, + const PixelCoordinates& coords2) const; [[nodiscard]] const BlockIndex& at(std::size_t row, std::size_t col) const; private: void _map_2d_query_to_blocks(const PixelCoordinates& coords1, const PixelCoordinates& coords2, - BlockIndexMap& buffer) const; + std::vector& buffer) const; void _map_2d_query_to_blocks_intra_v9plus(const PixelCoordinates& coords1, const PixelCoordinates& coords2, - BlockIndexMap& buffer) const; + std::vector& buffer) const; }; } // namespace hictk::hic::internal +template <> +struct std::hash { + inline std::size_t operator()(hictk::hic::internal::BlockIndex const& b) const noexcept { + return std::hash{}(b.id()); + } +}; #include "../../../index_impl.hpp" diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index 1f74fac7..06bb9820 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -4,7 +4,9 @@ #pragma once -#include +#include +#include +#include #include #include "hictk/bin_table.hpp" @@ -23,6 +25,7 @@ class PixelSelector { PixelCoordinates _coord1{}; PixelCoordinates _coord2{}; + std::size_t _read_all_at_once_thresh{}; public: template @@ -32,13 +35,14 @@ class PixelSelector { 
PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, - std::shared_ptr bins_, PixelCoordinates coords) noexcept; + std::shared_ptr bins_, PixelCoordinates coords, + std::size_t read_all_at_once_threshold = 0) noexcept; PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coord1_, - PixelCoordinates coord2_) noexcept; + PixelCoordinates coord2_, std::size_t read_all_at_once_threshold = 0) noexcept; [[nodiscard]] bool operator==(const PixelSelector &other) const noexcept; [[nodiscard]] bool operator!=(const PixelSelector &other) const noexcept; @@ -55,9 +59,6 @@ class PixelSelector { template [[nodiscard]] std::vector> read_all() const; - template - std::vector> read_all_dbg() const; - [[nodiscard]] const PixelCoordinates &coord1() const noexcept; [[nodiscard]] const PixelCoordinates &coord2() const noexcept; @@ -104,7 +105,7 @@ class PixelSelector { using iterator_category = std::forward_iterator_tag; iterator() = default; - explicit iterator(const PixelSelector &sel); + explicit iterator(const PixelSelector &sel, std::size_t read_at_once_thresh); [[nodiscard]] static auto at_end(const PixelSelector &sel) -> iterator; [[nodiscard]] bool operator==(const iterator &other) const noexcept; @@ -127,7 +128,8 @@ class PixelSelector { [[nodiscard]] std::size_t size() const noexcept; void read_next_row(); - [[nodiscard]] internal::Index find_blocks_overlapping_current_row(); + void read_all_at_once(); + [[nodiscard]] std::vector find_blocks_overlapping_current_row(); }; }; diff --git a/src/hic/index_impl.hpp b/src/hic/index_impl.hpp index b65de44d..88060673 100644 --- a/src/hic/index_impl.hpp +++ b/src/hic/index_impl.hpp @@ -8,10 +8,15 @@ #include #include +#include +#include #include #include #include #include +#include + +#include "hictk/numeric_utils.hpp" namespace hictk::hic::internal { @@ -64,29 +69,25 @@ constexpr bool BlockIndex::operator<(const BlockIndex &other) 
const noexcept { return _coords < other._coords; } -constexpr bool BlockIndex::operator==(const BlockIndex::GridCoordinates &coords_) const noexcept { - return _coords == coords_; -} +constexpr bool BlockIndex::operator==(std::size_t id_) const noexcept { return _id == id_; } -constexpr bool BlockIndex::operator!=(const BlockIndex::GridCoordinates &coords_) const noexcept { - return !(*this == coords_); -} +constexpr bool BlockIndex::operator!=(std::size_t id_) const noexcept { return !(*this == id_); } -constexpr bool BlockIndex::operator<(const BlockIndex::GridCoordinates &coords_) const noexcept { - return _coords < coords_; +inline std::size_t BlockIndexHasher::operator()(const BlockIndex &b) const noexcept { + return (*this)(b.id()); } - -constexpr bool BlockIndexCmp::operator()(const BlockIndex &a, const BlockIndex &b) const noexcept { - return a.coords() < b.coords(); +inline std::size_t BlockIndexHasher::operator()(std::size_t id) const noexcept { + return std::hash{}(id); } -constexpr bool BlockIndexCmp::operator()( - const BlockIndex &a, const BlockIndex::GridCoordinates &b_coords) const noexcept { - return a < b_coords; +constexpr bool BlockIndexEq::operator()(const BlockIndex &a, const BlockIndex &b) const noexcept { + return a == b; } -constexpr bool BlockIndexCmp::operator()(const BlockIndex::GridCoordinates &a_coords, - const BlockIndex &b) const noexcept { - return a_coords < b._coords; +constexpr bool BlockIndexEq::operator()(std::size_t a_id, const BlockIndex &b) const noexcept { + return a_id == b.id(); +} +constexpr bool BlockIndexEq::operator()(const BlockIndex &a, std::size_t b_id) const noexcept { + return a.id() == b_id; } inline Index::Index(Chromosome chrom1_, Chromosome chrom2_, MatrixUnit unit_, @@ -131,48 +132,35 @@ inline std::size_t Index::size() const noexcept { return _block_map.size(); } inline bool Index::empty() const noexcept { return size() == 0; } // NOLINT -inline Index Index::subset(const PixelCoordinates &coords1, const 
PixelCoordinates &coords2) const { - return { - chrom1(), - chrom2(), - unit(), - resolution(), - _version, - block_bin_count(), - block_column_count(), - matrix_sum(), - map_2d_query_to_blocks(coords1, coords2), - }; -} - -inline const BlockIndex &Index::at(std::size_t row, std::size_t col) const { - auto match = _block_map.find(BlockIndex::GridCoordinates{row, col}); - if (match == _block_map.end()) { - throw std::out_of_range( - fmt::format(FMT_STRING("unable to find block {}{}: out of range"), row, col)); - } - return *match; -} - -inline auto Index::map_2d_query_to_blocks(const PixelCoordinates &coords1, - const PixelCoordinates &coords2) const -> BlockIndexMap { +inline std::vector Index::find_overlaps(const PixelCoordinates &coords1, + const PixelCoordinates &coords2) const { assert(coords1.is_intra()); assert(coords2.is_intra()); - BlockIndexMap buffer{}; + std::vector buffer{}; const auto is_intra = coords1.bin1.chrom() == coords2.bin1.chrom(); - if (_version < 9 || is_intra) { - _map_2d_query_to_blocks(coords1, coords2, buffer); - } else { + if (_version > 8 && is_intra) { _map_2d_query_to_blocks_intra_v9plus(coords1, coords2, buffer); + } else { + _map_2d_query_to_blocks(coords1, coords2, buffer); } return buffer; } +inline const BlockIndex &Index::at(std::size_t row, std::size_t col) const { + const auto block_id = (col * block_column_count()) + row; + auto match = _block_map.find(block_id); + if (match == _block_map.end()) { + throw std::out_of_range( + fmt::format(FMT_STRING("unable to find block {}{}: out of range"), row, col)); + } + return *match; +} + inline void Index::_map_2d_query_to_blocks(const hictk::PixelCoordinates &coords1, const hictk::PixelCoordinates &coords2, - BlockIndexMap &buffer) const { + std::vector &buffer) const { assert(coords1.bin1.chrom() == _chrom1 || coords1.bin1.chrom() == _chrom2); assert(coords2.bin1.chrom() == _chrom1 || coords2.bin1.chrom() == _chrom2); @@ -195,46 +183,50 @@ inline void 
Index::_map_2d_query_to_blocks(const hictk::PixelCoordinates &coords // check region part that overlaps with lower left triangle but only if intrachromosomal const auto checkLowerLeftTri = is_intra; - buffer.clear(); // first check the upper triangular matrix_type for (auto row = row1; row <= row2; ++row) { for (auto col = col1; col <= col2; ++col) { - auto match = _block_map.find(BlockIndex::GridCoordinates{row, col}); + auto block_id = (col * block_column_count()) + row; + auto match = _block_map.find(block_id); if (match != _block_map.end()) { - buffer.emplace(*match); + _tmp_buffer.emplace(*match); } if (checkLowerLeftTri) { - match = _block_map.find(BlockIndex::GridCoordinates{col, row}); + block_id = (row * block_column_count()) + col; + match = _block_map.find(block_id); if (match != _block_map.end()) { - buffer.emplace(*match); + _tmp_buffer.emplace(*match); } } } } + + buffer.resize(_tmp_buffer.size()); + std::move(_tmp_buffer.begin(), _tmp_buffer.end(), buffer.begin()); + std::sort(buffer.begin(), buffer.end()); } inline void Index::_map_2d_query_to_blocks_intra_v9plus(const hictk::PixelCoordinates &coords1, const hictk::PixelCoordinates &coords2, - BlockIndexMap &buffer) const { + std::vector &buffer) const { // https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#grid-structure assert(coords1.bin1.chrom() == _chrom1 || coords1.bin1.chrom() == _chrom2); assert(coords2.bin1.chrom() == _chrom1 || coords2.bin1.chrom() == _chrom2); + assert(coords1.bin1.chrom() == coords2.bin1.chrom()); auto bin1 = coords1.bin1.rel_id(); auto bin2 = coords1.bin2.rel_id() + 1; auto bin3 = coords2.bin1.rel_id(); auto bin4 = coords2.bin2.rel_id() + 1; - const auto is_intra = coords1.bin1.chrom() == coords2.bin1.chrom(); - - if (is_intra && bin1 > bin3) { + if (bin1 > bin3) { std::swap(bin1, bin3); std::swap(bin2, bin4); } const auto translatedLowerPAD = (bin1 + bin3) / 2 / _block_bin_count; - const auto translatedHigherPAD = (bin2 + bin4) / 2 / _block_column_count 
+ 1; + const auto translatedHigherPAD = (bin2 + bin4) / 2 / _block_bin_count + 1; const auto translatedNearerDepth = static_cast(std::log2(1.0 + double(hictk::internal::abs_diff(bin1, bin4)) / std::sqrt(2.0) / double(_block_bin_count))); @@ -247,20 +239,24 @@ inline void Index::_map_2d_query_to_blocks_intra_v9plus(const hictk::PixelCoordi if ((bin1 > bin4 && bin2 < bin3) || (bin2 > bin3 && bin1 < bin4)) { return 0; } - return std::min(translatedNearerDepth, translatedFurtherDepth); + return (std::min)(translatedNearerDepth, translatedFurtherDepth); }(); // +1; integer divide rounds down - const auto furtherDepth = std::max(translatedNearerDepth, translatedFurtherDepth) + 1; + const auto furtherDepth = (std::max)(translatedNearerDepth, translatedFurtherDepth) + 1; + _tmp_buffer.clear(); for (auto depth = nearerDepth; depth <= furtherDepth; ++depth) { for (auto pad = translatedLowerPAD; pad <= translatedHigherPAD; ++pad) { - auto match = _block_map.find(BlockIndex::GridCoordinates{depth, pad}); + const auto block_id = (depth * block_column_count()) + pad; + auto match = _block_map.find(block_id); if (match != _block_map.end()) { - auto block = *match; - buffer.emplace(block); + _tmp_buffer.emplace(*match); } } } + buffer.resize(_tmp_buffer.size()); + std::move(_tmp_buffer.begin(), _tmp_buffer.end(), buffer.begin()); + std::sort(buffer.begin(), buffer.end()); } } // namespace hictk::hic::internal diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index cd1f4b56..e1f8b73d 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -4,30 +4,40 @@ #pragma once +#include +#include +#include +#include + +#include "hictk/bin_table.hpp" #include "hictk/common.hpp" -#include "hictk/fmt.hpp" // TODO: remove me +#include "hictk/hic/block_cache.hpp" +#include "hictk/hic/common.hpp" #include "hictk/hic/hic_file_stream.hpp" +#include "hictk/hic/hic_footer.hpp" +#include "hictk/pixel.hpp" namespace hictk::hic { inline 
PixelSelector::PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, - std::shared_ptr bins_, - PixelCoordinates coords) noexcept + std::shared_ptr bins_, PixelCoordinates coords, + std::size_t read_all_at_once_thresh) noexcept : PixelSelector(std::move(hfs_), std::move(footer_), std::move(cache_), std::move(bins_), - coords, std::move(coords)) {} + coords, std::move(coords), read_all_at_once_thresh) {} inline PixelSelector::PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coord1_, - PixelCoordinates coord2_) noexcept - : _reader(std::move(hfs_), footer_->index(), std::move(bins_), std::move(cache_), coord1_, - coord2_), + PixelCoordinates coord2_, + std::size_t read_all_at_once_thresh) noexcept + : _reader(std::move(hfs_), footer_->index(), std::move(bins_), std::move(cache_)), _footer(std::move(footer_)), _coord1(std::move(coord1_)), - _coord2(std::move(coord2_)) {} + _coord2(std::move(coord2_)), + _read_all_at_once_thresh(read_all_at_once_thresh) {} inline bool PixelSelector::operator==(const PixelSelector &other) const noexcept { return _reader.index().chrom1() == _reader.index().chrom2() && _coord1 == other._coord1 && @@ -40,7 +50,7 @@ inline bool PixelSelector::operator!=(const PixelSelector &other) const noexcept template inline auto PixelSelector::cbegin() const -> iterator { - return iterator(*this); + return iterator(*this, _read_all_at_once_thresh); } template @@ -109,66 +119,6 @@ inline std::vector> PixelSelector::read_all() const { return {begin(), end()}; } -template -inline std::vector> PixelSelector::read_all_dbg() const { - std::vector> buffer{}; - - auto bin1 = coord1().bin1.rel_id(); - auto bin2 = coord1().bin2.rel_id() + 1; - auto bin3 = coord2().bin1.rel_id(); - auto bin4 = coord2().bin2.rel_id() + 1; - - std::size_t i = 0; - - for (const auto &block_idx : _reader.index()) { - auto blk = _reader.read(block_idx); - if (!blk) { - continue; - 
} - std::ofstream ofs(fmt::format(FMT_STRING("/tmp/test{}.bed"), i)); - // Obs we use open-closed interval instead of open-open like is done in straw - for (const auto &[b1, row] : blk->find_overlap(bin1, bin2)) { - for (const auto &tp : row) { - const auto &b2 = tp.bin2_id; - auto tmp_record = process_interaction(SerializedPixel{ - static_cast(b1), static_cast(b2), tp.count}); - fmt::print(ofs, FMT_STRING("{}\t{}\t{}\t{}\n"), i, tmp_record.bin1_id, tmp_record.bin2_id, - blk->id()); - } - - if (b1 >= bin2) { - // We're past the last row overlapping the query - break; - } - for (const auto &tp : row) { - const auto &b2 = tp.bin2_id; - if (b1 < bin1 || b2 < bin3) { - // We're upstream of the first column overlapping the query (if any) - continue; - } - - if (b2 >= bin4) { - // We're past the last column overlapping the query for the current row - break; - } - - auto record = process_interaction(SerializedPixel{static_cast(b1), - static_cast(b2), tp.count}); - if (std::isfinite(record.count)) { - buffer.emplace_back( - PixelCoordinates{bins().at(chrom1(), static_cast(record.bin1_id)), - bins().at(chrom2(), static_cast(record.bin2_id))}, - record.count); - } - } - } - ++i; - } - // Only interactions from the same block are guaranteed to already be sorted - std::sort(buffer.begin(), buffer.end()); - return buffer; -} - inline const PixelCoordinates &PixelSelector::coord1() const noexcept { return _coord1; } inline const PixelCoordinates &PixelSelector::coord2() const noexcept { return _coord2; } inline MatrixType PixelSelector::matrix_type() const noexcept { return metadata().matrix_type; } @@ -201,12 +151,19 @@ inline N PixelSelector::sum() const noexcept { inline double PixelSelector::avg() const noexcept { return _reader.avg(); } template -inline PixelSelector::iterator::iterator(const PixelSelector &sel) +inline PixelSelector::iterator::iterator(const PixelSelector &sel, + std::size_t read_at_once_thresh) : _sel(&sel), _bin1_id(coord1().bin1.rel_id()), 
_buffer(std::make_shared()) { if (_sel->_reader.index().empty()) { *this = at_end(sel); return; } + + if (_sel->_reader.index().size() < read_at_once_thresh) { + read_all_at_once(); + return; + } + while (_buffer->empty()) { read_next_row(); } @@ -315,7 +272,8 @@ inline std::size_t PixelSelector::iterator::size() const noexcept { return !_buffer ? 0 : _buffer->size(); } template -inline internal::Index PixelSelector::iterator::find_blocks_overlapping_current_row() { +inline std::vector +PixelSelector::iterator::find_blocks_overlapping_current_row() { const auto end_pos = coord2().bin2.start(); const auto pos1 = (std::min)(end_pos, static_cast(_bin1_id) * bins().bin_size()); const auto pos2 = (std::min)(end_pos, pos1 + bins().bin_size()); @@ -323,7 +281,7 @@ inline internal::Index PixelSelector::iterator::find_blocks_overlapping_curre const auto coord1_ = PixelCoordinates(bins().at(coord1().bin1.chrom(), pos1), bins().at(coord1().bin1.chrom(), pos2)); - return _sel->_reader.index().subset(coord1_, coord2()); + return _sel->_reader.index().find_overlaps(coord1_, coord2()); } template @@ -344,7 +302,6 @@ inline void PixelSelector::iterator::read_next_row() { const auto bin_size = bins().bin_size(); const auto bin1 = bins().at(coord1().bin1.chrom(), static_cast(_bin1_id) * bin_size); - bool pixels_are_sorted = true; for (const auto block_idx : blocks) { const auto blk = _sel->_reader.read(block_idx); const auto match = blk->find(_bin1_id); @@ -371,15 +328,61 @@ inline void PixelSelector::iterator::read_next_row() { Pixel{PixelCoordinates{bin1, bins().at(coord2().bin1.chrom(), pos2)}, conditional_static_cast(first->count)}); } - - pixels_are_sorted &= _buffer->size() < 2 || *(_buffer->end() - 2) < _buffer->back(); ++first; } } + assert(std::is_sorted(_buffer->begin(), _buffer->end())); + _bin1_id++; +} + +template +inline void PixelSelector::iterator::read_all_at_once() { + assert(!!_sel); + const auto &blocks = _sel->_reader.index(); + if (blocks.empty()) { + *this = 
at_end(*_sel); + return; + } + + if (_buffer.use_count() != 1) { + _buffer = std::make_shared(_buffer->capacity()); + } + + _buffer->clear(); + _buffer_i = 0; + const auto bin_size = bins().bin_size(); + bool pixels_are_sorted = true; + for (const auto block_idx : blocks) { + for (const auto &[bin1_id, pixels] : *_sel->_reader.read(block_idx)) { + const auto bin1 = + bins().at(coord1().bin1.chrom(), static_cast(bin1_id) * bin_size); + if (bin1 < coord1().bin1 || bin1 > coord1().bin2) { + continue; + } + for (const auto &p : pixels) { + const auto bin2 = + bins().at(coord2().bin1.chrom(), static_cast(p.bin2_id) * bin_size); + if (bin2 < coord2().bin1) { + continue; + } + if (bin2 > coord2().bin2) { + break; + } + if constexpr (std::is_integral_v) { + _buffer->emplace_back( + Pixel{PixelCoordinates{bin1, bin2}, static_cast(std::round(p.count))}); + } else { + _buffer->emplace_back( + Pixel{PixelCoordinates{bin1, bin2}, conditional_static_cast(p.count)}); + } + + pixels_are_sorted &= _buffer->size() < 2 || *(_buffer->end() - 2) < _buffer->back(); + } + } + } if (!pixels_are_sorted) { std::sort(_buffer->begin(), _buffer->end()); } - _bin1_id++; + _bin1_id = coord1().bin2.rel_id() + 1; } - } // namespace hictk::hic From 9bda74855efb8dc7c84c4a278d52dfd70125750b Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 09:56:06 +0200 Subject: [PATCH 11/48] Make file names for hic library more consistent --- src/hic/CMakeLists.txt | 6 +- src/hic/block_cache_impl.hpp | 2 +- src/hic/block_reader_impl.hpp | 4 +- ...e_stream_impl.hpp => file_reader_impl.hpp} | 68 +++++++++---------- src/hic/footer_cache_impl.hpp | 2 +- .../{hic_footer_impl.hpp => footer_impl.hpp} | 0 .../{hic_header_impl.hpp => header_impl.hpp} | 0 src/hic/hic_file_impl.hpp | 4 +- src/hic/hic_file_utils_impl.hpp | 2 +- src/hic/include/hictk/hic.hpp | 8 +-- src/hic/include/hictk/hic/block_cache.hpp | 2 +- src/hic/include/hictk/hic/block_reader.hpp | 8 +-- 
.../{hic_file_stream.hpp => file_reader.hpp} | 12 ++-- .../hictk/hic/{hic_footer.hpp => footer.hpp} | 2 +- src/hic/include/hictk/hic/footer_cache.hpp | 2 +- .../hictk/hic/{hic_header.hpp => header.hpp} | 2 +- src/hic/include/hictk/hic/pixel_selector.hpp | 6 +- src/hic/pixel_selector_impl.hpp | 8 +-- test/units/hic/hic_file_stream_test.cpp | 12 ++-- 19 files changed, 75 insertions(+), 75 deletions(-) rename src/hic/{hic_file_stream_impl.hpp => file_reader_impl.hpp} (89%) rename src/hic/{hic_footer_impl.hpp => footer_impl.hpp} (100%) rename src/hic/{hic_header_impl.hpp => header_impl.hpp} (100%) rename src/hic/include/hictk/hic/{hic_file_stream.hpp => file_reader.hpp} (94%) rename src/hic/include/hictk/hic/{hic_footer.hpp => footer.hpp} (98%) rename src/hic/include/hictk/hic/{hic_header.hpp => header.hpp} (94%) diff --git a/src/hic/CMakeLists.txt b/src/hic/CMakeLists.txt index e1e08716..6959fc45 100644 --- a/src/hic/CMakeLists.txt +++ b/src/hic/CMakeLists.txt @@ -16,10 +16,10 @@ target_sources( ${CMAKE_CURRENT_SOURCE_DIR}/block_reader_impl.hpp ${CMAKE_CURRENT_SOURCE_DIR}/filestream_impl.hpp ${CMAKE_CURRENT_SOURCE_DIR}/hic_file_impl.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/hic_file_stream_impl.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/file_reader_impl.hpp ${CMAKE_CURRENT_SOURCE_DIR}/hic_file_utils_impl.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/hic_footer_impl.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/hic_header_impl.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/footer_impl.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/header_impl.hpp ${CMAKE_CURRENT_SOURCE_DIR}/index_impl.hpp ${CMAKE_CURRENT_SOURCE_DIR}/pixel_selector_impl.hpp) diff --git a/src/hic/block_cache_impl.hpp b/src/hic/block_cache_impl.hpp index 65c9d465..7855189d 100644 --- a/src/hic/block_cache_impl.hpp +++ b/src/hic/block_cache_impl.hpp @@ -13,7 +13,7 @@ #include #include -#include "hictk/hic/hic_footer.hpp" +#include "hictk/hic/footer.hpp" namespace hictk::hic::internal { diff --git a/src/hic/block_reader_impl.hpp b/src/hic/block_reader_impl.hpp index 
1eed4b83..bde5e188 100644 --- a/src/hic/block_reader_impl.hpp +++ b/src/hic/block_reader_impl.hpp @@ -32,7 +32,7 @@ inline std::string &BinaryBuffer::reset() noexcept { return _buffer; } -inline HiCBlockReader::HiCBlockReader(std::shared_ptr hfs, const Index &master_index, +inline HiCBlockReader::HiCBlockReader(std::shared_ptr hfs, const Index &master_index, std::shared_ptr bins_, std::shared_ptr block_cache_) : _hfs(std::move(hfs)), @@ -58,7 +58,7 @@ inline double HiCBlockReader::avg() const noexcept { return sum() / double(num_bins1 * num_bins2); } -inline Index HiCBlockReader::read_index(HiCFileStream &hfs, const HiCFooter &footer) { +inline Index HiCBlockReader::read_index(HiCFileReader &hfs, const HiCFooter &footer) { if (footer.fileOffset() == -1) { // Footer does not exist. However, query may be valid return {}; diff --git a/src/hic/hic_file_stream_impl.hpp b/src/hic/file_reader_impl.hpp similarity index 89% rename from src/hic/hic_file_stream_impl.hpp rename to src/hic/file_reader_impl.hpp index bccd4ba9..422318fb 100644 --- a/src/hic/hic_file_stream_impl.hpp +++ b/src/hic/file_reader_impl.hpp @@ -22,11 +22,11 @@ namespace hictk::hic::internal { -inline HiCFileStream::HiCFileStream(std::string url) - : _fs(std::make_shared(HiCFileStream::openStream(std::move(url)))), - _header(std::make_shared(HiCFileStream::readHeader(*_fs))) {} +inline HiCFileReader::HiCFileReader(std::string url) + : _fs(std::make_shared(HiCFileReader::openStream(std::move(url)))), + _header(std::make_shared(HiCFileReader::readHeader(*_fs))) {} -inline filestream::FileStream HiCFileStream::openStream(std::string url) { +inline filestream::FileStream HiCFileReader::openStream(std::string url) { try { return filestream::FileStream(url); } catch (const std::exception &e) { @@ -34,20 +34,20 @@ inline filestream::FileStream HiCFileStream::openStream(std::string url) { } } -inline const std::string &HiCFileStream::url() const noexcept { return _fs->url(); } -inline const HiCHeader 
&HiCFileStream::header() const noexcept { return *_header; } +inline const std::string &HiCFileReader::url() const noexcept { return _fs->url(); } +inline const HiCHeader &HiCFileReader::header() const noexcept { return *_header; } -inline std::int32_t HiCFileStream::version() const noexcept { +inline std::int32_t HiCFileReader::version() const noexcept { assert(_header->version != -1); return _header->version; } -inline void HiCFileStream::discardExpectedVector(std::int64_t nValues) { +inline void HiCFileReader::discardExpectedVector(std::int64_t nValues) { const std::int64_t elementSize = version() > 8 ? sizeof(float) : sizeof(double); _fs->seekg(nValues * elementSize, std::ios::cur); } -inline std::vector HiCFileStream::readExpectedVector(std::int64_t nValues) { +inline std::vector HiCFileReader::readExpectedVector(std::int64_t nValues) { std::vector initialExpectedValues(static_cast(nValues)); if (version() > 8) { std::vector tmpbuff(static_cast(nValues)); @@ -64,7 +64,7 @@ inline std::vector HiCFileStream::readExpectedVector(std::int64_t nValue return initialExpectedValues; } -inline std::vector HiCFileStream::readNormalizationFactors(std::uint32_t wantedChrom) { +inline std::vector HiCFileReader::readNormalizationFactors(std::uint32_t wantedChrom) { const auto nFactors = _fs->read(); std::vector normFactors{}; auto readFactor = [this]() { @@ -84,7 +84,7 @@ inline std::vector HiCFileStream::readNormalizationFactors(std::uint32_t return normFactors; } -inline void HiCFileStream::applyNormalizationFactors(std::vector &expectedValues, +inline void HiCFileReader::applyNormalizationFactors(std::vector &expectedValues, const std::vector &normFactors) { if (normFactors.empty() || expectedValues.empty()) { return; @@ -94,7 +94,7 @@ inline void HiCFileStream::applyNormalizationFactors(std::vector &expect [&](auto n) { return n / factor; }); } } -inline std::vector HiCFileStream::readNormalizationVector(indexEntry cNormEntry, +inline std::vector 
HiCFileReader::readNormalizationVector(indexEntry cNormEntry, std::size_t numValuesExpected) { _fs->seekg(cNormEntry.position); const auto numValues = static_cast(readNValues()); @@ -121,54 +121,54 @@ inline std::vector HiCFileStream::readNormalizationVector(indexEntry cNo return buffer; } -inline void HiCFileStream::discardNormalizationFactors(std::uint32_t wantedChrom) { +inline void HiCFileReader::discardNormalizationFactors(std::uint32_t wantedChrom) { std::ignore = readNormalizationFactors(wantedChrom); } -inline MatrixType HiCFileStream::readMatrixType(filestream::FileStream &fs, std::string &buff) { +inline MatrixType HiCFileReader::readMatrixType(filestream::FileStream &fs, std::string &buff) { fs.getline(buff, '\0'); return ParseMatrixTypeStr(buff); } -inline NormalizationMethod HiCFileStream::readNormalizationMethod(filestream::FileStream &fs, +inline NormalizationMethod HiCFileReader::readNormalizationMethod(filestream::FileStream &fs, std::string &buff) { fs.getline(buff, '\0'); return ParseNormStr(buff); } -inline MatrixUnit HiCFileStream::readMatrixUnit(filestream::FileStream &fs, std::string &buff) { +inline MatrixUnit HiCFileReader::readMatrixUnit(filestream::FileStream &fs, std::string &buff) { fs.getline(buff, '\0'); return ParseUnitStr(buff); } -inline MatrixType HiCFileStream::readMatrixType() { - return HiCFileStream::readMatrixType(*_fs, _strbuff); +inline MatrixType HiCFileReader::readMatrixType() { + return HiCFileReader::readMatrixType(*_fs, _strbuff); } -inline NormalizationMethod HiCFileStream::readNormalizationMethod() { - return HiCFileStream::readNormalizationMethod(*_fs, _strbuff); +inline NormalizationMethod HiCFileReader::readNormalizationMethod() { + return HiCFileReader::readNormalizationMethod(*_fs, _strbuff); } -inline MatrixUnit HiCFileStream::readMatrixUnit() { - return HiCFileStream::readMatrixUnit(*_fs, _strbuff); +inline MatrixUnit HiCFileReader::readMatrixUnit() { + return HiCFileReader::readMatrixUnit(*_fs, _strbuff); } 
-inline std::int64_t HiCFileStream::readNValues() { +inline std::int64_t HiCFileReader::readNValues() { if (version() > 8) { return _fs->read(); } return _fs->read(); } -inline bool HiCFileStream::checkMagicString(filestream::FileStream &fs) { +inline bool HiCFileReader::checkMagicString(filestream::FileStream &fs) { return fs.getline('\0') == "HIC"; } -inline std::int64_t HiCFileStream::masterOffset() const noexcept { +inline std::int64_t HiCFileReader::masterOffset() const noexcept { return _header->masterIndexOffset; } -inline auto HiCFileStream::init_decompressor() -> Decompressor { +inline auto HiCFileReader::init_decompressor() -> Decompressor { Decompressor zs(libdeflate_alloc_decompressor(), [](auto *ptr) { libdeflate_free_decompressor(ptr); }); if (!zs) { @@ -178,7 +178,7 @@ inline auto HiCFileStream::init_decompressor() -> Decompressor { return zs; } -inline Index HiCFileStream::read_index(std::int64_t fileOffset, const Chromosome &chrom1, +inline Index HiCFileReader::read_index(std::int64_t fileOffset, const Chromosome &chrom1, const Chromosome &chrom2, MatrixUnit wantedUnit, std::int64_t wantedResolution) { _fs->seekg(fileOffset); @@ -232,11 +232,11 @@ inline Index HiCFileStream::read_index(std::int64_t fileOffset, const Chromosome chrom1.name(), chrom2.name(), wantedUnit, wantedResolution)); } -inline bool HiCFileStream::checkMagicString() { return checkMagicString(*_fs); } +inline bool HiCFileReader::checkMagicString() { return checkMagicString(*_fs); } // reads the header, storing the positions of the normalization vectors and returning the // masterIndexPosition pointer -inline HiCHeader HiCFileStream::readHeader(filestream::FileStream &fs) { +inline HiCHeader HiCFileReader::readHeader(filestream::FileStream &fs) { if (!checkMagicString(fs)) { throw std::runtime_error(fmt::format( FMT_STRING("Hi-C magic string is missing. 
{} does not appear to be a hic file"), fs.url())); @@ -309,7 +309,7 @@ inline HiCHeader HiCFileStream::readHeader(filestream::FileStream &fs) { return header; } -inline void HiCFileStream::readAndInflate(const BlockIndex &idx, std::string &plainTextBuffer) { +inline void HiCFileReader::readAndInflate(const BlockIndex &idx, std::string &plainTextBuffer) { try { // _strbuff is used to store compressed data // plainTextBuffer is used to store decompressed data @@ -348,16 +348,16 @@ inline void HiCFileStream::readAndInflate(const BlockIndex &idx, std::string &pl } } -inline bool HiCFileStream::checkMagicString(std::string url) noexcept { +inline bool HiCFileReader::checkMagicString(std::string url) noexcept { try { - filestream::FileStream fs(HiCFileStream::openStream(std::move(url))); - return HiCFileStream::checkMagicString(fs); + filestream::FileStream fs(HiCFileReader::openStream(std::move(url))); + return HiCFileReader::checkMagicString(fs); } catch (...) { return false; } } -inline HiCFooter HiCFileStream::read_footer(std::uint32_t chrom1_id, std::uint32_t chrom2_id, +inline HiCFooter HiCFileReader::read_footer(std::uint32_t chrom1_id, std::uint32_t chrom2_id, MatrixType matrix_type, NormalizationMethod wanted_norm, MatrixUnit wanted_unit, std::uint32_t wanted_resolution) { diff --git a/src/hic/footer_cache_impl.hpp b/src/hic/footer_cache_impl.hpp index 3fc83f45..5afc2b01 100644 --- a/src/hic/footer_cache_impl.hpp +++ b/src/hic/footer_cache_impl.hpp @@ -11,7 +11,7 @@ #include #include -#include "hictk/hic/hic_footer.hpp" +#include "hictk/hic/footer.hpp" namespace hictk::hic::internal { diff --git a/src/hic/hic_footer_impl.hpp b/src/hic/footer_impl.hpp similarity index 100% rename from src/hic/hic_footer_impl.hpp rename to src/hic/footer_impl.hpp diff --git a/src/hic/hic_header_impl.hpp b/src/hic/header_impl.hpp similarity index 100% rename from src/hic/hic_header_impl.hpp rename to src/hic/header_impl.hpp diff --git a/src/hic/hic_file_impl.hpp 
b/src/hic/hic_file_impl.hpp index 4fed1f4c..aa7aaa3d 100644 --- a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -13,13 +13,13 @@ #include #include "hictk/hic/common.hpp" -#include "hictk/hic/hic_footer.hpp" +#include "hictk/hic/footer.hpp" namespace hictk::hic { inline HiCFile::HiCFile(std::string url_, std::uint32_t resolution_, MatrixType type_, MatrixUnit unit_, std::uint64_t block_cache_capacity) - : _fs(std::make_shared(std::move(url_))), + : _fs(std::make_shared(std::move(url_))), _type(type_), _unit(unit_), _block_cache(std::make_shared(block_cache_capacity)), diff --git a/src/hic/hic_file_utils_impl.hpp b/src/hic/hic_file_utils_impl.hpp index 48bfa238..e62904e7 100644 --- a/src/hic/hic_file_utils_impl.hpp +++ b/src/hic/hic_file_utils_impl.hpp @@ -9,6 +9,6 @@ namespace hictk::hic::utils { inline bool is_hic_file(const std::filesystem::path& path) { - return internal::HiCFileStream::checkMagicString(path.string()); + return internal::HiCFileReader::checkMagicString(path.string()); } } // namespace hictk::hic::utils diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index 9cc583da..85c97818 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -15,18 +15,18 @@ #include "hictk/hic/block_cache.hpp" #include "hictk/hic/block_reader.hpp" #include "hictk/hic/common.hpp" +#include "hictk/hic/file_reader.hpp" #include "hictk/hic/filestream.hpp" +#include "hictk/hic/footer.hpp" #include "hictk/hic/footer_cache.hpp" -#include "hictk/hic/hic_file_stream.hpp" -#include "hictk/hic/hic_footer.hpp" -#include "hictk/hic/hic_header.hpp" +#include "hictk/hic/header.hpp" // #include "hictk/hic/hic_matrix_selector.hpp" #include "hictk/hic/pixel_selector.hpp" namespace hictk::hic { class HiCFile { - mutable std::shared_ptr _fs{}; + mutable std::shared_ptr _fs{}; mutable internal::FooterCache _footers{}; MatrixType _type{MatrixType::observed}; MatrixUnit _unit{MatrixUnit::BP}; diff --git 
a/src/hic/include/hictk/hic/block_cache.hpp b/src/hic/include/hictk/hic/block_cache.hpp index 78cf1620..0118b73e 100644 --- a/src/hic/include/hictk/hic/block_cache.hpp +++ b/src/hic/include/hictk/hic/block_cache.hpp @@ -14,7 +14,7 @@ #include "hictk/chromosome.hpp" #include "hictk/hic/common.hpp" -#include "hictk/hic/hic_footer.hpp" +#include "hictk/hic/footer.hpp" #include "hictk/pixel.hpp" namespace hictk::hic::internal { diff --git a/src/hic/include/hictk/hic/block_reader.hpp b/src/hic/include/hictk/hic/block_reader.hpp index 3095d8a7..8d2125cc 100644 --- a/src/hic/include/hictk/hic/block_reader.hpp +++ b/src/hic/include/hictk/hic/block_reader.hpp @@ -12,7 +12,7 @@ #include "hictk/chromosome.hpp" #include "hictk/hic/block_cache.hpp" -#include "hictk/hic/hic_file_stream.hpp" +#include "hictk/hic/file_reader.hpp" #include "hictk/hic/index.hpp" namespace hictk::hic::internal { @@ -34,7 +34,7 @@ class BinaryBuffer { }; class HiCBlockReader { - std::shared_ptr _hfs{}; + std::shared_ptr _hfs{}; std::shared_ptr _blk_cache{}; // This should be passed in by file. 
Key should be // changed from size_t to {chrom1, chrom2, size_t} // We need the entire bin table in order to map pixels to abs bin ids @@ -46,7 +46,7 @@ class HiCBlockReader { public: HiCBlockReader() = default; - HiCBlockReader(std::shared_ptr hfs, const Index& master_index, + HiCBlockReader(std::shared_ptr hfs, const Index& master_index, std::shared_ptr bins_, std::shared_ptr block_cache_); @@ -63,7 +63,7 @@ class HiCBlockReader { [[nodiscard]] std::shared_ptr read(const BlockIndex& idx); private: - [[nodiscard]] static Index read_index(HiCFileStream& hfs, const HiCFooter& footer); + [[nodiscard]] static Index read_index(HiCFileReader& hfs, const HiCFooter& footer); static void read_dispatcher_type1_block(bool i16Bin1, bool i16Bin2, bool i16Counts, std::int32_t bin1Offset, std::int32_t bin2Offset, BinaryBuffer& src, diff --git a/src/hic/include/hictk/hic/hic_file_stream.hpp b/src/hic/include/hictk/hic/file_reader.hpp similarity index 94% rename from src/hic/include/hictk/hic/hic_file_stream.hpp rename to src/hic/include/hictk/hic/file_reader.hpp index ab14fcd1..59f7cea1 100644 --- a/src/hic/include/hictk/hic/hic_file_stream.hpp +++ b/src/hic/include/hictk/hic/file_reader.hpp @@ -15,13 +15,13 @@ #include "hictk/chromosome.hpp" #include "hictk/hic/common.hpp" #include "hictk/hic/filestream.hpp" -#include "hictk/hic/hic_footer.hpp" -#include "hictk/hic/hic_header.hpp" +#include "hictk/hic/footer.hpp" +#include "hictk/hic/header.hpp" #include "hictk/hic/index.hpp" namespace hictk::hic::internal { -class HiCFileStream { +class HiCFileReader { using Decompressor = UniquePtrWithDeleter; std::shared_ptr _fs{}; std::shared_ptr _header{}; @@ -29,8 +29,8 @@ class HiCFileStream { Decompressor _decompressor{init_decompressor()}; public: - HiCFileStream() = default; - explicit HiCFileStream(std::string url); + HiCFileReader() = default; + explicit HiCFileReader(std::string url); [[nodiscard]] inline const std::string &url() const noexcept; [[nodiscard]] const HiCHeader 
&header() const noexcept; @@ -83,4 +83,4 @@ class HiCFileStream { }; } // namespace hictk::hic::internal -#include "../../../hic_file_stream_impl.hpp" +#include "../../../file_reader_impl.hpp" diff --git a/src/hic/include/hictk/hic/hic_footer.hpp b/src/hic/include/hictk/hic/footer.hpp similarity index 98% rename from src/hic/include/hictk/hic/hic_footer.hpp rename to src/hic/include/hictk/hic/footer.hpp index 7bd6fe61..aecfaa67 100644 --- a/src/hic/include/hictk/hic/hic_footer.hpp +++ b/src/hic/include/hictk/hic/footer.hpp @@ -76,4 +76,4 @@ template <> struct std::hash { inline std::size_t operator()(hictk::hic::internal::HiCFooter const &f) const noexcept; }; -#include "../../../hic_footer_impl.hpp" +#include "../../../footer_impl.hpp" diff --git a/src/hic/include/hictk/hic/footer_cache.hpp b/src/hic/include/hictk/hic/footer_cache.hpp index 2eb8fa40..cf30e47b 100644 --- a/src/hic/include/hictk/hic/footer_cache.hpp +++ b/src/hic/include/hictk/hic/footer_cache.hpp @@ -14,7 +14,7 @@ #include "hictk/chromosome.hpp" #include "hictk/hic/common.hpp" -#include "hictk/hic/hic_footer.hpp" +#include "hictk/hic/footer.hpp" namespace hictk::hic::internal { diff --git a/src/hic/include/hictk/hic/hic_header.hpp b/src/hic/include/hictk/hic/header.hpp similarity index 94% rename from src/hic/include/hictk/hic/hic_header.hpp rename to src/hic/include/hictk/hic/header.hpp index 5bc75570..3f41823c 100644 --- a/src/hic/include/hictk/hic/hic_header.hpp +++ b/src/hic/include/hictk/hic/header.hpp @@ -31,4 +31,4 @@ struct HiCHeader { } // namespace hictk::hic::internal -#include "../../../hic_header_impl.hpp" +#include "../../../header_impl.hpp" diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index 06bb9820..d97e39e9 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -12,7 +12,7 @@ #include "hictk/bin_table.hpp" #include "hictk/hic/block_cache.hpp" #include 
"hictk/hic/common.hpp" -#include "hictk/hic/hic_file_stream.hpp" +#include "hictk/hic/file_reader.hpp" #include "hictk/hic/index.hpp" #include "hictk/pixel.hpp" @@ -32,13 +32,13 @@ class PixelSelector { class iterator; PixelSelector() = delete; - PixelSelector(std::shared_ptr hfs_, + PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coords, std::size_t read_all_at_once_threshold = 0) noexcept; - PixelSelector(std::shared_ptr hfs_, + PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coord1_, diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index e1f8b73d..afc381d6 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -13,13 +13,13 @@ #include "hictk/common.hpp" #include "hictk/hic/block_cache.hpp" #include "hictk/hic/common.hpp" -#include "hictk/hic/hic_file_stream.hpp" -#include "hictk/hic/hic_footer.hpp" +#include "hictk/hic/file_reader.hpp" +#include "hictk/hic/footer.hpp" #include "hictk/pixel.hpp" namespace hictk::hic { -inline PixelSelector::PixelSelector(std::shared_ptr hfs_, +inline PixelSelector::PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coords, @@ -27,7 +27,7 @@ inline PixelSelector::PixelSelector(std::shared_ptr hfs : PixelSelector(std::move(hfs_), std::move(footer_), std::move(cache_), std::move(bins_), coords, std::move(coords), read_all_at_once_thresh) {} -inline PixelSelector::PixelSelector(std::shared_ptr hfs_, +inline PixelSelector::PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coord1_, diff --git a/test/units/hic/hic_file_stream_test.cpp b/test/units/hic/hic_file_stream_test.cpp index bccac204..ffb1171d 100644 --- a/test/units/hic/hic_file_stream_test.cpp +++ 
b/test/units/hic/hic_file_stream_test.cpp @@ -2,8 +2,6 @@ // // SPDX-License-Identifier: MIT -#include "hictk/hic/hic_file_stream.hpp" - #include #include #include @@ -11,6 +9,8 @@ #include #include +#include "hictk/hic/file_reader.hpp" + using namespace hictk::hic; namespace hictk::test { @@ -29,7 +29,7 @@ TEST_CASE("readHeader (v8)", "[hic][v8][short]") { constexpr auto* genomeID = "dm6"; constexpr auto nChromosomes = 9; - const auto header = internal::HiCFileStream(pathV8).header(); + const auto header = internal::HiCFileReader(pathV8).header(); CHECK(header.url == pathV8); CHECK(header.masterIndexOffset == 131515430); CHECK(header.genomeID == genomeID); @@ -51,7 +51,7 @@ TEST_CASE("readHeader (v9)", "[hic][v9][short]") { constexpr auto* genomeID = "dm6"; constexpr auto nChromosomes = 9; - const auto header = internal::HiCFileStream(pathV9).header(); + const auto header = internal::HiCFileReader(pathV9).header(); CHECK(header.url == pathV9); CHECK(header.masterIndexOffset == 130706734); @@ -69,7 +69,7 @@ TEST_CASE("readHeader (v9)", "[hic][v9][short]") { // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("read_footer (v8)", "[hic][v8][short]") { - internal::HiCFileStream s(pathV8); + internal::HiCFileReader s(pathV8); const auto chr2L = s.header().chromosomes.at("chr2L"); const auto chr2R = s.header().chromosomes.at("chr2R"); // first 5 expected values @@ -193,7 +193,7 @@ TEST_CASE("read_footer (v8)", "[hic][v8][short]") { // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("read_footer (v9)", "[hic][v9][short]") { - internal::HiCFileStream s(pathV9); + internal::HiCFileReader s(pathV9); const auto chr2L = s.header().chromosomes.at("chr2L"); const auto chr2R = s.header().chromosomes.at("chr2R"); // first 5 expected values From a301a44ce40dacd697ad474d23769497c98ff40e Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 10:09:26 +0200 Subject: [PATCH 12/48] Bugfix 
--- src/hic/file_reader_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hic/file_reader_impl.hpp b/src/hic/file_reader_impl.hpp index 422318fb..cb6ecff2 100644 --- a/src/hic/file_reader_impl.hpp +++ b/src/hic/file_reader_impl.hpp @@ -401,9 +401,11 @@ inline HiCFooter HiCFileReader::read_footer(std::uint32_t chrom1_id, std::uint32 wanted_resolution, wanted_unit)); } + const auto file_offset = _fs->tellg(); HiCFooter footer{read_index(metadata.fileOffset, metadata.chrom1, metadata.chrom2, metadata.unit, metadata.resolution), std::move(metadata)}; + _fs->seekg(static_cast(file_offset)); if ((matrix_type == MT::observed && wanted_norm == NM::NONE) || ((matrix_type == MT::oe || matrix_type == MT::expected) && wanted_norm == NM::NONE && @@ -415,8 +417,6 @@ inline HiCFooter HiCFileReader::read_footer(std::uint32_t chrom1_id, std::uint32 auto &c1Norm = footer.c1Norm(); auto &c2Norm = footer.c2Norm(); - // read in and ignore expected value maps; don't store; reading these to - // get to wanted_norm vector index auto nExpectedValues = _fs->read(); for (std::int32_t i = 0; i < nExpectedValues; ++i) { const auto foundUnit = readMatrixUnit(); From 9510fc25a1fecdadf54ca0b8ee933572b3953f3d Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:04:57 +0200 Subject: [PATCH 13/48] Update tests --- src/hic/hic_file_impl.hpp | 52 ++++++--------- src/hic/include/hictk/hic.hpp | 19 ++++-- src/hic/pixel_selector_impl.hpp | 6 +- test/units/hic/pixel_selector_test.cpp | 89 ++++++++++++++++++++------ 4 files changed, 110 insertions(+), 56 deletions(-) diff --git a/src/hic/hic_file_impl.hpp b/src/hic/hic_file_impl.hpp index aa7aaa3d..8a7e8528 100644 --- a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -76,44 +76,27 @@ inline std::shared_ptr HiCFile::get_footer( return *node; } -/* -inline internal::MatrixSelector HiCFile::get_matrix_selector(const Chromosome& chrom, - 
NormalizationMethod norm) { - return get_matrix_selector(chrom, chrom, norm); -} -inline internal::MatrixSelector HiCFile::get_matrix_selector(const std::string& chromName, - NormalizationMethod norm) { - return get_matrix_selector(chromName, chromName, norm); -} -inline internal::MatrixSelector HiCFile::get_matrix_selector(std::uint32_t chrom_id, - NormalizationMethod norm) { - return get_matrix_selector(chrom_id, chrom_id, norm); -} - -inline internal::MatrixSelector HiCFile::get_matrix_selector(const Chromosome& chrom1, - const Chromosome& chrom2, - NormalizationMethod norm) { - return get_matrix_selector(chrom1.id(), chrom2.id(), norm); -} - - */ inline PixelSelector HiCFile::fetch(std::string_view query, NormalizationMethod norm, - QUERY_TYPE query_type) const { + QUERY_TYPE query_type, + std::size_t read_at_once_threshold) const { const auto gi = query_type == QUERY_TYPE::BED ? GenomicInterval::parse_bed(this->chromosomes(), query) : GenomicInterval::parse_ucsc(this->chromosomes(), std::string{query}); - return this->fetch(gi.chrom(), gi.start(), gi.end(), gi.chrom(), gi.start(), gi.end(), norm); + return this->fetch(gi.chrom(), gi.start(), gi.end(), gi.chrom(), gi.start(), gi.end(), norm, + read_at_once_threshold); } inline PixelSelector HiCFile::fetch(std::string_view chrom_name, std::uint32_t start, - std::uint32_t end, NormalizationMethod norm) const { - return this->fetch(chrom_name, start, end, chrom_name, start, end, norm); + std::uint32_t end, NormalizationMethod norm, + std::size_t read_at_once_threshold) const { + return this->fetch(chrom_name, start, end, chrom_name, start, end, norm, read_at_once_threshold); } inline PixelSelector HiCFile::fetch(std::string_view range1, std::string_view range2, - NormalizationMethod norm, QUERY_TYPE query_type) const { + NormalizationMethod norm, QUERY_TYPE query_type, + std::size_t read_at_once_threshold) const { const auto gi1 = query_type == QUERY_TYPE::BED ? 
GenomicInterval::parse_bed(this->chromosomes(), range1) : GenomicInterval::parse_ucsc(this->chromosomes(), std::string{range1}); @@ -122,22 +105,24 @@ inline PixelSelector HiCFile::fetch(std::string_view range1, std::string_view ra ? GenomicInterval::parse_bed(this->chromosomes(), range2) : GenomicInterval::parse_ucsc(this->chromosomes(), std::string{range2}); - return this->fetch(gi1.chrom(), gi1.start(), gi1.end(), gi2.chrom(), gi2.start(), gi2.end(), - norm); + return this->fetch(gi1.chrom(), gi1.start(), gi1.end(), gi2.chrom(), gi2.start(), gi2.end(), norm, + read_at_once_threshold); } inline PixelSelector HiCFile::fetch(std::string_view chrom1_name, std::uint32_t start1, std::uint32_t end1, std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, - NormalizationMethod norm) const { + NormalizationMethod norm, + std::size_t read_at_once_threshold) const { return this->fetch(chromosomes().at(chrom1_name), start1, end1, chromosomes().at(chrom2_name), - start2, end2, norm); + start2, end2, norm, read_at_once_threshold); } inline PixelSelector HiCFile::fetch(const Chromosome& chrom1, std::uint32_t start1, std::uint32_t end1, const Chromosome& chrom2, std::uint32_t start2, std::uint32_t end2, - NormalizationMethod norm) const { + NormalizationMethod norm, + std::size_t read_at_once_threshold) const { if (chrom1 > chrom2) { throw std::runtime_error( "Query overlaps the lower-triangle of the matrix. 
This is currently not supported."); @@ -185,11 +170,14 @@ inline PixelSelector HiCFile::fetch(const Chromosome& chrom1, std::uint32_t star } }(); - return PixelSelector{_fs, footer, _block_cache, _bins, coord1, coord2}; + return PixelSelector{_fs, footer, _block_cache, _bins, coord1, coord2, read_at_once_threshold}; } inline std::size_t HiCFile::num_cached_footers() const noexcept { return _footers.size(); } inline void HiCFile::purge_footer_cache() { _footers.clear(); } +inline double HiCFile::block_cache_hit_rate() const noexcept { return _block_cache->hit_rate(); } +inline std::size_t HiCFile::block_cache_size() const noexcept { return _block_cache->size(); } +inline void HiCFile::clear_block_cache() noexcept { _block_cache->reset(); } } // namespace hictk::hic diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index 85c97818..cfe659d9 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -53,20 +53,28 @@ class HiCFile { [[nodiscard]] PixelSelector fetch(std::string_view query, NormalizationMethod norm = NormalizationMethod::NONE, - QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + QUERY_TYPE query_type = QUERY_TYPE::UCSC, + std::size_t read_at_once_threshold = 0) const; [[nodiscard]] PixelSelector fetch(std::string_view chrom_name, std::uint32_t start, std::uint32_t end, - NormalizationMethod norm = NormalizationMethod::NONE) const; + NormalizationMethod norm = NormalizationMethod::NONE, + std::size_t read_at_once_threshold = 0) const; [[nodiscard]] PixelSelector fetch(std::string_view range1, std::string_view range2, NormalizationMethod norm = NormalizationMethod::NONE, - QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; + QUERY_TYPE query_type = QUERY_TYPE::UCSC, + std::size_t read_at_once_threshold = 0) const; [[nodiscard]] PixelSelector fetch(std::string_view chrom1_name, std::uint32_t start1, std::uint32_t end1, std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, - NormalizationMethod 
norm = NormalizationMethod::NONE) const; + NormalizationMethod norm = NormalizationMethod::NONE, + std::size_t read_at_once_threshold = 0) const; [[nodiscard]] std::size_t num_cached_footers() const noexcept; void purge_footer_cache(); + [[nodiscard]] double block_cache_hit_rate() const noexcept; + [[nodiscard]] std::size_t block_cache_size() const noexcept; + void clear_block_cache() noexcept; + private: [[nodiscard]] std::shared_ptr get_footer( std::uint32_t chrom1_id, std::uint32_t chrom2_id, MatrixType matrix_type, @@ -75,7 +83,8 @@ class HiCFile { [[nodiscard]] PixelSelector fetch(const Chromosome &chrom1, std::uint32_t start1, std::uint32_t end1, const Chromosome &chrom2, std::uint32_t start2, std::uint32_t end2, - NormalizationMethod norm = NormalizationMethod::NONE) const; + NormalizationMethod norm, + std::size_t read_at_once_threshold) const; }; namespace utils { diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index afc381d6..f4b1e6df 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -116,7 +116,11 @@ inline SerializedPixel PixelSelector::process_interaction(SerializedPixel record template inline std::vector> PixelSelector::read_all() const { - return {begin(), end()}; + // We push_back into buff to avoid traversing pixels twice (once to figure out the vector size, + // and a second time to copy the actual data) + std::vector> buff{}; + std::copy(begin(), end(), std::back_inserter(buff)); + return buff; } inline const PixelCoordinates &PixelSelector::coord1() const noexcept { return _coord1; } diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index 921a0589..03f3d87b 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -16,6 +16,9 @@ namespace hictk::test { inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test +template +using Pixel = 
hictk::Pixel; + const auto pathV8 = (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); // NOLINT(cert-err58-cpp) const auto pathV9 = @@ -23,38 +26,88 @@ const auto pathV9 = const auto path_binary = (hictk::test::datadir / "data.zip").string(); // NOLINT(cert-err58-cpp) // NOLINTNEXTLINE(readability-function-cognitive-complexity) -static std::vector> head(const std::vector>& buffer, - std::size_t n = 5) { +template +static std::vector> head(const std::vector>& buffer, + std::size_t n = 5) { REQUIRE(buffer.size() >= n); - std::vector> slice(n); + std::vector> slice(n); std::copy_n(buffer.begin(), n, slice.begin()); return slice; } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -static std::vector> tail(const std::vector>& buffer, - std::size_t n = 5) { +template +static std::vector> tail(const std::vector>& buffer, + std::size_t n = 5) { REQUIRE(buffer.size() >= n); - std::vector> slice(n); + std::vector> slice(n); std::copy_n(buffer.end() - std::int32_t(n), n, slice.begin()); return slice; } template -static N sumCounts(const std::vector>& buffer) { +static N sumCounts(const std::vector>& buffer) { return std::accumulate(buffer.begin(), buffer.end(), N(0), - [](N accumulator, const hictk::Pixel& p) { + [](N accumulator, const hictk::Pixel& p) { return accumulator + static_cast(p.count); }); } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -static void compareContactRecord(const hictk::Pixel& r1, const SerializedPixel& r2) { +template +static void compareContactRecord(const hictk::Pixel& r1, const SerializedPixel& r2) { CHECK(r1.coords.bin1.start() == r2.bin1_id); CHECK(r1.coords.bin2.start() == r2.bin2_id); - CHECK_THAT(r1.count, Catch::Matchers::WithinRel(r2.count)); + if constexpr (std::is_floating_point_v) { + CHECK_THAT(r1.count, Catch::Matchers::WithinRel(r2.count)); + } else { + CHECK(r1.count == static_cast(r2.count)); + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("MatrixSelector accessors", 
"[hic][short]") { + const auto sel = HiCFile(pathV8, 2'500'000, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L", NormalizationMethod::NONE); + + CHECK(sel.chrom1().name() == "chr2L"); + CHECK(sel.chrom2().name() == "chr2L"); + CHECK(sel.matrix_type() == MatrixType::observed); + CHECK(sel.normalization() == NormalizationMethod::NONE); + CHECK(sel.unit() == MatrixUnit::BP); + CHECK(sel.resolution() == 2500000); + + REQUIRE(sel.chrom1().size() == 23513712); +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("MatrixSelector LRU cache", "[hic][short]") { + HiCFile f(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP); + + auto sel = f.fetch("chr2L", NormalizationMethod::NONE, HiCFile::QUERY_TYPE::UCSC, 100); + // Fill cache + const auto expected_sum = sumCounts(sel.read_all()); + + REQUIRE(f.block_cache_hit_rate() == 0); + REQUIRE(f.block_cache_size() == 6); + + auto sum = sumCounts(sel.read_all()); + CHECK(sum == expected_sum); + CHECK(f.block_cache_hit_rate() == 0.5); + CHECK(f.block_cache_size() == 6); + + for (auto i = 0; i < 3; ++i) { + sum = sumCounts(sel.read_all()); + CHECK(sum == expected_sum); + } + CHECK(f.block_cache_hit_rate() == 4.0 / 5.0); + CHECK(f.block_cache_size() == 6); + + f.clear_block_cache(); + CHECK(f.block_cache_hit_rate() == 0); + CHECK(f.block_cache_size() == 0); } // NOLINTNEXTLINE(readability-function-cognitive-complexity) @@ -64,15 +117,15 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { constexpr std::int32_t expected_sum = 19968156; constexpr std::size_t N = 5; - constexpr std::array head_expected{1745, 2844, 409, 195, 195}; - constexpr std::array tail_expected{119, 34, 281, 53, 193}; + constexpr std::array head_expected{1745, 2844, 409, 195, 195}; + constexpr std::array tail_expected{119, 34, 281, 53, 193}; constexpr auto expected_value = std::make_pair(std::size_t(1229799), SerializedPixel{15770000, 15770000, 1234.0F}); SECTION("v8") { auto sel = HiCFile(pathV8, 10'000, 
MatrixType::observed, MatrixUnit::BP).fetch("chr2L"); - const auto buffer = sel.read_all(); + const auto buffer = sel.read_all(); REQUIRE(buffer.size() == expected_size); CHECK(sumCounts(buffer) == expected_sum); @@ -81,8 +134,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { const auto t = tail(buffer, N); for (std::size_t i = 0; i < N; ++i) { - CHECK_THAT(head_expected[i], Catch::Matchers::WithinRel(h[i].count)); - CHECK_THAT(tail_expected[i], Catch::Matchers::WithinRel(t[i].count)); + CHECK(head_expected[i] == h[i].count); + CHECK(tail_expected[i] == t[i].count); } compareContactRecord(buffer[expected_value.first], expected_value.second); @@ -91,7 +144,7 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { SECTION("v9") { auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP).fetch("chr2L"); - const auto buffer = sel.read_all(); + const auto buffer = sel.read_all(); REQUIRE(buffer.size() == expected_size); CHECK(sumCounts(buffer) == expected_sum); @@ -100,8 +153,8 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { const auto t = tail(buffer, N); for (std::size_t i = 0; i < N; ++i) { - CHECK_THAT(head_expected[i], Catch::Matchers::WithinRel(h[i].count)); - CHECK_THAT(tail_expected[i], Catch::Matchers::WithinRel(t[i].count)); + CHECK(head_expected[i] == h[i].count); + CHECK(tail_expected[i] == t[i].count); } compareContactRecord(buffer[expected_value.first], expected_value.second); From a4936eaadf04df503137b4971afa05b6184f4e24 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:39:06 +0200 Subject: [PATCH 14/48] Bugfix --- src/hic/index_impl.hpp | 1 + src/hic/pixel_selector_impl.hpp | 7 +------ 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/hic/index_impl.hpp b/src/hic/index_impl.hpp index 88060673..78d3c224 100644 --- a/src/hic/index_impl.hpp +++ b/src/hic/index_impl.hpp @@ 
-183,6 +183,7 @@ inline void Index::_map_2d_query_to_blocks(const hictk::PixelCoordinates &coords // check region part that overlaps with lower left triangle but only if intrachromosomal const auto checkLowerLeftTri = is_intra; + _tmp_buffer.clear(); // first check the upper triangular matrix_type for (auto row = row1; row <= row2; ++row) { for (auto col = col1; col <= col2; ++col) { diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index f4b1e6df..ed12cb4a 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -355,7 +355,6 @@ inline void PixelSelector::iterator::read_all_at_once() { _buffer->clear(); _buffer_i = 0; const auto bin_size = bins().bin_size(); - bool pixels_are_sorted = true; for (const auto block_idx : blocks) { for (const auto &[bin1_id, pixels] : *_sel->_reader.read(block_idx)) { const auto bin1 = @@ -379,14 +378,10 @@ inline void PixelSelector::iterator::read_all_at_once() { _buffer->emplace_back( Pixel{PixelCoordinates{bin1, bin2}, conditional_static_cast(p.count)}); } - - pixels_are_sorted &= _buffer->size() < 2 || *(_buffer->end() - 2) < _buffer->back(); } } } - if (!pixels_are_sorted) { - std::sort(_buffer->begin(), _buffer->end()); - } + assert(std::is_sorted(_buffer->begin(), _buffer->end())); _bin1_id = coord1().bin2.rel_id() + 1; } } // namespace hictk::hic From 591899349aed624f540421a605d58a51b9476568 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:51:04 +0200 Subject: [PATCH 15/48] Reduce code duplication in Index --- src/hic/include/hictk/hic/index.hpp | 11 ++-- src/hic/index_impl.hpp | 92 +++++++++++------------------ 2 files changed, 39 insertions(+), 64 deletions(-) diff --git a/src/hic/include/hictk/hic/index.hpp b/src/hic/include/hictk/hic/index.hpp index c5453e97..9cf15f0e 100644 --- a/src/hic/include/hictk/hic/index.hpp +++ b/src/hic/include/hictk/hic/index.hpp @@ -116,11 +116,12 @@ class Index { 
[[nodiscard]] const BlockIndex& at(std::size_t row, std::size_t col) const; private: - void _map_2d_query_to_blocks(const PixelCoordinates& coords1, const PixelCoordinates& coords2, - std::vector& buffer) const; - void _map_2d_query_to_blocks_intra_v9plus(const PixelCoordinates& coords1, - const PixelCoordinates& coords2, - std::vector& buffer) const; + void map_2d_query_to_blocks(const PixelCoordinates& coords1, const PixelCoordinates& coords2, + std::vector& buffer) const; + void generate_block_list(std::size_t bin1, std::size_t bin2, std::size_t bin3, std::size_t bin4, + bool is_intra) const; + void generate_block_list_intra_v9plus(std::size_t bin1, std::size_t bin2, std::size_t bin3, + std::size_t bin4) const; }; } // namespace hictk::hic::internal diff --git a/src/hic/index_impl.hpp b/src/hic/index_impl.hpp index 78d3c224..66cd8c28 100644 --- a/src/hic/index_impl.hpp +++ b/src/hic/index_impl.hpp @@ -138,13 +138,7 @@ inline std::vector Index::find_overlaps(const PixelCoordinates &coor assert(coords2.is_intra()); std::vector buffer{}; - - const auto is_intra = coords1.bin1.chrom() == coords2.bin1.chrom(); - if (_version > 8 && is_intra) { - _map_2d_query_to_blocks_intra_v9plus(coords1, coords2, buffer); - } else { - _map_2d_query_to_blocks(coords1, coords2, buffer); - } + map_2d_query_to_blocks(coords1, coords2, buffer); return buffer; } @@ -158,74 +152,33 @@ inline const BlockIndex &Index::at(std::size_t row, std::size_t col) const { return *match; } -inline void Index::_map_2d_query_to_blocks(const hictk::PixelCoordinates &coords1, - const hictk::PixelCoordinates &coords2, - std::vector &buffer) const { - assert(coords1.bin1.chrom() == _chrom1 || coords1.bin1.chrom() == _chrom2); - assert(coords2.bin1.chrom() == _chrom1 || coords2.bin1.chrom() == _chrom2); - - auto bin1 = coords1.bin1.rel_id(); - auto bin2 = coords1.bin2.rel_id() + 1; - auto bin3 = coords2.bin1.rel_id(); - auto bin4 = coords2.bin2.rel_id() + 1; - - const auto is_intra = coords1.bin1.chrom() == 
coords2.bin1.chrom(); - - if (is_intra && bin1 > bin3) { - std::swap(bin1, bin3); - std::swap(bin2, bin4); - } - +inline void Index::generate_block_list(std::size_t bin1, std::size_t bin2, std::size_t bin3, + std::size_t bin4, bool is_intra) const { const auto col1 = bin1 / _block_bin_count; const auto col2 = (bin2 + 1) / _block_bin_count; const auto row1 = bin3 / _block_bin_count; const auto row2 = (bin4 + 1) / _block_bin_count; // check region part that overlaps with lower left triangle but only if intrachromosomal - const auto checkLowerLeftTri = is_intra; - _tmp_buffer.clear(); - // first check the upper triangular matrix_type for (auto row = row1; row <= row2; ++row) { for (auto col = col1; col <= col2; ++col) { - auto block_id = (col * block_column_count()) + row; - auto match = _block_map.find(block_id); + const auto block_id = (row * block_column_count()) + col; + const auto match = _block_map.find(block_id); if (match != _block_map.end()) { _tmp_buffer.emplace(*match); } - - if (checkLowerLeftTri) { - block_id = (row * block_column_count()) + col; - match = _block_map.find(block_id); - if (match != _block_map.end()) { - _tmp_buffer.emplace(*match); - } - } } } - buffer.resize(_tmp_buffer.size()); - std::move(_tmp_buffer.begin(), _tmp_buffer.end(), buffer.begin()); - std::sort(buffer.begin(), buffer.end()); -} - -inline void Index::_map_2d_query_to_blocks_intra_v9plus(const hictk::PixelCoordinates &coords1, - const hictk::PixelCoordinates &coords2, - std::vector &buffer) const { - // https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md#grid-structure - assert(coords1.bin1.chrom() == _chrom1 || coords1.bin1.chrom() == _chrom2); - assert(coords2.bin1.chrom() == _chrom1 || coords2.bin1.chrom() == _chrom2); - assert(coords1.bin1.chrom() == coords2.bin1.chrom()); - - auto bin1 = coords1.bin1.rel_id(); - auto bin2 = coords1.bin2.rel_id() + 1; - auto bin3 = coords2.bin1.rel_id(); - auto bin4 = coords2.bin2.rel_id() + 1; - - if (bin1 > bin3) { + if 
(is_intra) { std::swap(bin1, bin3); - std::swap(bin2, bin4); + std::swap(bin3, bin4); + generate_block_list(bin1, bin2, bin3, bin4, false); } +} +inline void Index::generate_block_list_intra_v9plus(std::size_t bin1, std::size_t bin2, + std::size_t bin3, std::size_t bin4) const { const auto translatedLowerPAD = (bin1 + bin3) / 2 / _block_bin_count; const auto translatedHigherPAD = (bin2 + bin4) / 2 / _block_bin_count + 1; const auto translatedNearerDepth = @@ -245,7 +198,6 @@ inline void Index::_map_2d_query_to_blocks_intra_v9plus(const hictk::PixelCoordi // +1; integer divide rounds down const auto furtherDepth = (std::max)(translatedNearerDepth, translatedFurtherDepth) + 1; - _tmp_buffer.clear(); for (auto depth = nearerDepth; depth <= furtherDepth; ++depth) { for (auto pad = translatedLowerPAD; pad <= translatedHigherPAD; ++pad) { const auto block_id = (depth * block_column_count()) + pad; @@ -255,6 +207,28 @@ inline void Index::_map_2d_query_to_blocks_intra_v9plus(const hictk::PixelCoordi } } } +} + +inline void Index::map_2d_query_to_blocks(const hictk::PixelCoordinates &coords1, + const hictk::PixelCoordinates &coords2, + std::vector &buffer) const { + assert(coords1.bin1.chrom() == _chrom1 || coords1.bin1.chrom() == _chrom2); + assert(coords2.bin1.chrom() == _chrom1 || coords2.bin1.chrom() == _chrom2); + + auto bin1 = coords1.bin1.rel_id(); + auto bin2 = coords1.bin2.rel_id() + 1; + auto bin3 = coords2.bin1.rel_id(); + auto bin4 = coords2.bin2.rel_id() + 1; + + const auto is_intra = coords1.bin1.chrom() == coords2.bin1.chrom(); + + _tmp_buffer.clear(); + if (_version > 8 && is_intra) { + generate_block_list_intra_v9plus(bin1, bin2, bin3, bin4); + } else { + generate_block_list(bin1, bin2, bin3, bin4, is_intra); + } + buffer.resize(_tmp_buffer.size()); std::move(_tmp_buffer.begin(), _tmp_buffer.end(), buffer.begin()); std::sort(buffer.begin(), buffer.end()); From 82e7e1d9c1902ad3f3c8f3e93b744d3cb2fe3921 Mon Sep 17 00:00:00 2001 From: Roberto Rossini 
<71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 12:52:03 +0200 Subject: [PATCH 16/48] Bugfix. Add tests for interchrom queries --- src/hic/pixel_selector_impl.hpp | 3 +- test/units/hic/pixel_selector_test.cpp | 70 ++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 2 deletions(-) diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index ed12cb4a..c4407266 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -278,7 +278,7 @@ inline std::size_t PixelSelector::iterator::size() const noexcept { template inline std::vector PixelSelector::iterator::find_blocks_overlapping_current_row() { - const auto end_pos = coord2().bin2.start(); + const auto end_pos = coord1().bin2.start(); const auto pos1 = (std::min)(end_pos, static_cast(_bin1_id) * bins().bin_size()); const auto pos2 = (std::min)(end_pos, pos1 + bins().bin_size()); @@ -317,7 +317,6 @@ inline void PixelSelector::iterator::read_next_row() { auto first = std::lower_bound(pixels.begin(), pixels.end(), coord2().bin1.rel_id(), [](const internal::InteractionBlock::ThinPixel &pixel, std::size_t bin_id) { return pixel.bin2_id < bin_id; }); - while (first != pixels.end()) { if (first->bin2_id > coord2().bin2.rel_id()) { break; diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index 03f3d87b..93290b16 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -54,6 +54,21 @@ static N sumCounts(const std::vector>& buffer) { return accumulator + static_cast(p.count); }); } +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +template +static void checkContactRecordsAreWithinBound(std::uint32_t start1, std::uint32_t end1, + std::uint32_t start2, std::uint32_t end2, + const std::vector>& buffer) { + assert(start1 < end1); + assert(start2 < end2); + + for (const auto& r : buffer) { + CHECK(r.coords.bin1.start() >= std::min(start1, start2)); + 
CHECK(r.coords.bin1.end() < std::max(end1, end2)); + CHECK(r.coords.bin2.start() >= std::min(start1, start2)); + CHECK(r.coords.bin2.end() < std::max(end1, end2)); + } +} // NOLINTNEXTLINE(readability-function-cognitive-complexity) template @@ -161,4 +176,59 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { CHECK(std::is_sorted(buffer.begin(), buffer.end())); } } + + SECTION("inter-chromosomal") { + constexpr std::size_t expected_size = 56743; + constexpr std::int32_t expected_sum = 70567; + + constexpr std::size_t N = 5; + constexpr std::array head_expected{1, 1, 1, 1, 1}; + constexpr std::array tail_expected{1, 1, 1, 1, 1}; + + constexpr auto expected_value = + std::make_pair(std::size_t(3541), SerializedPixel{770000, 1300000, 13.0F}); + + SECTION("v8") { + auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L", "chr4", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + auto f = std::fopen("/tmp/test.bg2", "w"); + fmt::print(f, FMT_STRING("{}\n"), fmt::join(buffer, "\n")); + std::fclose(f); + REQUIRE(buffer.size() == expected_size); + + CHECK(sumCounts(buffer) == expected_sum); + + const auto h = head(buffer, N); + const auto t = tail(buffer, N); + + for (std::size_t i = 0; i < N; ++i) { + CHECK(head_expected[i] == h[i].count); + CHECK(tail_expected[i] == t[i].count); + } + + compareContactRecord(buffer[expected_value.first], expected_value.second); + CHECK(std::is_sorted(buffer.begin(), buffer.end())); + } + + SECTION("v9") { + auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L", "chr4", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + + CHECK(sumCounts(buffer) == expected_sum); + + const auto h = head(buffer, N); + const auto t = tail(buffer, N); + + for (std::size_t i = 0; i < N; ++i) { + CHECK(head_expected[i] == h[i].count); + CHECK(tail_expected[i] == t[i].count); + } + + 
compareContactRecord(buffer[expected_value.first], expected_value.second); + CHECK(std::is_sorted(buffer.begin(), buffer.end())); + } + } } From 33774983d864caf4fa1c6f6f9421deebb79f40ed Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 17:54:15 +0200 Subject: [PATCH 17/48] Bugfix. Add tests for sub-chromosomal queries --- src/hic/hic_file_impl.hpp | 4 +- src/hic/include/hictk/hic/index.hpp | 4 +- src/hic/index_impl.hpp | 10 +---- src/hic/pixel_selector_impl.hpp | 10 +++-- test/units/hic/pixel_selector_test.cpp | 61 ++++++++++++++++++++++++-- 5 files changed, 70 insertions(+), 19 deletions(-) diff --git a/src/hic/hic_file_impl.hpp b/src/hic/hic_file_impl.hpp index 8a7e8528..1e22fb72 100644 --- a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -133,8 +133,8 @@ inline PixelSelector HiCFile::fetch(const Chromosome& chrom1, std::uint32_t star FMT_STRING("matrix type {} is incompatible with normalization method {}"), _type, norm)); } - const PixelCoordinates coord1 = {_bins->at(chrom1, start1), _bins->at(chrom1, end1)}; - const PixelCoordinates coord2 = {_bins->at(chrom2, start2), _bins->at(chrom2, end2)}; + const PixelCoordinates coord1 = {_bins->at(chrom1, start1), _bins->at(chrom1, end1 - 1)}; + const PixelCoordinates coord2 = {_bins->at(chrom2, start2), _bins->at(chrom2, end2 - 1)}; auto footer = [&]() { try { diff --git a/src/hic/include/hictk/hic/index.hpp b/src/hic/include/hictk/hic/index.hpp index 9cf15f0e..3674878e 100644 --- a/src/hic/include/hictk/hic/index.hpp +++ b/src/hic/include/hictk/hic/index.hpp @@ -118,8 +118,8 @@ class Index { private: void map_2d_query_to_blocks(const PixelCoordinates& coords1, const PixelCoordinates& coords2, std::vector& buffer) const; - void generate_block_list(std::size_t bin1, std::size_t bin2, std::size_t bin3, std::size_t bin4, - bool is_intra) const; + void generate_block_list(std::size_t bin1, std::size_t bin2, std::size_t bin3, + std::size_t bin4) 
const; void generate_block_list_intra_v9plus(std::size_t bin1, std::size_t bin2, std::size_t bin3, std::size_t bin4) const; }; diff --git a/src/hic/index_impl.hpp b/src/hic/index_impl.hpp index 66cd8c28..333ae006 100644 --- a/src/hic/index_impl.hpp +++ b/src/hic/index_impl.hpp @@ -153,7 +153,7 @@ inline const BlockIndex &Index::at(std::size_t row, std::size_t col) const { } inline void Index::generate_block_list(std::size_t bin1, std::size_t bin2, std::size_t bin3, - std::size_t bin4, bool is_intra) const { + std::size_t bin4) const { const auto col1 = bin1 / _block_bin_count; const auto col2 = (bin2 + 1) / _block_bin_count; const auto row1 = bin3 / _block_bin_count; @@ -169,12 +169,6 @@ inline void Index::generate_block_list(std::size_t bin1, std::size_t bin2, std:: } } } - - if (is_intra) { - std::swap(bin1, bin3); - std::swap(bin3, bin4); - generate_block_list(bin1, bin2, bin3, bin4, false); - } } inline void Index::generate_block_list_intra_v9plus(std::size_t bin1, std::size_t bin2, @@ -226,7 +220,7 @@ inline void Index::map_2d_query_to_blocks(const hictk::PixelCoordinates &coords1 if (_version > 8 && is_intra) { generate_block_list_intra_v9plus(bin1, bin2, bin3, bin4); } else { - generate_block_list(bin1, bin2, bin3, bin4, is_intra); + generate_block_list(bin1, bin2, bin3, bin4); } buffer.resize(_tmp_buffer.size()); diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index c4407266..ec8e9ac5 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -318,18 +318,20 @@ inline void PixelSelector::iterator::read_next_row() { [](const internal::InteractionBlock::ThinPixel &pixel, std::size_t bin_id) { return pixel.bin2_id < bin_id; }); while (first != pixels.end()) { - if (first->bin2_id > coord2().bin2.rel_id()) { + const auto &p = *first; + if (p.bin2_id > coord2().bin2.rel_id()) { break; } - const auto pos2 = static_cast(first->bin2_id) * bin_size; + + const auto pos2 = static_cast(p.bin2_id) * bin_size; if 
constexpr (std::is_integral_v) { _buffer->emplace_back( Pixel{PixelCoordinates{bin1, bins().at(coord2().bin1.chrom(), pos2)}, - static_cast(std::round(first->count))}); + static_cast(std::round(p.count))}); } else { _buffer->emplace_back( Pixel{PixelCoordinates{bin1, bins().at(coord2().bin1.chrom(), pos2)}, - conditional_static_cast(first->count)}); + conditional_static_cast(p.count)}); } ++first; } diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index 93290b16..ac733b9e 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -192,9 +192,6 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP) .fetch("chr2L", "chr4", NormalizationMethod::NONE); const auto buffer = sel.read_all(); - auto f = std::fopen("/tmp/test.bg2", "w"); - fmt::print(f, FMT_STRING("{}\n"), fmt::join(buffer, "\n")); - std::fclose(f); REQUIRE(buffer.size() == expected_size); CHECK(sumCounts(buffer) == expected_sum); @@ -231,4 +228,62 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { CHECK(std::is_sorted(buffer.begin(), buffer.end())); } } + + SECTION("cover type 2 interactions") { + auto sel = HiCFile(pathV8, 2'500'000, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L", "chr2R", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == 110); + CHECK(sumCounts(buffer) == 1483112); + + compareContactRecord(buffer[38], SerializedPixel{7500000, 12500000, 16512}); + CHECK(std::is_sorted(buffer.begin(), buffer.end())); + } + + SECTION("sub-chromosomal queries") { + const std::uint32_t resolution = 10'000; + SECTION("single pixel") { + auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L:100,000-100,001", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == 1); + 
compareContactRecord(buffer.front(), SerializedPixel{100000, 100000, 13895.0F}); + } + + SECTION("upper-triangle") { + auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L:123,456-200,000", "chr2L:0-200,000", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == 36); + CHECK(sumCounts(buffer) == 99946); + compareContactRecord(buffer[33], SerializedPixel{180000, 180000, 3888}); + + checkContactRecordsAreWithinBound(123456, 200000 + resolution, 0, 200000 + resolution, + buffer); + CHECK(std::is_sorted(buffer.begin(), buffer.end())); + } + + SECTION("lower-triangle") { + auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L:0-200,000", "chr2L:123,456-200,000", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == 132); + CHECK(sumCounts(buffer) == 124561); + compareContactRecord(buffer[33], SerializedPixel{40000, 130000, 148}); + checkContactRecordsAreWithinBound(0, 200000 + resolution, 123456, 200000 + resolution, + buffer); + CHECK(std::is_sorted(buffer.begin(), buffer.end())); + } + + SECTION("inter-chromosomal") { + auto sel = HiCFile(pathV9, resolution, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L:123,456-200,000", "chr4:0-200,000", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == 57); + CHECK(sumCounts(buffer) == 74); + checkContactRecordsAreWithinBound(123456, 200000 + resolution, 0, 200000 + resolution, + buffer); + CHECK(std::is_sorted(buffer.begin(), buffer.end())); + } + } } From 89e277d1f11f09cde928fef8e0b8303728b37ce3 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 18:11:10 +0200 Subject: [PATCH 18/48] Simplify PixelSelector impl. 
--- src/hic/hic_file_impl.hpp | 28 +++--- src/hic/include/hictk/hic.hpp | 15 ++-- src/hic/include/hictk/hic/pixel_selector.hpp | 13 ++- src/hic/pixel_selector_impl.hpp | 95 +++----------------- test/units/hic/pixel_selector_test.cpp | 28 ------ 5 files changed, 35 insertions(+), 144 deletions(-) diff --git a/src/hic/hic_file_impl.hpp b/src/hic/hic_file_impl.hpp index 1e22fb72..07e32bcf 100644 --- a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -78,25 +78,21 @@ inline std::shared_ptr HiCFile::get_footer( } inline PixelSelector HiCFile::fetch(std::string_view query, NormalizationMethod norm, - QUERY_TYPE query_type, - std::size_t read_at_once_threshold) const { + QUERY_TYPE query_type) const { const auto gi = query_type == QUERY_TYPE::BED ? GenomicInterval::parse_bed(this->chromosomes(), query) : GenomicInterval::parse_ucsc(this->chromosomes(), std::string{query}); - return this->fetch(gi.chrom(), gi.start(), gi.end(), gi.chrom(), gi.start(), gi.end(), norm, - read_at_once_threshold); + return this->fetch(gi.chrom(), gi.start(), gi.end(), gi.chrom(), gi.start(), gi.end(), norm); } inline PixelSelector HiCFile::fetch(std::string_view chrom_name, std::uint32_t start, - std::uint32_t end, NormalizationMethod norm, - std::size_t read_at_once_threshold) const { - return this->fetch(chrom_name, start, end, chrom_name, start, end, norm, read_at_once_threshold); + std::uint32_t end, NormalizationMethod norm) const { + return this->fetch(chrom_name, start, end, chrom_name, start, end, norm); } inline PixelSelector HiCFile::fetch(std::string_view range1, std::string_view range2, - NormalizationMethod norm, QUERY_TYPE query_type, - std::size_t read_at_once_threshold) const { + NormalizationMethod norm, QUERY_TYPE query_type) const { const auto gi1 = query_type == QUERY_TYPE::BED ? 
GenomicInterval::parse_bed(this->chromosomes(), range1) : GenomicInterval::parse_ucsc(this->chromosomes(), std::string{range1}); @@ -105,24 +101,22 @@ inline PixelSelector HiCFile::fetch(std::string_view range1, std::string_view ra ? GenomicInterval::parse_bed(this->chromosomes(), range2) : GenomicInterval::parse_ucsc(this->chromosomes(), std::string{range2}); - return this->fetch(gi1.chrom(), gi1.start(), gi1.end(), gi2.chrom(), gi2.start(), gi2.end(), norm, - read_at_once_threshold); + return this->fetch(gi1.chrom(), gi1.start(), gi1.end(), gi2.chrom(), gi2.start(), gi2.end(), + norm); } inline PixelSelector HiCFile::fetch(std::string_view chrom1_name, std::uint32_t start1, std::uint32_t end1, std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, - NormalizationMethod norm, - std::size_t read_at_once_threshold) const { + NormalizationMethod norm) const { return this->fetch(chromosomes().at(chrom1_name), start1, end1, chromosomes().at(chrom2_name), - start2, end2, norm, read_at_once_threshold); + start2, end2, norm); } inline PixelSelector HiCFile::fetch(const Chromosome& chrom1, std::uint32_t start1, std::uint32_t end1, const Chromosome& chrom2, std::uint32_t start2, std::uint32_t end2, - NormalizationMethod norm, - std::size_t read_at_once_threshold) const { + NormalizationMethod norm) const { if (chrom1 > chrom2) { throw std::runtime_error( "Query overlaps the lower-triangle of the matrix. 
This is currently not supported."); @@ -170,7 +164,7 @@ inline PixelSelector HiCFile::fetch(const Chromosome& chrom1, std::uint32_t star } }(); - return PixelSelector{_fs, footer, _block_cache, _bins, coord1, coord2, read_at_once_threshold}; + return PixelSelector{_fs, footer, _block_cache, _bins, coord1, coord2}; } inline std::size_t HiCFile::num_cached_footers() const noexcept { return _footers.size(); } diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index cfe659d9..3306fca5 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -53,21 +53,17 @@ class HiCFile { [[nodiscard]] PixelSelector fetch(std::string_view query, NormalizationMethod norm = NormalizationMethod::NONE, - QUERY_TYPE query_type = QUERY_TYPE::UCSC, - std::size_t read_at_once_threshold = 0) const; + QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; [[nodiscard]] PixelSelector fetch(std::string_view chrom_name, std::uint32_t start, std::uint32_t end, - NormalizationMethod norm = NormalizationMethod::NONE, - std::size_t read_at_once_threshold = 0) const; + NormalizationMethod norm = NormalizationMethod::NONE) const; [[nodiscard]] PixelSelector fetch(std::string_view range1, std::string_view range2, NormalizationMethod norm = NormalizationMethod::NONE, - QUERY_TYPE query_type = QUERY_TYPE::UCSC, - std::size_t read_at_once_threshold = 0) const; + QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; [[nodiscard]] PixelSelector fetch(std::string_view chrom1_name, std::uint32_t start1, std::uint32_t end1, std::string_view chrom2_name, std::uint32_t start2, std::uint32_t end2, - NormalizationMethod norm = NormalizationMethod::NONE, - std::size_t read_at_once_threshold = 0) const; + NormalizationMethod norm = NormalizationMethod::NONE) const; [[nodiscard]] std::size_t num_cached_footers() const noexcept; void purge_footer_cache(); @@ -83,8 +79,7 @@ class HiCFile { [[nodiscard]] PixelSelector fetch(const Chromosome &chrom1, std::uint32_t start1, std::uint32_t 
end1, const Chromosome &chrom2, std::uint32_t start2, std::uint32_t end2, - NormalizationMethod norm, - std::size_t read_at_once_threshold) const; + NormalizationMethod norm) const; }; namespace utils { diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index d97e39e9..f273c504 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -25,7 +25,6 @@ class PixelSelector { PixelCoordinates _coord1{}; PixelCoordinates _coord2{}; - std::size_t _read_all_at_once_thresh{}; public: template @@ -35,14 +34,14 @@ class PixelSelector { PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, - std::shared_ptr bins_, PixelCoordinates coords, - std::size_t read_all_at_once_threshold = 0) noexcept; + std::shared_ptr bins_, PixelCoordinates coords) + noexcept; PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coord1_, - PixelCoordinates coord2_, std::size_t read_all_at_once_threshold = 0) noexcept; + PixelCoordinates coord2_) noexcept; [[nodiscard]] bool operator==(const PixelSelector &other) const noexcept; [[nodiscard]] bool operator!=(const PixelSelector &other) const noexcept; @@ -105,7 +104,7 @@ class PixelSelector { using iterator_category = std::forward_iterator_tag; iterator() = default; - explicit iterator(const PixelSelector &sel, std::size_t read_at_once_thresh); + explicit iterator(const PixelSelector &sel); [[nodiscard]] static auto at_end(const PixelSelector &sel) -> iterator; [[nodiscard]] bool operator==(const iterator &other) const noexcept; @@ -114,13 +113,12 @@ class PixelSelector { [[nodiscard]] bool operator<(const iterator &other) const noexcept; [[nodiscard]] auto operator*() const -> const_reference; - // [[nodiscard]] auto operator->() const -> const_pointer; + [[nodiscard]] auto operator->() const -> const_pointer; auto operator++() -> iterator &; 
auto operator++(int) -> iterator; private: - [[nodiscard]] bool discard() const noexcept; [[nodiscard]] bool is_at_end() const noexcept; [[nodiscard]] const BinTable &bins() const noexcept; [[nodiscard]] const PixelCoordinates &coord1() const noexcept; @@ -128,7 +126,6 @@ class PixelSelector { [[nodiscard]] std::size_t size() const noexcept; void read_next_row(); - void read_all_at_once(); [[nodiscard]] std::vector find_blocks_overlapping_current_row(); }; }; diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index ec8e9ac5..2058164e 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -22,22 +22,20 @@ namespace hictk::hic { inline PixelSelector::PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, - std::shared_ptr bins_, PixelCoordinates coords, - std::size_t read_all_at_once_thresh) noexcept + std::shared_ptr bins_, + PixelCoordinates coords) noexcept : PixelSelector(std::move(hfs_), std::move(footer_), std::move(cache_), std::move(bins_), - coords, std::move(coords), read_all_at_once_thresh) {} + coords, std::move(coords)) {} inline PixelSelector::PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coord1_, - PixelCoordinates coord2_, - std::size_t read_all_at_once_thresh) noexcept + PixelCoordinates coord2_) noexcept : _reader(std::move(hfs_), footer_->index(), std::move(bins_), std::move(cache_)), _footer(std::move(footer_)), _coord1(std::move(coord1_)), - _coord2(std::move(coord2_)), - _read_all_at_once_thresh(read_all_at_once_thresh) {} + _coord2(std::move(coord2_)) {} inline bool PixelSelector::operator==(const PixelSelector &other) const noexcept { return _reader.index().chrom1() == _reader.index().chrom2() && _coord1 == other._coord1 && @@ -50,7 +48,7 @@ inline bool PixelSelector::operator!=(const PixelSelector &other) const noexcept template inline auto PixelSelector::cbegin() const -> 
iterator { - return iterator(*this, _read_all_at_once_thresh); + return iterator(*this); } template @@ -155,19 +153,13 @@ inline N PixelSelector::sum() const noexcept { inline double PixelSelector::avg() const noexcept { return _reader.avg(); } template -inline PixelSelector::iterator::iterator(const PixelSelector &sel, - std::size_t read_at_once_thresh) +inline PixelSelector::iterator::iterator(const PixelSelector &sel) : _sel(&sel), _bin1_id(coord1().bin1.rel_id()), _buffer(std::make_shared()) { if (_sel->_reader.index().empty()) { *this = at_end(sel); return; } - if (_sel->_reader.index().size() < read_at_once_thresh) { - read_all_at_once(); - return; - } - while (_buffer->empty()) { read_next_row(); } @@ -210,6 +202,13 @@ inline auto PixelSelector::iterator::operator*() const -> const_reference { return (*_buffer)[_buffer_i]; } +template +inline auto PixelSelector::iterator::operator->() const -> const_pointer { + assert(!!_buffer); + assert(_buffer_i < _buffer->size()); + return &(*_buffer)[_buffer_i]; +} + template inline auto PixelSelector::iterator::operator++() -> iterator & { assert(!!_buffer); @@ -230,26 +229,6 @@ inline auto PixelSelector::iterator::operator++(int) -> iterator { return it; } -template -inline bool PixelSelector::iterator::discard() const noexcept { - if (is_at_end()) { - return true; - } - - assert(!!_buffer); - if (_buffer->empty()) { - return true; - } - - const auto &pixel = _buffer->front(); - // clang-format off - return pixel.coords.bin1 < coord1().bin1 || - pixel.coords.bin1 > coord1().bin2 || - pixel.coords.bin2 < coord2().bin1 || - pixel.coords.bin2 > coord2().bin2; - // clang-format on -} - template inline bool PixelSelector::iterator::is_at_end() const noexcept { return _buffer == nullptr; @@ -339,50 +318,4 @@ inline void PixelSelector::iterator::read_next_row() { assert(std::is_sorted(_buffer->begin(), _buffer->end())); _bin1_id++; } - -template -inline void PixelSelector::iterator::read_all_at_once() { - assert(!!_sel); - 
const auto &blocks = _sel->_reader.index(); - if (blocks.empty()) { - *this = at_end(*_sel); - return; - } - - if (_buffer.use_count() != 1) { - _buffer = std::make_shared(_buffer->capacity()); - } - - _buffer->clear(); - _buffer_i = 0; - const auto bin_size = bins().bin_size(); - for (const auto block_idx : blocks) { - for (const auto &[bin1_id, pixels] : *_sel->_reader.read(block_idx)) { - const auto bin1 = - bins().at(coord1().bin1.chrom(), static_cast(bin1_id) * bin_size); - if (bin1 < coord1().bin1 || bin1 > coord1().bin2) { - continue; - } - for (const auto &p : pixels) { - const auto bin2 = - bins().at(coord2().bin1.chrom(), static_cast(p.bin2_id) * bin_size); - if (bin2 < coord2().bin1) { - continue; - } - if (bin2 > coord2().bin2) { - break; - } - if constexpr (std::is_integral_v) { - _buffer->emplace_back( - Pixel{PixelCoordinates{bin1, bin2}, static_cast(std::round(p.count))}); - } else { - _buffer->emplace_back( - Pixel{PixelCoordinates{bin1, bin2}, conditional_static_cast(p.count)}); - } - } - } - } - assert(std::is_sorted(_buffer->begin(), _buffer->end())); - _bin1_id = coord1().bin2.rel_id() + 1; -} } // namespace hictk::hic diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index ac733b9e..22ebcb98 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -97,34 +97,6 @@ TEST_CASE("MatrixSelector accessors", "[hic][short]") { REQUIRE(sel.chrom1().size() == 23513712); } -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector LRU cache", "[hic][short]") { - HiCFile f(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP); - - auto sel = f.fetch("chr2L", NormalizationMethod::NONE, HiCFile::QUERY_TYPE::UCSC, 100); - // Fill cache - const auto expected_sum = sumCounts(sel.read_all()); - - REQUIRE(f.block_cache_hit_rate() == 0); - REQUIRE(f.block_cache_size() == 6); - - auto sum = sumCounts(sel.read_all()); - CHECK(sum == expected_sum); - 
CHECK(f.block_cache_hit_rate() == 0.5); - CHECK(f.block_cache_size() == 6); - - for (auto i = 0; i < 3; ++i) { - sum = sumCounts(sel.read_all()); - CHECK(sum == expected_sum); - } - CHECK(f.block_cache_hit_rate() == 4.0 / 5.0); - CHECK(f.block_cache_size() == 6); - - f.clear_block_cache(); - CHECK(f.block_cache_hit_rate() == 0); - CHECK(f.block_cache_size() == 0); -} - // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { SECTION("intra-chromosomal") { From 6b4a8ff231c219a9f725477c716d75e4ea97a2f7 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 18:41:31 +0200 Subject: [PATCH 19/48] Address clang-tidy warnings --- src/hic/include/hictk/hic/pixel_selector.hpp | 3 +-- test/units/hic/hic_file_stream_test.cpp | 8 +++---- test/units/hic/hic_file_test.cpp | 13 ++++++----- test/units/hic/pixel_selector_test.cpp | 24 ++++++++++---------- 4 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index f273c504..c727aa15 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -34,8 +34,7 @@ class PixelSelector { PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, std::shared_ptr cache_, - std::shared_ptr bins_, PixelCoordinates coords) - noexcept; + std::shared_ptr bins_, PixelCoordinates coords) noexcept; PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, diff --git a/test/units/hic/hic_file_stream_test.cpp b/test/units/hic/hic_file_stream_test.cpp index ffb1171d..99c8ba54 100644 --- a/test/units/hic/hic_file_stream_test.cpp +++ b/test/units/hic/hic_file_stream_test.cpp @@ -17,10 +17,10 @@ namespace hictk::test { inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test -const auto pathV8 = - 
(hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); // NOLINT(cert-err58-cpp) -const auto pathV9 = - (hictk::test::datadir / "4DNFIZ1ZVXC8.hic9").string(); // NOLINT(cert-err58-cpp) +// NOLINTNEXTLINE(cert-err58-cpp) +const auto pathV8 = (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); +// NOLINTNEXTLINE(cert-err58-cpp) +const auto pathV9 = (hictk::test::datadir / "4DNFIZ1ZVXC8.hic9").string(); // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("readHeader (v8)", "[hic][v8][short]") { diff --git a/test/units/hic/hic_file_test.cpp b/test/units/hic/hic_file_test.cpp index c9ca625a..b0e9cdd1 100644 --- a/test/units/hic/hic_file_test.cpp +++ b/test/units/hic/hic_file_test.cpp @@ -15,9 +15,10 @@ namespace hictk::test { inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err58-cpp) } // namespace hictk::test -const auto pathV8 = - (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); // NOLINT(cert-err58-cpp) -const auto path_binary = (hictk::test::datadir / "data.zip").string(); // NOLINT(cert-err58-cpp) +// NOLINTNEXTLINE(cert-err58-cpp) +const auto pathV8 = (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); +// NOLINTNEXTLINE(cert-err58-cpp) +const auto path_binary = (hictk::test::datadir / "data.zip").string(); // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("utils: is_hic_file", "[hic][short]") { @@ -83,12 +84,12 @@ TEST_CASE("HiCFile footer cache", "[hic][short]") { // NOLINTNEXTLINE(readability-function-cognitive-complexity) TEST_CASE("HiCFile get_matrix_selector", "[hic][short]") { constexpr auto norm = NormalizationMethod::NONE; - HiCFile f(pathV8, 2'500'000, MatrixType::observed, MatrixUnit::BP); + const HiCFile f(pathV8, 2'500'000, MatrixType::observed, MatrixUnit::BP); REQUIRE(f.chromosomes().size() == 9); - const auto chrom1 = "chr2L"; - const auto chrom2 = "chr2R"; + const auto* chrom1 = "chr2L"; + const auto* chrom2 = "chr2R"; SECTION("intra-chromosomal") { auto sel = 
f.fetch(chrom1, norm); CHECK(sel.chrom1() == chrom1); diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index 22ebcb98..9c80ef56 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -19,13 +19,15 @@ inline const std::filesystem::path datadir{"test/data/hic"}; // NOLINT(cert-err template using Pixel = hictk::Pixel; -const auto pathV8 = - (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); // NOLINT(cert-err58-cpp) -const auto pathV9 = - (hictk::test::datadir / "4DNFIZ1ZVXC8.hic9").string(); // NOLINT(cert-err58-cpp) -const auto path_binary = (hictk::test::datadir / "data.zip").string(); // NOLINT(cert-err58-cpp) +// NOLINTNEXTLINE(cert-err58-cpp) +const auto pathV8 = (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); + +// NOLINTNEXTLINE(cert-err58-cpp) +const auto pathV9 = (hictk::test::datadir / "4DNFIZ1ZVXC8.hic9").string(); + +// NOLINTNEXTLINE(cert-err58-cpp) +const auto path_binary = (hictk::test::datadir / "data.zip").string(); -// NOLINTNEXTLINE(readability-function-cognitive-complexity) template static std::vector> head(const std::vector>& buffer, std::size_t n = 5) { @@ -36,8 +38,7 @@ static std::vector> head(const std::vector>& buf return slice; } -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -template +template // NOLINTNEXTLINE(readability-function-cognitive-complexity) static std::vector> tail(const std::vector>& buffer, std::size_t n = 5) { REQUIRE(buffer.size() >= n); @@ -54,8 +55,8 @@ static N sumCounts(const std::vector>& buffer) { return accumulator + static_cast(p.count); }); } -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -template + +template // NOLINTNEXTLINE(readability-function-cognitive-complexity) static void checkContactRecordsAreWithinBound(std::uint32_t start1, std::uint32_t end1, std::uint32_t start2, std::uint32_t end2, const std::vector>& buffer) { @@ -70,8 +71,7 @@ static void 
checkContactRecordsAreWithinBound(std::uint32_t start1, std::uint32_ } } -// NOLINTNEXTLINE(readability-function-cognitive-complexity) -template +template // NOLINTNEXTLINE(readability-function-cognitive-complexity) static void compareContactRecord(const hictk::Pixel& r1, const SerializedPixel& r2) { CHECK(r1.coords.bin1.start() == r2.bin1_id); CHECK(r1.coords.bin2.start() == r2.bin2_id); From 2fecb08e8ff36bde94172a3b7346d2a999ecdf34 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 19:21:22 +0200 Subject: [PATCH 20/48] Bugfix. Add more tests --- src/hic/block_reader_impl.hpp | 12 +- src/hic/include/hictk/hic/block_reader.hpp | 2 +- src/hic/include/hictk/hic/pixel_selector.hpp | 3 +- src/hic/pixel_selector_impl.hpp | 29 ++-- test/units/hic/pixel_selector_test.cpp | 137 +++++++++++++++++++ 5 files changed, 162 insertions(+), 21 deletions(-) diff --git a/src/hic/block_reader_impl.hpp b/src/hic/block_reader_impl.hpp index bde5e188..a70f87fd 100644 --- a/src/hic/block_reader_impl.hpp +++ b/src/hic/block_reader_impl.hpp @@ -51,9 +51,15 @@ inline const Index &HiCBlockReader::index() const noexcept { return _index; } inline double HiCBlockReader::sum() const noexcept { return _index.matrix_sum(); } -inline double HiCBlockReader::avg() const noexcept { - const auto num_bins1 = bins().subset(chrom1()).size(); - const auto num_bins2 = bins().subset(chrom2()).size(); +inline double HiCBlockReader::avg() const { + if (_index.is_intra()) { + throw std::domain_error( + "HiCBlockReader::avg is not implemented for intra-chromosomal matrices"); + } + + const auto bin_size = bins().bin_size(); + const auto num_bins1 = (chrom1().size() + bin_size - 1) / bin_size; + const auto num_bins2 = (chrom2().size() + bin_size - 1) / bin_size; return sum() / double(num_bins1 * num_bins2); } diff --git a/src/hic/include/hictk/hic/block_reader.hpp b/src/hic/include/hictk/hic/block_reader.hpp index 8d2125cc..ff8a9170 100644 --- 
a/src/hic/include/hictk/hic/block_reader.hpp +++ b/src/hic/include/hictk/hic/block_reader.hpp @@ -58,7 +58,7 @@ class HiCBlockReader { [[nodiscard]] const Index& index() const noexcept; [[nodiscard]] double sum() const noexcept; - [[nodiscard]] double avg() const noexcept; + [[nodiscard]] double avg() const; [[nodiscard]] std::shared_ptr read(const BlockIndex& idx); diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index c727aa15..28349d32 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -78,7 +78,8 @@ class PixelSelector { [[nodiscard]] double avg() const noexcept; private: - [[nodiscard]] SerializedPixel process_interaction(SerializedPixel record) const; + [[nodiscard]] internal::InteractionBlock::ThinPixel transform_pixel( + std::size_t bin1, internal::InteractionBlock::ThinPixel pixel) const; public: template diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index 2058164e..d5252f20 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -66,29 +66,26 @@ inline auto PixelSelector::end() const -> iterator { return this->cend(); } -inline SerializedPixel PixelSelector::process_interaction(SerializedPixel record) const { +inline internal::InteractionBlock::ThinPixel PixelSelector::transform_pixel( + std::size_t bin1, internal::InteractionBlock::ThinPixel pixel) const { const auto &c1Norm = _footer->c1Norm(); const auto &c2Norm = _footer->c2Norm(); const auto &expected = _footer->expectedValues(); - assert(is_inter() || record.bin1_id <= record.bin2_id); + assert(is_inter() || bin1 <= pixel.bin2_id); const auto skipNormalization = normalization() == NormalizationMethod::NONE || matrix_type() == MatrixType::expected; if (!skipNormalization) { - const auto bin1 = static_cast(record.bin1_id); - const auto bin2 = static_cast(record.bin2_id); + const auto bin2 = static_cast(pixel.bin2_id); 
assert(bin1 < c1Norm.size()); assert(bin2 < c2Norm.size()); - record.count /= static_cast(c1Norm[bin1] * c2Norm[bin2]); + pixel.count /= static_cast(c1Norm[bin1] * c2Norm[bin2]); } - record.bin1_id *= resolution(); - record.bin2_id *= resolution(); - if (matrix_type() == MatrixType::observed) { - return record; + return pixel; } const auto expectedCount = [&]() { @@ -96,20 +93,20 @@ inline SerializedPixel PixelSelector::process_interaction(SerializedPixel record return float(_reader.avg()); } - const auto i = static_cast((record.bin2_id - record.bin1_id) / resolution()); + const auto i = (pixel.bin2_id - bin1); assert(i < expected.size()); return float(expected[i]); }(); if (matrix_type() == MatrixType::expected) { - record.count = expectedCount; - return record; + pixel.count = expectedCount; + return pixel; } assert(matrix_type() == MatrixType::oe); - record.count /= expectedCount; + pixel.count /= expectedCount; - return record; + return pixel; } template @@ -297,7 +294,7 @@ inline void PixelSelector::iterator::read_next_row() { [](const internal::InteractionBlock::ThinPixel &pixel, std::size_t bin_id) { return pixel.bin2_id < bin_id; }); while (first != pixels.end()) { - const auto &p = *first; + const auto p = _sel->transform_pixel(_bin1_id, *first); if (p.bin2_id > coord2().bin2.rel_id()) { break; } @@ -306,7 +303,7 @@ inline void PixelSelector::iterator::read_next_row() { if constexpr (std::is_integral_v) { _buffer->emplace_back( Pixel{PixelCoordinates{bin1, bins().at(coord2().bin1.chrom(), pos2)}, - static_cast(std::round(p.count))}); + conditional_static_cast(std::round(p.count))}); } else { _buffer->emplace_back( Pixel{PixelCoordinates{bin1, bins().at(coord2().bin1.chrom(), pos2)}, diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index 9c80ef56..bbd6e32f 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -258,4 +258,141 @@ TEST_CASE("MatrixSelector fetch (observed 
NONE BP 10000)", "[hic][short]") { CHECK(std::is_sorted(buffer.begin(), buffer.end())); } } + SECTION("invalid") { + SECTION("invalid chromosome") { + const HiCFile hic(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP); + CHECK_THROWS(hic.fetch("chr123", NormalizationMethod::NONE)); + } + SECTION("invalid unit") { + const HiCFile hic(pathV9, 10'000, MatrixType::observed, MatrixUnit::FRAG); + CHECK_THROWS(hic.fetch("chr2L", NormalizationMethod::NONE)); + } + SECTION("expected + norm") { + const HiCFile hic(pathV9, 10'000, MatrixType::expected, MatrixUnit::BP); + CHECK_THROWS(hic.fetch("chr2L", NormalizationMethod::VC)); + } + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("MatrixSelector fetch (observed VC BP 10000)", "[hic][short]") { + SECTION("intra-chromosomal") { + constexpr std::size_t expected_size = 1433133; + constexpr double expected_sum = 20391277.41514; + SECTION("v8") { + auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L", NormalizationMethod::VC); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + SECTION("v9") { + auto sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L", NormalizationMethod::VC); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + } + SECTION("inter-chromosomal") { + constexpr std::size_t expected_size = 56743; + constexpr double expected_sum = 96690.056244753; + SECTION("v8") { + auto sel = HiCFile(pathV8, 10'000, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L", "chr4", NormalizationMethod::VC); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + + SECTION("v9") { + auto 
sel = HiCFile(pathV9, 10'000, MatrixType::observed, MatrixUnit::BP) + .fetch("chr2L", "chr4", NormalizationMethod::VC); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("MatrixSelector fetch (expected NONE BP 10000)", "[hic][short]") { + SECTION("intra-chromosomal") { + constexpr std::size_t expected_size = 1433133; + constexpr double expected_sum = 18314748.068024; + SECTION("v8") { + auto sel = HiCFile(pathV8, 10'000, MatrixType::expected, MatrixUnit::BP) + .fetch("chr2L", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + SECTION("v9") { + auto sel = HiCFile(pathV9, 10'000, MatrixType::expected, MatrixUnit::BP) + .fetch("chr2L", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + } + SECTION("inter-chromosomal") { + constexpr std::size_t expected_size = 56743; + constexpr double expected_sum = 12610.80619812; + SECTION("v8") { + auto sel = HiCFile(pathV8, 10'000, MatrixType::expected, MatrixUnit::BP) + .fetch("chr2L", "chr4", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + + SECTION("v9") { + auto sel = HiCFile(pathV9, 10'000, MatrixType::expected, MatrixUnit::BP) + .fetch("chr2L", "chr4", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + } +} + +// 
NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("MatrixSelector fetch (oe NONE BP 10000)", "[hic][short]") { + SECTION("intra-chromosomal") { + constexpr std::size_t expected_size = 1433133; + constexpr double expected_sum = 2785506.2274201; + SECTION("v8") { + auto sel = HiCFile(pathV8, 10'000, MatrixType::oe, MatrixUnit::BP) + .fetch("chr2L", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + SECTION("v9") { + auto sel = HiCFile(pathV9, 10'000, MatrixType::oe, MatrixUnit::BP) + .fetch("chr2L", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + } + SECTION("inter-chromosomal") { + constexpr std::size_t expected_size = 56743; + constexpr double expected_sum = 317520.00459671; + SECTION("v8") { + auto sel = HiCFile(pathV8, 10'000, MatrixType::oe, MatrixUnit::BP) + .fetch("chr2L", "chr4", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + + SECTION("v9") { + auto sel = HiCFile(pathV9, 10'000, MatrixType::oe, MatrixUnit::BP) + .fetch("chr2L", "chr4", NormalizationMethod::NONE); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == expected_size); + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(expected_sum, 1.0e-6)); + } + } } From 285717e7c6af78cfce7457f5b773e1c24001060c Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 19:26:18 +0200 Subject: [PATCH 21/48] Address compiler warnings --- src/hic/pixel_selector_impl.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hic/pixel_selector_impl.hpp 
b/src/hic/pixel_selector_impl.hpp index d5252f20..4d221068 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -78,7 +78,7 @@ inline internal::InteractionBlock::ThinPixel PixelSelector::transform_pixel( normalization() == NormalizationMethod::NONE || matrix_type() == MatrixType::expected; if (!skipNormalization) { - const auto bin2 = static_cast(pixel.bin2_id); + const auto bin2 = pixel.bin2_id; assert(bin1 < c1Norm.size()); assert(bin2 < c2Norm.size()); pixel.count /= static_cast(c1Norm[bin1] * c2Norm[bin2]); From 312e92838142183864115fc43af6793f48ed6510 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 19:30:19 +0200 Subject: [PATCH 22/48] Improve CMake status updates --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 15aa9294..9597feca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -113,13 +113,13 @@ add_subdirectory(src) if(HICTK_ENABLE_TESTING) enable_testing() - message(STATUS "-- Building unit tests.") + message(STATUS "Building unit tests.") target_compile_definitions(hictk_project_options INTERFACE HICTK_ENABLE_TESTING) add_subdirectory(test) endif() if(HICTK_BUILD_EXAMPLES) - message(STATUS "-- Building examples.") + message(STATUS "Building examples.") # add_subdirectory(examples) endif() From 8943827fec42f19c69a4fea1dafcfdd3b882d1f4 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 19:31:14 +0200 Subject: [PATCH 23/48] Increase CTest timeout in windows CI --- .github/workflows/windows-ci.yml | 2 +- test/units/hic/pixel_selector_test.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/windows-ci.yml b/.github/workflows/windows-ci.yml index 585d880e..4b450e09 100644 --- a/.github/workflows/windows-ci.yml +++ b/.github/workflows/windows-ci.yml @@ -153,7 +153,7 @@ jobs: 
--schedule-random \ --output-on-failure \ --no-tests=error \ - --timeout 300 |& + --timeout 600 |& head -n 1000 windows-ci-status-check: diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index bbd6e32f..0e338e53 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -98,7 +98,7 @@ TEST_CASE("MatrixSelector accessors", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { +TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][long]") { SECTION("intra-chromosomal") { constexpr std::size_t expected_size = 1433133; constexpr std::int32_t expected_sum = 19968156; @@ -275,7 +275,7 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (observed VC BP 10000)", "[hic][short]") { +TEST_CASE("MatrixSelector fetch (observed VC BP 10000)", "[hic][long]") { SECTION("intra-chromosomal") { constexpr std::size_t expected_size = 1433133; constexpr double expected_sum = 20391277.41514; @@ -316,7 +316,7 @@ TEST_CASE("MatrixSelector fetch (observed VC BP 10000)", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (expected NONE BP 10000)", "[hic][short]") { +TEST_CASE("MatrixSelector fetch (expected NONE BP 10000)", "[hic][long]") { SECTION("intra-chromosomal") { constexpr std::size_t expected_size = 1433133; constexpr double expected_sum = 18314748.068024; @@ -357,7 +357,7 @@ TEST_CASE("MatrixSelector fetch (expected NONE BP 10000)", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (oe NONE BP 10000)", "[hic][short]") { +TEST_CASE("MatrixSelector fetch (oe NONE BP 10000)", "[hic][long]") { SECTION("intra-chromosomal") { constexpr 
std::size_t expected_size = 1433133; constexpr double expected_sum = 2785506.2274201; From 9efa491a2cbe49f49f2e9f020ef624095832fc44 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 19:42:54 +0200 Subject: [PATCH 24/48] Rename test cases --- test/units/cooler/attribute_test.cpp | 6 +++--- test/units/cooler/dataset_test.cpp | 14 +++++++------- test/units/cooler/index_test.cpp | 10 +++++----- test/units/cooler/pixel_selector_test.cpp | 4 ++-- test/units/cooler/utils_equal_test.cpp | 2 +- test/units/cooler/utils_merge_test.cpp | 2 +- test/units/hic/CMakeLists.txt | 2 +- ...c_file_stream_test.cpp => file_reader_test.cpp} | 12 ++++++------ test/units/hic/filestream_test.cpp | 12 ++++++------ test/units/hic/hic_file_test.cpp | 8 ++++---- test/units/hic/pixel_selector_test.cpp | 10 +++++----- 11 files changed, 41 insertions(+), 41 deletions(-) rename test/units/hic/{hic_file_stream_test.cpp => file_reader_test.cpp} (98%) diff --git a/test/units/cooler/attribute_test.cpp b/test/units/cooler/attribute_test.cpp index 2fab5b26..ddeb95cd 100644 --- a/test/units/cooler/attribute_test.cpp +++ b/test/units/cooler/attribute_test.cpp @@ -44,7 +44,7 @@ static void compare_attribute(H5Obj& obj, std::string_view key, const std::vecto } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Attribute: write", "[cooler][short]") { +TEST_CASE("Cooler: attribute write", "[cooler][short]") { const auto path = testdir() / "test_write_attrs.cool"; auto f = HighFive::File(path.string(), HighFive::File::Truncate); @@ -287,7 +287,7 @@ TEST_CASE("Attribute: write", "[cooler][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Attribute: read", "[cooler][short]") { +TEST_CASE("Cooler: attribute read", "[cooler][short]") { const auto path = datadir / "test_read_attrs.h5"; auto f = HighFive::File(path.string(), HighFive::File::ReadOnly); @@ -412,7 +412,7 @@ TEST_CASE("Attribute: 
read", "[cooler][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Attribute: read - test numeric conversions", "[cooler][short]") { +TEST_CASE("Cooler: attribute read - test numeric conversions", "[cooler][short]") { const auto path = testdir() / "test_read_attrs_numeric_conversion.cool"; auto f = HighFive::File(path.string(), HighFive::File::Truncate); diff --git a/test/units/cooler/dataset_test.cpp b/test/units/cooler/dataset_test.cpp index c19d9b9c..d3665dde 100644 --- a/test/units/cooler/dataset_test.cpp +++ b/test/units/cooler/dataset_test.cpp @@ -23,7 +23,7 @@ const auto& testdir = hictk::test::testdir; const auto& datadir = hictk::test::datadir; // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Dataset: read", "[dataset][short]") { +TEST_CASE("Cooler: dataset read", "[dataset][short]") { const auto path = datadir / "cooler_test_file.cool"; const RootGroup grp{HighFive::File(path.string()).getGroup("/")}; @@ -101,7 +101,7 @@ TEST_CASE("Dataset: read", "[dataset][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Dataset: write", "[dataset][short]") { +TEST_CASE("Cooler: dataset write", "[dataset][short]") { const auto path = testdir() / "test_dataset_write.cool"; const RootGroup grp{HighFive::File(path.string(), HighFive::File::Truncate).getGroup("/")}; @@ -210,7 +210,7 @@ TEST_CASE("Dataset: write", "[dataset][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Dataset: accessors", "[dataset][short]") { +TEST_CASE("Cooler: dataset accessors", "[dataset][short]") { const auto path = datadir / "cooler_test_file.cool"; const RootGroup grp{HighFive::File(path.string()).getGroup("/")}; @@ -226,7 +226,7 @@ TEST_CASE("Dataset: accessors", "[dataset][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Dataset: linear iteration", "[dataset][long]") { +TEST_CASE("Cooler: dataset linear iteration", 
"[dataset][long]") { const auto path = datadir / "cooler_test_file.cool"; const RootGroup grp{HighFive::File(path.string()).getGroup("/")}; @@ -264,7 +264,7 @@ TEST_CASE("Dataset: linear iteration", "[dataset][long]") { } // NOLINT(clang-analyzer-cplusplus.NewDeleteLeaks) // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Dataset: random iteration", "[dataset][medium]") { +TEST_CASE("Cooler: dataset random iteration", "[dataset][medium]") { const auto path = testdir() / "dataset_iterator_random.h5"; const RootGroup grp{HighFive::File(path.string(), HighFive::File::Truncate).getGroup("/")}; @@ -326,7 +326,7 @@ TEST_CASE("Dataset: random iteration", "[dataset][medium]") { } } -TEST_CASE("Dataset: large read/write", "[dataset][long]") { +TEST_CASE("Cooler: dataset large read/write", "[dataset][long]") { const auto path = testdir() / "test_dataset_large_rw.h5"; constexpr std::uint64_t seed{4195331987557451569}; @@ -354,7 +354,7 @@ TEST_CASE("Dataset: large read/write", "[dataset][long]") { [&](const auto& n) { CHECK(n == static_cast(rand_eng())); }); } -TEST_CASE("Dataset: attributes", "[dataset][short]") { +TEST_CASE("Cooler: dataset attributes", "[dataset][short]") { SECTION("read") { const auto path = datadir / "test_read_attrs.h5"; diff --git a/test/units/cooler/index_test.cpp b/test/units/cooler/index_test.cpp index 45091748..4a451e45 100644 --- a/test/units/cooler/index_test.cpp +++ b/test/units/cooler/index_test.cpp @@ -19,7 +19,7 @@ namespace hictk::cooler::test::index { const auto& datadir = hictk::test::datadir; // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Index: ctor", "[index][short]") { +TEST_CASE("Cooler: index ctor", "[index][short]") { constexpr std::uint32_t bin_size = 100; const auto bins = std::make_shared( Reference{Chromosome{0, "chr1", 10001}, Chromosome{1, "chr2", 5000}}, bin_size); @@ -41,7 +41,7 @@ TEST_CASE("Index: ctor", "[index][short]") { } // 
NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Index: offset setters and getters", "[index][short]") { +TEST_CASE("Cooler: index offset setters and getters", "[index][short]") { constexpr std::uint32_t bin_size = 10; const auto bins = std::make_shared(Reference{Chromosome{0, "chr1", 100}}, bin_size); @@ -96,7 +96,7 @@ TEST_CASE("Index: offset setters and getters", "[index][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Index: iterator", "[index][short]") { +TEST_CASE("Cooler: index iterator", "[index][short]") { constexpr std::uint32_t bin_size = 1000; const auto bins = std::make_shared( Reference{Chromosome{0, "chr1", 10001}, Chromosome{1, "chr2", 5000}}, bin_size); @@ -130,7 +130,7 @@ TEST_CASE("Index: iterator", "[index][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Index: validation", "[index][short]") { +TEST_CASE("Cooler: index validation", "[index][short]") { constexpr std::uint32_t bin_size = 1000; const auto bins = std::make_shared( Reference{Chromosome{0, "chr1", 10001}, Chromosome{1, "chr2", 5000}}, bin_size); @@ -168,7 +168,7 @@ TEST_CASE("Index: validation", "[index][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Index: compute chromosome offsets", "[index][short]") { +TEST_CASE("Cooler: index compute chromosome offsets", "[index][short]") { constexpr std::uint32_t bin_size = 1000; const auto bins = std::make_shared( Reference{Chromosome{0, "chr1", 10001}, Chromosome{1, "chr2", 5000}}, bin_size); diff --git a/test/units/cooler/pixel_selector_test.cpp b/test/units/cooler/pixel_selector_test.cpp index 203f3995..ab5f07f8 100644 --- a/test/units/cooler/pixel_selector_test.cpp +++ b/test/units/cooler/pixel_selector_test.cpp @@ -42,7 +42,7 @@ static std::ptrdiff_t generate_test_data(const std::filesystem::path& path, cons } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Pixel selector: 1D 
queries", "[pixel_selector][short]") { +TEST_CASE("Cooler: pixel selector 1D queries", "[pixel_selector][short]") { const auto path1 = testdir() / "pixel_selector_devel.cool"; const Reference chroms{Chromosome{0, "chr1", 1000}, Chromosome{1, "chr2", 100}}; @@ -212,7 +212,7 @@ TEST_CASE("Pixel selector: 1D queries", "[pixel_selector][short]") { } } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Pixel selector: 2D queries", "[pixel_selector][short]") { +TEST_CASE("Cooler: pixel selector 2D queries", "[pixel_selector][short]") { using T = std::uint32_t; const auto path = datadir / "cooler_test_file.cool"; auto f = File::open_read_only(path.string()); diff --git a/test/units/cooler/utils_equal_test.cpp b/test/units/cooler/utils_equal_test.cpp index 348ee680..6213744c 100644 --- a/test/units/cooler/utils_equal_test.cpp +++ b/test/units/cooler/utils_equal_test.cpp @@ -19,7 +19,7 @@ inline const auto& testdir = hictk::test::testdir; inline const auto& datadir = hictk::test::datadir; // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("utils: equal", "[equal][utils][short]") { +TEST_CASE("Cooler: utils equal", "[equal][utils][short]") { const auto path1 = datadir / "cooler_test_file.cool"; const auto path2 = datadir / "multires_cooler_test_file.mcool::/resolutions/6400000"; diff --git a/test/units/cooler/utils_merge_test.cpp b/test/units/cooler/utils_merge_test.cpp index 5d241d39..57e01189 100644 --- a/test/units/cooler/utils_merge_test.cpp +++ b/test/units/cooler/utils_merge_test.cpp @@ -19,7 +19,7 @@ inline const auto& testdir = hictk::test::testdir; inline const auto& datadir = hictk::test::datadir; // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("utils: merge", "[merge][utils][short]") { +TEST_CASE("Cooler: utils merge", "[merge][utils][long]") { const auto src = datadir / "cooler_test_file.cool"; const auto dest = testdir() / "cooler_merge_test1.cool"; diff --git a/test/units/hic/CMakeLists.txt 
b/test/units/hic/CMakeLists.txt index 83e9defc..326b463a 100644 --- a/test/units/hic/CMakeLists.txt +++ b/test/units/hic/CMakeLists.txt @@ -13,7 +13,7 @@ add_executable(hictk_hic_tests) target_sources( hictk_hic_tests PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/filestream_test.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/hic_file_stream_test.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/file_reader_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/hic_file_test.cpp ${CMAKE_CURRENT_SOURCE_DIR}/pixel_selector_test.cpp) diff --git a/test/units/hic/hic_file_stream_test.cpp b/test/units/hic/file_reader_test.cpp similarity index 98% rename from test/units/hic/hic_file_stream_test.cpp rename to test/units/hic/file_reader_test.cpp index 99c8ba54..7e687e3c 100644 --- a/test/units/hic/hic_file_stream_test.cpp +++ b/test/units/hic/file_reader_test.cpp @@ -2,6 +2,8 @@ // // SPDX-License-Identifier: MIT +#include "hictk/hic/file_reader.hpp" + #include #include #include @@ -9,8 +11,6 @@ #include #include -#include "hictk/hic/file_reader.hpp" - using namespace hictk::hic; namespace hictk::test { @@ -23,7 +23,7 @@ const auto pathV8 = (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); const auto pathV9 = (hictk::test::datadir / "4DNFIZ1ZVXC8.hic9").string(); // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("readHeader (v8)", "[hic][v8][short]") { +TEST_CASE("HiC: read header (v8)", "[hic][v8][short]") { constexpr std::array resolutions{2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 1000}; constexpr auto* genomeID = "dm6"; @@ -45,7 +45,7 @@ TEST_CASE("readHeader (v8)", "[hic][v8][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("readHeader (v9)", "[hic][v9][short]") { +TEST_CASE("HiC: read header (v9)", "[hic][v9][short]") { constexpr std::array resolutions{2500000, 1000000, 500000, 250000, 100000, 50000, 25000, 10000, 5000, 1000}; constexpr auto* genomeID = "dm6"; @@ -68,7 +68,7 @@ TEST_CASE("readHeader (v9)", "[hic][v9][short]") { } // 
NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("read_footer (v8)", "[hic][v8][short]") { +TEST_CASE("HiC: read footer (v8)", "[hic][v8][short]") { internal::HiCFileReader s(pathV8); const auto chr2L = s.header().chromosomes.at("chr2L"); const auto chr2R = s.header().chromosomes.at("chr2R"); @@ -192,7 +192,7 @@ TEST_CASE("read_footer (v8)", "[hic][v8][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("read_footer (v9)", "[hic][v9][short]") { +TEST_CASE("HiC: read footer (v9)", "[hic][v9][short]") { internal::HiCFileReader s(pathV9); const auto chr2L = s.header().chromosomes.at("chr2L"); const auto chr2R = s.header().chromosomes.at("chr2R"); diff --git a/test/units/hic/filestream_test.cpp b/test/units/hic/filestream_test.cpp index e1176bbc..a8d5a386 100644 --- a/test/units/hic/filestream_test.cpp +++ b/test/units/hic/filestream_test.cpp @@ -53,7 +53,7 @@ static std::vector read_file_by_line(const std::string& path_, char } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Local - ctor", "[hic][short]") { +TEST_CASE("HiC: filestream ctor", "[hic][short]") { SECTION("default") { const FileStream s{}; CHECK(s.url().empty()); @@ -71,7 +71,7 @@ TEST_CASE("Local - ctor", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Local - seek", "[hic][short]") { +TEST_CASE("HiC: filestream seek", "[hic][short]") { FileStream s(path_plaintext); { std::string buff; @@ -115,7 +115,7 @@ TEST_CASE("Local - seek", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Local - read", "[hic][short]") { +TEST_CASE("HiC: filestream read", "[hic][short]") { FileStream s(path_plaintext); std::string buffer{"garbage"}; @@ -160,7 +160,7 @@ TEST_CASE("Local - read", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Local - append", "[hic][short]") { +TEST_CASE("HiC: filestream append", 
"[hic][short]") { FileStream s(path_plaintext); std::string buffer; @@ -194,7 +194,7 @@ TEST_CASE("Local - append", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Local - getline", "[hic][short]") { +TEST_CASE("HiC: filestream getline", "[hic][short]") { FileStream s(path_plaintext); std::string buffer; @@ -237,7 +237,7 @@ TEST_CASE("Local - getline", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("Local - read binary", "[hic][short]") { +TEST_CASE("HiC: filestream read binary", "[hic][short]") { FileStream s(path_binary); s.seekg(10); diff --git a/test/units/hic/hic_file_test.cpp b/test/units/hic/hic_file_test.cpp index b0e9cdd1..9b20cf15 100644 --- a/test/units/hic/hic_file_test.cpp +++ b/test/units/hic/hic_file_test.cpp @@ -21,13 +21,13 @@ const auto pathV8 = (hictk::test::datadir / "4DNFIZ1ZVXC8.hic8").string(); const auto path_binary = (hictk::test::datadir / "data.zip").string(); // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("utils: is_hic_file", "[hic][short]") { +TEST_CASE("HiC: utils is_hic_file", "[hic][short]") { CHECK(utils::is_hic_file(pathV8)); CHECK_FALSE(utils::is_hic_file(path_binary)); } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("HiCFile accessors", "[hic][short]") { +TEST_CASE("HiC: file accessors", "[hic][short]") { const HiCFile f(pathV8, 1'000); CHECK(f.url() == pathV8); @@ -52,7 +52,7 @@ TEST_CASE("HiCFile accessors", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("HiCFile footer cache", "[hic][short]") { +TEST_CASE("HiC: footer cache", "[hic][short]") { HiCFile f(pathV8, 2'500'000); CHECK(f.num_cached_footers() == 0); @@ -82,7 +82,7 @@ TEST_CASE("HiCFile footer cache", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("HiCFile get_matrix_selector", "[hic][short]") { +TEST_CASE("HiC: fetch", "[hic][short]") { 
constexpr auto norm = NormalizationMethod::NONE; const HiCFile f(pathV8, 2'500'000, MatrixType::observed, MatrixUnit::BP); diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index 0e338e53..49bf5eba 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -83,7 +83,7 @@ static void compareContactRecord(const hictk::Pixel& r1, const SerializedPixe } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector accessors", "[hic][short]") { +TEST_CASE("HiC: pixel selector accessors", "[hic][short]") { const auto sel = HiCFile(pathV8, 2'500'000, MatrixType::observed, MatrixUnit::BP) .fetch("chr2L", NormalizationMethod::NONE); @@ -98,7 +98,7 @@ TEST_CASE("MatrixSelector accessors", "[hic][short]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][long]") { +TEST_CASE("HiC: pixel selector fetch (observed NONE BP 10000)", "[hic][long]") { SECTION("intra-chromosomal") { constexpr std::size_t expected_size = 1433133; constexpr std::int32_t expected_sum = 19968156; @@ -275,7 +275,7 @@ TEST_CASE("MatrixSelector fetch (observed NONE BP 10000)", "[hic][long]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (observed VC BP 10000)", "[hic][long]") { +TEST_CASE("HiC: pixel selector fetch (observed VC BP 10000)", "[hic][long]") { SECTION("intra-chromosomal") { constexpr std::size_t expected_size = 1433133; constexpr double expected_sum = 20391277.41514; @@ -316,7 +316,7 @@ TEST_CASE("MatrixSelector fetch (observed VC BP 10000)", "[hic][long]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (expected NONE BP 10000)", "[hic][long]") { +TEST_CASE("HiC: pixel selector fetch (expected NONE BP 10000)", "[hic][long]") { SECTION("intra-chromosomal") { constexpr std::size_t expected_size = 1433133; 
constexpr double expected_sum = 18314748.068024; @@ -357,7 +357,7 @@ TEST_CASE("MatrixSelector fetch (expected NONE BP 10000)", "[hic][long]") { } // NOLINTNEXTLINE(readability-function-cognitive-complexity) -TEST_CASE("MatrixSelector fetch (oe NONE BP 10000)", "[hic][long]") { +TEST_CASE("HiC: pixel selector fetch (oe NONE BP 10000)", "[hic][long]") { SECTION("intra-chromosomal") { constexpr std::size_t expected_size = 1433133; constexpr double expected_sum = 2785506.2274201; From cefd136c0d85aa14742d154f0b9c3a86070f4c24 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Thu, 15 Jun 2023 19:43:01 +0200 Subject: [PATCH 25/48] Update tests [ci full] --- test/units/hic/file_reader_test.cpp | 15 --------------- test/units/hic/hic_file_test.cpp | 18 +++++------------- 2 files changed, 5 insertions(+), 28 deletions(-) diff --git a/test/units/hic/file_reader_test.cpp b/test/units/hic/file_reader_test.cpp index 7e687e3c..fd5661a4 100644 --- a/test/units/hic/file_reader_test.cpp +++ b/test/units/hic/file_reader_test.cpp @@ -246,21 +246,6 @@ TEST_CASE("HiC: read footer (v9)", "[hic][v9][short]") { CHECK(f.expectedValues().empty()); } - /* TODO: for some reason KR normalization is missing - SECTION("observed KR BP 5000") { - const auto f = s.read_footer(chr2L.id(), chr2R.id(), MatrixType::observed, - NormalizationMethod::KR, MatrixUnit::BP, 5000); - - CHECK(f.matrix_type() == MatrixType::observed); - CHECK(f.normalization() == NormalizationMethod::KR); - CHECK(f.unit() == MatrixUnit::BP); - CHECK(f.resolution() == 5000); - CHECK(f.fileOffset() == 11625116); - CHECK(f.c1Norm().size() == 4703); - CHECK(f.c2Norm().size() == 5058); - CHECK(f._expectedValues().empty()); - } */ - SECTION("observed SCALE BP 5000") { const auto f = s.read_footer(chr2L.id(), chr2R.id(), MatrixType::observed, NormalizationMethod::SCALE, MatrixUnit::BP, 5000); diff --git a/test/units/hic/hic_file_test.cpp b/test/units/hic/hic_file_test.cpp index 
9b20cf15..3e72c4fc 100644 --- a/test/units/hic/hic_file_test.cpp +++ b/test/units/hic/hic_file_test.cpp @@ -101,32 +101,24 @@ TEST_CASE("HiC: fetch", "[hic][short]") { CHECK(sel.chrom2() == chrom2); } - /* SECTION("valid, but empty matrix") { - // TODO: fixme auto sel = f.fetch("chrM", norm); - std::vector> buff{}; - // sel.fetch(buff); - // CHECK(buff.empty()); + const auto buff = sel.read_all(); + CHECK(buff.empty()); } - */ SECTION("invalid chromosome") { CHECK_THROWS(f.fetch("not-a-chromosome", norm)); CHECK_THROWS(f.fetch("chr2L", "not-a-chromosome", norm)); } - /* SECTION("malformed") { - // TODO: update - CHECK_THROWS(f.get_matrix_selector(chrom2, chrom1, norm)); // NOLINT + CHECK_THROWS(f.fetch(chrom2, chrom1, norm)); // NOLINT CHECK_THROWS(HiCFile(pathV8, f.resolution(), MatrixType::expected, MatrixUnit::BP) - .get_matrix_selector(chrom1, NormalizationMethod::VC)); + .fetch(chrom1, NormalizationMethod::VC)); // Matrix does not have contacts for fragments CHECK_THROWS(HiCFile(pathV8, f.resolution(), MatrixType::observed, MatrixUnit::FRAG) - .get_matrix_selector(chrom1, norm)); + .fetch(chrom1, norm)); } - CHECK_THROWS(f.fetch(chrom1.id(), 999, norm)); - */ } From f76f8ba85be19e8a717a2b1dbd61629af58c8174 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 16 Jun 2023 10:38:57 +0200 Subject: [PATCH 26/48] Refactor PixelMerger to make it more general --- src/cooler/include/hictk/cooler/utils.hpp | 40 ----- src/cooler/utils_merge_impl.hpp | 173 +++++++++----------- src/pixel/include/hictk/pixel.hpp | 37 +++++ src/pixel/pixel_impl.hpp | 187 +++++++++------------- 4 files changed, 183 insertions(+), 254 deletions(-) diff --git a/src/cooler/include/hictk/cooler/utils.hpp b/src/cooler/include/hictk/cooler/utils.hpp index b964fe49..3a9b7c68 100644 --- a/src/cooler/include/hictk/cooler/utils.hpp +++ b/src/cooler/include/hictk/cooler/utils.hpp @@ -5,7 +5,6 @@ #pragma once #include -#include #include #include #include 
@@ -14,8 +13,6 @@ namespace hictk::cooler::utils { -enum class MergeStrategy { IN_MEMORY, PQUEUE }; - /// Iterable of hictk::File or strings template void merge(Str first_file, Str last_file, std::string_view dest_uri, @@ -25,43 +22,6 @@ void merge(Str first_file, Str last_file, std::string_view dest_uri, bool ignore_attributes = true); [[nodiscard]] bool equal(const File& clr1, const File& clr2, bool ignore_attributes = true); -namespace internal { - -/// This class is basically a wrapper around a priority queue of objects of type Node -/// Node consist of a pixel and an index. The index represent from which iterator (i.e. file) the -/// pixel was read. This allows us to know from which iterator we should read the next pixel (i.e. -/// the same iterator from which the top pixel originated) -template -class PixelMerger { - struct Node { - Pixel pixel{}; - std::size_t i{}; - - bool operator<(const Node& other) const noexcept; - bool operator>(const Node& other) const noexcept; - bool operator==(const Node& other) const noexcept; - bool operator!=(const Node& other) const noexcept; - }; - - std::vector> _buffer{}; - std::priority_queue, std::greater<>> _pqueue{}; - using PixelIt = decltype(std::declval().begin()); - - std::vector _heads{}; - std::vector _tails{}; - - public: - PixelMerger() = delete; - explicit PixelMerger(const std::vector& input_coolers); - template - PixelMerger(FileIt first_file, FileIt last_file); - void merge(File& clr, std::size_t queue_capacity, bool quiet = true); - - private: - void replace_top_node(std::size_t i); - [[nodiscard]] Pixel next(); -}; -} // namespace internal } // namespace hictk::cooler::utils #include "../../../utils_equal_impl.hpp" diff --git a/src/cooler/utils_merge_impl.hpp b/src/cooler/utils_merge_impl.hpp index b14b2331..3899d101 100644 --- a/src/cooler/utils_merge_impl.hpp +++ b/src/cooler/utils_merge_impl.hpp @@ -14,104 +14,6 @@ namespace hictk::cooler::utils { namespace internal { -template -inline bool 
PixelMerger::Node::operator<(const Node& other) const noexcept { - assert(!!this->pixel); - assert(!!other.pixel); - return this->pixel.coords < other.pixel.coords; -} - -template -inline bool PixelMerger::Node::operator>(const Node& other) const noexcept { - assert(!!this->pixel); - assert(!!other.pixel); - return this->pixel.coords > other.pixel.coords; -} - -template -inline bool PixelMerger::Node::operator==(const Node& other) const noexcept { - return this->pixel.coords == other.pixel.coords; -} - -template -inline bool PixelMerger::Node::operator!=(const Node& other) const noexcept { - return !(*this == other); -} - -template -inline PixelMerger::PixelMerger(const std::vector& input_coolers) - : PixelMerger(input_coolers.begin(), input_coolers.end()) {} - -template -template -inline PixelMerger::PixelMerger(FileIt first_file, FileIt last_file) { - std::for_each(first_file, last_file, [&](const auto& clr) { - auto first = clr.template begin(); - auto last = clr.template end(); - if (first != last) { - auto pixel = *first++; - _heads.emplace_back(std::move(first)); - _tails.emplace_back(std::move(last)); - _pqueue.emplace(Node{std::move(pixel), _pqueue.size()}); - } - }); -} - -template -inline void PixelMerger::merge(File& clr, std::size_t queue_capacity, bool quiet) { - this->_buffer.clear(); - this->_buffer.reserve((std::max)(queue_capacity, this->_buffer.capacity())); - - std::size_t pixels_processed{}; - while (true) { - auto pixel = this->next(); - if (!pixel) { - break; - } - this->_buffer.emplace_back(std::move(pixel)); - if (this->_buffer.size() == queue_capacity) { - clr.append_pixels(this->_buffer.begin(), this->_buffer.end()); - pixels_processed += this->_buffer.size(); - if (!quiet && pixels_processed % (std::max)(queue_capacity, std::size_t(1'000'000)) == 0) { - fmt::print(stderr, FMT_STRING("Procesed {}M pixels...\n"), pixels_processed / 1'000'000); - } - this->_buffer.clear(); - } - } - - if (!this->_buffer.empty()) { - 
clr.append_pixels(this->_buffer.begin(), this->_buffer.end()); - } -} - -template -inline void PixelMerger::replace_top_node(std::size_t i) { - assert(this->_pqueue.top().i == i); - this->_pqueue.pop(); - if (auto& it = this->_heads[i]; it != this->_tails[i]) { - this->_pqueue.emplace(Node{*it++, i}); - } -} - -template -inline Pixel PixelMerger::next() { - if (this->_pqueue.empty()) { - return {}; - } - - auto current_node = this->_pqueue.top(); - this->replace_top_node(current_node.i); - - while (!this->_pqueue.empty()) { - const auto next_node = this->_pqueue.top(); - if (next_node != current_node) { - break; - } - current_node.pixel.count += next_node.pixel.count; - this->replace_top_node(next_node.i); - } - return current_node.pixel; -} [[nodiscard]] inline std::uint32_t get_bin_size_checked(const std::vector& coolers) { assert(coolers.size() > 1); @@ -153,6 +55,75 @@ inline Pixel PixelMerger::next() { return false; } +template +inline void merge(const std::vector::iterator>& heads, + const std::vector::iterator>& tails, File& dest, + std::size_t queue_capacity, bool quiet) { + hictk::internal::PixelMerger merger{heads, tails}; + + std::vector> buffer(queue_capacity); + buffer.clear(); + + std::size_t pixels_processed{}; + while (true) { + auto pixel = merger.next(); + if (!pixel) { + break; + } + + buffer.emplace_back(std::move(pixel)); + if (buffer.size() == queue_capacity) { + dest.append_pixels(buffer.begin(), buffer.end()); + pixels_processed += buffer.size(); + if (!quiet && pixels_processed % (std::max)(queue_capacity, std::size_t(1'000'000)) == 0) { + fmt::print(stderr, FMT_STRING("Procesed {}M pixels...\n"), pixels_processed / 1'000'000); + } + } + } + + if (!buffer.empty()) { + dest.append_pixels(buffer.begin(), buffer.end()); + } +} + +template +struct CoolerIteratorPairs { + std::vector::iterator> heads{}; + std::vector::iterator> tails{}; +}; + +template +inline CoolerIteratorPairs collect_iterators(const std::vector& clrs) { + if constexpr 
(std::is_floating_point_v) { + std::vector::iterator> heads{}; + std::vector::iterator> tails{}; + + for (const auto& clr : clrs) { + auto first = clr.begin(); + auto last = clr.end(); + if (first != last) { + heads.emplace_back(std::move(first)); + tails.emplace_back(std::move(last)); + } + } + + return {heads, tails}; + } else { + std::vector::iterator> heads{}; + std::vector::iterator> tails{}; + + for (const auto& clr : clrs) { + auto first = clr.begin(); + auto last = clr.end(); + if (first != last) { + heads.emplace_back(std::move(first)); + tails.emplace_back(std::move(last)); + } + } + return {heads, tails}; + } +} + } // namespace internal template @@ -179,9 +150,11 @@ inline void merge(Str first_file, Str last_file, std::string_view dest_uri, : File::create_new_cooler(dest_uri, chroms, bin_size, overwrite_if_exists); try { if (float_pixels) { - internal::PixelMerger(clrs).merge(dest, chunk_size, quiet); + auto [heads, tails] = internal::collect_iterators(clrs); + internal::merge(heads, tails, dest, chunk_size, quiet); } else { - internal::PixelMerger(clrs).merge(dest, chunk_size, quiet); + auto [heads, tails] = internal::collect_iterators(clrs); + internal::merge(heads, tails, dest, chunk_size, quiet); } } catch (const std::exception& e) { throw std::runtime_error( diff --git a/src/pixel/include/hictk/pixel.hpp b/src/pixel/include/hictk/pixel.hpp index bd2d956a..cabe3526 100644 --- a/src/pixel/include/hictk/pixel.hpp +++ b/src/pixel/include/hictk/pixel.hpp @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -64,6 +65,42 @@ struct Pixel { [[nodiscard]] bool operator>=(const Pixel &other) const noexcept; }; +namespace internal { + +/// This class is basically a wrapper around a priority queue of objects of type Node +/// Node consist of a pixel and an index. The index represent from which iterator the +/// pixel was read. This allows us to know from which iterator we should read the next pixel (i.e. 
+/// the same iterator from which the top pixel originated) +template +class PixelMerger { + using N = decltype(std::declval()->count); + struct Node { + Pixel pixel{}; // NOLINT + std::size_t i{}; // NOLINT + + bool operator<(const Node &other) const noexcept; + bool operator>(const Node &other) const noexcept; + bool operator==(const Node &other) const noexcept; + bool operator!=(const Node &other) const noexcept; + }; + + std::vector> _buffer{}; + std::priority_queue, std::greater<>> _pqueue{}; + + std::vector _heads{}; + std::vector _tails{}; + + public: + PixelMerger() = delete; + PixelMerger(const std::vector &head, const std::vector &tail); + PixelMerger(PixelIt head_first, PixelIt head_last, PixelIt tail_first); + [[nodiscard]] auto next() -> Pixel; + + private: + void replace_top_node(std::size_t i); +}; +} // namespace internal + } // namespace hictk #include "../../pixel_impl.hpp" diff --git a/src/pixel/pixel_impl.hpp b/src/pixel/pixel_impl.hpp index 5156fa42..a7a73cab 100644 --- a/src/pixel/pixel_impl.hpp +++ b/src/pixel/pixel_impl.hpp @@ -10,120 +10,6 @@ #include "hictk/chromosome.hpp" namespace hictk { -/* -inline PixelCoordinates::PixelCoordinates(const std::shared_ptr &bins, - const Chromosome &chrom1, const Chromosome &chrom2, - std::uint32_t bin1_start_, std::uint32_t bin2_start_) - : PixelCoordinates(bins, bins->chromosomes().get_id(chrom1), bins->chromosomes().get_id(chrom2), - bin1_start_, bin2_start_) {} - -inline PixelCoordinates::PixelCoordinates(const std::shared_ptr &bins, - std::string_view chrom1_name, - std::string_view chrom2_name, std::uint32_t bin1_start_, - std::uint32_t bin2_start_) - : PixelCoordinates(bins, bins->chromosomes().get_id(chrom1_name), - bins->chromosomes().get_id(chrom2_name), bin1_start_, bin2_start_) {} - -inline PixelCoordinates::PixelCoordinates(const std::shared_ptr &bins, - std::uint32_t chrom1_id_, std::uint32_t chrom2_id_, - std::uint32_t bin1_start_, std::uint32_t bin2_start_) - : PixelCoordinates(bins, 
bins->map_to_bin_id(chrom1_id_, bin1_start_), - bins->map_to_bin_id(chrom2_id_, bin2_start_)) {} - -inline PixelCoordinates::PixelCoordinates(const std::shared_ptr &bins, - const Chromosome &chrom, std::uint32_t bin1_start_, - std::uint32_t bin2_start_) - : PixelCoordinates(bins, chrom, chrom, bin1_start_, bin2_start_) {} - -inline PixelCoordinates::PixelCoordinates(const std::shared_ptr &bins, - std::uint32_t chrom_id, std::uint32_t bin1_start_, - std::uint32_t bin2_start_) - : PixelCoordinates(bins, chrom_id, chrom_id, bin1_start_, bin2_start_) {} - -inline PixelCoordinates::PixelCoordinates(const std::shared_ptr &bins, - std::string_view chrom_name, std::uint32_t bin1_start_, - std::uint32_t bin2_start_) - : PixelCoordinates(bins, chrom_name, chrom_name, bin1_start_, bin2_start_) {} - -inline PixelCoordinates::PixelCoordinates(std::shared_ptr bins, - std::uint64_t bin1_id_, std::uint64_t bin2_id_) - : _bins(std::move(bins)), _bin1_id(bin1_id_), _bin2_id(bin2_id_) { - assert(_bin1_id <= _bins->size()); - assert(_bin2_id <= _bins->size()); -} - -inline PixelCoordinates::operator bool() const noexcept { return !!this->_bins; } - -inline const Chromosome &PixelCoordinates::chrom1() const { return this->bin1().chrom; } - -inline const Chromosome &PixelCoordinates::chrom2() const { return this->bin2().chrom; } - -inline std::uint32_t PixelCoordinates::bin1.chrom().id() const { - return this->_bins->chromosomes().get_id(this->chrom1()); -} - -inline std::uint32_t PixelCoordinates::bin2.chrom().id() const { - return this->_bins->chromosomes().get_id(this->chrom2()); -} - -inline GenomicInterval PixelCoordinates::bin1() const { - assert(this->_bins); - assert(!!*this); - - return this->_bins->bin_id_to_coords(_bin1_id); -} - -inline GenomicInterval PixelCoordinates::bin2() const { - assert(this->_bins); - assert(!!*this); - - return this->_bins->at(_bin2_id); -} - -inline std::uint64_t PixelCoordinates::bin1_id() const noexcept { return this->_bin1_id; } -inline 
std::uint64_t PixelCoordinates::bin2_id() const noexcept { return this->_bin2_id; } - -inline std::uint32_t PixelCoordinates::bin_size() const noexcept { - assert(this->_bins); - return this->_bins->bin_size(); -} - -constexpr bool PixelCoordinates::operator==(const PixelCoordinates &other) const noexcept { - return this->_bin1_id == other._bin1_id && this->_bin2_id == other._bin2_id; -} - -constexpr bool PixelCoordinates::operator!=(const PixelCoordinates &other) const noexcept { - return !(*this == other); -} - -constexpr bool PixelCoordinates::operator<(const PixelCoordinates &other) const noexcept { - if (this->_bin1_id == other._bin1_id) { - return this->_bin2_id < other._bin2_id; - } - return this->_bin1_id < other._bin1_id; -} - -constexpr bool PixelCoordinates::operator<=(const PixelCoordinates &other) const noexcept { - if (this->_bin1_id == other._bin1_id) { - return this->_bin2_id <= other._bin2_id; - } - return this->_bin1_id <= other._bin1_id; -} - -constexpr bool PixelCoordinates::operator>(const PixelCoordinates &other) const noexcept { - if (this->_bin1_id == other._bin1_id) { - return this->_bin2_id > other._bin2_id; - } - return this->_bin1_id > other._bin1_id; -} - -constexpr bool PixelCoordinates::operator>=(const PixelCoordinates &other) const noexcept { - if (this->_bin1_id == other._bin1_id) { - return this->_bin2_id >= other._bin2_id; - } - return this->_bin1_id >= other._bin1_id; -} -*/ inline PixelCoordinates::PixelCoordinates(Bin bin1_, Bin bin2_) noexcept : bin1(std::move(bin1_)), bin2(std::move(bin2_)) {} @@ -245,4 +131,77 @@ inline bool Pixel::operator>=(const Pixel &other) const noexcept { } return this->coords >= other.coords; } + +namespace internal { +template +inline bool PixelMerger::Node::operator<(const Node &other) const noexcept { + assert(!!this->pixel); + assert(!!other.pixel); + return this->pixel.coords < other.pixel.coords; +} + +template +inline bool PixelMerger::Node::operator>(const Node &other) const noexcept { + 
assert(!!this->pixel); + assert(!!other.pixel); + return this->pixel.coords > other.pixel.coords; +} + +template +inline bool PixelMerger::Node::operator==(const Node &other) const noexcept { + return this->pixel.coords == other.pixel.coords; +} + +template +inline bool PixelMerger::Node::operator!=(const Node &other) const noexcept { + return !(*this == other); +} + +template +inline PixelMerger::PixelMerger(const std::vector &heads, + const std::vector &tails) { + assert(heads.size() == tails.size()); + auto tail = tails.begin(); + std::for_each(heads.begin(), heads.end(), [&](const auto &it) { + auto first = it; + auto last = *tail++; + if (first != last) { + auto pixel = *first++; + _heads.emplace_back(std::move(first)); + _tails.emplace_back(std::move(last)); + _pqueue.emplace(Node{std::move(pixel), _pqueue.size()}); + } + }); +} + +template +inline void PixelMerger::replace_top_node(std::size_t i) { + assert(this->_pqueue.top().i == i); + this->_pqueue.pop(); + if (auto &it = this->_heads[i]; it != this->_tails[i]) { + this->_pqueue.emplace(Node{*it++, i}); + } +} + +template +inline auto PixelMerger::next() -> Pixel { + if (this->_pqueue.empty()) { + return {}; + } + + auto current_node = this->_pqueue.top(); + this->replace_top_node(current_node.i); + + while (!this->_pqueue.empty()) { + const auto next_node = this->_pqueue.top(); + if (next_node != current_node) { + break; + } + current_node.pixel.count += next_node.pixel.count; + this->replace_top_node(next_node.i); + } + return current_node.pixel; +} +} // namespace internal + } // namespace hictk From 39e88617778fcad18b53d8934e46f46e9cf83df6 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 16 Jun 2023 13:38:08 +0200 Subject: [PATCH 27/48] Update LRU block cache to work when multiple chromosomes are fetched at the same time --- src/hic/block_cache_impl.hpp | 27 ++++++++++++++++++---- src/hic/block_reader_impl.hpp | 9 +++++--- 
src/hic/include/hictk/hic/block_cache.hpp | 26 +++++++++++++++++---- src/hic/include/hictk/hic/block_reader.hpp | 4 +++- src/hic/pixel_selector_impl.hpp | 6 ++--- 5 files changed, 56 insertions(+), 16 deletions(-) diff --git a/src/hic/block_cache_impl.hpp b/src/hic/block_cache_impl.hpp index 7855189d..b2e39b5a 100644 --- a/src/hic/block_cache_impl.hpp +++ b/src/hic/block_cache_impl.hpp @@ -13,6 +13,7 @@ #include #include +#include "hictk/hash.hpp" #include "hictk/hic/footer.hpp" namespace hictk::hic::internal { @@ -111,6 +112,16 @@ inline std::size_t InteractionBlock::size_in_bytes() const noexcept { return sizeof(ThinPixel) * size(); } +inline std::size_t BlockLRUCache::KeyHasher::operator()( + const BlockLRUCache::Key &k) const noexcept { + return hictk::internal::hash_combine(0, k.chrom1_id, k.chrom2_id, k.id); +} + +constexpr bool BlockLRUCache::Key::operator==( + const hictk::hic::internal::BlockLRUCache::Key &other) const noexcept { + return chrom1_id == other.chrom1_id && chrom2_id == other.chrom2_id && id == other.id; +} + inline BlockLRUCache::BlockLRUCache(std::size_t max_size_in_bytes) : _max_size_bytes(max_size_in_bytes) { if (_max_size_bytes == 0) { @@ -140,7 +151,9 @@ inline auto BlockLRUCache::end() const noexcept -> const_iterator { return _cach inline auto BlockLRUCache::cend() const noexcept -> const_iterator { return end(); } -inline auto BlockLRUCache::find(key_t key) -> iterator { +inline auto BlockLRUCache::find(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t block_id) + -> iterator { + const Key key{chrom1_id, chrom2_id, block_id}; auto it = _cache.find(key); if (it == end()) { _misses++; @@ -164,7 +177,9 @@ inline void BlockLRUCache::erase(iterator it) { std::ignore = _cache.erase(it); } -inline auto BlockLRUCache::emplace(key_t key, mapped_type &&block) -> std::pair { +inline auto BlockLRUCache::emplace(std::size_t chrom1_id, std::size_t chrom2_id, + std::size_t block_id, mapped_type &&block) + -> std::pair { assert(block); while 
(size() != 0 && size_in_bytes() + block->size_in_bytes() > max_size_in_bytes()) { @@ -172,12 +187,14 @@ inline auto BlockLRUCache::emplace(key_t key, mapped_type &&block) -> std::pair< } _current_size_bytes += block->size_in_bytes(); - return _cache.emplace(key, std::move(block)); + return _cache.emplace(Key{chrom1_id, chrom2_id, block_id}, std::move(block)); } -inline auto BlockLRUCache::emplace(key_t key, InteractionBlock &&block) +inline auto BlockLRUCache::emplace(std::size_t chrom1_id, std::size_t chrom2_id, + std::size_t block_id, InteractionBlock &&block) -> std::pair { - return emplace(key, std::make_shared(std::move(block))); + return emplace(chrom1_id, chrom2_id, block_id, + std::make_shared(std::move(block))); } constexpr double BlockLRUCache::hit_rate() const noexcept { diff --git a/src/hic/block_reader_impl.hpp b/src/hic/block_reader_impl.hpp index a70f87fd..73aa8631 100644 --- a/src/hic/block_reader_impl.hpp +++ b/src/hic/block_reader_impl.hpp @@ -74,14 +74,16 @@ inline Index HiCBlockReader::read_index(HiCFileReader &hfs, const HiCFooter &foo footer.resolution()); } -inline std::shared_ptr HiCBlockReader::read(const BlockIndex &idx) { +inline std::shared_ptr HiCBlockReader::read(const Chromosome &chrom1, + const Chromosome &chrom2, + const BlockIndex &idx) { if (!idx) { return {nullptr}; } assert(_blk_cache); assert(_bins); - if (auto it = _blk_cache->find(idx.id()); it != _blk_cache->end()) { + if (auto it = _blk_cache->find(chrom1.id(), chrom2.id(), idx.id()); it != _blk_cache->end()) { return it->second; } @@ -135,7 +137,8 @@ inline std::shared_ptr HiCBlockReader::read(const BlockI HICTK_UNREACHABLE_CODE; } - auto it = _blk_cache->emplace(idx.id(), InteractionBlock{idx.id(), _tmp_buffer}); + auto it = _blk_cache->emplace(chrom1.id(), chrom2.id(), idx.id(), + InteractionBlock{idx.id(), _tmp_buffer}); return it.first->second; } diff --git a/src/hic/include/hictk/hic/block_cache.hpp b/src/hic/include/hictk/hic/block_cache.hpp index 0118b73e..ff71b055 
100644 --- a/src/hic/include/hictk/hic/block_cache.hpp +++ b/src/hic/include/hictk/hic/block_cache.hpp @@ -76,7 +76,22 @@ class InteractionBlock { }; class BlockLRUCache { - using MapT = tsl::ordered_map>; + public: + struct Key { + std::size_t chrom1_id; // NOLINT + std::size_t chrom2_id; // NOLINT + std::size_t id; // NOLINT + + [[nodiscard]] constexpr bool operator==(const Key& other) const noexcept; + }; + + private: + struct KeyHasher { + [[nodiscard]] std::size_t operator()(const Key& k) const noexcept; + }; + + using MapT = + tsl::ordered_map, KeyHasher, std::equal_to<>>; using key_t = MapT::key_type; using mapped_type = MapT::mapped_type; using iterator = MapT::iterator; @@ -106,10 +121,13 @@ class BlockLRUCache { [[nodiscard]] auto end() const noexcept -> const_iterator; [[nodiscard]] auto cend() const noexcept -> const_iterator; - [[nodiscard]] auto find(key_t key) -> iterator; + [[nodiscard]] auto find(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t block_id) + -> iterator; - auto emplace(key_t key, mapped_type&& block) -> std::pair; - auto emplace(key_t key, InteractionBlock&& block) -> std::pair; + auto emplace(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t block_id, + mapped_type&& block) -> std::pair; + auto emplace(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t block_id, + InteractionBlock&& block) -> std::pair; [[nodiscard]] constexpr double hit_rate() const noexcept; [[nodiscard]] constexpr std::size_t hits() const noexcept; diff --git a/src/hic/include/hictk/hic/block_reader.hpp b/src/hic/include/hictk/hic/block_reader.hpp index ff8a9170..3d5556fe 100644 --- a/src/hic/include/hictk/hic/block_reader.hpp +++ b/src/hic/include/hictk/hic/block_reader.hpp @@ -60,7 +60,9 @@ class HiCBlockReader { [[nodiscard]] double sum() const noexcept; [[nodiscard]] double avg() const; - [[nodiscard]] std::shared_ptr read(const BlockIndex& idx); + [[nodiscard]] std::shared_ptr read(const Chromosome& chrom1, + const Chromosome& chrom2, 
+ const BlockIndex& idx); private: [[nodiscard]] static Index read_index(HiCFileReader& hfs, const HiCFooter& footer); diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index 4d221068..032da678 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -280,10 +280,10 @@ inline void PixelSelector::iterator::read_next_row() { _buffer->clear(); _buffer_i = 0; const auto bin_size = bins().bin_size(); - const auto bin1 = - bins().at(coord1().bin1.chrom(), static_cast(_bin1_id) * bin_size); + const auto &chrom1 = coord1().bin1.chrom(); + const auto bin1 = bins().at(chrom1, static_cast(_bin1_id) * bin_size); for (const auto block_idx : blocks) { - const auto blk = _sel->_reader.read(block_idx); + const auto blk = _sel->_reader.read(chrom1, coord2().bin1.chrom(), block_idx); const auto match = blk->find(_bin1_id); if (match == blk->end()) { continue; From 6f4a591c6fd6c32e1e3492a3532be449d9acbb60 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 16 Jun 2023 13:38:53 +0200 Subject: [PATCH 28/48] Various minor changes to HiCFile and PixelMerger --- src/hic/file_reader_impl.hpp | 5 +-- src/hic/hic_file_impl.hpp | 54 ++++++++----------------------- src/hic/include/hictk/hic.hpp | 3 +- src/pixel/include/hictk/pixel.hpp | 3 +- src/pixel/pixel_impl.hpp | 15 ++++----- 5 files changed, 23 insertions(+), 57 deletions(-) diff --git a/src/hic/file_reader_impl.hpp b/src/hic/file_reader_impl.hpp index cb6ecff2..cedc2429 100644 --- a/src/hic/file_reader_impl.hpp +++ b/src/hic/file_reader_impl.hpp @@ -395,10 +395,7 @@ inline HiCFooter HiCFileReader::read_footer(std::uint32_t chrom1_id, std::uint32 } } if (metadata.fileOffset == -1) { - throw std::runtime_error(fmt::format( - FMT_STRING("unable to find interactions for {}:{} at {} ({}): unable to read file offset"), - _header->chromosomes.at(chrom1_id).name(), _header->chromosomes.at(chrom2_id).name(), - wanted_resolution, 
wanted_unit)); + return HiCFooter{Index{}, std::move(metadata)}; } const auto file_offset = _fs->tellg(); diff --git a/src/hic/hic_file_impl.hpp b/src/hic/hic_file_impl.hpp index 07e32bcf..9db704da 100644 --- a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -23,7 +23,7 @@ inline HiCFile::HiCFile(std::string url_, std::uint32_t resolution_, MatrixType _type(type_), _unit(unit_), _block_cache(std::make_shared(block_cache_capacity)), - _bins(std::make_shared(chromosomes(), resolution_)) { + _bins(std::make_shared(_fs->header().chromosomes, resolution_)) { assert(block_cache_capacity != 0); if (!has_resolution(resolution())) { throw std::runtime_error(fmt::format( @@ -46,7 +46,11 @@ inline const std::string& HiCFile::name() const noexcept { return url(); } inline std::int32_t HiCFile::version() const noexcept { return _fs->version(); } -inline const Reference& HiCFile::chromosomes() const noexcept { return _fs->header().chromosomes; } +inline const BinTable& HiCFile::bins() const noexcept { + assert(_bins); + return *_bins; +} +inline const Reference& HiCFile::chromosomes() const noexcept { return bins().chromosomes(); } inline const std::string& HiCFile::assembly() const noexcept { return _fs->header().genomeID; } @@ -94,12 +98,12 @@ inline PixelSelector HiCFile::fetch(std::string_view chrom_name, std::uint32_t s inline PixelSelector HiCFile::fetch(std::string_view range1, std::string_view range2, NormalizationMethod norm, QUERY_TYPE query_type) const { const auto gi1 = query_type == QUERY_TYPE::BED - ? GenomicInterval::parse_bed(this->chromosomes(), range1) - : GenomicInterval::parse_ucsc(this->chromosomes(), std::string{range1}); + ? GenomicInterval::parse_bed(chromosomes(), range1) + : GenomicInterval::parse_ucsc(chromosomes(), std::string{range1}); const auto gi2 = query_type == QUERY_TYPE::BED - ? GenomicInterval::parse_bed(this->chromosomes(), range2) - : GenomicInterval::parse_ucsc(this->chromosomes(), std::string{range2}); + ? 
GenomicInterval::parse_bed(chromosomes(), range2) + : GenomicInterval::parse_ucsc(chromosomes(), std::string{range2}); return this->fetch(gi1.chrom(), gi1.start(), gi1.end(), gi2.chrom(), gi2.start(), gi2.end(), norm); @@ -130,41 +134,9 @@ inline PixelSelector HiCFile::fetch(const Chromosome& chrom1, std::uint32_t star const PixelCoordinates coord1 = {_bins->at(chrom1, start1), _bins->at(chrom1, end1 - 1)}; const PixelCoordinates coord2 = {_bins->at(chrom2, start2), _bins->at(chrom2, end2 - 1)}; - auto footer = [&]() { - try { - return get_footer(chrom1.id(), chrom2.id(), _type, norm, _unit, resolution()); - } catch (const std::exception& e) { - // Check whether query is valid but there are no interactions for the given chromosome - // pair - const auto missing_footer = - std::string_view{e.what()}.find("unable to read file offset") == std::string_view::npos; - if (missing_footer) { - throw; - } - - internal::HiCFooterMetadata metadata{url(), _type, norm, _unit, - resolution(), chrom1, chrom2, -1}; - - if (metadata.fileOffset == -1) { - return std::make_shared(internal::Index{metadata.chrom1, - metadata.chrom2, - metadata.unit, - metadata.resolution, - _fs->version(), - 1, - 1, - 0, - {}}, - std::move(metadata)); - } - return std::make_shared( - _fs->read_index(metadata.fileOffset, metadata.chrom1, metadata.chrom2, metadata.unit, - metadata.resolution), - std::move(metadata)); - } - }(); - - return PixelSelector{_fs, footer, _block_cache, _bins, coord1, coord2}; + return {_fs, get_footer(chrom1, chrom2, _type, norm, _unit, resolution()), + _block_cache, _bins, + coord1, coord2}; } inline std::size_t HiCFile::num_cached_footers() const noexcept { return _footers.size(); } diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index 3306fca5..96c87527 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -47,6 +47,7 @@ class HiCFile { [[nodiscard]] const std::string &name() const noexcept; [[nodiscard]] std::int32_t 
version() const noexcept; [[nodiscard]] const Reference &chromosomes() const noexcept; + [[nodiscard]] const BinTable &bins() const noexcept; [[nodiscard]] const std::string &assembly() const noexcept; [[nodiscard]] const std::vector &avail_resolutions() const noexcept; [[nodiscard]] std::uint32_t resolution() const noexcept; @@ -73,7 +74,7 @@ class HiCFile { private: [[nodiscard]] std::shared_ptr get_footer( - std::uint32_t chrom1_id, std::uint32_t chrom2_id, MatrixType matrix_type, + const Chromosome &chrom1, const Chromosome &chrom2, MatrixType matrix_type, NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution) const; [[nodiscard]] PixelSelector fetch(const Chromosome &chrom1, std::uint32_t start1, diff --git a/src/pixel/include/hictk/pixel.hpp b/src/pixel/include/hictk/pixel.hpp index cabe3526..e1e0220f 100644 --- a/src/pixel/include/hictk/pixel.hpp +++ b/src/pixel/include/hictk/pixel.hpp @@ -92,8 +92,7 @@ class PixelMerger { public: PixelMerger() = delete; - PixelMerger(const std::vector &head, const std::vector &tail); - PixelMerger(PixelIt head_first, PixelIt head_last, PixelIt tail_first); + PixelMerger(std::vector head, std::vector tail); [[nodiscard]] auto next() -> Pixel; private: diff --git a/src/pixel/pixel_impl.hpp b/src/pixel/pixel_impl.hpp index a7a73cab..37676567 100644 --- a/src/pixel/pixel_impl.hpp +++ b/src/pixel/pixel_impl.hpp @@ -158,20 +158,17 @@ inline bool PixelMerger::Node::operator!=(const Node &other) const noex } template -inline PixelMerger::PixelMerger(const std::vector &heads, - const std::vector &tails) { +inline PixelMerger::PixelMerger(std::vector heads, std::vector tails) { assert(heads.size() == tails.size()); - auto tail = tails.begin(); - std::for_each(heads.begin(), heads.end(), [&](const auto &it) { - auto first = it; - auto last = *tail++; + for (std::size_t i = 0; i < heads.size(); ++i) { + auto &first = heads[i]; + auto &last = tails[i]; if (first != last) { - auto pixel = *first++; 
_heads.emplace_back(std::move(first)); _tails.emplace_back(std::move(last)); - _pqueue.emplace(Node{std::move(pixel), _pqueue.size()}); + _pqueue.emplace(Node{std::move(*_heads.back()++), _pqueue.size()}); } - }); + } } template From ae48afc05134285b8e2660f55dbdbc38464a7bd4 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 16 Jun 2023 13:39:40 +0200 Subject: [PATCH 29/48] Add fetch method to fully traverse a .hic file --- src/hic/hic_file_impl.hpp | 66 ++++++++++-- src/hic/include/hictk/hic.hpp | 2 + src/hic/include/hictk/hic/pixel_selector.hpp | 64 +++++++++++ src/hic/pixel_selector_impl.hpp | 108 +++++++++++++++++++ test/units/hic/pixel_selector_test.cpp | 31 ++++++ 5 files changed, 260 insertions(+), 11 deletions(-) diff --git a/src/hic/hic_file_impl.hpp b/src/hic/hic_file_impl.hpp index 9db704da..b846a370 100644 --- a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -61,24 +61,68 @@ inline const std::vector& HiCFile::avail_resolutions() const noex inline std::uint32_t HiCFile::resolution() const noexcept { return _bins->bin_size(); } inline std::shared_ptr HiCFile::get_footer( - std::uint32_t chrom1_id, std::uint32_t chrom2_id, MatrixType matrix_type, + const Chromosome& chrom1, const Chromosome& chrom2, MatrixType matrix_type, NormalizationMethod norm, MatrixUnit unit, std::uint32_t resolution) const { - const internal::HiCFooterMetadata metadata{url(), - matrix_type, - norm, - unit, - resolution, - _fs->header().chromosomes.at(chrom1_id), - _fs->header().chromosomes.at(chrom2_id)}; + const internal::HiCFooterMetadata metadata{url(), matrix_type, norm, unit, + resolution, chrom1, chrom2}; auto it = _footers.find(metadata); if (it != _footers.end()) { return *it; } - auto [node, _] = - _footers.emplace(_fs->read_footer(chrom1_id, chrom2_id, matrix_type, norm, unit, resolution)); + try { + auto [node, _] = _footers.emplace( + _fs->read_footer(chrom1.id(), chrom2.id(), matrix_type, norm, unit, 
resolution)); + + return *node; + } catch (const std::exception& e) { + // Check whether query is valid but there are no interactions for the given chromosome + // pair + const auto missing_footer = + std::string_view{e.what()}.find("unable to read file offset") == std::string_view::npos; + if (missing_footer) { + throw; + } + + auto idx = [&]() -> internal::Index { + if (metadata.fileOffset != -1) { + return _fs->read_index(metadata.fileOffset, metadata.chrom1, metadata.chrom2, metadata.unit, + metadata.resolution); + } + // Chromosomes are valid, but footer is missing, meaning that file has no interactions + // for the given params: return an empty index/footer + return {metadata.chrom1, + metadata.chrom2, + metadata.unit, + metadata.resolution, + _fs->version(), + 1, + 1, + 0, + {}}; + }(); + return std::make_shared(std::move(idx), std::move(metadata)); + } +} + +inline PixelSelectorAll HiCFile::fetch(NormalizationMethod norm) const { + std::vector selectors; + + for (std::uint32_t chrom1_id = 0; chrom1_id < chromosomes().size(); ++chrom1_id) { + const auto& chrom1 = chromosomes().at(chrom1_id); + if (chrom1.is_all()) { + continue; + } + for (std::uint32_t chrom2_id = chrom1_id; chrom2_id < chromosomes().size(); ++chrom2_id) { + const auto& chrom2 = chromosomes().at(chrom2_id); + if (chrom2.is_all()) { + continue; + } + selectors.emplace_back(fetch(chrom1.name(), chrom2.name(), norm)); + } + } - return *node; + return PixelSelectorAll{std::move(selectors)}; } inline PixelSelector HiCFile::fetch(std::string_view query, NormalizationMethod norm, diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index 96c87527..eaf030cf 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -52,6 +52,8 @@ class HiCFile { [[nodiscard]] const std::vector &avail_resolutions() const noexcept; [[nodiscard]] std::uint32_t resolution() const noexcept; + [[nodiscard]] PixelSelectorAll fetch(NormalizationMethod norm = 
NormalizationMethod::NONE) const; + [[nodiscard]] PixelSelector fetch(std::string_view query, NormalizationMethod norm = NormalizationMethod::NONE, QUERY_TYPE query_type = QUERY_TYPE::UCSC) const; diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index 28349d32..64cc953a 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -130,6 +130,70 @@ class PixelSelector { }; }; +class PixelSelectorAll { + public: + template + class iterator; + + private: + std::vector _selectors{}; + + public: + PixelSelectorAll() = default; + explicit PixelSelectorAll(std::vector selectors_) noexcept; + + template + [[nodiscard]] auto begin() const -> iterator; + template + [[nodiscard]] auto end() const -> iterator; + + template + [[nodiscard]] auto cbegin() const -> iterator; + template + [[nodiscard]] auto cend() const -> iterator; + + template + [[nodiscard]] std::vector> read_all() const; + + [[nodiscard]] MatrixType matrix_type() const noexcept; + [[nodiscard]] NormalizationMethod normalization() const noexcept; + [[nodiscard]] MatrixUnit unit() const noexcept; + [[nodiscard]] std::uint32_t resolution() const noexcept; + + [[nodiscard]] const BinTable &bins() const noexcept; + + template + class iterator { + static constexpr auto npos = (std::numeric_limits::max)(); + + using PixelMerger = hictk::internal::PixelMerger>; + std::shared_ptr _merger{}; + Pixel _value{}; + std::size_t _i{npos}; + + public: + using difference_type = std::ptrdiff_t; + using value_type = Pixel; + using pointer = value_type *; + using const_pointer = const value_type *; + using reference = value_type &; + using const_reference = const value_type &; + using iterator_category = std::forward_iterator_tag; + + iterator() = default; + explicit iterator(const std::vector &selectors_); + + [[nodiscard]] bool operator==(const iterator &other) const noexcept; + [[nodiscard]] bool operator!=(const iterator 
&other) const noexcept; + + [[nodiscard]] auto operator*() const -> const_reference; + [[nodiscard]] auto operator->() const -> const_pointer; + + auto operator++() -> iterator &; + auto operator++(int) -> iterator; + }; +}; + } // namespace hictk::hic #include "../../../pixel_selector_impl.hpp" diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index 032da678..ac9eee79 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -315,4 +315,112 @@ inline void PixelSelector::iterator::read_next_row() { assert(std::is_sorted(_buffer->begin(), _buffer->end())); _bin1_id++; } + +inline PixelSelectorAll::PixelSelectorAll(std::vector selectors_) noexcept + : _selectors(std::move(selectors_)) {} + +template +inline auto PixelSelectorAll::begin() const -> iterator { + return cbegin(); +} +template +inline auto PixelSelectorAll::cbegin() const -> iterator { + return iterator(_selectors); +} + +template +inline auto PixelSelectorAll::end() const -> iterator { + return cend(); +} +template +inline auto PixelSelectorAll::cend() const -> iterator { + return iterator{}; +} + +template +inline std::vector> PixelSelectorAll::read_all() const { + // We push_back into buff to avoid traversing pixels twice (once to figure out the vector size, + // and a second time to copy the actual data) + std::vector> buff{}; + std::copy(begin(), end(), std::back_inserter(buff)); + return buff; +} + +inline MatrixType PixelSelectorAll::matrix_type() const noexcept { + return _selectors.front().matrix_type(); +} +inline NormalizationMethod PixelSelectorAll::normalization() const noexcept { + return _selectors.front().normalization(); +} +inline MatrixUnit PixelSelectorAll::unit() const noexcept { return _selectors.front().unit(); } +inline std::uint32_t PixelSelectorAll::resolution() const noexcept { + return _selectors.front().resolution(); +} + +inline const BinTable &PixelSelectorAll::bins() const noexcept { return _selectors.front().bins(); 
} + +template +inline PixelSelectorAll::iterator::iterator(const std::vector &selectors_) { + std::vector> heads; + std::vector> tails; + + for (const auto &sel : selectors_) { + auto first = sel.begin(); + auto last = sel.end(); + if (first != last) { + heads.emplace_back(std::move(first)); + tails.emplace_back(std::move(last)); + } + } + + if (heads.empty()) { + *this = iterator{}; + return; + } + + _merger = std::make_shared(std::move(heads), std::move(tails)); + _value = _merger->next(); + + if (!_value) { + *this = iterator{}; + return; + } +} + +template +inline bool PixelSelectorAll::iterator::operator==(const iterator &other) const noexcept { + return _i == other._i && _value == other._value; +} + +template +inline bool PixelSelectorAll::iterator::operator!=(const iterator &other) const noexcept { + return !(*this == other); +} + +template +inline auto PixelSelectorAll::iterator::operator*() const -> const_reference { + return _value; +} + +template +inline auto PixelSelectorAll::iterator::operator->() const -> const_pointer { + return &_value; +} + +template +inline auto PixelSelectorAll::iterator::operator++() -> iterator & { + _value = _merger->next(); + if (!_value) { + *this = iterator{}; + } + return *this; +} + +template +inline auto PixelSelectorAll::iterator::operator++(int) -> iterator { + auto it = *this; + std::ignore = ++(*this); + return it; +} + } // namespace hictk::hic diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index 49bf5eba..162861bc 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -396,3 +396,34 @@ TEST_CASE("HiC: pixel selector fetch (oe NONE BP 10000)", "[hic][long]") { } } } + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEST_CASE("HiC: pixel selector fetch all (observed NONE BP 100000)", "[hic][long]") { + SECTION("accessors") { + auto sel = HiCFile(pathV8, 100'000, MatrixType::observed, MatrixUnit::BP).fetch(); + + 
CHECK(sel.resolution() == 100'000); + CHECK(sel.matrix_type() == MatrixType::observed); + CHECK(sel.normalization() == NormalizationMethod::NONE); + CHECK(sel.unit() == MatrixUnit::BP); + CHECK(sel.bins().size() == 1382); + } + + SECTION("v8") { + auto sel = HiCFile(pathV8, 100'000, MatrixType::observed, MatrixUnit::BP).fetch(); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == 890384); + + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(119208613, 1.0e-6)); + CHECK(std::is_sorted(buffer.begin(), buffer.end())); + } + + SECTION("v9") { + auto sel = HiCFile(pathV8, 100'000, MatrixType::observed, MatrixUnit::BP).fetch(); + const auto buffer = sel.read_all(); + REQUIRE(buffer.size() == 890384); + + CHECK_THAT(sumCounts(buffer), Catch::Matchers::WithinRel(119208613, 1.0e-6)); + CHECK(std::is_sorted(buffer.begin(), buffer.end())); + } +} From 63056d5b29e5479ed5a5b36fcba6fafa76e412ea Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 16 Jun 2023 13:54:07 +0200 Subject: [PATCH 30/48] Bugfix --- test/CMakeLists.txt | 2 +- test/units/hic/hic_file_test.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e00748ff..330bf7c0 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -7,6 +7,6 @@ add_subdirectory(units) option(HICTK_DOWNLOAD_TEST_DATASET "Download datasets required by unit and integration tests" ON) if(HICTK_DOWNLOAD_TEST_DATASET) - message("-- Downloading test dataset...") + message(STATUS "Downloading test dataset...") include(${PROJECT_SOURCE_DIR}/cmake/FetchTestDataset.cmake) endif() diff --git a/test/units/hic/hic_file_test.cpp b/test/units/hic/hic_file_test.cpp index 3e72c4fc..463b16c1 100644 --- a/test/units/hic/hic_file_test.cpp +++ b/test/units/hic/hic_file_test.cpp @@ -63,7 +63,7 @@ TEST_CASE("HiC: footer cache", "[hic][short]") { std::ignore = f.fetch(chrom.name()); } - CHECK(f.num_cached_footers() == 7); 
+ CHECK(f.num_cached_footers() == 8); const auto sel1 = f.fetch("chr2L"); const auto sel2 = f.fetch("chr2L"); From 921593975f78fdf1de2ce04a60e804bf287517c8 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 16 Jun 2023 14:14:58 +0200 Subject: [PATCH 31/48] Remove unused code --- src/hic/hic_file_impl.hpp | 36 +++--------------------------------- 1 file changed, 3 insertions(+), 33 deletions(-) diff --git a/src/hic/hic_file_impl.hpp b/src/hic/hic_file_impl.hpp index b846a370..4e486f60 100644 --- a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -69,40 +69,10 @@ inline std::shared_ptr HiCFile::get_footer( if (it != _footers.end()) { return *it; } + auto [node, _] = _footers.emplace( + _fs->read_footer(chrom1.id(), chrom2.id(), matrix_type, norm, unit, resolution)); - try { - auto [node, _] = _footers.emplace( - _fs->read_footer(chrom1.id(), chrom2.id(), matrix_type, norm, unit, resolution)); - - return *node; - } catch (const std::exception& e) { - // Check whether query is valid but there are no interactions for the given chromosome - // pair - const auto missing_footer = - std::string_view{e.what()}.find("unable to read file offset") == std::string_view::npos; - if (missing_footer) { - throw; - } - - auto idx = [&]() -> internal::Index { - if (metadata.fileOffset != -1) { - return _fs->read_index(metadata.fileOffset, metadata.chrom1, metadata.chrom2, metadata.unit, - metadata.resolution); - } - // Chromosomes are valid, but footer is missing, meaning that file has no interactions - // for the given params: return an empty index/footer - return {metadata.chrom1, - metadata.chrom2, - metadata.unit, - metadata.resolution, - _fs->version(), - 1, - 1, - 0, - {}}; - }(); - return std::make_shared(std::move(idx), std::move(metadata)); - } + return *node; } inline PixelSelectorAll HiCFile::fetch(NormalizationMethod norm) const { From faf0bf56edb9b2ca2bedcd7e9eb41a162f5c8e6e Mon Sep 17 00:00:00 2001 From: 
Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 16 Jun 2023 14:53:03 +0200 Subject: [PATCH 32/48] Update windows-ci.yml --- .github/workflows/windows-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows-ci.yml b/.github/workflows/windows-ci.yml index 4b450e09..c9877da9 100644 --- a/.github/workflows/windows-ci.yml +++ b/.github/workflows/windows-ci.yml @@ -153,7 +153,7 @@ jobs: --schedule-random \ --output-on-failure \ --no-tests=error \ - --timeout 600 |& + --timeout 3600 |& head -n 1000 windows-ci-status-check: From deb039a958055dead6d9d1bb8c65c986b5acfdae Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Fri, 16 Jun 2023 18:19:20 +0200 Subject: [PATCH 33/48] Update windows-ci.yml --- .github/workflows/windows-ci.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/windows-ci.yml b/.github/workflows/windows-ci.yml index c9877da9..9ba8a6ac 100644 --- a/.github/workflows/windows-ci.yml +++ b/.github/workflows/windows-ci.yml @@ -153,8 +153,9 @@ jobs: --schedule-random \ --output-on-failure \ --no-tests=error \ - --timeout 3600 |& - head -n 1000 + --timeout 180 \ + --exclude 'HiC: pixel selector fetch.*' |& + tail -n 1000 windows-ci-status-check: name: Status Check (Windows CI) From e62de11983318fc24d971a6a614dc1f83e36dc3a Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sat, 17 Jun 2023 16:24:00 +0200 Subject: [PATCH 34/48] Update windows-ci.yml --- .github/workflows/windows-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/windows-ci.yml b/.github/workflows/windows-ci.yml index 9ba8a6ac..5b9fe310 100644 --- a/.github/workflows/windows-ci.yml +++ b/.github/workflows/windows-ci.yml @@ -154,7 +154,7 @@ jobs: --output-on-failure \ --no-tests=error \ --timeout 180 \ - --exclude 'HiC: pixel selector fetch.*' |& + 
--exclude-regex 'HiC: pixel selector fetch.*' |& tail -n 1000 windows-ci-status-check: From 32b44c09d06ad77f3bee024ead34939f16823247 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sat, 17 Jun 2023 17:18:06 +0200 Subject: [PATCH 35/48] Update windows-ci.yml --- .github/workflows/windows-ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/windows-ci.yml b/.github/workflows/windows-ci.yml index 5b9fe310..b9bd0876 100644 --- a/.github/workflows/windows-ci.yml +++ b/.github/workflows/windows-ci.yml @@ -154,7 +154,8 @@ jobs: --output-on-failure \ --no-tests=error \ --timeout 180 \ - --exclude-regex 'HiC: pixel selector fetch.*' |& + --exclude-regex 'HiC: pixel selector fetch.*' \ + --exclude-regex 'Cooler: dataset large read/write.*' |& tail -n 1000 windows-ci-status-check: From cfdcd7ee88511504af311b6b3ebf26a228ffe027 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sat, 17 Jun 2023 17:53:27 +0200 Subject: [PATCH 36/48] Update windows-ci.yml --- .github/workflows/windows-ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/windows-ci.yml b/.github/workflows/windows-ci.yml index b9bd0876..ce924695 100644 --- a/.github/workflows/windows-ci.yml +++ b/.github/workflows/windows-ci.yml @@ -154,8 +154,7 @@ jobs: --output-on-failure \ --no-tests=error \ --timeout 180 \ - --exclude-regex 'HiC: pixel selector fetch.*' \ - --exclude-regex 'Cooler: dataset large read/write.*' |& + --exclude-regex '(HiC: pixel selector fetch.*)|(Cooler: dataset large read\/write.*)' |& tail -n 1000 windows-ci-status-check: From e9200356c5009d5e7c09418849e2ae3874624386 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sat, 17 Jun 2023 22:24:50 +0200 Subject: [PATCH 37/48] Simplify BlockCache Replace LRU cache with a simple FIFO. 
--- conanfile.txt | 1 - src/hic/CMakeLists.txt | 3 +- src/hic/block_cache_impl.hpp | 146 ++++++++----------- src/hic/block_reader_impl.hpp | 12 +- src/hic/hic_file_impl.hpp | 5 +- src/hic/include/hictk/hic.hpp | 5 +- src/hic/include/hictk/hic/block_cache.hpp | 92 ++++++------ src/hic/include/hictk/hic/block_reader.hpp | 6 +- src/hic/include/hictk/hic/common.hpp | 1 - src/hic/include/hictk/hic/footer_cache.hpp | 1 - src/hic/include/hictk/hic/pixel_selector.hpp | 9 +- src/hic/pixel_selector_impl.hpp | 4 +- 12 files changed, 121 insertions(+), 164 deletions(-) diff --git a/conanfile.txt b/conanfile.txt index 82a390f8..ae1e1e8f 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -10,7 +10,6 @@ fmt/10.0.0 hdf5/1.14.0 highfive/2.7.1 libdeflate/1.18 -tsl-ordered-map/1.1.0 zlib/1.2.13 [generators] diff --git a/src/hic/CMakeLists.txt b/src/hic/CMakeLists.txt index 6959fc45..578ce680 100644 --- a/src/hic/CMakeLists.txt +++ b/src/hic/CMakeLists.txt @@ -5,7 +5,6 @@ find_package(fmt QUIET REQUIRED) find_package(Filesystem REQUIRED) find_package(libdeflate QUIET REQUIRED) -find_package(tsl-ordered-map QUIET REQUIRED) add_library(hic INTERFACE) add_library(hictk::hic ALIAS hic) @@ -37,5 +36,5 @@ target_link_system_libraries( INTERFACE fmt::fmt libdeflate::libdeflate - tsl::ordered_map + phmap std::filesystem) diff --git a/src/hic/block_cache_impl.hpp b/src/hic/block_cache_impl.hpp index b2e39b5a..0486bdc1 100644 --- a/src/hic/block_cache_impl.hpp +++ b/src/hic/block_cache_impl.hpp @@ -48,16 +48,6 @@ constexpr bool operator!=(std::size_t a_id, const InteractionBlock &b) noexcept return !(a_id == b); } -inline std::size_t InteractionBlock::id() const noexcept { return _id; } -inline const Chromosome &InteractionBlock::chrom1() const noexcept { - assert(_chrom1); - return *_chrom1; -} -inline const Chromosome &InteractionBlock::chrom2() const noexcept { - assert(_chrom2); - return *_chrom2; -} - inline InteractionBlock::InteractionBlock(std::size_t id_, const std::vector &pixels) : 
_id(id_), _size(pixels.size()) { @@ -68,12 +58,14 @@ inline InteractionBlock::InteractionBlock(std::size_t id_, for (const SerializedPixel &p : pixels) { const auto b1 = static_cast(p.bin1_id); const auto b2 = static_cast(p.bin2_id); - auto node = this->_interactions.find(b1); - if (node != this->_interactions.end()) { - node->second.emplace_back(ThinPixel{b2, p.count}); - } else { - this->_interactions.emplace(b1, Row{{b2, p.count}}); - } + + _first_bin1_id = (std::min)(b1, _first_bin1_id); + _first_bin2_id = (std::min)(b2, _first_bin2_id); + _last_bin1_id = (std::max)(b1, _last_bin1_id); + _last_bin2_id = (std::max)(b2, _last_bin2_id); + + auto [node, _] = this->_interactions.try_emplace(b1, Row{}); + node->second.push_back({b2, p.count}); } if constexpr (ndebug_not_defined()) { for (auto &[_, buff] : this->_interactions) { @@ -102,6 +94,21 @@ inline auto InteractionBlock::end() const noexcept -> const_iterator { return _i inline auto InteractionBlock::cend() const noexcept -> const_iterator { return end(); } +inline std::size_t InteractionBlock::id() const noexcept { return _id; } +inline const Chromosome &InteractionBlock::chrom1() const noexcept { + assert(_chrom1); + return *_chrom1; +} +inline const Chromosome &InteractionBlock::chrom2() const noexcept { + assert(_chrom2); + return *_chrom2; +} + +inline std::size_t InteractionBlock::first_bin1_id() const noexcept { return _first_bin1_id; } +inline std::size_t InteractionBlock::first_bin2_id() const noexcept { return _first_bin2_id; } +inline std::size_t InteractionBlock::last_bin1_id() const noexcept { return _last_bin1_id; } +inline std::size_t InteractionBlock::last_bin2_id() const noexcept { return _last_bin2_id; } + inline auto InteractionBlock::find(std::uint64_t row) const noexcept -> const_iterator { return _interactions.find(row); } @@ -112,102 +119,65 @@ inline std::size_t InteractionBlock::size_in_bytes() const noexcept { return sizeof(ThinPixel) * size(); } -inline std::size_t 
BlockLRUCache::KeyHasher::operator()( - const BlockLRUCache::Key &k) const noexcept { - return hictk::internal::hash_combine(0, k.chrom1_id, k.chrom2_id, k.id); -} - -constexpr bool BlockLRUCache::Key::operator==( - const hictk::hic::internal::BlockLRUCache::Key &other) const noexcept { +constexpr bool BlockID::operator==(const BlockID &other) const noexcept { return chrom1_id == other.chrom1_id && chrom2_id == other.chrom2_id && id == other.id; } -inline BlockLRUCache::BlockLRUCache(std::size_t max_size_in_bytes) - : _max_size_bytes(max_size_in_bytes) { - if (_max_size_bytes == 0) { - throw std::runtime_error("Invalid block cache capacity: capacity cannot be 0"); - } -} - -constexpr std::size_t BlockLRUCache::size_in_bytes() const noexcept { return _current_size_bytes; } -constexpr std::size_t BlockLRUCache::max_size_in_bytes() const noexcept { return _max_size_bytes; } - -inline void BlockLRUCache::reset() noexcept { - _cache.clear(); - _current_size_bytes = 0; - _hits = 0; - _misses = 0; +inline BlockCache::BlockCache(std::size_t capacity) : _map(capacity), _capacity(capacity) { + assert(capacity != 0); } -inline auto BlockLRUCache::begin() noexcept -> iterator { return _cache.begin(); } - -inline auto BlockLRUCache::begin() const noexcept -> const_iterator { return _cache.begin(); } - -inline auto BlockLRUCache::cbegin() const noexcept -> const_iterator { return begin(); } - -inline auto BlockLRUCache::end() noexcept -> iterator { return _cache.end(); } - -inline auto BlockLRUCache::end() const noexcept -> const_iterator { return _cache.end(); } - -inline auto BlockLRUCache::cend() const noexcept -> const_iterator { return end(); } - -inline auto BlockLRUCache::find(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t block_id) - -> iterator { - const Key key{chrom1_id, chrom2_id, block_id}; - auto it = _cache.find(key); - if (it == end()) { - _misses++; - return it; +inline auto BlockCache::find(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t 
block_id) + -> Value { + auto match = _map.find({chrom1_id, chrom2_id, block_id}); + if (match != _map.end()) { + ++_hits; + return match->second; } - _hits++; - auto block = it->second; - std::ignore = _cache.erase(it); - auto node = _cache.emplace(key, std::move(block)); - return node.first; -} -inline void BlockLRUCache::erase(key_t key) { - auto it = _cache.find(key); - assert(it != _cache.end()); - erase(it); + ++_misses; + return nullptr; } -inline void BlockLRUCache::erase(iterator it) { - _current_size_bytes -= it->second->size_in_bytes(); - std::ignore = _cache.erase(it); -} - -inline auto BlockLRUCache::emplace(std::size_t chrom1_id, std::size_t chrom2_id, - std::size_t block_id, mapped_type &&block) - -> std::pair { - assert(block); - - while (size() != 0 && size_in_bytes() + block->size_in_bytes() > max_size_in_bytes()) { - erase(begin()); +inline auto BlockCache::emplace(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t block_id, + Value block) -> Value { + if (_map.size() == _capacity) { + pop_oldest(); } - _current_size_bytes += block->size_in_bytes(); - return _cache.emplace(Key{chrom1_id, chrom2_id, block_id}, std::move(block)); + BlockID key{chrom1_id, chrom2_id, block_id}; + _queue.push(key); + _map.emplace(std::move(key), block); + return block; } -inline auto BlockLRUCache::emplace(std::size_t chrom1_id, std::size_t chrom2_id, - std::size_t block_id, InteractionBlock &&block) - -> std::pair { +inline auto BlockCache::emplace(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t block_id, + InteractionBlock &&block) -> Value { return emplace(chrom1_id, chrom2_id, block_id, std::make_shared(std::move(block))); } -constexpr double BlockLRUCache::hit_rate() const noexcept { +constexpr std::size_t BlockCache::capacity() const noexcept { return _capacity; } +inline std::size_t BlockCache::size() const noexcept { return _map.size(); } + +constexpr double BlockCache::hit_rate() const noexcept { if (_hits + _misses == 0) { return 0.0; } 
return double(_hits) / double(_hits + _misses); } -constexpr std::size_t BlockLRUCache::hits() const noexcept { return _hits; } +constexpr void BlockCache::reset_stats() noexcept { + _hits = 0; + _misses = 0; +} -constexpr std::size_t BlockLRUCache::misses() const noexcept { return _misses; } +constexpr std::size_t BlockCache::hits() const noexcept { return _hits; } +constexpr std::size_t BlockCache::misses() const noexcept { return _misses; } -inline std::size_t BlockLRUCache::size() const noexcept { return _cache.size(); } +inline void BlockCache::pop_oldest() { + _map.erase(_queue.front()); + _queue.pop(); +} } // namespace hictk::hic::internal diff --git a/src/hic/block_reader_impl.hpp b/src/hic/block_reader_impl.hpp index 73aa8631..92f78284 100644 --- a/src/hic/block_reader_impl.hpp +++ b/src/hic/block_reader_impl.hpp @@ -34,7 +34,7 @@ inline std::string &BinaryBuffer::reset() noexcept { inline HiCBlockReader::HiCBlockReader(std::shared_ptr hfs, const Index &master_index, std::shared_ptr bins_, - std::shared_ptr block_cache_) + std::shared_ptr block_cache_) : _hfs(std::move(hfs)), _blk_cache(std::move(block_cache_)), _bins(std::move(bins_)), @@ -83,8 +83,9 @@ inline std::shared_ptr HiCBlockReader::read(const Chromo assert(_blk_cache); assert(_bins); - if (auto it = _blk_cache->find(chrom1.id(), chrom2.id(), idx.id()); it != _blk_cache->end()) { - return it->second; + auto blk = _blk_cache->find(chrom1.id(), chrom2.id(), idx.id()); + if (blk) { + return blk; } _hfs->readAndInflate(idx, _bbuffer.reset()); @@ -137,9 +138,8 @@ inline std::shared_ptr HiCBlockReader::read(const Chromo HICTK_UNREACHABLE_CODE; } - auto it = _blk_cache->emplace(chrom1.id(), chrom2.id(), idx.id(), - InteractionBlock{idx.id(), _tmp_buffer}); - return it.first->second; + return _blk_cache->emplace(chrom1.id(), chrom2.id(), idx.id(), + InteractionBlock{idx.id(), _tmp_buffer}); } inline void HiCBlockReader::read_dispatcher_type1_block( diff --git a/src/hic/hic_file_impl.hpp 
b/src/hic/hic_file_impl.hpp index 4e486f60..abd878bb 100644 --- a/src/hic/hic_file_impl.hpp +++ b/src/hic/hic_file_impl.hpp @@ -22,7 +22,7 @@ inline HiCFile::HiCFile(std::string url_, std::uint32_t resolution_, MatrixType : _fs(std::make_shared(std::move(url_))), _type(type_), _unit(unit_), - _block_cache(std::make_shared(block_cache_capacity)), + _block_cache(std::make_shared(block_cache_capacity)), _bins(std::make_shared(_fs->header().chromosomes, resolution_)) { assert(block_cache_capacity != 0); if (!has_resolution(resolution())) { @@ -158,6 +158,5 @@ inline std::size_t HiCFile::num_cached_footers() const noexcept { return _footer inline void HiCFile::purge_footer_cache() { _footers.clear(); } inline double HiCFile::block_cache_hit_rate() const noexcept { return _block_cache->hit_rate(); } -inline std::size_t HiCFile::block_cache_size() const noexcept { return _block_cache->size(); } -inline void HiCFile::clear_block_cache() noexcept { _block_cache->reset(); } +inline void HiCFile::reset_cache_stats() const noexcept { _block_cache->reset_stats(); } } // namespace hictk::hic diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index eaf030cf..7a427da3 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -30,7 +30,7 @@ class HiCFile { mutable internal::FooterCache _footers{}; MatrixType _type{MatrixType::observed}; MatrixUnit _unit{MatrixUnit::BP}; - mutable std::shared_ptr _block_cache{}; + mutable std::shared_ptr _block_cache{}; std::shared_ptr _bins{}; public: @@ -71,8 +71,7 @@ class HiCFile { void purge_footer_cache(); [[nodiscard]] double block_cache_hit_rate() const noexcept; - [[nodiscard]] std::size_t block_cache_size() const noexcept; - void clear_block_cache() noexcept; + void reset_cache_stats() const noexcept; private: [[nodiscard]] std::shared_ptr get_footer( diff --git a/src/hic/include/hictk/hic/block_cache.hpp b/src/hic/include/hictk/hic/block_cache.hpp index ff71b055..53b81a3b 100644 --- 
a/src/hic/include/hictk/hic/block_cache.hpp +++ b/src/hic/include/hictk/hic/block_cache.hpp @@ -4,12 +4,12 @@ #pragma once -#include -#include +#include #include #include #include +#include #include #include "hictk/chromosome.hpp" @@ -17,6 +17,22 @@ #include "hictk/hic/footer.hpp" #include "hictk/pixel.hpp" +namespace hictk::hic::internal { +struct BlockID { + std::size_t chrom1_id; // NOLINT + std::size_t chrom2_id; // NOLINT + std::size_t id; // NOLINT + [[nodiscard]] constexpr bool operator==(const BlockID& other) const noexcept; +}; +} // namespace hictk::hic::internal + +template <> +struct std::hash { + inline std::size_t operator()(hictk::hic::internal::BlockID const& bid) const noexcept { + return hictk::internal::hash_combine(0, bid.chrom1_id, bid.chrom2_id, bid.id); + } +}; + namespace hictk::hic::internal { class InteractionBlock { @@ -29,12 +45,16 @@ class InteractionBlock { using Row = std::vector; private: - using BuffT = phmap::btree_map; + using BuffT = phmap::flat_hash_map; std::size_t _id{}; BuffT _interactions{}; const Chromosome* _chrom1{}; const Chromosome* _chrom2{}; std::size_t _size{}; + std::size_t _first_bin1_id{(std::numeric_limits::max)()}; + std::size_t _last_bin1_id{}; + std::size_t _first_bin2_id{(std::numeric_limits::max)()}; + std::size_t _last_bin2_id{}; public: using iterator = BuffT::iterator; @@ -68,6 +88,10 @@ class InteractionBlock { [[nodiscard]] std::size_t id() const noexcept; [[nodiscard]] const Chromosome& chrom1() const noexcept; [[nodiscard]] const Chromosome& chrom2() const noexcept; + [[nodiscard]] std::size_t first_bin1_id() const noexcept; + [[nodiscard]] std::size_t first_bin2_id() const noexcept; + [[nodiscard]] std::size_t last_bin1_id() const noexcept; + [[nodiscard]] std::size_t last_bin2_id() const noexcept; [[nodiscard]] auto find(std::uint64_t row) const noexcept -> const_iterator; @@ -75,67 +99,39 @@ class InteractionBlock { [[nodiscard]] std::size_t size_in_bytes() const noexcept; }; -class BlockLRUCache { 
- public: - struct Key { - std::size_t chrom1_id; // NOLINT - std::size_t chrom2_id; // NOLINT - std::size_t id; // NOLINT - - [[nodiscard]] constexpr bool operator==(const Key& other) const noexcept; - }; - - private: - struct KeyHasher { - [[nodiscard]] std::size_t operator()(const Key& k) const noexcept; - }; - - using MapT = - tsl::ordered_map, KeyHasher, std::equal_to<>>; - using key_t = MapT::key_type; - using mapped_type = MapT::mapped_type; - using iterator = MapT::iterator; - using const_iterator = MapT::const_iterator; - MapT _cache{}; +class BlockCache { + using Value = std::shared_ptr; + using MapT = phmap::flat_hash_map; + std::queue _queue{}; + phmap::flat_hash_map _map{}; std::size_t _hits{}; std::size_t _misses{}; - std::size_t _current_size_bytes{}; - std::size_t _max_size_bytes{500UL * 1024UL * 1024UL}; // 500MB + std::size_t _capacity{}; public: - BlockLRUCache() = default; - explicit BlockLRUCache(std::size_t max_size_in_bytes); - - [[nodiscard]] std::size_t size() const noexcept; - [[nodiscard]] constexpr std::size_t size_in_bytes() const noexcept; - [[nodiscard]] constexpr std::size_t max_size_in_bytes() const noexcept; - void reset() noexcept; - - [[nodiscard]] auto begin() noexcept -> iterator; - [[nodiscard]] auto begin() const noexcept -> const_iterator; - [[nodiscard]] auto cbegin() const noexcept -> const_iterator; - - [[nodiscard]] auto end() noexcept -> iterator; - [[nodiscard]] auto end() const noexcept -> const_iterator; - [[nodiscard]] auto cend() const noexcept -> const_iterator; + BlockCache() = delete; + explicit BlockCache(std::size_t capacity); [[nodiscard]] auto find(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t block_id) - -> iterator; + -> Value; + auto emplace(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t block_id, Value block) + -> Value; auto emplace(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t block_id, - mapped_type&& block) -> std::pair; - auto emplace(std::size_t chrom1_id, 
std::size_t chrom2_id, std::size_t block_id, - InteractionBlock&& block) -> std::pair; + InteractionBlock&& block) -> Value; + + [[nodiscard]] constexpr std::size_t capacity() const noexcept; + [[nodiscard]] std::size_t size() const noexcept; [[nodiscard]] constexpr double hit_rate() const noexcept; [[nodiscard]] constexpr std::size_t hits() const noexcept; [[nodiscard]] constexpr std::size_t misses() const noexcept; + constexpr void reset_stats() noexcept; private: - void erase(key_t key); - void erase(iterator it); + void pop_oldest(); }; } // namespace hictk::hic::internal diff --git a/src/hic/include/hictk/hic/block_reader.hpp b/src/hic/include/hictk/hic/block_reader.hpp index 3d5556fe..fa88cf3a 100644 --- a/src/hic/include/hictk/hic/block_reader.hpp +++ b/src/hic/include/hictk/hic/block_reader.hpp @@ -35,8 +35,7 @@ class BinaryBuffer { class HiCBlockReader { std::shared_ptr _hfs{}; - std::shared_ptr _blk_cache{}; // This should be passed in by file. Key should be - // changed from size_t to {chrom1, chrom2, size_t} + std::shared_ptr _blk_cache{}; // We need the entire bin table in order to map pixels to abs bin ids std::shared_ptr _bins{}; Index _index{}; @@ -47,8 +46,7 @@ class HiCBlockReader { public: HiCBlockReader() = default; HiCBlockReader(std::shared_ptr hfs, const Index& master_index, - std::shared_ptr bins_, - std::shared_ptr block_cache_); + std::shared_ptr bins_, std::shared_ptr block_cache_); [[nodiscard]] explicit operator bool() const noexcept; diff --git a/src/hic/include/hictk/hic/common.hpp b/src/hic/include/hictk/hic/common.hpp index 8bc8221d..9f12d1cb 100644 --- a/src/hic/include/hictk/hic/common.hpp +++ b/src/hic/include/hictk/hic/common.hpp @@ -5,7 +5,6 @@ #pragma once #include -#include #include #include diff --git a/src/hic/include/hictk/hic/footer_cache.hpp b/src/hic/include/hictk/hic/footer_cache.hpp index cf30e47b..0212d3e6 100644 --- a/src/hic/include/hictk/hic/footer_cache.hpp +++ b/src/hic/include/hictk/hic/footer_cache.hpp @@ -5,7 
+5,6 @@ #pragma once #include -#include #include #include diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index 64cc953a..1102611d 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -33,14 +33,13 @@ class PixelSelector { PixelSelector() = delete; PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, - std::shared_ptr cache_, - std::shared_ptr bins_, PixelCoordinates coords) noexcept; + std::shared_ptr cache_, std::shared_ptr bins_, + PixelCoordinates coords) noexcept; PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, - std::shared_ptr cache_, - std::shared_ptr bins_, PixelCoordinates coord1_, - PixelCoordinates coord2_) noexcept; + std::shared_ptr cache_, std::shared_ptr bins_, + PixelCoordinates coord1_, PixelCoordinates coord2_) noexcept; [[nodiscard]] bool operator==(const PixelSelector &other) const noexcept; [[nodiscard]] bool operator!=(const PixelSelector &other) const noexcept; diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index ac9eee79..6b561065 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -21,7 +21,7 @@ namespace hictk::hic { inline PixelSelector::PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, - std::shared_ptr cache_, + std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coords) noexcept : PixelSelector(std::move(hfs_), std::move(footer_), std::move(cache_), std::move(bins_), @@ -29,7 +29,7 @@ inline PixelSelector::PixelSelector(std::shared_ptr hfs inline PixelSelector::PixelSelector(std::shared_ptr hfs_, std::shared_ptr footer_, - std::shared_ptr cache_, + std::shared_ptr cache_, std::shared_ptr bins_, PixelCoordinates coord1_, PixelCoordinates coord2_) noexcept : _reader(std::move(hfs_), footer_->index(), std::move(bins_), std::move(cache_)), From 5ecf473dcaa1d0df5981878d983268881158595d Mon Sep 17 00:00:00 2001 From: 
Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sat, 17 Jun 2023 22:31:16 +0200 Subject: [PATCH 38/48] Implement PixelSelector methods to retrieve balancing weights --- src/hic/file_reader_impl.hpp | 8 +++++++- src/hic/include/hictk/hic/pixel_selector.hpp | 3 +++ src/hic/pixel_selector_impl.hpp | 7 +++++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/hic/file_reader_impl.hpp b/src/hic/file_reader_impl.hpp index cedc2429..bf5c81ae 100644 --- a/src/hic/file_reader_impl.hpp +++ b/src/hic/file_reader_impl.hpp @@ -395,7 +395,13 @@ inline HiCFooter HiCFileReader::read_footer(std::uint32_t chrom1_id, std::uint32 } } if (metadata.fileOffset == -1) { - return HiCFooter{Index{}, std::move(metadata)}; + const auto num_bins1 = (metadata.chrom1.size() + wanted_resolution - 1) / wanted_resolution; + const auto num_bins2 = (metadata.chrom2.size() + wanted_resolution - 1) / wanted_resolution; + + auto f = HiCFooter{Index{}, std::move(metadata)}; + f.c1Norm() = std::vector(num_bins1, std::numeric_limits::quiet_NaN()); + f.c2Norm() = std::vector(num_bins2, std::numeric_limits::quiet_NaN()); + return f; } const auto file_offset = _fs->tellg(); diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index 1102611d..acaccdd8 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -67,6 +67,9 @@ class PixelSelector { [[nodiscard]] const Chromosome &chrom1() const noexcept; [[nodiscard]] const Chromosome &chrom2() const noexcept; + [[nodiscard]] const std::vector &chrom1_norm() const noexcept; + [[nodiscard]] const std::vector &chrom2_norm() const noexcept; + [[nodiscard]] const BinTable &bins() const noexcept; [[nodiscard]] const internal::HiCFooterMetadata &metadata() const noexcept; diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index 6b561065..47d986df 100644 --- a/src/hic/pixel_selector_impl.hpp +++ 
b/src/hic/pixel_selector_impl.hpp @@ -132,6 +132,13 @@ inline std::uint32_t PixelSelector::resolution() const noexcept { inline const Chromosome &PixelSelector::chrom1() const noexcept { return _coord1.bin1.chrom(); } inline const Chromosome &PixelSelector::chrom2() const noexcept { return _coord2.bin1.chrom(); } +inline const std::vector &PixelSelector::chrom1_norm() const noexcept { + return _footer->c1Norm(); +} +inline const std::vector &PixelSelector::chrom2_norm() const noexcept { + return _footer->c2Norm(); +} + inline const BinTable &PixelSelector::bins() const noexcept { return _reader.bins(); } inline const internal::HiCFooterMetadata &PixelSelector::metadata() const noexcept { From 9278e9ab091f4bc194b036aa0652a914cc72f9d3 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sat, 17 Jun 2023 22:33:46 +0200 Subject: [PATCH 39/48] Make PixelSelectorAll more lazy --- src/hic/include/hictk/hic/pixel_selector.hpp | 8 +- src/hic/pixel_selector_impl.hpp | 78 ++++++++++++++------ 2 files changed, 60 insertions(+), 26 deletions(-) diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index acaccdd8..1c05a3e2 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -161,15 +161,16 @@ class PixelSelectorAll { [[nodiscard]] NormalizationMethod normalization() const noexcept; [[nodiscard]] MatrixUnit unit() const noexcept; [[nodiscard]] std::uint32_t resolution() const noexcept; - [[nodiscard]] const BinTable &bins() const noexcept; template class iterator { static constexpr auto npos = (std::numeric_limits::max)(); + const PixelSelectorAll *_sel{}; using PixelMerger = hictk::internal::PixelMerger>; std::shared_ptr _merger{}; + std::vector::const_iterator _it{}; Pixel _value{}; std::size_t _i{npos}; @@ -183,7 +184,7 @@ class PixelSelectorAll { using iterator_category = std::forward_iterator_tag; iterator() = default; - 
explicit iterator(const std::vector &selectors_); + explicit iterator(const PixelSelectorAll &sel); [[nodiscard]] bool operator==(const iterator &other) const noexcept; [[nodiscard]] bool operator!=(const iterator &other) const noexcept; @@ -193,6 +194,9 @@ class PixelSelectorAll { auto operator++() -> iterator &; auto operator++(int) -> iterator; + + private: + void setup_next_pixel_merger(); }; }; diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index 47d986df..ddd51288 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -332,7 +332,7 @@ inline auto PixelSelectorAll::begin() const -> iterator { } template inline auto PixelSelectorAll::cbegin() const -> iterator { - return iterator(_selectors); + return iterator(*this); } template @@ -367,31 +367,12 @@ inline std::uint32_t PixelSelectorAll::resolution() const noexcept { inline const BinTable &PixelSelectorAll::bins() const noexcept { return _selectors.front().bins(); } template -inline PixelSelectorAll::iterator::iterator(const std::vector &selectors_) { +inline PixelSelectorAll::iterator::iterator(const PixelSelectorAll &sel) : _sel(&sel) { std::vector> heads; std::vector> tails; - for (const auto &sel : selectors_) { - auto first = sel.begin(); - auto last = sel.end(); - if (first != last) { - heads.emplace_back(std::move(first)); - tails.emplace_back(std::move(last)); - } - } - - if (heads.empty()) { - *this = iterator{}; - return; - } - - _merger = std::make_shared(std::move(heads), std::move(tails)); - _value = _merger->next(); - - if (!_value) { - *this = iterator{}; - return; - } + _it = _sel->_selectors.begin(); + setup_next_pixel_merger(); } template @@ -417,8 +398,10 @@ inline auto PixelSelectorAll::iterator::operator->() const -> const_pointer { template inline auto PixelSelectorAll::iterator::operator++() -> iterator & { _value = _merger->next(); + ++_i; if (!_value) { - *this = iterator{}; + setup_next_pixel_merger(); + return ++(*this); 
} return *this; } @@ -430,4 +413,51 @@ inline auto PixelSelectorAll::iterator::operator++(int) -> iterator { return it; } +template +inline void PixelSelectorAll::iterator::setup_next_pixel_merger() { + if (_it == _sel->_selectors.end()) { + *this = iterator{}; + return; + } + + auto chrom1 = _it->chrom1(); + auto first_sel = _it; + auto last_sel = std::find_if(first_sel, _sel->_selectors.end(), + [&](const PixelSelector &s) { return s.chrom1() != chrom1; }); + + std::vector> heads; + std::vector> tails; + + while (first_sel != last_sel) { + const auto &sel = *first_sel; + if (sel.chrom1() != chrom1) { + if (!heads.empty()) { + break; + } else { + chrom1 = sel.chrom1(); + } + } + auto first = sel.template begin(); + auto last = sel.template end(); + if (first != last) { + heads.emplace_back(std::move(first)); + tails.emplace_back(std::move(last)); + } + ++first_sel; + } + + if (heads.empty()) { + *this = iterator{}; + return; + } + + _merger = std::make_shared(std::move(heads), std::move(tails)); + _value = _merger->next(); + + if (!_value) { + *this = iterator{}; + return; + } +} + } // namespace hictk::hic From 7be55511526a951d21460ee3bcc91f2a13874856 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:33:26 +0200 Subject: [PATCH 40/48] Reduce memory usage of PixelSelectorAll --- src/hic/include/hictk/hic/pixel_selector.hpp | 2 -- src/hic/pixel_selector_impl.hpp | 11 ++++------- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index 1c05a3e2..73146b9d 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -165,14 +165,12 @@ class PixelSelectorAll { template class iterator { - static constexpr auto npos = (std::numeric_limits::max)(); const PixelSelectorAll *_sel{}; using PixelMerger = hictk::internal::PixelMerger>; std::shared_ptr 
_merger{}; std::vector::const_iterator _it{}; Pixel _value{}; - std::size_t _i{npos}; public: using difference_type = std::ptrdiff_t; diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index ddd51288..5f2eb489 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -377,7 +377,7 @@ inline PixelSelectorAll::iterator::iterator(const PixelSelectorAll &sel) : _s template inline bool PixelSelectorAll::iterator::operator==(const iterator &other) const noexcept { - return _i == other._i && _value == other._value; + return _value == other._value; } template @@ -398,10 +398,8 @@ inline auto PixelSelectorAll::iterator::operator->() const -> const_pointer { template inline auto PixelSelectorAll::iterator::operator++() -> iterator & { _value = _merger->next(); - ++_i; if (!_value) { setup_next_pixel_merger(); - return ++(*this); } return *this; } @@ -415,10 +413,7 @@ inline auto PixelSelectorAll::iterator::operator++(int) -> iterator { template inline void PixelSelectorAll::iterator::setup_next_pixel_merger() { - if (_it == _sel->_selectors.end()) { - *this = iterator{}; - return; - } + assert(_it != _sel->_selectors.end()); auto chrom1 = _it->chrom1(); auto first_sel = _it; @@ -446,6 +441,8 @@ inline void PixelSelectorAll::iterator::setup_next_pixel_merger() { ++first_sel; } + _it = last_sel; + if (heads.empty()) { *this = iterator{}; return; From d2b07f1c202ea5f4873c11e0e08bac695bc9e0b4 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:51:08 +0200 Subject: [PATCH 41/48] Express BlockCache size in terms of # of pixels --- src/hic/block_cache_impl.hpp | 10 +++++++--- src/hic/include/hictk/hic.hpp | 3 +-- src/hic/include/hictk/hic/block_cache.hpp | 4 +++- 3 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/hic/block_cache_impl.hpp b/src/hic/block_cache_impl.hpp index 0486bdc1..fccf2e0c 100644 --- a/src/hic/block_cache_impl.hpp +++ 
b/src/hic/block_cache_impl.hpp @@ -141,13 +141,14 @@ inline auto BlockCache::find(std::size_t chrom1_id, std::size_t chrom2_id, std:: inline auto BlockCache::emplace(std::size_t chrom1_id, std::size_t chrom2_id, std::size_t block_id, Value block) -> Value { - if (_map.size() == _capacity) { + while (_size + block->size() > capacity() && !_map.empty()) { pop_oldest(); } BlockID key{chrom1_id, chrom2_id, block_id}; _queue.push(key); _map.emplace(std::move(key), block); + _size += block->size(); return block; } @@ -158,7 +159,8 @@ inline auto BlockCache::emplace(std::size_t chrom1_id, std::size_t chrom2_id, st } constexpr std::size_t BlockCache::capacity() const noexcept { return _capacity; } -inline std::size_t BlockCache::size() const noexcept { return _map.size(); } +constexpr std::size_t BlockCache::size() const noexcept { return _size; } +inline std::size_t BlockCache::num_blocks() const noexcept { return _map.size(); } constexpr double BlockCache::hit_rate() const noexcept { if (_hits + _misses == 0) { @@ -176,8 +178,10 @@ constexpr std::size_t BlockCache::hits() const noexcept { return _hits; } constexpr std::size_t BlockCache::misses() const noexcept { return _misses; } inline void BlockCache::pop_oldest() { - _map.erase(_queue.front()); + auto it = _map.find(_queue.front()); _queue.pop(); + _size -= it->second->size(); + _map.erase(it); } } // namespace hictk::hic::internal diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index 7a427da3..d360eab1 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -37,8 +37,7 @@ class HiCFile { using QUERY_TYPE = GenomicInterval::Type; explicit HiCFile(std::string url_, std::uint32_t resolution_, MatrixType type_ = MatrixType::observed, MatrixUnit unit_ = MatrixUnit::BP, - // TODO consider expressing cache size in terms of number of pixels - std::uint64_t block_cache_capacity = 500ULL << 20U); + std::uint64_t block_cache_capacity = 10'000'000); [[nodiscard]] HiCFile 
open_resolution(std::uint32_t resolution) const; [[nodiscard]] bool has_resolution(std::uint32_t resolution) const; diff --git a/src/hic/include/hictk/hic/block_cache.hpp b/src/hic/include/hictk/hic/block_cache.hpp index 53b81a3b..220a165c 100644 --- a/src/hic/include/hictk/hic/block_cache.hpp +++ b/src/hic/include/hictk/hic/block_cache.hpp @@ -109,6 +109,7 @@ class BlockCache { std::size_t _misses{}; std::size_t _capacity{}; + std::size_t _size{}; public: BlockCache() = delete; @@ -123,7 +124,8 @@ class BlockCache { InteractionBlock&& block) -> Value; [[nodiscard]] constexpr std::size_t capacity() const noexcept; - [[nodiscard]] std::size_t size() const noexcept; + [[nodiscard]] constexpr std::size_t size() const noexcept; + [[nodiscard]] std::size_t num_blocks() const noexcept; [[nodiscard]] constexpr double hit_rate() const noexcept; [[nodiscard]] constexpr std::size_t hits() const noexcept; From a813ea1a514c17cc3eafe1bb69632e9810e0686f Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:55:53 +0200 Subject: [PATCH 42/48] Increase default block cache size to be ~500MB --- src/hic/include/hictk/hic.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hic/include/hictk/hic.hpp b/src/hic/include/hictk/hic.hpp index d360eab1..901c36b5 100644 --- a/src/hic/include/hictk/hic.hpp +++ b/src/hic/include/hictk/hic.hpp @@ -37,7 +37,7 @@ class HiCFile { using QUERY_TYPE = GenomicInterval::Type; explicit HiCFile(std::string url_, std::uint32_t resolution_, MatrixType type_ = MatrixType::observed, MatrixUnit unit_ = MatrixUnit::BP, - std::uint64_t block_cache_capacity = 10'000'000); + std::uint64_t block_cache_capacity = 25'000'000); [[nodiscard]] HiCFile open_resolution(std::uint32_t resolution) const; [[nodiscard]] bool has_resolution(std::uint32_t resolution) const; From 865556fa20f7f71cedd4c848e27072c7158e0c67 Mon Sep 17 00:00:00 2001 From: Roberto Rossini 
<71787608+robomics@users.noreply.github.com> Date: Sun, 18 Jun 2023 11:02:14 +0200 Subject: [PATCH 43/48] Bugfix --- src/hic/block_cache_impl.hpp | 4 ++-- test/units/chromosome/chromosome_test.cpp | 2 ++ test/units/hic/pixel_selector_test.cpp | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/hic/block_cache_impl.hpp b/src/hic/block_cache_impl.hpp index fccf2e0c..6ae42a5a 100644 --- a/src/hic/block_cache_impl.hpp +++ b/src/hic/block_cache_impl.hpp @@ -56,8 +56,8 @@ inline InteractionBlock::InteractionBlock(std::size_t id_, } for (const SerializedPixel &p : pixels) { - const auto b1 = static_cast(p.bin1_id); - const auto b2 = static_cast(p.bin2_id); + const auto b1 = static_cast(p.bin1_id); + const auto b2 = static_cast(p.bin2_id); _first_bin1_id = (std::min)(b1, _first_bin1_id); _first_bin2_id = (std::min)(b2, _first_bin2_id); diff --git a/test/units/chromosome/chromosome_test.cpp b/test/units/chromosome/chromosome_test.cpp index c3ff567e..e3c6d97b 100644 --- a/test/units/chromosome/chromosome_test.cpp +++ b/test/units/chromosome/chromosome_test.cpp @@ -7,6 +7,8 @@ #include #include +#include +#include #include "hictk/fmt/chromosome.hpp" diff --git a/test/units/hic/pixel_selector_test.cpp b/test/units/hic/pixel_selector_test.cpp index 162861bc..a433eade 100644 --- a/test/units/hic/pixel_selector_test.cpp +++ b/test/units/hic/pixel_selector_test.cpp @@ -419,7 +419,7 @@ TEST_CASE("HiC: pixel selector fetch all (observed NONE BP 100000)", "[hic][long } SECTION("v9") { - auto sel = HiCFile(pathV8, 100'000, MatrixType::observed, MatrixUnit::BP).fetch(); + auto sel = HiCFile(pathV9, 100'000, MatrixType::observed, MatrixUnit::BP).fetch(); const auto buffer = sel.read_all(); REQUIRE(buffer.size() == 890384); From 0edc2eb3c71281ea89be4e843aac41142080fa99 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 18 Jun 2023 13:33:26 +0200 Subject: [PATCH 44/48] Minor changes to InteractionBlock and Index --- 
conanfile.txt | 1 + src/hic/CMakeLists.txt | 2 + src/hic/block_cache_impl.hpp | 28 +++----- src/hic/include/hictk/hic/block_cache.hpp | 17 ++--- src/hic/include/hictk/hic/index.hpp | 11 ++- src/hic/include/hictk/hic/pixel_selector.hpp | 4 +- src/hic/index_impl.hpp | 70 ++++++++++---------- src/hic/pixel_selector_impl.hpp | 28 +++++--- 8 files changed, 75 insertions(+), 86 deletions(-) diff --git a/conanfile.txt b/conanfile.txt index ae1e1e8f..863a4a20 100644 --- a/conanfile.txt +++ b/conanfile.txt @@ -10,6 +10,7 @@ fmt/10.0.0 hdf5/1.14.0 highfive/2.7.1 libdeflate/1.18 +span-lite/0.10.3 zlib/1.2.13 [generators] diff --git a/src/hic/CMakeLists.txt b/src/hic/CMakeLists.txt index 578ce680..e86bada0 100644 --- a/src/hic/CMakeLists.txt +++ b/src/hic/CMakeLists.txt @@ -5,6 +5,7 @@ find_package(fmt QUIET REQUIRED) find_package(Filesystem REQUIRED) find_package(libdeflate QUIET REQUIRED) +find_package(span-lite QUIET REQUIRED) add_library(hic INTERFACE) add_library(hictk::hic ALIAS hic) @@ -37,4 +38,5 @@ target_link_system_libraries( fmt::fmt libdeflate::libdeflate phmap + nonstd::span-lite std::filesystem) diff --git a/src/hic/block_cache_impl.hpp b/src/hic/block_cache_impl.hpp index 6ae42a5a..6e09222d 100644 --- a/src/hic/block_cache_impl.hpp +++ b/src/hic/block_cache_impl.hpp @@ -59,11 +59,6 @@ inline InteractionBlock::InteractionBlock(std::size_t id_, const auto b1 = static_cast(p.bin1_id); const auto b2 = static_cast(p.bin2_id); - _first_bin1_id = (std::min)(b1, _first_bin1_id); - _first_bin2_id = (std::min)(b2, _first_bin2_id); - _last_bin1_id = (std::max)(b1, _last_bin1_id); - _last_bin2_id = (std::max)(b2, _last_bin2_id); - auto [node, _] = this->_interactions.try_emplace(b1, Row{}); node->second.push_back({b2, p.count}); } @@ -80,18 +75,11 @@ inline InteractionBlock::InteractionBlock(std::size_t id_, inline auto InteractionBlock::operator()() const noexcept -> const BuffT & { return _interactions; } -inline auto InteractionBlock::begin() noexcept -> iterator { 
return _interactions.begin(); } - inline auto InteractionBlock::begin() const noexcept -> const_iterator { return _interactions.begin(); } - -inline auto InteractionBlock::cbegin() const noexcept -> const_iterator { return begin(); } - -inline auto InteractionBlock::end() noexcept -> iterator { return _interactions.end(); } - inline auto InteractionBlock::end() const noexcept -> const_iterator { return _interactions.end(); } - +inline auto InteractionBlock::cbegin() const noexcept -> const_iterator { return begin(); } inline auto InteractionBlock::cend() const noexcept -> const_iterator { return end(); } inline std::size_t InteractionBlock::id() const noexcept { return _id; } @@ -104,13 +92,13 @@ inline const Chromosome &InteractionBlock::chrom2() const noexcept { return *_chrom2; } -inline std::size_t InteractionBlock::first_bin1_id() const noexcept { return _first_bin1_id; } -inline std::size_t InteractionBlock::first_bin2_id() const noexcept { return _first_bin2_id; } -inline std::size_t InteractionBlock::last_bin1_id() const noexcept { return _last_bin1_id; } -inline std::size_t InteractionBlock::last_bin2_id() const noexcept { return _last_bin2_id; } - -inline auto InteractionBlock::find(std::uint64_t row) const noexcept -> const_iterator { - return _interactions.find(row); +inline nonstd::span InteractionBlock::at( + std::size_t bin1_id) const noexcept { + auto match = _interactions.find(bin1_id); + if (match != _interactions.end()) { + return match->second; + } + return {}; } inline std::size_t InteractionBlock::size() const noexcept { return _size; } diff --git a/src/hic/include/hictk/hic/block_cache.hpp b/src/hic/include/hictk/hic/block_cache.hpp index 220a165c..8634956e 100644 --- a/src/hic/include/hictk/hic/block_cache.hpp +++ b/src/hic/include/hictk/hic/block_cache.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -51,10 +52,6 @@ class InteractionBlock { const Chromosome* _chrom1{}; const Chromosome* _chrom2{}; std::size_t 
_size{}; - std::size_t _first_bin1_id{(std::numeric_limits::max)()}; - std::size_t _last_bin1_id{}; - std::size_t _first_bin2_id{(std::numeric_limits::max)()}; - std::size_t _last_bin2_id{}; public: using iterator = BuffT::iterator; @@ -77,23 +74,17 @@ class InteractionBlock { [[nodiscard]] auto operator()() const noexcept -> const BuffT&; - [[nodiscard]] auto begin() noexcept -> iterator; [[nodiscard]] auto begin() const noexcept -> const_iterator; - [[nodiscard]] auto cbegin() const noexcept -> const_iterator; - - [[nodiscard]] auto end() noexcept -> iterator; [[nodiscard]] auto end() const noexcept -> const_iterator; + + [[nodiscard]] auto cbegin() const noexcept -> const_iterator; [[nodiscard]] auto cend() const noexcept -> const_iterator; [[nodiscard]] std::size_t id() const noexcept; [[nodiscard]] const Chromosome& chrom1() const noexcept; [[nodiscard]] const Chromosome& chrom2() const noexcept; - [[nodiscard]] std::size_t first_bin1_id() const noexcept; - [[nodiscard]] std::size_t first_bin2_id() const noexcept; - [[nodiscard]] std::size_t last_bin1_id() const noexcept; - [[nodiscard]] std::size_t last_bin2_id() const noexcept; - [[nodiscard]] auto find(std::uint64_t row) const noexcept -> const_iterator; + [[nodiscard]] nonstd::span at(std::size_t bin1_id) const noexcept; [[nodiscard]] std::size_t size() const noexcept; [[nodiscard]] std::size_t size_in_bytes() const noexcept; diff --git a/src/hic/include/hictk/hic/index.hpp b/src/hic/include/hictk/hic/index.hpp index 3674878e..03412f4b 100644 --- a/src/hic/include/hictk/hic/index.hpp +++ b/src/hic/include/hictk/hic/index.hpp @@ -83,7 +83,6 @@ class Index { std::uint32_t _resolution{}; Chromosome _chrom1{}; Chromosome _chrom2{}; - mutable phmap::flat_hash_set _tmp_buffer{32}; public: static constexpr auto npos = (std::numeric_limits::max)(); @@ -112,16 +111,16 @@ class Index { [[nodiscard]] std::vector find_overlaps(const PixelCoordinates& coords1, const PixelCoordinates& coords2) const; + void 
find_overlaps(const PixelCoordinates& coords1, const PixelCoordinates& coords2, + std::vector& buffer) const; [[nodiscard]] const BlockIndex& at(std::size_t row, std::size_t col) const; private: - void map_2d_query_to_blocks(const PixelCoordinates& coords1, const PixelCoordinates& coords2, - std::vector& buffer) const; - void generate_block_list(std::size_t bin1, std::size_t bin2, std::size_t bin3, - std::size_t bin4) const; + void generate_block_list(std::size_t bin1, std::size_t bin2, std::size_t bin3, std::size_t bin4, + std::vector& buffer) const; void generate_block_list_intra_v9plus(std::size_t bin1, std::size_t bin2, std::size_t bin3, - std::size_t bin4) const; + std::size_t bin4, std::vector& buffer) const; }; } // namespace hictk::hic::internal diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index 73146b9d..cb282b05 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -90,8 +90,10 @@ class PixelSelector { friend PixelSelector; const PixelSelector *_sel{}; using BufferT = std::vector>; + using BlockIdxBufferT = std::vector; std::size_t _bin1_id{}; + mutable std::shared_ptr _block_idx_buffer{}; mutable std::shared_ptr _buffer{}; mutable std::size_t _buffer_i{}; mutable std::size_t _pixels_processed{}; @@ -128,7 +130,7 @@ class PixelSelector { [[nodiscard]] std::size_t size() const noexcept; void read_next_row(); - [[nodiscard]] std::vector find_blocks_overlapping_current_row(); + [[nodiscard]] const std::vector &find_blocks_overlapping_current_row(); }; }; diff --git a/src/hic/index_impl.hpp b/src/hic/index_impl.hpp index 333ae006..4296528c 100644 --- a/src/hic/index_impl.hpp +++ b/src/hic/index_impl.hpp @@ -134,12 +134,34 @@ inline bool Index::empty() const noexcept { return size() == 0; } // NOLINT inline std::vector Index::find_overlaps(const PixelCoordinates &coords1, const PixelCoordinates &coords2) const { + std::vector buffer{}; + 
find_overlaps(coords1, coords2, buffer); + return buffer; +} + +inline void Index::find_overlaps(const PixelCoordinates &coords1, const PixelCoordinates &coords2, + std::vector &buffer) const { assert(coords1.is_intra()); assert(coords2.is_intra()); - std::vector buffer{}; - map_2d_query_to_blocks(coords1, coords2, buffer); - return buffer; + assert(coords1.bin1.chrom() == _chrom1 || coords1.bin1.chrom() == _chrom2); + assert(coords2.bin1.chrom() == _chrom1 || coords2.bin1.chrom() == _chrom2); + + auto bin1 = coords1.bin1.rel_id(); + auto bin2 = coords1.bin2.rel_id() + 1; + auto bin3 = coords2.bin1.rel_id(); + auto bin4 = coords2.bin2.rel_id() + 1; + + const auto is_intra = coords1.bin1.chrom() == coords2.bin1.chrom(); + + if (_version > 8 && is_intra) { + generate_block_list_intra_v9plus(bin1, bin2, bin3, bin4, buffer); + } else { + generate_block_list(bin1, bin2, bin3, bin4, buffer); + } + + assert(std::is_sorted(buffer.begin(), buffer.end())); + assert(std::unique(buffer.begin(), buffer.end()) == buffer.end()); } inline const BlockIndex &Index::at(std::size_t row, std::size_t col) const { @@ -153,26 +175,27 @@ inline const BlockIndex &Index::at(std::size_t row, std::size_t col) const { } inline void Index::generate_block_list(std::size_t bin1, std::size_t bin2, std::size_t bin3, - std::size_t bin4) const { + std::size_t bin4, std::vector &buffer) const { const auto col1 = bin1 / _block_bin_count; const auto col2 = (bin2 + 1) / _block_bin_count; const auto row1 = bin3 / _block_bin_count; const auto row2 = (bin4 + 1) / _block_bin_count; - // check region part that overlaps with lower left triangle but only if intrachromosomal + buffer.clear(); for (auto row = row1; row <= row2; ++row) { for (auto col = col1; col <= col2; ++col) { const auto block_id = (row * block_column_count()) + col; const auto match = _block_map.find(block_id); if (match != _block_map.end()) { - _tmp_buffer.emplace(*match); + buffer.emplace_back(*match); } } } } inline void 
Index::generate_block_list_intra_v9plus(std::size_t bin1, std::size_t bin2, - std::size_t bin3, std::size_t bin4) const { + std::size_t bin3, std::size_t bin4, + std::vector &buffer) const { const auto translatedLowerPAD = (bin1 + bin3) / 2 / _block_bin_count; const auto translatedHigherPAD = (bin2 + bin4) / 2 / _block_bin_count + 1; const auto translatedNearerDepth = @@ -190,42 +213,17 @@ inline void Index::generate_block_list_intra_v9plus(std::size_t bin1, std::size_ return (std::min)(translatedNearerDepth, translatedFurtherDepth); }(); - // +1; integer divide rounds down + buffer.clear(); const auto furtherDepth = (std::max)(translatedNearerDepth, translatedFurtherDepth) + 1; - for (auto depth = nearerDepth; depth <= furtherDepth; ++depth) { - for (auto pad = translatedLowerPAD; pad <= translatedHigherPAD; ++pad) { + for (auto pad = translatedLowerPAD; pad <= translatedHigherPAD; ++pad) { + for (auto depth = nearerDepth; depth <= furtherDepth; ++depth) { const auto block_id = (depth * block_column_count()) + pad; auto match = _block_map.find(block_id); if (match != _block_map.end()) { - _tmp_buffer.emplace(*match); + buffer.emplace_back(*match); } } } } -inline void Index::map_2d_query_to_blocks(const hictk::PixelCoordinates &coords1, - const hictk::PixelCoordinates &coords2, - std::vector &buffer) const { - assert(coords1.bin1.chrom() == _chrom1 || coords1.bin1.chrom() == _chrom2); - assert(coords2.bin1.chrom() == _chrom1 || coords2.bin1.chrom() == _chrom2); - - auto bin1 = coords1.bin1.rel_id(); - auto bin2 = coords1.bin2.rel_id() + 1; - auto bin3 = coords2.bin1.rel_id(); - auto bin4 = coords2.bin2.rel_id() + 1; - - const auto is_intra = coords1.bin1.chrom() == coords2.bin1.chrom(); - - _tmp_buffer.clear(); - if (_version > 8 && is_intra) { - generate_block_list_intra_v9plus(bin1, bin2, bin3, bin4); - } else { - generate_block_list(bin1, bin2, bin3, bin4); - } - - buffer.resize(_tmp_buffer.size()); - std::move(_tmp_buffer.begin(), _tmp_buffer.end(), 
buffer.begin()); - std::sort(buffer.begin(), buffer.end()); -} - } // namespace hictk::hic::internal diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index 5f2eb489..c1877c7d 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -72,13 +72,14 @@ inline internal::InteractionBlock::ThinPixel PixelSelector::transform_pixel( const auto &c2Norm = _footer->c2Norm(); const auto &expected = _footer->expectedValues(); - assert(is_inter() || bin1 <= pixel.bin2_id); + const auto bin2 = static_cast(pixel.bin2_id); + + assert(is_inter() || bin1 <= bin2); const auto skipNormalization = normalization() == NormalizationMethod::NONE || matrix_type() == MatrixType::expected; if (!skipNormalization) { - const auto bin2 = pixel.bin2_id; assert(bin1 < c1Norm.size()); assert(bin2 < c2Norm.size()); pixel.count /= static_cast(c1Norm[bin1] * c2Norm[bin2]); @@ -93,7 +94,7 @@ inline internal::InteractionBlock::ThinPixel PixelSelector::transform_pixel( return float(_reader.avg()); } - const auto i = (pixel.bin2_id - bin1); + const auto i = (bin2 - bin1); assert(i < expected.size()); return float(expected[i]); }(); @@ -158,7 +159,10 @@ inline double PixelSelector::avg() const noexcept { return _reader.avg(); } template inline PixelSelector::iterator::iterator(const PixelSelector &sel) - : _sel(&sel), _bin1_id(coord1().bin1.rel_id()), _buffer(std::make_shared()) { + : _sel(&sel), + _bin1_id(coord1().bin1.rel_id()), + _block_idx_buffer(std::make_shared()), + _buffer(std::make_shared()) { if (_sel->_reader.index().empty()) { *this = at_end(sel); return; @@ -259,8 +263,8 @@ inline std::size_t PixelSelector::iterator::size() const noexcept { return !_buffer ? 
0 : _buffer->size(); } template -inline std::vector -PixelSelector::iterator::find_blocks_overlapping_current_row() { +inline const std::vector + &PixelSelector::iterator::find_blocks_overlapping_current_row() { const auto end_pos = coord1().bin2.start(); const auto pos1 = (std::min)(end_pos, static_cast(_bin1_id) * bins().bin_size()); const auto pos2 = (std::min)(end_pos, pos1 + bins().bin_size()); @@ -268,7 +272,12 @@ PixelSelector::iterator::find_blocks_overlapping_current_row() { const auto coord1_ = PixelCoordinates(bins().at(coord1().bin1.chrom(), pos1), bins().at(coord1().bin1.chrom(), pos2)); - return _sel->_reader.index().find_overlaps(coord1_, coord2()); + if (_block_idx_buffer.use_count() != 1) { + _block_idx_buffer = std::make_shared(); + } + + _sel->_reader.index().find_overlaps(coord1_, coord2(), *_block_idx_buffer); + return *_block_idx_buffer; } template @@ -291,12 +300,11 @@ inline void PixelSelector::iterator::read_next_row() { const auto bin1 = bins().at(chrom1, static_cast(_bin1_id) * bin_size); for (const auto block_idx : blocks) { const auto blk = _sel->_reader.read(chrom1, coord2().bin1.chrom(), block_idx); - const auto match = blk->find(_bin1_id); - if (match == blk->end()) { + const auto pixels = blk->at(_bin1_id); + if (pixels.empty()) { continue; } - const auto &pixels = match->second; auto first = std::lower_bound(pixels.begin(), pixels.end(), coord2().bin1.rel_id(), [](const internal::InteractionBlock::ThinPixel &pixel, std::size_t bin_id) { return pixel.bin2_id < bin_id; }); From e2688d743b016ef86dd304a9dd2500aeaad708a9 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 18 Jun 2023 14:28:20 +0200 Subject: [PATCH 45/48] Reduce the number of reallocations when constructing InteractionBlock(s) --- src/hic/block_cache_impl.hpp | 12 ++++++++++-- src/hic/block_reader_impl.hpp | 2 +- src/hic/include/hictk/hic/block_cache.hpp | 3 ++- 3 files changed, 13 insertions(+), 4 deletions(-) diff 
--git a/src/hic/block_cache_impl.hpp b/src/hic/block_cache_impl.hpp index 6e09222d..f9ea04d6 100644 --- a/src/hic/block_cache_impl.hpp +++ b/src/hic/block_cache_impl.hpp @@ -48,20 +48,28 @@ constexpr bool operator!=(std::size_t a_id, const InteractionBlock &b) noexcept return !(a_id == b); } -inline InteractionBlock::InteractionBlock(std::size_t id_, +inline InteractionBlock::InteractionBlock(std::size_t id_, std::size_t block_bin_count, const std::vector &pixels) : _id(id_), _size(pixels.size()) { if (pixels.empty()) { return; } + _interactions.reserve((std::min)(block_bin_count, pixels.size())); for (const SerializedPixel &p : pixels) { const auto b1 = static_cast(p.bin1_id); const auto b2 = static_cast(p.bin2_id); - auto [node, _] = this->_interactions.try_emplace(b1, Row{}); + auto [node, _] = _interactions.try_emplace(b1, Row{}); + // usually if a row has more than a few interactions, it is likely it has many, + // thus we grow the vector faster than what the stl does (2x). + // For certain workloads, this leads to a significant perf improvement (~15%) + if (node->second.size() == node->second.capacity()) { + node->second.reserve(node->second.size() * 10); + } node->second.push_back({b2, p.count}); } + if constexpr (ndebug_not_defined()) { for (auto &[_, buff] : this->_interactions) { if (!std::is_sorted(buff.begin(), buff.end(), [](const ThinPixel &p1, const ThinPixel &p2) { diff --git a/src/hic/block_reader_impl.hpp b/src/hic/block_reader_impl.hpp index 92f78284..8745e95d 100644 --- a/src/hic/block_reader_impl.hpp +++ b/src/hic/block_reader_impl.hpp @@ -139,7 +139,7 @@ inline std::shared_ptr HiCBlockReader::read(const Chromo } return _blk_cache->emplace(chrom1.id(), chrom2.id(), idx.id(), - InteractionBlock{idx.id(), _tmp_buffer}); + InteractionBlock{idx.id(), _index.block_bin_count(), _tmp_buffer}); } inline void HiCBlockReader::read_dispatcher_type1_block( diff --git a/src/hic/include/hictk/hic/block_cache.hpp b/src/hic/include/hictk/hic/block_cache.hpp 
index 8634956e..16cdabdb 100644 --- a/src/hic/include/hictk/hic/block_cache.hpp +++ b/src/hic/include/hictk/hic/block_cache.hpp @@ -58,7 +58,8 @@ class InteractionBlock { using const_iterator = BuffT::const_iterator; InteractionBlock() = default; - InteractionBlock(std::size_t id_, const std::vector& pixels); + InteractionBlock(std::size_t id_, std::size_t block_bin_count, + const std::vector& pixels); friend constexpr bool operator<(const InteractionBlock& a, const InteractionBlock& b) noexcept; friend constexpr bool operator==(const InteractionBlock& a, const InteractionBlock& b) noexcept; From fedf3378e377a689a69046d4f7f1739878953fa7 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 18 Jun 2023 16:05:43 +0200 Subject: [PATCH 46/48] Add benchmark for hic dump --- CMakeLists.txt | 2 +- benchmark/CMakeLists.txt | 5 ++ benchmark/hic/CMakeLists.txt | 19 +++++++ benchmark/hic/dump.cpp | 97 ++++++++++++++++++++++++++++++++++++ 4 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 benchmark/CMakeLists.txt create mode 100644 benchmark/hic/CMakeLists.txt create mode 100644 benchmark/hic/dump.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 9597feca..2bd51394 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,5 +125,5 @@ endif() if(HICTK_BUILD_BENCHMARKS) message(STATUS "Building benchmarks.") - # add_subdirectory(benchmarks) + add_subdirectory(benchmark) endif() diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 00000000..33ccfcfc --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright (C) 2023 Roberto Rossini +# +# SPDX-License-Identifier: MIT + +add_subdirectory(hic) diff --git a/benchmark/hic/CMakeLists.txt b/benchmark/hic/CMakeLists.txt new file mode 100644 index 00000000..911d8e2a --- /dev/null +++ b/benchmark/hic/CMakeLists.txt @@ -0,0 +1,19 @@ +# Copyright (C) 2022 Roberto Rossini +# +# SPDX-License-Identifier: MIT + 
+find_package(CLI11 REQUIRED QUIET) +find_package(Filesystem REQUIRED) + +add_executable(hictk_hic_dump_bench dump.cpp) + +target_link_libraries( + hictk_hic_dump_bench + PRIVATE hictk_project_warnings hictk_project_options + PUBLIC hictk::hic) + +target_link_system_libraries( + hictk_hic_dump_bench + PUBLIC + CLI11::CLI11 + std::filesystem) diff --git a/benchmark/hic/dump.cpp b/benchmark/hic/dump.cpp new file mode 100644 index 00000000..fec0c661 --- /dev/null +++ b/benchmark/hic/dump.cpp @@ -0,0 +1,97 @@ +// Copyright (C) 2023 Roberto Rossini +// +// SPDX-License-Identifier: MIT + +#include + +#include +#include +#include + +#include "hictk/hic.hpp" + +using namespace hictk; + +struct Config { + std::string path_to_hic{}; + std::vector resolutions{}; + + std::size_t target_num_records{1'000'000}; + bool genome_wide{}; + std::vector block_cache_sizes{25'000'000}; +}; + +static void dump_genome_wide(const std::string& path_to_hic, std::uint32_t resolution, + std::size_t target_num_records, std::size_t block_cache_size) { + hic::HiCFile hf(path_to_hic, resolution, hic::MatrixType::observed, hic::MatrixUnit::BP, + block_cache_size); + auto sel = hf.fetch(); + + const auto t0 = std::chrono::steady_clock::now(); + auto first = sel.begin(); + auto last = sel.end(); + + std::size_t i = 0; + for (; i < target_num_records && first != last; ++i) { + std::ignore = ++first; + } + + const auto t1 = std::chrono::steady_clock::now(); + + const auto delta = + static_cast(std::chrono::duration_cast(t1 - t0).count()) / + 1.0e6; + fmt::print(FMT_STRING("{}\t{}\t{}\t{}\t{}\n"), path_to_hic, resolution, i, block_cache_size, + delta); +} + +static void dump(const std::string& path_to_hic, std::uint32_t resolution, + std::size_t target_num_records, std::size_t block_cache_size) { + hic::HiCFile const hf(path_to_hic, resolution, hic::MatrixType::observed, hic::MatrixUnit::BP, + block_cache_size); + auto sel = hf.fetch("chr1"); + + const auto t0 = std::chrono::steady_clock::now(); + auto 
first = sel.begin(); + auto last = sel.end(); + + std::size_t i = 0; + for (; i < target_num_records && first != last; ++i) { + std::ignore = ++first; + } + + const auto t1 = std::chrono::steady_clock::now(); + + const auto delta = + static_cast(std::chrono::duration_cast(t1 - t0).count()) / + 1.0e6; + fmt::print(FMT_STRING("{}\t{}\t{}\t{}\t{}\n"), path_to_hic, resolution, i, block_cache_size, + delta); +} + +int main(int argc, char** argv) { + CLI::App cli{}; + + Config c{}; + + cli.add_option("hic", c.path_to_hic, "Path to a .hic file."); + cli.add_option("resolution", c.resolutions, "Resolution in bp."); + cli.add_option("--target-num-records", c.target_num_records, "")->capture_default_str(); + cli.add_option("--block-cache-size", c.block_cache_sizes, "")->capture_default_str(); + cli.add_flag("--genome-wide", c.genome_wide, "")->capture_default_str(); + + try { + cli.parse(argc, argv); + + fmt::print(FMT_STRING("file\tresolution\tnum_records\tblock_cache_size\ttime\n")); + for (const auto& resolution : c.resolutions) { + for (const auto& block_size : c.block_cache_sizes) { + c.genome_wide + ? dump_genome_wide(c.path_to_hic, resolution, c.target_num_records, block_size) + : dump(c.path_to_hic, resolution, c.target_num_records, block_size); + } + } + } catch (const CLI::ParseError& e) { + return cli.exit(e); + } +} From e93d13f3ca66adfdfd94993ed4d772d5a76b08c6 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Sun, 18 Jun 2023 17:53:57 +0200 Subject: [PATCH 47/48] Greatly improve performance of iteration at high resolutions (<1000bp) Our previous implementation performed quite poorly at high resolutions, as we were processing one row at a time while paying the overhead of fetching the block index and data every row. This was done with the intention of minimizing the amount of read-ahead we do, as well as getting pixels in the correct order without doing any explicit sort. However this was too slow. 
The current solution is less clean but performance is much better (15x at 10bp on some of our datasets). Instead of processing one row at a time, we now process rows in chunks. Chunk sizes are computed as a fraction of chromosome sizes, and thus grow linearly with resolution, making the overhead to process a chunk comparable across resolution. Another benefit of this approach is that indexing of InteractionBlock is no longer needed: sorting pixels is enough. --- src/hic/block_cache_impl.hpp | 59 ++------------ src/hic/include/hictk/hic/block_cache.hpp | 20 +---- src/hic/include/hictk/hic/pixel_selector.hpp | 9 ++- src/hic/index_impl.hpp | 4 +- src/hic/pixel_selector_impl.hpp | 85 ++++++++++++-------- 5 files changed, 70 insertions(+), 107 deletions(-) diff --git a/src/hic/block_cache_impl.hpp b/src/hic/block_cache_impl.hpp index f9ea04d6..8a186d7f 100644 --- a/src/hic/block_cache_impl.hpp +++ b/src/hic/block_cache_impl.hpp @@ -48,37 +48,11 @@ constexpr bool operator!=(std::size_t a_id, const InteractionBlock &b) noexcept return !(a_id == b); } -inline InteractionBlock::InteractionBlock(std::size_t id_, std::size_t block_bin_count, - const std::vector &pixels) - : _id(id_), _size(pixels.size()) { - if (pixels.empty()) { - return; - } - _interactions.reserve((std::min)(block_bin_count, pixels.size())); - - for (const SerializedPixel &p : pixels) { - const auto b1 = static_cast(p.bin1_id); - const auto b2 = static_cast(p.bin2_id); - - auto [node, _] = _interactions.try_emplace(b1, Row{}); - // usually if a row has more than a few interactions, it is likely it has many, - // thus we grow the vector faster than what the stl does (2x). 
- // For certain workloads, this leads to a significant perf improvement (~15%) - if (node->second.size() == node->second.capacity()) { - node->second.reserve(node->second.size() * 10); - } - node->second.push_back({b2, p.count}); - } - - if constexpr (ndebug_not_defined()) { - for (auto &[_, buff] : this->_interactions) { - if (!std::is_sorted(buff.begin(), buff.end(), [](const ThinPixel &p1, const ThinPixel &p2) { - return p1.bin2_id < p2.bin2_id; - })) { - throw std::runtime_error("InteractionBlock is not sorted!"); - } - } - } +inline InteractionBlock::InteractionBlock(std::size_t id_, + [[maybe_unused]] std::size_t block_bin_count, + std::vector pixels) + : _id(id_), _interactions(std::move(pixels)) { + std::sort(_interactions.begin(), _interactions.end()); } inline auto InteractionBlock::operator()() const noexcept -> const BuffT & { return _interactions; } @@ -91,29 +65,8 @@ inline auto InteractionBlock::cbegin() const noexcept -> const_iterator { return inline auto InteractionBlock::cend() const noexcept -> const_iterator { return end(); } inline std::size_t InteractionBlock::id() const noexcept { return _id; } -inline const Chromosome &InteractionBlock::chrom1() const noexcept { - assert(_chrom1); - return *_chrom1; -} -inline const Chromosome &InteractionBlock::chrom2() const noexcept { - assert(_chrom2); - return *_chrom2; -} -inline nonstd::span InteractionBlock::at( - std::size_t bin1_id) const noexcept { - auto match = _interactions.find(bin1_id); - if (match != _interactions.end()) { - return match->second; - } - return {}; -} - -inline std::size_t InteractionBlock::size() const noexcept { return _size; } - -inline std::size_t InteractionBlock::size_in_bytes() const noexcept { - return sizeof(ThinPixel) * size(); -} +inline std::size_t InteractionBlock::size() const noexcept { return _interactions.size(); } constexpr bool BlockID::operator==(const BlockID &other) const noexcept { return chrom1_id == other.chrom1_id && chrom2_id == other.chrom2_id && 
id == other.id; diff --git a/src/hic/include/hictk/hic/block_cache.hpp b/src/hic/include/hictk/hic/block_cache.hpp index 16cdabdb..12c5cabd 100644 --- a/src/hic/include/hictk/hic/block_cache.hpp +++ b/src/hic/include/hictk/hic/block_cache.hpp @@ -4,6 +4,7 @@ #pragma once +#include #include #include @@ -38,20 +39,12 @@ namespace hictk::hic::internal { class InteractionBlock { public: - struct ThinPixel { - std::uint64_t bin2_id{}; - float count{}; - }; - - using Row = std::vector; + using Row = std::vector; private: - using BuffT = phmap::flat_hash_map; + using BuffT = std::vector; std::size_t _id{}; BuffT _interactions{}; - const Chromosome* _chrom1{}; - const Chromosome* _chrom2{}; - std::size_t _size{}; public: using iterator = BuffT::iterator; @@ -59,7 +52,7 @@ class InteractionBlock { InteractionBlock() = default; InteractionBlock(std::size_t id_, std::size_t block_bin_count, - const std::vector& pixels); + std::vector pixels); friend constexpr bool operator<(const InteractionBlock& a, const InteractionBlock& b) noexcept; friend constexpr bool operator==(const InteractionBlock& a, const InteractionBlock& b) noexcept; @@ -82,13 +75,8 @@ class InteractionBlock { [[nodiscard]] auto cend() const noexcept -> const_iterator; [[nodiscard]] std::size_t id() const noexcept; - [[nodiscard]] const Chromosome& chrom1() const noexcept; - [[nodiscard]] const Chromosome& chrom2() const noexcept; - - [[nodiscard]] nonstd::span at(std::size_t bin1_id) const noexcept; [[nodiscard]] std::size_t size() const noexcept; - [[nodiscard]] std::size_t size_in_bytes() const noexcept; }; class BlockCache { diff --git a/src/hic/include/hictk/hic/pixel_selector.hpp b/src/hic/include/hictk/hic/pixel_selector.hpp index cb282b05..c35ed0a7 100644 --- a/src/hic/include/hictk/hic/pixel_selector.hpp +++ b/src/hic/include/hictk/hic/pixel_selector.hpp @@ -80,8 +80,7 @@ class PixelSelector { [[nodiscard]] double avg() const noexcept; private: - [[nodiscard]] internal::InteractionBlock::ThinPixel 
transform_pixel( - std::size_t bin1, internal::InteractionBlock::ThinPixel pixel) const; + [[nodiscard]] SerializedPixel transform_pixel(SerializedPixel pixel) const; public: template @@ -129,8 +128,10 @@ class PixelSelector { [[nodiscard]] const PixelCoordinates &coord2() const noexcept; [[nodiscard]] std::size_t size() const noexcept; - void read_next_row(); - [[nodiscard]] const std::vector &find_blocks_overlapping_current_row(); + void read_next_chunk(); + [[nodiscard]] const std::vector &find_blocks_overlapping_next_chunk( + std::size_t num_bins); + [[nodiscard]] std::size_t compute_chunk_size(double fraction = 0.0005) const noexcept; }; }; diff --git a/src/hic/index_impl.hpp b/src/hic/index_impl.hpp index 4296528c..22a1e32b 100644 --- a/src/hic/index_impl.hpp +++ b/src/hic/index_impl.hpp @@ -160,8 +160,8 @@ inline void Index::find_overlaps(const PixelCoordinates &coords1, const PixelCoo generate_block_list(bin1, bin2, bin3, bin4, buffer); } - assert(std::is_sorted(buffer.begin(), buffer.end())); - assert(std::unique(buffer.begin(), buffer.end()) == buffer.end()); + std::sort(buffer.begin(), buffer.end()); + buffer.erase(std::unique(buffer.begin(), buffer.end()), buffer.end()); } inline const BlockIndex &Index::at(std::size_t row, std::size_t col) const { diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index c1877c7d..5c72d23b 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -66,12 +66,12 @@ inline auto PixelSelector::end() const -> iterator { return this->cend(); } -inline internal::InteractionBlock::ThinPixel PixelSelector::transform_pixel( - std::size_t bin1, internal::InteractionBlock::ThinPixel pixel) const { +inline SerializedPixel PixelSelector::transform_pixel(SerializedPixel pixel) const { const auto &c1Norm = _footer->c1Norm(); const auto &c2Norm = _footer->c2Norm(); const auto &expected = _footer->expectedValues(); + const auto bin1 = static_cast(pixel.bin1_id); const auto bin2 = 
static_cast(pixel.bin2_id); assert(is_inter() || bin1 <= bin2); @@ -169,7 +169,7 @@ inline PixelSelector::iterator::iterator(const PixelSelector &sel) } while (_buffer->empty()) { - read_next_row(); + read_next_chunk(); } } @@ -224,7 +224,7 @@ inline auto PixelSelector::iterator::operator++() -> iterator & { ++_pixels_processed; ++_buffer_i; while (!is_at_end() && _buffer_i >= size()) { - read_next_row(); + read_next_chunk(); } return *this; @@ -262,12 +262,29 @@ template inline std::size_t PixelSelector::iterator::size() const noexcept { return !_buffer ? 0 : _buffer->size(); } + +template +inline std::size_t PixelSelector::iterator::compute_chunk_size(double fraction) const noexcept { + const auto bin_size = bins().bin_size(); + const auto num_bins = (coord1().bin1.chrom().size() + bin_size - 1) / bin_size; + const auto max_num_bins = + (std::max)(1U, static_cast(fraction * static_cast(num_bins))); + + const auto end_pos = coord1().bin2.start(); + const auto pos1 = (std::min)(end_pos, static_cast(_bin1_id) * bins().bin_size()); + const auto pos2 = (std::min)(end_pos, pos1 + (max_num_bins * bin_size)); + + return (pos2 - pos1 + bin_size - 1) / bin_size; +} + template inline const std::vector - &PixelSelector::iterator::find_blocks_overlapping_current_row() { + &PixelSelector::iterator::find_blocks_overlapping_next_chunk(std::size_t num_bins) { + const auto bin_size = bins().bin_size(); + const auto end_pos = coord1().bin2.start(); const auto pos1 = (std::min)(end_pos, static_cast(_bin1_id) * bins().bin_size()); - const auto pos2 = (std::min)(end_pos, pos1 + bins().bin_size()); + const auto pos2 = (std::min)(end_pos, pos1 + static_cast((num_bins * bin_size))); const auto coord1_ = PixelCoordinates(bins().at(coord1().bin1.chrom(), pos1), bins().at(coord1().bin1.chrom(), pos2)); @@ -281,9 +298,10 @@ inline const std::vector } template -inline void PixelSelector::iterator::read_next_row() { +inline void PixelSelector::iterator::read_next_chunk() { assert(!!_sel); - 
const auto blocks = find_blocks_overlapping_current_row(); + const auto chunk_size = compute_chunk_size(); + const auto blocks = find_blocks_overlapping_next_chunk(chunk_size); if (blocks.empty() || _bin1_id > coord1().bin2.rel_id()) { *this = at_end(*_sel); return; @@ -296,39 +314,42 @@ inline void PixelSelector::iterator::read_next_row() { _buffer->clear(); _buffer_i = 0; const auto bin_size = bins().bin_size(); - const auto &chrom1 = coord1().bin1.chrom(); - const auto bin1 = bins().at(chrom1, static_cast(_bin1_id) * bin_size); - for (const auto block_idx : blocks) { - const auto blk = _sel->_reader.read(chrom1, coord2().bin1.chrom(), block_idx); - const auto pixels = blk->at(_bin1_id); - if (pixels.empty()) { - continue; - } - - auto first = std::lower_bound(pixels.begin(), pixels.end(), coord2().bin1.rel_id(), - [](const internal::InteractionBlock::ThinPixel &pixel, - std::size_t bin_id) { return pixel.bin2_id < bin_id; }); - while (first != pixels.end()) { - const auto p = _sel->transform_pixel(_bin1_id, *first); - if (p.bin2_id > coord2().bin2.rel_id()) { - break; + const auto bin1_id_last = _bin1_id + chunk_size; + for (const auto &block_idx : blocks) { + const auto block = _sel->_reader.read(coord1().bin1.chrom(), coord2().bin1.chrom(), block_idx); + auto first = std::lower_bound(block->begin(), block->end(), _bin1_id, + [](const SerializedPixel &pixel, std::size_t bin_id) { + return pixel.bin1_id < static_cast(bin_id); + }); + auto last = std::lower_bound(first, block->end(), bin1_id_last + 1, + [](const SerializedPixel &pixel, std::size_t bin_id) { + return pixel.bin1_id < static_cast(bin_id); + }); + + const auto buffer_size = _buffer->size(); + while (first != last) { + const auto p = _sel->transform_pixel(*first++); + if (p.bin2_id < coord2().bin1.rel_id() || p.bin2_id > coord2().bin2.rel_id()) { + continue; } + const auto pos1 = static_cast(p.bin1_id) * bin_size; const auto pos2 = static_cast(p.bin2_id) * bin_size; + auto coords = 
PixelCoordinates{bins().at(coord1().bin1.chrom(), pos1), + bins().at(coord2().bin1.chrom(), pos2)}; if constexpr (std::is_integral_v) { - _buffer->emplace_back( - Pixel{PixelCoordinates{bin1, bins().at(coord2().bin1.chrom(), pos2)}, - conditional_static_cast(std::round(p.count))}); + _buffer->emplace_back(Pixel{coords, conditional_static_cast(std::round(p.count))}); } else { - _buffer->emplace_back( - Pixel{PixelCoordinates{bin1, bins().at(coord2().bin1.chrom(), pos2)}, - conditional_static_cast(p.count)}); + _buffer->emplace_back(Pixel{coords, conditional_static_cast(p.count)}); } - ++first; } + + auto sorted_first = _buffer->begin(); + auto sorted_last = sorted_first + static_cast(buffer_size); + std::inplace_merge(sorted_first, sorted_last, _buffer->end()); } assert(std::is_sorted(_buffer->begin(), _buffer->end())); - _bin1_id++; + _bin1_id = bin1_id_last + 1; } inline PixelSelectorAll::PixelSelectorAll(std::vector selectors_) noexcept From 404be67476ed56edbbc1caee666dd3fcc1e561e4 Mon Sep 17 00:00:00 2001 From: Roberto Rossini <71787608+robomics@users.noreply.github.com> Date: Mon, 19 Jun 2023 11:42:25 +0200 Subject: [PATCH 48/48] Bugfix --- src/hic/pixel_selector_impl.hpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/hic/pixel_selector_impl.hpp b/src/hic/pixel_selector_impl.hpp index 5c72d23b..4c79ccff 100644 --- a/src/hic/pixel_selector_impl.hpp +++ b/src/hic/pixel_selector_impl.hpp @@ -168,7 +168,7 @@ inline PixelSelector::iterator::iterator(const PixelSelector &sel) return; } - while (_buffer->empty()) { + while (!!_buffer && _buffer->empty()) { read_next_chunk(); } } @@ -300,9 +300,8 @@ inline const std::vector template inline void PixelSelector::iterator::read_next_chunk() { assert(!!_sel); - const auto chunk_size = compute_chunk_size(); - const auto blocks = find_blocks_overlapping_next_chunk(chunk_size); - if (blocks.empty() || _bin1_id > coord1().bin2.rel_id()) { + + if (_bin1_id > coord1().bin2.rel_id()) { 
*this = at_end(*_sel); return; } @@ -310,11 +309,19 @@ inline void PixelSelector::iterator::read_next_chunk() { if (_buffer.use_count() != 1) { _buffer = std::make_shared(_buffer->capacity()); } - _buffer->clear(); _buffer_i = 0; + + const auto chunk_size = compute_chunk_size(); const auto bin_size = bins().bin_size(); const auto bin1_id_last = _bin1_id + chunk_size; + + const auto blocks = find_blocks_overlapping_next_chunk(chunk_size); + if (blocks.empty()) { + _bin1_id = bin1_id_last + 1; + return; + } + for (const auto &block_idx : blocks) { const auto block = _sel->_reader.read(coord1().bin1.chrom(), coord2().bin1.chrom(), block_idx); auto first = std::lower_bound(block->begin(), block->end(), _bin1_id,