Skip to content

Commit

Permalink
Merge pull request #2 from paulsengroup/impl-hic-lazy-fetch [ci full]
Browse files Browse the repository at this point in the history
Implement iterators over .hic files
  • Loading branch information
robomics committed Jun 20, 2023
2 parents b644236 + 404be67 commit 43f68a1
Show file tree
Hide file tree
Showing 88 changed files with 3,222 additions and 2,297 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/windows-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,9 @@ jobs:
--schedule-random \
--output-on-failure \
--no-tests=error \
--timeout 300 |&
head -n 1000
--timeout 180 \
--exclude-regex '(HiC: pixel selector fetch.*)|(Cooler: dataset large read\/write.*)' |&
tail -n 1000
windows-ci-status-check:
name: Status Check (Windows CI)
Expand Down
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -113,17 +113,17 @@ add_subdirectory(src)

if(HICTK_ENABLE_TESTING)
enable_testing()
message(STATUS "-- Building unit tests.")
message(STATUS "Building unit tests.")
target_compile_definitions(hictk_project_options INTERFACE HICTK_ENABLE_TESTING)
add_subdirectory(test)
endif()

if(HICTK_BUILD_EXAMPLES)
message(STATUS "-- Building examples.")
message(STATUS "Building examples.")
# add_subdirectory(examples)
endif()

if(HICTK_BUILD_BENCHMARKS)
message(STATUS "Building benchmarks.")
# add_subdirectory(benchmarks)
add_subdirectory(benchmark)
endif()
5 changes: 5 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Copyright (C) 2023 Roberto Rossini <roberros@uio.no>
#
# SPDX-License-Identifier: MIT

add_subdirectory(hic)
19 changes: 19 additions & 0 deletions benchmark/hic/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright (C) 2022 Roberto Rossini <roberros@uio.no>
#
# SPDX-License-Identifier: MIT

# Third-party packages needed by the benchmark executable defined below.
find_package(CLI11 REQUIRED QUIET)
find_package(Filesystem REQUIRED)

# Benchmark executable built from dump.cpp.
add_executable(hictk_hic_dump_bench dump.cpp)

# Project-level warning/option targets are linked privately; hictk::hic is the library under test.
target_link_libraries(
hictk_hic_dump_bench
PRIVATE hictk_project_warnings hictk_project_options
PUBLIC hictk::hic)

# NOTE(review): target_link_system_libraries() is not defined in this file; presumably a project
# helper that links dependencies as SYSTEM libraries -- confirm against the project's CMake modules.
target_link_system_libraries(
hictk_hic_dump_bench
PUBLIC
CLI11::CLI11
std::filesystem)
97 changes: 97 additions & 0 deletions benchmark/hic/dump.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
// Copyright (C) 2023 Roberto Rossini <roberros@uio.no>
//
// SPDX-License-Identifier: MIT

#include <fmt/format.h>

#include <CLI/CLI.hpp>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <exception>
#include <string>
#include <tuple>
#include <vector>

#include "hictk/hic.hpp"

using namespace hictk;

// Runtime options for the benchmark. The fields are bound to command line options in main().
struct Config {
  // Value of the positional "hic" option: path to the .hic file to read.
  std::string path_to_hic;
  // Values of the positional "resolution" option; main() runs the benchmark once per value.
  std::vector<std::uint32_t> resolutions;

  // Upper bound on the number of iterator increments performed by a single benchmark run.
  std::size_t target_num_records = 1'000'000;
  // Selects dump_genome_wide() instead of dump() when true.
  bool genome_wide = false;
  // Block cache sizes forwarded to hic::HiCFile; main() runs the benchmark once per value.
  std::vector<std::size_t> block_cache_sizes = {25'000'000};
};

// Opens path_to_hic at the given resolution (observed counts, BP units), calls fetch() without a
// query and advances the resulting float pixel iterator until either target_num_records
// increments were made or the end iterator is reached.
// NOTE(review): fetch() without arguments presumably selects the whole genome -- confirm against
// hic::HiCFile.
//
// Prints one tab-separated line to stdout:
//   path, resolution, number of increments, block cache size, elapsed time in seconds.
static void dump_genome_wide(const std::string& path_to_hic, std::uint32_t resolution,
                             std::size_t target_num_records, std::size_t block_cache_size) {
  using Clock = std::chrono::steady_clock;

  hic::HiCFile hf(path_to_hic, resolution, hic::MatrixType::observed, hic::MatrixUnit::BP,
                  block_cache_size);
  auto sel = hf.fetch();

  // The clock is started before the iterators are created, so their construction is timed too.
  const auto start_time = Clock::now();
  auto it = sel.begin<float>();
  auto sentinel = sel.end<float>();

  std::size_t num_records = 0;
  while (num_records < target_num_records && it != sentinel) {
    std::ignore = ++it;
    ++num_records;
  }

  const auto stop_time = Clock::now();

  // Elapsed time is measured with microsecond granularity and reported in seconds.
  const auto elapsed_us =
      std::chrono::duration_cast<std::chrono::microseconds>(stop_time - start_time).count();
  const auto elapsed_seconds = static_cast<double>(elapsed_us) / 1.0e6;

  fmt::print(FMT_STRING("{}\t{}\t{}\t{}\t{}\n"), path_to_hic, resolution, num_records,
             block_cache_size, elapsed_seconds);
}

// Opens path_to_hic at the given resolution (observed counts, BP units), calls fetch("chr1") and
// advances the resulting float pixel iterator until either target_num_records increments were made
// or the end iterator is reached.
// NOTE(review): the query is hard-coded to "chr1"; files using a different chromosome naming
// scheme are presumably rejected by fetch() -- confirm against hic::HiCFile.
//
// Prints one tab-separated line to stdout:
//   path, resolution, number of increments, block cache size, elapsed time in seconds.
static void dump(const std::string& path_to_hic, std::uint32_t resolution,
                 std::size_t target_num_records, std::size_t block_cache_size) {
  using Clock = std::chrono::steady_clock;

  hic::HiCFile const hf(path_to_hic, resolution, hic::MatrixType::observed, hic::MatrixUnit::BP,
                        block_cache_size);
  auto sel = hf.fetch("chr1");

  // The clock is started before the iterators are created, so their construction is timed too.
  const auto start_time = Clock::now();
  auto it = sel.begin<float>();
  auto sentinel = sel.end<float>();

  std::size_t num_records = 0;
  while (num_records < target_num_records && it != sentinel) {
    std::ignore = ++it;
    ++num_records;
  }

  const auto stop_time = Clock::now();

  // Elapsed time is measured with microsecond granularity and reported in seconds.
  const auto elapsed_us =
      std::chrono::duration_cast<std::chrono::microseconds>(stop_time - start_time).count();
  const auto elapsed_seconds = static_cast<double>(elapsed_us) / 1.0e6;

  fmt::print(FMT_STRING("{}\t{}\t{}\t{}\t{}\n"), path_to_hic, resolution, num_records,
             block_cache_size, elapsed_seconds);
}

int main(int argc, char** argv) {
CLI::App cli{};

Config c{};

cli.add_option("hic", c.path_to_hic, "Path to a .hic file.");
cli.add_option("resolution", c.resolutions, "Resolution in bp.");
cli.add_option("--target-num-records", c.target_num_records, "")->capture_default_str();
cli.add_option("--block-cache-size", c.block_cache_sizes, "")->capture_default_str();
cli.add_flag("--genome-wide", c.genome_wide, "")->capture_default_str();

try {
cli.parse(argc, argv);

fmt::print(FMT_STRING("file\tresolution\tnum_records\tblock_cache_size\ttime\n"));
for (const auto& resolution : c.resolutions) {
for (const auto& block_size : c.block_cache_sizes) {
c.genome_wide
? dump_genome_wide(c.path_to_hic, resolution, c.target_num_records, block_size)
: dump(c.path_to_hic, resolution, c.target_num_records, block_size);
}
}
} catch (const CLI::ParseError& e) {
return cli.exit(e);
}
}
2 changes: 1 addition & 1 deletion conanfile.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ fmt/10.0.0
hdf5/1.14.0
highfive/2.7.1
libdeflate/1.18
tsl-ordered-map/1.1.0
span-lite/0.10.3
zlib/1.2.13

[generators]
Expand Down
18 changes: 10 additions & 8 deletions src/bin_table/bin_table_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,17 @@
namespace hictk { // NOLINT

inline Bin::Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end_) noexcept
: Bin(Bin::null_id, chrom_, start_, end_) {}
: Bin(Bin::null_id, Bin::rel_null_id, chrom_, start_, end_) {}

inline Bin::Bin(std::uint64_t id_, const Chromosome &chrom_, std::uint32_t start_,
std::uint32_t end_) noexcept
: _id(id_), _interval(chrom_, start_, end_) {}
inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_,
std::uint32_t start_, std::uint32_t end_) noexcept
: _id(id_), _rel_id(rel_id_), _interval(chrom_, start_, end_) {}

inline Bin::Bin(GenomicInterval interval) noexcept : Bin(Bin::null_id, std::move(interval)) {}
inline Bin::Bin(GenomicInterval interval) noexcept
: Bin(Bin::null_id, Bin::rel_null_id, std::move(interval)) {}

inline Bin::Bin(std::uint64_t id_, GenomicInterval interval) noexcept
: _id(id_), _interval(std::move(interval)) {}
inline Bin::Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept
: _id(id_), _rel_id(rel_id_), _interval(std::move(interval)) {}

inline Bin::operator bool() const noexcept { return !!this->chrom(); }

Expand Down Expand Up @@ -72,6 +73,7 @@ inline bool Bin::operator>=(const Bin &other) const noexcept {
}

constexpr std::uint64_t Bin::id() const noexcept { return this->_id; }
constexpr std::uint32_t Bin::rel_id() const noexcept { return this->_rel_id; }
inline const GenomicInterval &Bin::interval() const noexcept { return this->_interval; }
inline const Chromosome &Bin::chrom() const noexcept { return this->interval().chrom(); }
constexpr std::uint32_t Bin::start() const noexcept { return this->_interval.start(); }
Expand Down Expand Up @@ -214,7 +216,7 @@ inline Bin BinTable::at_hint(std::uint64_t bin_id, const Chromosome &chrom) cons
assert(start < chrom.size());
const auto end = (std::min)(start + this->bin_size(), chrom.size());

return {bin_id, chrom, start, end};
return {bin_id, static_cast<std::uint32_t>(relative_bin_id), chrom, start, end};
}

inline std::pair<Bin, Bin> BinTable::at(const GenomicInterval &gi) const {
Expand Down
7 changes: 5 additions & 2 deletions src/bin_table/include/hictk/bin_table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,20 @@ namespace hictk {
class Bin {
public:
static constexpr std::uint64_t null_id{(std::numeric_limits<std::uint64_t>::max)()};
static constexpr std::uint32_t rel_null_id{(std::numeric_limits<std::uint32_t>::max)()};

private:
std::uint64_t _id{null_id};
std::uint32_t _rel_id{rel_null_id};
GenomicInterval _interval{};

public:
constexpr Bin() = default;
Bin(const Chromosome &chrom_, std::uint32_t start_, std::uint32_t end) noexcept;
Bin(std::uint64_t id_, const Chromosome &chrom_, std::uint32_t start_,
Bin(std::uint64_t id_, std::uint32_t rel_id_, const Chromosome &chrom_, std::uint32_t start_,
std::uint32_t end_) noexcept;
explicit Bin(GenomicInterval interval) noexcept;
Bin(std::uint64_t id, GenomicInterval interval) noexcept;
Bin(std::uint64_t id_, std::uint32_t rel_id_, GenomicInterval interval) noexcept;

[[nodiscard]] explicit operator bool() const noexcept;

Expand All @@ -44,6 +46,7 @@ class Bin {
[[nodiscard]] bool operator>=(const Bin &other) const noexcept;

[[nodiscard]] constexpr std::uint64_t id() const noexcept;
[[nodiscard]] constexpr std::uint32_t rel_id() const noexcept;
[[nodiscard]] const GenomicInterval &interval() const noexcept;
[[nodiscard]] const Chromosome &chrom() const noexcept;
[[nodiscard]] constexpr std::uint32_t start() const noexcept;
Expand Down
3 changes: 3 additions & 0 deletions src/common/include/hictk/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ namespace hictk {

inline const std::string_view HICTK_VERSION_STRING{hictk::config::version::str()};

namespace cooler {
// Magic values
inline constexpr std::string_view COOL_MAGIC{"HDF5::Cooler"};
inline constexpr std::string_view MCOOL_MAGIC{"HDF5::MCOOL"};
Expand Down Expand Up @@ -58,6 +59,8 @@ inline constexpr std::string_view SENTINEL_ATTR_NAME{"format-version"};
inline constexpr std::uint8_t SENTINEL_ATTR_VALUE{255};
} // namespace internal

} // namespace cooler

[[nodiscard]] constexpr bool ndebug_defined() noexcept {
#ifdef NDEBUG
return true;
Expand Down
18 changes: 8 additions & 10 deletions src/cooler/attribute_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include "hictk/suppress_warnings.hpp"
#include "hictk/type_pretty_printer.hpp"

namespace hictk {
namespace hictk::cooler {

template <typename ParentObj>
inline bool Attribute::exists(ParentObj& h5obj, std::string_view key) {
Expand Down Expand Up @@ -129,6 +129,7 @@ DISABLE_WARNING_UNREACHABLE_CODE
template <typename T1, typename Tout, typename Tin>
// NOLINTNEXTLINE(readability-function-cognitive-complexity)
inline Tout Attribute::numeric_converter(T1& buff) {
using namespace hictk::internal;
static_assert(!std::is_same_v<Tin, std::monostate>);

if constexpr (std::is_same_v<Tin, Tout>) {
Expand All @@ -143,12 +144,12 @@ inline Tout Attribute::numeric_converter(T1& buff) {
if constexpr (std::is_same_v<Tin, std::string> && std::is_arithmetic_v<Tout>) {
// Try to convert a string attribute to the appropriate numeric type
try {
return internal::parse_numeric_or_throw<Tout>(buff);
return parse_numeric_or_throw<Tout>(buff);
} catch (const std::exception& e) {
throw std::runtime_error(
fmt::format(FMT_STRING("Expected type {}, found std::string. An attempt to convert "
"std::string to {} was made, but failed. Reason {}"),
internal::type_name<Tout>(), internal::type_name<Tin>(), e.what()));
type_name<Tout>(), type_name<Tin>(), e.what()));
}
}

Expand All @@ -163,8 +164,7 @@ inline Tout Attribute::numeric_converter(T1& buff) {
throw std::runtime_error(
fmt::format(FMT_STRING("Expected type {}, found {}. Unable to represent value {} as {} "
"without information loss"),
internal::type_name<Tout>(), internal::type_name<Tin>(), buff,
internal::type_name<Tout>()));
type_name<Tout>(), type_name<Tin>(), buff, type_name<Tout>()));
}

if constexpr (std::is_integral_v<Tin> && std::is_integral_v<Tout>) {
Expand All @@ -189,16 +189,14 @@ inline Tout Attribute::numeric_converter(T1& buff) {
throw std::runtime_error(fmt::format(
FMT_STRING(
"Expected type {}, found {}. Unable to represent value {} as {} without overflowing"),
internal::type_name<Tout>(), internal::type_name<Tin>(), buff,
internal::type_name<Tout>()));
type_name<Tout>(), type_name<Tin>(), buff, type_name<Tout>()));
}
// No conversion was possible
throw std::runtime_error(fmt::format(
FMT_STRING(
"Expected type {}, found {}. Unable to safely convert value {} of type {} to type {}"),
internal::type_name<Tout>(), internal::type_name<Tin>(), buff, internal::type_name<Tin>(),
internal::type_name<Tout>()));
type_name<Tout>(), type_name<Tin>(), buff, type_name<Tin>(), type_name<Tout>()));
}
DISABLE_WARNING_POP

} // namespace hictk
} // namespace hictk::cooler
4 changes: 2 additions & 2 deletions src/cooler/balancing_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
#include <string_view>
#include <utility>

namespace hictk {
namespace hictk::cooler {

inline Weights::Weights(std::vector<double> weights, Type type) noexcept
: _weights(std::move(weights)), _type(type) {
Expand Down Expand Up @@ -252,4 +252,4 @@ inline auto Balancer<N, CHUNK_SIZE>::iterator::operator++(int) -> iterator {
return it;
}

} // namespace hictk
} // namespace hictk::cooler
4 changes: 2 additions & 2 deletions src/cooler/dataset_accessors_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#include <highfive/H5DataSet.hpp>
#include <highfive/H5DataType.hpp>

namespace hictk {
namespace hictk::cooler {

inline HighFive::DataSet Dataset::operator()() { return this->_dataset; }

Expand Down Expand Up @@ -95,4 +95,4 @@ inline auto Dataset::cend() const -> iterator<T, CHUNK_SIZE> {
return this->end<T, CHUNK_SIZE>();
}

} // namespace hictk
} // namespace hictk::cooler
4 changes: 2 additions & 2 deletions src/cooler/dataset_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

#include "hictk/common.hpp"

namespace hictk {
namespace hictk::cooler {

namespace internal {

Expand Down Expand Up @@ -180,4 +180,4 @@ inline HighFive::Selection Dataset::select(std::size_t i1, std::size_t i2) const
return this->_dataset.select(std::vector<std::size_t>{i1}, std::vector<std::size_t>{i2});
}

} // namespace hictk
} // namespace hictk::cooler
4 changes: 2 additions & 2 deletions src/cooler/dataset_iterator_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#include "hictk/common.hpp"
#include "hictk/type_pretty_printer.hpp"

namespace hictk {
namespace hictk::cooler {

template <typename T, std::size_t CHUNK_SIZE>
inline Dataset::iterator<T, CHUNK_SIZE>::iterator(const Dataset &dset, std::size_t h5_offset,
Expand Down Expand Up @@ -286,4 +286,4 @@ constexpr auto Dataset::iterator<T, CHUNK_SIZE>::make_end_iterator(const Dataset
return it;
}

} // namespace hictk
} // namespace hictk::cooler
Loading

0 comments on commit 43f68a1

Please sign in to comment.