From bb48cdbabb538509a4f829596d2ac6f07ac7c67f Mon Sep 17 00:00:00 2001 From: Eric Davis Date: Wed, 6 Jul 2022 20:01:28 -0400 Subject: [PATCH 1/3] Add function for reading available normalizations from .hic file. --- .gitignore | 1 + R/src/straw.cpp | 120 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/.gitignore b/.gitignore index 3d48517..335ea11 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,4 @@ dist env hic_straw.egg-info +.Rproj.user diff --git a/R/src/straw.cpp b/R/src/straw.cpp index 500fcd9..6e7c51c 100644 --- a/R/src/straw.cpp +++ b/R/src/straw.cpp @@ -1326,3 +1326,123 @@ Rcpp::NumericVector readHicBpResolutions(std::string fname) hiCFile->close(); return bpResolutions; } + +// Reads all normalizations from the footer +Rcpp::CharacterVector readNormsFromFooter(istream &fin, int64_t master, int32_t version) { + + // Initialize variable to store norm types + Rcpp::CharacterVector normTypes; + + // Read through the footer section + //nBytes + if (version > 8) { + readInt64FromFile(fin); + } else { + readInt32FromFile(fin); + } + + // nEntries + int32_t nEntries = readInt32FromFile(fin); + for (int i = 0; i < nEntries; i++) { + string str; + getline(fin, str, '\0'); + readInt64FromFile(fin); //fpos + readInt32FromFile(fin); //sizeInBytes + } + + // nExpectedValues + int32_t nExpectedValues = readInt32FromFile(fin); + for (int i = 0; i < nExpectedValues; i++) { + string unit0; + getline(fin, unit0, '\0'); //unit + readInt32FromFile(fin); + + int64_t nValues; + if (version > 8) { + nValues = readInt64FromFile(fin); + for (int j = 0; j < nValues; j++) { + readFloatFromFile(fin); + } + } else { + nValues = (int64_t) readInt32FromFile(fin); + for (int j = 0; j < nValues; j++) { + readDoubleFromFile(fin); + } + } + + int32_t nNormalizationFactors = readInt32FromFile(fin); + for (int j = 0; j < nNormalizationFactors; j++) { + readInt32FromFile(fin); //chrIdx + if (version > 8) { + readFloatFromFile(fin); //v + } else { + readDoubleFromFile(fin);//v + } + } + } + + // Needs to be read like this (readInt32FromFile doesn't work) + fin.read((char*)&nExpectedValues, sizeof(int32_t)); + for (int i = 0; i < nExpectedValues; i++) { + //Record available norm types (handling empty strings as NONE) + string type; + getline(fin, type, '\0'); //typeString + if (type == "") { + type = "NONE"; + } + normTypes.push_back(type); + + string unit0; + getline(fin, unit0, '\0'); //unit + readInt32FromFile(fin); + + int64_t nValues; + if (version > 8) { + nValues = readInt64FromFile(fin); + for (int j = 0; j < nValues; j++) { + readFloatFromFile(fin); //v + } + } else { + nValues = (int64_t) readInt32FromFile(fin); + for (int j = 0; j < nValues; j++) { + readDoubleFromFile(fin); //v + } + } + + int32_t nNormalizationFactors = readInt32FromFile(fin); + for (int j = 0; j < nNormalizationFactors; j++) { + readInt32FromFile(fin); //chrIdx + if (version > 8) { + readFloatFromFile(fin); //v + } else { + readDoubleFromFile(fin); //v + } + } + } + + // Include "NONE" + normTypes.push_back("NONE"); + + // Return unique norms + return unique(normTypes); +} + +//' Function for reading available normalizations from .hic file +//' +//' @param fname path to .hic file +//' @return Vector of available normalizations +//' @examples +//' readHicNormTypes(system.file("extdata", "test.hic", package = "strawr")) +//' @export +// [[Rcpp::export]] +Rcpp::CharacterVector readHicNormTypes(std::string fname) +{ + HiCFile *hiCFile = new HiCFile(std::move(fname)); + Rcpp::CharacterVector normTypes; + hiCFile->fin.seekg(hiCFile->master, ios::beg); + normTypes = readNormsFromFooter(hiCFile->fin, + hiCFile->master, + hiCFile->version); + hiCFile->close(); + return normTypes; +} From f3b0bdd6ffd1d3999d45cb507519125ceaea4427 Mon Sep 17 00:00:00 2001 From: Eric Davis Date: Wed, 6 Jul 2022 20:16:54 -0400 Subject: [PATCH 2/3] Update package with new readHicNormTypes function. --- R/DESCRIPTION | 2 +- R/NAMESPACE | 1 + R/R/RcppExports.R | 23 +++++++++++++++++------ R/man/readHicNormTypes.Rd | 20 ++++++++++++++++++++ R/src/RcppExports.cpp | 27 ++++++++++++++++++++++----- 5 files changed, 61 insertions(+), 12 deletions(-) create mode 100644 R/man/readHicNormTypes.Rd diff --git a/R/DESCRIPTION b/R/DESCRIPTION index 955cb63..d84a8d0 100644 --- a/R/DESCRIPTION +++ b/R/DESCRIPTION @@ -14,4 +14,4 @@ Encoding: UTF-8 Imports: Rcpp LinkingTo: Rcpp NeedsCompilation: yes -RoxygenNote: 7.1.1 +RoxygenNote: 7.2.0 diff --git a/R/NAMESPACE b/R/NAMESPACE index 25c2708..8ba6cef 100644 --- a/R/NAMESPACE +++ b/R/NAMESPACE @@ -2,6 +2,7 @@ export(readHicBpResolutions) export(readHicChroms) +export(readHicNormTypes) export(straw) import(Rcpp) useDynLib(strawr) diff --git a/R/R/RcppExports.R b/R/R/RcppExports.R index fd4dbf1..d0b1a6a 100644 --- a/R/R/RcppExports.R +++ b/R/R/RcppExports.R @@ -30,6 +30,17 @@ straw <- function(norm, fname, chr1loc, chr2loc, unit, binsize, matrix = "observ .Call('_strawr_straw', PACKAGE = 'strawr', norm, fname, chr1loc, chr2loc, unit, binsize, matrix) } +#' Function for reading chromosomes from .hic file +#' +#' @param fname path to .hic file +#' @return Data frame of chromosome names and lengths +#' @examples +#' readHicChroms(system.file("extdata", "test.hic", package = "strawr")) +#' @export +readHicChroms <- function(fname) { + .Call('_strawr_readHicChroms', PACKAGE = 'strawr', fname) +} + #' Function for reading basepair resolutions from .hic file #' #' @param fname path to .hic file @@ -41,14 +52,14 @@ readHicBpResolutions <- function(fname) { .Call('_strawr_readHicBpResolutions', PACKAGE = 'strawr', fname) } -#' Function for reading chromosomes from .hic file -#' +#' Function for reading available normalizations from .hic file +#' #' @param fname path to .hic file -#' @return Data frame of chromosome names and lengths +#' @return Vector of available normalizations #' @examples -#' readHicChroms(system.file("extdata", "test.hic", package = "strawr")) +#' readHicNormTypes(system.file("extdata", "test.hic", package = "strawr")) #' @export -readHicChroms <- function(fname) { - .Call('_strawr_readHicChroms', PACKAGE = 'strawr', fname) +readHicNormTypes <- function(fname) { + .Call('_strawr_readHicNormTypes', PACKAGE = 'strawr', fname) } diff --git a/R/man/readHicNormTypes.Rd b/R/man/readHicNormTypes.Rd new file mode 100644 index 0000000..486ef8f --- /dev/null +++ b/R/man/readHicNormTypes.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{readHicNormTypes} +\alias{readHicNormTypes} +\title{Function for reading available normalizations from .hic file} +\usage{ +readHicNormTypes(fname) +} +\arguments{ +\item{fname}{path to .hic file} +} +\value{ +Vector of available normalizations +} +\description{ +Function for reading available normalizations from .hic file +} +\examples{ +readHicNormTypes(system.file("extdata", "test.hic", package = "strawr")) +} diff --git a/R/src/RcppExports.cpp b/R/src/RcppExports.cpp index 50d05ef..b7a0fe0 100644 --- a/R/src/RcppExports.cpp +++ b/R/src/RcppExports.cpp @@ -5,6 +5,11 @@ using namespace Rcpp; +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + // straw Rcpp::DataFrame straw(std::string norm, std::string fname, std::string chr1loc, std::string chr2loc, const std::string& unit, int32_t binsize, std::string matrix); RcppExport SEXP _strawr_straw(SEXP normSEXP, SEXP fnameSEXP, SEXP chr1locSEXP, SEXP chr2locSEXP, SEXP unitSEXP, SEXP binsizeSEXP, SEXP matrixSEXP) { @@ -22,6 +27,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// readHicChroms +Rcpp::DataFrame readHicChroms(std::string fname); +RcppExport SEXP _strawr_readHicChroms(SEXP fnameSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< std::string >::type fname(fnameSEXP); + rcpp_result_gen = Rcpp::wrap(readHicChroms(fname)); + return rcpp_result_gen; +END_RCPP +} // readHicBpResolutions Rcpp::NumericVector readHicBpResolutions(std::string fname); RcppExport SEXP _strawr_readHicBpResolutions(SEXP fnameSEXP) { @@ -33,22 +49,23 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// readHicChroms -Rcpp::DataFrame readHicChroms(std::string fname); -RcppExport SEXP _strawr_readHicChroms(SEXP fnameSEXP) { +// readHicNormTypes +Rcpp::CharacterVector readHicNormTypes(std::string fname); +RcppExport SEXP _strawr_readHicNormTypes(SEXP fnameSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< std::string >::type fname(fnameSEXP); - rcpp_result_gen = Rcpp::wrap(readHicChroms(fname)); + rcpp_result_gen = Rcpp::wrap(readHicNormTypes(fname)); return rcpp_result_gen; END_RCPP } static const R_CallMethodDef CallEntries[] = { {"_strawr_straw", (DL_FUNC) &_strawr_straw, 7}, - {"_strawr_readHicBpResolutions", (DL_FUNC) &_strawr_readHicBpResolutions, 1}, {"_strawr_readHicChroms", (DL_FUNC) &_strawr_readHicChroms, 1}, + {"_strawr_readHicBpResolutions", (DL_FUNC) &_strawr_readHicBpResolutions, 1}, + {"_strawr_readHicNormTypes", (DL_FUNC) &_strawr_readHicNormTypes, 1}, {NULL, NULL, 0} }; From b57a3a6a97b959e959104be955605cbe74eda1cc Mon Sep 17 00:00:00 2001 From: Eric Davis Date: Wed, 31 Aug 2022 13:46:49 -0400 Subject: [PATCH 3/3] Modify readHicChroms to also return the indices for each chromosome. Useful for mapping the correct order of chromosomes upstream of straw. --- R/src/straw.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/R/src/straw.cpp b/R/src/straw.cpp index 6e7c51c..20788aa 100644 --- a/R/src/straw.cpp +++ b/R/src/straw.cpp @@ -1291,7 +1291,7 @@ vector getChromosomes(string fname){ //' Function for reading chromosomes from .hic file //' //' @param fname path to .hic file -//' @return Data frame of chromosome names and lengths +//' @return Data frame of chromosome indices, names and lengths //' @examples //' readHicChroms(system.file("extdata", "test.hic", package = "strawr")) //' @export @@ -1299,13 +1299,15 @@ vector getChromosomes(string fname){ Rcpp::DataFrame readHicChroms(std::string fname) { vector chroms = getChromosomes(std::move(fname)); + Rcpp::NumericVector indices; Rcpp::StringVector names; Rcpp::NumericVector lengths; for (std::vector::iterator it = chroms.begin(); it != chroms.end(); ++it) { + indices.push_back(it->index); names.push_back(it->name); lengths.push_back(it->length); } - return Rcpp::DataFrame::create(Rcpp::Named("name") = names, Rcpp::Named("length") = lengths); + return Rcpp::DataFrame::create(Rcpp::Named("index") = indices, Rcpp::Named("name") = names, Rcpp::Named("length") = lengths); } //' Function for reading basepair resolutions from .hic file