From db792a8ec63cd410864855e47a6f8b1572fd409d Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Fri, 19 Jul 2024 13:01:02 +0200 Subject: [PATCH 01/22] test and input/output files for the select_mz function --- R/feature.align.R | 5 +++-- tests/testdata/aligned/output_select-mz.rds | Bin 0 -> 207 bytes .../input/feature-align_select-mz.parquet | Bin 0 -> 2902 bytes tests/testthat/test-feature-align_select-mz.R | 15 +++++++++++++++ 4 files changed, 18 insertions(+), 2 deletions(-) create mode 100644 tests/testdata/aligned/output_select-mz.rds create mode 100644 tests/testdata/input/feature-align_select-mz.parquet create mode 100644 tests/testthat/test-feature-align_select-mz.R diff --git a/R/feature.align.R b/R/feature.align.R index acc239b..0aa759f 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -155,13 +155,14 @@ create_aligned_feature_table <- function(features_table, sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence]) # retention time alignment + aligned_features <- foreach::foreach( i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE - ) %dopar% { + ) %do% { rows <- create_rows( features_table, i, - sel.labels, + sel.labels, mz_tol_relative, rt_tol_relative, min_occurrence, diff --git a/tests/testdata/aligned/output_select-mz.rds b/tests/testdata/aligned/output_select-mz.rds new file mode 100644 index 0000000000000000000000000000000000000000..ed8d4b2f622a82cddc3a57d6b869a5d04a29a2d3 GIT binary patch literal 207 zcmV;=05Ja_iwFP!000001B>8dU|?WoU}0uvU}gm}8CXL@+;lA%7?^~?5)2G{K+NS3 z-DJ+{BkBsI&CGcNRGfOIw1lfv8z&r+qe*nhe JlYb8Z006X$Us?bF literal 0 HcmV?d00001 diff --git a/tests/testdata/input/feature-align_select-mz.parquet b/tests/testdata/input/feature-align_select-mz.parquet new file mode 100644 index 0000000000000000000000000000000000000000..32e5c5d7760a55671ea1928a5e8d32e539c47236 GIT binary patch literal 2902 zcmcguZ)jUp6u)^%lh^F6d)C~?iv+3CU5j)xjrI>J`0h)$n3!#CP)yfB&61c^(={ep 
zTWkF?r0V{NOouQ;1ov%%=m$|j83hZ<5FFJH6F)dVyAL8N3gS8U)6=ISwmY}uZ)AdaU(hJk&iskfBVjB zFn#gkR}H}p1wjbRqx8NsY!O-NN4%Gx|LyE2 zCF9gz-=8}DUdaeP7kc-LuS-ZLM0F<}dqtORUk&KEt&k)*Xwi|pgT3s-9tWFhU9h?4 z(XM;gMIYPp0v$I-2juPP{HUIN>*YSxQ2pjl-(2l6zI|wR^i#jL4ISH4mu(+R>bADh z*^--dytTrJgU?2Q{fvcogLHu2p3iFT?l<0_m5fV)Ki<5u;WaMZacAtV9$y|-D71VV0<4%lr8tsFaIy^94ch zf~5WKVH>xwEj98l!P}BkX(&x4(H!Imuwuk-R&x@%a@#W z`=b317eU)pG=<69RV747IHD^&lR5PG{nowE-}c>uW0aem^J)0wc!cw1ZkG6d&PzOA z5;hYY4xkq2Jc$zKwuD(8O*KTAK;bT%j??aVOLB3^>)53`-gwak7xVV-$ zSSw@?Mh|C`d}3+b@-OikZWUHdGb4h;&T!+6kwTe|?XTw(E9Gpg{4~yC9r7Z+nYZKl z3a)UmKS#XS-%K`Do)OzI4>kPbJoyx;$ZX%Fa55cy1QZ^Pxq5gq$qP&1iX2Mu Date: Fri, 19 Jul 2024 13:43:00 +0200 Subject: [PATCH 02/22] create_empty_tibble added to @export --- R/feature.align.R | 20 ++++---------------- R/utils.R | 1 + 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/R/feature.align.R b/R/feature.align.R index 0aa759f..3b95a56 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -1,5 +1,6 @@ #' @import foreach +#' @export create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) { features <- new("list") features$metadata <- tibble::as_tibble(matrix(nrow = 0, ncol = length(metadata_colnames)), .name_repair = ~metadata_colnames) @@ -85,23 +86,12 @@ select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, #' @export create_rows <- function(features, - i, - sel.labels, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) { - if (i %% 100 == 0) { - gc() - } # call Garbage Collection for performance improvement? 
- - sample <- dplyr::filter(features, cluster == sel.labels[i]) - if (nrow(sample) > 1) { - if (validate_contents(sample, min_occurrence)) { - return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names)) - } - } else if (min_occurrence == 1) { - return(create_output(sample_grouped, sample_names)) + if (validate_contents(features, min_occurrence)) { + return(select_mz(features, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names)) } return(NULL) } @@ -160,9 +150,7 @@ create_aligned_feature_table <- function(features_table, i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE ) %do% { rows <- create_rows( - features_table, - i, - sel.labels, + dplyr::filter(features_table, cluster == sel.labels[i]), mz_tol_relative, rt_tol_relative, min_occurrence, diff --git a/R/utils.R b/R/utils.R index 14f4827..781bc71 100644 --- a/R/utils.R +++ b/R/utils.R @@ -45,6 +45,7 @@ register_functions_to_cluster <- function(cluster) { 'compute_uniq_grp', 'predict_smoothed_rt', 'label_val_to_keep', + "create_empty_tibble", "create_rows", "validate_contents", "select_mz", From a4cc11adcaf15b4b1b3beeccde408c2b4cb879ac Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Fri, 19 Jul 2024 16:14:44 +0200 Subject: [PATCH 03/22] documentation to functions added --- R/feature.align.R | 56 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 7 deletions(-) diff --git a/R/feature.align.R b/R/feature.align.R index 3b95a56..d391f1a 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -1,5 +1,11 @@ #' @import foreach +#' Create an empty tibble for the next alignment step. It will contain three tables with aligned metadata, intensities an RTs. +#' @param number_of_samples Number of different sample names. +#' @param metadata_colnames Metadata column names: "id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names +#' @param intensity_colnames "id" and sample names; will hold intensities. 
+#' @param rt_colnames "id" and sample names; will hold retention times. +#' @return An empty tibble with slots for metadata, intensities and RTs. #' @export create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) { features <- new("list") @@ -9,6 +15,10 @@ create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_ return(features) } +#' Create a list containing 3 tibbles: metadata, intensities and RTs. +#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. +#' @param sample_names A list of sample names. +#' @return A list containing 3 tibbles: metadata, intensities and RTs. #' @export create_output <- function(sample_grouped, sample_names) { number_of_samples <- length(sample_names) @@ -32,9 +42,12 @@ create_output <- function(sample_grouped, sample_names) { return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row)) } +#' Validates if the data is present in more than "min_occurence" of samples. +#' @param samples A subset of the features_table. +#' @param min_occurrence A minimal number of profiles a feature has to be present in. +#' @return boolean value whether it is TRUE or FALSE. #' @export validate_contents <- function(samples, min_occurrence) { - # validate whether data is still from at least 'min_occurrence' number of samples if (!is.null(nrow(samples))) { if (length(unique(samples$sample_id)) >= min_occurrence) { return(TRUE) @@ -44,24 +57,37 @@ validate_contents <- function(samples, min_occurrence) { return(FALSE) } +#' Compute the kernel density estimation and find the peaks and valleys of a smooth curve. +#' @param data A vector of m/z or RTs for a particular cluster. +#' @param bandwidth A bandwidth value for the KDE computation. +#' @return A list of peaks and valleys positions. 
#' @export find_optima <- function(data, bandwidth) { - # Kernel Density Estimation - den <- density(data, bw = bandwidth) - # select statistically significant points - turns <- find.turn.point(den$y) - return(list(peaks = den$x[turns$pks], valleys = den$x[turns$vlys])) + den <- density(data, bw = bandwidth) + turns <- find.turn.point(den$y) + return(list(peaks = den$x[turns$pks], valleys = den$x[turns$vlys])) } +#' Subset data within lower and upper bound from density estimation +#' @param sample A subset of the features_table. +#' @param turns A list of peaks and valleys positions. +#' @param index Whether it subsets on m/z [1] or RT [2] column. +#' @param i Iterates over the peaks in the turns list. +#' @return Dataframe subsetted within lower and upper bound from density estimation. #' @export filter_based_on_density <- function(sample, turns, index, i) { - # select data within lower and upper bound from density estimation lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]]) upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]]) selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound) return(sample[selected, ]) } +#' Groups the features across samples based on RT. +#' @param sample A dataframe subsetted for the particular cluster. +#' @param rt_tol_relative The retention time tolerance level for peak alignment. +#' @param min_occurence A minimal number of profiles a feature has to be present in. +#' @param sample_names A list of sample names. +#' @param return A list containing 3 tibbles: metadata, intensities and RTs. #' @export select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) { turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414) @@ -73,6 +99,13 @@ select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) { } } +#' Groups the features across samples based on m/z. +#' @param sample A dataframe subsetted for the particular cluster. 
+#' @param mz_tol_relative The m/z tolerance level for peak alignment. +#' @param rt_tol_relative The retention time tolerance level for peak alignment. +#' @param min_occurence A minimal number of profiles a feature has to be present in. +#' @param sample_names A list of sample names. +#' @return A list containing 3 tibbles: metadata, intensities and RTs. #' @export select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) { turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz)) @@ -84,6 +117,13 @@ select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, } } +#' Groups the mz and RT for particular cluster. +#' @param features The features table subsetted for a particular cluster. +#' @param mz_tol_relative The m/z tolerance level for peak alignment. +#' @param rt_tol_relative The retention time tolerance level for peak alignment. +#' @param min_occurrence A minimal number of profiles a feature has to be present in. +#' @param sample_names A list of sample names. +#' @return A list containing 3 tibbles: metadata, intensities and RTs. #' @export create_rows <- function(features, mz_tol_relative, @@ -96,6 +136,8 @@ create_rows <- function(features, return(NULL) } +#' Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble. +#' @return Tibbles combining the output (metadata, intensity and RT respectively) from different clusters. #' @export comb <- function(x, ...) 
{ mapply(tibble::as_tibble, (mapply(rbind, x, ..., SIMPLIFY = FALSE))) From 127e714f7d6fe32de2720c6f3db5e6b3db1d8577 Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Fri, 19 Jul 2024 16:21:41 +0200 Subject: [PATCH 04/22] roxygen updated documentation --- DESCRIPTION | 2 +- NAMESPACE | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 80196d4..3b1effc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -17,4 +17,4 @@ NeedsCompilation: no Suggests: dataCompareR, testthat (>= 3.0.0), microbenchmark Config/testthat/edition: 3 -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index 1061d6a..995a087 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -42,6 +42,7 @@ export(compute_uniq_grp) export(correct_time) export(count_peaks) export(create_aligned_feature_table) +export(create_empty_tibble) export(create_output) export(create_rows) export(draw_rt_correction_plot) @@ -101,19 +102,37 @@ export(two.step.hybrid) export(unsupervised) export(validate_contents) export(validate_model_method_input) +import("for") +import("metadata,") +import("next") +import(Create) +import(It) import(MASS) +import(RTs.) +import(aligned) +import(alignment) +import(an) import(arrow) +import(contain) import(doParallel) import(dplyr) +import(empty) import(foreach) +import(intensities) import(mzR) import(parallel) import(snow) import(splines) +import(step.) 
import(stringr) +import(tables) +import(the) +import(three) import(tibble) import(tidyr) import(tools) +import(will) +import(with) importFrom(dplyr,arrange) importFrom(dplyr,between) importFrom(dplyr,bind_rows) From b2ca09dd21f5c7bbea25b861d76b0d2e0b0518de Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Mon, 22 Jul 2024 08:56:31 +0200 Subject: [PATCH 05/22] weird change --- R/feature.align.R | 186 +++++++++++++++++++++++----------------------- 1 file changed, 93 insertions(+), 93 deletions(-) diff --git a/R/feature.align.R b/R/feature.align.R index d391f1a..16a2fc5 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -8,11 +8,11 @@ #' @return An empty tibble with slots for metadata, intensities and RTs. #' @export create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) { - features <- new("list") - features$metadata <- tibble::as_tibble(matrix(nrow = 0, ncol = length(metadata_colnames)), .name_repair = ~metadata_colnames) - features$intensity <- tibble::as_tibble(matrix(nrow = 0, ncol = length(intensity_colnames)), .name_repair = ~intensity_colnames) - features$rt <- tibble::as_tibble(matrix(nrow = 0, ncol = length(rt_colnames)), .name_repair = ~rt_colnames) - return(features) + features <- new("list") + features$metadata <- tibble::as_tibble(matrix(nrow = 0, ncol = length(metadata_colnames)), .name_repair = ~metadata_colnames) + features$intensity <- tibble::as_tibble(matrix(nrow = 0, ncol = length(intensity_colnames)), .name_repair = ~intensity_colnames) + features$rt <- tibble::as_tibble(matrix(nrow = 0, ncol = length(rt_colnames)), .name_repair = ~rt_colnames) + return(features) } #' Create a list containing 3 tibbles: metadata, intensities and RTs. @@ -21,25 +21,25 @@ create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_ #' @return A list containing 3 tibbles: metadata, intensities and RTs. 
#' @export create_output <- function(sample_grouped, sample_names) { - number_of_samples <- length(sample_names) - intensity_row <- rep(0, number_of_samples) - rt_row <- rep(0, number_of_samples) - sample_presence <- rep(0, number_of_samples) - - for (i in seq_along(intensity_row)) { - filtered <- filter(sample_grouped, sample_id == sample_names[i]) - if (nrow(filtered) != 0) { - sample_presence[i] <- 1 - intensity_row[i] <- sum(filtered$area) - rt_row[i] <- median(filtered$rt) - } + number_of_samples <- length(sample_names) + intensity_row <- rep(0, number_of_samples) + rt_row <- rep(0, number_of_samples) + sample_presence <- rep(0, number_of_samples) + + for (i in seq_along(intensity_row)) { + filtered <- filter(sample_grouped, sample_id == sample_names[i]) + if (nrow(filtered) != 0) { + sample_presence[i] <- 1 + intensity_row[i] <- sum(filtered$area) + rt_row[i] <- median(filtered$rt) } + } - mz <- sample_grouped$mz - rt <- sample_grouped$rt - metadata_row <- c(mean(mz), min(mz), max(mz), mean(rt), min(rt), max(rt), nrow(sample_grouped), sample_presence) + mz <- sample_grouped$mz + rt <- sample_grouped$rt + metadata_row <- c(mean(mz), min(mz), max(mz), mean(rt), min(rt), max(rt), nrow(sample_grouped), sample_presence) - return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row)) + return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row)) } #' Validates if the data is present in more than "min_occurence" of samples. @@ -48,13 +48,13 @@ create_output <- function(sample_grouped, sample_names) { #' @return boolean value whether it is TRUE or FALSE. 
#' @export validate_contents <- function(samples, min_occurrence) { - if (!is.null(nrow(samples))) { - if (length(unique(samples$sample_id)) >= min_occurrence) { - return(TRUE) - } - return(FALSE) + if (!is.null(nrow(samples))) { + if (length(unique(samples$sample_id)) >= min_occurrence) { + return(TRUE) } return(FALSE) + } + return(FALSE) } #' Compute the kernel density estimation and find the peaks and valleys of a smooth curve. @@ -76,10 +76,10 @@ find_optima <- function(data, bandwidth) { #' @return Dataframe subsetted within lower and upper bound from density estimation. #' @export filter_based_on_density <- function(sample, turns, index, i) { - lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]]) - upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]]) - selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound) - return(sample[selected, ]) + lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]]) + upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]]) + selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound) + return(sample[selected, ]) } #' Groups the features across samples based on RT. @@ -90,13 +90,13 @@ filter_based_on_density <- function(sample, turns, index, i) { #' @param return A list containing 3 tibbles: metadata, intensities and RTs. 
#' @export select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) { - turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414) - for (i in seq_along(turns$peaks)) { - sample_grouped <- filter_based_on_density(sample, turns, 2, i) - if (validate_contents(sample_grouped, min_occurrence)) { - return(create_output(sample_grouped, sample_names)) - } + turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414) + for (i in seq_along(turns$peaks)) { + sample_grouped <- filter_based_on_density(sample, turns, 2, i) + if (validate_contents(sample_grouped, min_occurrence)) { + return(create_output(sample_grouped, sample_names)) } + } } #' Groups the features across samples based on m/z. @@ -108,13 +108,13 @@ select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) { #' @return A list containing 3 tibbles: metadata, intensities and RTs. #' @export select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) { - turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz)) - for (i in seq_along(turns$peaks)) { - sample_grouped <- filter_based_on_density(sample, turns, 1, i) - if (validate_contents(sample_grouped, min_occurrence)) { - return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, sample_names)) - } + turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz)) + for (i in seq_along(turns$peaks)) { + sample_grouped <- filter_based_on_density(sample, turns, 1, i) + if (validate_contents(sample_grouped, min_occurrence)) { + return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, sample_names)) } + } } #' Groups the mz and RT for particular cluster. 
@@ -130,17 +130,17 @@ create_rows <- function(features, rt_tol_relative, min_occurrence, sample_names) { - if (validate_contents(features, min_occurrence)) { - return(select_mz(features, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names)) - } - return(NULL) + if (validate_contents(features, min_occurrence)) { + return(select_mz(features, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names)) + } + return(NULL) } #' Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble. #' @return Tibbles combining the output (metadata, intensity and RT respectively) from different clusters. #' @export comb <- function(x, ...) { - mapply(tibble::as_tibble, (mapply(rbind, x, ..., SIMPLIFY = FALSE))) + mapply(tibble::as_tibble, (mapply(rbind, x, ..., SIMPLIFY = FALSE))) } #' Align peaks from spectra into a feature table. @@ -163,54 +163,54 @@ create_aligned_feature_table <- function(features_table, rt_tol_relative, mz_tol_relative, cluster = 4) { - if (!is(cluster, "cluster")) { - cluster <- parallel::makeCluster(cluster) - on.exit(parallel::stopCluster(cluster)) - - # NOTE: side effect (doParallel has no functionality to clean up) - doParallel::registerDoParallel(cluster) - register_functions_to_cluster(cluster) + if (!is(cluster, "cluster")) { + cluster <- parallel::makeCluster(cluster) + on.exit(parallel::stopCluster(cluster)) + + # NOTE: side effect (doParallel has no functionality to clean up) + doParallel::registerDoParallel(cluster) + register_functions_to_cluster(cluster) + } + + + + number_of_samples <- length(sample_names) + metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names) + intensity_colnames <- c("id", sample_names) + rt_colnames <- c("id", sample_names) + + aligned_features <- create_empty_tibble(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) + + # table with number of values per group + groups_cardinality <- 
table(features_table$cluster) + # count those with minimal occurrence + sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence]) + + # retention time alignment + + aligned_features <- foreach::foreach( + i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE + ) %do% { + rows <- create_rows( + dplyr::filter(features_table, cluster == sel.labels[i]), + mz_tol_relative, + rt_tol_relative, + min_occurrence, + sample_names + ) + + if (!is.null(rows)) { + rows$metadata_row <- c(i, rows$metadata_row) + rows$intensity_row <- c(i, rows$intensity_row) + rows$rt_row <- c(i, rows$rt_row) } + list(metadata = rows$metadata_row, intensity = rows$intensity_row, rt = rows$rt_row) + } + colnames(aligned_features$metadata) <- metadata_colnames + colnames(aligned_features$intensity) <- intensity_colnames + colnames(aligned_features$rt) <- rt_colnames - number_of_samples <- length(sample_names) - metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names) - intensity_colnames <- c("id", sample_names) - rt_colnames <- c("id", sample_names) - - aligned_features <- create_empty_tibble(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) - - # table with number of values per group - groups_cardinality <- table(features_table$cluster) - # count those with minimal occurrence - sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence]) - - # retention time alignment - - aligned_features <- foreach::foreach( - i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE - ) %do% { - rows <- create_rows( - dplyr::filter(features_table, cluster == sel.labels[i]), - mz_tol_relative, - rt_tol_relative, - min_occurrence, - sample_names - ) - - if (!is.null(rows)) { - rows$metadata_row <- c(i, rows$metadata_row) - rows$intensity_row <- c(i, rows$intensity_row) - rows$rt_row <- c(i, rows$rt_row) - } - - list(metadata = rows$metadata_row, intensity = 
rows$intensity_row, rt = rows$rt_row) - } - - colnames(aligned_features$metadata) <- metadata_colnames - colnames(aligned_features$intensity) <- intensity_colnames - colnames(aligned_features$rt) <- rt_colnames - - return(aligned_features) + return(aligned_features) } From 3bede62beeaa1dbb3047036d75ac3fcbe9c29f87 Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Mon, 22 Jul 2024 11:27:56 +0200 Subject: [PATCH 06/22] state pre refactoring --- R/feature.align.R | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/R/feature.align.R b/R/feature.align.R index 16a2fc5..cf93b83 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -1,7 +1,8 @@ #' @import foreach #' Create an empty tibble for the next alignment step. It will contain three tables with aligned metadata, intensities an RTs. -#' @param number_of_samples Number of different sample names. +#' @param number_of_samples Number +#' of different sample names. #' @param metadata_colnames Metadata column names: "id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names #' @param intensity_colnames "id" and sample names; will hold intensities. #' @param rt_colnames "id" and sample names; will hold retention times. @@ -15,30 +16,48 @@ create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_ return(features) } +create_metadata <- function(sample_grouped, sample_names) { + sample_presence <- sapply(sample_names, + FUN=function(x) { + as.numeric(any(sample_grouped$sample_id == x)) + } + ) + + metadata_row <- dplyr::summarise( + sample_grouped, + mzmean = mean(mz), + mzmin = min(mz), + mzmax = max(mz), + rtmean = mean(rt), + rtmin = min(rt), + rtmax = max(rt), + npeaks = n() + ) %>% rename(mz = "mzmean", rt = "rtmean") + + metadata_row <- dplyr::bind_cols(metadata_row, as.list(sample_presence)) + return(as.vector(unlist(metadata_row[1,]))) +} + #' Create a list containing 3 tibbles: metadata, intensities and RTs. 
#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. #' @param sample_names A list of sample names. #' @return A list containing 3 tibbles: metadata, intensities and RTs. #' @export create_output <- function(sample_grouped, sample_names) { + metadata_row <- create_metadata(sample_grouped, sample_names) + number_of_samples <- length(sample_names) intensity_row <- rep(0, number_of_samples) rt_row <- rep(0, number_of_samples) - sample_presence <- rep(0, number_of_samples) for (i in seq_along(intensity_row)) { filtered <- filter(sample_grouped, sample_id == sample_names[i]) + if (nrow(filtered) != 0) { - sample_presence[i] <- 1 intensity_row[i] <- sum(filtered$area) rt_row[i] <- median(filtered$rt) } } - - mz <- sample_grouped$mz - rt <- sample_grouped$rt - metadata_row <- c(mean(mz), min(mz), max(mz), mean(rt), min(rt), max(rt), nrow(sample_grouped), sample_presence) - return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row)) } @@ -172,8 +191,6 @@ create_aligned_feature_table <- function(features_table, register_functions_to_cluster(cluster) } - - number_of_samples <- length(sample_names) metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names) intensity_colnames <- c("id", sample_names) From 2036537090ce0dad6637690351faf5ba0533cd5f Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Mon, 22 Jul 2024 12:42:52 +0200 Subject: [PATCH 07/22] updated select_mz to return tibble rows --- R/feature.align.R | 35 ++++++++++++-------- tests/testdata/aligned/output_select-mz.rds | Bin 207 -> 354 bytes 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/R/feature.align.R b/R/feature.align.R index cf93b83..f85b38f 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -35,7 +35,11 @@ create_metadata <- function(sample_grouped, sample_names) { ) %>% rename(mz = "mzmean", rt = "rtmean") metadata_row <- dplyr::bind_cols(metadata_row, 
as.list(sample_presence)) - return(as.vector(unlist(metadata_row[1,]))) + return(metadata_row) +} + +first_tibble_row_as_vector <- function(x) { + return(as.vector(unlist(x[1,]))) } #' Create a list containing 3 tibbles: metadata, intensities and RTs. @@ -46,19 +50,22 @@ create_metadata <- function(sample_grouped, sample_names) { create_output <- function(sample_grouped, sample_names) { metadata_row <- create_metadata(sample_grouped, sample_names) - number_of_samples <- length(sample_names) - intensity_row <- rep(0, number_of_samples) - rt_row <- rep(0, number_of_samples) - - for (i in seq_along(intensity_row)) { - filtered <- filter(sample_grouped, sample_id == sample_names[i]) - - if (nrow(filtered) != 0) { - intensity_row[i] <- sum(filtered$area) - rt_row[i] <- median(filtered$rt) - } - } - return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row)) + intensity_row <- sample_grouped %>% + group_by(sample_id) %>% + summarise(intensity = sum(area)) %>% + pivot_wider(names_from = "sample_id", values_from = "intensity") + + + rt_row <- sample_grouped %>% + group_by(sample_id) %>% + summarise(rt = median(rt)) %>% + pivot_wider(names_from = "sample_id", values_from = "rt") + + return(list( + metadata_row = (metadata_row), + intensity_row = (intensity_row), + rt_row = (rt_row) + )) } #' Validates if the data is present in more than "min_occurence" of samples. diff --git a/tests/testdata/aligned/output_select-mz.rds b/tests/testdata/aligned/output_select-mz.rds index ed8d4b2f622a82cddc3a57d6b869a5d04a29a2d3..72675305ea18a1197b783bf0395c7e2ac029f6aa 100644 GIT binary patch delta 324 zcmV-K0lWUs0pbFXCx6T!8l;F1h#4KCo6K2#L|u{iW@f@yP9e(|wAF~g`RV%>@>lJ> zi{P*C?z;Q03&Fp5a*aqe$b4Q9A7Up&&i(@f3=rV6FoE^5FmQrAkeriPTnuCgKzs%k zWGhL^iBCy`@|b}DB|Wg#4V7-85sUS;}jmwj)CkCyQhBRgYyH}+a{}|Er#>GDt|j1JIV?1HsgP? 
zgO=$(_K;*oH$9Rj-gHRc)%(5n01_WpQXFF5#A-HAZfZ#)$Yt@sWDFJK&CCO8E6yyb WL=t2xDgpC=MgaiHc|JW11ONawX^q7I delta 176 zcmV;h08jto0?z@ECw~llK+NS3-DJ+{BkBsI&CG Date: Tue, 23 Jul 2024 08:49:33 +0200 Subject: [PATCH 08/22] test updated for refactored code --- tests/testthat/test-feature-align_select-mz.R | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-feature-align_select-mz.R b/tests/testthat/test-feature-align_select-mz.R index 6148bdb..59261c5 100644 --- a/tests/testthat/test-feature-align_select-mz.R +++ b/tests/testthat/test-feature-align_select-mz.R @@ -1,15 +1,16 @@ -test_that("select_mz function works", { - sample <- read_parquet("../testdata/input/feature-align_select-mz.parquet") +test_that("create_features_from_cluster() function works", { + sample <- read_parquet("../testdata/input/feature-align_create-features.parquet") sample_names <- c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened") min_occurrence <- 2 mz_tol_relative <- 6.85676325338646e-06 rt_tol_relative <- 2.17918873407775 - actual <- select_mz(sample, + actual <- create_features_from_cluster(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) - expected <- readRDS("../testdata/aligned/output_select-mz.rds") + + expected <- readRDS("../testdata/aligned/output_create-features.rds") expect_equal(actual, expected) }) \ No newline at end of file From b5f78ff2b321cfe33a4ed98a333ad53a7eb7cdbc Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Tue, 23 Jul 2024 08:56:17 +0200 Subject: [PATCH 09/22] tests updated for refactored code --- tests/testthat/test-feature-align.R | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-feature-align.R b/tests/testthat/test-feature-align.R index c627879..78aac89 100644 --- a/tests/testthat/test-feature-align.R +++ b/tests/testthat/test-feature-align.R @@ -84,12 +84,16 @@ patrick::with_parameters_test_that( get_num_workers() ) - aligned_expected <- list( 
- metadata = arrow::read_parquet(file.path(testdata, "aligned", "metadata_table.parquet")), - intensity = arrow::read_parquet(file.path(testdata, "aligned", "intensity_table.parquet")), - rt = arrow::read_parquet(file.path(testdata, "aligned", "rt_table.parquet")) + aligned_expected <- load_aligned_features( + file.path(testdata, "aligned", "metadata_table.parquet"), + file.path(testdata, "aligned", "intensity_table.parquet"), + file.path(testdata, "aligned", "rt_table.parquet"), + file.path(testdata, "aligned", "tolerances.parquet") ) + aligned_expected["mz_tol_relative"] <- NULL + aligned_expected["rt_tol_relative"] <- NULL + expect_equal(aligned_actual, aligned_expected) }, patrick::cases( From 40457b04aa2d32b5ed1e5833992b68ec197f3075 Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Tue, 23 Jul 2024 09:58:41 +0200 Subject: [PATCH 10/22] function documentation updated --- R/feature.align.R | 166 ++++++++++++++++++++-------------------------- 1 file changed, 73 insertions(+), 93 deletions(-) diff --git a/R/feature.align.R b/R/feature.align.R index f85b38f..ce5a7a8 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -1,21 +1,8 @@ #' @import foreach -#' Create an empty tibble for the next alignment step. It will contain three tables with aligned metadata, intensities an RTs. -#' @param number_of_samples Number -#' of different sample names. -#' @param metadata_colnames Metadata column names: "id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names -#' @param intensity_colnames "id" and sample names; will hold intensities. -#' @param rt_colnames "id" and sample names; will hold retention times. -#' @return An empty tibble with slots for metadata, intensities and RTs. 
-#' @export -create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) { - features <- new("list") - features$metadata <- tibble::as_tibble(matrix(nrow = 0, ncol = length(metadata_colnames)), .name_repair = ~metadata_colnames) - features$intensity <- tibble::as_tibble(matrix(nrow = 0, ncol = length(intensity_colnames)), .name_repair = ~intensity_colnames) - features$rt <- tibble::as_tibble(matrix(nrow = 0, ncol = length(rt_colnames)), .name_repair = ~rt_colnames) - return(features) -} - +#' Create a metadata row tibble with min, max and mean mz and RT values. +#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. +#' @param sample_names A list of sample names. create_metadata <- function(sample_grouped, sample_names) { sample_presence <- sapply(sample_names, FUN=function(x) { @@ -38,33 +25,41 @@ create_metadata <- function(sample_grouped, sample_names) { return(metadata_row) } -first_tibble_row_as_vector <- function(x) { - return(as.vector(unlist(x[1,]))) -} - -#' Create a list containing 3 tibbles: metadata, intensities and RTs. +#' Compute summed area for each sample #' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. -#' @param sample_names A list of sample names. -#' @return A list containing 3 tibbles: metadata, intensities and RTs. -#' @export -create_output <- function(sample_grouped, sample_names) { - metadata_row <- create_metadata(sample_grouped, sample_names) - - intensity_row <- sample_grouped %>% +#' @return Summed area for each sample. +create_intensity_row <- function(sample_grouped) { + sample_grouped %>% group_by(sample_id) %>% summarise(intensity = sum(area)) %>% pivot_wider(names_from = "sample_id", values_from = "intensity") +} +#' Compute median RT for each sample +#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. +#' @return Median RT for each sample. 
- rt_row <- sample_grouped %>% +create_rt_row <- function(sample_grouped) { + sample_grouped %>% group_by(sample_id) %>% summarise(rt = median(rt)) %>% pivot_wider(names_from = "sample_id", values_from = "rt") +} + +#' Create a list containing 3 tibbles: metadata, intensities and RTs. +#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. +#' @param sample_names A list of sample names. +#' @return A list containing 3 tibbles: metadata, intensities and RTs. +#' @export +create_output <- function(sample_grouped, sample_names) { + metadata_row <- create_metadata(sample_grouped, sample_names) + intensity_row <- create_intensity_row(sample_grouped) + rt_row <- create_rt_row(sample_grouped) return(list( - metadata_row = (metadata_row), - intensity_row = (intensity_row), - rt_row = (rt_row) + metadata_row = metadata_row, + intensity_row = intensity_row, + rt_row = rt_row )) } @@ -108,42 +103,7 @@ filter_based_on_density <- function(sample, turns, index, i) { return(sample[selected, ]) } -#' Groups the features across samples based on RT. -#' @param sample A dataframe subsetted for the particular cluster. -#' @param rt_tol_relative The retention time tolerance level for peak alignment. -#' @param min_occurence A minimal number of profiles a feature has to be present in. -#' @param sample_names A list of sample names. -#' @param return A list containing 3 tibbles: metadata, intensities and RTs. -#' @export -select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) { - turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414) - for (i in seq_along(turns$peaks)) { - sample_grouped <- filter_based_on_density(sample, turns, 2, i) - if (validate_contents(sample_grouped, min_occurrence)) { - return(create_output(sample_grouped, sample_names)) - } - } -} - -#' Groups the features across samples based on m/z. -#' @param sample A dataframe subsetted for the particular cluster. 
-#' @param mz_tol_relative The m/z tolerance level for peak alignment. -#' @param rt_tol_relative The retention time tolerance level for peak alignment. -#' @param min_occurence A minimal number of profiles a feature has to be present in. -#' @param sample_names A list of sample names. -#' @return A list containing 3 tibbles: metadata, intensities and RTs. -#' @export -select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) { - turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz)) - for (i in seq_along(turns$peaks)) { - sample_grouped <- filter_based_on_density(sample, turns, 1, i) - if (validate_contents(sample_grouped, min_occurrence)) { - return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, sample_names)) - } - } -} - -#' Groups the mz and RT for particular cluster. +#' Group the mz and RT for particular cluster. #' @param features The features table subsetted for a particular cluster. #' @param mz_tol_relative The m/z tolerance level for peak alignment. #' @param rt_tol_relative The retention time tolerance level for peak alignment. @@ -151,22 +111,57 @@ select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, #' @param sample_names A list of sample names. #' @return A list containing 3 tibbles: metadata, intensities and RTs. 
#' @export -create_rows <- function(features, +create_features_from_cluster <- function(features, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) { - if (validate_contents(features, min_occurrence)) { - return(select_mz(features, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names)) + if (!validate_contents(features, min_occurrence)) { + return(NULL) } - return(NULL) + + # create empty tibble rows + metadata <- NULL + intensity <- NULL + rt <- NULL + + # split according to mz values + turns_mz <- find_optima(features$mz, bandwidth = mz_tol_relative * median(features$mz)) + for (i in seq_along(turns_mz$peaks)) { + sample_grouped_mz <- filter_based_on_density(features, turns_mz, 1, i) + if (validate_contents(sample_grouped_mz, min_occurrence)) { + + #split according to rt values + turns_rt <- find_optima(sample_grouped_mz$rt, bandwidth = rt_tol_relative / 1.414) + for (ii in seq_along(turns_rt$peaks)) { + sample_grouped_rt <- filter_based_on_density(sample_grouped_mz, turns_rt, 2, ii) + + # create output rows if valid + if (validate_contents(sample_grouped_rt, min_occurrence)) { + metadata <- dplyr::bind_rows(metadata, create_metadata(sample_grouped_rt, sample_names)) + intensity <- dplyr::bind_rows(intensity, create_intensity_row(sample_grouped_rt)) + rt <- dplyr::bind_rows(rt, create_rt_row(sample_grouped_rt)) + } + } + } + } + + return(list(metadata_row = metadata, intensity_row = intensity, rt_row = rt)) } #' Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble. #' @return Tibbles combining the output (metadata, intensity and RT respectively) from different clusters. #' @export comb <- function(x, ...) 
{ - mapply(tibble::as_tibble, (mapply(rbind, x, ..., SIMPLIFY = FALSE))) + mapply(plyr::rbind.fill, x, ..., SIMPLIFY = FALSE) +} + +#' Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble +#' @param x A dataframe +#' @param sample_names List of sample names. +#' @return Cleaned tibble. +clean_data_matrix <- function(x, sample_names) { + x %>% replace(is.na(.), 0) %>% dplyr::relocate(sample_names) %>% as_tibble } #' Align peaks from spectra into a feature table. @@ -180,7 +175,7 @@ comb <- function(x, ...) { #' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which #' allows the program to search for the tolerance level based on the data. #' @param cluster The number of CPU cores to be used -#' @return A tibble with three tables containing aligned metadata, intensities an RTs. +#' @return A list of 3 tibbles containing aligned metadata, intensities an RTs. #' #' @export create_aligned_feature_table <- function(features_table, @@ -198,43 +193,28 @@ create_aligned_feature_table <- function(features_table, register_functions_to_cluster(cluster) } - number_of_samples <- length(sample_names) - metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names) - intensity_colnames <- c("id", sample_names) - rt_colnames <- c("id", sample_names) - - aligned_features <- create_empty_tibble(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) - # table with number of values per group groups_cardinality <- table(features_table$cluster) # count those with minimal occurrence sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence]) # retention time alignment - aligned_features <- foreach::foreach( i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE ) %do% { - rows <- create_rows( + rows <- create_features_from_cluster( dplyr::filter(features_table, cluster == sel.labels[i]), 
mz_tol_relative, rt_tol_relative, min_occurrence, sample_names ) - - if (!is.null(rows)) { - rows$metadata_row <- c(i, rows$metadata_row) - rows$intensity_row <- c(i, rows$intensity_row) - rows$rt_row <- c(i, rows$rt_row) - } - list(metadata = rows$metadata_row, intensity = rows$intensity_row, rt = rows$rt_row) } - colnames(aligned_features$metadata) <- metadata_colnames - colnames(aligned_features$intensity) <- intensity_colnames - colnames(aligned_features$rt) <- rt_colnames + aligned_features$intensity <- clean_data_matrix(aligned_features$intensity, sample_names) + aligned_features$rt <- clean_data_matrix(aligned_features$rt, sample_names) + aligned_features$metadata <- as_tibble(aligned_features$metadata) return(aligned_features) } From ac6e3d3b8d8e11587372a7d46f46b272983bb420 Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Tue, 23 Jul 2024 10:23:41 +0200 Subject: [PATCH 11/22] functions updated --- R/utils.R | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/R/utils.R b/R/utils.R index 781bc71..9594fd2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -45,15 +45,17 @@ register_functions_to_cluster <- function(cluster) { 'compute_uniq_grp', 'predict_smoothed_rt', 'label_val_to_keep', - "create_empty_tibble", - "create_rows", + "create_features_from_cluster", "validate_contents", - "select_mz", - "select_rt", "find_optima", "filter_based_on_density", "create_output", + "create_metadata", + "create_rt_row", + "create_intensity_row", "comb", + "clean_data_matrix", + "create_aligned_feature_table", 'bigauss.esti.EM', 'solve_sigma', 'prep_uv', @@ -111,9 +113,9 @@ load_aligned_features <- function(metadata_file, intensities_file, rt_file, tol_ tolerances <- arrow::read_parquet(tol_file) result <- list() - result$metadata <- as_tibble(metadata) - result$intensity <- as_tibble(intensities) - result$rt <- as_tibble(rt) + result$metadata <- as_tibble(metadata) |> select(-id) + result$intensity <- as_tibble(intensities) |> select(-id) + 
result$rt <- as_tibble(rt) |> select(-id) result$mz_tol_relative <- tolerances$mz_tolerance result$rt_tol_relative <- tolerances$rt_tolerance return(result) From fbff31a1a91f4245396a479c0622cd32226d0e5c Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Tue, 23 Jul 2024 10:24:55 +0200 Subject: [PATCH 12/22] plyr added as dependency --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3b1effc..5acef3f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,7 +9,7 @@ Description: This is a customized fork of the original work from Tianwei Yu. It takes the adaptive processing of LC/MS metabolomics data further with focus on high resolution MS for both LC and GC applications. Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach, - snow, dplyr, tidyr, stringr, tibble, tools, arrow + snow, dplyr, tidyr, stringr, tibble, tools, arrow, plyr biocViews: Technology, MassSpectrometry License: GPL-2 LazyLoad: yes From b82097563d21c72ee1992ea9e7ffe6ee3b982395 Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Tue, 23 Jul 2024 10:25:47 +0200 Subject: [PATCH 13/22] documentation updated --- NAMESPACE | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 995a087..e408106 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -6,6 +6,7 @@ export(aggregate_by_rt) export(bigauss.esti) export(bigauss.esti.EM) export(bigauss.mix) +export(clean_data_matrix) export(comb) export(compute_boundaries) export(compute_bounds) @@ -42,9 +43,11 @@ export(compute_uniq_grp) export(correct_time) export(count_peaks) export(create_aligned_feature_table) -export(create_empty_tibble) +export(create_features_from_cluster) +export(create_intensity_row) +export(create_metadata) export(create_output) -export(create_rows) +export(create_rt_row) export(draw_rt_correction_plot) export(draw_rt_normal_peaks) export(duplicate.row.remove) @@ -91,8 +94,6 @@ export(remove_noise) 
export(rev_cum_sum) export(rm.ridge) export(run_filter) -export(select_mz) -export(select_rt) export(semi.sup) export(solve_a) export(solve_sigma) @@ -102,36 +103,30 @@ export(two.step.hybrid) export(unsupervised) export(validate_contents) export(validate_model_method_input) -import("for") -import("metadata,") -import("next") +import("min,") import(Create) -import(It) import(MASS) -import(RTs.) -import(aligned) -import(alignment) -import(an) +import(RT) +import(a) +import(and) import(arrow) -import(contain) import(doParallel) import(dplyr) -import(empty) import(foreach) -import(intensities) +import(max) +import(mean) +import(metadata) +import(mz) import(mzR) import(parallel) +import(row) import(snow) import(splines) -import(step.) import(stringr) -import(tables) -import(the) -import(three) import(tibble) import(tidyr) import(tools) -import(will) +import(values.) import(with) importFrom(dplyr,arrange) importFrom(dplyr,between) From 39189e3a1a2545f59b8c762eb0f5309a309d915f Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Tue, 23 Jul 2024 10:27:11 +0200 Subject: [PATCH 14/22] documentation updated --- man/clean_data_matrix.Rd | 19 ++++++++++++++++++ man/comb.Rd | 14 +++++++++++++ man/compute_clusters_simple.Rd | 2 +- man/create_aligned_feature_table.Rd | 2 +- man/create_features_from_cluster.Rd | 31 +++++++++++++++++++++++++++++ man/create_intensity_row.Rd | 17 ++++++++++++++++ man/create_output.Rd | 19 ++++++++++++++++++ man/create_rt_row.Rd | 17 ++++++++++++++++ man/filter_based_on_density.Rd | 23 +++++++++++++++++++++ man/find_optima.Rd | 19 ++++++++++++++++++ man/remove_noise.Rd | 5 ++++- man/validate_contents.Rd | 19 ++++++++++++++++++ 12 files changed, 184 insertions(+), 3 deletions(-) create mode 100644 man/clean_data_matrix.Rd create mode 100644 man/comb.Rd create mode 100644 man/create_features_from_cluster.Rd create mode 100644 man/create_intensity_row.Rd create mode 100644 man/create_output.Rd create mode 100644 man/create_rt_row.Rd create mode 100644 
man/filter_based_on_density.Rd create mode 100644 man/find_optima.Rd create mode 100644 man/validate_contents.Rd diff --git a/man/clean_data_matrix.Rd b/man/clean_data_matrix.Rd new file mode 100644 index 0000000..e535404 --- /dev/null +++ b/man/clean_data_matrix.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{clean_data_matrix} +\alias{clean_data_matrix} +\title{Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble} +\usage{ +clean_data_matrix(x, sample_names) +} +\arguments{ +\item{x}{A dataframe} + +\item{sample_names}{List of sample names.} +} +\value{ +Cleaned tibble. +} +\description{ +Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble +} diff --git a/man/comb.Rd b/man/comb.Rd new file mode 100644 index 0000000..2087909 --- /dev/null +++ b/man/comb.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{comb} +\alias{comb} +\title{Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble.} +\usage{ +comb(x, ...) +} +\value{ +Tibbles combining the output (metadata, intensity and RT respectively) from different clusters. +} +\description{ +Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble. 
+} diff --git a/man/compute_clusters_simple.Rd b/man/compute_clusters_simple.Rd index 09ab9f3..3a4855b 100644 --- a/man/compute_clusters_simple.Rd +++ b/man/compute_clusters_simple.Rd @@ -7,7 +7,7 @@ compute_clusters_simple(feature_tables, sample_names, mz_tol_ppm, rt_tol) } \arguments{ -\item{feature_tables}{list of tibbles List of feature tables coming from all samples.} +\item{feature_tables}{list of tibbles feature tables coming from all samples.} \item{sample_names}{list of strings Sample names of the feature tables used to distinguish the samples.} diff --git a/man/create_aligned_feature_table.Rd b/man/create_aligned_feature_table.Rd index 034df37..66a676d 100644 --- a/man/create_aligned_feature_table.Rd +++ b/man/create_aligned_feature_table.Rd @@ -30,7 +30,7 @@ percentage of the m/z value. This value, multiplied by the m/z value, becomes th \item{cluster}{The number of CPU cores to be used} } \value{ -A tibble with three tables containing aligned metadata, intensities an RTs. +A list of 3 tibbles containing aligned metadata, intensities an RTs. } \description{ Align peaks from spectra into a feature table. 
diff --git a/man/create_features_from_cluster.Rd b/man/create_features_from_cluster.Rd new file mode 100644 index 0000000..84bebba --- /dev/null +++ b/man/create_features_from_cluster.Rd @@ -0,0 +1,31 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{create_features_from_cluster} +\alias{create_features_from_cluster} +\title{Group the mz and RT for particular cluster.} +\usage{ +create_features_from_cluster( + features, + mz_tol_relative, + rt_tol_relative, + min_occurrence, + sample_names +) +} +\arguments{ +\item{features}{The features table subsetted for a particular cluster.} + +\item{mz_tol_relative}{The m/z tolerance level for peak alignment.} + +\item{rt_tol_relative}{The retention time tolerance level for peak alignment.} + +\item{min_occurrence}{A minimal number of profiles a feature has to be present in.} + +\item{sample_names}{A list of sample names.} +} +\value{ +A list containing 3 tibbles: metadata, intensities and RTs. +} +\description{ +Group the mz and RT for particular cluster. +} diff --git a/man/create_intensity_row.Rd b/man/create_intensity_row.Rd new file mode 100644 index 0000000..0976c45 --- /dev/null +++ b/man/create_intensity_row.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{create_intensity_row} +\alias{create_intensity_row} +\title{Compute summed area for each sample} +\usage{ +create_intensity_row(sample_grouped) +} +\arguments{ +\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.} +} +\value{ +Summed area for each sample. 
+} +\description{ +Compute summed area for each sample +} diff --git a/man/create_output.Rd b/man/create_output.Rd new file mode 100644 index 0000000..9c1223e --- /dev/null +++ b/man/create_output.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{create_output} +\alias{create_output} +\title{Create a list containing 3 tibbles: metadata, intensities and RTs.} +\usage{ +create_output(sample_grouped, sample_names) +} +\arguments{ +\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.} + +\item{sample_names}{A list of sample names.} +} +\value{ +A list containing 3 tibbles: metadata, intensities and RTs. +} +\description{ +Create a list containing 3 tibbles: metadata, intensities and RTs. +} diff --git a/man/create_rt_row.Rd b/man/create_rt_row.Rd new file mode 100644 index 0000000..0059fb9 --- /dev/null +++ b/man/create_rt_row.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{create_rt_row} +\alias{create_rt_row} +\title{Compute median RT for each sample} +\usage{ +create_rt_row(sample_grouped) +} +\arguments{ +\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.} +} +\value{ +Median RT for each sample. 
+} +\description{ +Compute median RT for each sample +} diff --git a/man/filter_based_on_density.Rd b/man/filter_based_on_density.Rd new file mode 100644 index 0000000..fe85d5c --- /dev/null +++ b/man/filter_based_on_density.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{filter_based_on_density} +\alias{filter_based_on_density} +\title{Subset data within lower and upper bound from density estimation} +\usage{ +filter_based_on_density(sample, turns, index, i) +} +\arguments{ +\item{sample}{A subset of the features_table.} + +\item{turns}{A list of peaks and valleys positions.} + +\item{index}{Whether it subsets on m/z [1] or RT [2] column.} + +\item{i}{Iterates over the peaks in the turns list.} +} +\value{ +Dataframe subsetted within lower and upper bound from density estimation. +} +\description{ +Subset data within lower and upper bound from density estimation +} diff --git a/man/find_optima.Rd b/man/find_optima.Rd new file mode 100644 index 0000000..81e681b --- /dev/null +++ b/man/find_optima.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{find_optima} +\alias{find_optima} +\title{Compute the kernel density estimation and find the peaks and valleys of a smooth curve.} +\usage{ +find_optima(data, bandwidth) +} +\arguments{ +\item{data}{A vector of m/z or RTs for a particular cluster.} + +\item{bandwidth}{A bandwidth value for the KDE computation.} +} +\value{ +A list of peaks and valleys positions. +} +\description{ +Compute the kernel density estimation and find the peaks and valleys of a smooth curve. 
+} diff --git a/man/remove_noise.Rd b/man/remove_noise.Rd index debe393..e310fcd 100644 --- a/man/remove_noise.Rd +++ b/man/remove_noise.Rd @@ -13,7 +13,8 @@ remove_noise( baseline_correct_noise_percentile, intensity_weighted, do.plot, - cache + cache, + grouping_threshold = Inf ) } \arguments{ @@ -40,6 +41,8 @@ run filter, to be used as the baseline threshold of signal strength.} \item{do.plot}{Indicates whether plot should be drawn.} \item{cache}{Whether to use cache} + +\item{grouping_threshold}{The maximum difference between two scans to be considered the same EIC. Default is Inf.} } \value{ A matrix with four columns: m/z value, retention time, intensity, and group number. diff --git a/man/validate_contents.Rd b/man/validate_contents.Rd new file mode 100644 index 0000000..e8f9889 --- /dev/null +++ b/man/validate_contents.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/feature.align.R +\name{validate_contents} +\alias{validate_contents} +\title{Validates if the data is present in more than "min_occurence" of samples.} +\usage{ +validate_contents(samples, min_occurrence) +} +\arguments{ +\item{samples}{A subset of the features_table.} + +\item{min_occurrence}{A minimal number of profiles a feature has to be present in.} +} +\value{ +boolean value whether it is TRUE or FALSE. +} +\description{ +Validates if the data is present in more than "min_occurence" of samples. 
+} From b4ee7c2c96935d9b7524eebee84c275030c3c399 Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Tue, 23 Jul 2024 10:31:27 +0200 Subject: [PATCH 15/22] plyr added --- conda/environment-dev.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml index c903498..1e98906 100644 --- a/conda/environment-dev.yaml +++ b/conda/environment-dev.yaml @@ -29,3 +29,4 @@ dependencies: - r-httpgd - r-microbenchmark - r-covr + - r-plyr From e9b05103dc2f3bfb02f0962ae09d643b5ec106a3 Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Tue, 23 Jul 2024 10:31:59 +0200 Subject: [PATCH 16/22] export added on functions --- R/feature.align.R | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/R/feature.align.R b/R/feature.align.R index ce5a7a8..c2ebc49 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -3,6 +3,7 @@ #' Create a metadata row tibble with min, max and mean mz and RT values. #' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. #' @param sample_names A list of sample names. +#' @export create_metadata <- function(sample_grouped, sample_names) { sample_presence <- sapply(sample_names, FUN=function(x) { @@ -28,6 +29,7 @@ create_metadata <- function(sample_grouped, sample_names) { #' Compute summed area for each sample #' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. #' @return Summed area for each sample. +#' @export create_intensity_row <- function(sample_grouped) { sample_grouped %>% group_by(sample_id) %>% @@ -38,7 +40,7 @@ create_intensity_row <- function(sample_grouped) { #' Compute median RT for each sample #' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. #' @return Median RT for each sample. - +#' @export create_rt_row <- function(sample_grouped) { sample_grouped %>% group_by(sample_id) %>% @@ -160,6 +162,7 @@ comb <- function(x, ...) 
{ #' @param x A dataframe #' @param sample_names List of sample names. #' @return Cleaned tibble. +#' @export clean_data_matrix <- function(x, sample_names) { x %>% replace(is.na(.), 0) %>% dplyr::relocate(sample_names) %>% as_tibble } From 31ca6f0b0446842e83240de1d96281f9c862ecb8 Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Tue, 23 Jul 2024 10:36:19 +0200 Subject: [PATCH 17/22] styler linted --- R/feature.align.R | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/R/feature.align.R b/R/feature.align.R index c2ebc49..c3b75e9 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -6,7 +6,7 @@ #' @export create_metadata <- function(sample_grouped, sample_names) { sample_presence <- sapply(sample_names, - FUN=function(x) { + FUN = function(x) { as.numeric(any(sample_grouped$sample_id == x)) } ) @@ -32,9 +32,9 @@ create_metadata <- function(sample_grouped, sample_names) { #' @export create_intensity_row <- function(sample_grouped) { sample_grouped %>% - group_by(sample_id) %>% - summarise(intensity = sum(area)) %>% - pivot_wider(names_from = "sample_id", values_from = "intensity") + group_by(sample_id) %>% + summarise(intensity = sum(area)) %>% + pivot_wider(names_from = "sample_id", values_from = "intensity") } #' Compute median RT for each sample @@ -43,9 +43,9 @@ create_intensity_row <- function(sample_grouped) { #' @export create_rt_row <- function(sample_grouped) { sample_grouped %>% - group_by(sample_id) %>% - summarise(rt = median(rt)) %>% - pivot_wider(names_from = "sample_id", values_from = "rt") + group_by(sample_id) %>% + summarise(rt = median(rt)) %>% + pivot_wider(names_from = "sample_id", values_from = "rt") } #' Create a list containing 3 tibbles: metadata, intensities and RTs. 
@@ -57,7 +57,7 @@ create_output <- function(sample_grouped, sample_names) { metadata_row <- create_metadata(sample_grouped, sample_names) intensity_row <- create_intensity_row(sample_grouped) rt_row <- create_rt_row(sample_grouped) - + return(list( metadata_row = metadata_row, intensity_row = intensity_row, @@ -114,10 +114,10 @@ filter_based_on_density <- function(sample, turns, index, i) { #' @return A list containing 3 tibbles: metadata, intensities and RTs. #' @export create_features_from_cluster <- function(features, - mz_tol_relative, - rt_tol_relative, - min_occurrence, - sample_names) { + mz_tol_relative, + rt_tol_relative, + min_occurrence, + sample_names) { if (!validate_contents(features, min_occurrence)) { return(NULL) } @@ -132,8 +132,7 @@ create_features_from_cluster <- function(features, for (i in seq_along(turns_mz$peaks)) { sample_grouped_mz <- filter_based_on_density(features, turns_mz, 1, i) if (validate_contents(sample_grouped_mz, min_occurrence)) { - - #split according to rt values + # split according to rt values turns_rt <- find_optima(sample_grouped_mz$rt, bandwidth = rt_tol_relative / 1.414) for (ii in seq_along(turns_rt$peaks)) { sample_grouped_rt <- filter_based_on_density(sample_grouped_mz, turns_rt, 2, ii) @@ -147,7 +146,7 @@ create_features_from_cluster <- function(features, } } } - + return(list(metadata_row = metadata, intensity_row = intensity, rt_row = rt)) } @@ -164,7 +163,10 @@ comb <- function(x, ...) { #' @return Cleaned tibble. #' @export clean_data_matrix <- function(x, sample_names) { - x %>% replace(is.na(.), 0) %>% dplyr::relocate(sample_names) %>% as_tibble + x %>% + replace(is.na(.), 0) %>% + dplyr::relocate(sample_names) %>% + as_tibble() } #' Align peaks from spectra into a feature table. 
From 95e0bde9be6bd11cc512f83efdafa65710fb687f Mon Sep 17 00:00:00 2001 From: KristinaGomoryova Date: Tue, 23 Jul 2024 10:39:31 +0200 Subject: [PATCH 18/22] test files renamed --- tests/testdata/aligned/output_create-features.rds | Bin 0 -> 392 bytes tests/testdata/aligned/output_select-mz.rds | Bin 354 -> 0 bytes ...quet => feature-align_create-features.parquet} | Bin 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/testdata/aligned/output_create-features.rds delete mode 100644 tests/testdata/aligned/output_select-mz.rds rename tests/testdata/input/{feature-align_select-mz.parquet => feature-align_create-features.parquet} (100%) diff --git a/tests/testdata/aligned/output_create-features.rds b/tests/testdata/aligned/output_create-features.rds new file mode 100644 index 0000000000000000000000000000000000000000..d1840145829a50a2289dc2809bc29f62c414a11f GIT binary patch literal 392 zcmV;30eAi%iwFP!000001B>8dU|?WoU}0uvU}gm}8CXL@+;lA%7?^~?5)8~B8l;F1 zh?yLso6K2#L|uXOtvg{s+mOV~%!IF;0*Uj=30WhnFKDX~1JbwMC+uv8i>L2f$X~Vh zE{sNqukY@<`>zW|Bg8MBTq9D=1*3stydZIi8^IW?&i(@f7{F-K#aTdRGBU6*aDoCO zIVZ8W7|0NSgc4Ygtt2TYJ|zvxV+Qh|OsY;5+|_&rlzO_CR|j4qz1@=t6|G4NKMQxhVlf0 zoFn25%;JkP@{3AR^HNh##LaPtTOf%uF#Ln24|w=H2C_fwp8AasM#IGe*xM$nq%DTg zaB;87-wwx)a>8gxa4`NSJII;-V-I0wbR%iveTVd2z29pOFvDolll2huK321Na#KqZ mK`x62CVQwDZ)P4)TXAMdC6XXpQ3;p_GztK<$^>qR1ONcc!mm#N literal 0 HcmV?d00001 diff --git a/tests/testdata/aligned/output_select-mz.rds b/tests/testdata/aligned/output_select-mz.rds deleted file mode 100644 index 72675305ea18a1197b783bf0395c7e2ac029f6aa..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 354 zcmV-o0iFIIiwFP!000001B>8dU|?WoU}0uvU}gm}8CXL@+;lA%7?^~?5)8~B8l;F1 zh#4KCo6K2#L|u{iW@f@yP9e(|wAF~g`RV%>@>lJ>i{P*C?z;Q03&Fp5a*aqe$b4Q9 zA7Up&&i(@f3=rV6FoE^5FmQrAkeriPTnuCgKzs%kWGhL^iBCy`@|b})sI$_eo{ Date: Tue, 23 Jul 2024 11:04:34 +0200 Subject: [PATCH 19/22] import foreach deleted --- DESCRIPTION | 2 +- NAMESPACE | 12 ------------ 
R/feature.align.R | 2 -- 3 files changed, 1 insertion(+), 15 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 5acef3f..3b1effc 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,7 +9,7 @@ Description: This is a customized fork of the original work from Tianwei Yu. It takes the adaptive processing of LC/MS metabolomics data further with focus on high resolution MS for both LC and GC applications. Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach, - snow, dplyr, tidyr, stringr, tibble, tools, arrow, plyr + snow, dplyr, tidyr, stringr, tibble, tools, arrow biocViews: Technology, MassSpectrometry License: GPL-2 LazyLoad: yes diff --git a/NAMESPACE b/NAMESPACE index e408106..d59d113 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -103,31 +103,19 @@ export(two.step.hybrid) export(unsupervised) export(validate_contents) export(validate_model_method_input) -import("min,") -import(Create) import(MASS) -import(RT) -import(a) -import(and) import(arrow) import(doParallel) import(dplyr) import(foreach) -import(max) -import(mean) -import(metadata) -import(mz) import(mzR) import(parallel) -import(row) import(snow) import(splines) import(stringr) import(tibble) import(tidyr) import(tools) -import(values.) -import(with) importFrom(dplyr,arrange) importFrom(dplyr,between) importFrom(dplyr,bind_rows) diff --git a/R/feature.align.R b/R/feature.align.R index c3b75e9..f9e83a6 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -1,5 +1,3 @@ -#' @import foreach - #' Create a metadata row tibble with min, max and mean mz and RT values. #' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster. #' @param sample_names A list of sample names. 
From 9fa922578a630e89b26038c8f567c29d956d642c Mon Sep 17 00:00:00 2001 From: hechth Date: Fri, 26 Jul 2024 15:08:42 +0000 Subject: [PATCH 20/22] fixed tests --- DESCRIPTION | 2 +- R/feature.align.R | 31 +++++++++++++++++++---------- R/utils.R | 7 ++++--- conda/environment.yaml | 1 + tests/testthat/test-feature-align.R | 6 ++++++ tests/testthat/test-hybrid.R | 1 + 6 files changed, 34 insertions(+), 14 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 3b1effc..5acef3f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -9,7 +9,7 @@ Description: This is a customized fork of the original work from Tianwei Yu. It takes the adaptive processing of LC/MS metabolomics data further with focus on high resolution MS for both LC and GC applications. Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach, - snow, dplyr, tidyr, stringr, tibble, tools, arrow + snow, dplyr, tidyr, stringr, tibble, tools, arrow, plyr biocViews: Technology, MassSpectrometry License: GPL-2 LazyLoad: yes diff --git a/R/feature.align.R b/R/feature.align.R index f9e83a6..5f541a0 100644 --- a/R/feature.align.R +++ b/R/feature.align.R @@ -30,9 +30,9 @@ create_metadata <- function(sample_grouped, sample_names) { #' @export create_intensity_row <- function(sample_grouped) { sample_grouped %>% - group_by(sample_id) %>% - summarise(intensity = sum(area)) %>% - pivot_wider(names_from = "sample_id", values_from = "intensity") + dplyr::group_by(sample_id) %>% + dplyr::summarise(intensity = sum(area)) %>% + tidyr::pivot_wider(names_from = "sample_id", values_from = "intensity") } #' Compute median RT for each sample @@ -41,9 +41,9 @@ create_intensity_row <- function(sample_grouped) { #' @export create_rt_row <- function(sample_grouped) { sample_grouped %>% - group_by(sample_id) %>% - summarise(rt = median(rt)) %>% - pivot_wider(names_from = "sample_id", values_from = "rt") + dplyr::group_by(sample_id) %>% + dplyr::summarise(rt = median(rt)) %>% + tidyr::pivot_wider(names_from = "sample_id", values_from = 
"rt") } #' Create a list containing 3 tibbles: metadata, intensities and RTs. @@ -161,10 +161,21 @@ comb <- function(x, ...) { #' @return Cleaned tibble. #' @export clean_data_matrix <- function(x, sample_names) { - x %>% + x <- x %>% replace(is.na(.), 0) %>% - dplyr::relocate(sample_names) %>% - as_tibble() + dplyr::relocate(sample_names) |> + add_feature_ids() + return(x) +} + +#' Add `id` column to a dataframe +#' @param x A dataframe +#' @return The same dataframe but with an additional `id` column +#' in first place which contains the rownames. +#' @export +add_feature_ids <- function(x) { + x$id <- as.numeric(rownames(x)) + return(tibble::as_tibble(x |> dplyr::relocate(id))) } #' Align peaks from spectra into a feature table. @@ -217,7 +228,7 @@ create_aligned_feature_table <- function(features_table, aligned_features$intensity <- clean_data_matrix(aligned_features$intensity, sample_names) aligned_features$rt <- clean_data_matrix(aligned_features$rt, sample_names) - aligned_features$metadata <- as_tibble(aligned_features$metadata) + aligned_features$metadata <- add_feature_ids(aligned_features$metadata) return(aligned_features) } diff --git a/R/utils.R b/R/utils.R index 44bdd3d..738ad0e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -8,6 +8,7 @@ register_functions_to_cluster <- function(cluster) { 'prof.to.features', 'load.lcms', 'adaptive.bin', + 'add_feature_ids', 'find.turn.point', 'msExtrema', 'find_local_maxima', @@ -113,9 +114,9 @@ load_aligned_features <- function(metadata_file, intensities_file, rt_file, tol_ tolerances <- arrow::read_parquet(tol_file) result <- list() - result$metadata <- as_tibble(metadata) |> select(-id) - result$intensity <- as_tibble(intensities) |> select(-id) - result$rt <- as_tibble(rt) |> select(-id) + result$metadata <- as_tibble(metadata) + result$intensity <- as_tibble(intensities) + result$rt <- as_tibble(rt) result$mz_tol_relative <- tolerances$mz_tolerance result$rt_tol_relative <- tolerances$rt_tolerance return(result) diff 
--git a/conda/environment.yaml b/conda/environment.yaml index 3ab1060..1b2776b 100644 --- a/conda/environment.yaml +++ b/conda/environment.yaml @@ -18,4 +18,5 @@ dependencies: - r-tidyr - r-stringr - r-tibble + - r-plyr diff --git a/tests/testthat/test-feature-align.R b/tests/testthat/test-feature-align.R index 7adf16b..a267541 100644 --- a/tests/testthat/test-feature-align.R +++ b/tests/testthat/test-feature-align.R @@ -1,3 +1,9 @@ +update_expected <- function(actual) { + arrow::write_parquet(actual$metadata, file.path("..", "testdata", "aligned", "metadata_table.parquet")) + arrow::write_parquet(actual$intensity, file.path("..", "testdata", "aligned", "intensity_table.parquet")) + arrow::write_parquet(actual$rt, file.path("..", "testdata", "aligned", "rt_table.parquet")) +} + patrick::with_parameters_test_that( "feature.align test", { diff --git a/tests/testthat/test-hybrid.R b/tests/testthat/test-hybrid.R index ed28fd3..8a064ba 100644 --- a/tests/testthat/test-hybrid.R +++ b/tests/testthat/test-hybrid.R @@ -25,6 +25,7 @@ patrick::with_parameters_test_that("basic hybrid test", { actual <- as_tibble(result$recovered_feature_sample_table) keys <- c("mz", "rt", "sample", "sample_rt", "sample_intensity") + # arrow::write_parquet(actual, file.path(testdata, "hybrid", paste0(.test_name, "_recovered_feature_sample_table.parquet"))) expected <- arrow::read_parquet( file.path(testdata, "hybrid", paste0(.test_name, "_recovered_feature_sample_table.parquet")) ) From 1dfb73f096fc3780047acc3cce755de762856b87 Mon Sep 17 00:00:00 2001 From: hechth Date: Mon, 29 Jul 2024 10:17:41 +0200 Subject: [PATCH 21/22] Started adding documentation --- NAMESPACE | 5 +++++ R/adjust.time.R | 33 +++++++++++++++++++++++++++------ R/unsupervised.R | 14 ++++++++++++++ R/utils.R | 5 ++++- 4 files changed, 50 insertions(+), 7 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d59d113..3657c2c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,14 @@ # Generated by roxygen2: do not edit by hand 
export(adaptive.bin) +export(add_feature_ids) export(adjust.time) export(aggregate_by_rt) +export(as_feature_sample_table) export(bigauss.esti) export(bigauss.esti.EM) export(bigauss.mix) +export(check_files) export(clean_data_matrix) export(comb) export(compute_boundaries) @@ -41,6 +44,7 @@ export(compute_template) export(compute_template_adjusted_rt) export(compute_uniq_grp) export(correct_time) +export(correct_time_v2) export(count_peaks) export(create_aligned_feature_table) export(create_features_from_cluster) @@ -63,6 +67,7 @@ export(get_features_in_rt_range) export(get_mzrange_bound_indices) export(get_num_workers) export(get_rt_region_indices) +export(get_sample_name) export(get_single_occurrence_mask) export(get_times_to_use) export(hybrid) diff --git a/R/adjust.time.R b/R/adjust.time.R index fe896d1..23173c5 100644 --- a/R/adjust.time.R +++ b/R/adjust.time.R @@ -2,6 +2,10 @@ NULL #> NULL +#' Combine template and sample features +#' @param template_features Tibble Template feature table (mz, rt, cluster, sample_id). +#' @param features Tibble Sample feature table (mz, rt, cluster, sample_id). +#' @return Tibble Combined feature table (rbind). #' @export compute_comb <- function(template_features, features) { combined <- dplyr::bind_rows( @@ -11,6 +15,12 @@ compute_comb <- function(template_features, features) { return(combined) } +#' Select features to use for retention time alignment +#' @description This function selects features present in both the sample +#' feature table and template feature table given they have the same cluster +#' and are adjacent in the combined table. +#' @param combined Tibble Table with (mz, rt, cluster, sample_id). +#' @return List of bool Returns list of bools with TRUE at each index where this condition is met. #' @export compute_sel <- function(combined) { l <- nrow(combined) @@ -19,6 +29,11 @@ compute_sel <- function(combined) { return(sel) } +#' Create two column table with paired sample and template retention times.
+#' @param combined Tibble Table with features from sample and template. +#' @param sel list of bools List of bools indicating which features to pair. +#' See 'compute_sel'. +#' @param j string Template sample_id. #' @export compute_template_adjusted_rt <- function(combined, sel, j) { all_features <- cbind(combined$rt[sel], combined$rt[sel + 1]) @@ -59,20 +74,25 @@ compute_corrected_features_v2 <- function(features, template_rt, delta_rt) { return(features |> dplyr::arrange_at(c("mz", "rt"))) } +#' Correct the rt in feature table based on paired feature rts and differences. +#' @param features Tibble The feature table for which to correct rts. +#' @param template_rt List of floats Template retention times for the paired features. +#' @param delta_rt List of floats Differences between the paired rts. +#' @return Tibble A table with corrected retention times. #' @export -compute_corrected_features <- function(features, delta_rt, avg_time) { +compute_corrected_features <- function(features, template_rt, delta_rt) { features <- features |> dplyr::arrange_at(c("rt", "mz")) corrected <- features$rt original <- features$rt - idx <- dplyr::between(original, min(delta_rt), max(delta_rt)) + idx <- dplyr::between(original, min(template_rt), max(template_rt)) to_correct <- original[idx] this.smooth <- ksmooth( + template_rt, delta_rt, - avg_time, kernel = "normal", - bandwidth = (max(delta_rt) - min(delta_rt)) / 5, + bandwidth = (max(template_rt) - min(template_rt)) / 5, x.points = to_correct ) @@ -80,8 +100,8 @@ compute_corrected_features <- function(features, delta_rt, avg_time) { lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)]) upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)]) - idx_lower <- original < min(delta_rt) - idx_upper <- original > max(delta_rt) + idx_lower <- original < min(template_rt) + idx_upper <- original > max(template_rt) corrected[idx_lower] <- corrected[idx_lower] +
lower_bound_adjustment corrected[idx_upper] <- corrected[idx_upper] + upper_bound_adjustment @@ -149,6 +169,7 @@ compute_template <- function(extracted_features) { return(tibble::as_tibble(template_features)) } +#' @export correct_time_v2 <- function(features, template) { if (unique(features$sample_id) == unique(template$sample_id)) return(tibble::as_tibble(features)) diff --git a/R/unsupervised.R b/R/unsupervised.R index 3b7070b..4093b9c 100644 --- a/R/unsupervised.R +++ b/R/unsupervised.R @@ -2,6 +2,13 @@ NULL #> NULL +#' Read the metadata table, retention time data matrix and intensity data matrix +#' and combine them into a single table +#' @param metadata Tibble Feature metadata table with information concerning the peaks. +#' @param rt_crosstab Tibble Data matrix with features on rows and samples on columns holding rt data. +#' @param int_crosstab Tibble Data matrix with features on rows and samples on columns holding intensity data. +#' @return Tibble A merged table containing all information. +#' @export as_feature_sample_table <- function(metadata, rt_crosstab, int_crosstab) { feature_names <- as.character(rt_crosstab$id) sample_names <- colnames(metadata)[-c(1:8)] @@ -27,6 +34,9 @@ as_feature_sample_table <- function(metadata, rt_crosstab, int_crosstab) { return(data) } +#' Check files whether they exist. +#' @param filenames list of filenames Filenames to check whether they exist. +#' @export check_files <- function(filenames) { missing <- !file.exists(filenames) missing_filenames <- paste0('\t', filenames[missing], collapse = '\n') @@ -36,6 +46,10 @@ check_files <- function(filenames) { } } +#' Get the sample name as basename of the file. +#' @param filename string Name of the file. +#' @return string Sample name. 
+#' @export get_sample_name <- function(filename) { tools::file_path_sans_ext(basename(filename)) } diff --git a/R/utils.R b/R/utils.R index 738ad0e..db06a3b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -85,7 +85,10 @@ register_functions_to_cluster <- function(cluster) { 'get_mzrange_bound_indices', 'compute_mass_density', 'l2normalize', - 'compute_peaks_and_valleys' + 'compute_peaks_and_valleys', + 'as_feature_sample_table', + 'check_files', + 'get_sample_name' )) snow::clusterEvalQ(cluster, library("dplyr")) snow::clusterEvalQ(cluster, library("stringr")) From ef2f003bfa10070ab438465cd0b0c6cdce931fa8 Mon Sep 17 00:00:00 2001 From: hechth Date: Mon, 29 Jul 2024 10:29:08 +0200 Subject: [PATCH 22/22] Finalized documentation for adjust time --- NAMESPACE | 1 + R/adjust.time.R | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 3657c2c..c5c8173 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,6 +20,7 @@ export(compute_clusters) export(compute_clusters_simple) export(compute_comb) export(compute_corrected_features) +export(compute_corrected_features_v2) export(compute_curr_rec_with_enough_peaks) export(compute_delta_rt) export(compute_densities) diff --git a/R/adjust.time.R b/R/adjust.time.R index 23173c5..1d0f5a8 100644 --- a/R/adjust.time.R +++ b/R/adjust.time.R @@ -49,6 +49,13 @@ compute_template_adjusted_rt <- function(combined, sel, j) { return(all_features) } +#' Correct the rt in feature table based on paired feature rts and differences. +#' @description This is a newer implementation based on dplyr which might be more efficient than the other function. +#' @param features Tibble The feature table for which to correct rts. +#' @param template_rt List of floats Template retention times for the paired features. +#' @param delta_rt List of floats Differences between the paired rts. +#' @return Tibble A table with corrected retention times. 
+#' @export compute_corrected_features_v2 <- function(features, template_rt, delta_rt) { features <- features |> dplyr::arrange_at(c("rt", "mz")) idx <- dplyr::between(features$rt, min(template_rt), max(template_rt)) @@ -111,6 +118,10 @@ compute_corrected_features <- function(features, template_rt, delta_rt) { return(features) } +#' Fill missing values based on original retention times. +#' @param orig.feature Non-corrected feature table. +#' @param this.feature Feature table with possible missing values. +#' @return Tibble Feature table with filled values. #' @export fill_missing_values <- function(orig.feature, this.feature) { missing_values <- which(is.na(this.feature$rt)) @@ -124,6 +135,10 @@ fill_missing_values <- function(orig.feature, this.feature) { return(this.feature) } +#' Function to perform retention time correction +#' @param this.feature Tibble Feature table for which to correct rt. +#' @param template_features Tibble Template feature table to use for correction. +#' @return Tibble this.feature table with corrected rt values. #' @export correct_time <- function(this.feature, template_features) { orig.features <- this.feature @@ -157,6 +172,10 @@ correct_time <- function(this.feature, template_features) { return(tibble::as_tibble(this.feature, column_name = c("mz", "rt", "sd1", "sd2", "area", "sample_id", "cluster"))) } +#' Select the template feature table. +#' @description The current implementation selects the table with the most features as the template. +#' @param extracted_features List of tables Tables from which to select the template. +#' @return Tibble Template feature table. #' @export compute_template <- function(extracted_features) { num.ftrs <- sapply(extracted_features, nrow) @@ -169,6 +188,13 @@ compute_template <- function(extracted_features) { return(tibble::as_tibble(template_features)) } +#' Rewritten version of 'correct_time' +#' @description This function uses dplyr to do the same as +#' 'correct_time', just with less code.
Most functions used in the original +#' function are replaced with simple data transformations. +#' @param features Tibble Table with features to correct. +#' @param template Tibble Template feature table to use for correction. +#' @return Tibble Corrected feature table. #' @export correct_time_v2 <- function(features, template) { if (unique(features$sample_id) == unique(template$sample_id))