From db792a8ec63cd410864855e47a6f8b1572fd409d Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Fri, 19 Jul 2024 13:01:02 +0200
Subject: [PATCH 01/22] test and input/output files for the select_mz function

---
 R/feature.align.R                                |   5 +++--
 tests/testdata/aligned/output_select-mz.rds      | Bin 0 -> 207 bytes
 .../input/feature-align_select-mz.parquet        | Bin 0 -> 2902 bytes
 tests/testthat/test-feature-align_select-mz.R    |  15 +++++++++++++++
 4 files changed, 18 insertions(+), 2 deletions(-)
 create mode 100644 tests/testdata/aligned/output_select-mz.rds
 create mode 100644 tests/testdata/input/feature-align_select-mz.parquet
 create mode 100644 tests/testthat/test-feature-align_select-mz.R

diff --git a/R/feature.align.R b/R/feature.align.R
index acc239b..0aa759f 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -155,13 +155,14 @@ create_aligned_feature_table <- function(features_table,
     sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence])
 
     # retention time alignment
+    
     aligned_features <- foreach::foreach(
         i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE
-    ) %dopar% {
+    ) %do% {
         rows <- create_rows(
             features_table,
             i,
-            sel.labels,
+            sel.labels, 
             mz_tol_relative,
             rt_tol_relative,
             min_occurrence,
diff --git a/tests/testdata/aligned/output_select-mz.rds b/tests/testdata/aligned/output_select-mz.rds
new file mode 100644
index 0000000000000000000000000000000000000000..ed8d4b2f622a82cddc3a57d6b869a5d04a29a2d3
GIT binary patch
literal 207
zcmV;=05Ja_iwFP!000001B>8dU|?WoU}0uvU}gm}8CXL@+;lA%7?^~?5)2G{K+NS3
z-DJ+{BkBsI&CG<aoC5I++G@lc()TUouiAUpA$@&!*WG_z4(S(9t`Vu`a^PS91N#ry
zX{fo(j)CkCyQhBRa|~c_o2-(y*wL%<x5KfcoG^KZ^j*E*YY#A^o6o?&!UU9OWME+c
zsb$Se%uOu@G6W#L0Sofvrj{h8B$gz`7v+~j#dtIGN>cNRGfOIw1lfv8z&r+qe*nhe
JlYb8Z006X$Us?bF

literal 0
HcmV?d00001

diff --git a/tests/testdata/input/feature-align_select-mz.parquet b/tests/testdata/input/feature-align_select-mz.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..32e5c5d7760a55671ea1928a5e8d32e539c47236
GIT binary patch
literal 2902
zcmcguZ)jUp6u)^%lh^F6d)C~?iv+3CU5j)xjrI>J`0h)$n3!#CP)yfB&61c^(={ep
zTWkF?r0V{NOouQ;1ov%%=m$|j83hZ<5FFJH6F)dVyAL8N3gS8U<t@*=gzATQN!~g4
z+;e{C{_c7A@TPcXg!<?c^k|wYl=Kr4O@Fy{>)6=ISwmY}uZ)AdaU(hJk&iskfBVjB
zFn#gkR}<fXoEv`BKuHPJ=xxIgu2MhsM|9ct(B+#@WT=gm5VB)Pf~Vr~dSBX<*aay<
zgW*x)SHrrruu3IO(auQPIf<=%nEYRm!^6OeP>H}p1wjbRqx8NsY!O-NN4%Gx|LyE2
zCF9gz-=8}DUdaeP7kc-LuS-ZLM0F<}dqtORUk&KEt&k)*Xwi|pgT3s-9tWFhU9h?4
z(XM;gMIYPp0v$I-2juPP{HUIN>*YSxQ2pjl-(2l6zI|wR^i#jL4ISH4mu(+R>bADh
z*^--dytTrJgU?2Q{fvcogLHu2p3iFT?l<0_m5fV)Ki<5u;WaMZacAtV9$y<i_KGgs
zF__eCZRPXJj(_lpIrv-;vU5S;;|A#fy*;14U(cWZAu!F~d2;Zz^}rPG{e08-^Vt-4
zzizgtF55ns)NO6$^ZKqPA78Fo%sB|1>|-D71VV0<4%lr8t<a2pe;!P$kUta8&PB)O
zYR49<_2ThjfriL<Dny1R9Hy4*G)9J05$5pGEsF!FmD`p62vr)5p|_S>sFaIy^94ch
zf~5WKVH>xwEj98l!P}BkX(&x4(H!Im<i~+;TR+vD&A?h9hA!J0o}jHatzhXLu$qBK
zA?z#KEKVR_K2fU|t77{fd9=$O_F0I%5`y};aXO&4qYyqbN_2lMf2_EWqkV)1cT3ns
z8%niqDR^~nYB5U_uM43H1T;@~H%a3;2pD_IL{q|%@LE?Jb$9zF>uwuk-R&x@%a@#W
z`=b317eU)pG=<69RV747IHD^&lR5PG{nowE-}c>uW0aem^J)0wc!cw1ZkG6d&PzOA
z5;hYY4xkq2Jc$zKwuD(8O*KTAK<xlR#b;2>;bT%j??aVOLB3^>)53`-gwak7xVV-$
zSSw@?Mh|C`d}3+b@-OikZWUHdGb4h;&T!+6kwTe|?XTw(E9Gpg{4~yC9r7Z+nYZKl
z3a)UmKS#XS-%K`Do)OzI4>kPbJoyx;$ZX%Fa55cy1QZ^Pxq5gq$qP&1iX2Mu<wm^O
z$oCk!I`U28JRh50;VIxK@RK~&az0Q57|rv6l5qZTDhh|1XyC^Y6#SqbGkAWYP7Tr+
zZ08q6a^#0na(*<shyv?v#as2p(u+a#fl99W%!y)sPrg#wldD!2mj{-L)!O{x@qv+h
W_eS?d!)wqP-9Pd|4<S?VMg2D&&529^

literal 0
HcmV?d00001

diff --git a/tests/testthat/test-feature-align_select-mz.R b/tests/testthat/test-feature-align_select-mz.R
new file mode 100644
index 0000000..6148bdb
--- /dev/null
+++ b/tests/testthat/test-feature-align_select-mz.R
@@ -0,0 +1,15 @@
+test_that("select_mz function works", {
+  sample <- read_parquet("../testdata/input/feature-align_select-mz.parquet")
+  sample_names <- c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened")
+  min_occurrence <- 2
+  mz_tol_relative <- 6.85676325338646e-06
+  rt_tol_relative <- 2.17918873407775
+
+  actual <- select_mz(sample,
+                      mz_tol_relative,
+                      rt_tol_relative,
+                      min_occurrence,
+                      sample_names)
+  expected <- readRDS("../testdata/aligned/output_select-mz.rds")
+  expect_equal(actual, expected)
+})
\ No newline at end of file

From a4e957956fbe124bed9c249a449a3b30745c9af5 Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Fri, 19 Jul 2024 13:43:00 +0200
Subject: [PATCH 02/22] create_empty_tibble added to @export

---
 R/feature.align.R | 20 ++++----------------
 R/utils.R         |  1 +
 2 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/R/feature.align.R b/R/feature.align.R
index 0aa759f..3b95a56 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -1,5 +1,6 @@
 #' @import foreach
 
+#' @export
 create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) {
     features <- new("list")
     features$metadata <- tibble::as_tibble(matrix(nrow = 0, ncol = length(metadata_colnames)), .name_repair = ~metadata_colnames)
@@ -85,23 +86,12 @@ select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence,
 
 #' @export
 create_rows <- function(features,
-                        i,
-                        sel.labels,
                         mz_tol_relative,
                         rt_tol_relative,
                         min_occurrence,
                         sample_names) {
-    if (i %% 100 == 0) {
-        gc()
-    } # call Garbage Collection for performance improvement?
-
-    sample <- dplyr::filter(features, cluster == sel.labels[i])
-    if (nrow(sample) > 1) {
-        if (validate_contents(sample, min_occurrence)) {
-            return(select_mz(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names))
-        }
-    } else if (min_occurrence == 1) {
-        return(create_output(sample_grouped, sample_names))
+    if (validate_contents(features, min_occurrence)) {
+        return(select_mz(features, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names))
     }
     return(NULL)
 }
@@ -160,9 +150,7 @@ create_aligned_feature_table <- function(features_table,
         i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE
     ) %do% {
         rows <- create_rows(
-            features_table,
-            i,
-            sel.labels, 
+            dplyr::filter(features_table, cluster == sel.labels[i]),
             mz_tol_relative,
             rt_tol_relative,
             min_occurrence,
diff --git a/R/utils.R b/R/utils.R
index 14f4827..781bc71 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -45,6 +45,7 @@ register_functions_to_cluster <- function(cluster) {
         'compute_uniq_grp',
         'predict_smoothed_rt',
         'label_val_to_keep',
+        "create_empty_tibble",
         "create_rows",
         "validate_contents",
         "select_mz",

From a4cc11adcaf15b4b1b3beeccde408c2b4cb879ac Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Fri, 19 Jul 2024 16:14:44 +0200
Subject: [PATCH 03/22] documentation to functions added

---
 R/feature.align.R | 56 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 7 deletions(-)

diff --git a/R/feature.align.R b/R/feature.align.R
index 3b95a56..d391f1a 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -1,5 +1,11 @@
 #' @import foreach
 
+#' Create an empty tibble for the next alignment step. It will contain three tables with aligned metadata, intensities an RTs.
+#' @param number_of_samples Number of different sample names.
+#' @param metadata_colnames Metadata column names: "id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names
+#' @param intensity_colnames "id" and sample names; will hold intensities.
+#' @param rt_colnames "id" and sample names; will hold retention times.
+#' @return An empty tibble with slots for metadata, intensities and RTs.
 #' @export
 create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) {
     features <- new("list")
@@ -9,6 +15,10 @@ create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_
     return(features)
 }
 
+#' Create a list containing 3 tibbles: metadata, intensities and RTs.
+#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
+#' @param sample_names A list of sample names.
+#' @return A list containing 3 tibbles: metadata, intensities and RTs.
 #' @export
 create_output <- function(sample_grouped, sample_names) {
     number_of_samples <- length(sample_names)
@@ -32,9 +42,12 @@ create_output <- function(sample_grouped, sample_names) {
     return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row))
 }
 
+#' Validates if the data is present in more than "min_occurence" of samples.
+#' @param samples A subset of the features_table.
+#' @param min_occurrence A minimal number of profiles a feature has to be present in.
+#' @return boolean value whether it is TRUE or FALSE.
 #' @export
 validate_contents <- function(samples, min_occurrence) {
-    # validate whether data is still from at least 'min_occurrence' number of samples
     if (!is.null(nrow(samples))) {
         if (length(unique(samples$sample_id)) >= min_occurrence) {
             return(TRUE)
@@ -44,24 +57,37 @@ validate_contents <- function(samples, min_occurrence) {
     return(FALSE)
 }
 
+#' Compute the kernel density estimation and find the peaks and valleys of a smooth curve.
+#' @param data A vector of m/z or RTs for a particular cluster.
+#' @param bandwidth A bandwidth value for the KDE computation.
+#' @return A list of peaks and valleys positions.
 #' @export
 find_optima <- function(data, bandwidth) {
-    # Kernel Density Estimation
-    den <- density(data, bw = bandwidth)
-    # select statistically significant points
-    turns <- find.turn.point(den$y)
-    return(list(peaks = den$x[turns$pks], valleys = den$x[turns$vlys]))
+  den <- density(data, bw = bandwidth)
+  turns <- find.turn.point(den$y)
+  return(list(peaks = den$x[turns$pks], valleys = den$x[turns$vlys]))
 }
 
+#' Subset data within lower and upper bound from density estimation
+#' @param sample A subset of the features_table.
+#' @param turns A list of peaks and valleys positions.
+#' @param index Whether it subsets on m/z [1] or RT [2] column.
+#' @param i Iterates over the peaks in the turns list.
+#' @return Dataframe subsetted within lower and upper bound from density estimation.
 #' @export
 filter_based_on_density <- function(sample, turns, index, i) {
-    # select data within lower and upper bound from density estimation
     lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]])
     upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]])
     selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound)
     return(sample[selected, ])
 }
 
+#' Groups the features across samples based on RT.
+#' @param sample A dataframe subsetted for the particular cluster.
+#' @param rt_tol_relative The retention time tolerance level for peak alignment.
+#' @param min_occurence A minimal number of profiles a feature has to be present in.
+#' @param sample_names A list of sample names.
+#' @param return A list containing 3 tibbles: metadata, intensities and RTs.
 #' @export
 select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) {
     turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414)
@@ -73,6 +99,13 @@ select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) {
     }
 }
 
+#' Groups the features across samples based on m/z.
+#' @param sample A dataframe subsetted for the particular cluster.
+#' @param mz_tol_relative The m/z tolerance level for peak alignment.
+#' @param rt_tol_relative The retention time tolerance level for peak alignment.
+#' @param min_occurence A minimal number of profiles a feature has to be present in.
+#' @param sample_names A list of sample names.
+#' @return A list containing 3 tibbles: metadata, intensities and RTs.
 #' @export
 select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) {
     turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz))
@@ -84,6 +117,13 @@ select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence,
     }
 }
 
+#' Groups the mz and RT for particular cluster.
+#' @param features The features table subsetted for a particular cluster.
+#' @param mz_tol_relative The m/z tolerance level for peak alignment.
+#' @param rt_tol_relative The retention time tolerance level for peak alignment.
+#' @param min_occurrence A minimal number of profiles a feature has to be present in.
+#' @param sample_names A list of sample names.
+#' @return A list containing 3 tibbles: metadata, intensities and RTs.
 #' @export
 create_rows <- function(features,
                         mz_tol_relative,
@@ -96,6 +136,8 @@ create_rows <- function(features,
     return(NULL)
 }
 
+#' Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble.
+#' @return Tibbles combining the output (metadata, intensity and RT respectively) from different clusters.
 #' @export
 comb <- function(x, ...) {
     mapply(tibble::as_tibble, (mapply(rbind, x, ..., SIMPLIFY = FALSE)))

From 127e714f7d6fe32de2720c6f3db5e6b3db1d8577 Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Fri, 19 Jul 2024 16:21:41 +0200
Subject: [PATCH 04/22] roxygen updated documentation

---
 DESCRIPTION |  2 +-
 NAMESPACE   | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 80196d4..3b1effc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -17,4 +17,4 @@ NeedsCompilation: no
 Suggests: 
     dataCompareR, testthat (>= 3.0.0), microbenchmark
 Config/testthat/edition: 3
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
index 1061d6a..995a087 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -42,6 +42,7 @@ export(compute_uniq_grp)
 export(correct_time)
 export(count_peaks)
 export(create_aligned_feature_table)
+export(create_empty_tibble)
 export(create_output)
 export(create_rows)
 export(draw_rt_correction_plot)
@@ -101,19 +102,37 @@ export(two.step.hybrid)
 export(unsupervised)
 export(validate_contents)
 export(validate_model_method_input)
+import("for")
+import("metadata,")
+import("next")
+import(Create)
+import(It)
 import(MASS)
+import(RTs.)
+import(aligned)
+import(alignment)
+import(an)
 import(arrow)
+import(contain)
 import(doParallel)
 import(dplyr)
+import(empty)
 import(foreach)
+import(intensities)
 import(mzR)
 import(parallel)
 import(snow)
 import(splines)
+import(step.)
 import(stringr)
+import(tables)
+import(the)
+import(three)
 import(tibble)
 import(tidyr)
 import(tools)
+import(will)
+import(with)
 importFrom(dplyr,arrange)
 importFrom(dplyr,between)
 importFrom(dplyr,bind_rows)

From b2ca09dd21f5c7bbea25b861d76b0d2e0b0518de Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Mon, 22 Jul 2024 08:56:31 +0200
Subject: [PATCH 05/22] reindent R/feature.align.R from 4 to 2 spaces (whitespace only)

---
 R/feature.align.R | 186 +++++++++++++++++++++++-----------------------
 1 file changed, 93 insertions(+), 93 deletions(-)

diff --git a/R/feature.align.R b/R/feature.align.R
index d391f1a..16a2fc5 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -8,11 +8,11 @@
 #' @return An empty tibble with slots for metadata, intensities and RTs.
 #' @export
 create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) {
-    features <- new("list")
-    features$metadata <- tibble::as_tibble(matrix(nrow = 0, ncol = length(metadata_colnames)), .name_repair = ~metadata_colnames)
-    features$intensity <- tibble::as_tibble(matrix(nrow = 0, ncol = length(intensity_colnames)), .name_repair = ~intensity_colnames)
-    features$rt <- tibble::as_tibble(matrix(nrow = 0, ncol = length(rt_colnames)), .name_repair = ~rt_colnames)
-    return(features)
+  features <- new("list")
+  features$metadata <- tibble::as_tibble(matrix(nrow = 0, ncol = length(metadata_colnames)), .name_repair = ~metadata_colnames)
+  features$intensity <- tibble::as_tibble(matrix(nrow = 0, ncol = length(intensity_colnames)), .name_repair = ~intensity_colnames)
+  features$rt <- tibble::as_tibble(matrix(nrow = 0, ncol = length(rt_colnames)), .name_repair = ~rt_colnames)
+  return(features)
 }
 
 #' Create a list containing 3 tibbles: metadata, intensities and RTs.
@@ -21,25 +21,25 @@ create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_
 #' @return A list containing 3 tibbles: metadata, intensities and RTs.
 #' @export
 create_output <- function(sample_grouped, sample_names) {
-    number_of_samples <- length(sample_names)
-    intensity_row <- rep(0, number_of_samples)
-    rt_row <- rep(0, number_of_samples)
-    sample_presence <- rep(0, number_of_samples)
-
-    for (i in seq_along(intensity_row)) {
-        filtered <- filter(sample_grouped, sample_id == sample_names[i])
-        if (nrow(filtered) != 0) {
-            sample_presence[i] <- 1
-            intensity_row[i] <- sum(filtered$area)
-            rt_row[i] <- median(filtered$rt)
-        }
+  number_of_samples <- length(sample_names)
+  intensity_row <- rep(0, number_of_samples)
+  rt_row <- rep(0, number_of_samples)
+  sample_presence <- rep(0, number_of_samples)
+
+  for (i in seq_along(intensity_row)) {
+    filtered <- filter(sample_grouped, sample_id == sample_names[i])
+    if (nrow(filtered) != 0) {
+      sample_presence[i] <- 1
+      intensity_row[i] <- sum(filtered$area)
+      rt_row[i] <- median(filtered$rt)
     }
+  }
 
-    mz <- sample_grouped$mz
-    rt <- sample_grouped$rt
-    metadata_row <- c(mean(mz), min(mz), max(mz), mean(rt), min(rt), max(rt), nrow(sample_grouped), sample_presence)
+  mz <- sample_grouped$mz
+  rt <- sample_grouped$rt
+  metadata_row <- c(mean(mz), min(mz), max(mz), mean(rt), min(rt), max(rt), nrow(sample_grouped), sample_presence)
 
-    return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row))
+  return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row))
 }
 
 #' Validates if the data is present in more than "min_occurence" of samples.
@@ -48,13 +48,13 @@ create_output <- function(sample_grouped, sample_names) {
 #' @return boolean value whether it is TRUE or FALSE.
 #' @export
 validate_contents <- function(samples, min_occurrence) {
-    if (!is.null(nrow(samples))) {
-        if (length(unique(samples$sample_id)) >= min_occurrence) {
-            return(TRUE)
-        }
-        return(FALSE)
+  if (!is.null(nrow(samples))) {
+    if (length(unique(samples$sample_id)) >= min_occurrence) {
+      return(TRUE)
     }
     return(FALSE)
+  }
+  return(FALSE)
 }
 
 #' Compute the kernel density estimation and find the peaks and valleys of a smooth curve.
@@ -76,10 +76,10 @@ find_optima <- function(data, bandwidth) {
 #' @return Dataframe subsetted within lower and upper bound from density estimation.
 #' @export
 filter_based_on_density <- function(sample, turns, index, i) {
-    lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]])
-    upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]])
-    selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound)
-    return(sample[selected, ])
+  lower_bound <- max(turns$valleys[turns$valleys < turns$peaks[i]])
+  upper_bound <- min(turns$valleys[turns$valleys > turns$peaks[i]])
+  selected <- which(sample[, index] > lower_bound & sample[, index] <= upper_bound)
+  return(sample[selected, ])
 }
 
 #' Groups the features across samples based on RT.
@@ -90,13 +90,13 @@ filter_based_on_density <- function(sample, turns, index, i) {
 #' @param return A list containing 3 tibbles: metadata, intensities and RTs.
 #' @export
 select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) {
-    turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414)
-    for (i in seq_along(turns$peaks)) {
-        sample_grouped <- filter_based_on_density(sample, turns, 2, i)
-        if (validate_contents(sample_grouped, min_occurrence)) {
-            return(create_output(sample_grouped, sample_names))
-        }
+  turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414)
+  for (i in seq_along(turns$peaks)) {
+    sample_grouped <- filter_based_on_density(sample, turns, 2, i)
+    if (validate_contents(sample_grouped, min_occurrence)) {
+      return(create_output(sample_grouped, sample_names))
     }
+  }
 }
 
 #' Groups the features across samples based on m/z.
@@ -108,13 +108,13 @@ select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) {
 #' @return A list containing 3 tibbles: metadata, intensities and RTs.
 #' @export
 select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) {
-    turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz))
-    for (i in seq_along(turns$peaks)) {
-        sample_grouped <- filter_based_on_density(sample, turns, 1, i)
-        if (validate_contents(sample_grouped, min_occurrence)) {
-            return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, sample_names))
-        }
+  turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz))
+  for (i in seq_along(turns$peaks)) {
+    sample_grouped <- filter_based_on_density(sample, turns, 1, i)
+    if (validate_contents(sample_grouped, min_occurrence)) {
+      return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, sample_names))
     }
+  }
 }
 
 #' Groups the mz and RT for particular cluster.
@@ -130,17 +130,17 @@ create_rows <- function(features,
                         rt_tol_relative,
                         min_occurrence,
                         sample_names) {
-    if (validate_contents(features, min_occurrence)) {
-        return(select_mz(features, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names))
-    }
-    return(NULL)
+  if (validate_contents(features, min_occurrence)) {
+    return(select_mz(features, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names))
+  }
+  return(NULL)
 }
 
 #' Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble.
 #' @return Tibbles combining the output (metadata, intensity and RT respectively) from different clusters.
 #' @export
 comb <- function(x, ...) {
-    mapply(tibble::as_tibble, (mapply(rbind, x, ..., SIMPLIFY = FALSE)))
+  mapply(tibble::as_tibble, (mapply(rbind, x, ..., SIMPLIFY = FALSE)))
 }
 
 #' Align peaks from spectra into a feature table.
@@ -163,54 +163,54 @@ create_aligned_feature_table <- function(features_table,
                                          rt_tol_relative,
                                          mz_tol_relative,
                                          cluster = 4) {
-    if (!is(cluster, "cluster")) {
-        cluster <- parallel::makeCluster(cluster)
-        on.exit(parallel::stopCluster(cluster))
-        
-        # NOTE: side effect (doParallel has no functionality to clean up)
-        doParallel::registerDoParallel(cluster)
-        register_functions_to_cluster(cluster)
+  if (!is(cluster, "cluster")) {
+    cluster <- parallel::makeCluster(cluster)
+    on.exit(parallel::stopCluster(cluster))
+
+    # NOTE: side effect (doParallel has no functionality to clean up)
+    doParallel::registerDoParallel(cluster)
+    register_functions_to_cluster(cluster)
+  }
+
+
+
+  number_of_samples <- length(sample_names)
+  metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names)
+  intensity_colnames <- c("id", sample_names)
+  rt_colnames <- c("id", sample_names)
+
+  aligned_features <- create_empty_tibble(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames)
+
+  # table with number of values per group
+  groups_cardinality <- table(features_table$cluster)
+  # count those with minimal occurrence
+  sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence])
+
+  # retention time alignment
+
+  aligned_features <- foreach::foreach(
+    i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE
+  ) %do% {
+    rows <- create_rows(
+      dplyr::filter(features_table, cluster == sel.labels[i]),
+      mz_tol_relative,
+      rt_tol_relative,
+      min_occurrence,
+      sample_names
+    )
+
+    if (!is.null(rows)) {
+      rows$metadata_row <- c(i, rows$metadata_row)
+      rows$intensity_row <- c(i, rows$intensity_row)
+      rows$rt_row <- c(i, rows$rt_row)
     }
 
+    list(metadata = rows$metadata_row, intensity = rows$intensity_row, rt = rows$rt_row)
+  }
 
+  colnames(aligned_features$metadata) <- metadata_colnames
+  colnames(aligned_features$intensity) <- intensity_colnames
+  colnames(aligned_features$rt) <- rt_colnames
 
-    number_of_samples <- length(sample_names)
-    metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names)
-    intensity_colnames <- c("id", sample_names)
-    rt_colnames <- c("id", sample_names)
-
-    aligned_features <- create_empty_tibble(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames)
-
-    # table with number of values per group
-    groups_cardinality <- table(features_table$cluster)
-    # count those with minimal occurrence
-    sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence])
-
-    # retention time alignment
-    
-    aligned_features <- foreach::foreach(
-        i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE
-    ) %do% {
-        rows <- create_rows(
-            dplyr::filter(features_table, cluster == sel.labels[i]),
-            mz_tol_relative,
-            rt_tol_relative,
-            min_occurrence,
-            sample_names
-        )
-
-        if (!is.null(rows)) {
-            rows$metadata_row <- c(i, rows$metadata_row)
-            rows$intensity_row <- c(i, rows$intensity_row)
-            rows$rt_row <- c(i, rows$rt_row)
-        }
-
-        list(metadata = rows$metadata_row, intensity = rows$intensity_row, rt = rows$rt_row)
-    }
-
-    colnames(aligned_features$metadata) <- metadata_colnames
-    colnames(aligned_features$intensity) <- intensity_colnames
-    colnames(aligned_features$rt) <- rt_colnames
-
-    return(aligned_features)
+  return(aligned_features)
 }

From 3bede62beeaa1dbb3047036d75ac3fcbe9c29f87 Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Mon, 22 Jul 2024 11:27:56 +0200
Subject: [PATCH 06/22] extract create_metadata helper from create_output

---
 R/feature.align.R | 37 +++++++++++++++++++++++++++----------
 1 file changed, 27 insertions(+), 10 deletions(-)

diff --git a/R/feature.align.R b/R/feature.align.R
index 16a2fc5..cf93b83 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -1,7 +1,8 @@
 #' @import foreach
 
 #' Create an empty tibble for the next alignment step. It will contain three tables with aligned metadata, intensities an RTs.
-#' @param number_of_samples Number of different sample names.
+#' @param number_of_samples Number
+#'  of different sample names.
 #' @param metadata_colnames Metadata column names: "id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names
 #' @param intensity_colnames "id" and sample names; will hold intensities.
 #' @param rt_colnames "id" and sample names; will hold retention times.
@@ -15,30 +16,48 @@ create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_
   return(features)
 }
 
+create_metadata <- function(sample_grouped, sample_names) {
+  sample_presence <- sapply(sample_names,
+    FUN=function(x) {
+      as.numeric(any(sample_grouped$sample_id == x))
+    }
+  )
+
+  metadata_row <- dplyr::summarise(
+    sample_grouped,
+    mzmean = mean(mz),
+    mzmin = min(mz),
+    mzmax = max(mz),
+    rtmean = mean(rt),
+    rtmin = min(rt),
+    rtmax = max(rt),
+    npeaks = n()
+  ) %>% rename(mz = "mzmean", rt = "rtmean")
+
+  metadata_row <- dplyr::bind_cols(metadata_row, as.list(sample_presence))
+  return(as.vector(unlist(metadata_row[1,])))
+}
+
 #' Create a list containing 3 tibbles: metadata, intensities and RTs.
 #' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
 #' @param sample_names A list of sample names.
 #' @return A list containing 3 tibbles: metadata, intensities and RTs.
 #' @export
 create_output <- function(sample_grouped, sample_names) {
+  metadata_row <- create_metadata(sample_grouped, sample_names)
+
   number_of_samples <- length(sample_names)
   intensity_row <- rep(0, number_of_samples)
   rt_row <- rep(0, number_of_samples)
-  sample_presence <- rep(0, number_of_samples)
 
   for (i in seq_along(intensity_row)) {
     filtered <- filter(sample_grouped, sample_id == sample_names[i])
+
     if (nrow(filtered) != 0) {
-      sample_presence[i] <- 1
       intensity_row[i] <- sum(filtered$area)
       rt_row[i] <- median(filtered$rt)
     }
   }
-
-  mz <- sample_grouped$mz
-  rt <- sample_grouped$rt
-  metadata_row <- c(mean(mz), min(mz), max(mz), mean(rt), min(rt), max(rt), nrow(sample_grouped), sample_presence)
-
   return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row))
 }
 
@@ -172,8 +191,6 @@ create_aligned_feature_table <- function(features_table,
     register_functions_to_cluster(cluster)
   }
 
-
-
   number_of_samples <- length(sample_names)
   metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names)
   intensity_colnames <- c("id", sample_names)

From 2036537090ce0dad6637690351faf5ba0533cd5f Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Mon, 22 Jul 2024 12:42:52 +0200
Subject: [PATCH 07/22] updated select_mz to return tibble rows

---
 R/feature.align.R                           |  35 ++++++++++++--------
 tests/testdata/aligned/output_select-mz.rds | Bin 207 -> 354 bytes
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/R/feature.align.R b/R/feature.align.R
index cf93b83..f85b38f 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -35,7 +35,11 @@ create_metadata <- function(sample_grouped, sample_names) {
   ) %>% rename(mz = "mzmean", rt = "rtmean")
 
   metadata_row <- dplyr::bind_cols(metadata_row, as.list(sample_presence))
-  return(as.vector(unlist(metadata_row[1,])))
+  return(metadata_row)
+}
+
+first_tibble_row_as_vector <- function(x) {
+  return(as.vector(unlist(x[1,])))
 }
 
 #' Create a list containing 3 tibbles: metadata, intensities and RTs.
@@ -46,19 +50,22 @@ create_metadata <- function(sample_grouped, sample_names) {
 create_output <- function(sample_grouped, sample_names) {
   metadata_row <- create_metadata(sample_grouped, sample_names)
 
-  number_of_samples <- length(sample_names)
-  intensity_row <- rep(0, number_of_samples)
-  rt_row <- rep(0, number_of_samples)
-
-  for (i in seq_along(intensity_row)) {
-    filtered <- filter(sample_grouped, sample_id == sample_names[i])
-
-    if (nrow(filtered) != 0) {
-      intensity_row[i] <- sum(filtered$area)
-      rt_row[i] <- median(filtered$rt)
-    }
-  }
-  return(list(metadata_row = metadata_row, intensity_row = intensity_row, rt_row = rt_row))
+  intensity_row <- sample_grouped %>%
+   group_by(sample_id) %>%
+   summarise(intensity = sum(area)) %>%
+   pivot_wider(names_from = "sample_id", values_from = "intensity")
+
+
+  rt_row <- sample_grouped %>%
+   group_by(sample_id) %>%
+   summarise(rt = median(rt)) %>%
+   pivot_wider(names_from = "sample_id", values_from = "rt")
+  
+  return(list(
+    metadata_row = (metadata_row),
+    intensity_row = (intensity_row),
+    rt_row = (rt_row)
+  ))
 }
 
 #' Validates if the data is present in more than "min_occurence" of samples.
diff --git a/tests/testdata/aligned/output_select-mz.rds b/tests/testdata/aligned/output_select-mz.rds
index ed8d4b2f622a82cddc3a57d6b869a5d04a29a2d3..72675305ea18a1197b783bf0395c7e2ac029f6aa 100644
GIT binary patch
delta 324
zcmV-K0lWUs0pbFXCx6T!8l;F1h#4KCo6K2#L|u{iW@f@yP9e(|wAF~g`RV%>@>lJ>
zi{P*C?z;Q03&Fp5a*aqe$b4Q9A7Up&&i(@f3=rV6FoE^5FmQrAkeriPTnuCgKzs%k
zWGhL^iBCy`@|b}<D3dECu_RG1ttc@!70n#ZqWp5bJfL_n)PHeI4M5^Q5TIKKQzQTm
zDTp9bZWT;Aw<<R?4~dgl0aH^{0uwGOK~e+cz}2wj6{IF+7ejdhLCz8J24?Za8Tmye
zsd=d>DB|Wg#4V7-85sUS;}jmwj)CkCyQhBRgYyH}+a{}|Er#>GDt|j1JIV?1HsgP?
zgO=$(_K;*oH$9Rj-gHRc)%(5n01_WpQXFF5#A-HAZfZ#)$Yt@sWDFJK&CCO8E6yyb
WL=t2xDgpC=MgaiHc|JW11ONawX^q7I

delta 176
zcmV;h08jto0?z@ECw~llK+NS3-DJ+{BkBsI&CG<aoC5I++G@lc()TUouiAUpA$@&!
z*WG_z4(S(9t`Vu`a^PS91N#ryX{fo(j)CkCyQhBRa|~c_o2-(y*wL%<x5KfcoG^KZ
z^j*E*YY#A^o6o?&!UU9OWME+csb$Se%uOu@G6W#L0SofvrYV*rrX-dm#uw$6L&bPA
e^GZ_liZe?pkp$U_O29k@hJOIY;gf$40RRBm8&^C4


From 563389f505fc591dcec724199ae9f8afe3b1da7d Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 08:49:33 +0200
Subject: [PATCH 08/22] test updated for refactored code

---
 tests/testthat/test-feature-align_select-mz.R | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/testthat/test-feature-align_select-mz.R b/tests/testthat/test-feature-align_select-mz.R
index 6148bdb..59261c5 100644
--- a/tests/testthat/test-feature-align_select-mz.R
+++ b/tests/testthat/test-feature-align_select-mz.R
@@ -1,15 +1,16 @@
-test_that("select_mz function works", {
-  sample <- read_parquet("../testdata/input/feature-align_select-mz.parquet")
+test_that("create_features_from_cluster() function works", {
+  sample <- read_parquet("../testdata/input/feature-align_create-features.parquet")
   sample_names <- c("RCX_06_shortened", "RCX_07_shortened", "RCX_08_shortened")
   min_occurrence <- 2
   mz_tol_relative <- 6.85676325338646e-06
   rt_tol_relative <- 2.17918873407775
 
-  actual <- select_mz(sample,
+  actual <- create_features_from_cluster(sample,
                       mz_tol_relative,
                       rt_tol_relative,
                       min_occurrence,
                       sample_names)
-  expected <- readRDS("../testdata/aligned/output_select-mz.rds")
+  
+  expected <- readRDS("../testdata/aligned/output_create-features.rds")
   expect_equal(actual, expected)
 })
\ No newline at end of file

From b5f78ff2b321cfe33a4ed98a333ad53a7eb7cdbc Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 08:56:17 +0200
Subject: [PATCH 09/22] tests updated for refactored code

---
 tests/testthat/test-feature-align.R | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/tests/testthat/test-feature-align.R b/tests/testthat/test-feature-align.R
index c627879..78aac89 100644
--- a/tests/testthat/test-feature-align.R
+++ b/tests/testthat/test-feature-align.R
@@ -84,12 +84,16 @@ patrick::with_parameters_test_that(
         get_num_workers()
     )
 
-    aligned_expected <- list(
-      metadata = arrow::read_parquet(file.path(testdata, "aligned", "metadata_table.parquet")),
-      intensity = arrow::read_parquet(file.path(testdata, "aligned", "intensity_table.parquet")),
-      rt = arrow::read_parquet(file.path(testdata, "aligned", "rt_table.parquet"))
+    aligned_expected <- load_aligned_features(
+      file.path(testdata, "aligned", "metadata_table.parquet"),
+      file.path(testdata, "aligned", "intensity_table.parquet"),
+      file.path(testdata, "aligned", "rt_table.parquet"),
+      file.path(testdata, "aligned", "tolerances.parquet")
     )
 
+    aligned_expected["mz_tol_relative"] <- NULL
+    aligned_expected["rt_tol_relative"] <- NULL
+
     expect_equal(aligned_actual, aligned_expected)
   },
   patrick::cases(

From 40457b04aa2d32b5ed1e5833992b68ec197f3075 Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 09:58:41 +0200
Subject: [PATCH 10/22] function documentation updated

---
 R/feature.align.R | 166 ++++++++++++++++++++--------------------------
 1 file changed, 73 insertions(+), 93 deletions(-)

diff --git a/R/feature.align.R b/R/feature.align.R
index f85b38f..ce5a7a8 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -1,21 +1,8 @@
 #' @import foreach
 
-#' Create an empty tibble for the next alignment step. It will contain three tables with aligned metadata, intensities an RTs.
-#' @param number_of_samples Number
-#'  of different sample names.
-#' @param metadata_colnames Metadata column names: "id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names
-#' @param intensity_colnames "id" and sample names; will hold intensities.
-#' @param rt_colnames "id" and sample names; will hold retention times.
-#' @return An empty tibble with slots for metadata, intensities and RTs.
-#' @export
-create_empty_tibble <- function(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames) {
-  features <- new("list")
-  features$metadata <- tibble::as_tibble(matrix(nrow = 0, ncol = length(metadata_colnames)), .name_repair = ~metadata_colnames)
-  features$intensity <- tibble::as_tibble(matrix(nrow = 0, ncol = length(intensity_colnames)), .name_repair = ~intensity_colnames)
-  features$rt <- tibble::as_tibble(matrix(nrow = 0, ncol = length(rt_colnames)), .name_repair = ~rt_colnames)
-  return(features)
-}
-
+#' Create a metadata row tibble with min, max and mean mz and RT values.
+#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
+#' @param sample_names A list of sample names.
 create_metadata <- function(sample_grouped, sample_names) {
   sample_presence <- sapply(sample_names,
     FUN=function(x) {
@@ -38,33 +25,41 @@ create_metadata <- function(sample_grouped, sample_names) {
   return(metadata_row)
 }
 
-first_tibble_row_as_vector <- function(x) {
-  return(as.vector(unlist(x[1,])))
-}
-
-#' Create a list containing 3 tibbles: metadata, intensities and RTs.
+#' Compute summed area for each sample
 #' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
-#' @param sample_names A list of sample names.
-#' @return A list containing 3 tibbles: metadata, intensities and RTs.
-#' @export
-create_output <- function(sample_grouped, sample_names) {
-  metadata_row <- create_metadata(sample_grouped, sample_names)
-
-  intensity_row <- sample_grouped %>%
+#' @return Summed area for each sample.
+create_intensity_row <- function(sample_grouped) {
+  sample_grouped %>%
    group_by(sample_id) %>%
    summarise(intensity = sum(area)) %>%
    pivot_wider(names_from = "sample_id", values_from = "intensity")
+}
 
+#' Compute median RT for each sample
+#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
+#' @return Median RT for each sample.
 
-  rt_row <- sample_grouped %>%
+create_rt_row <- function(sample_grouped) {
+  sample_grouped %>%
    group_by(sample_id) %>%
    summarise(rt = median(rt)) %>%
    pivot_wider(names_from = "sample_id", values_from = "rt")
+}
+
+#' Create a list containing 3 tibbles: metadata, intensities and RTs.
+#' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
+#' @param sample_names A list of sample names.
+#' @return A list containing 3 tibbles: metadata, intensities and RTs.
+#' @export
+create_output <- function(sample_grouped, sample_names) {
+  metadata_row <- create_metadata(sample_grouped, sample_names)
+  intensity_row <- create_intensity_row(sample_grouped)
+  rt_row <- create_rt_row(sample_grouped)
   
   return(list(
-    metadata_row = (metadata_row),
-    intensity_row = (intensity_row),
-    rt_row = (rt_row)
+    metadata_row = metadata_row,
+    intensity_row = intensity_row,
+    rt_row = rt_row
   ))
 }
 
@@ -108,42 +103,7 @@ filter_based_on_density <- function(sample, turns, index, i) {
   return(sample[selected, ])
 }
 
-#' Groups the features across samples based on RT.
-#' @param sample A dataframe subsetted for the particular cluster.
-#' @param rt_tol_relative The retention time tolerance level for peak alignment.
-#' @param min_occurence A minimal number of profiles a feature has to be present in.
-#' @param sample_names A list of sample names.
-#' @param return A list containing 3 tibbles: metadata, intensities and RTs.
-#' @export
-select_rt <- function(sample, rt_tol_relative, min_occurrence, sample_names) {
-  turns <- find_optima(sample$rt, bandwidth = rt_tol_relative / 1.414)
-  for (i in seq_along(turns$peaks)) {
-    sample_grouped <- filter_based_on_density(sample, turns, 2, i)
-    if (validate_contents(sample_grouped, min_occurrence)) {
-      return(create_output(sample_grouped, sample_names))
-    }
-  }
-}
-
-#' Groups the features across samples based on m/z.
-#' @param sample A dataframe subsetted for the particular cluster.
-#' @param mz_tol_relative The m/z tolerance level for peak alignment.
-#' @param rt_tol_relative The retention time tolerance level for peak alignment.
-#' @param min_occurence A minimal number of profiles a feature has to be present in.
-#' @param sample_names A list of sample names.
-#' @return A list containing 3 tibbles: metadata, intensities and RTs.
-#' @export
-select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names) {
-  turns <- find_optima(sample$mz, bandwidth = mz_tol_relative * median(sample$mz))
-  for (i in seq_along(turns$peaks)) {
-    sample_grouped <- filter_based_on_density(sample, turns, 1, i)
-    if (validate_contents(sample_grouped, min_occurrence)) {
-      return(select_rt(sample_grouped, rt_tol_relative, min_occurrence, sample_names))
-    }
-  }
-}
-
-#' Groups the mz and RT for particular cluster.
+#' Group the mz and RT for particular cluster.
 #' @param features The features table subsetted for a particular cluster.
 #' @param mz_tol_relative The m/z tolerance level for peak alignment.
 #' @param rt_tol_relative The retention time tolerance level for peak alignment.
@@ -151,22 +111,57 @@ select_mz <- function(sample, mz_tol_relative, rt_tol_relative, min_occurrence,
 #' @param sample_names A list of sample names.
 #' @return A list containing 3 tibbles: metadata, intensities and RTs.
 #' @export
-create_rows <- function(features,
+create_features_from_cluster <- function(features,
                         mz_tol_relative,
                         rt_tol_relative,
                         min_occurrence,
                         sample_names) {
-  if (validate_contents(features, min_occurrence)) {
-    return(select_mz(features, mz_tol_relative, rt_tol_relative, min_occurrence, sample_names))
+  if (!validate_contents(features, min_occurrence)) {
+    return(NULL)
   }
-  return(NULL)
+
+  # create empty tibble rows
+  metadata <- NULL
+  intensity <- NULL
+  rt <- NULL
+
+  # split according to mz values
+  turns_mz <- find_optima(features$mz, bandwidth = mz_tol_relative * median(features$mz))
+  for (i in seq_along(turns_mz$peaks)) {
+    sample_grouped_mz <- filter_based_on_density(features, turns_mz, 1, i)
+    if (validate_contents(sample_grouped_mz, min_occurrence)) {
+
+      #split according to rt values
+      turns_rt <- find_optima(sample_grouped_mz$rt, bandwidth = rt_tol_relative / 1.414)
+      for (ii in seq_along(turns_rt$peaks)) {
+        sample_grouped_rt <- filter_based_on_density(sample_grouped_mz, turns_rt, 2, ii)
+
+        # create output rows if valid
+        if (validate_contents(sample_grouped_rt, min_occurrence)) {
+          metadata <- dplyr::bind_rows(metadata, create_metadata(sample_grouped_rt, sample_names))
+          intensity <- dplyr::bind_rows(intensity, create_intensity_row(sample_grouped_rt))
+          rt <- dplyr::bind_rows(rt, create_rt_row(sample_grouped_rt))
+        }
+      }
+    }
+  }
+ 
+  return(list(metadata_row = metadata, intensity_row = intensity, rt_row = rt))
 }
 
 #' Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble.
 #' @return Tibbles combining the output (metadata, intensity and RT respectively) from different clusters.
 #' @export
 comb <- function(x, ...) {
-  mapply(tibble::as_tibble, (mapply(rbind, x, ..., SIMPLIFY = FALSE)))
+  mapply(plyr::rbind.fill, x, ..., SIMPLIFY = FALSE)
+}
+
+#' Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble
+#' @param x A dataframe
+#' @param sample_names List of sample names.
+#' @return Cleaned tibble.
+clean_data_matrix <- function(x, sample_names) {
+  x %>% replace(is.na(.), 0) %>% dplyr::relocate(sample_names) %>% as_tibble
 }
 
 #' Align peaks from spectra into a feature table.
@@ -180,7 +175,7 @@ comb <- function(x, ...) {
 #' @param rt_tol_relative The retention time tolerance level for peak alignment. The default is NA, which
 #'  allows the program to search for the tolerance level based on the data.
 #' @param cluster The number of CPU cores to be used
-#' @return A tibble with three tables containing aligned metadata, intensities an RTs.
+#' @return A list of 3 tibbles containing aligned metadata, intensities an RTs.
 #'
 #' @export
 create_aligned_feature_table <- function(features_table,
@@ -198,43 +193,28 @@ create_aligned_feature_table <- function(features_table,
     register_functions_to_cluster(cluster)
   }
 
-  number_of_samples <- length(sample_names)
-  metadata_colnames <- c("id", "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", sample_names)
-  intensity_colnames <- c("id", sample_names)
-  rt_colnames <- c("id", sample_names)
-
-  aligned_features <- create_empty_tibble(number_of_samples, metadata_colnames, intensity_colnames, rt_colnames)
-
   # table with number of values per group
   groups_cardinality <- table(features_table$cluster)
   # count those with minimal occurrence
   sel.labels <- as.numeric(names(groups_cardinality)[groups_cardinality >= min_occurrence])
 
   # retention time alignment
-
   aligned_features <- foreach::foreach(
     i = seq_along(sel.labels), .combine = "comb", .multicombine = TRUE
   ) %do% {
-    rows <- create_rows(
+    rows <- create_features_from_cluster(
       dplyr::filter(features_table, cluster == sel.labels[i]),
       mz_tol_relative,
       rt_tol_relative,
       min_occurrence,
       sample_names
     )
-
-    if (!is.null(rows)) {
-      rows$metadata_row <- c(i, rows$metadata_row)
-      rows$intensity_row <- c(i, rows$intensity_row)
-      rows$rt_row <- c(i, rows$rt_row)
-    }
-
     list(metadata = rows$metadata_row, intensity = rows$intensity_row, rt = rows$rt_row)
   }
 
-  colnames(aligned_features$metadata) <- metadata_colnames
-  colnames(aligned_features$intensity) <- intensity_colnames
-  colnames(aligned_features$rt) <- rt_colnames
+  aligned_features$intensity <- clean_data_matrix(aligned_features$intensity, sample_names)
+  aligned_features$rt <- clean_data_matrix(aligned_features$rt, sample_names)
+  aligned_features$metadata <- as_tibble(aligned_features$metadata)
 
   return(aligned_features)
 }

From ac6e3d3b8d8e11587372a7d46f46b272983bb420 Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 10:23:41 +0200
Subject: [PATCH 11/22] functions updated

---
 R/utils.R | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/R/utils.R b/R/utils.R
index 781bc71..9594fd2 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -45,15 +45,17 @@ register_functions_to_cluster <- function(cluster) {
         'compute_uniq_grp',
         'predict_smoothed_rt',
         'label_val_to_keep',
-        "create_empty_tibble",
-        "create_rows",
+        "create_features_from_cluster",
         "validate_contents",
-        "select_mz",
-        "select_rt",
         "find_optima",
         "filter_based_on_density",
         "create_output",
+        "create_metadata",
+        "create_rt_row",
+        "create_intensity_row",
         "comb",
+        "clean_data_matrix",
+        "create_aligned_feature_table",
         'bigauss.esti.EM',
         'solve_sigma',
         'prep_uv',
@@ -111,9 +113,9 @@ load_aligned_features <- function(metadata_file, intensities_file, rt_file, tol_
     tolerances <- arrow::read_parquet(tol_file)
     
     result <- list()
-    result$metadata <- as_tibble(metadata)
-    result$intensity <- as_tibble(intensities)
-    result$rt <- as_tibble(rt)
+    result$metadata <- as_tibble(metadata) |> select(-id)
+    result$intensity <- as_tibble(intensities) |> select(-id)
+    result$rt <- as_tibble(rt) |> select(-id)
     result$mz_tol_relative <- tolerances$mz_tolerance
     result$rt_tol_relative <- tolerances$rt_tolerance
     return(result)

From fbff31a1a91f4245396a479c0622cd32226d0e5c Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 10:24:55 +0200
Subject: [PATCH 12/22] plyr added as dependency

---
 DESCRIPTION | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 3b1effc..5acef3f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -9,7 +9,7 @@ Description: This is a customized fork of the original work from Tianwei Yu.
         It takes the adaptive processing of LC/MS metabolomics data further
         with focus on high resolution MS for both LC and GC applications.
 Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach,
-        snow, dplyr, tidyr, stringr, tibble, tools, arrow
+        snow, dplyr, tidyr, stringr, tibble, tools, arrow, plyr
 biocViews: Technology, MassSpectrometry
 License: GPL-2
 LazyLoad: yes

From b82097563d21c72ee1992ea9e7ffe6ee3b982395 Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 10:25:47 +0200
Subject: [PATCH 13/22] documentation updated

---
 NAMESPACE | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index 995a087..e408106 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -6,6 +6,7 @@ export(aggregate_by_rt)
 export(bigauss.esti)
 export(bigauss.esti.EM)
 export(bigauss.mix)
+export(clean_data_matrix)
 export(comb)
 export(compute_boundaries)
 export(compute_bounds)
@@ -42,9 +43,11 @@ export(compute_uniq_grp)
 export(correct_time)
 export(count_peaks)
 export(create_aligned_feature_table)
-export(create_empty_tibble)
+export(create_features_from_cluster)
+export(create_intensity_row)
+export(create_metadata)
 export(create_output)
-export(create_rows)
+export(create_rt_row)
 export(draw_rt_correction_plot)
 export(draw_rt_normal_peaks)
 export(duplicate.row.remove)
@@ -91,8 +94,6 @@ export(remove_noise)
 export(rev_cum_sum)
 export(rm.ridge)
 export(run_filter)
-export(select_mz)
-export(select_rt)
 export(semi.sup)
 export(solve_a)
 export(solve_sigma)
@@ -102,36 +103,30 @@ export(two.step.hybrid)
 export(unsupervised)
 export(validate_contents)
 export(validate_model_method_input)
-import("for")
-import("metadata,")
-import("next")
+import("min,")
 import(Create)
-import(It)
 import(MASS)
-import(RTs.)
-import(aligned)
-import(alignment)
-import(an)
+import(RT)
+import(a)
+import(and)
 import(arrow)
-import(contain)
 import(doParallel)
 import(dplyr)
-import(empty)
 import(foreach)
-import(intensities)
+import(max)
+import(mean)
+import(metadata)
+import(mz)
 import(mzR)
 import(parallel)
+import(row)
 import(snow)
 import(splines)
-import(step.)
 import(stringr)
-import(tables)
-import(the)
-import(three)
 import(tibble)
 import(tidyr)
 import(tools)
-import(will)
+import(values.)
 import(with)
 importFrom(dplyr,arrange)
 importFrom(dplyr,between)

From 39189e3a1a2545f59b8c762eb0f5309a309d915f Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 10:27:11 +0200
Subject: [PATCH 14/22] documentation updated

---
 man/clean_data_matrix.Rd            | 19 ++++++++++++++++++
 man/comb.Rd                         | 14 +++++++++++++
 man/compute_clusters_simple.Rd      |  2 +-
 man/create_aligned_feature_table.Rd |  2 +-
 man/create_features_from_cluster.Rd | 31 +++++++++++++++++++++++++++++
 man/create_intensity_row.Rd         | 17 ++++++++++++++++
 man/create_output.Rd                | 19 ++++++++++++++++++
 man/create_rt_row.Rd                | 17 ++++++++++++++++
 man/filter_based_on_density.Rd      | 23 +++++++++++++++++++++
 man/find_optima.Rd                  | 19 ++++++++++++++++++
 man/remove_noise.Rd                 |  5 ++++-
 man/validate_contents.Rd            | 19 ++++++++++++++++++
 12 files changed, 184 insertions(+), 3 deletions(-)
 create mode 100644 man/clean_data_matrix.Rd
 create mode 100644 man/comb.Rd
 create mode 100644 man/create_features_from_cluster.Rd
 create mode 100644 man/create_intensity_row.Rd
 create mode 100644 man/create_output.Rd
 create mode 100644 man/create_rt_row.Rd
 create mode 100644 man/filter_based_on_density.Rd
 create mode 100644 man/find_optima.Rd
 create mode 100644 man/validate_contents.Rd

diff --git a/man/clean_data_matrix.Rd b/man/clean_data_matrix.Rd
new file mode 100644
index 0000000..e535404
--- /dev/null
+++ b/man/clean_data_matrix.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{clean_data_matrix}
+\alias{clean_data_matrix}
+\title{Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble}
+\usage{
+clean_data_matrix(x, sample_names)
+}
+\arguments{
+\item{x}{A dataframe}
+
+\item{sample_names}{List of sample names.}
+}
+\value{
+Cleaned tibble.
+}
+\description{
+Replace NA values by zero, relocate 'sample_names' column to the very beginning and convert to a tibble
+}
diff --git a/man/comb.Rd b/man/comb.Rd
new file mode 100644
index 0000000..2087909
--- /dev/null
+++ b/man/comb.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{comb}
+\alias{comb}
+\title{Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble.}
+\usage{
+comb(x, ...)
+}
+\value{
+Tibbles combining the output (metadata, intensity and RT respectively) from different clusters.
+}
+\description{
+Combines the output (i.e. metadata, intensity and RT) from different clusters to one respective tibble.
+}
diff --git a/man/compute_clusters_simple.Rd b/man/compute_clusters_simple.Rd
index 09ab9f3..3a4855b 100644
--- a/man/compute_clusters_simple.Rd
+++ b/man/compute_clusters_simple.Rd
@@ -7,7 +7,7 @@
 compute_clusters_simple(feature_tables, sample_names, mz_tol_ppm, rt_tol)
 }
 \arguments{
-\item{feature_tables}{list of tibbles List of feature tables coming from all samples.}
+\item{feature_tables}{list of tibbles feature tables coming from all samples.}
 
 \item{sample_names}{list of strings Sample names of the feature tables used to distinguish the samples.}
 
diff --git a/man/create_aligned_feature_table.Rd b/man/create_aligned_feature_table.Rd
index 034df37..66a676d 100644
--- a/man/create_aligned_feature_table.Rd
+++ b/man/create_aligned_feature_table.Rd
@@ -30,7 +30,7 @@ percentage of the m/z value. This value, multiplied by the m/z value, becomes th
 \item{cluster}{The number of CPU cores to be used}
 }
 \value{
-A tibble with three tables containing aligned metadata, intensities an RTs.
+A list of 3 tibbles containing aligned metadata, intensities an RTs.
 }
 \description{
 Align peaks from spectra into a feature table.
diff --git a/man/create_features_from_cluster.Rd b/man/create_features_from_cluster.Rd
new file mode 100644
index 0000000..84bebba
--- /dev/null
+++ b/man/create_features_from_cluster.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{create_features_from_cluster}
+\alias{create_features_from_cluster}
+\title{Group the mz and RT for particular cluster.}
+\usage{
+create_features_from_cluster(
+  features,
+  mz_tol_relative,
+  rt_tol_relative,
+  min_occurrence,
+  sample_names
+)
+}
+\arguments{
+\item{features}{The features table subsetted for a particular cluster.}
+
+\item{mz_tol_relative}{The m/z tolerance level for peak alignment.}
+
+\item{rt_tol_relative}{The retention time tolerance level for peak alignment.}
+
+\item{min_occurrence}{A minimal number of profiles a feature has to be present in.}
+
+\item{sample_names}{A list of sample names.}
+}
+\value{
+A list containing 3 tibbles: metadata, intensities and RTs.
+}
+\description{
+Group the mz and RT for particular cluster.
+}
diff --git a/man/create_intensity_row.Rd b/man/create_intensity_row.Rd
new file mode 100644
index 0000000..0976c45
--- /dev/null
+++ b/man/create_intensity_row.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{create_intensity_row}
+\alias{create_intensity_row}
+\title{Compute summed area for each sample}
+\usage{
+create_intensity_row(sample_grouped)
+}
+\arguments{
+\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.}
+}
+\value{
+Summed area for each sample.
+}
+\description{
+Compute summed area for each sample
+}
diff --git a/man/create_output.Rd b/man/create_output.Rd
new file mode 100644
index 0000000..9c1223e
--- /dev/null
+++ b/man/create_output.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{create_output}
+\alias{create_output}
+\title{Create a list containing 3 tibbles: metadata, intensities and RTs.}
+\usage{
+create_output(sample_grouped, sample_names)
+}
+\arguments{
+\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.}
+
+\item{sample_names}{A list of sample names.}
+}
+\value{
+A list containing 3 tibbles: metadata, intensities and RTs.
+}
+\description{
+Create a list containing 3 tibbles: metadata, intensities and RTs.
+}
diff --git a/man/create_rt_row.Rd b/man/create_rt_row.Rd
new file mode 100644
index 0000000..0059fb9
--- /dev/null
+++ b/man/create_rt_row.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{create_rt_row}
+\alias{create_rt_row}
+\title{Compute median RT for each sample}
+\usage{
+create_rt_row(sample_grouped)
+}
+\arguments{
+\item{sample_grouped}{A dataframe with grouped mz and RT values for a particular cluster.}
+}
+\value{
+Median RT for each sample.
+}
+\description{
+Compute median RT for each sample
+}
diff --git a/man/filter_based_on_density.Rd b/man/filter_based_on_density.Rd
new file mode 100644
index 0000000..fe85d5c
--- /dev/null
+++ b/man/filter_based_on_density.Rd
@@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{filter_based_on_density}
+\alias{filter_based_on_density}
+\title{Subset data within lower and upper bound from density estimation}
+\usage{
+filter_based_on_density(sample, turns, index, i)
+}
+\arguments{
+\item{sample}{A subset of the features_table.}
+
+\item{turns}{A list of peaks and valleys positions.}
+
+\item{index}{Whether it subsets on m/z [1] or RT [2] column.}
+
+\item{i}{Iterates over the peaks in the turns list.}
+}
+\value{
+Dataframe subsetted within lower and upper bound from density estimation.
+}
+\description{
+Subset data within lower and upper bound from density estimation
+}
diff --git a/man/find_optima.Rd b/man/find_optima.Rd
new file mode 100644
index 0000000..81e681b
--- /dev/null
+++ b/man/find_optima.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{find_optima}
+\alias{find_optima}
+\title{Compute the kernel density estimation and find the peaks and valleys of a smooth curve.}
+\usage{
+find_optima(data, bandwidth)
+}
+\arguments{
+\item{data}{A vector of m/z or RTs for a particular cluster.}
+
+\item{bandwidth}{A bandwidth value for the KDE computation.}
+}
+\value{
+A list of peaks and valleys positions.
+}
+\description{
+Compute the kernel density estimation and find the peaks and valleys of a smooth curve.
+}
diff --git a/man/remove_noise.Rd b/man/remove_noise.Rd
index debe393..e310fcd 100644
--- a/man/remove_noise.Rd
+++ b/man/remove_noise.Rd
@@ -13,7 +13,8 @@ remove_noise(
   baseline_correct_noise_percentile,
   intensity_weighted,
   do.plot,
-  cache
+  cache,
+  grouping_threshold = Inf
 )
 }
 \arguments{
@@ -40,6 +41,8 @@ run filter, to be used as the baseline threshold of signal strength.}
 \item{do.plot}{Indicates whether plot should be drawn.}
 
 \item{cache}{Whether to use cache}
+
+\item{grouping_threshold}{The maximum difference between two scans to be considered the same EIC. Default is Inf.}
 }
 \value{
 A matrix with four columns: m/z value, retention time, intensity, and group number.
diff --git a/man/validate_contents.Rd b/man/validate_contents.Rd
new file mode 100644
index 0000000..e8f9889
--- /dev/null
+++ b/man/validate_contents.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/feature.align.R
+\name{validate_contents}
+\alias{validate_contents}
+\title{Validates if the data is present in more than "min_occurence" of samples.}
+\usage{
+validate_contents(samples, min_occurrence)
+}
+\arguments{
+\item{samples}{A subset of the features_table.}
+
+\item{min_occurrence}{A minimal number of profiles a feature has to be present in.}
+}
+\value{
+boolean value whether it is TRUE or FALSE.
+}
+\description{
+Validates if the data is present in more than "min_occurence" of samples.
+}

From b4ee7c2c96935d9b7524eebee84c275030c3c399 Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 10:31:27 +0200
Subject: [PATCH 15/22] plyr added

---
 conda/environment-dev.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/conda/environment-dev.yaml b/conda/environment-dev.yaml
index c903498..1e98906 100644
--- a/conda/environment-dev.yaml
+++ b/conda/environment-dev.yaml
@@ -29,3 +29,4 @@ dependencies:
   - r-httpgd
   - r-microbenchmark
   - r-covr
+  - r-plyr

From e9b05103dc2f3bfb02f0962ae09d643b5ec106a3 Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 10:31:59 +0200
Subject: [PATCH 16/22] export added on functions

---
 R/feature.align.R | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/R/feature.align.R b/R/feature.align.R
index ce5a7a8..c2ebc49 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -3,6 +3,7 @@
 #' Create a metadata row tibble with min, max and mean mz and RT values.
 #' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
 #' @param sample_names A list of sample names.
+#' @export
 create_metadata <- function(sample_grouped, sample_names) {
   sample_presence <- sapply(sample_names,
     FUN=function(x) {
@@ -28,6 +29,7 @@ create_metadata <- function(sample_grouped, sample_names) {
 #' Compute summed area for each sample
 #' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
 #' @return Summed area for each sample.
+#' @export
 create_intensity_row <- function(sample_grouped) {
   sample_grouped %>%
    group_by(sample_id) %>%
@@ -38,7 +40,7 @@ create_intensity_row <- function(sample_grouped) {
 #' Compute median RT for each sample
 #' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
 #' @return Median RT for each sample.
-
+#' @export
 create_rt_row <- function(sample_grouped) {
   sample_grouped %>%
    group_by(sample_id) %>%
@@ -160,6 +162,7 @@ comb <- function(x, ...) {
 #' @param x A dataframe
 #' @param sample_names List of sample names.
 #' @return Cleaned tibble.
+#' @export
 clean_data_matrix <- function(x, sample_names) {
   x %>% replace(is.na(.), 0) %>% dplyr::relocate(sample_names) %>% as_tibble
 }

From 31ca6f0b0446842e83240de1d96281f9c862ecb8 Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 10:36:19 +0200
Subject: [PATCH 17/22] styler linted

---
 R/feature.align.R | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/R/feature.align.R b/R/feature.align.R
index c2ebc49..c3b75e9 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -6,7 +6,7 @@
 #' @export
 create_metadata <- function(sample_grouped, sample_names) {
   sample_presence <- sapply(sample_names,
-    FUN=function(x) {
+    FUN = function(x) {
       as.numeric(any(sample_grouped$sample_id == x))
     }
   )
@@ -32,9 +32,9 @@ create_metadata <- function(sample_grouped, sample_names) {
 #' @export
 create_intensity_row <- function(sample_grouped) {
   sample_grouped %>%
-   group_by(sample_id) %>%
-   summarise(intensity = sum(area)) %>%
-   pivot_wider(names_from = "sample_id", values_from = "intensity")
+    group_by(sample_id) %>%
+    summarise(intensity = sum(area)) %>%
+    pivot_wider(names_from = "sample_id", values_from = "intensity")
 }
 
 #' Compute median RT for each sample
@@ -43,9 +43,9 @@ create_intensity_row <- function(sample_grouped) {
 #' @export
 create_rt_row <- function(sample_grouped) {
   sample_grouped %>%
-   group_by(sample_id) %>%
-   summarise(rt = median(rt)) %>%
-   pivot_wider(names_from = "sample_id", values_from = "rt")
+    group_by(sample_id) %>%
+    summarise(rt = median(rt)) %>%
+    pivot_wider(names_from = "sample_id", values_from = "rt")
 }
 
 #' Create a list containing 3 tibbles: metadata, intensities and RTs.
@@ -57,7 +57,7 @@ create_output <- function(sample_grouped, sample_names) {
   metadata_row <- create_metadata(sample_grouped, sample_names)
   intensity_row <- create_intensity_row(sample_grouped)
   rt_row <- create_rt_row(sample_grouped)
-  
+
   return(list(
     metadata_row = metadata_row,
     intensity_row = intensity_row,
@@ -114,10 +114,10 @@ filter_based_on_density <- function(sample, turns, index, i) {
 #' @return A list containing 3 tibbles: metadata, intensities and RTs.
 #' @export
 create_features_from_cluster <- function(features,
-                        mz_tol_relative,
-                        rt_tol_relative,
-                        min_occurrence,
-                        sample_names) {
+                                         mz_tol_relative,
+                                         rt_tol_relative,
+                                         min_occurrence,
+                                         sample_names) {
   if (!validate_contents(features, min_occurrence)) {
     return(NULL)
   }
@@ -132,8 +132,7 @@ create_features_from_cluster <- function(features,
   for (i in seq_along(turns_mz$peaks)) {
     sample_grouped_mz <- filter_based_on_density(features, turns_mz, 1, i)
     if (validate_contents(sample_grouped_mz, min_occurrence)) {
-
-      #split according to rt values
+      # split according to rt values
       turns_rt <- find_optima(sample_grouped_mz$rt, bandwidth = rt_tol_relative / 1.414)
       for (ii in seq_along(turns_rt$peaks)) {
         sample_grouped_rt <- filter_based_on_density(sample_grouped_mz, turns_rt, 2, ii)
@@ -147,7 +146,7 @@ create_features_from_cluster <- function(features,
       }
     }
   }
- 
+
   return(list(metadata_row = metadata, intensity_row = intensity, rt_row = rt))
 }
 
@@ -164,7 +163,10 @@ comb <- function(x, ...) {
 #' @return Cleaned tibble.
 #' @export
 clean_data_matrix <- function(x, sample_names) {
-  x %>% replace(is.na(.), 0) %>% dplyr::relocate(sample_names) %>% as_tibble
+  x %>%
+    replace(is.na(.), 0) %>%
+    dplyr::relocate(sample_names) %>%
+    as_tibble()
 }
 
 #' Align peaks from spectra into a feature table.

From 95e0bde9be6bd11cc512f83efdafa65710fb687f Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 10:39:31 +0200
Subject: [PATCH 18/22] test files renamed

---
 tests/testdata/aligned/output_create-features.rds | Bin 0 -> 392 bytes
 tests/testdata/aligned/output_select-mz.rds       | Bin 354 -> 0 bytes
 ...quet => feature-align_create-features.parquet} | Bin
 3 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/testdata/aligned/output_create-features.rds
 delete mode 100644 tests/testdata/aligned/output_select-mz.rds
 rename tests/testdata/input/{feature-align_select-mz.parquet => feature-align_create-features.parquet} (100%)

diff --git a/tests/testdata/aligned/output_create-features.rds b/tests/testdata/aligned/output_create-features.rds
new file mode 100644
index 0000000000000000000000000000000000000000..d1840145829a50a2289dc2809bc29f62c414a11f
GIT binary patch
literal 392
zcmV;30eAi%iwFP!000001B>8dU|?WoU}0uvU}gm}8CXL@+;lA%7?^~?5)8~B8l;F1
zh?yLso6K2#L|uXOtvg{s+mOV~%!IF;0*Uj=30WhnFKDX~1JbwMC+uv8i>L2f$X~Vh
zE{sNqukY@<`>zW|Bg8MBTq9D=1*3stydZIi8^IW?&i(@f7{F-K#aTdRGBU6*aDoCO
zIVZ8W7|0NSgc4Ygtt2TYJ|zvxV+Qh|Os<r~l0?0<qQu-(G;=tM^2_z|fa1k4hc^I;
z|NsC0L$?s7NB|t_5J9HgDwuR`Rc>Y;5+|_&rlzO_CR|j4qz1@=t6|G4NKMQxhVlf0
zoFn25%;JkP@{3AR^HNh##LaPtTOf%uF#Ln24|w=H2C_fwp8AasM#IGe*xM$nq%DTg
zaB;87-wwx)a>8gxa4`NSJII;-V-I0wbR%iveTVd2z29pOFvDolll2huK321Na#KqZ
mK`x62CVQwDZ)P4)TXAMdC6XXpQ3;p_GztK<$^>qR1ONcc!mm#N

literal 0
HcmV?d00001

diff --git a/tests/testdata/aligned/output_select-mz.rds b/tests/testdata/aligned/output_select-mz.rds
deleted file mode 100644
index 72675305ea18a1197b783bf0395c7e2ac029f6aa..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 354
zcmV-o0iFIIiwFP!000001B>8dU|?WoU}0uvU}gm}8CXL@+;lA%7?^~?5)8~B8l;F1
zh#4KCo6K2#L|u{iW@f@yP9e(|wAF~g`RV%>@>lJ>i{P*C?z;Q03&Fp5a*aqe$b4Q9
zA7Up&&i(@f3=rV6FoE^5FmQrAkeriPTnuCgKzs%kWGhL^iBCy`@|b}<D3dECu_RG1
zttc@!70n#ZqWp5bJfL_n)NxD=K;l0Tpj!x2BmfR6h#*sL6-+v}DmOC^iIZ3XQ&Us|
z6D}%2QUm0`)v)Ciq$Xw;LwN#0&JpnjX7R-t`9&qEd8sKV;^sKSEs(?+82&-y6dulw
zf$R^vr+(vu^8?u1Caa_^hV#8De>)sI$_eo{<A1V)mgztCkYq+Tk|y4CNZ-}_z4ibS
zA6HTwV&24RHcxJ9Ng~K)@xWvZ730m!18OVIEU82iWGgBG^MFPH0LghiJq!c@0RO?2
APyhe`

diff --git a/tests/testdata/input/feature-align_select-mz.parquet b/tests/testdata/input/feature-align_create-features.parquet
similarity index 100%
rename from tests/testdata/input/feature-align_select-mz.parquet
rename to tests/testdata/input/feature-align_create-features.parquet

From ee1d2dc32ea977dbbacc8b33aa2b436acc35f615 Mon Sep 17 00:00:00 2001
From: KristinaGomoryova <gomoryova@sci.muni.cz>
Date: Tue, 23 Jul 2024 11:04:34 +0200
Subject: [PATCH 19/22] import foreach deleted

---
 DESCRIPTION       |  2 +-
 NAMESPACE         | 12 ------------
 R/feature.align.R |  2 --
 3 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 5acef3f..3b1effc 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -9,7 +9,7 @@ Description: This is a customized fork of the original work from Tianwei Yu.
         It takes the adaptive processing of LC/MS metabolomics data further
         with focus on high resolution MS for both LC and GC applications.
 Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach,
-        snow, dplyr, tidyr, stringr, tibble, tools, arrow, plyr
+        snow, dplyr, tidyr, stringr, tibble, tools, arrow
 biocViews: Technology, MassSpectrometry
 License: GPL-2
 LazyLoad: yes
diff --git a/NAMESPACE b/NAMESPACE
index e408106..d59d113 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -103,31 +103,19 @@ export(two.step.hybrid)
 export(unsupervised)
 export(validate_contents)
 export(validate_model_method_input)
-import("min,")
-import(Create)
 import(MASS)
-import(RT)
-import(a)
-import(and)
 import(arrow)
 import(doParallel)
 import(dplyr)
 import(foreach)
-import(max)
-import(mean)
-import(metadata)
-import(mz)
 import(mzR)
 import(parallel)
-import(row)
 import(snow)
 import(splines)
 import(stringr)
 import(tibble)
 import(tidyr)
 import(tools)
-import(values.)
-import(with)
 importFrom(dplyr,arrange)
 importFrom(dplyr,between)
 importFrom(dplyr,bind_rows)
diff --git a/R/feature.align.R b/R/feature.align.R
index c3b75e9..f9e83a6 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -1,5 +1,3 @@
-#' @import foreach
-
 #' Create a metadata row tibble with min, max and mean mz and RT values.
 #' @param sample_grouped A dataframe with grouped mz and RT values for a particular cluster.
 #' @param sample_names A list of sample names.

From 9fa922578a630e89b26038c8f567c29d956d642c Mon Sep 17 00:00:00 2001
From: hechth <helge.hecht@recetox.muni.cz>
Date: Fri, 26 Jul 2024 15:08:42 +0000
Subject: [PATCH 20/22] fixed tests

---
 DESCRIPTION                         |  2 +-
 R/feature.align.R                   | 31 +++++++++++++++++++----------
 R/utils.R                           |  7 ++++---
 conda/environment.yaml              |  1 +
 tests/testthat/test-feature-align.R |  6 ++++++
 tests/testthat/test-hybrid.R        |  1 +
 6 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 3b1effc..5acef3f 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -9,7 +9,7 @@ Description: This is a customized fork of the original work from Tianwei Yu.
         It takes the adaptive processing of LC/MS metabolomics data further
         with focus on high resolution MS for both LC and GC applications.
 Depends: R (>= 3.50), MASS, mzR, splines, doParallel, foreach,
-        snow, dplyr, tidyr, stringr, tibble, tools, arrow
+        snow, dplyr, tidyr, stringr, tibble, tools, arrow, plyr
 biocViews: Technology, MassSpectrometry
 License: GPL-2
 LazyLoad: yes
diff --git a/R/feature.align.R b/R/feature.align.R
index f9e83a6..5f541a0 100644
--- a/R/feature.align.R
+++ b/R/feature.align.R
@@ -30,9 +30,9 @@ create_metadata <- function(sample_grouped, sample_names) {
 #' @export
 create_intensity_row <- function(sample_grouped) {
   sample_grouped %>%
-    group_by(sample_id) %>%
-    summarise(intensity = sum(area)) %>%
-    pivot_wider(names_from = "sample_id", values_from = "intensity")
+    dplyr::group_by(sample_id) %>%
+    dplyr::summarise(intensity = sum(area)) %>%
+    tidyr::pivot_wider(names_from = "sample_id", values_from = "intensity")
 }
 
 #' Compute median RT for each sample
@@ -41,9 +41,9 @@ create_intensity_row <- function(sample_grouped) {
 #' @export
 create_rt_row <- function(sample_grouped) {
   sample_grouped %>%
-    group_by(sample_id) %>%
-    summarise(rt = median(rt)) %>%
-    pivot_wider(names_from = "sample_id", values_from = "rt")
+    dplyr::group_by(sample_id) %>%
+    dplyr::summarise(rt = median(rt)) %>%
+    tidyr::pivot_wider(names_from = "sample_id", values_from = "rt")
 }
 
 #' Create a list containing 3 tibbles: metadata, intensities and RTs.
@@ -161,10 +161,21 @@ comb <- function(x, ...) {
 #' @return Cleaned tibble.
 #' @export
 clean_data_matrix <- function(x, sample_names) {
-  x %>%
+  x <- x %>%
     replace(is.na(.), 0) %>%
-    dplyr::relocate(sample_names) %>%
-    as_tibble()
+    dplyr::relocate(sample_names) |>
+    add_feature_ids()
+  return(x)
+}
+
+#' Add `id` column to a dataframe
+#' @param x A dataframe
+#' @return The same dataframe but with an additional `id` column 
+#' in first place which contains the rownames.
+#' @export
+add_feature_ids <- function(x) {
+  x$id <- as.numeric(rownames(x))
+  return(tibble::as_tibble(x |> dplyr::relocate(id)))
 }
 
 #' Align peaks from spectra into a feature table.
@@ -217,7 +228,7 @@ create_aligned_feature_table <- function(features_table,
 
   aligned_features$intensity <- clean_data_matrix(aligned_features$intensity, sample_names)
   aligned_features$rt <- clean_data_matrix(aligned_features$rt, sample_names)
-  aligned_features$metadata <- as_tibble(aligned_features$metadata)
+  aligned_features$metadata <- add_feature_ids(aligned_features$metadata)
 
   return(aligned_features)
 }
diff --git a/R/utils.R b/R/utils.R
index 44bdd3d..738ad0e 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -8,6 +8,7 @@ register_functions_to_cluster <- function(cluster) {
         'prof.to.features',
         'load.lcms',
         'adaptive.bin',
+        'add_feature_ids',
         'find.turn.point',
         'msExtrema',
         'find_local_maxima',
@@ -113,9 +114,9 @@ load_aligned_features <- function(metadata_file, intensities_file, rt_file, tol_
     tolerances <- arrow::read_parquet(tol_file)
     
     result <- list()
-    result$metadata <- as_tibble(metadata) |> select(-id)
-    result$intensity <- as_tibble(intensities) |> select(-id)
-    result$rt <- as_tibble(rt) |> select(-id)
+    result$metadata <- as_tibble(metadata)
+    result$intensity <- as_tibble(intensities)
+    result$rt <- as_tibble(rt)
     result$mz_tol_relative <- tolerances$mz_tolerance
     result$rt_tol_relative <- tolerances$rt_tolerance
     return(result)
diff --git a/conda/environment.yaml b/conda/environment.yaml
index 3ab1060..1b2776b 100644
--- a/conda/environment.yaml
+++ b/conda/environment.yaml
@@ -18,4 +18,5 @@ dependencies:
   - r-tidyr
   - r-stringr
   - r-tibble
+  - r-plyr
 
diff --git a/tests/testthat/test-feature-align.R b/tests/testthat/test-feature-align.R
index 7adf16b..a267541 100644
--- a/tests/testthat/test-feature-align.R
+++ b/tests/testthat/test-feature-align.R
@@ -1,3 +1,9 @@
+update_expected <- function(actual) {
+  arrow::write_parquet(actual$metadata, file.path("..", "testdata", "aligned", "metadata_table.parquet"))
+  arrow::write_parquet(actual$intensity, file.path("..", "testdata", "aligned", "intensity_table.parquet"))
+  arrow::write_parquet(actual$rt, file.path("..", "testdata", "aligned", "rt_table.parquet"))
+}
+
 patrick::with_parameters_test_that(
   "feature.align test",
   {
diff --git a/tests/testthat/test-hybrid.R b/tests/testthat/test-hybrid.R
index ed28fd3..8a064ba 100644
--- a/tests/testthat/test-hybrid.R
+++ b/tests/testthat/test-hybrid.R
@@ -25,6 +25,7 @@ patrick::with_parameters_test_that("basic hybrid test", {
   actual <- as_tibble(result$recovered_feature_sample_table)
   keys <- c("mz", "rt", "sample", "sample_rt", "sample_intensity")
 
+  # arrow::write_parquet(actual, file.path(testdata, "hybrid", paste0(.test_name, "_recovered_feature_sample_table.parquet")))
   expected <- arrow::read_parquet(
     file.path(testdata, "hybrid", paste0(.test_name, "_recovered_feature_sample_table.parquet"))
   )

From 1dfb73f096fc3780047acc3cce755de762856b87 Mon Sep 17 00:00:00 2001
From: hechth <helge.hecht@recetox.muni.cz>
Date: Mon, 29 Jul 2024 10:17:41 +0200
Subject: [PATCH 21/22] Started adding documentation

---
 NAMESPACE        |  5 +++++
 R/adjust.time.R  | 33 +++++++++++++++++++++++++++------
 R/unsupervised.R | 14 ++++++++++++++
 R/utils.R        |  5 ++++-
 4 files changed, 50 insertions(+), 7 deletions(-)

diff --git a/NAMESPACE b/NAMESPACE
index d59d113..3657c2c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,11 +1,14 @@
 # Generated by roxygen2: do not edit by hand
 
 export(adaptive.bin)
+export(add_feature_ids)
 export(adjust.time)
 export(aggregate_by_rt)
+export(as_feature_sample_table)
 export(bigauss.esti)
 export(bigauss.esti.EM)
 export(bigauss.mix)
+export(check_files)
 export(clean_data_matrix)
 export(comb)
 export(compute_boundaries)
@@ -41,6 +44,7 @@ export(compute_template)
 export(compute_template_adjusted_rt)
 export(compute_uniq_grp)
 export(correct_time)
+export(correct_time_v2)
 export(count_peaks)
 export(create_aligned_feature_table)
 export(create_features_from_cluster)
@@ -63,6 +67,7 @@ export(get_features_in_rt_range)
 export(get_mzrange_bound_indices)
 export(get_num_workers)
 export(get_rt_region_indices)
+export(get_sample_name)
 export(get_single_occurrence_mask)
 export(get_times_to_use)
 export(hybrid)
diff --git a/R/adjust.time.R b/R/adjust.time.R
index fe896d1..23173c5 100644
--- a/R/adjust.time.R
+++ b/R/adjust.time.R
@@ -2,6 +2,10 @@
 NULL
 #> NULL
 
+#' Combine template and sample features
+#' @param template_features Tibble Template feature table (mz, rt, cluster, sample_id).
+#' @param features Tibble Sample feature table (mz, rt, cluster, sample_id).
+#' @return Tibble Combined feature table (rbind).
 #' @export
 compute_comb <- function(template_features, features) {
   combined <- dplyr::bind_rows(
@@ -11,6 +15,12 @@ compute_comb <- function(template_features, features) {
   return(combined)
 }
 
+#' Select features to use for retention time alignment
+#' @description This function selects features present in both the sample
+#' feature table and template feature table, provided they have the same
+#' cluster and are adjacent in the combined table.
+#' @param combined Tibble Table with (mz, rt, cluster, sample_id).
+#' @return List of bool Returns list of bools with TRUE at each index where this condition is met.
 #' @export
 compute_sel <- function(combined) {
   l <- nrow(combined)
@@ -19,6 +29,11 @@ compute_sel <- function(combined) {
   return(sel)
 }
 
+#' Create two column table with paired sample and template retention times.
+#' @param combined Tibble Table with features from sample and template.
+#' @param sel list of bools List of bools indicating which features to pair.
+#' See 'compute_sel'.
+#' @param j string Template sample_id.
 #' @export
 compute_template_adjusted_rt <- function(combined, sel, j) {
   all_features <- cbind(combined$rt[sel], combined$rt[sel + 1])
@@ -59,20 +74,25 @@ compute_corrected_features_v2 <- function(features, template_rt, delta_rt) {
   return(features |> dplyr::arrange_at(c("mz", "rt")))
 }
 
+#' Correct the rt in feature table based on paired feature rts and differences.
+#' @param features Tibble The feature table for which to correct rts.
+#' @param template_rt List of floats Template retention times for the paired features.
+#' @param delta_rt List of floats Differences between the paired rts.
+#' @return Tibble A table with corrected retention times.
 #' @export
-compute_corrected_features <- function(features, delta_rt, avg_time) {
+compute_corrected_features <- function(features, template_rt, delta_rt) {
   features <- features |> dplyr::arrange_at(c("rt", "mz"))
 
   corrected <- features$rt
   original <- features$rt
 
-  idx <- dplyr::between(original, min(delta_rt), max(delta_rt))
+  idx <- dplyr::between(original, min(template_rt), max(template_rt))
   to_correct <- original[idx]
   this.smooth <- ksmooth(
+    template_rt,
     delta_rt,
-    avg_time,
     kernel = "normal",
-    bandwidth = (max(delta_rt) - min(delta_rt)) / 5,
+    bandwidth = (max(template_rt) - min(template_rt)) / 5,
     x.points = to_correct
   )
 
@@ -80,8 +100,8 @@ compute_corrected_features <- function(features, delta_rt, avg_time) {
   lower_bound_adjustment <- mean(this.smooth$y[this.smooth$x == min(this.smooth$x)])
   upper_bound_adjustment <- mean(this.smooth$y[this.smooth$x == max(this.smooth$x)])
 
-  idx_lower <- original < min(delta_rt)
-  idx_upper <- original > max(delta_rt)
+  idx_lower <- original < min(template_rt)
+  idx_upper <- original > max(template_rt)
 
   corrected[idx_lower] <- corrected[idx_lower] + lower_bound_adjustment
   corrected[idx_upper] <- corrected[idx_upper] + upper_bound_adjustment
@@ -149,6 +169,7 @@ compute_template <- function(extracted_features) {
   return(tibble::as_tibble(template_features))
 }
 
+#' @export
 correct_time_v2 <- function(features, template) {
   if (unique(features$sample_id) == unique(template$sample_id))
     return(tibble::as_tibble(features))
diff --git a/R/unsupervised.R b/R/unsupervised.R
index 3b7070b..4093b9c 100644
--- a/R/unsupervised.R
+++ b/R/unsupervised.R
@@ -2,6 +2,13 @@
 NULL
 #> NULL
 
+#' Read the metadata table, retention time data matrix and intensity data matrix
+#' and combine them into a single table
+#' @param metadata Tibble Feature metadata table with information concerning the peaks.
+#' @param rt_crosstab Tibble Data matrix with features on rows and samples on columns holding rt data.
+#' @param int_crosstab Tibble Data matrix with features on rows and samples on columns holding intensity data.
+#' @return Tibble A merged table containing all information.
+#' @export 
 as_feature_sample_table <- function(metadata, rt_crosstab, int_crosstab) {
   feature_names <- as.character(rt_crosstab$id)
   sample_names <- colnames(metadata)[-c(1:8)]
@@ -27,6 +34,9 @@ as_feature_sample_table <- function(metadata, rt_crosstab, int_crosstab) {
   return(data)
 }
 
+#' Check whether the given files exist.
+#' @param filenames list of filenames Filenames to check whether they exist.
+#' @export
 check_files <- function(filenames) {
   missing <- !file.exists(filenames)
   missing_filenames <- paste0('\t', filenames[missing], collapse = '\n')
@@ -36,6 +46,10 @@ check_files <- function(filenames) {
   }
 }
 
+#' Get the sample name as basename of the file.
+#' @param filename string Name of the file.
+#' @return string Sample name.
+#' @export
 get_sample_name <- function(filename) {
   tools::file_path_sans_ext(basename(filename))
 }
diff --git a/R/utils.R b/R/utils.R
index 738ad0e..db06a3b 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -85,7 +85,10 @@ register_functions_to_cluster <- function(cluster) {
         'get_mzrange_bound_indices',
         'compute_mass_density',
         'l2normalize',
-        'compute_peaks_and_valleys'
+        'compute_peaks_and_valleys',
+        'as_feature_sample_table',
+        'check_files',
+        'get_sample_name'
     ))
     snow::clusterEvalQ(cluster, library("dplyr"))
     snow::clusterEvalQ(cluster, library("stringr"))

From ef2f003bfa10070ab438465cd0b0c6cdce931fa8 Mon Sep 17 00:00:00 2001
From: hechth <helge.hecht@recetox.muni.cz>
Date: Mon, 29 Jul 2024 10:29:08 +0200
Subject: [PATCH 22/22] Finalized documentation for adjust time

---
 NAMESPACE       |  1 +
 R/adjust.time.R | 26 ++++++++++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/NAMESPACE b/NAMESPACE
index 3657c2c..c5c8173 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -20,6 +20,7 @@ export(compute_clusters)
 export(compute_clusters_simple)
 export(compute_comb)
 export(compute_corrected_features)
+export(compute_corrected_features_v2)
 export(compute_curr_rec_with_enough_peaks)
 export(compute_delta_rt)
 export(compute_densities)
diff --git a/R/adjust.time.R b/R/adjust.time.R
index 23173c5..1d0f5a8 100644
--- a/R/adjust.time.R
+++ b/R/adjust.time.R
@@ -49,6 +49,13 @@ compute_template_adjusted_rt <- function(combined, sel, j) {
   return(all_features)
 }
 
+#' Correct the rt in feature table based on paired feature rts and differences.
+#' @description This is a newer implementation based on dplyr which might be more efficient than the other function.
+#' @param features Tibble The feature table for which to correct rts.
+#' @param template_rt List of floats Template retention times for the paired features.
+#' @param delta_rt List of floats Differences between the paired rts.
+#' @return Tibble A table with corrected retention times.
+#' @export
 compute_corrected_features_v2 <- function(features, template_rt, delta_rt) {
   features <- features |> dplyr::arrange_at(c("rt", "mz"))
   idx <- dplyr::between(features$rt, min(template_rt), max(template_rt))
@@ -111,6 +118,10 @@ compute_corrected_features <- function(features, template_rt, delta_rt) {
   return(features)
 }
 
+#' Fill missing values based on original retention times.
+#' @param orig.feature Non-corrected feature table.
+#' @param this.feature Feature table that may contain missing rt values.
+#' @return Tibble Feature table with filled values.
 #' @export
 fill_missing_values <- function(orig.feature, this.feature) {
   missing_values <- which(is.na(this.feature$rt))
@@ -124,6 +135,10 @@ fill_missing_values <- function(orig.feature, this.feature) {
   return(this.feature)
 }
 
+#' Function to perform retention time correction
+#' @param this.feature Tibble Feature table for which to correct rt.
+#' @param template_features Tibble Template feature table to use for correction.
+#' @return Tibble this.feature table with corrected rt values.
 #' @export
 correct_time <- function(this.feature, template_features) {
     orig.features <- this.feature
@@ -157,6 +172,10 @@ correct_time <- function(this.feature, template_features) {
   return(tibble::as_tibble(this.feature, column_name = c("mz", "rt", "sd1", "sd2", "area", "sample_id", "cluster")))
 }
 
+#' Select the template feature table.
+#' @description The current implementation selects the table with the most features as the template.
+#' @param extracted_features List of tables Tables from which to select the template.
+#' @return Tibble Template feature table.
 #' @export
 compute_template <- function(extracted_features) {
   num.ftrs <- sapply(extracted_features, nrow)
@@ -169,6 +188,13 @@ compute_template <- function(extracted_features) {
   return(tibble::as_tibble(template_features))
 }
 
+#' Rewritten version of 'correct_time'
+#' @description This function uses dplyr to do the same as
+#' 'correct_time', just with less code. Most functions used in the original
+#' function are replaced with simple data transformations.
+#' @param features Tibble Table with features to correct.
+#' @param template Tibble Template feature table to use for correction.
+#' @return Tibble Corrected feature table.
 #' @export
 correct_time_v2 <- function(features, template) {
   if (unique(features$sample_id) == unique(template$sample_id))