From 4b108f3fcea52d74be141471abaa86dc3a8476c4 Mon Sep 17 00:00:00 2001
From: Yichen Wang <wayichen@umich.edu>
Date: Tue, 26 Mar 2024 13:16:36 -0400
Subject: [PATCH] minor fixes

---
 DESCRIPTION           |  2 +-
 NEWS.md               | 18 +++++++++++-------
 R/classConversion.R   |  2 +-
 R/h5Utility.R         |  1 -
 R/import.R            | 16 ++++++++++++----
 R/integration.R       | 44 +++++++++++++++++++++++++++++++++++++------
 man/createLiger.Rd    | 11 ++++++++---
 man/quantileNorm.Rd   |  5 +++--
 man/readLiger.Rd      |  4 +++-
 man/restoreH5Liger.Rd |  4 +++-
 10 files changed, 80 insertions(+), 27 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index daf42221..11e2f0b7 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -43,7 +43,7 @@ Depends:
     methods,
     stats,
     utils,
-    R (>= 3.4)
+    R (>= 3.5)
 Imports:
     circlize,
     cli,
diff --git a/NEWS.md b/NEWS.md
index 03a3f6ad..5193f13e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,11 +1,12 @@
 ## rliger Next
 
-- Standardized H5 writing specification that can be shared with other platforms.
-  - Currently we allow analysis with 10X cellranger output H5 file and H5AD file from anndata>=0.8.0
-  - Writing to H5AD file should follow anndata specification otherwise the file cannot be read back to a Python seesion.
-  - Writing to 10X H5 file should be carefully investigated.
-  - Consider using object backend to store information instead of active H5 binding, which cannot be serialized to RDS.
-  - Investigate whether to use existing backend implementation like HDF5Array, DelayedArray.
+- Standardized H5 IO specification that can be shared with other platforms.
+  - Will move to use HDF5Array (TENxMatrix, H5ADMatrix) or BPCells for backed data representation.
+  - Read feature metadata (e.g. id, name, ...) if available; Allow setting "id" as rownames, "name" for visualization.
+  - rawData - coming from the original input, read only (qc filtering should be just stored in the object, no IO)
+  - preprocessing metrics - nUMI, nGene, etc. still go through "chunkApply" so the file is read only once
+  - normData - delayed computed data from rawData, no on disk representation
+  - scaleData - new on-disk file and then create object back, because RcppPlanc won't be able to handle delayed computation
 - Ability to reorganize datasets
   - Allow doing something like `reorganize(ligerObj, variable = "somethingNotDataset")` and resulting in a new liger object with different ligerDataset grouping.
 - Ability to do downstream analysis on H5 data
@@ -15,8 +16,11 @@
 ## rliger 2.0.1
 
 - Fixed wrong UINMF aborting criteria
-- Fixed example/test skipping criteria for nonexisting dependencies
+- Fixed example/test skipping criteria for non-existing dependencies
 - Fixed file access issue when checking package on CRAN
+- Updated installed data files `extdata/ctrl.h5` and `extdata/stim.h5` (located with `system.file()`) to be of standard 10X H5 format
+- Updated `quantileNorm()` automatic reference selection according to #297
+- Other minor fixes
 
 ## rliger 2.0.0
 
diff --git a/R/classConversion.R b/R/classConversion.R
index adcffe02..1743273d 100644
--- a/R/classConversion.R
+++ b/R/classConversion.R
@@ -512,7 +512,7 @@ convertOldLiger.mem <- function(object) {
         if (!is.null(dataList$rawData)) features <- rownames(dataList$rawData)
         else features <- rownames(dataList$normData)
         if (is.null(features)) {
-            cli::cli_alert_abort(
+            cli::cli_abort(
                 "Cannot detect feature names for dataset {.val {d}}."
             )
         }
diff --git a/R/h5Utility.R b/R/h5Utility.R
index dc58cfaa..bd0c401b 100644
--- a/R/h5Utility.R
+++ b/R/h5Utility.R
@@ -61,7 +61,6 @@ H5Apply <- function(
                                        total = numChunks, clear = FALSE)
         # pb <- utils::txtProgressBar(0, numChunks, style = 3)
     for (i in seq(numChunks)) {
-        Sys.sleep(0.1)
         start <- (i - 1)*chunkSize + 1
         end <- if (i*chunkSize > ncol(object)) ncol(object) else i*chunkSize
         colptrStart <- start
diff --git a/R/import.R b/R/import.R
index 2b1af7ec..58f69873 100644
--- a/R/import.R
+++ b/R/import.R
@@ -41,8 +41,9 @@
 #' @param verbose Logical. Whether to show information of the progress. Default
 #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set.
 #' @param ... Additional slot values that should be directly placed in object.
-#' @param remove.missing,format.type,data.name,indices.name,indptr.name,genes.name,barcodes.name
+#' @param raw.data,remove.missing,format.type,data.name,indices.name,indptr.name,genes.name,barcodes.name
 #' \bold{Deprecated.} See Usage section for replacement.
+#' @param take.gene.union Defunct. Will be ignored.
 #' @export
 #' @seealso \code{\link{createLigerDataset}}, \code{\link{createH5LigerDataset}}
 #' @examples
@@ -78,6 +79,8 @@ createLiger <- function(
         verbose = getOption("ligerVerbose", TRUE),
         ...,
         # Deprecated coding style
+        raw.data = rawData,
+        take.gene.union = NULL,
         remove.missing = removeMissing,
         format.type = formatType,
         data.name = dataName,
@@ -86,12 +89,17 @@ createLiger <- function(
         genes.name = genesName,
         barcodes.name = barcodesName
 ) {
-    .deprecateArgs(list(remove.missing = "removeMissing",
+    .deprecateArgs(list(raw.data = "rawData", remove.missing = "removeMissing",
                         format.type = "formatType", data.name = "dataName",
                         indices.name = "indicesName",
                         indptr.name = "indptrName", genes.name = "genesName",
-                        barcodes.name = "barcodesName"))
-    if (!is.list(rawData)) cli::cli_abort("{.var rawData} has to be a named list.")
+                        barcodes.name = "barcodesName"),
+                   defunct = "take.gene.union")
+    if (!is.list(rawData) ||
+        is.null(names(rawData)) ||
+        any(nchar(names(rawData)) == 0)) {
+        cli::cli_abort("{.var rawData} has to be a named list.")
+    }
 
     nData <- length(rawData)
     if (missing(modal) || is.null(modal)) modal <- "default"
diff --git a/R/integration.R b/R/integration.R
index e84f4b06..e61d5197 100644
--- a/R/integration.R
+++ b/R/integration.R
@@ -1275,8 +1275,9 @@ runUINMF.liger <- function(
 #' Default \code{50}.
 #' @param reference Character, numeric or logical selection of one dataset, out
 #' of all available datasets in \code{object}, to use as a "reference" for
-#' normalization. Default \code{NULL} use the dataset with the largest number of
-#' cells.
+#' quantile normalization. Default \code{NULL} tries to find an RNA dataset with
+#' the largest number of cells; if no RNA dataset available, use the globally
+#' largest dataset.
 #' @param minCells Minimum number of cells to consider a cluster shared across
 #' datasets. Default \code{20}.
 #' @param nNeighbors Number of nearest neighbors for within-dataset knn graph.
@@ -1350,12 +1351,23 @@ quantileNorm.liger <- function(
 ) {
     .checkObjVersion(object)
     .checkValidFactorResult(object, checkV = FALSE)
-    reference <- reference %||% names(which.max(sapply(datasets(object), ncol)))
-    reference <- .checkUseDatasets(object, useDatasets = reference)
-    if (length(reference) != 1) {
-        cli::cli_abort("Should specify only one reference dataset.")
+    # Choose largest RNA dataset as default if possible, as per Issue #297
+    if (is.null(reference)) {
+        reference <- .autoFindRef_qn(object)
+    } else {
+        reference <- .checkUseDatasets(object, useDatasets = reference)
+        if (length(reference) != 1) {
+            cli::cli_abort("Should specify only one reference dataset.")
+        }
+        # Warn, but do not abort, when the chosen reference is an ATAC, spatial or methylation dataset.
+        refData <- dataset(object, reference)
+        if (inherits(refData, c("ligerATACDataset", "ligerSpatialDataset", "ligerMethDataset"))) {
+            cli::cli_alert_warning("Dataset of {.val {modalOf(refData)}} modality is not recommended to be set as reference.")
+        }
     }
+
     object <- recordCommand(object, ..., dependencies = "RANN")
+
     out <- .quantileNorm.HList(
         object = getMatrix(object, "H"),
         quantiles = quantiles,
@@ -1521,6 +1533,26 @@ quantileNorm.Seurat <- function(
     return(list('H.norm' = Reduce(rbind, Hs), 'clusters' = clusterAssign))
 }
 
+.autoFindRef_qn <- function(object) {
+    # Pick the default reference: the largest dataset that is not ATAC, spatial or methylation.
+    allData <- datasets(object)
+    discouraged <- c("ligerATACDataset", "ligerSpatialDataset", "ligerMethDataset")
+    preferred <- !sapply(allData, inherits, what = discouraged)
+    if (any(preferred)) {
+        ref <- names(which.max(sapply(allData[preferred], ncol)))
+        cli::cli_alert_info(
+            "Using largest dataset of recommended type as reference: {.val {ref}} with {lengths(object)[ref]} cells"
+        )
+    } else {
+        cli::cli_alert_warning("Auto selecting reference, dataset of recommended type not found.")
+        ref <- names(which.max(sapply(allData, ncol)))
+        cli::cli_alert_info(
+            "Using globally largest dataset as reference: {.val {ref}} with {lengths(object)[ref]} cells"
+        )
+    }
+    return(ref)
+}
+
 #' [Deprecated] Quantile align (normalize) factor loading
 #' @description
 #' \bold{Please turn to \code{\link{quantileNorm}}.}
diff --git a/man/createLiger.Rd b/man/createLiger.Rd
index f3fbdccc..e6e5e0d5 100644
--- a/man/createLiger.Rd
+++ b/man/createLiger.Rd
@@ -20,6 +20,8 @@ createLiger(
   newH5 = TRUE,
   verbose = getOption("ligerVerbose", TRUE),
   ...,
+  raw.data = rawData,
+  take.gene.union = NULL,
   remove.missing = removeMissing,
   format.type = formatType,
   data.name = dataName,
@@ -78,7 +80,9 @@ can be dangerous for large scale analysis.}
 
 \item{...}{Additional slot values that should be directly placed in object.}
 
-\item{remove.missing, format.type, data.name, indices.name, indptr.name, genes.name, barcodes.name}{\bold{Deprecated.} See Usage section for replacement.}
+\item{raw.data, remove.missing, format.type, data.name, indices.name, indptr.name, genes.name, barcodes.name}{\bold{Deprecated.} See Usage section for replacement.}
+
+\item{take.gene.union}{Defunct. Will be ignored.}
 }
 \description{
 This function allows creating \linkS4class{liger} object from
@@ -96,8 +100,9 @@ pbmc1 <- createLiger(list(ctrl = ctrl.raw, stim = stim.raw))
 
 # Create from H5 files
 h5Path <- system.file("extdata/ctrl.h5", package = "rliger")
-print(h5Path)
-lig <- createLiger(list(ctrl = h5Path))
+tempPath <- tempfile(fileext = ".h5")
+file.copy(from = h5Path, to = tempPath)
+lig <- createLiger(list(ctrl = tempPath))
 
 # Create from other container object
 ctrl.seu <- SeuratObject::CreateSeuratObject(ctrl.raw)
diff --git a/man/quantileNorm.Rd b/man/quantileNorm.Rd
index 2837dd69..7ebd0792 100644
--- a/man/quantileNorm.Rd
+++ b/man/quantileNorm.Rd
@@ -54,8 +54,9 @@ Default \code{50}.}
 
 \item{reference}{Character, numeric or logical selection of one dataset, out
 of all available datasets in \code{object}, to use as a "reference" for
-normalization. Default \code{NULL} use the dataset with the largest number of
-cells.}
+quantile normalization. Default \code{NULL} tries to find an RNA dataset with
+the largest number of cells; if no RNA dataset available, use the globally
+largest dataset.}
 
 \item{minCells}{Minimum number of cells to consider a cluster shared across
 datasets. Default \code{20}.}
diff --git a/man/readLiger.Rd b/man/readLiger.Rd
index 1c7af7b1..fab16f42 100644
--- a/man/readLiger.Rd
+++ b/man/readLiger.Rd
@@ -49,7 +49,9 @@ pbmc <- readLiger(tempPath)
 
 # Save and read H5-based liger object
 h5Path <- system.file("extdata/ctrl.h5", package = "rliger")
-lig <- createLiger(list(ctrl = h5Path))
+h5tempPath <- tempfile(fileext = ".h5")
+file.copy(from = h5Path, to = h5tempPath)
+lig <- createLiger(list(ctrl = h5tempPath))
 tempPath <- tempfile(fileext = ".rds")
 saveRDS(lig, tempPath)
 lig <- readLiger(tempPath)
diff --git a/man/restoreH5Liger.Rd b/man/restoreH5Liger.Rd
index 62173c90..6b0cfb89 100644
--- a/man/restoreH5Liger.Rd
+++ b/man/restoreH5Liger.Rd
@@ -34,7 +34,9 @@ for data structure.
 }
 \examples{
 h5Path <- system.file("extdata/ctrl.h5", package = "rliger")
-lig <- createLiger(list(ctrl = h5Path))
+tempPath <- tempfile(fileext = ".h5")
+file.copy(from = h5Path, to = tempPath)
+lig <- createLiger(list(ctrl = tempPath))
 # Now it is actually an invalid object! which is equivalent to what users
 # will get with `saveRDS(lig, "object.rds"); lig <- readRDS("object.rds")``
 closeAllH5(lig)