From 4b108f3fcea52d74be141471abaa86dc3a8476c4 Mon Sep 17 00:00:00 2001 From: Yichen Wang Date: Tue, 26 Mar 2024 13:16:36 -0400 Subject: [PATCH] minor fixes --- DESCRIPTION | 2 +- NEWS.md | 18 +++++++++++------- R/classConversion.R | 2 +- R/h5Utility.R | 1 - R/import.R | 16 ++++++++++++---- R/integration.R | 44 +++++++++++++++++++++++++++++++++++++------ man/createLiger.Rd | 11 ++++++++--- man/quantileNorm.Rd | 5 +++-- man/readLiger.Rd | 4 +++- man/restoreH5Liger.Rd | 4 +++- 10 files changed, 80 insertions(+), 27 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index daf42221..11e2f0b7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,7 +43,7 @@ Depends: methods, stats, utils, - R (>= 3.4) + R (>= 3.5) Imports: circlize, cli, diff --git a/NEWS.md b/NEWS.md index 03a3f6ad..5193f13e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,11 +1,12 @@ ## rliger Next -- Standardized H5 writing specification that can be shared with other platforms. - - Currently we allow analysis with 10X cellranger output H5 file and H5AD file from anndata>=0.8.0 - - Writing to H5AD file should follow anndata specification otherwise the file cannot be read back to a Python seesion. - - Writing to 10X H5 file should be carefully investigated. - - Consider using object backend to store information instead of active H5 binding, which cannot be serialized to RDS. - - Investigate whether to use existing backend implementation like HDF5Array, DelayedArray. +- Standardized H5 IO specification that can be shared with other platforms. + - Will move to use HDF5Array (TENxMatrix, H5ADMatrix)/ or BPCells for backed data representation. + - Read feature metadata (e.g. id, name, ...) if available; Allow setting "id" as rownames, "name" for visualization. 
+ - rawData - coming from the original input, read only (qc filtering should be just stored in the object, no IO) + - preprocessing metrics - nUMI, nGene and etc, still go "chunkApply" so the file is read only once + - normData - delayed computed data from rawData, no on disk representation + - scaleData - new on-disk file and then create object back, because RcppPlanc won't be able to handle delayed computation - Ability to reorganize datasets - Allow doing something like `reorganize(ligerObj, variable = "somethingNotDataset")` and resulting in a new liger object with different ligerDataset grouping. - Ability to do downstream analysis on H5 data @@ -15,8 +16,11 @@ ## rliger 2.0.1 - Fixed wrong UINMF aborting criteria -- Fixed example/test skipping criteria for nonexisting dependencies +- Fixed example/test skipping criteria for non-existing dependencies - Fixed file access issue when checking package on CRAN +- Updated installed data file `system.file("extdata/ctrl.h5", "extdata/stim.h5")` to be of standard 10X H5 format +- Updated `quantileNorm()` automatic reference selection according to #297 +- Other minor fixes ## rliger 2.0.0 diff --git a/R/classConversion.R b/R/classConversion.R index adcffe02..1743273d 100644 --- a/R/classConversion.R +++ b/R/classConversion.R @@ -512,7 +512,7 @@ convertOldLiger.mem <- function(object) { if (!is.null(dataList$rawData)) features <- rownames(dataList$rawData) else features <- rownames(dataList$normData) if (is.null(features)) { - cli::cli_alert_abort( + cli::cli_abort( "Cannot detect feature names for dataset {.val {d}}." 
) } diff --git a/R/h5Utility.R b/R/h5Utility.R index dc58cfaa..bd0c401b 100644 --- a/R/h5Utility.R +++ b/R/h5Utility.R @@ -61,7 +61,6 @@ H5Apply <- function( total = numChunks, clear = FALSE) # pb <- utils::txtProgressBar(0, numChunks, style = 3) for (i in seq(numChunks)) { - Sys.sleep(0.1) start <- (i - 1)*chunkSize + 1 end <- if (i*chunkSize > ncol(object)) ncol(object) else i*chunkSize colptrStart <- start diff --git a/R/import.R b/R/import.R index 2b1af7ec..58f69873 100644 --- a/R/import.R +++ b/R/import.R @@ -41,8 +41,9 @@ #' @param verbose Logical. Whether to show information of the progress. Default #' \code{getOption("ligerVerbose")} or \code{TRUE} if users have not set. #' @param ... Additional slot values that should be directly placed in object. -#' @param remove.missing,format.type,data.name,indices.name,indptr.name,genes.name,barcodes.name +#' @param raw.data,remove.missing,format.type,data.name,indices.name,indptr.name,genes.name,barcodes.name #' \bold{Deprecated.} See Usage section for replacement. +#' @param take.gene.union Defunct. Will be ignored. 
#' @export #' @seealso \code{\link{createLigerDataset}}, \code{\link{createH5LigerDataset}} #' @examples @@ -78,6 +79,8 @@ createLiger <- function( verbose = getOption("ligerVerbose", TRUE), ..., # Deprecated coding style + raw.data = rawData, + take.gene.union = NULL, remove.missing = removeMissing, format.type = formatType, data.name = dataName, @@ -86,12 +89,17 @@ createLiger <- function( genes.name = genesName, barcodes.name = barcodesName ) { - .deprecateArgs(list(remove.missing = "removeMissing", + .deprecateArgs(list(raw.data = "rawData", remove.missing = "removeMissing", format.type = "formatType", data.name = "dataName", indices.name = "indicesName", indptr.name = "indptrName", genes.name = "genesName", - barcodes.name = "barcodesName")) - if (!is.list(rawData)) cli::cli_abort("{.var rawData} has to be a named list.") + barcodes.name = "barcodesName"), + defunct = "take.gene.union") + if (!is.list(rawData) || + is.null(names(rawData)) || + any(nchar(names(rawData)) == 0)) { + cli::cli_abort("{.var rawData} has to be a named list.") + } nData <- length(rawData) if (missing(modal) || is.null(modal)) modal <- "default" diff --git a/R/integration.R b/R/integration.R index e84f4b06..e61d5197 100644 --- a/R/integration.R +++ b/R/integration.R @@ -1275,8 +1275,9 @@ runUINMF.liger <- function( #' Default \code{50}. #' @param reference Character, numeric or logical selection of one dataset, out #' of all available datasets in \code{object}, to use as a "reference" for -#' normalization. Default \code{NULL} use the dataset with the largest number of -#' cells. +#' quantile normalization. Default \code{NULL} tries to find an RNA dataset with +#' the largest number of cells; if no RNA dataset available, use the globally +#' largest dataset. #' @param minCells Minimum number of cells to consider a cluster shared across #' datasets. Default \code{20}. #' @param nNeighbors Number of nearest neighbors for within-dataset knn graph. 
@@ -1350,12 +1351,23 @@ quantileNorm.liger <- function( ) { .checkObjVersion(object) .checkValidFactorResult(object, checkV = FALSE) - reference <- reference %||% names(which.max(sapply(datasets(object), ncol))) - reference <- .checkUseDatasets(object, useDatasets = reference) - if (length(reference) != 1) { - cli::cli_abort("Should specify only one reference dataset.") + # Choose largest RNA dataset as default if possible, as per Issue #297 + if (is.null(reference)) { + reference <- .autoFindRef_qn(object) + } else { + reference <- .checkUseDatasets(object, useDatasets = reference) + if (length(reference) != 1) { + cli::cli_abort("Should specify only one reference dataset.") + } + if (inherits(dataset(object, reference), c("ligerATACDataset", "ligerSpatialDataset", "ligerMethDataset"))) { + cli::cli_alert_warning( + "Dataset of {.val {modalOf(dataset(object, reference))}} modality is not recommended to be set as reference." + ) + } } + object <- recordCommand(object, ..., dependencies = "RANN") + out <- .quantileNorm.HList( object = getMatrix(object, "H"), quantiles = quantiles, @@ -1521,6 +1533,26 @@ quantileNorm.Seurat <- function( return(list('H.norm' = Reduce(rbind, Hs), 'clusters' = clusterAssign)) } +.autoFindRef_qn <- function(object) { + notRecom <- c("ligerATACDataset", "ligerSpatialDataset", "ligerMethDataset") + recom <- !sapply(datasets(object), inherits, what = notRecom) + if (sum(recom) == 0) { + cli::cli_alert_warning( + "Auto selecting reference, dataset of recommended type not found." 
+ ) + ref <- names(which.max(sapply(datasets(object), ncol))) + cli::cli_alert_info( + "Using globally largest dataset as reference: {.val {ref}} with {lengths(object)[ref]} cells" + ) + } else { + ref <- names(which.max(sapply(datasets(object)[recom], ncol))) + cli::cli_alert_info( + "Using largest dataset of recommended type as reference: {.val {ref}} with {lengths(object)[ref]} cells" + ) + } + return(ref) +} + #' [Deprecated] Quantile align (normalize) factor loading #' @description #' \bold{Please turn to \code{\link{quantileNorm}}.} diff --git a/man/createLiger.Rd b/man/createLiger.Rd index f3fbdccc..e6e5e0d5 100644 --- a/man/createLiger.Rd +++ b/man/createLiger.Rd @@ -20,6 +20,8 @@ createLiger( newH5 = TRUE, verbose = getOption("ligerVerbose", TRUE), ..., + raw.data = rawData, + take.gene.union = NULL, remove.missing = removeMissing, format.type = formatType, data.name = dataName, @@ -78,7 +80,9 @@ can be dangerous for large scale analysis.} \item{...}{Additional slot values that should be directly placed in object.} -\item{remove.missing, format.type, data.name, indices.name, indptr.name, genes.name, barcodes.name}{\bold{Deprecated.} See Usage section for replacement.} +\item{raw.data, remove.missing, format.type, data.name, indices.name, indptr.name, genes.name, barcodes.name}{\bold{Deprecated.} See Usage section for replacement.} + +\item{take.gene.union}{Defunct. 
Will be ignored.} } \description{ This function allows creating \linkS4class{liger} object from @@ -96,8 +100,9 @@ pbmc1 <- createLiger(list(ctrl = ctrl.raw, stim = stim.raw)) # Create from H5 files h5Path <- system.file("extdata/ctrl.h5", package = "rliger") -print(h5Path) -lig <- createLiger(list(ctrl = h5Path)) +tempPath <- tempfile(fileext = ".h5") +file.copy(from = h5Path, to = tempPath) +lig <- createLiger(list(ctrl = tempPath)) # Create from other container object ctrl.seu <- SeuratObject::CreateSeuratObject(ctrl.raw) diff --git a/man/quantileNorm.Rd b/man/quantileNorm.Rd index 2837dd69..7ebd0792 100644 --- a/man/quantileNorm.Rd +++ b/man/quantileNorm.Rd @@ -54,8 +54,9 @@ Default \code{50}.} \item{reference}{Character, numeric or logical selection of one dataset, out of all available datasets in \code{object}, to use as a "reference" for -normalization. Default \code{NULL} use the dataset with the largest number of -cells.} +quantile normalization. Default \code{NULL} tries to find an RNA dataset with +the largest number of cells; if no RNA dataset available, use the globally +largest dataset.} \item{minCells}{Minimum number of cells to consider a cluster shared across datasets. Default \code{20}.} diff --git a/man/readLiger.Rd b/man/readLiger.Rd index 1c7af7b1..fab16f42 100644 --- a/man/readLiger.Rd +++ b/man/readLiger.Rd @@ -49,7 +49,9 @@ pbmc <- readLiger(tempPath) # Save and read H5-based liger object h5Path <- system.file("extdata/ctrl.h5", package = "rliger") -lig <- createLiger(list(ctrl = h5Path)) +h5tempPath <- tempfile(fileext = ".h5") +file.copy(from = h5Path, to = h5tempPath) +lig <- createLiger(list(ctrl = h5tempPath)) tempPath <- tempfile(fileext = ".rds") saveRDS(lig, tempPath) lig <- readLiger(tempPath) diff --git a/man/restoreH5Liger.Rd b/man/restoreH5Liger.Rd index 62173c90..6b0cfb89 100644 --- a/man/restoreH5Liger.Rd +++ b/man/restoreH5Liger.Rd @@ -34,7 +34,9 @@ for data structure. 
} \examples{ h5Path <- system.file("extdata/ctrl.h5", package = "rliger") -lig <- createLiger(list(ctrl = h5Path)) +tempPath <- tempfile(fileext = ".h5") +file.copy(from = h5Path, to = tempPath) +lig <- createLiger(list(ctrl = tempPath)) # Now it is actually an invalid object! which is equivalent to what users # will get with `saveRDS(lig, "object.rds"); lig <- readRDS("object.rds")`` closeAllH5(lig)