diff --git a/DESCRIPTION b/DESCRIPTION
index 09c20d9..b9bdfa3 100755
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: msImpute
 Type: Package
 Title: Peptide imputation in label-free proteomics
-Version: 1.2.0
+Version: 1.3.0
 Authors@R: 
     person(given = "Soroor",
            family = "Hediyeh-zadeh",
diff --git a/NAMESPACE b/NAMESPACE
index 7b10eaa..890a15b 100755
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -5,8 +5,11 @@ export(KNC)
 export(KNN)
 export(betweenness)
 export(computeStructuralMetrics)
+export(findVariableFeatures)
 export(gromov_wasserstein)
 export(msImpute)
 export(scaleData)
 export(selectFeatures)
 export(withinness)
+importFrom(scran,decomposeVar)
+importFrom(scran,trendVar)
diff --git a/R/CPD.R b/R/CPD.R
index a1d9b89..5acc939 100755
--- a/R/CPD.R
+++ b/R/CPD.R
@@ -4,8 +4,8 @@
 #' CPD quantifies preservation of the global structure after imputation.
 #' Requires complete datasets - for developers/use in benchmark studies only.
 #'
-#' @param xorigin numeric matrix. The original data. Can not contain missing values.
-#' @param ximputed numeric matrix. The imputed data. Can not contain missing values.
+#' @param xorigin numeric matrix. The original log-intensity data. Can not contain missing values.
+#' @param ximputed numeric matrix. The imputed log-intensity data. Can not contain missing values.
 #'
 #' @return numeric
 #'
diff --git a/R/KNC.R b/R/KNC.R
index fd2003c..85e9d26 100755
--- a/R/KNC.R
+++ b/R/KNC.R
@@ -4,8 +4,8 @@
 #' quantifies preservation of the mesoscopic structure after imputation.
 #' Requires complete datasets - for developers/use in benchmark studies only.
 #'
-#' @param xorigin numeric matrix. The original data. Can contain missing values.
-#' @param ximputed numeric matrix. The imputed data.
+#' @param xorigin numeric matrix. The original log-intensity data. Can contain missing values.
+#' @param ximputed numeric matrix. The imputed log-intensity data.
 #' @param class factor. A vector of length number of columns (samples) in the data specifying the class/label (i.e. experimental group) of each sample.
 #' @param k  number of nearest class means. default to k=3.
 #'
diff --git a/R/KNN.R b/R/KNN.R
index 64b1e5c..15a59b6 100755
--- a/R/KNN.R
+++ b/R/KNN.R
@@ -4,8 +4,8 @@
 #' KNN quantifies preservation of the local, or microscopic structure.
 #' Requires complete datasets - for developers/use in benchmark studies only.
 #'
-#' @param xorigin numeric matrix. The original data. Can not contain missing values.
-#' @param ximputed numeric matrix. The imputed data. Can not contain missing values.
+#' @param xorigin numeric matrix. The original log-intensity data. Can not contain missing values.
+#' @param ximputed numeric matrix. The imputed log-intensity data. Can not contain missing values.
 #' @param k  number of nearest neighbours. default to k=3.
 #'
 #' @return numeric  The proportion of preserved k-nearest neighbours in imputed data.
diff --git a/R/computeStructuralMetrics.R b/R/computeStructuralMetrics.R
index b27d0c7..2d3d6e1 100644
--- a/R/computeStructuralMetrics.R
+++ b/R/computeStructuralMetrics.R
@@ -1,12 +1,13 @@
 #' Metrics for the assessment of post-imputation structural preservation
 #'
 #' For an imputed dataset, it computes within phenotype/experimental condition similarity (i.e. preservation of local structures),
-#' between phenotype distances (preservation of global structures), and the Gromov-Wasserstein (GW) distance between original and
+#' between phenotype distances (preservation of global structures), and the Gromov-Wasserstein (GW) distance between original (source) and
 #' imputed data.
 #'
-#' @param x numeric matrix. An imputed data matrix.
+#' @param x numeric matrix. An imputed data matrix of log-intensity.
 #' @param group factor. A vector of biological groups, experimental conditions or phenotypes (e.g. control, treatment).
-#' @param xna numeric matrix. Data matrix with missing values (i.e. the original intensity matrix with NAs)
+#' @param y numeric matrix. The source data (i.e. the original log-intensity matrix), preferably subsetted on highly variable peptides (see \code{findVariableFeatures}).
+#' @param k numeric. Number of Principal Components used to compute the GW distance. default to 2.
 #'
 #' @details For each group of experimental conditions (e.g. treatment and control), the group centroid is calculated as the average
 #' of observed peptide intensities. Withinness for each group is computed as sum of the squared distances between samples in that group and
@@ -16,15 +17,21 @@
 #' The GW metric considers preservation of both local and global structures simultaneously. A small GW distance suggests that
 #' imputation has introduced small distortions to global and local structures overall, whereas a large distance implies significant
 #' distortions. When comparing two or more imputation methods, the optimal method is the method with smallest GW distance.
-#' To compute the GW distance, the missing values in each column of \code{xna} are replaced by mean of observed values in that column.
-#' This is equivalent to imputation by KNN, where k is set to the total number of identified peptides (i.e. number of rows in the input matrix).
-#' GW distance estimation requires \code{python}. See example.
-#' All metrics are on log scale.
+#' The GW distance is computed on Principal Components (PCs) of the source and imputed data, instead of peptides. Principal components capture the
+#' geometry of the data, hence GW computed on PCs is a better measure of preservation of local and global structures. The PCs in the source data are
+#' recommended to be computed on peptides with high biological variance. Hence, users are recommended to subset the source data only on highly variable peptides (hvp)
+#' (see \code{findVariableFeatures}). Since the hvp peptides have high biological variance, they are likely to have enough information to discriminate samples
+#' from different experimental groups. Hence, PCs computed on those peptides should be representative of the original source data with missing values.
+#' If the samples cluster by experimental group in the first couple of PCs, then a choice of k=2 is reasonable. If the desired separation/clustering of samples
+#' occurs in later PCs (i.e. the first few PCs are dominated by batches or unwanted variability), then it is recommended to use a larger number of PCs to compute the
+#' GW metric. If you are interested in how well the imputed data represent the original data in all possible dimensions, then set k to the number of samples
+#' in the data (i.e. the number of columns in the intensity matrix).
+#' GW distance estimation requires \code{python}. See example. All metrics are on log scale.
 #'
 #'
 #' @return list of three metrics: withinness (sum of squared distances within a phenotype group),
 #' betweenness (sum of squared distances between the phenotypes), and gromov-wasserstein distance (if \code{xna} is not NULL).
-#' All metrics are on log scale.
+#' If \code{group} is NULL, only the GW distance is returned. All metrics are on log scale.
 #'
 #'
 #' @examples
@@ -49,28 +56,35 @@
 #' # you can then run the computeStructuralMetrics() function.
 #' # Note that the reticulate package should be loaded before loading msImpute.
 #' set.seed(101)
-#' n=200
-#' p=100
-#' J=50
+#' n=12000
+#' p=10
+#' J=5
 #' np=n*p
 #' missfrac=0.3
-#' x=matrix(rnorm(n*J),n,J)%*%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+#' x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)%*%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+#'   matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 #' ix=seq(np)
 #' imiss=sample(ix,np*missfrac,replace=FALSE)
 #' xna=x
 #' xna[imiss]=NA
+#' keep <- (rowSums(!is.na(xna)) >= 4)
+#' xna <- xna[keep,]
+#' rownames(xna) <- 1:nrow(xna)
 #' y <- xna
 #' xna <- scaleData(xna)
 #' xcomplete <- msImpute(object=xna)
-#' G <- as.factor(sample(1:5, 100, replace = TRUE))
-#' computeStructuralMetrics(xcomplete, G, y)
+#' G <- as.factor(sample(1:3, p, replace = TRUE))
+#' top.hvp <- findVariableFeatures(y)
+#' computeStructuralMetrics(xcomplete, G, y[rownames(top.hvp)[1:50],], k = 2)
 #' @export
-computeStructuralMetrics <- function(x, group, xna = NULL){
-  out <- list(withinness = log(withinness(x, group)),
-       betweenness = log(betweenness(x,group)))
+computeStructuralMetrics <- function(x, group=NULL, y = NULL, k=2){
+  # Start from an empty list so the result exists even when group is NULL.
+  out <- list()
+  if(!is.null(group)) out <- list(withinness = log(withinness(x, group)),
+                                  betweenness = log(betweenness(x,group)))
 
-  if(!is.null(xna)){
-    GW <- gromov_wasserstein(xna, x)
+  if(!is.null(y)){
+    GW <- gromov_wasserstein(x, y, k=k)
     out[['gw_dist']] <- GW[[2]]$gw_dist
   }
   return(out)
@@ -101,8 +115,33 @@ betweenness <- function(x, class_label){
 
 
 #' @export
-gromov_wasserstein <- function(xna, ximputed){
+gromov_wasserstein <- function(x, y, k, min.mean = 0.1){
+  # GW distance between the first k principal-component scores of the imputed
+  # matrix x and the source matrix y (rows are peptides, columns are samples).
+  if (k > ncol(x)) stop("Number of Principal Components cannot be greater than number of columns (samples) in the data.")
+  if (any(!is.finite(x))) stop("Non-finite values (NA, Inf, NaN) encountered in imputed data")
+  if (any(!is.finite(y))) stop("Non-finite values (NA, Inf, NaN) encountered in source data")
+
+  means <- rowMeans(x)
+  sds <- matrixStats::rowSds(x)
+
+  # Filtering out zero-variance and low-abundance peptides
+  is.okay <- !is.na(sds) & sds > 1e-8 & means >= min.mean
+
+  # PCA with samples as observations; only x is filtered by is.okay
+  xt_pca <- prcomp(t(x)[,is.okay], scale. = TRUE, center = TRUE)
+  yt_pca <- prcomp(t(y), scale. = TRUE, center = TRUE)
+
+  # drop = FALSE keeps the scores as a matrix even when k = 1
+  C1 <- yt_pca$x[, seq_len(k), drop = FALSE]
+  C2 <- xt_pca$x[, seq_len(k), drop = FALSE]
+
+  # gw() is made available by sourcing the packaged python script below
+  cat("Computing GW distance using k=", k, "Principal Components\n")
   reticulate::source_python(system.file("python", "gw.py", package = "msImpute"))
-  xna <- apply(xna, 2, FUN=function(x) {x[is.na(x)] <- mean(x, na.rm=TRUE); return(x)})
-  return(gw(t(xna), t(ximputed), ncol(xna)))
+  return(gw(C1,C2, ncol(x)))
 }
+
+
+
+
diff --git a/R/findVariableFeatures.R b/R/findVariableFeatures.R
new file mode 100644
index 0000000..22c769d
--- /dev/null
+++ b/R/findVariableFeatures.R
@@ -0,0 +1,25 @@
+#' Find highly variable peptides
+#'
+#' For each peptide, the total variance is decomposed into biological and technical variance using package \code{scran}.
+#' @param y numeric matrix giving log-intensity. Can contain NA values.
+#'
+#' @return A data frame where rows are peptides and columns contain estimates of biological and technical variances. Peptides are ordered by biological variance.
+#'
+#' @details A loess trend is fitted to total sample variances and mean intensities. For each peptide, the biological variance is then
+#' computed by subtracting the estimated technical variance from the loess fit from the total sample variance.
+#'
+#' @seealso computeStructuralMetrics
+#'
+#' @export
+#' @importFrom scran trendVar decomposeVar
+findVariableFeatures <- function(y) {
+  # Fit the mean-variance trend, then decompose each peptide's variance with it.
+  trend_fit <- trendVar(y)
+  decomposed <- decomposeVar(y, trend_fit)
+  # Diagnostic plot: total variance against mean, technical component in red.
+  by_mean <- order(decomposed$mean)
+  plot(decomposed$mean, decomposed$total)
+  lines(decomposed$mean[by_mean], decomposed$tech[by_mean], col = "red", lwd = 2)
+  var_table <- as.data.frame(decomposed)
+  var_table[order(var_table$bio, decreasing = TRUE), ]
+}
diff --git a/R/msImpute.R b/R/msImpute.R
index 50917e6..7cfaeca 100755
--- a/R/msImpute.R
+++ b/R/msImpute.R
@@ -8,7 +8,7 @@
 #' \code{msImpute} operates on the softImpute-ALS algorithm.
 #' For more details on the underlying algorithm, please see \code{\link[softImpute]{softImpute}} package.
 #'
-#' @param object Numeric matrix  where missing values are denoted by NA. Rows are peptides, columns are samples.
+#' @param object Numeric matrix giving log-intensity where missing values are denoted by NA. Rows are peptides, columns are samples.
 #' @param rank.max Numeric. This restricts the rank of the solution. is set to min(dim(\code{object})-1) by default.
 #' @param lambda Numeric. Nuclear-norm regularization parameter. Controls the low-rank property of the solution
 #' to the matrix completion problem. By default, it is determined at the scaling step. If set to zero
@@ -24,16 +24,19 @@
 #'
 #' @examples
 #' set.seed(101)
-#' n=200
-#' p=100
-#' J=50
+#' n=12000
+#' p=10
+#' J=5
 #' np=n*p
 #' missfrac=0.3
-#' x=matrix(rnorm(n*J),n,J)%*%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+#' x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)%*%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+#'   matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 #' ix=seq(np)
 #' imiss=sample(ix,np*missfrac,replace=FALSE)
 #' xna=x
 #' xna[imiss]=NA
+#' keep <- (rowSums(!is.na(xna)) >= 4)
+#' xna <- xna[keep,]
 #' xna <- scaleData(xna)
 #' xcomplete <- msImpute(object=xna)
 #' @seealso selectFeatures, scaleData
@@ -50,12 +53,13 @@ msImpute <- function(object, rank.max = NULL, lambda = NULL, thresh = 1e-05,
   if(is(object, "matrix")) {
     x <- object
     xnas <- x
+    warning("Input is not scaled. Data scaling is recommended for msImpute optimal performance.")
     }
   # MAList object
   # or \code{MAList} object from \link{limma}
   # if(is(object,"MAList")) x <- object$E
 
-
+  if(any(is.nan(x) | is.infinite(x))) stop("Inf or NaN values encountered.")
   if(any(rowSums(!is.na(x)) <= 3)) stop("Peptides with excessive NAs are detected. Please revisit your fitering step. At least 4 non-missing measurements are required for any peptide.")
   if(any(x < 0, na.rm = TRUE)){
     warning("Negative values encountered in imputed data. Please consider revising filtering and/or normalisation steps.")
@@ -63,7 +67,7 @@ msImpute <- function(object, rank.max = NULL, lambda = NULL, thresh = 1e-05,
   if(is.null(rank.max)) rank.max <- min(dim(x) - 1)
   cat("maximum rank is", rank.max, "\n")
   cat("computing lambda0 ... \n")
-  if(is.null(lambda)) lambda <- softImpute::lambda0(x)
+  if(is.null(lambda)) lambda <- softImpute::lambda0(xnas)
   cat("lambda0 is", lambda, "\n")
   cat("fit the low-rank model ... \n")
   fit <- softImpute::softImpute(xnas,rank=rank.max,lambda=lambda, type = "als", thresh = thresh,
diff --git a/R/scaleData.R b/R/scaleData.R
index 1f01a28..1b68c5c 100755
--- a/R/scaleData.R
+++ b/R/scaleData.R
@@ -1,7 +1,7 @@
 #' Standardize a matrix to have optionally row means zero and variances one, and/or column means zero and variances one.
 #'
 #'
-#' @param object numeric matrix where missing values are denoted by NA. Rows are peptides, columns are samples.
+#' @param object numeric matrix giving log-intensity where missing values are denoted by NA. Rows are peptides, columns are samples.
 #' @param maxit numeric. maximum iteration for the algorithm to converge (default to 20). When both row and column centering/scaling is requested, iteration may be necessary.
 #' @param thresh numeric. Convergence threshold (default to 1e-09).
 #' @param row.center logical. if row.center==TRUE (the default), row centering will be performed resulting in a matrix with row means zero. If row.center is a vector, it will be used to center the rows. If row.center=FALSE nothing is done.
@@ -12,6 +12,7 @@
 #'
 #' @details
 #' Standardizes rows and/or columns of a matrix with missing values, according to the \code{biScale} algorithm in Hastie et al. 2015.
+#' Data is assumed to be normalised and log-transformed.
 #'
 #' @return
 #' A list of two components: E and E.scaled. E contains the input matrix, E.scaled contains the scaled data
@@ -19,16 +20,19 @@
 #'
 #' @examples
 #' set.seed(101)
-#' n=200
-#' p=100
-#' J=50
+#' n=12000
+#' p=10
+#' J=5
 #' np=n*p
 #' missfrac=0.3
-#' x=matrix(rnorm(n*J),n,J)%*%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+#' x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)%*%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+#'   matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 #' ix=seq(np)
 #' imiss=sample(ix,np*missfrac,replace=FALSE)
 #' xna=x
 #' xna[imiss]=NA
+#' keep <- (rowSums(!is.na(xna)) >= 4)
+#' xna <- xna[keep,]
 #' xna <- scaleData(xna)
 #' @seealso selectFeatures, msImpute
 #' @export
@@ -39,7 +43,7 @@ scaleData <- function(object, maxit = 20, thresh = 1e-09, row.center = TRUE, row
   }else{
     x <- object
   }
-
+  if(any(is.nan(x) | is.infinite(x))) stop("Inf or NaN values encountered.")
   if(any(rowSums(!is.na(x)) <= 3)) stop("Peptides with excessive NAs are detected. Please revisit your fitering step. At least 4 non-missing measurements are required for any peptide.")
   if(any(x < 0, na.rm = TRUE)){
     warning("Negative values encountered in imputed data. Please consider revisting the filtering and/or normalisation steps, if appropriate.")
diff --git a/R/selectFeatures.R b/R/selectFeatures.R
index 76a03d9..18f09eb 100755
--- a/R/selectFeatures.R
+++ b/R/selectFeatures.R
@@ -4,7 +4,7 @@
 #' used to determine if data is Missing Not At Random (MNAR). Users should note that \code{msImpute} assumes peptides
 #' are Missing At Random (MAR).
 #'
-#' @param object Numeric matrix where missing values are denoted by NA.
+#' @param object Numeric matrix giving log-intensity where missing values are denoted by NA.
 #' Rows are peptides, columns are samples.
 #' @param n_features Numeric, number of features with high dropout rate. 500 by default.
 #' @param suppress_plot Logical show plot of dropouts vs abundances.
@@ -13,16 +13,19 @@
 #'
 #' @examples
 #' set.seed(101)
-#' n=800
-#' p=100
-#' J=50
+#' n=12000
+#' p=10
+#' J=5
 #' np=n*p
 #' missfrac=0.3
-#' x=matrix(rnorm(n*J),n,J)%*%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+#' x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)%*%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+#'   matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 #' ix=seq(np)
 #' imiss=sample(ix,np*missfrac,replace=FALSE)
 #' xna=x
 #' xna[imiss]=NA
+#' keep <- (rowSums(!is.na(xna)) >= 4)
+#' xna <- xna[keep,]
 #' rownames(xna) <- 1:nrow(xna)
 #' hdp <- selectFeatures(xna, n_features=500,  suppress_plot=FALSE)
 #' # construct matrix M to capture missing entries
@@ -59,6 +62,8 @@ selectFeatures <- function(object, n_features=500, suppress_plot = FALSE) {
   }
 
   if(is.null(rownames(x))) stop("No row names in input. Please provide input with named rows.")
+  if(any(is.nan(x) | is.infinite(x))) stop("Inf or NaN values encountered.")
+
   AveExpr <- rowMeans(x, na.rm = TRUE)
   dropout <- rowMeans(is.na(x))
 
diff --git a/README.md b/README.md
index 90fd08e..9f4d4cd 100755
--- a/README.md
+++ b/README.md
@@ -19,10 +19,20 @@ selectFeatures(xna)  # xna is a numeric matrix with NAs (for MAR/MNAR diagnosis
 xna <- scaleData(xna) 
 msImpute(xna, rank.max = 2) # rank 2 approximaiton
 xcomplete <- msImpute(xna)  # optimal rank determined by msImpute
-computeStructuralMetrics(xcomplete, group, xna$E) # "group" denotes experimental condition (e.g. control, treatment etc). Requires python. See Manual for more information.
+
+
+# Requires python. See Manual for more information.
+top.hvp <- findVariableFeatures(xna$E)
+computeStructuralMetrics(xcomplete, 
+                         # "group" denotes experimental condition (e.g. control, treatment etc).
+                         group, 
+                         xna$E[rownames(top.hvp)[1:50],], 
+                         k = 2) 
+
+
 ```
 
-See [user manual](https://github.com/DavisLaboratory/msImpute/blob/master/msImpute_1.2.0.pdf) for help. 
+See [user manual](https://github.com/DavisLaboratory/msImpute/blob/master/msImpute_1.3.0.pdf) for help. 
 
 
 **Reference**
diff --git a/man/CPD.Rd b/man/CPD.Rd
index dce6b8c..0ab15be 100755
--- a/man/CPD.Rd
+++ b/man/CPD.Rd
@@ -7,9 +7,9 @@
 CPD(xorigin, ximputed)
 }
 \arguments{
-\item{xorigin}{numeric matrix. The original data. Can not contain missing values.}
+\item{xorigin}{numeric matrix. The original log-intensity data. Can not contain missing values.}
 
-\item{ximputed}{numeric matrix. The imputed data. Can not contain missing values.}
+\item{ximputed}{numeric matrix. The imputed log-intensity data. Can not contain missing values.}
 }
 \value{
 numeric
diff --git a/man/KNC.Rd b/man/KNC.Rd
index eb26957..8163e2d 100755
--- a/man/KNC.Rd
+++ b/man/KNC.Rd
@@ -7,9 +7,9 @@
 KNC(xorigin, ximputed, class, k = 3)
 }
 \arguments{
-\item{xorigin}{numeric matrix. The original data. Can contain missing values.}
+\item{xorigin}{numeric matrix. The original log-intensity data. Can contain missing values.}
 
-\item{ximputed}{numeric matrix. The imputed data.}
+\item{ximputed}{numeric matrix. The imputed log-intensity data.}
 
 \item{class}{factor. A vector of length number of columns (samples) in the data specifying the class/label (i.e. experimental group) of each sample.}
 
diff --git a/man/KNN.Rd b/man/KNN.Rd
index df36b7a..0477998 100755
--- a/man/KNN.Rd
+++ b/man/KNN.Rd
@@ -7,9 +7,9 @@
 KNN(xorigin, ximputed, k = 3)
 }
 \arguments{
-\item{xorigin}{numeric matrix. The original data. Can not contain missing values.}
+\item{xorigin}{numeric matrix. The original log-intensity data. Can not contain missing values.}
 
-\item{ximputed}{numeric matrix. The imputed data. Can not contain missing values.}
+\item{ximputed}{numeric matrix. The imputed log-intensity data. Can not contain missing values.}
 
 \item{k}{number of nearest neighbours. default to k=3.}
 }
diff --git a/man/computeStructuralMetrics.Rd b/man/computeStructuralMetrics.Rd
index 7f3316b..f1b0690 100644
--- a/man/computeStructuralMetrics.Rd
+++ b/man/computeStructuralMetrics.Rd
@@ -4,23 +4,25 @@
 \alias{computeStructuralMetrics}
 \title{Metrics for the assessment of post-imputation structural preservation}
 \usage{
-computeStructuralMetrics(x, group, xna = NULL)
+computeStructuralMetrics(x, group = NULL, y = NULL, k = 2)
 }
 \arguments{
-\item{x}{numeric matrix. An imputed data matrix.}
+\item{x}{numeric matrix. An imputed data matrix of log-intensity.}
 
 \item{group}{factor. A vector of biological groups, experimental conditions or phenotypes (e.g. control, treatment).}
 
-\item{xna}{numeric matrix. Data matrix with missing values (i.e. the original intensity matrix with NAs)}
+\item{y}{numeric matrix. The source data (i.e. the original log-intensity matrix), preferably subsetted on highly variable peptides (see \code{findVariableFeatures}).}
+
+\item{k}{numeric. Number of Principal Components used to compute the GW distance. default to 2.}
 }
 \value{
 list of three metrics: withinness (sum of squared distances within a phenotype group),
 betweenness (sum of squared distances between the phenotypes), and gromov-wasserstein distance (if \code{xna} is not NULL).
-All metrics are on log scale.
+If \code{group} is NULL, only the GW distance is returned. All metrics are on log scale.
 }
 \description{
 For an imputed dataset, it computes within phenotype/experimental condition similarity (i.e. preservation of local structures),
-between phenotype distances (preservation of global structures), and the Gromov-Wasserstein (GW) distance between original and
+between phenotype distances (preservation of global structures), and the Gromov-Wasserstein (GW) distance between original (source) and
 imputed data.
 }
 \details{
@@ -32,10 +34,16 @@ group distances, hence smaller withinness, and maximizes between group distances
 The GW metric considers preservation of both local and global structures simultaneously. A small GW distance suggests that
 imputation has introduced small distortions to global and local structures overall, whereas a large distance implies significant
 distortions. When comparing two or more imputation methods, the optimal method is the method with smallest GW distance.
-To compute the GW distance, the missing values in each column of \code{xna} are replaced by mean of observed values in that column.
-This is equivalent to imputation by KNN, where k is set to the total number of identified peptides (i.e. number of rows in the input matrix).
-GW distance estimation requires \code{python}. See example.
-All metrics are on log scale.
+The GW distance is computed on Principal Components (PCs) of the source and imputed data, instead of peptides. Principal components capture the
+geometry of the data, hence GW computed on PCs is a better measure of preservation of local and global structures. The PCs in the source data are
+recommended to be computed on peptides with high biological variance. Hence, users are recommended to subset the source data only on highly variable peptides (hvp)
+(see \code{findVariableFeatures}). Since the hvp peptides have high biological variance, they are likely to have enough information to discriminate samples
+from different experimental groups. Hence, PCs computed on those peptides should be representative of the original source data with missing values.
+If the samples cluster by experimental group in the first couple of PCs, then a choice of k=2 is reasonable. If the desired separation/clustering of samples
+occurs in later PCs (i.e. the first few PCs are dominated by batches or unwanted variability), then it is recommended to use a larger number of PCs to compute the
+GW metric. If you are interested in how well the imputed data represent the original data in all possible dimensions, then set k to the number of samples
+in the data (i.e. the number of columns in the intensity matrix).
+GW distance estimation requires \code{python}. See example. All metrics are on log scale.
 }
 \examples{
 # To compute the GW distance you need to have python installed
@@ -59,19 +67,24 @@ use_virtualenv("msImpute-reticulate")
 # you can then run the computeStructuralMetrics() function.
 # Note that the reticulate package should be loaded before loading msImpute.
 set.seed(101)
-n=200
-p=100
-J=50
+n=12000
+p=10
+J=5
 np=n*p
 missfrac=0.3
-x=matrix(rnorm(n*J),n,J)\%*\%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)\%*\%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+  matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 ix=seq(np)
 imiss=sample(ix,np*missfrac,replace=FALSE)
 xna=x
 xna[imiss]=NA
+keep <- (rowSums(!is.na(xna)) >= 4)
+xna <- xna[keep,]
+rownames(xna) <- 1:nrow(xna)
 y <- xna
 xna <- scaleData(xna)
 xcomplete <- msImpute(object=xna)
-G <- as.factor(sample(1:5, 100, replace = TRUE))
-computeStructuralMetrics(xcomplete, G, y)
+G <- as.factor(sample(1:3, p, replace = TRUE))
+top.hvp <- findVariableFeatures(y)
+computeStructuralMetrics(xcomplete, G, y[rownames(top.hvp)[1:50],], k = 2)
 }
diff --git a/man/findVariableFeatures.Rd b/man/findVariableFeatures.Rd
new file mode 100644
index 0000000..90a7bd6
--- /dev/null
+++ b/man/findVariableFeatures.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/findVariableFeatures.R
+\name{findVariableFeatures}
+\alias{findVariableFeatures}
+\title{Find highly variable peptides}
+\usage{
+findVariableFeatures(y)
+}
+\arguments{
+\item{y}{numeric matrix giving log-intensity. Can contain NA values.}
+}
+\value{
+A data frame where rows are peptides and columns contain estimates of biological and technical variances. Peptides are ordered by biological variance.
+}
+\description{
+For each peptide, the total variance is decomposed into biological and technical variance using package \code{scran}
+}
+\details{
+A loess trend is fitted to total sample variances and mean intensities. For each peptide, the biological variance is then
+computed by subtracting the estimated technical variance from the loess fit from the total sample variance.
+}
+\seealso{
+computeStructuralMetrics
+}
diff --git a/man/msImpute.Rd b/man/msImpute.Rd
index f10287d..cdbf928 100755
--- a/man/msImpute.Rd
+++ b/man/msImpute.Rd
@@ -9,7 +9,7 @@ msImpute(object, rank.max = NULL, lambda = NULL, thresh = 1e-05,
   final.svd = TRUE)
 }
 \arguments{
-\item{object}{Numeric matrix  where missing values are denoted by NA. Rows are peptides, columns are samples.}
+\item{object}{Numeric matrix giving log-intensity where missing values are denoted by NA. Rows are peptides, columns are samples.}
 
 \item{rank.max}{Numeric. This restricts the rank of the solution. is set to min(dim(\code{object})-1) by default.}
 
@@ -44,16 +44,19 @@ For more details on the underlying algorithm, please see \code{\link[softImpute]
 }
 \examples{
 set.seed(101)
-n=200
-p=100
-J=50
+n=12000
+p=10
+J=5
 np=n*p
 missfrac=0.3
-x=matrix(rnorm(n*J),n,J)\%*\%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)\%*\%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+  matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 ix=seq(np)
 imiss=sample(ix,np*missfrac,replace=FALSE)
 xna=x
 xna[imiss]=NA
+keep <- (rowSums(!is.na(xna)) >= 4)
+xna <- xna[keep,]
 xna <- scaleData(xna)
 xcomplete <- msImpute(object=xna)
 }
diff --git a/man/scaleData.Rd b/man/scaleData.Rd
index f3132bd..c9c30ff 100755
--- a/man/scaleData.Rd
+++ b/man/scaleData.Rd
@@ -9,7 +9,7 @@ scaleData(object, maxit = 20, thresh = 1e-09, row.center = TRUE,
   trace = FALSE)
 }
 \arguments{
-\item{object}{numeric matrix where missing values are denoted by NA. Rows are peptides, columns are samples.}
+\item{object}{numeric matrix giving log-intensity where missing values are denoted by NA. Rows are peptides, columns are samples.}
 
 \item{maxit}{numeric. maximum iteration for the algorithm to converge (default to 20). When both row and column centering/scaling is requested, iteration may be necessary.}
 
@@ -33,19 +33,23 @@ Standardize a matrix to have optionally row means zero and variances one, and/or
 }
 \details{
 Standardizes rows and/or columns of a matrix with missing values, according to the \code{biScale} algorithm in Hastie et al. 2015.
+Data is assumed to be normalised and log-transformed.
 }
 \examples{
 set.seed(101)
-n=200
-p=100
-J=50
+n=12000
+p=10
+J=5
 np=n*p
 missfrac=0.3
-x=matrix(rnorm(n*J),n,J)\%*\%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)\%*\%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+  matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 ix=seq(np)
 imiss=sample(ix,np*missfrac,replace=FALSE)
 xna=x
 xna[imiss]=NA
+keep <- (rowSums(!is.na(xna)) >= 4)
+xna <- xna[keep,]
 xna <- scaleData(xna)
 }
 \seealso{
diff --git a/man/selectFeatures.Rd b/man/selectFeatures.Rd
index 91d83e7..9250d72 100755
--- a/man/selectFeatures.Rd
+++ b/man/selectFeatures.Rd
@@ -7,7 +7,7 @@
 selectFeatures(object, n_features = 500, suppress_plot = FALSE)
 }
 \arguments{
-\item{object}{Numeric matrix where missing values are denoted by NA.
+\item{object}{Numeric matrix giving log-intensity where missing values are denoted by NA.
 Rows are peptides, columns are samples.}
 
 \item{n_features}{Numeric, number of features with high dropout rate. 500 by default.}
@@ -24,16 +24,19 @@ are Missing At Random (MAR).
 }
 \examples{
 set.seed(101)
-n=800
-p=100
-J=50
+n=12000
+p=10
+J=5
 np=n*p
 missfrac=0.3
-x=matrix(rnorm(n*J),n,J)\%*\%matrix(rnorm(J*p),J,p)+matrix(rnorm(np),n,p)/5
+x=matrix(rnorm(n*J,mean = 5,sd = 0.2),n,J)\%*\%matrix(rnorm(J*p, mean = 5,sd = 0.2),J,p)+
+  matrix(rnorm(np,mean = 5,sd = 0.2),n,p)/5
 ix=seq(np)
 imiss=sample(ix,np*missfrac,replace=FALSE)
 xna=x
 xna[imiss]=NA
+keep <- (rowSums(!is.na(xna)) >= 4)
+xna <- xna[keep,]
 rownames(xna) <- 1:nrow(xna)
 hdp <- selectFeatures(xna, n_features=500,  suppress_plot=FALSE)
 # construct matrix M to capture missing entries
diff --git a/msImpute_1.2.0.pdf b/msImpute_1.2.0.pdf
deleted file mode 100644
index e7e35fb..0000000
Binary files a/msImpute_1.2.0.pdf and /dev/null differ
diff --git a/msImpute_1.3.0.pdf b/msImpute_1.3.0.pdf
new file mode 100644
index 0000000..6df58a9
Binary files /dev/null and b/msImpute_1.3.0.pdf differ