Added new argument convert to augment() and survplot() for returning

data.frame in place of data.table. survplot() also gains a return.all argument to get all datasets out with just 1 argument. augment() always checks the argument more_status for missing data even if check_NA = FALSE.
contefranz · Jun 14, 2016 · a5aa158 · a5aa158
1 parent 1a9aafc
commit a5aa158
Show file tree

Hide file tree

Showing 5 changed files with 123 additions and 39 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -2,17 +2,38 @@
 
 ### Breaking changes
 
-* `augment()` gains the brand new argument `check_NA` which allows the user to decide if the function
+* `msmtools` can now run with R 3.0.0 and above for backward compatibility reasons.
+
+* `augment()` gains the new argument `check_NA` which allows the user to decide if the function
 should run some checks to find missing data in the following arguments: `data_key`, `n_events`, 
-`pattern`, `t_start` and `t_end`. Default is `FALSE`.
+`pattern`, `t_start` and `t_end`. Default is `FALSE`. Missing data checks are always carried out on
+`more_status`.
+
+* `augment()` gains the new argument `convert` which if set to `TRUE` efficiently converts the output 
+to the old school `data.frame` class. 
+
+* `survplot()` gains the new argument `return.all` which saves you some typing time when requesting both 
+the data of the Kaplan-Meier and the fitted survival. 
+
+* `survplot()` gains the new argument `convert` which if set to `TRUE` efficiently converts
+any object returned to the old school `data.frame` class.
 
 ### Changes in functions
 
+* `augment()` gets a whole new implementation which comes into play when `pattern` has only
+2 values ('alive' and 'dead'). Now the procedure runs with computational times similar to when
+`pattern` has 3 values. This is thanks to the fast joins adopted.
+
 * `augment()` now is much faster when defining the target size for the reshaping. This was a 
-bottlneck which caused memory issues and wasted time. 
+bottleneck which caused memory issues and wasted time. 
+
+* General memory optimization in the function `augment()`.
 
 ### Bug fixes
 
+* In `augment()`, the sequential status is now correctly computed. There was a wrong call which
+blocked the object defined by `n_events`.
+
 * In `augment()`, when `pattern` was detected with two unique values, inconsistent results were
 produced during the status flag assignment. This was due to a wrong rounding of the amount of 
 augmenting factor for each unit.

diff --git a/R/augment.R b/R/augment.R
@@ -43,6 +43,9 @@
 #' some it stops with error. Default is \code{FALSE} because \code{augment} is not intended for
 #' running consistency checks, beside what is mandatory, and because the procedure is
 #' computationally onerous and could cause memory overhead for very highly dimensional datasets.
+#' @param convert If \code{TRUE}, then the returned object is automatically converted to the
+#' class \code{data.frame}. This is done in place and comes at very low cost both from running time
+#' and memory consumption (See \code{\link[data.table]{setDF}}).
 #' @param verbose If \code{FALSE}, all information produced by \code{print}, \code{cat} and
 #' \code{message} are suppressed. All is done internally so that no global
 #' options are changed. \code{verbose} can be set to \code{FALSE} on all common OS
@@ -54,9 +57,10 @@
 #' it returns the subjects given by \code{data_key} where issues occurred before giving an
 #' error and stopping. If \code{n_events} is not passed, then the ordering procedure remains the
 #' same, but the progression number is created internally with the name \code{n_events}.
-#' @return A restructured long format dataset of class \code{"data.table"} where each row
-#' represents a specific transition for a given subject. Moreover, \code{augment} adds some
-#' important variables:\cr
+#' @return An augmented format dataset of class \code{data.table}, or \code{data.frame} when
+#' \code{convert} is \code{TRUE}, where each row represents a specific transition for a given subject.
+#' Moreover, \code{augment} adds some important variables:\cr
+#'
 #' -----\cr
 #' \emph{augmented}: the new timing variable for the process when looking at transitions. If
 #' \code{t_augmented} is missing, then \code{augment} creates \emph{augmented} by default.
@@ -102,7 +106,7 @@
 #' @export
 augment = function( data, data_key, n_events, pattern, state = list ( 'IN', 'OUT', 'DEAD' ),
                     t_start, t_end, t_cens, t_death, t_augmented = 'augmented',
-                    more_status, check_NA = FALSE, verbose = TRUE ) {
+                    more_status, check_NA = FALSE, convert = FALSE, verbose = TRUE ) {
 
   tic = proc.time()
   status         = NULL
@@ -452,7 +456,7 @@ augment = function( data, data_key, n_events, pattern, state = list ( 'IN', 'OUT
     dead_out_long[ , status := status_flag_out ]
     l = list( alive_long, dead_in_long, dead_out_long )
     final = rbindlist( l )
-    setkeyv( final, cols[[ 1 ]] )
+    setkeyv( final, cols )
     rm( alive, alive_long, dead_in, dead_in_long, dead_out, dead_out_long )
     cat( '---\n' )
   } else if ( length( values ) == 3 ) {
@@ -614,6 +618,10 @@ augment = function( data, data_key, n_events, pattern, state = list ( 'IN', 'OUT
     sink()
   }
   options( warn = oldw )
+  if ( convert == TRUE ) {
+    setDF( final )
+    return( final )
+  }
   return( final )
 }
 
diff --git a/R/survplot.R b/R/survplot.R
@@ -1,7 +1,7 @@
 #' Plot and get survival data from a multi-state model.
 #'
 #' Plot a Kaplan-Meier curve and compare it with the fitted survival probability computed from a
-#' \code{\link[msm]{msm}} model. Fast build and return the associated datasets.
+#' \code{\link[msm]{msm}} model. Fast builds and returns the associated datasets.
 #'
 #' @param x A \code{msm} object.
 #' @param from State from which to compute the estimated survival. Default to state 1.
@@ -22,12 +22,22 @@
 #' @param grid An integer which tells at how many points to compute the fitted survival.
 #' If \code{times} is passed, \code{grid} is ignored. It has a default of 100 points.
 #' @param km If \code{TRUE}, then the Kaplan-Meier curve is shown. Default is \code{FALSE}.
-#' @param return.km If \code{TRUE}, then a \code{data.table} is returned. Default is \code{FALSE}.
+#' @param return.all If \code{TRUE}, then all the datasets used to draw the plot will be returned to
+#' the environment. This argument saves you some typing time since you do not have to pass either
+#' \code{return.km} or \code{return.p}. Default is \code{FALSE}.
+#' @param return.km If \code{TRUE}, then the dataset used for building the Kaplan-Meier is returned
+#' as an object of class \code{data.table} unless \code{convert} is set to \code{TRUE}
+#' (See \code{convert}). Default is \code{FALSE}.
 #' \code{survplot} must be assigned to an object in order to get the data in the environment
 #' (see 'Value').
-#' @param return.p If \code{TRUE}, then a \code{data.table} is returned. Default is \code{FALSE}.
+#' @param return.p If \code{TRUE}, then the dataset used for building the fitted survival curve
+#' is returned as an object of class \code{data.table} unless \code{convert} is set to \code{TRUE}
+#' (See \code{convert}). Default is \code{FALSE}.
 #' \code{survplot} must be assigned to an object in order to get the data in the environment
 #' (see 'Value').
+#' @param convert If \code{TRUE}, then any returned object is automatically converted to the
+#' class \code{data.frame}. This is done in place and comes at very low cost both from running time
+#' and memory consumption (See \code{\link[data.table]{setDF}}).
 #' @param add If \code{TRUE}, then a new layer is added to the current plot. Default is \code{FALSE}.
 #' @param ci If \code{"none"} (the default), then no confidence intervals are plotted.
 #' If \code{"normal"} or \code{"bootstrap"}, confidence intervals are plotted based on the
@@ -122,8 +132,7 @@
 #' survplot( msm_model, km = TRUE, ci = 'none', verbose = FALSE, devnew = FALSE )
 #'
 #' # returning fitted and empirical data
-#' all_data = survplot( msm_model, ci = 'none',
-#'                      return.km = TRUE, return.p = TRUE,
+#' all_data = survplot( msm_model, ci = 'none', return.all = TRUE,
 #'                      verbose = FALSE, do.plot = FALSE )
 #' }
 #'
@@ -147,7 +156,8 @@
 #' @export
 survplot = function( x, from = 1, to = NULL, range = NULL, covariates = "mean",
                      exacttimes = TRUE, times, grid = 100L,
-                     km = FALSE, return.km = FALSE, return.p = FALSE, add = FALSE,
+                     km = FALSE, return.all = FALSE, return.km = NULL, return.p = NULL,
+                     convert = FALSE, add = FALSE,
                      ci = c( "none", "normal", "bootstrap" ), interp = c( "start", "midpoint" ),
                      B = 100L, legend.pos = 'topright',
                      xlab = "Time", ylab = "Survival Probability",
@@ -241,7 +251,7 @@ survplot = function( x, from = 1, to = NULL, range = NULL, covariates = "mean",
       lines( times, 1 - upper, lwd = lwd.ci.fit, lty = lty.ci.fit, col = col.ci.fit )
     }
   }
-  if ( km == TRUE || return.km == TRUE ) {
+  if ( return.all == TRUE || km == TRUE || return.km == TRUE ) {
     dat = as.data.table( x$data$mf[ , c( "(subject)", "(time)", "(state)" ) ] )
     setnames( dat, c( 'subject', 'time', 'state' ) )
     absind = which( dat$state == to )
@@ -289,14 +299,42 @@ survplot = function( x, from = 1, to = NULL, range = NULL, covariates = "mean",
     sink()
   }
   options( warn = oldw )
-  if ( return.km == TRUE && return.p == TRUE ) {
+
+  if ( return.all == TRUE ) {
+    if ( !is.null( return.km ) && return.km == FALSE ) {
+        stop( 'return.all is TRUE, but return.km is FALSE. Please, leave it set to NULL' )
+    }
+    if ( !is.null( return.p ) && return.p == FALSE ) {
+      stop( 'return.all is TRUE, but return.p is FALSE. Please, leave it set to NULL' )
+    }
+    if ( !is.null( return.km ) && return.km == TRUE ) {
+      warning( 'return.all is already set to TRUE. return.km will be ignored' )
+    }
+    if ( !is.null( return.p ) && return.p == TRUE ) {
+      warning( 'return.all is already set to TRUE. return.p will be ignored' )
+    }
+    if ( convert == TRUE ) {
+      setDF( wide )
+      return( invisible( list( km = wide,
+                               fitted = data.frame( time = times,
+                                                    probs = round( 1 - pr, 4 ) ) ) ) )
+    }
     return( invisible( list( km = wide,
-                  fitted = data.table( time = times,
-                                             probs = round( 1 - pr, 4 ) ) ) ) )
-  } else if ( return.km == TRUE && return.p == FALSE ) {
-    return( invisible( wide ) )
-  } else if ( return.km == FALSE && return.p == TRUE ) {
-    return( invisible( data.table( time = times, probs = round( 1 - pr, 4 ) ) ) )
+                             fitted = data.table( time = times,
+                                                  probs = round( 1 - pr, 4 ) ) ) ) )
+  } else {
+    if ( return.km == TRUE && return.p == FALSE ) {
+      if ( convert == TRUE ) {
+        setDF( wide )
+        return( invisible( wide ) )
+      }
+      return( invisible( wide ) )
+    } else if ( return.km == FALSE && return.p == TRUE ) {
+      if ( convert == TRUE ) {
+        return( invisible( data.frame( time = times, probs = round( 1 - pr, 4 ) ) ) )
+      }
+      return( invisible( data.table( time = times, probs = round( 1 - pr, 4 ) ) ) )
+    }
   }
 }
 

diff --git a/man/augment.Rd b/man/augment.Rd
diff --git a/man/survplot.Rd b/man/survplot.Rd