Merge branch 'alphadev'

contefranz · May 27, 2017 · 544de9f · 544de9f
2 parents 9967280 + 8433b24
commit 544de9f
Show file tree

Hide file tree

Showing 9 changed files with 242 additions and 14 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(augment)
 export(prevplot)
+export(quake)
 export(survplot)
 import(data.table)
 importFrom(grDevices,dev.cur)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,15 @@
+------
+
 # msmtools 1.3
 
+### Major changes
+
+* The new function `quake()` is introduced. This adds support in the preprocessing 
+part of the analysis. `quake()` addresses the specific problem of different 
+transitions occurring at the same exact time. This is a case for which a 
+multi-state model fails to estimate the probability associated with the
+two transitions.
+
 ### Minor changes
 
 * Global variables are now correctly declared on top of functions using 
@@ -23,6 +33,13 @@ francesco.grossetti@unibocconi.it.
 first call. This means that you can finally print on console the augmented dataset
 right away.
 
+* `pandoc` versions prior 1.17 does not fully support spaces in file names and 
+caused a warning when compiling `msmtools` under Fedora using both `clang` 
+ang `gcc`. Now all file names are without spaces. `msmtools` 1.3 has been built
+using `pandoc` 1.19.2 and `pandoc-citeproc` 0.10.4.1
+
+------
+
 # msmtools 1.2
 
 ### Major changes
@@ -75,7 +92,7 @@ blocked the object defined by `n_events`.
 produced during the status flag assignment. This was due to a wrong rounding of the amount of 
 augmenting factor for each unit.
 
----
+------
 
 # msmtools 1.1
 

diff --git a/R/augment.R b/R/augment.R
@@ -286,7 +286,7 @@ augment = function( data, data_key, n_events, pattern, state = list ( 'IN', 'OUT
   }
   if ( check_NA == TRUE ) {
     if ( verbose == TRUE ) {
-      message( 'checking for any missing data in function arguments' )
+      message( 'checking for any missing values in function arguments' )
     }
     checks = c( cols, pattern, t_start, t_end )
     test = apply( data[ , checks, with = FALSE ], 2, function( x ) any( sum( is.na( x ) ) > 0 ) )
@@ -298,7 +298,7 @@ augment = function( data, data_key, n_events, pattern, state = list ( 'IN', 'OUT
       invisible( sapply( names( test[ test == TRUE ] ), function( x ) cat( x, '\n' ) ) )
       stop( 'Please, fix the issues and relaunch augment()' )
     } else {
-      cat( 'Ok, no missing data detected\n')
+      cat( 'Ok, no missing values detected\n')
       cat( '---\n' )
     }
   }

diff --git a/R/quake.R b/R/quake.R
@@ -0,0 +1,156 @@
+#' Delete different events occurring at the same time
+#'
+#' Fast algorithm to get rid of transitions to different states occurring at the same exact time
+#' when dealing with augmented data as computed by \code{augment}.
+#'
+#' @param data An augmented \code{data.table} or \code{data.frame} object where each row
+#' represents a transition. If \code{data} is a \code{data.frame}, then \code{quake} internally
+#' casts it to a \code{data.table}.
+#' @param data_key A keying variable which \code{quake} uses to define a key for \code{data}.
+#' This represents the subject ID (See \code{\link[data.table]{setkey}}).
+#' @param pattern ID status at the end of the study as passed to \code{augment} (See
+#' \code{\link[msmtools]{augment}}).
+#' @param target The target variable to check duplicates. By default it is set to 'augmented_int'.
+#' @param check_NA If \code{TRUE}, then arguments \code{data_key}, \code{pattern},
+#' and \code{target} are looked up for any missing data and if the function finds
+#' any, it stops with error. Default is \code{FALSE}.
+#' @param verbose If \code{FALSE}, all information produced by \code{print}, \code{cat} and
+#' \code{message} are suppressed. All is done internally so that no global
+#' options are changed. \code{verbose} can be set to \code{FALSE} on all common OS
+#' (see also \code{\link[base]{sink}} and \code{\link[base]{options}}). Default is \code{TRUE}.
+#'
+#' @details blablabla some details to write down
+#' @seealso \code{\link[msmtools]{augment}}
+#'
+#' @examples
+#' data( hosp )
+#' hosp_aug = augment( data = hosp, data_key = subj, n_events = adm_number, pattern = label_3,
+#'                     t_start = dateIN, t_end = dateOUT, t_cens = dateCENS )
+#' hosp_aug_clean = quake( data = hosp_aug, data_key = subj, pattern = label_3 )
+#'
+#' @author Francesco Grossetti \email{francesco.grossetti@@polimi.it}.
+#' @import data.table
+#' @export
+
+quake = function( data, data_key, pattern, target, check_NA = FALSE, verbose = TRUE ) {
+
+  tic = proc.time()
+  index = NULL
+
+  if ( missing( data ) ) {
+    stop( 'a dataset of class data.table or data.frame must be provided' )
+  }
+  if ( !inherits( data, "data.table" ) && !inherits( data, "data.frame" ) ) {
+    stop( "a dataset of class data.table or data.frame must be provided" )
+  }
+  if ( missing( data_key ) ) {
+    stop( 'a variable of keying must be provided' )
+  }
+  if ( missing( pattern ) ) {
+    stop( "a pattern must be provided" )
+  }
+  if ( inherits( data, 'data.frame' ) ) {
+    setDT( data )
+  }
+  if ( verbose == TRUE ) {
+    cat( '-------------------------------------\n' )
+    cat( '# # # # setting everything up # # # #\n' )
+    cat( '-------------------------------------\n' )
+  }
+
+  setkey( data, NULL )
+  cols = as.character( substitute( list( data_key ) )[ -1L ] )
+  if ( !length( cols ) ) {
+    cols = colnames( data )
+  }
+  setkeyv( data, cols )
+  pattern = as.character( substitute( list( pattern ) )[ -1L ] )
+  if ( missing( target ) ) {
+    target = 'augmented_int'
+  } else {
+    target = as.character( substitute( list( target ) )[ -1L ] )
+  }
+
+  if ( check_NA == TRUE ) {
+    if ( verbose == TRUE ) {
+      message( 'checking for any missing values in function arguments' )
+    }
+    checks = c( cols, pattern, target )
+    test = apply( data[ , checks, with = FALSE ], 2, function( x ) any( sum( is.na( x ) ) > 0 ) )
+    if ( any ( test ) ) {
+      cat( '---\n' )
+      if ( verbose == TRUE ) {
+        message( 'detected missing values in the following variables:' )
+      }
+      invisible( sapply( names( test[ test == TRUE ] ), function( x ) cat( x, '\n' ) ) )
+      stop( 'Please, fix the issues and relaunch shiver()' )
+    } else {
+      cat( 'Ok, no missing values detected\n')
+      cat( '---\n' )
+    }
+  }
+
+  data[ , index := sequence( .N ) ]
+  n_patients = uniqueN( eval( substitute( data$cols ) ) )
+  values = sort( eval( substitute( unique( data$pattern ) ) ) )
+  if ( length( values ) < 2 ) {
+    stop( 'unit identification label must be an integer, a factor or a character
+          with at least 2 elements' )
+  }
+
+  alive = data[ get( pattern ) == values[ 1 ] ]
+  alive.last = alive[ alive[ , .I[ .N ], by = eval( cols ) ]$V1 ]
+  setkey( alive.last, index )
+  setkey( alive, index )
+  alive.no.last = alive[ !alive.last ]
+
+  if ( verbose == TRUE ) {
+    message( 'checking ', substitute( pattern ), ' and defining patterns' )
+  }
+  if ( length( values ) == 2 ) {
+    cat( 'detected only 2 values\n' )
+    cat( '---\n' )
+    dead = data[ get( pattern ) == values[ 2 ] ]
+  } else if ( length( values ) == 3 ) {
+    cat( 'Ok, detected 3 values\n' )
+    dead = data[ get( pattern ) != values[ 1 ] ]
+    cat( '---\n' )
+  }
+
+  l = list( alive.no.last, dead )
+  data.no.last.event = rbindlist( l )
+  row.duplicated = duplicated( data.no.last.event, by = c( eval( cols ), eval( target ) ) )
+  duplicated = data.no.last.event[ row.duplicated == TRUE ]
+  n_duplicated = uniqueN( eval( substitute( duplicated$cols ) ) )
+  setkeyv( duplicated, cols )
+
+  if ( n_duplicated == 0 ) {
+    message( 'Hurray! No duplicated occurrences have been found in ', substitute( data ),
+             ' according to variable ', substitute( target ) )
+  } else {
+    message( 'Spotted ', n_duplicated,
+             ' patients with at least a duplicated occurrence according to variable ',
+             substitute( target ) )
+    data.clean = data[ !duplicated ]
+    n_patients.to.keep = uniqueN( eval( substitute( data.clean$cols ) ) )
+    cat( n_patients.to.keep, ' patients have been reained corresponding to ',
+         round( 100 * ( n_patients.to.keep / n_patients ), 2 ), '%\n', sep = '' )
+    cat( 'Duplicated patients have been sucessfully removed\n' )
+  }
+
+  data[ , index := NULL ]
+  if ( n_duplicated > 0 ) {
+    data.clean[ , index := NULL ]
+  }
+  toc = proc.time()
+  time = toc - tic
+  cat( '---------------------------\n' )
+  cat( 'quake() took:', time[ 3 ], 'sec. \n', sep = ' ' )
+  cat( '---------------------------\n' )
+
+  if ( n_duplicated == 0 ) {
+    return( invisible( data ) )
+  } else {
+    return( invisible( data.clean ) )
+  }
+}
diff --git a/R/survplot.R b/R/survplot.R
@@ -73,6 +73,7 @@ if ( getRversion() >= "2.15.1" ) {
 #' legend is not shown.
 #' @param xlab \emph{x} axis label.
 #' @param ylab \emph{y} axis label.
+#' @param main The main title of the plot(s) as character. Default is \code{NULL}.
 #' @param lty.fit Line type for the fitted curve. See \code{\link[graphics]{par}}.
 #' @param lwd.fit Line width for the fitted curve. See \code{\link[graphics]{par}}.
 #' @param col.fit Line color for the fitted curve. See \code{\link[graphics]{par}}.
@@ -210,7 +211,7 @@ survplot = function( x, from = 1, to = NULL, range = NULL, covariates = "mean",
                      convert = FALSE, add = FALSE,
                      ci = c( "none", "normal", "bootstrap" ), interp = c( "start", "midpoint" ),
                      B = 100L, legend.pos = 'topright',
-                     xlab = "Time", ylab = "Survival Probability",
+                     xlab = "Time", ylab = "Survival Probability", main = NULL,
                      lty.fit = 1, lwd.fit = 1, col.fit = "red", lty.ci.fit = 3, lwd.ci.fit = 1,
                      col.ci.fit = col.fit, mark.time = FALSE,
                      lty.km = 5, lwd.km = 1, col.km = "darkblue",
@@ -288,7 +289,7 @@ survplot = function( x, from = 1, to = NULL, range = NULL, covariates = "mean",
         dev.set( dev.cur() )
       }
       plot( times, 1 - pr, type = "l", xlab = xlab, ylab = ylab, ylim = c( 0, 1 ),
-            lwd = lwd.fit, lty = lty.fit, col = col.fit )
+            lwd = lwd.fit, lty = lty.fit, col = col.fit, main = main )
     } else {
       lines( times, 1 - pr, lwd = lwd.fit, lty = lty.fit, col = col.fit )
     }

diff --git a/man/quake.Rd b/man/quake.Rd
diff --git a/man/survplot.Rd b/man/survplot.Rd
diff --git a/vignettes/msmtools.R b/vignettes/msmtools.R
@@ -1,12 +1,12 @@
 ## ----ninja, echo = FALSE-------------------------------------------------
 library( msmtools )
 
-## ----long example, collapse = TRUE---------------------------------------
+## ----long_example, collapse = TRUE---------------------------------------
 data( hosp )
 hosp[ 1:17, .( subj, adm_number, gender, age, label_2, 
                dateIN, dateOUT, dateCENS ) ]
 
-## ----running augment, collapse = TRUE------------------------------------
+## ----running_augment, collapse = TRUE------------------------------------
 hosp_augmented = augment( data = hosp, data_key = subj, 
                           n_events = adm_number, pattern = label_2,
                           t_start = dateIN, t_end = dateOUT, 
@@ -19,7 +19,7 @@ hosp_augmented[ 1:35, .( subj, adm_number, gender, age, label_2,
 hosp[ 18:28, .( subj, adm_number, rehab, it, rehab_it,
                 dateIN, dateOUT, dateCENS ) ]
 
-## ----complex status, collapse = TRUE-------------------------------------
+## ----complex_status, collapse = TRUE-------------------------------------
 hosp_augmented_more = augment( data = hosp, data_key = subj,
                                n_events = adm_number, pattern = label_2,
                                t_start = dateIN, t_end = dateOUT,
@@ -29,7 +29,7 @@ hosp_augmented_more = augment( data = hosp, data_key = subj,
 hosp_augmented_more[ 36:60, .( subj, adm_number, rehab_it,
                                augmented, status, status_exp, n_status_exp ) ]
 
-## ----multistate model, collapse = TRUE-----------------------------------
+## ----multistate_model, collapse = TRUE-----------------------------------
 # let's define the initial transition matrix for our model
 Qmat = matrix( data = 0, nrow = 3, ncol = 3, byrow = TRUE )
 Qmat[ 1, 1:3 ] = 1
@@ -49,15 +49,15 @@ msm_model = msm( status_num ~ augmented_int,
                  control = list( fnscale = 6e+05, trace = 0,
                                  REPORT = 1, maxit = 10000 ) )
 
-## ----survplot 1, fig.align = 'center', fig.width = 5, fig.height = 4-----
+## ----survplot_1, fig.align = 'center', fig.width = 5, fig.height = 4-----
 survplot( msm_model, km = TRUE, ci = 'none',
           verbose = FALSE, devnew = FALSE )
 
-## ----survplot 2, fig.align = 'center', fig.width = 5, fig.height = 4-----
+## ----survplot_2, fig.align = 'center', fig.width = 5, fig.height = 4-----
 survplot( msm_model, km = TRUE, from = 2, ci = 'none',
           verbose = FALSE, devnew = FALSE )
 
-## ----custom time seq, fig.align = 'center', fig.width = 5, fig.height = 4----
+## ----custom_time_seq, fig.align = 'center', fig.width = 5, fig.height = 4----
 time_seq = seq( 300, 800, by = 30 )
 survplot( msm_model, times = time_seq, ci = 'none',
           verbose = FALSE, devnew = FALSE )
@@ -99,7 +99,7 @@ all_data = survplot( msm_model, ci = 'none', grid = 10,
 # let's see the datasets
 all_data
 
-## ----splitting data, collapse = TRUE-------------------------------------
+## ----splitting_data, collapse = TRUE-------------------------------------
 # do not extract data using just one [].
 # This keeps the class, so it returns a list
 km_data_wrong = all_data[ 1 ]

diff --git a/vignettes/msmtools.pdf b/vignettes/msmtools.pdf