diff --git a/NAMESPACE b/NAMESPACE index 62ab098..356b383 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(augment) export(prevplot) +export(quake) export(survplot) import(data.table) importFrom(grDevices,dev.cur) diff --git a/NEWS.md b/NEWS.md index c7331ad..d554e5b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,15 @@ +------ + # msmtools 1.3 +### Major changes + +* The new function `quake()` is introduced. This adds support in the preprocessing +part of the analysis. `quake()` addresses the specific problem of different +transitions occurring at the same exact time. This is a case for which a +multi-state model fails to estimate the probability associated with the +two transitions. + ### Minor changes * Global variables are now correctly declared on top of functions using @@ -23,6 +33,13 @@ francesco.grossetti@unibocconi.it. first call. This means that you can finally print on console the augmented dataset right away. +* `pandoc` versions prior to 1.17 do not fully support spaces in file names and +caused a warning when compiling `msmtools` under Fedora using both `clang` +and `gcc`. Now all file names are without spaces. `msmtools` 1.3 has been built +using `pandoc` 1.19.2 and `pandoc-citeproc` 0.10.4.1 + +------ + # msmtools 1.2 ### Major changes @@ -75,7 +92,7 @@ blocked the object defined by `n_events`. produced during the status flag assignment. This was due to a wrong rounding of the amount of augmenting factor for each unit. 
---- +------ # msmtools 1.1 diff --git a/R/augment.R b/R/augment.R index 326390e..ef7844e 100644 --- a/R/augment.R +++ b/R/augment.R @@ -286,7 +286,7 @@ augment = function( data, data_key, n_events, pattern, state = list ( 'IN', 'OUT } if ( check_NA == TRUE ) { if ( verbose == TRUE ) { - message( 'checking for any missing data in function arguments' ) + message( 'checking for any missing values in function arguments' ) } checks = c( cols, pattern, t_start, t_end ) test = apply( data[ , checks, with = FALSE ], 2, function( x ) any( sum( is.na( x ) ) > 0 ) ) @@ -298,7 +298,7 @@ augment = function( data, data_key, n_events, pattern, state = list ( 'IN', 'OUT invisible( sapply( names( test[ test == TRUE ] ), function( x ) cat( x, '\n' ) ) ) stop( 'Please, fix the issues and relaunch augment()' ) } else { - cat( 'Ok, no missing data detected\n') + cat( 'Ok, no missing values detected\n') cat( '---\n' ) } } diff --git a/R/quake.R b/R/quake.R new file mode 100644 index 0000000..e36ef62 --- /dev/null +++ b/R/quake.R @@ -0,0 +1,156 @@ +#' Delete different events occurring at the same time +#' +#' Fast algorithm to get rid of transitions to different states occurring at the same exact time +#' when dealing with augmented data as computed by \code{augment}. +#' +#' @param data An augmented \code{data.table} or \code{data.frame} object where each row +#' represents a transition. If \code{data} is a \code{data.frame}, then \code{quake} internally +#' casts it to a \code{data.table}. +#' @param data_key A keying variable which \code{quake} uses to define a key for \code{data}. +#' This represents the subject ID (See \code{\link[data.table]{setkey}}). +#' @param pattern ID status at the end of the study as passed to \code{augment} (See +#' \code{\link[msmtools]{augment}}). +#' @param target The target variable to check duplicates. By default it is set to 'augmented_int'. 
+#' @param check_NA If \code{TRUE}, then arguments \code{data_key}, \code{pattern}, +#' and \code{target} are looked up for any missing data and if the function finds +#' any, it stops with error. Default is \code{FALSE}. +#' @param verbose If \code{FALSE}, all information produced by \code{print}, \code{cat} and +#' \code{message} are suppressed. All is done internally so that no global +#' options are changed. \code{verbose} can be set to \code{FALSE} on all common OS +#' (see also \code{\link[base]{sink}} and \code{\link[base]{options}}). Default is \code{TRUE}. +#' +#' @details Every subject showing a repeated value of \code{target} is dropped from \code{data} together with all of its rows. The search for repeated values skips the last row of each subject whose \code{pattern} is the first value in sorted order. +#' @seealso \code{\link[msmtools]{augment}} +#' +#' @examples +#' data( hosp ) +#' hosp_aug = augment( data = hosp, data_key = subj, n_events = adm_number, pattern = label_3, +#' t_start = dateIN, t_end = dateOUT, t_cens = dateCENS ) +#' hosp_aug_clean = quake( data = hosp_aug, data_key = subj, pattern = label_3 ) +#' +#' @author Francesco Grossetti \email{francesco.grossetti@@polimi.it}. 
+#' @import data.table
+#' @export
+
+quake = function( data, data_key, pattern, target, check_NA = FALSE, verbose = TRUE ) {
+  # Drops from data the subjects showing a repeated value of target; the result is returned invisibly.
+  tic = proc.time()
+  index = NULL  # presumably only declared to keep R CMD check quiet about the := column used below
+
+  if ( missing( data ) ) {
+    stop( 'a dataset of class data.table or data.frame must be provided' )
+  }
+  if ( !inherits( data, "data.table" ) && !inherits( data, "data.frame" ) ) {
+    stop( "a dataset of class data.table or data.frame must be provided" )
+  }
+  if ( missing( data_key ) ) {
+    stop( 'a variable of keying must be provided' )
+  }
+  if ( missing( pattern ) ) {
+    stop( "a pattern must be provided" )
+  }
+  # label used in the messages: taken before setDT(), after which substitute( data ) may differ
+  data_name = deparse( substitute( data ) )[ 1L ]
+  if ( inherits( data, 'data.frame' ) ) {
+    setDT( data )
+  }
+  if ( verbose == TRUE ) {
+    cat( '-------------------------------------\n' )
+    cat( '# # # # setting everything up # # # #\n' )
+    cat( '-------------------------------------\n' )
+  }
+
+  # data_key, pattern and target are given unquoted: turn them into column names
+  setkey( data, NULL )
+  cols = as.character( substitute( list( data_key ) )[ -1L ] )
+  if ( !length( cols ) ) {
+    cols = colnames( data )
+  }
+  setkeyv( data, cols )
+  pattern = as.character( substitute( list( pattern ) )[ -1L ] )
+  if ( missing( target ) ) {
+    target = 'augmented_int'
+  } else {
+    target = as.character( substitute( list( target ) )[ -1L ] )
+  }
+
+  if ( check_NA == TRUE ) {
+    if ( verbose == TRUE ) {
+      message( 'checking for any missing values in function arguments' )
+    }
+    checks = c( cols, pattern, target )
+    test = vapply( data[ , checks, with = FALSE ], anyNA, logical( 1L ) )
+    if ( any( test ) ) {
+      if ( verbose == TRUE ) {
+        cat( '---\n' )
+        message( 'detected missing values in the following variables:' )
+      }
+      # the offending variables are listed even when verbose is FALSE: the error is of no use without them
+      cat( paste0( names( test )[ test ], '\n' ), sep = '' )
+      stop( 'Please, fix the issues and relaunch quake()' )
+    } else if ( verbose == TRUE ) {
+      cat( 'Ok, no missing values detected\n' )
+      cat( '---\n' )
+    }
+  }
+
+  # temporary row id: dropped before returning, and by on.exit() when an error stops the function earlier
+  data[ , index := sequence( .N ) ]
+  on.exit( if ( 'index' %in% names( data ) ) data[ , index := NULL ], add = TRUE )
+  n_patients = uniqueN( data[[ cols ]] )
+  values = sort( unique( data[[ pattern ]] ) )
+  if ( length( values ) < 2 || length( values ) > 3 ) {
+    stop( 'unit identification label must be an integer, a factor or a character ',
+          'with 2 or 3 distinct values' )
+  }
+
+  # rows carrying the first sorted value of pattern, without the last row of each subject
+  alive = data[ get( pattern ) == values[ 1 ] ]
+  alive.last = alive[ alive[ , .I[ .N ], by = eval( cols ) ]$V1 ]
+  setkey( alive.last, index )
+  setkey( alive, index )
+  alive.no.last = alive[ !alive.last ]
+
+  if ( verbose == TRUE ) {
+    message( 'checking ', pattern, ' and defining patterns' )
+    cat( if ( length( values ) == 2 ) 'detected only 2 values\n' else 'Ok, detected 3 values\n' )
+    cat( '---\n' )
+  }
+  dead = data[ get( pattern ) != values[ 1 ] ]  # every row not carrying the first sorted value
+
+  # rows repeating an earlier ( subject, target ) pair among the rows selected above
+  data.no.last.event = rbindlist( list( alive.no.last, dead ) )
+  row.duplicated = duplicated( data.no.last.event, by = c( cols, target ) )
+  dup_rows = data.no.last.event[ row.duplicated ]
+  n_duplicated = uniqueN( dup_rows[[ cols ]] )
+  setkeyv( dup_rows, cols )
+
+  if ( n_duplicated == 0 ) {
+    if ( verbose == TRUE ) {
+      message( 'Hurray! No duplicated occurrences have been found in ', data_name,
+               ' according to variable ', target )
+    }
+  } else {
+    # anti-join on the key: all the rows of the spotted subjects are dropped
+    data.clean = data[ !dup_rows ]
+    data.clean[ , index := NULL ]
+    if ( verbose == TRUE ) {
+      message( 'Spotted ', n_duplicated,
+               ' patients with at least a duplicated occurrence according to variable ', target )
+      n_patients.to.keep = uniqueN( data.clean[[ cols ]] )
+      cat( n_patients.to.keep, ' patients have been retained corresponding to ',
+           round( 100 * ( n_patients.to.keep / n_patients ), 2 ), '%\n', sep = '' )
+      cat( 'Duplicated patients have been successfully removed\n' )
+    }
+  }
+
+  data[ , index := NULL ]
+  if ( verbose == TRUE ) {
+    time = proc.time() - tic
+    cat( '---------------------------\n' )
+    cat( 'quake() took:', time[ 3 ], 'sec. \n', sep = ' ' )
+    cat( '---------------------------\n' )
+  }
+
+  invisible( if ( n_duplicated == 0 ) data else data.clean )
+}
diff --git a/R/survplot.R b/R/survplot.R index 6524e09..8227871 100644 --- a/R/survplot.R +++ b/R/survplot.R @@ -73,6 +73,7 @@ if ( getRversion() >= "2.15.1" ) { #' legend is not shown. #' @param xlab \emph{x} axis label. #' @param ylab \emph{y} axis label. +#' @param main The main title of the plot(s) as character. Default is \code{NULL}. #' @param lty.fit Line type for the fitted curve. See \code{\link[graphics]{par}}. #' @param lwd.fit Line width for the fitted curve. See \code{\link[graphics]{par}}. #' @param col.fit Line color for the fitted curve. See \code{\link[graphics]{par}}. @@ -210,7 +211,7 @@ survplot = function( x, from = 1, to = NULL, range = NULL, covariates = "mean", convert = FALSE, add = FALSE, ci = c( "none", "normal", "bootstrap" ), interp = c( "start", "midpoint" ), B = 100L, legend.pos = 'topright', - xlab = "Time", ylab = "Survival Probability", + xlab = "Time", ylab = "Survival Probability", main = NULL, lty.fit = 1, lwd.fit = 1, col.fit = "red", lty.ci.fit = 3, lwd.ci.fit = 1, col.ci.fit = col.fit, mark.time = FALSE, lty.km = 5, lwd.km = 1, col.km = "darkblue", @@ -288,7 +289,7 @@ survplot = function( x, from = 1, to = NULL, range = NULL, covariates = "mean", dev.set( dev.cur() ) } plot( times, 1 - pr, type = "l", xlab = xlab, ylab = ylab, ylim = c( 0, 1 ), - lwd = lwd.fit, lty = lty.fit, col = col.fit ) + lwd = lwd.fit, lty = lty.fit, col = col.fit, main = main ) } else { lines( times, 1 - pr, lwd = lwd.fit, lty = lty.fit, col = col.fit ) } diff --git a/man/quake.Rd b/man/quake.Rd new file mode 100644 index 0000000..80676cf --- /dev/null +++ b/man/quake.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/quake.R +\name{quake} +\alias{quake} +\title{Delete different events occurring at the same time} +\usage{ +quake(data, 
data_key, pattern, target, check_NA = FALSE, verbose = TRUE) +} +\arguments{ +\item{data}{An augmented \code{data.table} or \code{data.frame} object where each row +represents a transition. If \code{data} is a \code{data.frame}, then \code{quake} internally +casts it to a \code{data.table}.} + +\item{data_key}{A keying variable which \code{quake} uses to define a key for \code{data}. +This represents the subject ID (See \code{\link[data.table]{setkey}}).} + +\item{pattern}{ID status at the end of the study as passed to \code{augment} (See +\code{\link[msmtools]{augment}}).} + +\item{target}{The target variable to check duplicates. By default it is set to 'augmented_int'.} + +\item{check_NA}{If \code{TRUE}, then arguments \code{data_key}, \code{pattern}, +and \code{target} are looked up for any missing data and if the function finds +any, it stops with error. Default is \code{FALSE}.} + +\item{verbose}{If \code{FALSE}, all information produced by \code{print}, \code{cat} and +\code{message} are suppressed. All is done internally so that no global +options are changed. \code{verbose} can be set to \code{FALSE} on all common OS +(see also \code{\link[base]{sink}} and \code{\link[base]{options}}). Default is \code{TRUE}.} +} +\description{ +Fast algorithm to get rid of transitions to different states occurring at the same exact time +when dealing with augmented data as computed by \code{augment}. +} +\details{ +Every subject showing a repeated value of \code{target} is dropped from \code{data} together with all of its rows. The search for repeated values skips the last row of each subject whose \code{pattern} is the first value in sorted order. +} +\examples{ +data( hosp ) +hosp_aug = augment( data = hosp, data_key = subj, n_events = adm_number, pattern = label_3, + t_start = dateIN, t_end = dateOUT, t_cens = dateCENS ) +hosp_aug_clean = quake( data = hosp_aug, data_key = subj, pattern = label_3 ) + +} +\author{ +Francesco Grossetti \email{francesco.grossetti@polimi.it}. 
+} +\seealso{ +\code{\link[msmtools]{augment}} +} + diff --git a/man/survplot.Rd b/man/survplot.Rd index 79b11d7..420d6df 100644 --- a/man/survplot.Rd +++ b/man/survplot.Rd @@ -9,7 +9,7 @@ survplot(x, from = 1, to = NULL, range = NULL, covariates = "mean", return.km = NULL, return.p = NULL, convert = FALSE, add = FALSE, ci = c("none", "normal", "bootstrap"), interp = c("start", "midpoint"), B = 100L, legend.pos = "topright", xlab = "Time", - ylab = "Survival Probability", lty.fit = 1, lwd.fit = 1, + ylab = "Survival Probability", main = NULL, lty.fit = 1, lwd.fit = 1, col.fit = "red", lty.ci.fit = 3, lwd.ci.fit = 1, col.ci.fit = col.fit, mark.time = FALSE, lty.km = 5, lwd.km = 1, col.km = "darkblue", do.plot = TRUE, plot.width = 7, plot.height = 7, devnew = TRUE, @@ -102,6 +102,8 @@ legend is not shown.} \item{ylab}{\emph{y} axis label.} +\item{main}{The main title of the plot(s) as character. Default is \code{NULL}.} + \item{lty.fit}{Line type for the fitted curve. See \code{\link[graphics]{par}}.} \item{lwd.fit}{Line width for the fitted curve. 
See \code{\link[graphics]{par}}.} diff --git a/vignettes/msmtools.R b/vignettes/msmtools.R index 65716b6..7f2f5f5 100644 --- a/vignettes/msmtools.R +++ b/vignettes/msmtools.R @@ -1,12 +1,12 @@ ## ----ninja, echo = FALSE------------------------------------------------- library( msmtools ) -## ----long example, collapse = TRUE--------------------------------------- +## ----long_example, collapse = TRUE--------------------------------------- data( hosp ) hosp[ 1:17, .( subj, adm_number, gender, age, label_2, dateIN, dateOUT, dateCENS ) ] -## ----running augment, collapse = TRUE------------------------------------ +## ----running_augment, collapse = TRUE------------------------------------ hosp_augmented = augment( data = hosp, data_key = subj, n_events = adm_number, pattern = label_2, t_start = dateIN, t_end = dateOUT, @@ -19,7 +19,7 @@ hosp_augmented[ 1:35, .( subj, adm_number, gender, age, label_2, hosp[ 18:28, .( subj, adm_number, rehab, it, rehab_it, dateIN, dateOUT, dateCENS ) ] -## ----complex status, collapse = TRUE------------------------------------- +## ----complex_status, collapse = TRUE------------------------------------- hosp_augmented_more = augment( data = hosp, data_key = subj, n_events = adm_number, pattern = label_2, t_start = dateIN, t_end = dateOUT, @@ -29,7 +29,7 @@ hosp_augmented_more = augment( data = hosp, data_key = subj, hosp_augmented_more[ 36:60, .( subj, adm_number, rehab_it, augmented, status, status_exp, n_status_exp ) ] -## ----multistate model, collapse = TRUE----------------------------------- +## ----multistate_model, collapse = TRUE----------------------------------- # let's define the initial transition matrix for our model Qmat = matrix( data = 0, nrow = 3, ncol = 3, byrow = TRUE ) Qmat[ 1, 1:3 ] = 1 @@ -49,15 +49,15 @@ msm_model = msm( status_num ~ augmented_int, control = list( fnscale = 6e+05, trace = 0, REPORT = 1, maxit = 10000 ) ) -## ----survplot 1, fig.align = 'center', fig.width = 5, fig.height = 4----- +## 
----survplot_1, fig.align = 'center', fig.width = 5, fig.height = 4----- survplot( msm_model, km = TRUE, ci = 'none', verbose = FALSE, devnew = FALSE ) -## ----survplot 2, fig.align = 'center', fig.width = 5, fig.height = 4----- +## ----survplot_2, fig.align = 'center', fig.width = 5, fig.height = 4----- survplot( msm_model, km = TRUE, from = 2, ci = 'none', verbose = FALSE, devnew = FALSE ) -## ----custom time seq, fig.align = 'center', fig.width = 5, fig.height = 4---- +## ----custom_time_seq, fig.align = 'center', fig.width = 5, fig.height = 4---- time_seq = seq( 300, 800, by = 30 ) survplot( msm_model, times = time_seq, ci = 'none', verbose = FALSE, devnew = FALSE ) @@ -99,7 +99,7 @@ all_data = survplot( msm_model, ci = 'none', grid = 10, # let's see the datasets all_data -## ----splitting data, collapse = TRUE------------------------------------- +## ----splitting_data, collapse = TRUE------------------------------------- # do not extract data using just one []. # This keeps the class, so it returns a list km_data_wrong = all_data[ 1 ] diff --git a/vignettes/msmtools.pdf b/vignettes/msmtools.pdf index 1e11256..8f4f3e6 100644 Binary files a/vignettes/msmtools.pdf and b/vignettes/msmtools.pdf differ