From a7567b6be3e0b6b6f75a92bed3277ed5b6d6d825 Mon Sep 17 00:00:00 2001 From: AnestisTouloumis Date: Mon, 22 Jul 2019 18:20:03 +0100 Subject: [PATCH] improved R code readability --- DESCRIPTION | 2 +- R/SimCorMultRes_internals.R | 274 ++++--- R/rbin.R | 51 +- R/rmult.acl.R | 17 +- R/rmult.bcl.R | 38 +- R/rmult.clm.R | 36 +- R/rmult.crm.R | 48 +- R/rnorta.R | 14 +- R/rsmvnorm.R | 17 +- README.md | 2 +- inst/NEWS | 4 + man/rbin.Rd | 22 +- man/rmult.acl.Rd | 4 +- man/rmult.bcl.Rd | 4 +- man/rmult.clm.Rd | 8 +- man/rmult.crm.Rd | 8 +- tests/testthat/test_core_functions.R | 78 +- tests/testthat/test_internal_functions.R | 56 ++ tests/testthat/test_utility_functions.R | 26 +- vignettes/SimCorMultRes.R | 227 ------ vignettes/SimCorMultRes.Rmd | 89 ++- vignettes/SimCorMultRes.html | 934 ----------------------- 22 files changed, 455 insertions(+), 1504 deletions(-) create mode 100644 tests/testthat/test_internal_functions.R delete mode 100644 vignettes/SimCorMultRes.R delete mode 100644 vignettes/SimCorMultRes.html diff --git a/DESCRIPTION b/DESCRIPTION index bdd8804..81232ff 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,7 +2,7 @@ Package: SimCorMultRes Type: Package Title: Simulates Correlated Multinomial Responses Description: Simulates correlated multinomial responses conditional on a marginal model specification. 
-Version: 1.6.6 +Version: 1.6.7 Depends: R(>= 2.15.0) Imports: evd, diff --git a/R/SimCorMultRes_internals.R b/R/SimCorMultRes_internals.R index a4199d6..ae74a0c 100644 --- a/R/SimCorMultRes_internals.R +++ b/R/SimCorMultRes_internals.R @@ -1,58 +1,65 @@ -check_cluster_size <- function(clsize) { - if (all.equal(clsize, as.integer(clsize)) != TRUE | clsize < 2) +check_cluster_size <- function(cluster_size) { + if (all.equal(cluster_size, as.integer(cluster_size)) != TRUE | + cluster_size < 2) stop("'clsize' must be a positive integer greater than or equal to two") } -check_ncategories <- function(ncategories) { - if (!is.numeric(ncategories) | ncategories < 3) +check_ncategories <- function(categories_no) { + if (!is.numeric(categories_no) | categories_no < 3) stop("'ncategories' must be numeric greater or equal to three") - if (all.equal(ncategories, as.integer(ncategories)) != TRUE) + if (all.equal(categories_no, as.integer(categories_no)) != TRUE) stop("'ncategories' must be a positive integer") - ncategories + categories_no } -check_correlation_matrix <- function(cor_matrix, clsize, rfctn, - ncategories = NULL) { - if (!is.numeric(cor_matrix)) +check_correlation_matrix <- function(correlation_matrix, cluster_size, rfctn, + categories_no = NULL) { + if (!is.numeric(correlation_matrix)) stop("'cor_matrix' must be numeric") - if (!is.matrix(cor_matrix)) + if (!is.matrix(correlation_matrix)) stop("'cor_matrix' must be a matrix") if (rfctn == "rbin" | rfctn == "rmult.clm") { - if (ncol(cor_matrix) != clsize | nrow(cor_matrix) != clsize) - stop("'cor_matrix' must be a ", clsize, "x", clsize, " matrix") + if (ncol(correlation_matrix) != cluster_size | + nrow(correlation_matrix) != cluster_size) + stop("'cor.matrix' must be a ", cluster_size, "x", + cluster_size, " matrix") } else { - ncateg2 <- ifelse(rfctn == "rmult.bcl", ncategories, ncategories - 1) - dimcor <- clsize * ncateg2 - if (ncol(cor_matrix) != dimcor | nrow(cor_matrix) != dimcor) + ncateg2 <- ifelse(rfctn 
== "rmult.bcl", categories_no, + categories_no - 1) + dimcor <- cluster_size * ncateg2 + if (ncol(correlation_matrix) != dimcor | + nrow(correlation_matrix) != dimcor) stop("'cor_matrix' must be a ", dimcor, "x", dimcor, " matrix") - for (i in 1:clsize) { + for (i in 1:cluster_size) { diag_index <- 1:ncateg2 + (i - 1) * ncateg2 - cor_matrix[diag_index, diag_index] <- diag(1, ncateg2) + correlation_matrix[diag_index, diag_index] <- diag(1, ncateg2) } } - if (!isSymmetric(cor_matrix)) + if (!isSymmetric(correlation_matrix)) stop("'cor_matrix' must be a symmetric matrix") - if (any(diag(cor_matrix) != 1)) + if (any(diag(correlation_matrix) != 1)) stop("the diagonal elements of 'cor_matrix' must be one") - if (any(cor_matrix > 1) | any(cor_matrix < -1)) + if (any(correlation_matrix > 1) | any(correlation_matrix < -1)) stop("all the elements of 'cor_matrix' must be on [-1,1]") - if (any(eigen(cor_matrix, symmetric = TRUE, only.values = TRUE)$values <= - 0)) + correlation_matrix_eigen <- eigen(correlation_matrix, symmetric = TRUE, + only.values = TRUE) + if (any(correlation_matrix_eigen$values <= 0)) stop("'cor_matrix' must be positive definite") - cor_matrix + correlation_matrix } check_xformula <- function(xformula) { - lpformula <- as.formula(xformula) - if (length(paste0(attr(terms(lpformula), "variables"))) == 1) + linear_predictor_formula <- as.formula(xformula) + if (length(paste0(attr(terms(linear_predictor_formula), "variables"))) == 1) stop("No covariates were found in 'formula' ") - if (attr(terms(lpformula), "intercept") == 0) { - lpformula <- update(lpformula, ~. + 1) + if (attr(terms(linear_predictor_formula), "intercept") == 0) { + linear_predictor_formula <- update(linear_predictor_formula, ~. 
+ 1) } - lpformula + linear_predictor_formula } -check_intercepts <- function(intercepts, clsize, rfctn, sample_size = NULL) { +check_intercepts <- function(intercepts, cluster_size, rfctn, + sample_size = NULL) { if (!is.numeric(intercepts)) stop("'intercepts' must be numeric") if (any(is.infinite(intercepts))) @@ -61,9 +68,10 @@ check_intercepts <- function(intercepts, clsize, rfctn, sample_size = NULL) { if (!(is.vector(intercepts) & !is.list(intercepts))) stop("'intercepts' must be a vector") if (length(intercepts) == 1) - intercepts <- rep(intercepts, clsize) - if (length(intercepts) != clsize) - stop("'intercepts' must have either one or ", clsize, " elements") + intercepts <- rep(intercepts, cluster_size) + if (length(intercepts) != cluster_size) + stop("'intercepts' must have either one or ", cluster_size, + " elements") intercepts <- cbind(-Inf, intercepts, Inf) } else { if (!(is.vector(intercepts) & !is.list(intercepts)) & @@ -74,76 +82,79 @@ check_intercepts <- function(intercepts, clsize, rfctn, sample_size = NULL) { stop("'intercepts' must have at least 2 elements") if (rfctn == "rmult.clm" & any(diff(intercepts) <= 0)) stop("'intercepts' must be increasing") - ncategories <- length(intercepts) + 1 + categories_no <- length(intercepts) + 1 if (rfctn == "rmult.clm") { - intercepts <- matrix(intercepts, clsize, ncategories - + intercepts <- matrix(intercepts, cluster_size, categories_no - 1, TRUE) intercepts <- cbind(-Inf, intercepts, Inf) } else if (rfctn == "rmult.crm") { intercepts <- matrix(intercepts, sample_size, - clsize * (ncategories - 1), byrow = TRUE) + cluster_size * (categories_no - 1), TRUE) } else { - intercepts <- matrix(intercepts, clsize, ncategories - - 1, byrow = TRUE) + intercepts <- matrix(intercepts, cluster_size, + categories_no - 1, byrow = TRUE) } } else { - ncategories <- ncol(intercepts) + 1 + categories_no <- ncol(intercepts) + 1 if (rfctn == "rmult.clm") { intercepts <- cbind(-Inf, intercepts, Inf) - for (i in 1:clsize) { + 
for (i in 1:cluster_size) { if (any(diff(intercepts[i, ]) <= 0)) stop("'intercepts' must be increasing at each row") } } else if (rfctn == "rmult.crm") { - ncategories <- ncol(intercepts) + 1 + categories_no <- ncol(intercepts) + 1 intercepts <- matrix(t(intercepts), sample_size, - clsize * (ncategories - 1), byrow = TRUE) + cluster_size * (categories_no - 1), TRUE) } else { - ncategories <- ncol(intercepts) + 1 - intercepts <- matrix(intercepts, clsize, ncategories - - 1, byrow = TRUE) + categories_no <- ncol(intercepts) + 1 + intercepts <- matrix(intercepts, cluster_size, + categories_no - 1, TRUE) } } } intercepts } -check_betas <- function(betas, clsize) { +check_betas <- function(betas, cluster_size) { if (!(is.vector(betas) & !is.list(betas)) & !is.matrix(betas)) stop("'betas' must be a vector or a matrix") if (!is.numeric(betas)) stop("'betas' must be numeric") if (is.vector(betas) & !is.list(betas)) { - betas <- rep(betas, clsize) + betas <- rep(betas, cluster_size) } else { - if (nrow(betas) != clsize) + if (nrow(betas) != cluster_size) stop("The number of rows in 'betas' should be equal to 'clsize'") betas <- c(t(betas)) } betas } -create_linear_predictor <- function(betas, clsize, lpformula, xdata, rfctn, - ncategories = NULL) { - xmat <- model.matrix(lpformula, data = xdata) +create_linear_predictor <- function(betas, cluster_size, + linear_predictor_formula, xdata, rfctn, + categories_no = NULL) { + xmat <- model.matrix(linear_predictor_formula, data = xdata) if (rfctn == "rmult.bcl") { - xmat <- apply(xmat, 2, function(x) rep(x, each = ncategories)) - if (length(betas) != (clsize * ncategories * ncol(xmat))) + xmat <- apply(xmat, 2, function(x) rep(x, each = categories_no)) + if (length(betas) != (cluster_size * categories_no * ncol(xmat))) stop("The length of 'betas' does not match with the provided covariates") # nolint } else { xmat <- matrix(xmat[, -1], ncol = ncol(xmat) - 1) - if (length(betas) != (clsize) * ncol(xmat)) + if (length(betas) != 
(cluster_size) * ncol(xmat)) stop("The length of 'betas' does not match with the number of covariates") # nolint } - lin_pred <- matrix(betas, nrow = nrow(xmat), ncol = ncol(xmat), + linear_predictor <- matrix(betas, nrow = nrow(xmat), ncol = ncol(xmat), byrow = TRUE) * xmat if (rfctn == "rmult.bcl") { - lin_pred <- matrix(rowSums(lin_pred), ncol = ncategories * clsize, - byrow = TRUE) + linear_predictor <- matrix(rowSums(linear_predictor), + ncol = categories_no * cluster_size, + byrow = TRUE) } else { - lin_pred <- matrix(rowSums(lin_pred), ncol = clsize, byrow = TRUE) + linear_predictor <- matrix(rowSums(linear_predictor), + ncol = cluster_size, byrow = TRUE) } - as.matrix(lin_pred) + as.matrix(linear_predictor) } create_distribution <- function(link) { @@ -157,104 +168,121 @@ create_distribution <- function(link) { distr } -create_rlatent <- function(rlatent, sample_size, link, clsize, cor_matrix, - rfctn, ncategories = NULL) { - if (is.null(rlatent)) { +create_rlatent <- function(simulated_latent_variables, sample_size, link, + cluster_size, correlation_matrix, + rfctn, categories_no = NULL) { + if (is.null(simulated_latent_variables)) { distr <- create_distribution(link) - cor_matrix <- check_correlation_matrix(cor_matrix, clsize, rfctn, - ncategories) - rlatent <- rnorta(sample_size, cor_matrix, rep(distr, nrow(cor_matrix))) + correlation_matrix <- check_correlation_matrix(correlation_matrix, + cluster_size, rfctn, + categories_no) + simulated_latent_variables <- rnorta(sample_size, correlation_matrix, + rep(distr, + nrow(correlation_matrix))) if (distr == "qgumbel" & rfctn != "rmult.bcl") - rlatent <- -rlatent + simulated_latent_variables <- -simulated_latent_variables } else { - if (!is.matrix(rlatent)) + if (!is.matrix(simulated_latent_variables)) stop("'rlatent' must be a matrix") - if (!is.numeric(rlatent)) + if (!is.numeric(simulated_latent_variables)) stop("'rlatent' must be numeric") if (rfctn == "rbin" | rfctn == "rmult.clm") { - ncol_rlatent <- 
clsize + ncol_rlatent <- cluster_size } else if (rfctn == "rmult.bcl") { - ncol_rlatent <- clsize * ncategories + ncol_rlatent <- cluster_size * categories_no } else { - ncol_rlatent <- clsize * (ncategories - 1) + ncol_rlatent <- cluster_size * (categories_no - 1) } - if (nrow(rlatent) != sample_size | ncol(rlatent) != ncol_rlatent) + if (nrow(simulated_latent_variables) != sample_size | + ncol(simulated_latent_variables) != ncol_rlatent) stop("'rlatent' must be a ", sample_size, "x", ncol_rlatent, " matrix") # nolint - cor_matrix <- NULL - rlatent <- rlatent + correlation_matrix <- NULL + simulated_latent_variables <- simulated_latent_variables } - rlatent + simulated_latent_variables } -create_output <- function(y_sim, sample_size, clsize, rlatent, lpformula, xdata, - rfctn, ncategories = NULL) { - y <- c(t(y_sim)) - id <- rep(1:sample_size, each = clsize) - time <- rep(1:clsize, sample_size) - rownames(y_sim) <- rownames(rlatent) <- paste("i", 1:sample_size, sep = "=") - colnames(y_sim) <- paste("t", 1:clsize, sep = "=") - colnames(rlatent) <- if (rfctn == "rbin" | rfctn == "rmult.clm") { - paste("t", 1:clsize, sep = "=") - } else if (rfctn == "rmult.bcl") { - paste("t=", rep(1:clsize, each = ncategories), " & j=", - rep(1:ncategories, clsize), sep = "") - } else { - paste("t=", rep(1:clsize, each = ncategories - 1), " & j=", - rep(1:(ncategories - 1), clsize), sep = "") +create_output <- function(simulated_responses, sample_size, cluster_size, + simulated_latent_variables, linear_predictor_formula, + xdata, rfctn, categories_no = NULL) { + y <- c(t(simulated_responses)) + id <- rep(1:sample_size, each = cluster_size) + time <- rep(1:cluster_size, sample_size) + rownames(simulated_responses) <- rownames(simulated_latent_variables) <- + paste("i", 1:sample_size, sep = "=") + colnames(simulated_responses) <- paste("t", 1:cluster_size, sep = "=") + colnames(simulated_latent_variables) <- + if (rfctn == "rbin" | rfctn == "rmult.clm") { + paste("t", 1:cluster_size, 
sep = "=") + } else if (rfctn == "rmult.bcl") { + paste("t=", rep(1:cluster_size, each = categories_no), " & j=", + rep(1:categories_no, cluster_size), sep = "") + } else { + paste("t=", rep(1:cluster_size, each = categories_no - 1), + " & j=", rep(1:(categories_no - 1), cluster_size), + sep = "") } - sim_model_frame <- model.frame(formula = lpformula, data = xdata) + sim_model_frame <- model.frame(formula = linear_predictor_formula, + data = xdata) simdata <- data.frame(y, sim_model_frame, id, time) - list(Ysim = y_sim, simdata = simdata, rlatent = rlatent) + list(Ysim = simulated_responses, simdata = simdata, + rlatent = simulated_latent_variables) } -apply_threshold <- function(lin_pred, rlatent, clsize, rfctn, intercepts = NULL, - ncategories = NULL) { - sample_size <- nrow(lin_pred) +apply_threshold <- function(linear_predictor, simulated_latent_variables, + cluster_size, rfctn, intercepts = NULL, + categories_no = NULL) { + sample_size <- nrow(linear_predictor) if (rfctn == "rmult.clm" | rfctn == "rmult.crm") { - u_sim <- rlatent - lin_pred + u_sim <- simulated_latent_variables - linear_predictor } else { - u_sim <- rlatent + lin_pred + u_sim <- simulated_latent_variables + linear_predictor } if (rfctn == "rbin") { - y_sim <- matrix(0, sample_size, clsize) - for (i in 1:clsize) y_sim[, i] <- cut(u_sim[, i] - 2 * lin_pred[, i], - intercepts[i, ], labels = FALSE) - y_sim <- 2 - y_sim + simulated_responses <- matrix(0, sample_size, cluster_size) + for (i in 1:cluster_size) simulated_responses[, i] <- + cut(u_sim[, i] - 2 * linear_predictor[, i], intercepts[i, ], + labels = FALSE) + simulated_responses <- 2 - simulated_responses } else if (rfctn == "rmult.bcl") { - u_sim <- matrix(as.vector(t(u_sim)), nrow = clsize * sample_size, - ncol = ncategories, byrow = TRUE) - y_sim <- apply(u_sim, 1, which.max) - y_sim <- matrix(y_sim, ncol = clsize, byrow = TRUE) + u_sim <- matrix(as.vector(t(u_sim)), nrow = cluster_size * sample_size, + ncol = categories_no, byrow = 
TRUE) + simulated_responses <- apply(u_sim, 1, which.max) + simulated_responses <- matrix(simulated_responses, ncol = cluster_size, + byrow = TRUE) } else if (rfctn == "rmult.clm") { - y_sim <- matrix(0, sample_size, clsize) - for (i in 1:clsize) y_sim[, i] <- cut(u_sim[, i], intercepts[i, ], - labels = FALSE) + simulated_responses <- matrix(0, sample_size, cluster_size) + for (i in 1:cluster_size) simulated_responses[, i] <- + cut(u_sim[, i], intercepts[i, ], labels = FALSE) } else { - y_sim <- matrix(as.numeric(t(u_sim <= intercepts)), - sample_size * clsize, ncategories - 1, TRUE) - for (i in 1:(ncategories - 1)) y_sim[, i] <- ifelse(y_sim[, i] == - 1, i, ncategories) - y_sim <- apply(y_sim, 1, min) - y_sim <- matrix(y_sim, sample_size, clsize, byrow = TRUE) + simulated_responses <- matrix(as.numeric(t(u_sim <= intercepts)), + sample_size * cluster_size, categories_no - 1, TRUE) + for (i in 1:(categories_no - 1)) + simulated_responses[, i] <- ifelse(simulated_responses[, i] == 1, + i, categories_no) + simulated_responses <- apply(simulated_responses, 1, min) + simulated_responses <- matrix(simulated_responses, sample_size, + cluster_size, TRUE) } - y_sim + simulated_responses } create_betas_acl2bcl <- function(intercepts = intercepts, - ncategories = ncategories, betas = betas) { + categories_no = categories_no, betas = betas) { intercepts_bcl <- t(apply(intercepts, 1, function(z) rev(cumsum(rev(z))))) - clsize <- nrow(intercepts) - dim_betas <- length(betas) / clsize - betas_matrix <- matrix(betas, clsize, dim_betas, TRUE) - betas_matrix_bcl <- t(apply(betas_matrix, 1, function(z) rep(ncategories - - seq(ncategories - 1), each = length(z)) * z)) - betas_bcl <- matrix(0, clsize, ncategories - 1 + ncol(betas_matrix_bcl)) - for (i in seq(ncategories - 1)) { + cluster_size <- nrow(intercepts) + dim_betas <- length(betas) / cluster_size + betas_matrix <- matrix(betas, cluster_size, dim_betas, TRUE) + betas_matrix_bcl <- t(apply(betas_matrix, 1, function(z) + 
rep(categories_no - seq(categories_no - 1), each = length(z)) * z)) + betas_bcl <- matrix(0, cluster_size, + categories_no - 1 + ncol(betas_matrix_bcl)) + for (i in seq(categories_no - 1)) { betas_bcl[, ((i - 1) * (dim_betas + 1) + 1):(i * (dim_betas + 1))] <- cbind(intercepts_bcl[, i], betas_matrix_bcl[, ((i - 1) * (dim_betas) + 1):(i * (dim_betas))]) } - betas_bcl <- cbind(betas_bcl, matrix(0, clsize, dim_betas + 1)) + betas_bcl <- cbind(betas_bcl, matrix(0, cluster_size, dim_betas + 1)) betas_bcl } diff --git a/R/rbin.R b/R/rbin.R index ae270a8..b24c7ee 100644 --- a/R/rbin.R +++ b/R/rbin.R @@ -94,7 +94,7 @@ #' under Marginal Model Specification: The SimCorMultRes Package. \emph{The R #' Journal} \bold{8}, 79--91. #' @examples -#' ## See Example 3.4 in the Vignette. +#' ## See Example 3.5 in the Vignette. #' set.seed(123) #' sample_size <- 5000 #' cluster_size <- 4 @@ -102,27 +102,27 @@ #' beta_coefficients <- 0.2 #' latent_correlation_matrix <- toeplitz(c(1, 0.9, 0.9, 0.9)) #' x <- rep(rnorm(sample_size), each = cluster_size) -#' simulated_binary_responses <- rbin(clsize = cluster_size, +#' simulated_binary_dataset <- rbin(clsize = cluster_size, #' intercepts = beta_intercepts, betas = beta_coefficients, #' xformula = ~ x, cor.matrix = latent_correlation_matrix, link = 'probit') #' library(gee) #' binary_gee_model <- gee(y ~ x, family = binomial('probit'), id = id, -#' data = simulated_binary_responses$simdata) +#' data = simulated_binary_dataset$simdata) #' summary(binary_gee_model)$coefficients #' -#' ## See Example 3.5 in the Vignette. +#' ## See Example 3.6 in the Vignette. 
#' set.seed(8) #' library(evd) -#' rlatent1 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), model = 'log', -#' d = cluster_size) -#' rlatent2 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), model = 'log', -#' d = cluster_size) -#' simulated_latent_variables <- rlatent1 - rlatent2 -#' simulated_binary_responses <- rbin(clsize = cluster_size, +#' simulated_latent_variables1 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), +#' model = 'log', d = cluster_size) +#' simulated_latent_variables2 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), +#' model = 'log', d = cluster_size) +#' simulated_latent_variables <- simulated_latent_variables1 - simulated_latent_variables2 # nolintr +#' simulated_binary_dataset <- rbin(clsize = cluster_size, #' intercepts = beta_intercepts, betas = beta_coefficients, #' xformula = ~ x, rlatent = simulated_latent_variables) #' binary_gee_model <- gee(y ~ x, family = binomial('logit'), id = id, -#' data = simulated_binary_responses$simdata) +#' data = simulated_binary_dataset$simdata) #' summary(binary_gee_model)$coefficients #' #' @export @@ -130,16 +130,21 @@ rbin <- function(clsize = clsize, intercepts = intercepts, betas = betas, xformula = formula(xdata), xdata = parent.frame(), link = "logit", cor.matrix = cor.matrix, rlatent = NULL){ # nolint check_cluster_size(clsize) - intercepts <- check_intercepts(intercepts, clsize, "rbin") - betas <- check_betas(betas, clsize) - lpformula <- check_xformula(xformula) - if (!is.environment(xdata)) - xdata <- data.frame(na.omit(xdata)) - lin_pred <- create_linear_predictor(betas, clsize, lpformula, xdata, - "rbin") - sample_size <- nrow(lin_pred) # nolint - rlatent <- create_rlatent(rlatent, sample_size, link, clsize, cor.matrix, - "rbin") - y_sim <- apply_threshold(lin_pred, rlatent, clsize, "rbin", intercepts) - create_output(y_sim, sample_size, clsize, rlatent, lpformula, xdata, "rbin") + beta_intercepts <- check_intercepts(intercepts, clsize, "rbin") + beta_coefficients <- check_betas(betas, clsize) + 
linear_predictor_formula <- check_xformula(xformula) + if (!is.environment(xdata)) xdata <- data.frame(na.omit(xdata)) + linear_predictor <- create_linear_predictor(beta_coefficients, clsize, + linear_predictor_formula, xdata, + "rbin") + sample_size <- nrow(linear_predictor) + simulated_latent_variables <- create_rlatent(rlatent, sample_size, link, + clsize, cor.matrix, "rbin") + simulated_binary_responses <- apply_threshold(linear_predictor, + simulated_latent_variables, + clsize, "rbin", + beta_intercepts) + create_output(simulated_binary_responses, sample_size, clsize, + simulated_latent_variables, linear_predictor_formula, xdata, + "rbin") } diff --git a/R/rmult.acl.R b/R/rmult.acl.R index 33dffba..0053bf7 100644 --- a/R/rmult.acl.R +++ b/R/rmult.acl.R @@ -107,13 +107,13 @@ #' equicorrelation_matrix <- toeplitz(c(1, rep(0.95, cluster_size - 1))) #' latent_correlation_matrix <- kronecker(equicorrelation_matrix, #' identity_matrix) -#' simulated_ordinal_reponses <- rmult.acl(clsize = cluster_size, +#' simulated_ordinal_dataset <- rmult.acl(clsize = cluster_size, #' intercepts = beta_intercepts, betas = beta_coefficients, #' xformula = ~ x1 + x2, xdata = xdata, #' cor.matrix = latent_correlation_matrix) #' suppressPackageStartupMessages(library('multgee')) #' ordinal_gee_model <- ordLORgee(y ~ x1 + x2, -#' data = simulated_ordinal_reponses$simdata, id = id, repeated = time, +#' data = simulated_ordinal_dataset$simdata, id = id, repeated = time, #' LORstr = 'time.exch', link='acl') #' round(coef(ordinal_gee_model), 2) #' @@ -122,10 +122,11 @@ rmult.acl <- function(clsize = clsize, intercepts = intercepts, betas = betas, # xformula = formula(xdata), xdata = parent.frame(), cor.matrix = cor.matrix, # nolint rlatent = NULL) { check_cluster_size(clsize) - intercepts <- check_intercepts(intercepts, clsize, "rmult.acl") - ncategories <- ncol(intercepts) + 1 - betas <- check_betas(betas, clsize) - betas_bcl <- create_betas_acl2bcl(intercepts, ncategories, betas) - 
rmult.bcl(clsize, ncategories, betas_bcl, xformula, xdata, cor.matrix, - rlatent) + beta_intercepts <- check_intercepts(intercepts, clsize, "rmult.acl") + categories_no <- ncol(beta_intercepts) + 1 + beta_coefficients <- check_betas(betas, clsize) + betas_bcl <- create_betas_acl2bcl(beta_intercepts, categories_no, + beta_coefficients) + rmult.bcl(clsize, categories_no, betas_bcl, xformula, xdata, cor.matrix, + rlatent) } diff --git a/R/rmult.bcl.R b/R/rmult.bcl.R index 71078a0..7b92bc4 100644 --- a/R/rmult.bcl.R +++ b/R/rmult.bcl.R @@ -104,12 +104,12 @@ #' equicorrelation_matrix <- toeplitz(c(1, rep(0.95, cluster_size - 1))) #' identity_matrix <- diag(categories_no) #' latent_correlation_matrix <- kronecker(equicorrelation_matrix, identity_matrix) # nolint -#' simulated_nominal_responses <- rmult.bcl(clsize = cluster_size, +#' simulated_nominal_dataset <- rmult.bcl(clsize = cluster_size, #' ncategories = categories_no, betas = betas, xformula = ~ x1 + x2, #' xdata = xdata, cor.matrix = latent_correlation_matrix) #' suppressPackageStartupMessages(library('multgee')) #' nominal_gee_model <- nomLORgee(y ~ x1 + x2, -#' data = simulated_nominal_responses$simdata, id = id, repeated = time, +#' data = simulated_nominal_dataset$simdata, id = id, repeated = time, #' LORstr = 'time.exch') #' round(coef(nominal_gee_model), 2) #' @@ -118,20 +118,24 @@ rmult.bcl <- function(clsize = clsize, ncategories = ncategories, betas = betas, xformula = formula(xdata), xdata = parent.frame(), cor.matrix = cor.matrix, # nolint rlatent = NULL) { check_cluster_size(clsize) - ncategories <- check_ncategories(ncategories) + categories_no <- check_ncategories(ncategories) betas <- check_betas(betas, clsize) - lpformula <- check_xformula(xformula) - if (!is.environment(xdata)) - xdata <- data.frame(na.omit(xdata)) - lin_pred <- create_linear_predictor(betas, clsize, lpformula, xdata, - "rmult.bcl", ncategories = ncategories) - sample_size <- nrow(lin_pred) - rlatent <- create_rlatent(rlatent, 
sample_size, "cloglog", clsize, - cor.matrix, "rmult.bcl", - ncategories = ncategories) - y_sim <- apply_threshold(lin_pred, rlatent, clsize, "rmult.bcl", - ncategories = ncategories) - lpformula <- update(lpformula, ~. - 1) - create_output(y_sim, sample_size, clsize, rlatent, lpformula, xdata, - "rmult.bcl", ncategories = ncategories) + linear_predictor_formula <- check_xformula(xformula) + if (!is.environment(xdata)) xdata <- data.frame(na.omit(xdata)) + linear_predictor <- create_linear_predictor(betas, clsize, + linear_predictor_formula, xdata, + "rmult.bcl", categories_no) + sample_size <- nrow(linear_predictor) + simulated_latent_responses <- create_rlatent(rlatent, sample_size, + "cloglog", clsize, cor.matrix, + "rmult.bcl", categories_no) + simulated_nominal_responses <- apply_threshold(linear_predictor, + simulated_latent_responses, + clsize, "rmult.bcl", + categories_no = + categories_no) + linear_predictor_formula <- update(linear_predictor_formula, ~. - 1) + create_output(simulated_nominal_responses, sample_size, clsize, + simulated_latent_responses, linear_predictor_formula, xdata, + "rmult.bcl", categories_no = categories_no) } diff --git a/R/rmult.clm.R b/R/rmult.clm.R index f994538..b5476c4 100644 --- a/R/rmult.clm.R +++ b/R/rmult.clm.R @@ -104,21 +104,21 @@ #' beta_coefficients <- matrix(c(1, 2, 3, 4), 4, 1) #' x <- rep(rnorm(sample_size), each = cluster_size) #' latent_correlation_matrix <- toeplitz(c(1, 0.85, 0.5, 0.15)) -#' simulated_ordinal_responses <- rmult.clm(clsize = cluster_size, +#' simulated_ordinal_dataset <- rmult.clm(clsize = cluster_size, #' intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x, #' cor.matrix = latent_correlation_matrix, link = 'probit') -#' head(simulated_ordinal_responses$simdata, n = 8) +#' head(simulated_ordinal_dataset$simdata, n = 8) #' #' ## Same sampling scheme except that the parameter vector is time-stationary. 
#' set.seed(12345) -#' simulated_ordinal_responses <- rmult.clm(clsize = cluster_size, betas = 1, +#' simulated_ordinal_dataset <- rmult.clm(clsize = cluster_size, betas = 1, #' xformula = ~ x, cor.matrix = latent_correlation_matrix, #' intercepts = beta_intercepts, link = 'probit') #' ## Fit a GEE model (Touloumis et al., 2013) to estimate the regression #' ## coefficients. #' library(multgee) #' ordinal_gee_model <- ordLORgee(y ~ x, id = id, repeated = time, -#' link = 'probit', data = simulated_ordinal_responses$simdata) +#' link = 'probit', data = simulated_ordinal_dataset$simdata) #' coef(ordinal_gee_model) #' #' @export @@ -126,17 +126,21 @@ rmult.clm <- function(clsize = clsize, intercepts = intercepts, betas = betas, # xformula = formula(xdata), xdata = parent.frame(), link = "logit", cor.matrix = cor.matrix, rlatent = NULL) { # nolint check_cluster_size(clsize) - intercepts <- check_intercepts(intercepts, clsize, "rmult.clm") - betas <- check_betas(betas, clsize) - lpformula <- check_xformula(xformula) - if (!is.environment(xdata)) - xdata <- data.frame(na.omit(xdata)) - lin_pred <- create_linear_predictor(betas, clsize, lpformula, xdata, - "rmult.clm") - sample_size <- nrow(lin_pred) - rlatent <- create_rlatent(rlatent, sample_size, link, clsize, cor.matrix, - "rmult.clm") - y_sim <- apply_threshold(lin_pred, rlatent, clsize, "rmult.clm", intercepts) - create_output(y_sim, sample_size, clsize, rlatent, lpformula, xdata, + beta_intercepts <- check_intercepts(intercepts, clsize, "rmult.clm") + beta_coefficients <- check_betas(betas, clsize) + linear_predictor_formula <- check_xformula(xformula) + if (!is.environment(xdata)) xdata <- data.frame(na.omit(xdata)) + linear_predictor <- create_linear_predictor(beta_coefficients, clsize, + linear_predictor_formula, xdata, + "rmult.clm") + sample_size <- nrow(linear_predictor) + simulated_latent_responses <- create_rlatent(rlatent, sample_size, link, + clsize, cor.matrix, + "rmult.clm") + 
simulated_ordinal_responses <- + apply_threshold(linear_predictor, simulated_latent_responses, clsize, + "rmult.clm", beta_intercepts) + create_output(simulated_ordinal_responses, sample_size, clsize, + simulated_latent_responses, linear_predictor_formula, xdata, "rmult.clm") } diff --git a/R/rmult.crm.R b/R/rmult.crm.R index cfe2e66..fa274ad 100644 --- a/R/rmult.crm.R +++ b/R/rmult.crm.R @@ -110,31 +110,37 @@ #' ones_matrix <- matrix(1, cluster_size, cluster_size) #' latent_correlation_matrix <- identity_matrix + #' kronecker(equicorrelation_matrix, ones_matrix) -#' CorOrdRes <- rmult.crm(clsize = cluster_size, intercepts = beta_intercepts, -#' betas = beta_coefficients, xformula = ~ x, -#' cor.matrix = latent_correlation_matrix, link = 'probit') -#' head(CorOrdRes$Ysim) +#' simulated_ordinal_dataset <- rmult.crm(clsize = cluster_size, +#' intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x, +#' cor.matrix = latent_correlation_matrix, link = 'probit') +#' head(simulated_ordinal_dataset$Ysim) #' #'@export rmult.crm <- function(clsize = clsize, intercepts = intercepts, betas = betas, # nolint xformula = formula(xdata), xdata = parent.frame(), link = "logit", cor.matrix = cor.matrix, rlatent = NULL) { # nolint check_cluster_size(clsize) - betas <- check_betas(betas, clsize) - lpformula <- check_xformula(xformula) - if (!is.environment(xdata)) - xdata <- data.frame(na.omit(xdata)) - lin_pred <- create_linear_predictor(betas, clsize, lpformula, xdata, - "rmult.clm") - sample_size <- nrow(lin_pred) - intercepts <- check_intercepts(intercepts, clsize, "rmult.crm", sample_size) - ncategories <- ncol(intercepts) / clsize + 1 - lin_pred_extended <- t(apply(lin_pred, 1, - function(x) rep(x, each = ncategories - 1))) - rlatent <- create_rlatent(rlatent, sample_size, link, clsize, cor.matrix, - "rmult.crm", ncategories) - y_sim <- apply_threshold(lin_pred_extended, rlatent, clsize, "rmult.crm", - intercepts, ncategories) - create_output(y_sim, sample_size, 
clsize, rlatent, lpformula, xdata, - "rmult.crm", ncategories) + beta_coefficients <- check_betas(betas, clsize) + linear_predictor_formula <- check_xformula(xformula) + if (!is.environment(xdata)) xdata <- data.frame(na.omit(xdata)) + linear_predictor <- create_linear_predictor(beta_coefficients, clsize, + linear_predictor_formula, xdata, + "rmult.clm") + sample_size <- nrow(linear_predictor) + beta_intercepts <- check_intercepts(intercepts, clsize, "rmult.crm", + sample_size) + categories_no <- ncol(beta_intercepts) / clsize + 1 + linear_predictor_extended <- t(apply(linear_predictor, 1, function(x) + rep(x, each = categories_no - 1))) + simulated_latent_responses <- create_rlatent(rlatent, sample_size, link, + clsize, cor.matrix, + "rmult.crm", categories_no) + simulated_ordinal_responses <- apply_threshold(linear_predictor_extended, + simulated_latent_responses, + clsize, "rmult.crm", + beta_intercepts, + categories_no) + create_output(simulated_ordinal_responses, sample_size, clsize, + simulated_latent_responses, linear_predictor_formula, xdata, + "rmult.crm", categories_no) } diff --git a/R/rnorta.R b/R/rnorta.R index fdf0eac..c2b2e9c 100644 --- a/R/rnorta.R +++ b/R/rnorta.R @@ -86,9 +86,9 @@ rnorta <- function(R = R, cor.matrix = cor.matrix, distr = distr, # nolint qparameters = NULL) { if (all.equal(R, as.integer(R)) != TRUE | R < 1) stop("'R' must be a positive integer") - quantilefunctions <- as.character(distr) + quantile_functions <- as.character(distr) ans <- rsmvnorm(R = R, cor.matrix = cor.matrix) - if (length(quantilefunctions) != ncol(cor.matrix)) + if (length(quantile_functions) != ncol(cor.matrix)) stop("'distr' must be a ", ncol(cor.matrix), "-variate vector of strings naming a valid quantile function") if (!is.null(qparameters)) { @@ -99,14 +99,14 @@ rnorta <- function(R = R, cor.matrix = cor.matrix, distr = distr, # nolint } ans <- pnorm(ans) for (i in seq_len(ncol(cor.matrix))) { - quantilefunction <- get(quantilefunctions[i], mode = 
"function") - if (!is.function(quantilefunction)) + quantile_function <- get(quantile_functions[i], mode = "function") + if (!is.function(quantile_function)) stop("Character string ", i, " in `distr' does not correspond to a valid function") if (!is.null(qparameters)) - formals(quantilefunction)[pmatch(names(qparameters[[i]]), - formalArgs(quantilefunction))] <- qparameters[[i]] - ans[, i] <- quantilefunction(ans[, i]) + formals(quantile_function)[pmatch(names(qparameters[[i]]), + formalArgs(quantile_function))] <- qparameters[[i]] + ans[, i] <- quantile_function(ans[, i]) } ans } diff --git a/R/rsmvnorm.R b/R/rsmvnorm.R index f3c111e..bdb50cf 100644 --- a/R/rsmvnorm.R +++ b/R/rsmvnorm.R @@ -30,17 +30,18 @@ rsmvnorm <- function(R = R, cor.matrix = cor.matrix) { # nolint stop("'R' must be a positive integer") if (!is.numeric(cor.matrix)) stop("'cor.matrix' must be numeric") - cor.matrix <- as.matrix(cor.matrix) # nolint - if (!isSymmetric(cor.matrix)) + correlation_matrix <- as.matrix(cor.matrix) # nolint + if (!isSymmetric(correlation_matrix)) stop("'cor.matrix' must be a symmetric matrix") - if (any(diag(cor.matrix) != 1)) + if (any(diag(correlation_matrix) != 1)) stop("the diagonal elements of 'cor.matrix' must be equal to one") - if (any(cor.matrix > 1) | any(cor.matrix < -1)) + if (any(correlation_matrix > 1) | any(correlation_matrix < -1)) stop("all the elements of 'cor.matrix' must be on [-1,1]") - if (any(eigen(cor.matrix, symmetric = TRUE, only.values = TRUE)$values <= - 0)) + correlation_matrix_eigen <- eigen(correlation_matrix, symmetric = TRUE, + only.values = TRUE) + if (any(correlation_matrix_eigen$values <= 0)) stop("'cor.matrix' must be a positive definite matrix") - p <- ncol(cor.matrix) - ans <- matrix(rnorm(R * p), R, p) %*% chol(cor.matrix) + p <- ncol(correlation_matrix) + ans <- matrix(rnorm(R * p), R, p) %*% chol(correlation_matrix) ans } diff --git a/README.md b/README.md index 058853e..4f013ad 100644 --- a/README.md +++ b/README.md @@ -4,7 
+4,7 @@ # SimCorMultRes: Simulates Correlated Multinomial Responses [![Github -version](https://img.shields.io/badge/GitHub%20-1.6.6-orange.svg)](%22commits/master%22) +version](https://img.shields.io/badge/GitHub%20-1.6.7-orange.svg)](%22commits/master%22) [![Travis-CI Build Status](https://travis-ci.org/AnestisTouloumis/SimCorMultRes.svg?branch=master)](https://travis-ci.org/AnestisTouloumis/SimCorMultRes) [![Project Status: Active The project has reached a stable, usable state diff --git a/inst/NEWS b/inst/NEWS index cd92e42..524399c 100644 --- a/inst/NEWS +++ b/inst/NEWS @@ -1,3 +1,7 @@ +Version 1.6.7 [2019-07-22] + * Improved R code readability. + * Added unit tests for internal functions. + Version 1.6.6 [2019-07-09] * Minor fixed to comply with lintr. diff --git a/man/rbin.Rd b/man/rbin.Rd index 0a0d84e..42efac4 100644 --- a/man/rbin.Rd +++ b/man/rbin.Rd @@ -91,7 +91,7 @@ element (\eqn{i,t}) of \code{rlatent} represents the realization of \eqn{e^{B}_{it}}. } \examples{ -## See Example 3.4 in the Vignette. +## See Example 3.5 in the Vignette. set.seed(123) sample_size <- 5000 cluster_size <- 4 @@ -99,27 +99,27 @@ beta_intercepts <- 0 beta_coefficients <- 0.2 latent_correlation_matrix <- toeplitz(c(1, 0.9, 0.9, 0.9)) x <- rep(rnorm(sample_size), each = cluster_size) -simulated_binary_responses <- rbin(clsize = cluster_size, +simulated_binary_dataset <- rbin(clsize = cluster_size, intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x, cor.matrix = latent_correlation_matrix, link = 'probit') library(gee) binary_gee_model <- gee(y ~ x, family = binomial('probit'), id = id, -data = simulated_binary_responses$simdata) +data = simulated_binary_dataset$simdata) summary(binary_gee_model)$coefficients -## See Example 3.5 in the Vignette. +## See Example 3.6 in the Vignette. 
set.seed(8) library(evd) -rlatent1 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), model = 'log', -d = cluster_size) -rlatent2 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), model = 'log', -d = cluster_size) -simulated_latent_variables <- rlatent1 - rlatent2 -simulated_binary_responses <- rbin(clsize = cluster_size, +simulated_latent_variables1 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), + model = 'log', d = cluster_size) +simulated_latent_variables2 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), + model = 'log', d = cluster_size) +simulated_latent_variables <- simulated_latent_variables1 - simulated_latent_variables2 # nolintr +simulated_binary_dataset <- rbin(clsize = cluster_size, intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x, rlatent = simulated_latent_variables) binary_gee_model <- gee(y ~ x, family = binomial('logit'), id = id, - data = simulated_binary_responses$simdata) + data = simulated_binary_dataset$simdata) summary(binary_gee_model)$coefficients } diff --git a/man/rmult.acl.Rd b/man/rmult.acl.Rd index abe26bd..b9ebf27 100644 --- a/man/rmult.acl.Rd +++ b/man/rmult.acl.Rd @@ -104,13 +104,13 @@ identity_matrix <- diag(4) equicorrelation_matrix <- toeplitz(c(1, rep(0.95, cluster_size - 1))) latent_correlation_matrix <- kronecker(equicorrelation_matrix, identity_matrix) -simulated_ordinal_reponses <- rmult.acl(clsize = cluster_size, +simulated_ordinal_dataset <- rmult.acl(clsize = cluster_size, intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x1 + x2, xdata = xdata, cor.matrix = latent_correlation_matrix) suppressPackageStartupMessages(library('multgee')) ordinal_gee_model <- ordLORgee(y ~ x1 + x2, - data = simulated_ordinal_reponses$simdata, id = id, repeated = time, + data = simulated_ordinal_dataset$simdata, id = id, repeated = time, LORstr = 'time.exch', link='acl') round(coef(ordinal_gee_model), 2) diff --git a/man/rmult.bcl.Rd b/man/rmult.bcl.Rd index ed58da3..883c731 100644 --- a/man/rmult.bcl.Rd +++ 
b/man/rmult.bcl.Rd @@ -101,12 +101,12 @@ xdata <- data.frame(x1, x2) equicorrelation_matrix <- toeplitz(c(1, rep(0.95, cluster_size - 1))) identity_matrix <- diag(categories_no) latent_correlation_matrix <- kronecker(equicorrelation_matrix, identity_matrix) # nolint -simulated_nominal_responses <- rmult.bcl(clsize = cluster_size, +simulated_nominal_dataset <- rmult.bcl(clsize = cluster_size, ncategories = categories_no, betas = betas, xformula = ~ x1 + x2, xdata = xdata, cor.matrix = latent_correlation_matrix) suppressPackageStartupMessages(library('multgee')) nominal_gee_model <- nomLORgee(y ~ x1 + x2, - data = simulated_nominal_responses$simdata, id = id, repeated = time, + data = simulated_nominal_dataset$simdata, id = id, repeated = time, LORstr = 'time.exch') round(coef(nominal_gee_model), 2) diff --git a/man/rmult.clm.Rd b/man/rmult.clm.Rd index fe877a2..cd1111b 100644 --- a/man/rmult.clm.Rd +++ b/man/rmult.clm.Rd @@ -101,21 +101,21 @@ beta_intercepts <- c(-1.5, -0.5, 0.5, 1.5) beta_coefficients <- matrix(c(1, 2, 3, 4), 4, 1) x <- rep(rnorm(sample_size), each = cluster_size) latent_correlation_matrix <- toeplitz(c(1, 0.85, 0.5, 0.15)) -simulated_ordinal_responses <- rmult.clm(clsize = cluster_size, +simulated_ordinal_dataset <- rmult.clm(clsize = cluster_size, intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x, cor.matrix = latent_correlation_matrix, link = 'probit') -head(simulated_ordinal_responses$simdata, n = 8) +head(simulated_ordinal_dataset$simdata, n = 8) ## Same sampling scheme except that the parameter vector is time-stationary. set.seed(12345) -simulated_ordinal_responses <- rmult.clm(clsize = cluster_size, betas = 1, +simulated_ordinal_dataset <- rmult.clm(clsize = cluster_size, betas = 1, xformula = ~ x, cor.matrix = latent_correlation_matrix, intercepts = beta_intercepts, link = 'probit') ## Fit a GEE model (Touloumis et al., 2013) to estimate the regression ## coefficients. 
library(multgee) ordinal_gee_model <- ordLORgee(y ~ x, id = id, repeated = time, - link = 'probit', data = simulated_ordinal_responses$simdata) + link = 'probit', data = simulated_ordinal_dataset$simdata) coef(ordinal_gee_model) } diff --git a/man/rmult.crm.Rd b/man/rmult.crm.Rd index 66ea8eb..ac30efa 100644 --- a/man/rmult.crm.Rd +++ b/man/rmult.crm.Rd @@ -111,10 +111,10 @@ equicorrelation_matrix <- toeplitz(c(0, rep(0.24, categories_no - 2))) ones_matrix <- matrix(1, cluster_size, cluster_size) latent_correlation_matrix <- identity_matrix + kronecker(equicorrelation_matrix, ones_matrix) -CorOrdRes <- rmult.crm(clsize = cluster_size, intercepts = beta_intercepts, - betas = beta_coefficients, xformula = ~ x, -cor.matrix = latent_correlation_matrix, link = 'probit') -head(CorOrdRes$Ysim) +simulated_ordinal_dataset <- rmult.crm(clsize = cluster_size, + intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x, + cor.matrix = latent_correlation_matrix, link = 'probit') +head(simulated_ordinal_dataset$Ysim) } \references{ diff --git a/tests/testthat/test_core_functions.R b/tests/testthat/test_core_functions.R index 71547d8..d07a78a 100644 --- a/tests/testthat/test_core_functions.R +++ b/tests/testthat/test_core_functions.R @@ -6,14 +6,14 @@ test_that("rbin constant beta_intercepts", { beta_coefficients <- 0.2 latent_correlation_matrix <- toeplitz(c(1, 0.9, 0.9, 0.9)) x <- rep(rnorm(sample_size), each = cluster_size) - simulated_binary_responses <- + simulated_binary_dataset <- rbin(clsize = cluster_size, intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x, cor.matrix = latent_correlation_matrix, link = "probit") - y_sim <- - as.numeric(c(t(simulated_binary_responses$rlatent)) <= + simulated_responses <- + as.numeric(c(t(simulated_binary_dataset$rlatent)) <= beta_intercepts + beta_coefficients * x) - expect_equal(c(t(simulated_binary_responses$Ysim)), y_sim) + expect_equal(c(t(simulated_binary_dataset$Ysim)), simulated_responses) }) 
test_that("rbin varying beta_intercepts", { @@ -24,15 +24,15 @@ test_that("rbin varying beta_intercepts", { beta_coefficients <- 0.2 latent_correlation_matrix <- toeplitz(c(1, 0.9, 0.9, 0.9)) x <- rep(rnorm(sample_size), each = cluster_size) - simulated_binary_responses <- + simulated_binary_dataset <- rbin(clsize = cluster_size, intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x, cor.matrix = latent_correlation_matrix, link = "probit") beta_intercepts <- rep(beta_intercepts, sample_size) - y_sim <- - as.numeric(c(t(simulated_binary_responses$rlatent)) <= + simulated_responses <- + as.numeric(c(t(simulated_binary_dataset$rlatent)) <= beta_intercepts + beta_coefficients * x) - expect_equal(c(t(simulated_binary_responses$Ysim)), y_sim) + expect_equal(c(t(simulated_binary_dataset$Ysim)), simulated_responses) }) @@ -48,7 +48,7 @@ test_that("rmult.bcl constant beta_coefficients", { latent_correlation_matrix <- kronecker( toeplitz(c(1, rep(0.95, cluster_size - 1))), diag(categories_no) ) - simulated_nominal_responses <- + simulated_nominal_dataset <- rmult.bcl(clsize = cluster_size, ncategories = categories_no, betas = beta_coefficients, xformula = ~ x1 + x2, xdata = xdata, cor.matrix = latent_correlation_matrix) @@ -56,10 +56,10 @@ test_that("rmult.bcl constant beta_coefficients", { xmat <- apply(xmat, 2, function(x) rep(x, each = categories_no)) lin_pred <- matrix(beta_coefficients, nrow = nrow(xmat), ncol = ncol(xmat), byrow = TRUE) * xmat - lin_pred <- rowSums(lin_pred) + c(t(simulated_nominal_responses$rlatent)) + lin_pred <- rowSums(lin_pred) + c(t(simulated_nominal_dataset$rlatent)) lin_pred <- matrix(lin_pred, sample_size * cluster_size, categories_no, TRUE) - y_sim <- apply(lin_pred, 1, which.max) - expect_equal(c(t(simulated_nominal_responses$Ysim)), y_sim) + simulated_responses <- apply(lin_pred, 1, which.max) + expect_equal(c(t(simulated_nominal_dataset$Ysim)), simulated_responses) }) @@ -78,7 +78,7 @@ test_that("rmult.bcl varying 
beta_coefficients", { xdata <- data.frame(x1, x2) latent_correlation_matrix <- kronecker( toeplitz(c(1, rep(0.95, cluster_size - 1))), diag(categories_no)) - simulated_nominal_responses <- rmult.bcl(clsize = cluster_size, + simulated_nominal_dataset <- rmult.bcl(clsize = cluster_size, ncategories = categories_no, betas = beta_coefficients, xformula = ~ x1 + x2, xdata = xdata, @@ -88,10 +88,10 @@ test_that("rmult.bcl varying beta_coefficients", { xmat <- apply(xmat, 2, function(x) rep(x, each = categories_no)) lin_pred <- matrix(c(t(beta_coefficients)), nrow = nrow(xmat), ncol = ncol(xmat), byrow = TRUE) * xmat - lin_pred <- rowSums(lin_pred) + c(t(simulated_nominal_responses$rlatent)) + lin_pred <- rowSums(lin_pred) + c(t(simulated_nominal_dataset$rlatent)) lin_pred <- matrix(lin_pred, sample_size * cluster_size, categories_no, TRUE) - y_sim <- apply(lin_pred, 1, which.max) - expect_equal(c(t(simulated_nominal_responses$Ysim)), y_sim) + simulated_responses <- apply(lin_pred, 1, which.max) + expect_equal(c(t(simulated_nominal_dataset$Ysim)), simulated_responses) }) @@ -104,14 +104,16 @@ test_that("rmult.clm varying beta_coefficients", { beta_coefficients <- matrix(c(1, 2, 3, 4), 4, 1) x <- rep(rnorm(sample_size), each = cluster_size) latent_correlation_matrix <- toeplitz(c(1, 0.85, 0.5, 0.15)) - simulated_ordinal_responses <- + simulated_ordinal_dataset <- rmult.clm(clsize = cluster_size, intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x, cor.matrix = latent_correlation_matrix, link = "probit") - u_sim <- c(t(simulated_ordinal_responses$rlatent)) - c(beta_coefficients) * x + simulated_latent_responses <- + c(t(simulated_ordinal_dataset$rlatent)) - c(beta_coefficients) * x beta_intercepts <- c(-Inf, beta_intercepts, Inf) - y_sim <- cut(u_sim, beta_intercepts, labels = FALSE) - expect_equal(c(t(simulated_ordinal_responses$Ysim)), y_sim) + simulated_responses <- cut(simulated_latent_responses, beta_intercepts, + labels = FALSE) + 
expect_equal(c(t(simulated_ordinal_dataset$Ysim)), simulated_responses) }) @@ -124,14 +126,16 @@ test_that("rmult.clm constant beta_coefficients", { beta_coefficients <- 1 x <- rep(rnorm(sample_size), each = cluster_size) latent_correlation_matrix <- toeplitz(c(1, 0.85, 0.5, 0.15)) - simulated_ordinal_responses <- + simulated_ordinal_dataset <- rmult.clm(clsize = cluster_size, intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x, cor.matrix = latent_correlation_matrix, link = "probit") - u_sim <- c(t(simulated_ordinal_responses$rlatent)) - c(beta_coefficients) * x + simulated_latent_responses <- + c(t(simulated_ordinal_dataset$rlatent)) - c(beta_coefficients) * x beta_intercepts <- c(-Inf, beta_intercepts, Inf) - y_sim <- cut(u_sim, beta_intercepts, labels = FALSE) - expect_equal(c(t(simulated_ordinal_responses$Ysim)), y_sim) + simulated_responses <- cut(simulated_latent_responses, beta_intercepts, + labels = FALSE) + expect_equal(c(t(simulated_ordinal_dataset$Ysim)), simulated_responses) }) @@ -147,7 +151,7 @@ test_that("rmult.acl constant beta_coefficients", { set.seed(1) latent_correlation_matrix <- kronecker(toeplitz(c(1, rep(0.95, cluster_size - 1))), diag(4)) - simulated_ordinal_responses <- + simulated_ordinal_dataset <- rmult.acl(clsize = cluster_size, intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x1 + x2, xdata = xdata, cor.matrix = latent_correlation_matrix) @@ -158,12 +162,12 @@ test_that("rmult.acl constant beta_coefficients", { beta_intercepts[3], 1, 1, beta_intercepts[4], 0, 0) set.seed(1) - simulated_nominal_responses <- + simulated_nominal_dataset <- rmult.bcl(clsize = cluster_size, ncategories = 4, betas = beta_coefficients_bcl, xformula = ~ x1 + x2, xdata = xdata, cor.matrix = latent_correlation_matrix) - expect_equal(c(t(simulated_ordinal_responses$Ysim)), - c(t(simulated_nominal_responses$Ysim))) + expect_equal(c(t(simulated_ordinal_dataset$Ysim)), + c(t(simulated_nominal_dataset$Ysim))) }) @@ -178,17 
+182,19 @@ test_that("rmult.crm constant beta_coefficients", { diag(1, (categories_no - 1) * cluster_size) + kronecker(toeplitz(c(0, rep(0.24, categories_no - 2))), matrix(1, cluster_size, cluster_size)) - simulated_ordinal_responses <- + simulated_ordinal_dataset <- rmult.crm(clsize = cluster_size, intercepts = beta_intercepts, betas = beta_coefficients, xformula = ~ x, cor.matrix = latent_correlation_matrix, link = "probit") - u_sim <- c(t(simulated_ordinal_responses$rlatent)) - + simulated_latent_responses <- + c(t(simulated_ordinal_dataset$rlatent)) - rep(x, each = categories_no - 1) - y_sim <- - matrix(as.numeric(t(u_sim <= beta_intercepts)), sample_size * cluster_size, - categories_no - 1, TRUE) + simulated_responses <- + matrix(as.numeric(t(simulated_latent_responses <= beta_intercepts)), + sample_size * cluster_size, categories_no - 1, TRUE) for (i in 1:(categories_no - 1)) - y_sim[, i] <- ifelse(y_sim[, i] == 1, i, categories_no) - y_sim <- apply(y_sim, 1, min) - expect_equal(c(t(simulated_ordinal_responses$Ysim)), y_sim) + simulated_responses[, i] <- + ifelse(simulated_responses[, i] == 1, i, categories_no) + simulated_responses <- apply(simulated_responses, 1, min) + expect_equal(c(t(simulated_ordinal_dataset$Ysim)), simulated_responses) }) diff --git a/tests/testthat/test_internal_functions.R b/tests/testthat/test_internal_functions.R new file mode 100644 index 0000000..4199af9 --- /dev/null +++ b/tests/testthat/test_internal_functions.R @@ -0,0 +1,56 @@ +check_cluster_size + +test_that("checking cluster size", { + expect_silent(check_cluster_size(2)) + expect_error(check_cluster_size(1)) + expect_error(check_cluster_size(2.5)) +}) + +test_that("checking categories", { + expect_silent(check_ncategories(3)) + expect_error(check_ncategories(2)) + expect_error(check_ncategories(2.5)) +}) + + +test_that("creating marginal distributions", { + expect_equal(create_distribution("probit"), "qnorm") + expect_equal(create_distribution("logit"), "qlogis" ) + 
expect_equal(create_distribution("cloglog"), "qgumbel") + expect_equal(create_distribution("cauchit"), "qcauchy") +}) + + + +test_that("checking correlation matrix", { + cluster_size <- 5 + categories_no <- 3 + correlation_matrix_1 <- matrix(1, cluster_size, cluster_size) + correlation_matrix_2 <- matrix(1, cluster_size * categories_no, + cluster_size * categories_no) + correlation_matrix_3 <- matrix(1, cluster_size * (categories_no - 1), + cluster_size * (categories_no - 1)) + expect_error(check_correlation_matrix(correlation_matrix_1, cluster_size, + rfctn = "rbin")) + expect_error(check_correlation_matrix(correlation_matrix_1, cluster_size, + rfctn = "rmult.clm")) + expect_error(check_correlation_matrix(correlation_matrix_2, cluster_size, + rfctn = "rmult.bcl", categories_no)) + expect_error(check_correlation_matrix(correlation_matrix_3, cluster_size, + rfctn = "rmult.crm", categories_no)) + correlation_matrix_4 <- diag(1, cluster_size) + correlation_matrix_5 <- diag(1, cluster_size * categories_no) + correlation_matrix_6 <- diag(1, cluster_size * (categories_no - 1)) + expect_equal(check_correlation_matrix(correlation_matrix_4, cluster_size, + rfctn = "rbin"), + correlation_matrix_4) + expect_equal(check_correlation_matrix(correlation_matrix_4, cluster_size, + rfctn = "rmult.clm"), + correlation_matrix_4) + expect_equal(check_correlation_matrix(correlation_matrix_5, cluster_size, + rfctn = "rmult.bcl", categories_no), + correlation_matrix_5) + expect_equal(check_correlation_matrix(correlation_matrix_6, cluster_size, + rfctn = "rmult.crm", categories_no), + correlation_matrix_6) +}) diff --git a/tests/testthat/test_utility_functions.R b/tests/testthat/test_utility_functions.R index c749157..e3e02c3 100644 --- a/tests/testthat/test_utility_functions.R +++ b/tests/testthat/test_utility_functions.R @@ -16,10 +16,10 @@ test_that("rnorta", { test_that("rsmvnorm", { set.seed(1) - sample_size <- 100 # nolint - correlation_matrix <- toeplitz(c(1, 0.4)) # nolint + 
sample_size <- 100 + correlation_matrix <- toeplitz(c(1, 0.4)) sim_bivariate_normal <- rsmvnorm(R = sample_size, - cor.matrix = correlation_matrix) + cor.matrix = correlation_matrix) set.seed(1) p <- ncol(correlation_matrix) raw_code <- matrix(rnorm(sample_size * p), sample_size, p) %*% @@ -29,15 +29,13 @@ test_that("rsmvnorm", { test_that("rnorta sample size", { - R <- 0 - expect_error(if (all.equal(R, as.integer(R)) != TRUE | R < 1) - stop("'R' must be a positive integer")) - R <- 3.4 - expect_error( - if (all.equal(R, as.integer(R)) != TRUE | R < 1) - stop("'R' must be a positive integer")) - R <- -3 - expect_error( - if (all.equal(R, as.integer(R)) != TRUE | R < 1) - stop("'R' must be a positive integer")) + sample_size <- 0 + expect_true(all.equal(sample_size, as.integer(sample_size)) != TRUE | + sample_size < 1) + sample_size <- 3.4 + expect_true(all.equal(sample_size, as.integer(sample_size)) != TRUE | + sample_size < 1) + sample_size <- -3 + expect_true(all.equal(sample_size, as.integer(sample_size)) != TRUE | + sample_size < 1) }) diff --git a/vignettes/SimCorMultRes.R b/vignettes/SimCorMultRes.R deleted file mode 100644 index df3b42c..0000000 --- a/vignettes/SimCorMultRes.R +++ /dev/null @@ -1,227 +0,0 @@ -## ---- echo = FALSE------------------------------------------------------- -knitr::opts_chunk$set( - tidy = TRUE, - collapse = TRUE, - comment = "#>" - ) - -## ---- tidy=TRUE---------------------------------------------------------- -# parameter vector -betas <- c(1, 3, 2, 1.25, 3.25, 1.75, 0.75, 2.75, 2.25, 0, 0, 0) -# sample size -sample_size <- 500 -# number of nominal response categories -categories_no <- 4 -# cluster size -cluster_size <- 3 -set.seed(1) -# time-stationary covariate x_{i1} -x1 <- rep(rnorm(sample_size), each = cluster_size) -# time-varying covariate x_{it2} -x2 <- rnorm(sample_size * cluster_size) -# create covariates dataframe -xdata <- data.frame(x1, x2) -set.seed(321) -library(SimCorMultRes) -# latent correlation matrix for the 
NORTA method -equicorrelation_matrix <- toeplitz(c(1, rep(0.95,cluster_size - 1))) -identity_matrix <- diag(categories_no) -latent_correlation_matrix <- kronecker(equicorrelation_matrix, identity_matrix) -# simulation of clustered nominal responses -simulated_nominal_responses <- rmult.bcl(clsize = cluster_size, - ncategories = categories_no, - betas = betas, xformula = ~ x1 + x2, - xdata = xdata, - cor.matrix = latent_correlation_matrix) -suppressPackageStartupMessages(library("multgee")) -# fitting a GEE model -nominal_gee_model <- nomLORgee(y ~ x1 + x2, - data = simulated_nominal_responses$simdata, id = id, - repeated = time, LORstr="time.exch") -# checking regression coefficients -round(coef(nominal_gee_model), 2) - -## ------------------------------------------------------------------------ -set.seed(12345) -# sample size -sample_size <- 500 -# cluster size -cluster_size <- 4 -# category-specific intercepts -beta_intercepts <- c(-1.5, -0.5, 0.5, 1.5) -# time-varying regression parameters associated with covariates -beta_coefficients <- matrix(c(1, 2, 3, 4), 4, 1) -# time-stationary covariate -x <- rep(rnorm(sample_size), each = cluster_size) -# latent correlation matrix for the NORTA method -latent_correlation_matrix <- toeplitz(c(1, 0.85, 0.5, 0.15)) -# simulation of ordinal responses -simulated_ordinal_responses <- rmult.clm(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, - xformula = ~ x, - cor.matrix = latent_correlation_matrix, - link = "probit") -# first eight rows of the simulated dataframe -head(simulated_ordinal_responses$simdata, n = 8) - -## ------------------------------------------------------------------------ -set.seed(1) -# sample size -sample_size <- 500 -# cluster size -cluster_size <- 4 -# category-specific intercepts -beta_intercepts <- c(-1.5, -0.5, 0.5, 1.5) -# regression parameters associated with covariates -beta_coefficients <- 1 -# time-varying covariate -x <- rnorm(sample_size * cluster_size) -# 
number of ordinal response categories -categories_no <- 5 -# correlation matrix for the NORTA method -latent_correlation_matrix <- diag(1, (categories_no - 1) * cluster_size) + - kronecker(toeplitz(c(0, rep(0.24, categories_no - 2))), matrix(1, cluster_size, cluster_size)) -# simulation of ordinal responses -simulated_ordinal_responses <- rmult.crm(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, - xformula = ~ x, - cor.matrix = latent_correlation_matrix, - link = "probit") -# first six clusters with ordinal responses -head(simulated_ordinal_responses$Ysim) - -## ---- tidy=TRUE---------------------------------------------------------- -# intercepts -beta_intercepts <- c(3, 2, 1) -# parameter vector -beta_coefficients <- c(1, 1) -# sample size -sample_size <- 500 -# cluster size -cluster_size <- 3 -set.seed(321) -# time-stationary covariate x_{i1} -x1 <- rep(rnorm(sample_size), each = cluster_size) -# time-varying covariate x_{it2} -x2 <- rnorm(sample_size * cluster_size) -# create covariates dataframe -xdata <- data.frame(x1, x2) -# correlation matrix for the NORTA method -equicorrelation_matrix <- toeplitz(c(1, rep(0.95, cluster_size - 1))) -identity_matrix <- diag(4) -latent_correlation_matrix <- kronecker(equicorrelation_matrix, identity_matrix) -# simulation of clustered ordinal responses -simulated_ordinal_responses <- rmult.acl(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, - xformula = ~ x1 + x2, xdata = xdata, - cor.matrix = latent_correlation_matrix) -suppressPackageStartupMessages(library("multgee")) -# fitting a GEE model -ordinal_gee_model <- ordLORgee(y ~ x1 + x2, - data = simulated_ordinal_responses$simdata, - id = id, repeated = time, LORstr = "time.exch", - link = "acl") -# checking regression coefficients -round(coef(ordinal_gee_model), 2) - -## ------------------------------------------------------------------------ -set.seed(123) -# sample size -sample_size <- 100 -# cluster 
size -cluster_size <- 4 -# intercept -beta_intercepts <- 0 -# regression parameter associated with the covariate -beta_coefficients <- 0.2 -# correlation matrix for the NORTA method -latent_correlation_matrix <- toeplitz(c(1, 0.9, 0.9, 0.9)) -# time-stationary covariate -x <- rep(rnorm(sample_size), each = cluster_size) -# simulation of clustered binary responses -simulated_binary_responses <- rbin(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, xformula = ~ x, - cor.matrix = latent_correlation_matrix, - link = "probit") -library(gee) -# fitting a GEE model -binary_gee_model <- gee(y ~ x, family = binomial("probit"), id = id, - data = simulated_binary_responses$simdata) -# checking the estimated coefficients -summary(binary_gee_model)$coefficients - -## ------------------------------------------------------------------------ -set.seed(8) -# simulation of epsilon variables -library(evd) -simulated_latent_variables1 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), - model = "log", d = cluster_size) -simulated_latent_variables2 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), - model = "log", d = cluster_size) -simulated_latent_variables <- simulated_latent_variables1 - - simulated_latent_variables2 -# simulation of clustered binary responses -simulated_binary_responses <- rbin(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, xformula = ~ x, - rlatent = simulated_latent_variables) -# fitting a GEE model -binary_gee_model <- gee(y ~ x, family = binomial("logit"), id = id, - data = simulated_binary_responses$simdata) -# checking the estimated coefficients -summary(binary_gee_model)$coefficients - -## ------------------------------------------------------------------------ -set.seed(123) -# sample size -sample_size <- 5000 -# cluster size -cluster_size <- 4 -# intercept -beta_intercepts <- qnorm(0.8) -# pseudo-covariate -x <- rep(0, each = cluster_size * sample_size) -# regression parameter associated with the 
covariate -beta_coefficients <- 0 -# correlation matrix for the NORTA method -latent_correlation_matrix <- diag(cluster_size) -# simulation of clustered binary responses -simulated_binary_responses <- rbin(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, - xformula = ~x, - cor.matrix = latent_correlation_matrix, - link = "probit") -library(gee) -# simulated marginal probabilities -colMeans(simulated_binary_responses$Ysim) - -## ---- tidy=TRUE---------------------------------------------------------- -# sample size -sample_size <- 5000 -# cluster size -cluster_size <- 3 -# pseudo-covariate -x <- rep(0, each = cluster_size * sample_size) -# parameter vector -betas <- c(log(0.1/0.4), 0, log(0.2/0.4), 0, log(0.3/0.4), 0, 0, 0) -# number of nominal response categories -categories_no <- 4 -set.seed(1) -# correlation matrix for the NORTA method -latent_correlation_matrix <- kronecker(diag(cluster_size), diag(categories_no)) -# simulation of clustered nominal responses -simulated_nominal_responses <- rmult.bcl(clsize = cluster_size, - ncategories = categories_no, - betas = betas, xformula = ~ x, - cor.matrix = latent_correlation_matrix) -# simulated marginal probabilities -apply(simulated_nominal_responses$Ysim, 2, table) / sample_size - -## ---- comment=""--------------------------------------------------------- -citation("SimCorMultRes") - diff --git a/vignettes/SimCorMultRes.Rmd b/vignettes/SimCorMultRes.Rmd index ccb6463..f0d4867 100644 --- a/vignettes/SimCorMultRes.Rmd +++ b/vignettes/SimCorMultRes.Rmd @@ -91,7 +91,7 @@ equicorrelation_matrix <- toeplitz(c(1, rep(0.95,cluster_size - 1))) identity_matrix <- diag(categories_no) latent_correlation_matrix <- kronecker(equicorrelation_matrix, identity_matrix) # simulation of clustered nominal responses -simulated_nominal_responses <- rmult.bcl(clsize = cluster_size, +simulated_nominal_dataset <- rmult.bcl(clsize = cluster_size, ncategories = categories_no, betas = betas, xformula = ~ x1 + 
x2, xdata = xdata, @@ -99,7 +99,7 @@ simulated_nominal_responses <- rmult.bcl(clsize = cluster_size, suppressPackageStartupMessages(library("multgee")) # fitting a GEE model nominal_gee_model <- nomLORgee(y ~ x1 + x2, - data = simulated_nominal_responses$simdata, id = id, + data = simulated_nominal_dataset$simdata, id = id, repeated = time, LORstr="time.exch") # checking regression coefficients round(coef(nominal_gee_model), 2) @@ -155,14 +155,14 @@ x <- rep(rnorm(sample_size), each = cluster_size) # latent correlation matrix for the NORTA method latent_correlation_matrix <- toeplitz(c(1, 0.85, 0.5, 0.15)) # simulation of ordinal responses -simulated_ordinal_responses <- rmult.clm(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, - xformula = ~ x, - cor.matrix = latent_correlation_matrix, - link = "probit") +simulated_ordinal_dataset <- rmult.clm(clsize = cluster_size, + intercepts = beta_intercepts, + betas = beta_coefficients, + xformula = ~ x, + cor.matrix = latent_correlation_matrix, + link = "probit") # first eight rows of the simulated dataframe -head(simulated_ordinal_responses$simdata, n = 8) +head(simulated_ordinal_dataset$simdata, n = 8) ``` ### Marginal continuation-ratio model @@ -215,14 +215,14 @@ categories_no <- 5 latent_correlation_matrix <- diag(1, (categories_no - 1) * cluster_size) + kronecker(toeplitz(c(0, rep(0.24, categories_no - 2))), matrix(1, cluster_size, cluster_size)) # simulation of ordinal responses -simulated_ordinal_responses <- rmult.crm(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, - xformula = ~ x, - cor.matrix = latent_correlation_matrix, - link = "probit") +simulated_ordinal_dataset <- rmult.crm(clsize = cluster_size, + intercepts = beta_intercepts, + betas = beta_coefficients, + xformula = ~ x, + cor.matrix = latent_correlation_matrix, + link = "probit") # first six clusters with ordinal responses -head(simulated_ordinal_responses$Ysim) 
+head(simulated_ordinal_dataset$Ysim) ``` ### Marginal adjacent-category logit model @@ -282,15 +282,15 @@ equicorrelation_matrix <- toeplitz(c(1, rep(0.95, cluster_size - 1))) identity_matrix <- diag(4) latent_correlation_matrix <- kronecker(equicorrelation_matrix, identity_matrix) # simulation of clustered ordinal responses -simulated_ordinal_responses <- rmult.acl(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, - xformula = ~ x1 + x2, xdata = xdata, - cor.matrix = latent_correlation_matrix) +simulated_ordinal_dataset <- rmult.acl(clsize = cluster_size, + intercepts = beta_intercepts, + betas = beta_coefficients, + xformula = ~ x1 + x2, xdata = xdata, + cor.matrix = latent_correlation_matrix) suppressPackageStartupMessages(library("multgee")) # fitting a GEE model ordinal_gee_model <- ordLORgee(y ~ x1 + x2, - data = simulated_ordinal_responses$simdata, + data = simulated_ordinal_dataset$simdata, id = id, repeated = time, LORstr = "time.exch", link = "acl") # checking regression coefficients @@ -348,15 +348,15 @@ latent_correlation_matrix <- toeplitz(c(1, 0.9, 0.9, 0.9)) # time-stationary covariate x <- rep(rnorm(sample_size), each = cluster_size) # simulation of clustered binary responses -simulated_binary_responses <- rbin(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, xformula = ~ x, - cor.matrix = latent_correlation_matrix, - link = "probit") +simulated_binary_dataset <- rbin(clsize = cluster_size, + intercepts = beta_intercepts, + betas = beta_coefficients, xformula = ~ x, + cor.matrix = latent_correlation_matrix, + link = "probit") library(gee) # fitting a GEE model binary_gee_model <- gee(y ~ x, family = binomial("probit"), id = id, - data = simulated_binary_responses$simdata) + data = simulated_binary_dataset$simdata) # checking the estimated coefficients summary(binary_gee_model)$coefficients ``` @@ -384,13 +384,13 @@ simulated_latent_variables2 <- rmvevd(sample_size, dep = sqrt(1 - 
0.9), simulated_latent_variables <- simulated_latent_variables1 - simulated_latent_variables2 # simulation of clustered binary responses -simulated_binary_responses <- rbin(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, xformula = ~ x, - rlatent = simulated_latent_variables) +simulated_binary_dataset <- rbin(clsize = cluster_size, + intercepts = beta_intercepts, + betas = beta_coefficients, xformula = ~ x, + rlatent = simulated_latent_variables) # fitting a GEE model binary_gee_model <- gee(y ~ x, family = binomial("logit"), id = id, - data = simulated_binary_responses$simdata) + data = simulated_binary_dataset$simdata) # checking the estimated coefficients summary(binary_gee_model)$coefficients ``` @@ -429,15 +429,14 @@ beta_coefficients <- 0 # correlation matrix for the NORTA method latent_correlation_matrix <- diag(cluster_size) # simulation of clustered binary responses -simulated_binary_responses <- rbin(clsize = cluster_size, - intercepts = beta_intercepts, - betas = beta_coefficients, - xformula = ~x, - cor.matrix = latent_correlation_matrix, - link = "probit") +simulated_binary_dataset <- rbin(clsize = cluster_size, + intercepts = beta_intercepts, + betas = beta_coefficients, xformula = ~x, + cor.matrix = latent_correlation_matrix, + link = "probit") library(gee) # simulated marginal probabilities -colMeans(simulated_binary_responses$Ysim) +colMeans(simulated_binary_dataset$Ysim) ``` \textbf{ } @@ -461,12 +460,12 @@ set.seed(1) # correlation matrix for the NORTA method latent_correlation_matrix <- kronecker(diag(cluster_size), diag(categories_no)) # simulation of clustered nominal responses -simulated_nominal_responses <- rmult.bcl(clsize = cluster_size, - ncategories = categories_no, - betas = betas, xformula = ~ x, - cor.matrix = latent_correlation_matrix) +simulated_nominal_dataset <- rmult.bcl(clsize = cluster_size, + ncategories = categories_no, + betas = betas, xformula = ~ x, + cor.matrix = 
latent_correlation_matrix) # simulated marginal probabilities -apply(simulated_nominal_responses$Ysim, 2, table) / sample_size +apply(simulated_nominal_dataset$Ysim, 2, table) / sample_size ``` # How to Cite ```{r, comment=""} diff --git a/vignettes/SimCorMultRes.html b/vignettes/SimCorMultRes.html deleted file mode 100644 index 94d0c5c..0000000 --- a/vignettes/SimCorMultRes.html +++ /dev/null @@ -1,934 +0,0 @@ - - - - - - - - - - - - - - - - -Simulating Correlated Binary and Multinomial Responses with SimCorMultRes - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - -
-

1 Introduction

-

The R package SimCorMultRes is suitable for simulation of correlated binary responses (exactly two response categories) and of correlated nominal or ordinal multinomial responses (three or more response categories) conditional on a regression model specification for the marginal probabilities of the response categories. This vignette briefly describes the simulation methods proposed by Touloumis (2016) and illustrates the use of the core functions of SimCorMultRes. A more detailed description of SimCorMultRes can be found in Touloumis (2016).

-
-
-

2 Areas of Applications

-

This package was created to facilitate the task of carrying out simulation studies and evaluating the performance of statistical methods for estimating the regression parameters in a marginal model with clustered binary and multinomial responses. Examples of such statistical methods include maximum likelihood methods, copula approaches, quasi-least squares approaches, generalized quasi-likelihood methods and generalized estimating equations (GEE) approaches among others (see references in Touloumis 2016).

-

In addition, SimCorMultRes can be used to generate correlated binary and multinomial random variables conditional on a desired dependence structure and known marginal probabilities even if these are not determined by a regression model (see third example in Touloumis 2016), or to explore approximations of association measures for discrete variables that arise as realizations of an underlying continuum (see second example in Touloumis 2016).

-
-
-

3 Simulation Methods

-

Let \(Y_{it}\) be the binary or multinomial response for subject \(i\) (\(i=1,\ldots,N\)) at measurement occasion \(t\) (\(t=1,\ldots,T\)), and let \(\mathbf {x}_{it}\) be the associated covariates vector. We assume that \(Y_{it} \in \{0,1\}\) for binary responses and \(Y_{it} \in \{1,2,\ldots,J\geq 3\}\) for multinomial responses.

-
-

3.1 Correlated nominal responses

-

The function rmult.bcl simulates nominal responses under the marginal baseline-category logit model -\[\begin{equation} -\log \left[\frac{\Pr(Y_{it}=j |\mathbf {x}_{it})}{\Pr(Y_{it}=J |\mathbf {x}_{it})}\right]=(\beta_{tj0}-\beta_{tJ0})+(\boldsymbol {\beta}_{tj}-\boldsymbol{\beta}_{tJ})^{\prime} \mathbf {x}_{it}=\beta^{\ast}_{tj0}+\boldsymbol{\beta}^{\ast\prime}_{tj}\mathbf {x}_{it}, -\tag{3.1} -\end{equation}\] -where \(\beta_{tj0}\) is the \(j\)-th category-specific intercept at measurement occasion \(t\) and \(\boldsymbol{\beta}_{tj}\) is the \(j\)-th category-specific parameter vector associated with the covariates at measurement occasion \(t\). The popular identifiability constraints \(\beta_{tJ0}=0\) and \(\boldsymbol{\beta}_{tJ}=\mathbf {0}\) for all \(t\), imply that \(\beta^{\ast}_{tj0}=\beta_{tj0}\) and \(\boldsymbol {\beta}^{\ast}_{tj}=\boldsymbol{\beta}_{tj}\) for all \(t=1,\ldots,T\) and \(j=1,\ldots,J-1\). The threshold -\[Y_{it}=j \Leftrightarrow U^{NO}_{itj}=\max \{U^{NO}_{it1},\ldots,U^{NO}_{itJ}\}\] -generates clustered nominal responses that satisfy the marginal baseline-category logit model (3.1), where -\[U^{NO}_{itj}=\beta_{tj0}+\boldsymbol{\beta}_{tj}^{\prime} \mathbf {x}_{it}+e^{NO}_{itj},\] -and where the random variables \(\{e^{NO}_{itj}:i=1,\ldots,N \text{, } t=1,\ldots,T \text{ and } j=1,\ldots,J\}\) satisfy the following conditions:

-
    -
  1. \(e^{NO}_{itj}\) follows the standard extreme value distribution for all \(i\), \(t\) and \(j\) (mean \(=\gamma \approx 0.5772\), where \(\gamma\) is Euler’s constant, and variance \(=\pi^2/6\)).
  2. -
  3. \(e^{NO}_{i_1t_1j_1}\) and \(e^{NO}_{i_2t_2j_2}\) are independent random variables provided that \(i_1 \neq i_2\).
  4. -
  5. \(e^{NO}_{itj_1}\) and \(e^{NO}_{itj_2}\) are independent random variables provided that \(j_1\neq j_2\).
  6. -
-

For each subject \(i\), the association structure among the clustered nominal responses \(\{Y_{it}:t=1,\ldots,T\}\) depends on the joint distribution and correlation matrix of \(\{e^{NO}_{itj}:t=1,\ldots,T \text{ and } j=1,\ldots,J\}\). If the random variables \(\{e^{NO}_{itj}:t=1,\ldots,T \text{ and } j=1,\ldots,J\}\) are independent then so are \(\{Y_{it}:t=1,\ldots,T\}\). -

- -
-Example 3.1 (Simulation of clustered nominal responses using the NORTA method) Suppose the aim is to simulate nominal responses from the marginal baseline-category logit model -\[\begin{equation*} -\log \left[\frac{\Pr(Y_{it}=j |\mathbf {x}_{it})}{\Pr(Y_{it}=4 |\mathbf {x}_{it})}\right]=\beta_{j0}+ \beta_{j1} {x}_{i1}+ \beta_{j2} {x}_{it2} \end{equation*}\] -where \(N=500\), \(T=3\), \((\beta_{10},\beta_{11},\beta_{12},\beta_{20},\beta_{21},\beta_{22},\beta_{30},\beta_{31},\beta_{32})=(1, 3, 2, 1.25, 3.25, 1.75, 0.75, 2.75, 2.25)\) and \(\mathbf {x}_{it}=(x_{i1},x_{it2})^{\prime}\) for all \(i\) and \(t\), with \(x_{i1}\overset{iid}{\sim} N(0,1)\) and \(x_{it2}\overset{iid}{\sim} N(0,1)\). For the dependence structure, suppose that the correlation matrix \(\mathbf{R}\) in the NORTA method has elements -\[ -\mathbf{R}_{t_1j_1,t_2j_2}=\begin{cases} -1 & \text{if } t_1=t_2 \text{ and } j_1=j_2\\ -0.95 & \text{if } t_1 \neq t_2 \text{ and } j_1=j_2\\ -0 & \text{otherwise }\\ -\end{cases} -\] -for all \(i=1,\ldots,500\). -
- -
# parameter vector
-betas <- c(1, 3, 2, 1.25, 3.25, 1.75, 0.75, 2.75, 2.25, 0, 0, 0)
-# sample size
-sample_size <- 500
-# number of nominal response categories
-categories_no <- 4
-# cluster size
-cluster_size <- 3
-set.seed(1)
-# time-stationary covariate x_{i1}
-x1 <- rep(rnorm(sample_size), each = cluster_size)
-# time-varying covariate x_{it2}
-x2 <- rnorm(sample_size * cluster_size)
-# create covariates dataframe
-xdata <- data.frame(x1, x2)
-set.seed(321)
-library(SimCorMultRes)
-# latent correlation matrix for the NORTA method
-equicorrelation_matrix <- toeplitz(c(1, rep(0.95, cluster_size - 1)))
-identity_matrix <- diag(categories_no)
-latent_correlation_matrix <- kronecker(equicorrelation_matrix, identity_matrix)
-# simulation of clustered nominal responses
-simulated_nominal_responses <- rmult.bcl(clsize = cluster_size, ncategories = categories_no, 
-    betas = betas, xformula = ~x1 + x2, xdata = xdata, cor.matrix = latent_correlation_matrix)
-suppressPackageStartupMessages(library("multgee"))
-# fitting a GEE model
-nominal_gee_model <- nomLORgee(y ~ x1 + x2, data = simulated_nominal_responses$simdata, 
-    id = id, repeated = time, LORstr = "time.exch")
-# checking regression coefficients
-round(coef(nominal_gee_model), 2)
-#> beta10   x1:1   x2:1 beta20   x1:2   x2:2 beta30   x1:3   x2:3 
-#>   1.07   3.18   1.99   1.35   3.40   1.70   0.89   3.06   2.22
-
-
-

3.2 Correlated ordinal responses

-

Simulation of clustered ordinal responses is feasible under either a marginal cumulative link model or a marginal continuation-ratio model.

- -
-

3.2.2 Marginal continuation-ratio model

-

The function rmult.crm simulates clustered ordinal responses under the marginal continuation-ratio model -\[\begin{equation} -\Pr(Y_{it}=j |Y_{it} \ge j,\mathbf {x}_{it})=F(\beta_{tj0} +\boldsymbol {\beta}^{'}_t \mathbf {x}_{it}) -\tag{3.3} -\end{equation}\] -where \(\beta_{tj0}\) is the \(j\)-th category-specific intercept at measurement occasion \(t\), \(\boldsymbol \beta_t\) is the regression parameter vector associated with the covariates at measurement occasion \(t\) and \(F\) is a cdf. This is accomplished by utilizing the threshold -\[Y_{it}=j, \text{ given } Y_{it} \geq j \Leftrightarrow U^{O2}_{itj} \leq \beta_{tj0}\] -where -\[U^{O2}_{itj}=-\boldsymbol {\beta}^{\prime}_t \mathbf {x}_{it}+e^{O2}_{itj},\] -and where \(\{e^{O2}_{itj}:i=1,\ldots,N \text{ , } t=1,\ldots,T \text{ and } j=1,\ldots,J-1\}\) satisfy the following three conditions:

-
    -
  1. \(e^{O2}_{itj} \sim F\) for all \(i\), \(t\) and \(j\).
  2. -
  3. \(e^{O2}_{i_1t_1j_1}\) and \(e^{O2}_{i_2t_2j_2}\) are independent random variables provided that \(i_1 \neq i_2\).
  4. -
  5. \(e^{O2}_{itj_1}\) and \(e^{O2}_{itj_2}\) are independent random variables provided that \(j_1\neq j_2\).
  6. -
-For each subject \(i\), the association structure among the clustered ordinal responses \(\{Y_{it}:t=1,\ldots,T\}\) depends on the joint distribution and correlation matrix of \(\{e^{O2}_{itj}:j=1,\ldots,J \text{ and } t=1,\ldots,T\}\). If the random variables \(\{e^{O2}_{itj}:j=1,\ldots,J \text{ and } t=1,\ldots,T\}\) are independent then so are \(\{Y_{it}:t=1,\ldots,T\}\). - - -
-Example 3.3 (Simulation of clustered ordinal responses conditional on a marginal continuation-ratio probit model) Suppose simulation of clustered ordinal responses under the marginal continuation-ratio probit model -\[\begin{equation*} -\Pr(Y_{it}=j |Y_{it} \ge j,\mathbf{x}_{it})=\Phi(\beta_{j0} + \beta {x}_{it}) -\end{equation*}\] -with \(N=500\), \(T=4\), \((\beta_{10},\beta_{20},\beta_{30},\beta_{40},\beta)=(-1.5,-0.5,0.5,1.5,1)\) and \(\mathbf{x}_{it}=x_{it}\overset{iid}{\sim} N(0,1)\) for all \(i\) and \(t\) is desired. For the dependence structure, assume that \(\left\{\mathbf{e}_i^{O2}=\left(e^{O2}_{i11},\ldots,e^{O2}_{i44}\right)^{\prime}:i=1,\ldots,N\right\}\) are iid random vectors from a multivariate normal distribution with mean vector the zero vector and covariance matrix the \(16 \times 16\) correlation matrix with elements\[ - \text{corr}(e^{O2}_{it_1j_1},e^{O2}_{it_2j_2}) = \begin{cases} - 1 & \text{for } j_1 = j_2 \text{ and } t_1 = t_2\\ - 0.24 & \text{for } t_1 \neq t_2\\ - 0 & \text{otherwise.}\\ - \end{cases} - \] -
- -
set.seed(1)
-# sample size
-sample_size <- 500
-# cluster size
-cluster_size <- 4
-# category-specific intercepts
-beta_intercepts <- c(-1.5, -0.5, 0.5, 1.5)
-# regression parameters associated with covariates
-beta_coefficients <- 1
-# time-varying covariate
-x <- rnorm(sample_size * cluster_size)
-# number of ordinal response categories
-categories_no <- 5
-# correlation matrix for the NORTA method
-latent_correlation_matrix <- diag(1, (categories_no - 1) * cluster_size) + kronecker(toeplitz(c(0, 
-    rep(0.24, categories_no - 2))), matrix(1, cluster_size, cluster_size))
-# simulation of ordinal responses
-simulated_ordinal_responses <- rmult.crm(clsize = cluster_size, intercepts = beta_intercepts, 
-    betas = beta_coefficients, xformula = ~x, cor.matrix = latent_correlation_matrix, 
-    link = "probit")
-# first six clusters with ordinal responses
-head(simulated_ordinal_responses$Ysim)
-#>     t=1 t=2 t=3 t=4
-#> i=1   2   1   3   1
-#> i=2   1   4   1   1
-#> i=3   2   2   1   3
-#> i=4   3   5   2   2
-#> i=5   2   1   1   1
-#> i=6   3   3   4   5
-
-
-

3.2.3 Marginal adjacent-category logit model

-

The function rmult.acl simulates clustered ordinal responses under the marginal adjacent-category logit model -\[\begin{equation} -\log\left[\frac{\Pr(Y_{it}=j |\mathbf {x}_{it})}{\Pr(Y_{it}=j+1 |\mathbf {x}_{it})}\right]=\beta_{tj0} +\boldsymbol {\beta}^{'}_t \mathbf {x}_{it} -\tag{3.4} -\end{equation}\] -where \(\beta_{tj0}\) is the \(j\)-th category-specific intercept at measurement occasion \(t\), \(\boldsymbol \beta_t\) is the regression parameter vector associated with the covariates at measurement occasion \(t\).

-

Generation of clustered ordinal responses relies upon utilizing the connection between baseline-category logit models and adjacent-category logit models. In particular, the threshold -\[Y_{it}=j \Leftrightarrow U^{O3}_{itj}=\max \{U^{O3}_{it1},\ldots,U^{O3}_{itJ}\}\] -generates clustered ordinal responses that satisfy the marginal adjacent-category logit model (3.4), where -\[U^{O3}_{itj}=\sum_{k=j}^J\beta_{tk0}+(J-j)\boldsymbol{\beta}_{t}^{\prime} \mathbf {x}_{it}+e^{O3}_{itj},\] -and where the random variables \(\{e^{O3}_{itj}:i=1,\ldots,N \text{, } t=1,\ldots,T \text{ and } j=1,\ldots,J\}\) satisfy the following conditions:

-
    -
  1. \(e^{O3}_{itj}\) follows the standard extreme value distribution for all \(i\), \(t\) and \(j\).
  2. -
  3. \(e^{O3}_{i_1t_1j_1}\) and \(e^{O3}_{i_2t_2j_2}\) are independent random variables provided that \(i_1 \neq i_2\).
  4. -
  5. \(e^{O3}_{itj_1}\) and \(e^{O3}_{itj_2}\) are independent random variables provided that \(j_1\neq j_2\).
  6. -
-

For each subject \(i\), the association structure among the clustered ordinal responses \(\{Y_{it}:t=1,\ldots,T\}\) depends on the joint distribution and correlation matrix of \(\{e^{O3}_{itj}:t=1,\ldots,T \text{ and } j=1,\ldots,J\}\). If the random variables \(\{e^{O3}_{itj}:t=1,\ldots,T \text{ and } j=1,\ldots,J\}\) are independent then so are \(\{Y_{it}:t=1,\ldots,T\}\). -

- -
-Example 3.4 (Simulation of clustered ordinal responses conditional on a marginal adjacent-category logit model using the NORTA method) Suppose the aim is to simulate ordinal responses from the marginal adjacent-category logit model -\[\begin{equation*} -\log \left[\frac{\Pr(Y_{it}=j |\mathbf {x}_{it})}{\Pr(Y_{it}=j+1 |\mathbf {x}_{it})}\right]=\beta_{j0}+ \beta_{1} {x}_{i1}+ \beta_{2} {x}_{it2} \end{equation*}\] -where \(N=500\), \(T=3\), \((\beta_{10},\beta_{20},\beta_{30})=(3, 2, 1)\), \((\beta_{1},\beta_{2})=(1, 1)\) and \(\mathbf {x}_{it}=(x_{i1},x_{it2})^{\prime}\) for all \(i\) and \(t\), with \(x_{i1}\overset{iid}{\sim} N(0,1)\) and \(x_{it2}\overset{iid}{\sim} N(0,1)\). For the dependence structure, suppose that the correlation matrix \(\mathbf{R}\) in the NORTA method has elements -\[ -\mathbf{R}_{t_1j_1,t_2j_2}=\begin{cases} -1 & \text{if } t_1=t_2 \text{ and } j_1=j_2\\ -0.95 & \text{if } t_1 \neq t_2 \text{ and } j_1=j_2\\ -0 & \text{otherwise }\\ -\end{cases} -\] -for all \(i=1,\ldots,500\). -
- -
# intercepts
-beta_intercepts <- c(3, 2, 1)
-# parameter vector
-beta_coefficients <- c(1, 1)
-# sample size
-sample_size <- 500
-# cluster size
-cluster_size <- 3
-set.seed(321)
-# time-stationary covariate x_{i1}
-x1 <- rep(rnorm(sample_size), each = cluster_size)
-# time-varying covariate x_{it2}
-x2 <- rnorm(sample_size * cluster_size)
-# create covariates dataframe
-xdata <- data.frame(x1, x2)
-# correlation matrix for the NORTA method
-equicorrelation_matrix <- toeplitz(c(1, rep(0.95, cluster_size - 1)))
-identity_matrix <- diag(4)
-latent_correlation_matrix <- kronecker(equicorrelation_matrix, identity_matrix)
-# simulation of clustered ordinal responses
-simulated_ordinal_responses <- rmult.acl(clsize = cluster_size, intercepts = beta_intercepts, 
-    betas = beta_coefficients, xformula = ~x1 + x2, xdata = xdata, cor.matrix = latent_correlation_matrix)
-suppressPackageStartupMessages(library("multgee"))
-# fitting a GEE model
-ordinal_gee_model <- ordLORgee(y ~ x1 + x2, data = simulated_ordinal_responses$simdata, 
-    id = id, repeated = time, LORstr = "time.exch", link = "acl")
-# checking regression coefficients
-round(coef(ordinal_gee_model), 2)
-#> beta10 beta20 beta30     x1     x2 
-#>   2.95   1.97   1.67   1.14   1.00
-
-
-
-

3.3 Correlated binary responses

-

The function rbin simulates binary responses under the marginal model specification -\[\begin{equation} -\Pr(Y_{it}=1 |\mathbf {x}_{it})=F(\beta_{t0} +\boldsymbol {\beta}^{\prime}_{t} \mathbf {x}_{it}) -\tag{3.5} -\end{equation}\] -where \(\beta_{t0}\) is the intercept at measurement occasion \(t\), \(\boldsymbol \beta_t\) is the regression parameter vector associated with the covariates at measurement occasion \(t\) and \(F\) is a cdf. The threshold -\[Y_{it}=1 \Leftrightarrow U^{B}_{it} \leq \beta_{t0} + 2 \boldsymbol {\beta}^{\prime}_t \mathbf {x}_{it},\] -generates clustered binary responses that satisfy the marginal model (3.5), where -\[\begin{equation} -U^{B}_{it}=\boldsymbol {\beta}^{\prime}_t \mathbf {x}_{it}+e^{B}_{it}, -\tag{3.6} -\end{equation}\] -and where \(\{e^{B}_{it}:i=1,\ldots,N \text{ and } t=1,\ldots,T\}\) are random variables such that:

-
    -
  1. \(e^{B}_{it} \sim F\) for all \(i\) and \(t\).
  2. -
  3. \(e^{B}_{i_1t_1}\) and \(e^{B}_{i_2t_2}\) are independent random variables provided that \(i_1 \neq i_2\).
  4. -
-For each subject \(i\), the association structure among the clustered binary responses \(\{Y_{it}:t=1,\ldots,T\}\) depends on the pairwise bivariate distributions and correlation matrix of \(\{e^{B}_{it}:t=1,\ldots,T\}\). If the random variables \(\{e^{B}_{it}:t=1,\ldots,T\}\) are independent then so are \(\{Y_{it}:t=1,\ldots,T\}\). - - -
-Example 3.5 (Simulation of clustered binary responses conditional on a marginal probit model using NORTA method) Suppose the goal is to simulate clustered binary responses from the marginal probit model -\[\Pr(Y_{it}=1 |\mathbf{x}_{it})=\Phi(0.2x_i)\] -where \(N=100\), \(T=4\) and \(\mathbf{x}_{it}=x_i\overset{iid}{\sim} N(0,1)\) for all \(i\) and \(t\). For the association structure, assume that the random variables \(\mathbf{e}_i^{B}=(e^{B}_{i1},e^{B}_{i2},e^{B}_{i3},e^{B}_{i4})^{\prime}\) in (3.6) are iid random vectors from the tetra-variate normal distribution with mean vector the zero vector and covariance matrix the correlation matrix \(\mathbf{R}\) given by -\[\begin{equation} -\mathbf{R}=\left( {\begin{array}{*{20}c} - 1.00 & 0.90 & 0.90 & 0.90 \\ - 0.90 & 1.00 & 0.90 & 0.90 \\ - 0.90 & 0.90 & 1.00 & 0.90 \\ - 0.90 & 0.90 & 0.90 & 1.00 - \end{array} } \right). - \tag{3.7} - \end{equation}\] -This association configuration defines an exchangeable correlation matrix for the clustered binary responses, i.e. \(\text{corr}(Y_{it_1},Y_{it_2})=\rho_i\) for all \(i\) and \(t\). The strength of the correlation (\(\rho_i\)) is decreasing as the absolute value of the time-stationary covariate \(x_i\) increases. For example, \(\rho_i=0.7128\) when \(x_{i}=0\) and \(\rho_i=0.7\) when \(x_i=3\) or \(x_i=-3\). Therefore, a strong exchangeable correlation pattern for each subject that does not differ much across subjects is implied with this configuration. -
- -
set.seed(123)
-# sample size
-sample_size <- 100
-# cluster size
-cluster_size <- 4
-# intercept
-beta_intercepts <- 0
-# regression parameter associated with the covariate
-beta_coefficients <- 0.2
-# correlation matrix for the NORTA method
-latent_correlation_matrix <- toeplitz(c(1, 0.9, 0.9, 0.9))
-# time-stationary covariate
-x <- rep(rnorm(sample_size), each = cluster_size)
-# simulation of clustered binary responses
-simulated_binary_responses <- rbin(clsize = cluster_size, intercepts = beta_intercepts, 
-    betas = beta_coefficients, xformula = ~x, cor.matrix = latent_correlation_matrix, 
-    link = "probit")
-library(gee)
-# fitting a GEE model
-binary_gee_model <- gee(y ~ x, family = binomial("probit"), id = id, data = simulated_binary_responses$simdata)
-#> Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
-#> running glm to get initial regression estimate
-#> (Intercept)           x 
-#>   0.1315121   0.2826005
-# checking the estimated coefficients
-summary(binary_gee_model)$coefficients
-#>              Estimate Naive S.E.  Naive z Robust S.E. Robust z
-#> (Intercept) 0.1315121 0.06399465 2.055048   0.1106696 1.188331
-#> x           0.2826006 0.07191931 3.929412   0.1270285 2.224703
- - -
-Example 3.6 (Simulation of clustered binary responses under a conditional marginal logit model without utilizing the NORTA method) Consider now simulation of correlated binary responses from the marginal logit model -\[\begin{equation*} -\Pr(Y_{it}=1 |\mathbf{x}_{it})=F(0.2x_i) -\end{equation*}\] -where \(F\) is the cdf of the standard logistic distribution (mean \(=0\) and variance \(=\pi^2/3\)), \(N=100\), \(T=4\) and \(\mathbf{x}_{it}=x_i\overset{iid}{\sim} N(0,1)\) for all \(i\) and \(t\). This is similar to the marginal model configuration in Example 3.5 except for the link function. For the dependence structure, assume that the correlation matrix of \(\mathbf{e}_i^{B}=(e^{B}_{i1},e^{B}_{i2},e^{B}_{i3},e^{B}_{i4})^{\prime}\) in (3.6) is equal to the correlation matrix \(\mathbf{R}\) defined in (3.7). To simulate \(\mathbf{e}_i^{B}\) without utilizing the NORTA method, one can employ the tetra-variate extreme value distribution (Gumbel 1958). In particular, this is accomplished by setting \(\mathbf{e}_i^{B}=\mathbf{U}_i-\mathbf{V}_i\) for all \(i\), where \(\mathbf{U}_i\) and \(\mathbf{V}_i\) are independent random vectors from the tetra-variate extreme value distribution with dependence parameter equal to \(0.9\), that is -\[\Pr\left(U_{i1}\leq u_{i1},U_{i2}\leq u_{i2},U_{i3}\leq u_{i3},U_{i4}\leq u_{i4}\right)=\exp\left\{-\left[\sum_{t=1}^4 \exp{\left(-\frac{u_{it}}{0.9}\right)}\right]^{0.9}\right\}\] -and -\[\Pr\left(V_{i1}\leq v_{i1},V_{i2}\leq v_{i2},V_{i3}\leq v_{i3},V_{i4}\leq v_{i4}\right)=\exp\left\{-\left[\sum_{t=1}^4 \exp{\left(-\frac{v_{it}}{0.9}\right)}\right]^{0.9}\right\}.\] -It follows that \(e_{it}^{B}\sim F\) for all \(i\) and \(t\) and \(\textrm{corr}(\mathbf{e}_i^{B})=\mathbf{R}\) for all \(i\). -
- -
set.seed(8)
-# simulation of epsilon variables
-library(evd)
-#> 
-#> Attaching package: 'evd'
-#> The following objects are masked from 'package:VGAM':
-#> 
-#>     dfrechet, dgev, dgpd, dgumbel, pfrechet, pgev, pgpd, pgumbel,
-#>     qfrechet, qgev, qgpd, qgumbel, rfrechet, rgev, rgpd, rgumbel,
-#>     venice
-simulated_latent_variables1 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), model = "log", 
-    d = cluster_size)
-simulated_latent_variables2 <- rmvevd(sample_size, dep = sqrt(1 - 0.9), model = "log", 
-    d = cluster_size)
-simulated_latent_variables <- simulated_latent_variables1 - simulated_latent_variables2
-# simulation of clustered binary responses
-simulated_binary_responses <- rbin(clsize = cluster_size, intercepts = beta_intercepts, 
-    betas = beta_coefficients, xformula = ~x, rlatent = simulated_latent_variables)
-# fitting a GEE model
-binary_gee_model <- gee(y ~ x, family = binomial("logit"), id = id, data = simulated_binary_responses$simdata)
-#> Beginning Cgee S-function, @(#) geeformula.q 4.13 98/01/27
-#> running glm to get initial regression estimate
-#> (Intercept)           x 
-#>  0.04146261  0.09562709
-# checking the estimated coefficients
-summary(binary_gee_model)$coefficients
-#>               Estimate Naive S.E.   Naive z Robust S.E.  Robust z
-#> (Intercept) 0.04146261  0.1008516 0.4111249   0.1790511 0.2315686
-#> x           0.09562709  0.1107159 0.8637160   0.1949327 0.4905647
-
-
-

3.4 No marginal model specification

-

To achieve simulation of clustered binary, ordinal and nominal responses under no marginal model specification, perform the following steps:

-
    -
  1. Based on the marginal probabilities calculate the intercept of a marginal probit model for binary responses (see Example 3.7) or the category-specific intercepts of a cumulative probit model (see third example in Touloumis 2016) or of a baseline-category logit model for multinomial responses (see Example 3.8).

  2. -
  3. Create a pseudo-covariate, say x, of length equal to the cluster size (clsize) times the desired number of clusters of simulated responses (say R), that is, length(x) = clsize * R. This step is required in order to identify the desired number of clustered responses.

  4. -
  5. Set betas = 0 in the core functions rbin (see Example 3.7) or rmult.clm, or set to 0 all values of the betas argument that correspond to the category-specific parameters in the core function rmult.bcl (see Example 3.8).

  6. -
  7. Set xformula = ~ x.

  8. -
  9. Run the core function to obtain realizations of the simulated clustered responses.

  10. -
-

- -
-Example 3.7 (Simulation of clustered binary responses without covariates) Suppose the goal is to simulate \(5000\) clustered binary responses with \(\Pr(Y_{t}=1)=0.8\) for all \(t=1,\ldots,4\). For simplicity, assume that the clustered binary responses are independent. -
- -
set.seed(123)
-# sample size
-sample_size <- 5000
-# cluster size
-cluster_size <- 4
-# intercept
-beta_intercepts <- qnorm(0.8)
-# pseudo-covariate
-x <- rep(0, each = cluster_size * sample_size)
-# regression parameter associated with the covariate
-beta_coefficients <- 0
-# correlation matrix for the NORTA method
-latent_correlation_matrix <- diag(cluster_size)
-# simulation of clustered binary responses
-simulated_binary_responses <- rbin(clsize = cluster_size, intercepts = beta_intercepts, 
-    betas = beta_coefficients, xformula = ~x, cor.matrix = latent_correlation_matrix, 
-    link = "probit")
-library(gee)
-# simulated marginal probabilities
-colMeans(simulated_binary_responses$Ysim)
-#>    t=1    t=2    t=3    t=4 
-#> 0.8024 0.7972 0.7948 0.8088
- - -
-Example 3.8 (Simulation of clustered nominal responses without covariates) Suppose the aim is to simulate \(N=5000\) clustered nominal responses with -\(\Pr(Y_{t}=1)=0.1\), \(\Pr(Y_{t}=2)=0.2\), \(\Pr(Y_{t}=3)=0.3\) and \(\Pr(Y_{t}=4)=0.4\), for all \(i\) and \(t=1,\ldots,3\). For the sake of simplicity, we assume that the clustered responses are independent. -
- -
# sample size
-sample_size <- 5000
-# cluster size
-cluster_size <- 3
-# pseudo-covariate
-x <- rep(0, each = cluster_size * sample_size)
-# parameter vector
-betas <- c(log(0.1/0.4), 0, log(0.2/0.4), 0, log(0.3/0.4), 0, 0, 0)
-# number of nominal response categories
-categories_no <- 4
-set.seed(1)
-# correlation matrix for the NORTA method
-latent_correlation_matrix <- kronecker(diag(cluster_size), diag(categories_no))
-# simulation of clustered nominal responses
-simulated_nominal_responses <- rmult.bcl(clsize = cluster_size, ncategories = categories_no, 
-    betas = betas, xformula = ~x, cor.matrix = latent_correlation_matrix)
-# simulated marginal probabilities
-apply(simulated_nominal_responses$Ysim, 2, table)/sample_size
-#>      t=1    t=2    t=3
-#> 1 0.1000 0.0996 0.1036
-#> 2 0.2034 0.2000 0.2000
-#> 3 0.2874 0.3130 0.2894
-#> 4 0.4092 0.3874 0.4070
-
-
-
-

4 How to Cite

-
citation("SimCorMultRes")
-
-To cite R package SimCorMultRes in publications, please use:
-
-  Touloumis, A. (2016). Simulating Correlated Binary and
-  Multinomial Responses under Marginal Model Specification: The
-  SimCorMultRes Package. The R Journal 8:2, 79-91.
-
-A BibTeX entry for LaTeX users is
-
-  @Article{,
-    title = {Simulating Correlated Binary and Multinomial Responses under 
-         Marginal Model Specification: The SimCorMultRes Package},
-    author = {Anestis Touloumis},
-    year = {2016},
-    journal = {The R Journal},
-    volume = {8},
-    number = {2},
-    pages = {79-91},
-    url = {https://journal.r-project.org/archive/2016/RJ-2016-034/index.html},
-  }
-
-
-

References

-
-
-

Gumbel, E. J. 1958. Statistics of Extremes. Columbia University Press, New York.

-
-
-

Touloumis, A. 2016. “Simulating Correlated Binary and Multinomial Responses under Marginal Model Specification: The SimCorMultRes Package.” The R Journal 8 (2): 79–91. https://journal.r-project.org/archive/2016/RJ-2016-034/index.html.

-
-
-
- - - - -
- - - - - - - - - - - - - - -