From 01d60b32695acc4d86782ef980f0d4c0ffbe97af Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 19 Oct 2024 01:46:50 +0200 Subject: [PATCH 1/6] `data_read()` preserves class for rds files --- R/data_read.R | 57 +++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/R/data_read.R b/R/data_read.R index 1306a3f32..2925e9b2b 100644 --- a/R/data_read.R +++ b/R/data_read.R @@ -171,35 +171,42 @@ data_read <- function(path, value_labels <- attr(i, "labels", exact = TRUE) variable_labels <- attr(i, "label", exact = TRUE) - # filter, so only matching value labels remain - value_labels <- value_labels[value_labels %in% unique(i)] - - # guess variable type - if (is.character(i)) { - # we need this to drop haven-specific class attributes - i <- as.character(i) - } else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) { - # if all values are labelled, we assume factor. Use labels as levels - if (is.numeric(i)) { - i <- factor(i, labels = names(value_labels)) + # Only process if we have value labels - if no value labels present + # the following code falls back to coercing to numeric. Since this + # function is also called for "unknown" file types, all imported data + # is converted to numeric for non-labelled data, which is not intended, + # for instance for .rds files + if (!is.null(value_labels) && length(value_labels)) { + # filter, so only matching value labels remain + value_labels <- value_labels[value_labels %in% unique(i)] + + # guess variable type + if (is.character(i)) { + # we need this to drop haven-specific class attributes + i <- as.character(i) + } else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) { + # if all values are labelled, we assume factor. Use labels as levels + if (is.numeric(i)) { + i <- factor(i, labels = names(value_labels)) + } else { + i <- factor(as.character(i), labels = names(value_labels)) + } + value_labels <- NULL + attr(i, "converted_to_factor") <- TRUE } else { - i <- factor(as.character(i), labels = names(value_labels)) + # else, fall back to numeric + i <- as.numeric(i) } - value_labels <- NULL - attr(i, "converted_to_factor") <- TRUE - } else { - # else, fall back to numeric - i <- as.numeric(i) - } - # drop unused value labels - value_labels <- value_labels[value_labels %in% unique(i)] - if (length(value_labels) > 0L) { - attr(i, "labels") <- value_labels - } + # drop unused value labels + value_labels <- value_labels[value_labels %in% unique(i)] + if (length(value_labels) > 0L) { + attr(i, "labels") <- value_labels + } - # add back variable label - attr(i, "label") <- variable_labels + # add back variable label + attr(i, "label") <- variable_labels + } } i }) From 947e2681f914e53d0678bed00ed071776136ac1b Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 19 Oct 2024 01:48:12 +0200 Subject: [PATCH 2/6] desc. news --- DESCRIPTION | 2 +- NEWS.md | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index cc9810def..f68a1e2eb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: datawizard Title: Easy Data Wrangling and Statistical Transformations -Version: 0.13.0.8 +Version: 0.13.0.9 Authors@R: c( person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut", comment = c(ORCID = "0000-0003-1995-6531")), diff --git a/NEWS.md b/NEWS.md index b5cdf84c0..d0c42fdb6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,6 +24,9 @@ BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute CIs. Instead, it warns the user and returns `NA` (#550). +* `data_read()` reserves variable types when importing files from `rds` or + `rdata` format. + # datawizard 0.13.0 BREAKING CHANGES From 919057288b6dea9ad76c1092b549eba0a58d674f Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 19 Oct 2024 10:06:57 +0200 Subject: [PATCH 3/6] fix --- R/data_read.R | 96 +++++++++++++++++++++++------------------------- man/data_read.Rd | 32 ++++++++-------- 2 files changed, 62 insertions(+), 66 deletions(-) diff --git a/R/data_read.R b/R/data_read.R index 2925e9b2b..b24a5bdc2 100644 --- a/R/data_read.R +++ b/R/data_read.R @@ -15,15 +15,16 @@ #' for SAS data files. #' @param encoding The character encoding used for the file. Usually not needed. #' @param convert_factors If `TRUE` (default), numeric variables, where all -#' values have a value label, are assumed to be categorical and converted -#' into factors. If `FALSE`, no variable types are guessed and no conversion -#' of numeric variables into factors will be performed. See also section -#' 'Differences to other packages'. For `data_write()`, this argument only -#' applies to the text (e.g. `.txt` or `.csv`) or spreadsheet file formats (like -#' `.xlsx`). Converting to factors might be useful for these formats because -#' labelled numeric variables are then converted into factors and exported as -#' character columns - else, value labels would be lost and only numeric values -#' are written to the file. +#' values have a value label, are assumed to be categorical and converted into +#' factors. If `FALSE`, no variable types are guessed and no conversion of +#' numeric variables into factors will be performed. For `data_read()`, this +#' argument only applies to file types with *labelled data*, e.g. files from +#' SPSS, SAS or Stata. See also section 'Differences to other packages'. For +#' `data_write()`, this argument only applies to the text (e.g. `.txt` or +#' `.csv`) or spreadsheet file formats (like `.xlsx`). Converting to factors +#' might be useful for these formats because labelled numeric variables are then +#' converted into factors and exported as character columns - else, value labels +#' would be lost and only numeric values are written to the file. #' @param verbose Toggle warnings and messages. #' @param ... Arguments passed to the related `read_*()` or `write_*()` functions. #' @@ -65,12 +66,13 @@ #' @section Differences to other packages that read foreign data formats: #' `data_read()` is most comparable to `rio::import()`. For data files from #' SPSS, SAS or Stata, which support labelled data, variables are converted into -#' their most appropriate type. The major difference to `rio::import()` is that -#' `data_read()` automatically converts fully labelled numeric variables into -#' factors, where imported value labels will be set as factor levels. If a -#' numeric variable has _no_ value labels or less value labels than values, it -#' is not converted to factor. In this case, value labels are preserved as -#' `"labels"` attribute. Character vectors are preserved. Use +#' their most appropriate type. The major difference to `rio::import()` is for +#' data files from SPSS, SAS, or Stata, i.e. file types that support +#' *labelled data*. `data_read()` automatically converts fully labelled numeric +#' variables into factors, where imported value labels will be set as factor +#' levels. If a numeric variable has _no_ value labels or less value labels than +#' values, it is not converted to factor. In this case, value labels are +#' preserved as `"labels"` attribute. Character vectors are preserved. Use #' `convert_factors = FALSE` to remove the automatic conversion of numeric #' variables to factors. #' @@ -105,7 +107,7 @@ data_read <- function(path, por = .read_spss(path, encoding, convert_factors, verbose, ...), dta = .read_stata(path, encoding, convert_factors, verbose, ...), sas7bdat = .read_sas(path, path_catalog, encoding, convert_factors, verbose, ...), - .read_unknown(path, file_type, convert_factors, verbose, ...) + .read_unknown(path, file_type, verbose, ...) ) # tell user about empty columns @@ -171,42 +173,35 @@ data_read <- function(path, value_labels <- attr(i, "labels", exact = TRUE) variable_labels <- attr(i, "label", exact = TRUE) - # Only process if we have value labels - if no value labels present - # the following code falls back to coercing to numeric. Since this - # function is also called for "unknown" file types, all imported data - # is converted to numeric for non-labelled data, which is not intended, - # for instance for .rds files - if (!is.null(value_labels) && length(value_labels)) { - # filter, so only matching value labels remain - value_labels <- value_labels[value_labels %in% unique(i)] - - # guess variable type - if (is.character(i)) { - # we need this to drop haven-specific class attributes - i <- as.character(i) - } else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) { - # if all values are labelled, we assume factor. Use labels as levels - if (is.numeric(i)) { - i <- factor(i, labels = names(value_labels)) - } else { - i <- factor(as.character(i), labels = names(value_labels)) - } - value_labels <- NULL - attr(i, "converted_to_factor") <- TRUE + # filter, so only matching value labels remain + value_labels <- value_labels[value_labels %in% unique(i)] + + # guess variable type + if (is.character(i)) { + # we need this to drop haven-specific class attributes + i <- as.character(i) + } else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) { + # if all values are labelled, we assume factor. Use labels as levels + if (is.numeric(i)) { + i <- factor(i, labels = names(value_labels)) } else { - # else, fall back to numeric - i <- as.numeric(i) - } - - # drop unused value labels - value_labels <- value_labels[value_labels %in% unique(i)] - if (length(value_labels) > 0L) { - attr(i, "labels") <- value_labels + i <- factor(as.character(i), labels = names(value_labels)) } + value_labels <- NULL + attr(i, "converted_to_factor") <- TRUE + } else { + # else, fall back to numeric or factor + i <- as.numeric(i) + } - # add back variable label - attr(i, "label") <- variable_labels + # drop unused value labels + value_labels <- value_labels[value_labels %in% unique(i)] + if (length(value_labels) > 0L) { + attr(i, "labels") <- value_labels } + + # add back variable label + attr(i, "label") <- variable_labels } i }) @@ -295,7 +290,7 @@ data_read <- function(path, } -.read_unknown <- function(path, file_type, convert_factors, verbose, ...) { +.read_unknown <- function(path, file_type, verbose, ...) { insight::check_if_installed("rio", reason = paste0("to read files of type '", file_type, "'")) if (verbose) { insight::format_alert("Reading data...") @@ -324,6 +319,5 @@ data_read <- function(path, } out <- tmp } - - .post_process_imported_data(out, convert_factors, verbose) + out } diff --git a/man/data_read.Rd b/man/data_read.Rd index 1ae3cea8a..d7d26255b 100644 --- a/man/data_read.Rd +++ b/man/data_read.Rd @@ -33,15 +33,16 @@ for SAS data files.} \item{encoding}{The character encoding used for the file. Usually not needed.} \item{convert_factors}{If \code{TRUE} (default), numeric variables, where all -values have a value label, are assumed to be categorical and converted -into factors. If \code{FALSE}, no variable types are guessed and no conversion -of numeric variables into factors will be performed. See also section -'Differences to other packages'. For \code{data_write()}, this argument only -applies to the text (e.g. \code{.txt} or \code{.csv}) or spreadsheet file formats (like -\code{.xlsx}). Converting to factors might be useful for these formats because -labelled numeric variables are then converted into factors and exported as -character columns - else, value labels would be lost and only numeric values -are written to the file.} +values have a value label, are assumed to be categorical and converted into +factors. If \code{FALSE}, no variable types are guessed and no conversion of +numeric variables into factors will be performed. For \code{data_read()}, this +argument only applies to file types with \emph{labelled data}, e.g. files from +SPSS, SAS or Stata. See also section 'Differences to other packages'. For +\code{data_write()}, this argument only applies to the text (e.g. \code{.txt} or +\code{.csv}) or spreadsheet file formats (like \code{.xlsx}). Converting to factors +might be useful for these formats because labelled numeric variables are then +converted into factors and exported as character columns - else, value labels +would be lost and only numeric values are written to the file.} \item{verbose}{Toggle warnings and messages.} @@ -118,12 +119,13 @@ versions, use \code{compress = "none"}, for example \code{data_read()} is most comparable to \code{rio::import()}. For data files from SPSS, SAS or Stata, which support labelled data, variables are converted into -their most appropriate type. The major difference to \code{rio::import()} is that -\code{data_read()} automatically converts fully labelled numeric variables into -factors, where imported value labels will be set as factor levels. If a -numeric variable has \emph{no} value labels or less value labels than values, it -is not converted to factor. In this case, value labels are preserved as -\code{"labels"} attribute. Character vectors are preserved. Use +their most appropriate type. The major difference to \code{rio::import()} is for +data files from SPSS, SAS, or Stata, i.e. file types that support +\emph{labelled data}. \code{data_read()} automatically converts fully labelled numeric +variables into factors, where imported value labels will be set as factor +levels. If a numeric variable has \emph{no} value labels or less value labels than +values, it is not converted to factor. In this case, value labels are +preserved as \code{"labels"} attribute. Character vectors are preserved. Use \code{convert_factors = FALSE} to remove the automatic conversion of numeric variables to factors. } From 25d76dfaeead137d26cd2a8ae1e03e9a51c7f9fa Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 19 Oct 2024 10:11:42 +0200 Subject: [PATCH 4/6] add test --- tests/testthat/test-data_read.R | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-data_read.R b/tests/testthat/test-data_read.R index ac316c706..15f1161d3 100644 --- a/tests/testthat/test-data_read.R +++ b/tests/testthat/test-data_read.R @@ -141,12 +141,12 @@ test_that("data_read - RDS file, matrix, coercible", { httr::stop_for_status(request) writeBin(httr::content(request, type = "raw"), temp_file) - expect_message(expect_message(expect_message({ + expect_message({ d <- data_read( temp_file, verbose = TRUE ) - })), regex = "0 out of 5") + }) expect_s3_class(d, "data.frame") expect_identical(dim(d), c(2L, 5L)) @@ -155,6 +155,29 @@ test_that("data_read - RDS file, matrix, coercible", { +# RDS file, preserve class /types ----------------------------------- + +test_that("data_read - RDS file, preserve class", { + withr::with_tempfile("temp_file", fileext = ".rds", code = { + request <- httr::GET("https://raw.github.com/easystats/circus/main/data/hiv.rds") + httr::stop_for_status(request) + writeBin(httr::content(request, type = "raw"), temp_file) + + d <- data_read(temp_file) + expect_s3_class(d, "data.frame") + expect_identical( + sapply(d, class), + c( + village = "integer", outcome = "integer", distance = "numeric", + amount = "numeric", incentive = "integer", age = "integer", + hiv2004 = "integer", agecat = "factor" + ) + ) + }) +}) + + + # RData ----------------------------------- test_that("data_read - no warning for RData", { From 540c0b3edf1ba40005a7ca3812fda0da441e8651 Mon Sep 17 00:00:00 2001 From: Daniel Date: Sat, 19 Oct 2024 10:19:21 +0200 Subject: [PATCH 5/6] typo --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index d0c42fdb6..f66f6ab5b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -24,7 +24,7 @@ BUG FIXES * `describe_distribution()` no longer errors if the sample was too sparse to compute CIs. Instead, it warns the user and returns `NA` (#550). -* `data_read()` reserves variable types when importing files from `rds` or +* `data_read()` preserves variable types when importing files from `rds` or `rdata` format. # datawizard 0.13.0 From 3d5febc50308b0aa88f3d800a64dda36dcd1a682 Mon Sep 17 00:00:00 2001 From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com> Date: Sat, 19 Oct 2024 16:27:08 +0200 Subject: [PATCH 6/6] Update NEWS.md --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index f66f6ab5b..8fc8a29ca 100644 --- a/NEWS.md +++ b/NEWS.md @@ -25,7 +25,7 @@ BUG FIXES CIs. Instead, it warns the user and returns `NA` (#550). * `data_read()` preserves variable types when importing files from `rds` or - `rdata` format. + `rdata` format (#558). # datawizard 0.13.0