From 01d60b32695acc4d86782ef980f0d4c0ffbe97af Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sat, 19 Oct 2024 01:46:50 +0200
Subject: [PATCH 1/6] `data_read()` preserves class for rds files

---
 R/data_read.R | 57 +++++++++++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 25 deletions(-)

diff --git a/R/data_read.R b/R/data_read.R
index 1306a3f32..2925e9b2b 100644
--- a/R/data_read.R
+++ b/R/data_read.R
@@ -171,35 +171,42 @@ data_read <- function(path,
         value_labels <- attr(i, "labels", exact = TRUE)
         variable_labels <- attr(i, "label", exact = TRUE)
 
-        # filter, so only matching value labels remain
-        value_labels <- value_labels[value_labels %in% unique(i)]
-
-        # guess variable type
-        if (is.character(i)) {
-          # we need this to drop haven-specific class attributes
-          i <- as.character(i)
-        } else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
-          # if all values are labelled, we assume factor. Use labels as levels
-          if (is.numeric(i)) {
-            i <- factor(i, labels = names(value_labels))
+        # Only process if we have value labels - if no value labels present
+        # the following code falls back to coercing to numeric. Since this
+        # function is also called for "unknown" file types, all imported data
+        # is converted to numeric for non-labelled data, which is not intended,
+        # for instance for .rds files
+        if (!is.null(value_labels) && length(value_labels)) {
+          # filter, so only matching value labels remain
+          value_labels <- value_labels[value_labels %in% unique(i)]
+
+          # guess variable type
+          if (is.character(i)) {
+            # we need this to drop haven-specific class attributes
+            i <- as.character(i)
+          } else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
+            # if all values are labelled, we assume factor. Use labels as levels
+            if (is.numeric(i)) {
+              i <- factor(i, labels = names(value_labels))
+            } else {
+              i <- factor(as.character(i), labels = names(value_labels))
+            }
+            value_labels <- NULL
+            attr(i, "converted_to_factor") <- TRUE
           } else {
-            i <- factor(as.character(i), labels = names(value_labels))
+            # else, fall back to numeric
+            i <- as.numeric(i)
           }
-          value_labels <- NULL
-          attr(i, "converted_to_factor") <- TRUE
-        } else {
-          # else, fall back to numeric
-          i <- as.numeric(i)
-        }
 
-        # drop unused value labels
-        value_labels <- value_labels[value_labels %in% unique(i)]
-        if (length(value_labels) > 0L) {
-          attr(i, "labels") <- value_labels
-        }
+          # drop unused value labels
+          value_labels <- value_labels[value_labels %in% unique(i)]
+          if (length(value_labels) > 0L) {
+            attr(i, "labels") <- value_labels
+          }
 
-        # add back variable label
-        attr(i, "label") <- variable_labels
+          # add back variable label
+          attr(i, "label") <- variable_labels
+        }
       }
       i
     })

From 947e2681f914e53d0678bed00ed071776136ac1b Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sat, 19 Oct 2024 01:48:12 +0200
Subject: [PATCH 2/6] Bump version to 0.13.0.9 and add NEWS entry

---
 DESCRIPTION | 2 +-
 NEWS.md     | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index cc9810def..f68a1e2eb 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: datawizard
 Title: Easy Data Wrangling and Statistical Transformations
-Version: 0.13.0.8
+Version: 0.13.0.9
 Authors@R: c(
     person("Indrajeet", "Patil", , "patilindrajeet.science@gmail.com", role = "aut",
            comment = c(ORCID = "0000-0003-1995-6531")),
diff --git a/NEWS.md b/NEWS.md
index b5cdf84c0..d0c42fdb6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -24,6 +24,9 @@ BUG FIXES
 * `describe_distribution()` no longer errors if the sample was too sparse to compute
   CIs. Instead, it warns the user and returns `NA` (#550).
 
+* `data_read()` reserves variable types when importing files from `rds` or
+  `rdata` format.
+
 # datawizard 0.13.0
 
 BREAKING CHANGES

From 919057288b6dea9ad76c1092b549eba0a58d674f Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sat, 19 Oct 2024 10:06:57 +0200
Subject: [PATCH 3/6] Skip post-processing for unknown file types in `data_read()`

---
 R/data_read.R    | 96 +++++++++++++++++++++++-------------------------
 man/data_read.Rd | 32 ++++++++--------
 2 files changed, 62 insertions(+), 66 deletions(-)

diff --git a/R/data_read.R b/R/data_read.R
index 2925e9b2b..b24a5bdc2 100644
--- a/R/data_read.R
+++ b/R/data_read.R
@@ -15,15 +15,16 @@
 #' for SAS data files.
 #' @param encoding The character encoding used for the file. Usually not needed.
 #' @param convert_factors If `TRUE` (default), numeric variables, where all
-#' values have a value label, are assumed to be categorical and converted
-#' into factors. If `FALSE`, no variable types are guessed and no conversion
-#' of numeric variables into factors will be performed. See also section
-#' 'Differences to other packages'. For `data_write()`, this argument only
-#' applies to the text (e.g. `.txt` or `.csv`) or spreadsheet file formats (like
-#' `.xlsx`). Converting to factors might be useful for these formats because
-#' labelled numeric variables are then converted into factors and exported as
-#' character columns - else, value labels would be lost and only numeric values
-#' are written to the file.
+#' values have a value label, are assumed to be categorical and converted into
+#' factors. If `FALSE`, no variable types are guessed and no conversion of
+#' numeric variables into factors will be performed. For `data_read()`, this
+#' argument only applies to file types with *labelled data*, e.g. files from
+#' SPSS, SAS or Stata. See also section 'Differences to other packages'. For
+#' `data_write()`, this argument only applies to the text (e.g. `.txt` or
+#' `.csv`) or spreadsheet file formats (like `.xlsx`). Converting to factors
+#' might be useful for these formats because labelled numeric variables are then
+#' converted into factors and exported as character columns - else, value labels
+#' would be lost and only numeric values are written to the file.
 #' @param verbose Toggle warnings and messages.
 #' @param ... Arguments passed to the related `read_*()` or `write_*()` functions.
 #'
@@ -65,12 +66,13 @@
 #' @section Differences to other packages that read foreign data formats:
 #' `data_read()` is most comparable to `rio::import()`. For data files from
 #' SPSS, SAS or Stata, which support labelled data, variables are converted into
-#' their most appropriate type. The major difference to `rio::import()` is that
-#' `data_read()` automatically converts fully labelled numeric variables into
-#' factors, where imported value labels will be set as factor levels. If a
-#' numeric variable has _no_ value labels or less value labels than values, it
-#' is not converted to factor. In this case, value labels are preserved as
-#' `"labels"` attribute. Character vectors are preserved. Use
+#' their most appropriate type. The major difference to `rio::import()` is for
+#' data files from SPSS, SAS, or Stata, i.e. file types that support
+#' *labelled data*. `data_read()` automatically converts fully labelled numeric
+#' variables into factors, where imported value labels will be set as factor
+#' levels. If a numeric variable has _no_ value labels or less value labels than
+#' values, it is not converted to factor. In this case, value labels are
+#' preserved as `"labels"` attribute. Character vectors are preserved. Use
 #' `convert_factors = FALSE` to remove the automatic conversion of numeric
 #' variables to factors.
 #'
@@ -105,7 +107,7 @@ data_read <- function(path,
     por = .read_spss(path, encoding, convert_factors, verbose, ...),
     dta = .read_stata(path, encoding, convert_factors, verbose, ...),
     sas7bdat = .read_sas(path, path_catalog, encoding, convert_factors, verbose, ...),
-    .read_unknown(path, file_type, convert_factors, verbose, ...)
+    .read_unknown(path, file_type, verbose, ...)
   )
 
   # tell user about empty columns
@@ -171,42 +173,35 @@ data_read <- function(path,
         value_labels <- attr(i, "labels", exact = TRUE)
         variable_labels <- attr(i, "label", exact = TRUE)
 
-        # Only process if we have value labels - if no value labels present
-        # the following code falls back to coercing to numeric. Since this
-        # function is also called for "unknown" file types, all imported data
-        # is converted to numeric for non-labelled data, which is not intended,
-        # for instance for .rds files
-        if (!is.null(value_labels) && length(value_labels)) {
-          # filter, so only matching value labels remain
-          value_labels <- value_labels[value_labels %in% unique(i)]
-
-          # guess variable type
-          if (is.character(i)) {
-            # we need this to drop haven-specific class attributes
-            i <- as.character(i)
-          } else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
-            # if all values are labelled, we assume factor. Use labels as levels
-            if (is.numeric(i)) {
-              i <- factor(i, labels = names(value_labels))
-            } else {
-              i <- factor(as.character(i), labels = names(value_labels))
-            }
-            value_labels <- NULL
-            attr(i, "converted_to_factor") <- TRUE
+        # filter, so only matching value labels remain
+        value_labels <- value_labels[value_labels %in% unique(i)]
+
+        # guess variable type
+        if (is.character(i)) {
+          # we need this to drop haven-specific class attributes
+          i <- as.character(i)
+        } else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
+          # if all values are labelled, we assume factor. Use labels as levels
+          if (is.numeric(i)) {
+            i <- factor(i, labels = names(value_labels))
           } else {
-            # else, fall back to numeric
-            i <- as.numeric(i)
-          }
-
-          # drop unused value labels
-          value_labels <- value_labels[value_labels %in% unique(i)]
-          if (length(value_labels) > 0L) {
-            attr(i, "labels") <- value_labels
+            i <- factor(as.character(i), labels = names(value_labels))
           }
+          value_labels <- NULL
+          attr(i, "converted_to_factor") <- TRUE
+        } else {
+          # else, fall back to numeric
+          i <- as.numeric(i)
+        }
 
-          # add back variable label
-          attr(i, "label") <- variable_labels
+        # drop unused value labels
+        value_labels <- value_labels[value_labels %in% unique(i)]
+        if (length(value_labels) > 0L) {
+          attr(i, "labels") <- value_labels
         }
+
+        # add back variable label
+        attr(i, "label") <- variable_labels
       }
       i
     })
@@ -295,7 +290,7 @@ data_read <- function(path,
 }
 
 
-.read_unknown <- function(path, file_type, convert_factors, verbose, ...) {
+.read_unknown <- function(path, file_type, verbose, ...) {
   insight::check_if_installed("rio", reason = paste0("to read files of type '", file_type, "'"))
   if (verbose) {
     insight::format_alert("Reading data...")
@@ -324,6 +319,5 @@ data_read <- function(path,
     }
     out <- tmp
   }
-
-  .post_process_imported_data(out, convert_factors, verbose)
+  out
 }
diff --git a/man/data_read.Rd b/man/data_read.Rd
index 1ae3cea8a..d7d26255b 100644
--- a/man/data_read.Rd
+++ b/man/data_read.Rd
@@ -33,15 +33,16 @@ for SAS data files.}
 \item{encoding}{The character encoding used for the file. Usually not needed.}
 
 \item{convert_factors}{If \code{TRUE} (default), numeric variables, where all
-values have a value label, are assumed to be categorical and converted
-into factors. If \code{FALSE}, no variable types are guessed and no conversion
-of numeric variables into factors will be performed. See also section
-'Differences to other packages'. For \code{data_write()}, this argument only
-applies to the text (e.g. \code{.txt} or \code{.csv}) or spreadsheet file formats (like
-\code{.xlsx}). Converting to factors might be useful for these formats because
-labelled numeric variables are then converted into factors and exported as
-character columns - else, value labels would be lost and only numeric values
-are written to the file.}
+values have a value label, are assumed to be categorical and converted into
+factors. If \code{FALSE}, no variable types are guessed and no conversion of
+numeric variables into factors will be performed. For \code{data_read()}, this
+argument only applies to file types with \emph{labelled data}, e.g. files from
+SPSS, SAS or Stata. See also section 'Differences to other packages'. For
+\code{data_write()}, this argument only applies to the text (e.g. \code{.txt} or
+\code{.csv}) or spreadsheet file formats (like \code{.xlsx}). Converting to factors
+might be useful for these formats because labelled numeric variables are then
+converted into factors and exported as character columns - else, value labels
+would be lost and only numeric values are written to the file.}
 
 \item{verbose}{Toggle warnings and messages.}
 
@@ -118,12 +119,13 @@ versions, use \code{compress = "none"}, for example
 
 \code{data_read()} is most comparable to \code{rio::import()}. For data files from
 SPSS, SAS or Stata, which support labelled data, variables are converted into
-their most appropriate type. The major difference to \code{rio::import()} is that
-\code{data_read()} automatically converts fully labelled numeric variables into
-factors, where imported value labels will be set as factor levels. If a
-numeric variable has \emph{no} value labels or less value labels than values, it
-is not converted to factor. In this case, value labels are preserved as
-\code{"labels"} attribute. Character vectors are preserved. Use
+their most appropriate type. The major difference to \code{rio::import()} is for
+data files from SPSS, SAS, or Stata, i.e. file types that support
+\emph{labelled data}. \code{data_read()} automatically converts fully labelled numeric
+variables into factors, where imported value labels will be set as factor
+levels. If a numeric variable has \emph{no} value labels or less value labels than
+values, it is not converted to factor. In this case, value labels are
+preserved as \code{"labels"} attribute. Character vectors are preserved. Use
 \code{convert_factors = FALSE} to remove the automatic conversion of numeric
 variables to factors.
 }

From 25d76dfaeead137d26cd2a8ae1e03e9a51c7f9fa Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sat, 19 Oct 2024 10:11:42 +0200
Subject: [PATCH 4/6] Add test for preserved column classes in rds files

---
 tests/testthat/test-data_read.R | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/tests/testthat/test-data_read.R b/tests/testthat/test-data_read.R
index ac316c706..15f1161d3 100644
--- a/tests/testthat/test-data_read.R
+++ b/tests/testthat/test-data_read.R
@@ -141,12 +141,12 @@ test_that("data_read - RDS file, matrix, coercible", {
     httr::stop_for_status(request)
     writeBin(httr::content(request, type = "raw"), temp_file)
 
-    expect_message(expect_message(expect_message({
+    expect_message({
       d <- data_read(
         temp_file,
         verbose = TRUE
       )
-    })), regex = "0 out of 5")
+    })
 
     expect_s3_class(d, "data.frame")
     expect_identical(dim(d), c(2L, 5L))
@@ -155,6 +155,29 @@ test_that("data_read - RDS file, matrix, coercible", {
 
 
 
+# RDS file, preserve class/types -----------------------------------
+
+test_that("data_read - RDS file, preserve class", {
+  withr::with_tempfile("temp_file", fileext = ".rds", code = {
+    request <- httr::GET("https://raw.github.com/easystats/circus/main/data/hiv.rds")
+    httr::stop_for_status(request)
+    writeBin(httr::content(request, type = "raw"), temp_file)
+
+    d <- data_read(temp_file)
+    expect_s3_class(d, "data.frame")
+    expect_identical(
+      sapply(d, class),
+      c(
+        village = "integer", outcome = "integer", distance = "numeric",
+        amount = "numeric", incentive = "integer", age = "integer",
+        hiv2004 = "integer", agecat = "factor"
+      )
+    )
+  })
+})
+
+
+
 # RData -----------------------------------
 
 test_that("data_read - no warning for RData", {

From 540c0b3edf1ba40005a7ca3812fda0da441e8651 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Sat, 19 Oct 2024 10:19:21 +0200
Subject: [PATCH 5/6] Fix typo in NEWS.md ("reserves" -> "preserves")

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index d0c42fdb6..f66f6ab5b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -24,7 +24,7 @@ BUG FIXES
 * `describe_distribution()` no longer errors if the sample was too sparse to compute
   CIs. Instead, it warns the user and returns `NA` (#550).
 
-* `data_read()` reserves variable types when importing files from `rds` or
+* `data_read()` preserves variable types when importing files from `rds` or
   `rdata` format.
 
 # datawizard 0.13.0

From 3d5febc50308b0aa88f3d800a64dda36dcd1a682 Mon Sep 17 00:00:00 2001
From: Etienne Bacher <52219252+etiennebacher@users.noreply.github.com>
Date: Sat, 19 Oct 2024 16:27:08 +0200
Subject: [PATCH 6/6] Reference #558 in NEWS.md entry for `data_read()`

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index f66f6ab5b..8fc8a29ca 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -25,7 +25,7 @@ BUG FIXES
   CIs. Instead, it warns the user and returns `NA` (#550).
 
 * `data_read()` preserves variable types when importing files from `rds` or
-  `rdata` format.
+  `rdata` format (#558).
 
 # datawizard 0.13.0