diff --git a/R/to-utf8.R b/R/to-utf8.R new file mode 100644 index 0000000..c7b3c53 --- /dev/null +++ b/R/to-utf8.R @@ -0,0 +1,50 @@ +#' Deep conversion to UTF-8 +#' +#' Converts all characters directly or indirectly contained in an object to +#' UTF-8. +to_utf8 <- function(x, ...) UseMethod("to_utf8", x) + +to_utf8.utf8 <- function(x, ...) { + attrib_to_utf8(x) +} + +to_utf8.list <- function(x, ...) { + x[] <- lapply(x, to_utf8) + names(x) <- to_utf8(names(x)) + attrib_to_utf8(x) +} + +to_utf8.data.frame <- to_utf8.list + +to_utf8.character <- function(x, ..., use_class = TRUE) { + if (use_class) + x <- as.utf8(x) + else + x <- enc2utf8(x) + attrib_to_utf8(x) +} + +to_utf8.default <- function(x, ...) { + attrib_to_utf8(x) +} + +to_utf8.NULL <- function(x, ...) { + NULL +} + +attrib_to_utf8 <- function(x) { + mostattributes(x) <- named_to_utf8_except_class(attributes(x)) + x +} + +named_to_utf8_except_class <- function(attrib) { + is_class <- which(names(attrib) == "class") + if (length(is_class) > 0) { + attrib[-is_class] <- to_utf8(unname(attrib)[-is_class]) + attrib[[is_class]] <- to_utf8(unname(attrib)[[is_class]], use_class = FALSE) + } else { + attrib <- to_utf8(unname(attrib)) + } + + attrib +} diff --git a/man/to_utf8.Rd b/man/to_utf8.Rd new file mode 100644 index 0000000..cf084a9 --- /dev/null +++ b/man/to_utf8.Rd @@ -0,0 +1,13 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/to-utf8.R +\name{to_utf8} +\alias{to_utf8} +\title{Deep conversion to UTF-8} +\usage{ +to_utf8(x, ...) +} +\description{ +Converts all characters directly or indirectly contained in an object to +UTF-8. +} + diff --git a/src/encoding.c b/src/encoding.c index 19fe146..1d77519 100644 --- a/src/encoding.c +++ b/src/encoding.c @@ -1,5 +1,4 @@ #include -#include #define USE_RINTERNALS #include diff --git a/tests/testthat/test-to-utf8.R b/tests/testthat/test-to-utf8.R new file mode 100644 index 0000000..5e499e2 --- /dev/null +++ b/tests/testthat/test-to-utf8.R @@ -0,0 +1,31 @@ +context("to-utf8") + +test_that("character vectors", { + expect_is(to_utf8(letters), "utf8") + expect_false(inherits(class(to_utf8(letters)), "utf8")) +}) + +test_that("iris", { + iris_utf8 <- to_utf8(iris) + expect_is(colnames(iris_utf8), "utf8") + expect_true(all_utf8(names(attributes(iris_utf8)))) + expect_is(levels(iris_utf8$Species), "utf8") +}) + +test_that("mtcars", { + mtcars_utf8 <- to_utf8(mtcars) + expect_is(colnames(mtcars_utf8), "utf8") + expect_true(all_utf8(names(attributes(mtcars_utf8)))) + expect_true(all_utf8(rownames(mtcars_utf8))) +}) + +test_that("umlauts", { + data <- data.frame(a = I(c("o", "u"))) + colnames(data) <- enc2native("\u00e4") + data[[1]] <- enc2native(c("\u00f6", "\u00fc")) + + data_utf8 <- to_utf8(data) + expect_is(colnames(data_utf8), "utf8") + expect_true(all_utf8(names(attributes(data_utf8)))) + expect_true(all_utf8(rownames(data_utf8))) +})