From 8d879e9eff753ed8b5764b9fd3f4d8621a45ba62 Mon Sep 17 00:00:00 2001 From: Xianying Tan Date: Fri, 2 Dec 2016 23:21:35 +0800 Subject: [PATCH] correctly parse non-ASCII characters of R files in Windows (#532) * add supports for nonASCII chars in roxygenize * ensure write UTF-8 rd files --- NEWS.md | 4 ++++ R/parse.R | 10 +++++++--- R/source.R | 7 ++++++- R/utils.R | 4 +++- tests/testthat/test-nonASCII.R | 16 ++++++++++++++++ tests/testthat/testNonASCII/DESCRIPTION | 8 ++++++++ tests/testthat/testNonASCII/R/a.r | 9 +++++++++ 7 files changed, 53 insertions(+), 5 deletions(-) create mode 100644 tests/testthat/test-nonASCII.R create mode 100644 tests/testthat/testNonASCII/DESCRIPTION create mode 100644 tests/testthat/testNonASCII/R/a.r diff --git a/NEWS.md b/NEWS.md index fd341c575..7c8ce24e1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -93,6 +93,10 @@ * The new `_PACKAGE` sentinel now also works from `roxygenise()`; before it only worked from `devtools::document()` (#439, @krlmlr). +* `roxygen2::roxygenise()` now parses non-ASCII documentation correctly + (as long as files are UTF-8 encoded or `Encoding` is specified in DESCRIPTION) + (#532, @shrektan). 
+ ## Extension * Deprecated `register.preref.parser()` and `register.preref.parsers()` diff --git a/R/parse.R b/R/parse.R index 32209da0b..6cd89436f 100644 --- a/R/parse.R +++ b/R/parse.R @@ -1,9 +1,10 @@ parse_package <- function(base_path, load_code, registry, global_options = list()) { env <- load_code(base_path) + desc <- read_pkg_description(base_path) files <- package_files(base_path) parsed <- lapply(files, parse_blocks, env = env, registry = registry, - global_options = global_options) + global_options = global_options, fileEncoding = desc$Encoding %||% "UTF-8") blocks <- unlist(parsed, recursive = FALSE) list(env = env, blocks = blocks) @@ -24,8 +25,11 @@ parse_text <- function(text, registry = default_tags(), global_options = list()) list(env = env, blocks = blocks) } -parse_blocks <- function(file, env, registry, global_options = list()) { - parsed <- parse(file = file, keep.source = TRUE) +parse_blocks <- function(file, env, registry, global_options = list(), fileEncoding = "UTF-8") { + + con <- file(file, encoding = fileEncoding) + on.exit(close(con), add = TRUE) + parsed <- parse(con, keep.source = TRUE, srcfile = srcfile(file, encoding = fileEncoding)) if (length(parsed) == 0) return() refs <- utils::getSrcref(parsed) diff --git a/R/source.R b/R/source.R index 8fb49d61e..4452e0409 100644 --- a/R/source.R +++ b/R/source.R @@ -21,12 +21,17 @@ source_package <- function(path) { load_pkg_dependencies(path) + desc <- read_pkg_description(path) paths <- package_files(path) - lapply(paths, sys.source, envir = env, keep.source = FALSE) + lapply(paths, sys_source, envir = env, fileEncoding = desc$Encoding %||% "UTF-8") env } +sys_source <- function(file, envir = baseenv(), fileEncoding = "UTF-8") { + source(file, encoding = fileEncoding, keep.source = FALSE, local = envir) +} + # Assume that the package has already been loaded by other means # (e.g. 
build and reload) loaded_package <- function(path) { diff --git a/R/utils.R b/R/utils.R index a4f624ce2..88e13c151 100644 --- a/R/utils.R +++ b/R/utils.R @@ -80,7 +80,9 @@ write_if_different <- function(path, contents, check = TRUE) { FALSE } else { cat(sprintf('Writing %s\n', name)) - writeLines(contents, path) + con <- file(path, encoding = "UTF-8") + on.exit(close(con), add = TRUE) + writeLines(contents, con) TRUE } } diff --git a/tests/testthat/test-nonASCII.R b/tests/testthat/test-nonASCII.R new file mode 100644 index 000000000..eeb24167d --- /dev/null +++ b/tests/testthat/test-nonASCII.R @@ -0,0 +1,16 @@ +context("nonASCII") + +test_that("can generate nonASCII document", { + test_pkg <- temp_copy_pkg('testNonASCII') + on.exit(unlink(test_pkg, recursive = TRUE)) + + expect_output(roxygenize(test_pkg), "printChineseMsg[.]Rd") + expect_true(file.exists(file.path(test_pkg, "man", "printChineseMsg.Rd"))) + + cnChar <- readLines(file.path(test_pkg, "man", "printChineseMsg.Rd"), encoding = "UTF-8") + + # Because the parse in testthat::test don't specify encoding to UTF-8 as well, + # so we have to use unicode escapes. + expect_true(any(grepl("\u6211\u7231\u4e2d\u6587", cnChar))) + expect_true(any(grepl("\u4e2d\u6587\u6ce8\u91ca", cnChar))) +}) diff --git a/tests/testthat/testNonASCII/DESCRIPTION b/tests/testthat/testNonASCII/DESCRIPTION new file mode 100644 index 000000000..9e5ca820e --- /dev/null +++ b/tests/testthat/testNonASCII/DESCRIPTION @@ -0,0 +1,8 @@ +Package: testNonASCII +Title: Test no change to Collate when there are no @includes +License: GPL-2 +Description: +Author: Shrektan +Maintainer: Shrektan +Encoding: GB2312 +Version: 0.1 diff --git a/tests/testthat/testNonASCII/R/a.r b/tests/testthat/testNonASCII/R/a.r new file mode 100644 index 000000000..770d735a9 --- /dev/null +++ b/tests/testthat/testNonASCII/R/a.r @@ -0,0 +1,9 @@ +# This script is intended to be saved in GB2312 to test if non UTF-8 encoding is +# supported. 
+ +#' 中文注释 +#' +#' @note 我爱中文。 +printChineseMsg <- function() { + message("我是GB2312的中文字符。") +}