From 8d879e9eff753ed8b5764b9fd3f4d8621a45ba62 Mon Sep 17 00:00:00 2001
From: Xianying Tan <shrektan@126.com>
Date: Fri, 2 Dec 2016 23:21:35 +0800
Subject: [PATCH] correctly parse non-ASCII characters of R files in Windows
 (#532)

* add support for non-ASCII chars in roxygenize

* ensure Rd files are written as UTF-8
---
 NEWS.md                                 |  4 ++++
 R/parse.R                               | 10 +++++++---
 R/source.R                              |  7 ++++++-
 R/utils.R                               |  4 +++-
 tests/testthat/test-nonASCII.R          | 16 ++++++++++++++++
 tests/testthat/testNonASCII/DESCRIPTION |  8 ++++++++
 tests/testthat/testNonASCII/R/a.r       |  9 +++++++++
 7 files changed, 53 insertions(+), 5 deletions(-)
 create mode 100644 tests/testthat/test-nonASCII.R
 create mode 100644 tests/testthat/testNonASCII/DESCRIPTION
 create mode 100644 tests/testthat/testNonASCII/R/a.r

diff --git a/NEWS.md b/NEWS.md
index fd341c575..7c8ce24e1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -93,6 +93,10 @@
 * The new `_PACKAGE` sentinel now also works from `roxygenise()`; before
   it only worked from `devtools::document()` (#439, @krlmlr).
 
+* `roxygen2::roxygenise()` now parses non-ASCII documentation correctly
+  (as long as the files are UTF-8 encoded or `Encoding` is set in DESCRIPTION)
+  (#532, @shrektan).
+
 ## Extension
 
 * Deprecated `register.preref.parser()` and `register.preref.parsers()`
diff --git a/R/parse.R b/R/parse.R
index 32209da0b..6cd89436f 100644
--- a/R/parse.R
+++ b/R/parse.R
@@ -1,9 +1,10 @@
 parse_package <- function(base_path, load_code, registry, global_options = list()) {
   env <- load_code(base_path)
+  desc <- read_pkg_description(base_path)
 
   files <- package_files(base_path)
   parsed <- lapply(files, parse_blocks, env = env, registry = registry,
-                   global_options = global_options)
+                   global_options = global_options, fileEncoding = desc$Encoding %||% "UTF-8")
   blocks <- unlist(parsed, recursive = FALSE)
 
   list(env = env, blocks = blocks)
@@ -24,8 +25,11 @@ parse_text <- function(text, registry = default_tags(), global_options = list())
   list(env = env, blocks = blocks)
 }
 
-parse_blocks <- function(file, env, registry, global_options = list()) {
-  parsed <- parse(file = file, keep.source = TRUE)
+parse_blocks <- function(file, env, registry, global_options = list(), fileEncoding = "UTF-8") {
+
+  con <- file(file, encoding = fileEncoding)
+  on.exit(close(con), add = TRUE)
+  parsed <- parse(con, keep.source = TRUE, srcfile = srcfile(file, encoding = fileEncoding))
   if (length(parsed) == 0) return()
 
   refs <- utils::getSrcref(parsed)
diff --git a/R/source.R b/R/source.R
index 8fb49d61e..4452e0409 100644
--- a/R/source.R
+++ b/R/source.R
@@ -21,12 +21,17 @@ source_package <- function(path) {
 
   load_pkg_dependencies(path)
 
+  desc <- read_pkg_description(path)
   paths <- package_files(path)
-  lapply(paths, sys.source, envir = env, keep.source = FALSE)
+  lapply(paths, sys_source, envir = env, fileEncoding = desc$Encoding %||% "UTF-8")
 
   env
 }
 
+sys_source <- function(file, envir = baseenv(), fileEncoding = "UTF-8") {
+  source(file, encoding = fileEncoding, keep.source = FALSE, local = envir)
+}
+
 # Assume that the package has already been loaded by other means
 # (e.g. build and reload)
 loaded_package <- function(path) {
diff --git a/R/utils.R b/R/utils.R
index a4f624ce2..88e13c151 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -80,7 +80,9 @@ write_if_different <- function(path, contents, check = TRUE) {
     FALSE
   } else {
     cat(sprintf('Writing %s\n', name))
-    writeLines(contents, path)
+    con <- file(path, encoding = "UTF-8")
+    on.exit(close(con), add = TRUE)
+    writeLines(contents, con)
     TRUE
   }
 }
diff --git a/tests/testthat/test-nonASCII.R b/tests/testthat/test-nonASCII.R
new file mode 100644
index 000000000..eeb24167d
--- /dev/null
+++ b/tests/testthat/test-nonASCII.R
@@ -0,0 +1,16 @@
+context("nonASCII")
+
+test_that("can generate nonASCII document", {
+  test_pkg <- temp_copy_pkg('testNonASCII')
+  on.exit(unlink(test_pkg, recursive = TRUE))
+
+  expect_output(roxygenize(test_pkg), "printChineseMsg[.]Rd")
+  expect_true(file.exists(file.path(test_pkg, "man", "printChineseMsg.Rd")))
+
+  cnChar <- readLines(file.path(test_pkg, "man", "printChineseMsg.Rd"), encoding = "UTF-8")
+
+  # The parser used by testthat::test does not set the encoding to UTF-8
+  # either, so we have to use Unicode escapes here.
+  expect_true(any(grepl("\u6211\u7231\u4e2d\u6587", cnChar)))
+  expect_true(any(grepl("\u4e2d\u6587\u6ce8\u91ca", cnChar)))
+})
diff --git a/tests/testthat/testNonASCII/DESCRIPTION b/tests/testthat/testNonASCII/DESCRIPTION
new file mode 100644
index 000000000..9e5ca820e
--- /dev/null
+++ b/tests/testthat/testNonASCII/DESCRIPTION
@@ -0,0 +1,8 @@
+Package: testNonASCII
+Title: Test that non-ASCII documentation is parsed correctly
+License: GPL-2
+Description:
+Author: Shrektan <shrektan@126.com>
+Maintainer: Shrektan <shrektan@126.com>
+Encoding: GB2312
+Version: 0.1
diff --git a/tests/testthat/testNonASCII/R/a.r b/tests/testthat/testNonASCII/R/a.r
new file mode 100644
index 000000000..770d735a9
--- /dev/null
+++ b/tests/testthat/testNonASCII/R/a.r
@@ -0,0 +1,9 @@
+# This script is intended to be saved in GB2312 to test if non UTF-8 encoding is
+# supported.
+
+#' 中文注释
+#'
+#' @note 我爱中文。
+printChineseMsg <- function() {
+  message("我是GB2312的中文字符。")
+}