diff --git a/NEWS.md b/NEWS.md index 340137971..311e2471a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,7 @@ * The ability to position `autostart` anywhere inside one of multiple tables in a single file is removed with warning. It used to search upwards from that line to find the start of the table based on a consistent number of columns. People appear to be using `skip="string"` or `skip=nrow` to find the header row exactly, which is retained and simpler. It was too difficult to retain search-upwards-autostart together with skipping blank lines, filling incomplete rows and parallelization. Varying format and height messy header info above the column names is still auto detected and auto skipped. * `dec=','` is now implemented directly so there is no dependency on locale. The options `datatable.fread.dec.experiment` and `datatable.fread.dec.locale` have been removed. * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik and more to add for testing before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846) + * Detect the byte order mark (BOM) of GB-18030 and UTF-16 encoded files: a GB-18030 BOM is skipped with a warning that character fields may be garbled, and a UTF-16 BOM stops with an error asking for the file to be recoded to UTF-8. In verbose mode, a message is printed when a UTF-8 or GB-18030 BOM is found and skipped.
#### BUG FIXES diff --git a/inst/tests/gb18030.txt b/inst/tests/gb18030.txt new file mode 100644 index 000000000..8b05ce081 --- /dev/null +++ b/inst/tests/gb18030.txt @@ -0,0 +1,2 @@ +x,y,z +,, diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index cafb3a256..551f9e6a5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -7906,6 +7906,10 @@ if ("package:bit64" %in% search()) { # fix for #1087 and #1465 test(1627, charToRaw(names(fread("issue_1087_utf8_bom.csv"))[1L]), as.raw(97L)) +test(1627.1, names(fread("issue_1087_utf8_bom.csv", verbose=T))[1L], "a", output="UTF-8 byte order mark EF BB BF found") +test(1627.2, names(fread("gb18030.txt", verbose=T))[1L], "x", output="GB-18030 byte order mark 84 31 95 33 found", warning="GB-18030 encoding detected") +test(1627.3, fread("utf16le.txt"), error="File is encoded in UTF-16") +test(1627.4, fread("utf16be.txt"), error="File is encoded in UTF-16") # uniqueN gains na.rm argument, #1455 set.seed(1L) diff --git a/inst/tests/utf16be.txt b/inst/tests/utf16be.txt new file mode 100644 index 000000000..a85d75180 Binary files /dev/null and b/inst/tests/utf16be.txt differ diff --git a/inst/tests/utf16le.txt b/inst/tests/utf16le.txt new file mode 100644 index 000000000..36b633aa3 Binary files /dev/null and b/inst/tests/utf16le.txt differ diff --git a/src/fread.c b/src/fread.c index a269ccc3a..4d190ddfa 100644 --- a/src/fread.c +++ b/src/fread.c @@ -602,11 +602,30 @@ int freadMain(freadMainArgs args) { } double tMap = wallclock(); + // ******************************************************************************************** + // Check whether the file contains BOM (Byte Order Mark), and if yes strip it, modifying + // `mmp`. Also, presence of BOM allows us to reliably detect the file's encoding. 
+ // See: https://en.wikipedia.org/wiki/Byte_order_mark + // See: issues #1087 and #1465 + // ******************************************************************************************** + if (fileSize >= 3 && memcmp(sof, "\xEF\xBB\xBF", 3) == 0) { + sof += 3; + // ienc = CE_UTF8; + if (args.verbose) DTPRINT("UTF-8 byte order mark EF BB BF found at the start of the file and skipped.\n"); + } + else if (fileSize >= 4 && memcmp(sof, "\x84\x31\x95\x33", 4) == 0) { + sof += 4; + // ienc = CE_GB18030; + if (args.verbose) DTPRINT("GB-18030 byte order mark 84 31 95 33 found at the start of the file and skipped.\n"); + DTWARN("GB-18030 encoding detected, however fread() is unable to decode it. Some character fields may be garbled.\n"); + } + else if (fileSize >= 2 && sof[0] + sof[1] == '\xFE' + '\xFF') { // either 0xFE 0xFF or 0xFF 0xFE + STOP("File is encoded in UTF-16, this encoding is not supported by fread(). Please recode the file to UTF-8."); + } + // ******************************************************************************************** // Auto detect eol, first eol where there are two (i.e. CRLF) // ******************************************************************************************** - // take care of UTF8 BOM, #1087 and #1465 - if (!memcmp(sof, "\xef\xbb\xbf", 3)) sof += 3; ch = sof; while (ch