fwrite single column DT now sets na='NA' to avoid blank lines, #2106

Rdatatable · Oct 30, 2017 · 1fc5cba · 1fc5cba
1 parent 7357a3a
commit 1fc5cba
Show file tree

Hide file tree

Showing 7 changed files with 35 additions and 16 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -29,6 +29,7 @@
 2. `fwrite()`:
     * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215).
     * `logicalAsInt` has been renamed `logical01` and the default changed from `FALSE` to `TRUE`, both changes for consistency with `fread` (see item above). The old name `logicalAsInt` continues to work but is now deprecated. The previous default can easily be restored without any code changes by setting `options("datatable.logical01" = FALSE)`.
+    * When the input `x` is a single column, the default for `na=` is now `"NA"` rather than `""`, to avoid blank lines in the output, [#2106](https://github.com/Rdatatable/data.table/issues/2106). Pass `na=""` explicitly to restore the previous output. Thanks to @skanskan, Michael Chirico and @franknarf1 for the testing and ideas.
 
 3. Added helpful message when subsetting by a logical column without wrapping it in parentheses, [#1844](https://github.com/Rdatatable/data.table/issues/1844). Thanks @dracodoc for the suggestion and @MichaelChirico for the PR.
 

diff --git a/R/fread.R b/R/fread.R
@@ -1,8 +1,13 @@
 
 fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings="NA",stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),autostart=NA,skip=0,select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=interactive(),data.table=getOption("datatable.fread.datatable"),nThread=getDTthreads(),logical01=TRUE)
 {
-  stopifnot( is.character(sep), length(sep)==1, sep=="auto" || nchar(sep)==1 )
-  if (sep == "auto") sep=""
+  if (is.null(sep)) sep="\n"         # C level knows that \n means \r\n on Windows, for example
+  else {
+    stopifnot( is.character(sep), length(sep)==1 )
+    if (sep=="") sep="\n"            # meaning readLines behaviour. The 3 values (NULL, "" or "\n") are equivalent.
+    else if (sep=="auto") sep=""     # sep=="" at C level means auto sep
+    else stopifnot( nchar(sep)==1 )  # otherwise an actual character to use as sep
+  }
   stopifnot( is.character(dec), length(dec)==1, nchar(dec)==1 )
   # handle encoding, #563
   if (length(encoding) != 1L || !encoding %in% c("unknown", "UTF-8", "Latin-1")) {

diff --git a/R/fwrite.R b/R/fwrite.R
@@ -1,6 +1,6 @@
 fwrite <- function(x, file="", append=FALSE, quote="auto",
            sep=",", sep2=c("","|",""), eol=if (.Platform$OS.type=="windows") "\r\n" else "\n",
-           na="", dec=".", row.names=FALSE, col.names=TRUE,
+           na=if (length(x)>1L) "" else "NA", dec=".", row.names=FALSE, col.names=TRUE,
            qmethod=c("double","escape"),
            logical01=getOption("datatable.logical01", TRUE),
            logicalAsInt=logical01,

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -9673,9 +9673,9 @@ test(1729.2, fwrite(data.table(V2=c(9.999999999999998223643160599749535322189331
 DT = data.table(V1=c(9999999999.99, 0.00000000000000099, 0.0000000000000000000009, 0.9, 9.0, 9.1, 99.9,
                      0.000000000000000000000999999999999999999999999,
                      99999999999999999999999999999.999999))
-ans = "V19999999999.999.9e-169e-220.999.199.91e-211e+29"
+ans = "\"V1\"9999999999.999.9e-169e-220.999.199.91e-211e+29"
 test(1729.3, fwrite(DT), output=ans)
-test(1729.4, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans)
+test(1729.4, write.csv(DT,row.names=FALSE), output=ans)
 options(oldverbose)
 
 # same decimal/scientific rule (shortest format) as write.csv
@@ -9741,13 +9741,13 @@ ans = c("V1","5.123456789e+300","-5.123456789e+300",
 # 0 11111110100 010111011111100101001110101100000011 01101011 10101100
 # 0 00000001010 000101000110010100110011101010000110 00111110 01010001
 # 0 11111110100 011001101011100100100011110110110000 01001110 01011101
-test(1729.9, fwrite(DT), output=paste(ans,collapse=""))
+test(1729.9, fwrite(DT,na=""), output=paste(ans,collapse=""))
 test(1729.11, write.csv(DT,row.names=FALSE,quote=FALSE), output=paste(ans,collapse=""))
 DT = data.table(unlist(.Machine[c("double.eps","double.neg.eps","double.xmin","double.xmax")]))
 #    double.eps double.neg.eps    double.xmin    double.xmax
 #  2.220446e-16   1.110223e-16  2.225074e-308  1.797693e+308
 test(1729.12, typeof(DT[[1L]]), "double")
-test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE)))
+test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE)))
 
 if ("package:bit64" %in% search()) {
   test(1730.1, typeof(-2147483647L), "integer")
@@ -9904,10 +9904,11 @@ test(1736.6, capture.output(fwrite(DT, sep='|', sep2=c("{",",","}"), logicalAsIn
  c("A|B|C", "1|{1,2,3,4,5,6,7,8,9,10}|{s,t,u,v,w}",
  "2|{15,16,17,18}|{1.2,2.3,3.4,3.14159265358979,-9}", "3|{7}|{foo,bar}", "4|{9,10}|{1,1,0}"))
 DT = data.table(A=c("foo","ba|r","baz"))
-test(1736.7, capture.output(fwrite(DT)), c("A","foo","ba|r","baz"))  # no list column so no need to quote
+test(1736.7, capture.output(fwrite(DT,na="")), c("A","foo","ba|r","baz"))              # no list column so no need to quote
+test(1736.8, capture.output(fwrite(DT)), c("\"A\"","\"foo\"","\"ba|r\"","\"baz\""))    # column name is quoted because na="NA" due to 1-column
 DT = data.table(A=c("foo","ba|r","baz"), B=list(1:3,1:4,c("fo|o","ba,r","baz"))) # now list column and need to quote
-test(1736.8, capture.output(fwrite(DT)), c("A,B", "foo,1|2|3", "\"ba|r\",1|2|3|4", "baz,\"fo|o\"|\"ba,r\"|baz"))
-test(1736.9, capture.output(fwrite(DT,quote=TRUE)), c("\"A\",\"B\"", "\"foo\",1|2|3", "\"ba|r\",1|2|3|4", "\"baz\",\"fo|o\"|\"ba,r\"|\"baz\""))
+test(1736.9, capture.output(fwrite(DT)), c("A,B", "foo,1|2|3", "\"ba|r\",1|2|3|4", "baz,\"fo|o\"|\"ba,r\"|baz"))
+test(1736.11, capture.output(fwrite(DT,quote=TRUE)), c("\"A\",\"B\"", "\"foo\",1|2|3", "\"ba|r\",1|2|3|4", "\"baz\",\"fo|o\"|\"ba,r\"|\"baz\""))
 
 # any list of same length vector input
 test(1737.1, fwrite(list()), NULL, warning="fwrite was passed an empty list of no columns")
@@ -9918,8 +9919,8 @@ test(1737.5, fwrite(list(1.2,B=c("foo","bar"))), error="Column 2's length (2) is
 
 # fwrite ITime, Date, IDate
 DT = data.table(A=as.ITime(c("23:59:58","23:59:59","12:00:00","00:00:01",NA,"00:00:00")))
-test(1738.1, capture.output(fwrite(DT)), c("A","23:59:58","23:59:59","12:00:00","00:00:01","","00:00:00"))
-test(1738.2, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE, na="")))
+test(1738.1, capture.output(fwrite(DT)), c("\"A\"","23:59:58","23:59:59","12:00:00","00:00:01","NA","00:00:00"))
+test(1738.2, capture.output(fwrite(DT,na="")), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE, na="")))
 dts = c("1901-05-17","1907-10-22","1929-10-24","1962-05-28","1987-10-19","2008-09-15",
         "1968-12-30","1968-12-31","1969-01-01","1969-01-02")
 DT = data.table(A=as.Date(dts), B=as.IDate(dts))
@@ -10308,7 +10309,8 @@ if ("package:bit64" %in% search()) {
 # end Grouping Sets
 
 # for completeness, added test for NA problem to close #1837. Fixed long ago before release to CRAN.
-test(1751, capture.output(fwrite(data.table(x=NA_integer_),verbose=FALSE)), c("x",""))
+test(1751.1, capture.output(fwrite(data.table(x=NA_integer_),verbose=FALSE)), c("\"x\"","NA"))
+test(1751.2, capture.output(fwrite(data.table(x=NA_integer_),na="",verbose=FALSE)), c("x",""))
 
 if ("package:nanotime" %in% search()) {
   DT = data.table(A=nanotime(tt<-c("2016-09-28T15:30:00.000000070Z",
@@ -10967,6 +10969,16 @@ test(1837, fread('v1,v2,v3,v4,v5\n1,2,3,4,5', select=-1), error="out of range.*C
 
 test(1838, fread("default payment next month\n0.5524\n0.2483\n0.1157\n"), data.table("default payment next month"=c(0.5524,0.2483,0.1157)))  #2322
 
+# better writing and reading of NA in single column input, #2106
+DT = data.table(a=c(4,NA,2,3.14,999,NA))
+fwrite(DT, f<-tempfile(), na="")  # old default for na was always ""
+test(1839.1, fread(f), data.table(a=4L), warning="text exists afterwards.*Consider fill.*blank.lines.skip.*<<2>>")
+test(1839.2, fread(f, blank.lines.skip=TRUE), data.table(a=c(4,2,3.14,999)))
+test(1839.3, fread(f, fill=TRUE), data.table(a=c(4,NA,2,3.14,999)))
+test(1839.4, fread(f, fill=TRUE, blank.lines.skip=TRUE), data.table(a=c(4,2,3.14,999)))
+fwrite(DT, f)  # new default sets na="NA" when ncol==1
+test(1839.5, fread(f), DT)
+
 
 
 ##########################

diff --git a/man/fread.Rd b/man/fread.Rd
@@ -24,7 +24,7 @@ nThread=getDTthreads(), logical01=TRUE
 }
 \arguments{
   \item{input}{ Either the file name to read (containing no \\n character), a shell command that pre-processes the file (e.g. \code{fread("grep blah filename")}) or the input itself as a string (containing at least one \\n), see examples. In both cases, a length 1 character string. A filename input is passed through \code{\link[base]{path.expand}} for convenience and may be a URL starting http:// or file://. }
-  \item{sep}{ The separator between columns. Defaults to the character in the set \code{[,\\t |;:]} that separates the sample of rows into the most number of lines with the same number of fields. }
+  \item{sep}{ The separator between columns. Defaults to the character in the set \code{[,\\t |;:]} that separates the sample of rows into the greatest number of lines with the same number of fields. Use \code{NULL} or \code{""} to specify no separator; i.e. each line is read as a single character column, as \code{base::readLines} does. }
   \item{sep2}{ The separator \emph{within} columns. A \code{list} column will be returned where each cell is a vector of values. This is much faster using less working memory than \code{strsplit} afterwards or similar techniques. For each column \code{sep2} can be different and is the first character in the same set above [\code{,\\t |;}], other than \code{sep}, that exists inside each field outside quoted regions in the sample. NB: \code{sep2} is not yet implemented. }
   \item{nrows}{ The maximum number of rows to read. Unlike \code{read.table}, you do not need to set this to an estimate of the number of rows in the file for better speed because that is already automatically determined by \code{fread} almost instantly using the large sample of lines. `nrows=0` returns the column names and typed empty columns determined by the large sample; useful for a dry run of a large file or to quickly check format consistency of a set of files before starting to read any of them. }
   \item{header}{ Does the first data line contain column names? Defaults according to whether every non-empty field on the first data line is type character. If so, or TRUE is supplied, any empty column names are given a default name. }

diff --git a/man/fwrite.Rd b/man/fwrite.Rd
@@ -10,7 +10,8 @@ This is new functionality as of Nov 2016. We may need to refine argument names a
 fwrite(x, file = "", append = FALSE, quote = "auto",
   sep = ",", sep2 = c("","|",""),
   eol = if (.Platform$OS.type=="windows") "\r\n" else "\n",
-  na = "", dec = ".", row.names = FALSE, col.names = TRUE,
+  na = if (length(x)>1L) "" else "NA", dec = ".",
+  row.names = FALSE, col.names = TRUE,
   qmethod = c("double","escape"),
   logical01 = getOption("datatable.logical01", TRUE),
   logicalAsInt = logical01,  # deprecated

diff --git a/src/fread.c b/src/fread.c
@@ -1608,7 +1608,7 @@ int freadMain(freadMainArgs _args) {
   if (lastSampleJumpOk) {
     while (ch<eof && isspace(*ch)) ch++;
     if (ch<eof)
-      DTWARN("Found the last consistent line but text exists afterwards (discarded): <<%s>>", strlim(ch,200));
+      DTWARN("Found the last consistent line but text exists afterwards. Consider fill=TRUE and/or blank.lines.skip=TRUE. First 200 characters of discarded line: <<%s>>", strlim(ch,200));
   } else {
     // nextGoodLine() was false for the last (extra) jump to check the end
     // must set lastRowEnd to eof accordingly otherwise it'll be left wherever the last good jump finished