From 1fc5cbaa32856a0732f6f8b78ef759095da0f22b Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Mon, 30 Oct 2017 15:47:36 -0700 Subject: [PATCH] fwrite single column DT now sets na='NA' to avoid blank lines, #2106 --- NEWS.md | 1 + R/fread.R | 9 +++++++-- R/fwrite.R | 2 +- inst/tests/tests.Rraw | 32 ++++++++++++++++++++++---------- man/fread.Rd | 2 +- man/fwrite.Rd | 3 ++- src/fread.c | 2 +- 7 files changed, 35 insertions(+), 16 deletions(-) diff --git a/NEWS.md b/NEWS.md index 42b533fc9..b42e7c87e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -29,6 +29,7 @@ 2. `fwrite()`: * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). * `logicalAsInt` has been renamed `logical01` and the default changed from `FALSE` to `TRUE`, both changes for consistency with `fread` (see item above). The old name `logicalAsInt` continues to work but is now deprecated. The previous default can easily be restored without any code changes by setting `options("datatable.logical01" = FALSE)`. + * When `DT` is a single column, `na=` now defaults to `"NA"` rather than `""` to avoid blank lines in the output (pass `na=""` explicitly to restore the previous output), [#2106](https://github.com/Rdatatable/data.table/issues/2106). Thanks to @skanskan, Michael Chirico and @franknarf1 for the testing and ideas. 3. Added helpful message when subsetting by a logical column without wrapping it in parentheses, [#1844](https://github.com/Rdatatable/data.table/issues/1844). Thanks @dracodoc for the suggestion and @MichaelChirico for the PR. 
diff --git a/R/fread.R b/R/fread.R index e42a5efc6..a55d3053a 100644 --- a/R/fread.R +++ b/R/fread.R @@ -1,8 +1,13 @@ fread <- function(input="",file,sep="auto",sep2="auto",dec=".",quote="\"",nrows=Inf,header="auto",na.strings="NA",stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),autostart=NA,skip=0,select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"), col.names, check.names=FALSE, encoding="unknown", strip.white=TRUE, fill=FALSE, blank.lines.skip=FALSE, key=NULL, showProgress=interactive(),data.table=getOption("datatable.fread.datatable"),nThread=getDTthreads(),logical01=TRUE) { - stopifnot( is.character(sep), length(sep)==1, sep=="auto" || nchar(sep)==1 ) - if (sep == "auto") sep="" + if (is.null(sep)) sep="\n" # C level knows that \n means \r\n on Windows, for example + else { + stopifnot( is.character(sep), length(sep)==1 ) + if (sep=="") sep="\n" # meaning readLines behaviour. The 3 values (NULL, "" or "\n") are equivalent. + else if (sep=="auto") sep="" # sep=="" at C level means auto sep + else stopifnot( nchar(sep)==1 ) # otherwise an actual character to use as sep + } stopifnot( is.character(dec), length(dec)==1, nchar(dec)==1 ) # handle encoding, #563 if (length(encoding) != 1L || !encoding %in% c("unknown", "UTF-8", "Latin-1")) { diff --git a/R/fwrite.R b/R/fwrite.R index 66136c9da..5ef7fb78c 100644 --- a/R/fwrite.R +++ b/R/fwrite.R @@ -1,6 +1,6 @@ fwrite <- function(x, file="", append=FALSE, quote="auto", sep=",", sep2=c("","|",""), eol=if (.Platform$OS.type=="windows") "\r\n" else "\n", - na="", dec=".", row.names=FALSE, col.names=TRUE, + na=if (length(x)>1L) "" else "NA", dec=".", row.names=FALSE, col.names=TRUE, qmethod=c("double","escape"), logical01=getOption("datatable.logical01", TRUE), logicalAsInt=logical01, diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index e87db84ef..ea8830c67 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -9673,9 +9673,9 @@ test(1729.2, 
fwrite(data.table(V2=c(9.999999999999998223643160599749535322189331 DT = data.table(V1=c(9999999999.99, 0.00000000000000099, 0.0000000000000000000009, 0.9, 9.0, 9.1, 99.9, 0.000000000000000000000999999999999999999999999, 99999999999999999999999999999.999999)) -ans = "V19999999999.999.9e-169e-220.999.199.91e-211e+29" +ans = "\"V1\"9999999999.999.9e-169e-220.999.199.91e-211e+29" test(1729.3, fwrite(DT), output=ans) -test(1729.4, write.csv(DT,row.names=FALSE,quote=FALSE), output=ans) +test(1729.4, write.csv(DT,row.names=FALSE), output=ans) options(oldverbose) # same decimal/scientific rule (shortest format) as write.csv @@ -9741,13 +9741,13 @@ ans = c("V1","5.123456789e+300","-5.123456789e+300", # 0 11111110100 010111011111100101001110101100000011 01101011 10101100 # 0 00000001010 000101000110010100110011101010000110 00111110 01010001 # 0 11111110100 011001101011100100100011110110110000 01001110 01011101 -test(1729.9, fwrite(DT), output=paste(ans,collapse="")) +test(1729.9, fwrite(DT,na=""), output=paste(ans,collapse="")) test(1729.11, write.csv(DT,row.names=FALSE,quote=FALSE), output=paste(ans,collapse="")) DT = data.table(unlist(.Machine[c("double.eps","double.neg.eps","double.xmin","double.xmax")])) # double.eps double.neg.eps double.xmin double.xmax # 2.220446e-16 1.110223e-16 2.225074e-308 1.797693e+308 test(1729.12, typeof(DT[[1L]]), "double") -test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE))) +test(1729.13, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE))) if ("package:bit64" %in% search()) { test(1730.1, typeof(-2147483647L), "integer") @@ -9904,10 +9904,11 @@ test(1736.6, capture.output(fwrite(DT, sep='|', sep2=c("{",",","}"), logicalAsIn c("A|B|C", "1|{1,2,3,4,5,6,7,8,9,10}|{s,t,u,v,w}", "2|{15,16,17,18}|{1.2,2.3,3.4,3.14159265358979,-9}", "3|{7}|{foo,bar}", "4|{9,10}|{1,1,0}")) DT = data.table(A=c("foo","ba|r","baz")) -test(1736.7, capture.output(fwrite(DT)), 
c("A","foo","ba|r","baz")) # no list column so no need to quote +test(1736.7, capture.output(fwrite(DT,na="")), c("A","foo","ba|r","baz")) # no list column so no need to quote +test(1736.8, capture.output(fwrite(DT)), c("\"A\"","\"foo\"","\"ba|r\"","\"baz\"")) # column name is quoted because na="NA" due to 1-column DT = data.table(A=c("foo","ba|r","baz"), B=list(1:3,1:4,c("fo|o","ba,r","baz"))) # now list column and need to quote -test(1736.8, capture.output(fwrite(DT)), c("A,B", "foo,1|2|3", "\"ba|r\",1|2|3|4", "baz,\"fo|o\"|\"ba,r\"|baz")) -test(1736.9, capture.output(fwrite(DT,quote=TRUE)), c("\"A\",\"B\"", "\"foo\",1|2|3", "\"ba|r\",1|2|3|4", "\"baz\",\"fo|o\"|\"ba,r\"|\"baz\"")) +test(1736.9, capture.output(fwrite(DT)), c("A,B", "foo,1|2|3", "\"ba|r\",1|2|3|4", "baz,\"fo|o\"|\"ba,r\"|baz")) +test(1736.11, capture.output(fwrite(DT,quote=TRUE)), c("\"A\",\"B\"", "\"foo\",1|2|3", "\"ba|r\",1|2|3|4", "\"baz\",\"fo|o\"|\"ba,r\"|\"baz\"")) # any list of same length vector input test(1737.1, fwrite(list()), NULL, warning="fwrite was passed an empty list of no columns") @@ -9918,8 +9919,8 @@ test(1737.5, fwrite(list(1.2,B=c("foo","bar"))), error="Column 2's length (2) is # fwrite ITime, Date, IDate DT = data.table(A=as.ITime(c("23:59:58","23:59:59","12:00:00","00:00:01",NA,"00:00:00"))) -test(1738.1, capture.output(fwrite(DT)), c("A","23:59:58","23:59:59","12:00:00","00:00:01","","00:00:00")) -test(1738.2, capture.output(fwrite(DT)), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE, na=""))) +test(1738.1, capture.output(fwrite(DT)), c("\"A\"","23:59:58","23:59:59","12:00:00","00:00:01","NA","00:00:00")) +test(1738.2, capture.output(fwrite(DT,na="")), capture.output(write.csv(DT,row.names=FALSE,quote=FALSE, na=""))) dts = c("1901-05-17","1907-10-22","1929-10-24","1962-05-28","1987-10-19","2008-09-15", "1968-12-30","1968-12-31","1969-01-01","1969-01-02") DT = data.table(A=as.Date(dts), B=as.IDate(dts)) @@ -10308,7 +10309,8 @@ if ("package:bit64" %in% search()) { 
# end Grouping Sets # for completeness, added test for NA problem to close #1837. Fixed long ago before release to CRAN. -test(1751, capture.output(fwrite(data.table(x=NA_integer_),verbose=FALSE)), c("x","")) +test(1751.1, capture.output(fwrite(data.table(x=NA_integer_),verbose=FALSE)), c("\"x\"","NA")) +test(1751.2, capture.output(fwrite(data.table(x=NA_integer_),na="",verbose=FALSE)), c("x","")) if ("package:nanotime" %in% search()) { DT = data.table(A=nanotime(tt<-c("2016-09-28T15:30:00.000000070Z", @@ -10967,6 +10969,16 @@ test(1837, fread('v1,v2,v3,v4,v5\n1,2,3,4,5', select=-1), error="out of range.*C test(1838, fread("default payment next month\n0.5524\n0.2483\n0.1157\n"), data.table("default payment next month"=c(0.5524,0.2483,0.1157))) #2322 +# better writing and reading of NA in single column input, #2106 +DT = data.table(a=c(4,NA,2,3.14,999,NA)) +fwrite(DT, f<-tempfile(), na="") # old default for na was always "" +test(1839.1, fread(f), data.table(a=4L), warning="text exists afterwards.*Consider fill.*blank.lines.skip.*<<2>>") +test(1839.2, fread(f, blank.lines.skip=TRUE), data.table(a=c(4,2,3.14,999))) +test(1839.3, fread(f, fill=TRUE), data.table(a=c(4,NA,2,3.14,999))) +test(1839.4, fread(f, fill=TRUE, blank.lines.skip=TRUE), data.table(a=c(4,2,3.14,999))) +fwrite(DT, f) # new default sets na="NA" when ncol==1 +test(1839.5, fread(f), DT) + ########################## diff --git a/man/fread.Rd b/man/fread.Rd index 2c5d5fb3b..5b90fc2af 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -24,7 +24,7 @@ nThread=getDTthreads(), logical01=TRUE } \arguments{ \item{input}{ Either the file name to read (containing no \\n character), a shell command that pre-processes the file (e.g. \code{fread("grep blah filename"))} or the input itself as a string (containing at least one \\n), see examples. In both cases, a length 1 character string. A filename input is passed through \code{\link[base]{path.expand}} for convenience and may be a URL starting http:// or file://. 
} - \item{sep}{ The separator between columns. Defaults to the character in the set \code{[,\\t |;:]} that separates the sample of rows into the most number of lines with the same number of fields. } + \item{sep}{ The separator between columns. Defaults to the character in the set \code{[,\\t |;:]} that separates the sample of rows into the most number of lines with the same number of fields. Use \code{NULL} or \code{""} to specify no separator; i.e. each line is read as a single character column, as \code{base::readLines} does.} \item{sep2}{ The separator \emph{within} columns. A \code{list} column will be returned where each cell is a vector of values. This is much faster using less working memory than \code{strsplit} afterwards or similar techniques. For each column \code{sep2} can be different and is the first character in the same set above [\code{,\\t |;}], other than \code{sep}, that exists inside each field outside quoted regions in the sample. NB: \code{sep2} is not yet implemented. } \item{nrows}{ The maximum number of rows to read. Unlike \code{read.table}, you do not need to set this to an estimate of the number of rows in the file for better speed because that is already automatically determined by \code{fread} almost instantly using the large sample of lines. `nrows=0` returns the column names and typed empty columns determined by the large sample; useful for a dry run of a large file or to quickly check format consistency of a set of files before starting to read any of them. } \item{header}{ Does the first data line contain column names? Defaults according to whether every non-empty field on the first data line is type character. If so, or TRUE is supplied, any empty column names are given a default name. } diff --git a/man/fwrite.Rd b/man/fwrite.Rd index 65da84e3a..229264cdf 100644 --- a/man/fwrite.Rd +++ b/man/fwrite.Rd @@ -10,7 +10,8 @@ This is new functionality as of Nov 2016. 
We may need to refine argument names a fwrite(x, file = "", append = FALSE, quote = "auto", sep = ",", sep2 = c("","|",""), eol = if (.Platform$OS.type=="windows") "\r\n" else "\n", - na = "", dec = ".", row.names = FALSE, col.names = TRUE, + na = if (length(x)>1L) "" else "NA", dec = ".", + row.names = FALSE, col.names = TRUE, qmethod = c("double","escape"), logical01 = getOption("datatable.logical01", TRUE), logicalAsInt = logical01, # deprecated diff --git a/src/fread.c b/src/fread.c index 2cfb58285..4696fb182 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1608,7 +1608,7 @@ int freadMain(freadMainArgs _args) { if (lastSampleJumpOk) { while (ch>", strlim(ch,200)); + DTWARN("Found the last consistent line but text exists afterwards. Consider fill=TRUE and/or blank.lines.skip=TRUE. First 200 characters of discarded line: <<%s>>", strlim(ch,200)); } else { // nextGoodLine() was false for the last (extra) jump to check the end // must set lastRowEnd to eof accordingly otherwise it'll be left wherever the last good jump finished