diff --git a/README.md b/README.md index 042cd5e34..45636a062 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,9 @@ * Missing `integer64` values are properly assigned `NA`s. Closes [#488](https://github.com/Rdatatable/data.table/issues/488). Thanks to @PeterStoyanov and @richierocks for the report. * Column headers with empty strings aren't skipped anymore. [Closes #483](https://github.com/Rdatatable/data.table/issues/483). Thanks to @RobyJoehanes and @kforner. * Detects separator correctly when commas also exist in text fields. Closes [#923](https://github.com/Rdatatable/data.table/issues/923). Thanks to @raymondben for the report. - * `NA` values in NA inflated file are read properly. [Closes #737](https://github.com/Rdatatable/data.table/issues/737). Thanks to Adam Kennedy. + * `NA` values in NA inflated file are read properly. [Closes #737](https://github.com/Rdatatable/data.table/issues/737). Thanks to Adam Kennedy. + * `fread` now correctly handles the `na.strings` argument for all types of columns - it detects possible `NA` values without coercion to character, like in base `read.table`. [fixes #504](https://github.com/Rdatatable/data.table/issues/504). Thanks to @dselivanov for the PR. + 6. Auto indexing: * `DT[colA == max(colA)]` now works again without needing `options(datatable.auto.index=FALSE)`. Thanks to Jan Gorecki and kaybenleroll, [#858](https://github.com/Rdatatable/data.table/issues/858). Test added. 
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 4953f040c..fb67481ad 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2438,8 +2438,9 @@ DT[4,e:=NaN] # write.table writes NaN as NA, though, and all.equal considers Na write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=FALSE) test(880, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE))) test(881, fread(f), DT) +# test that columns are not coerced if nastring=NULL DT[3,d:="NA"] -test(882, fread(f,na.strings=NULL), DT) +test(882, fread(f,na.strings=NULL)[['d']], DT[['d']]) DT[3,d:=NA_character_] unlink(f) write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=TRUE) @@ -6783,6 +6784,32 @@ DT <- data.table(a = rep(1:5,3*1e6), b = rep(letters[1:3],5*1e6)) test(1549, capture.output(print(DT)), c(" a b", " 1: 1 a", " 2: 2 b", " 3: 3 c", " 4: 4 a", " 5: 5 b", " --- ", "14999996: 1 b", "14999997: 2 c", "14999998: 3 a", "14999999: 4 b", "15000000: 5 c")) rm(DT) +# PR by @dselivanov +# fixes #504 - handle nastring while reading (without coercion to character) +# Note: this doesn't address cases like na.strings="-999" yet. See https://github.com/Rdatatable/data.table/pull/1236 for those examples. 
+K = 10L +nastrings = c('null', 'NULL', 'na', '_NA', 'NA', 'nan', 'Nan', 'NAN', 'NaN') +DT = data.table(int = 1:K, + char = sample(letters, size = K, replace = T), + float = 1:K + 0.1, + bool = sample( c(T, F), K, replace = T)) + +DT_NA = DT +for (j in seq_len( ncol(DT) )) { + set(x = DT_NA, i = j, j = j, value = NA) +} + +for(k in seq_along(nastrings)) { + dt0 = copy(DT) + for (j in seq_len( ncol(DT) )) { + set(x = dt0, i = NULL, j = j, value = as.character(dt0[[j]])) + set(x = dt0, i = j, j = j, value = nastrings[[k]]) + } + str = do.call(paste, c(dt0, collapse="\n", sep=",")) + str = paste(paste(names(dt0), collapse=","), str, sep="\n") + DT_fread = fread(str, na.strings = nastrings, verbose = FALSE) + test(1550 + k * 0.1, DT_fread, DT_NA) +} ########################## diff --git a/man/fread.Rd b/man/fread.Rd index bbb990694..9157f236b 100644 --- a/man/fread.Rd +++ b/man/fread.Rd @@ -25,7 +25,7 @@ data.table=getOption("datatable.fread.datatable") # default: TRUE \item{sep2}{ The separator \emph{within} columns. A \code{list} column will be returned where each cell is a vector of values. This is much faster using less working memory than \code{strsplit} afterwards or similar techniques. For each column \code{sep2} can be different and is the first character in the same set above [\code{,\\t |;:}], other than \code{sep}, that exists inside each field outside quoted regions on line \code{autostart}. NB: \code{sep2} is not yet implemented. } \item{nrows}{ The number of rows to read, by default -1 means all. Unlike \code{read.table}, it doesn't help speed to set this to the number of rows in the file (or an estimate), since the number of rows is automatically determined and is already fast. Only set \code{nrows} if you require the first 10 rows, for example. `nrows=0` is a special case that just returns the column names and types; e.g., a dry run for a large file or to quickly check format consistency of a set of files before starting to read any. 
} \item{header}{ Does the first data line contain column names? Defaults according to whether every non-empty field on the first data line is type character. If so, or TRUE is supplied, any empty column names are given a default name. } - \item{na.strings}{ A character vector of strings to convert to \code{NA_character_}. By default for columns read as type character \code{",,"} is read as a blank string (\code{""}) and \code{",NA,"} is read as \code{NA_character_}. Typical alternatives might be \code{na.strings=NULL} or perhaps \code{na.strings=c("NA","N/A","")}. } + \item{na.strings}{ A character vector of strings which are to be interpreted as \code{NA} values. By default \code{",,"} for columns read as type character is read as a blank string (\code{""}) and \code{",NA,"} is read as \code{NA}. Typical alternatives might be \code{na.strings=NULL} (no coercion to NA at all!) or perhaps \code{na.strings=c("NA","N/A","null")}. } \item{stringsAsFactors}{ Convert all character columns to factors? } \item{verbose}{ Be chatty and report timings? } \item{autostart}{ Any line number within the region of machine readable delimited text, by default 30. If the file is shorter or this line is empty (e.g. short files with trailing blank lines) then the last non empty line (with a non empty line above that) is used. This line and the lines above it are used to auto detect \code{sep}, \code{sep2} and the number of fields. It's extremely unlikely that \code{autostart} should ever need to be changed, we hope. } diff --git a/src/fread.c b/src/fread.c index 895873163..fa3aff5c9 100644 --- a/src/fread.c +++ b/src/fread.c @@ -102,6 +102,101 @@ void STOP(const char *format, ...) { closeFile(); // some errors point to data in the file, hence via msg buffer first error(msg); } +// ******************************************************************************************** +// NA handling. 
+// The algorithm is as follows: +// 1) Strto*() checks whether we can convert substring into given * type +// 2) If not, we try to iteratively char-by-char starting from the beginning of the substring +// look forward for maximum max(nchar(nastrings)) symbols: +// ******************************************************************************************** +// max_na_nchar = max(nchar(nastrings)) +// lch = pointer to beginning of substring we want to check +// for (i in 0:max_na_nchar) { +// ch = lch[i] +// if( any of nastrings contain ch at position i) +// continue +// else return FALSE +// } +// return TRUE +// ******************************************************************************************** +// for checking "if" condition we manage mask "na_mask" with length na_len = length(nastrings). +// 1 on mask position i means that nastring[i] is still a candidate for given substring. +// 0 means this substring can't be cast into nastring[i], so nastring[i] is not a candidate. +int *NA_MASK; +// non-zero makes can_cast_to_na() return 0 immediately (nastrings==NULL case) +int FLAG_NA_STRINGS_NULL; +const char **NA_STRINGS; +int NA_MAX_NCHAR; +int NASTRINGS_LEN; +int *EACH_NA_STRING_LEN; +// calculates maximum string length for a given R character vector; returns -1 when the vector is empty +int get_maxlen(SEXP char_vec) { + int maxlen = -1; + int cur_len; + for (int i=0; i< LENGTH(char_vec); i++) { + cur_len = strlen(CHAR(STRING_ELT(char_vec, i))); + maxlen = (cur_len > maxlen) ? cur_len : maxlen; + } + return maxlen; +} +// initialize mask. At the beginning we assume any nastring can be a candidate. +static inline void init_mask() { + for(int i = 0; i < NASTRINGS_LEN; i++) + NA_MASK[i] = 1; +} +static inline int can_cast_to_na(const char* lch) { + const char *lch2 = lch; + // nastrings==NULL => do nothing + if(FLAG_NA_STRINGS_NULL) { + return 0; + } + init_mask(); + // number of mask entries that are still candidates, i.e. could still match the field + int non_zero_left = NASTRINGS_LEN; + // starts at 1 so that an empty field (the loop below never runs) returns 1. NOTE(review): this holds even when the empty string is not among nastrings - confirm intended 
+ int na_found_flag = 1; + int pos = 0; + int j; + const char *nastring_iter; + // look for possible NA strings: + // max forward symbols = max length of the nastring template + while (pos < NA_MAX_NCHAR && lch2 != eof && *lch2 != sep && *lch2 != eol) { + j = 0; + na_found_flag = 0; + // check whether any of the nastrings templates contains the current (pos-th forward) symbol at the pos-th forward position + // iterate through nastrings + while( j < NASTRINGS_LEN && non_zero_left > 0 ) { + // not marked before + if( NA_MASK[j] != 0) { + nastring_iter = NA_STRINGS[j]; + // nastring[j] is still a candidate, matches here and ends exactly at this position + if(EACH_NA_STRING_LEN[j] == pos + 1 && nastring_iter[pos] == *lch2) { + na_found_flag = 1; + } + // nastring[j] candidate is shorter than the prefix scanned so far + // or doesn't contain the necessary symbol at the position we are checking + if(EACH_NA_STRING_LEN[j] < pos + 1 || nastring_iter[pos] != *lch2) { + NA_MASK[j] = 0; + non_zero_left--; + } + } + j++; + } + //all elements of mask == 0 means we can't convert tested substring to NA + if(non_zero_left == 0) { + return 0; + } + pos++; + lch2++; + } + // a delimiter directly follows a fully matched NA candidate, or the field is empty: set ch to the delimiter position + if(na_found_flag && (lch2 == eof || *lch2 == sep || *lch2 == eol)) { + ch = lch2; + return 1; + } + else return 0; +} +// ******************************************************************************************** static inline void Field(int err) { @@ -200,16 +295,16 @@ static inline Rboolean Strtoll() acc += *lch-'0'; // have assumed compiler will optimize the constant expression (LLONG_MAX-10)/10 lch++; // TO DO can remove lch bump type. } @@ -249,10 +344,10 @@ static inline Rboolean Strtod() return(TRUE); } } - if (lch==start && lch