Merge branch 'dselivanov-fread_improvements'

Rdatatable · Sep 7, 2015 · 75d32da · 75d32da
2 parents 7536824 + fda1c71
commit 75d32da
Show file tree

Hide file tree

Showing 4 changed files with 165 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -99,7 +99,9 @@
       * Missing `integer64` values are properly assigned `NA`s. Closes [#488](https://github.com/Rdatatable/data.table/issues/488). Thanks to @PeterStoyanov and @richierocks for the report.
       * Column headers with empty strings aren't skipped anymore. [Closes #483](https://github.com/Rdatatable/data.table/issues/483). Thanks to @RobyJoehanes and @kforner.
       * Detects separator correctly when commas also exist in text fields. Closes [#923](https://github.com/Rdatatable/data.table/issues/923). Thanks to @raymondben for the report.
-      * `NA` values in NA inflated file are read properly. [Closes #737](https://github.com/Rdatatable/data.table/issues/737). Thanks to Adam Kennedy.
+      * `NA` values in NA inflated file are read properly. [Closes #737](https://github.com/Rdatatable/data.table/issues/737). Thanks to Adam Kennedy.  
+      *  `fread` now correctly handles `na.strings` argument for all types of columns - it detect possible `NA` values without coercion to character, like in base `read.table`. [fixes #504](https://github.com/Rdatatable/data.table/issues/504). Thanks to @dselivanov for the PR.
+
 
   6. Auto indexing:
       * `DT[colA == max(colA)]` now works again without needing `options(datatable.auto.index=FALSE)`. Thanks to Jan Gorecki and kaybenleroll, [#858](https://github.com/Rdatatable/data.table/issues/858). Test added.

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -2438,8 +2438,9 @@ DT[4,e:=NaN]  # write.table writes NaN as NA, though, and all.equal considers Na
 write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=FALSE)
 test(880, fread(f), as.data.table(read.csv(f,stringsAsFactors=FALSE)))
 test(881, fread(f), DT)
+# test that columns are not coerced if nastring=NULL
 DT[3,d:="NA"]
-test(882, fread(f,na.strings=NULL), DT)
+test(882, fread(f,na.strings=NULL)[['d']], DT[['d']])
 DT[3,d:=NA_character_]
 unlink(f)
 write.table(DT,f<-tempfile(),sep=",",row.names=FALSE,quote=TRUE)
@@ -6783,6 +6784,32 @@ DT <- data.table(a = rep(1:5,3*1e6), b = rep(letters[1:3],5*1e6))
 test(1549, capture.output(print(DT)), c("          a b", "       1: 1 a", "       2: 2 b", "       3: 3 c", "       4: 4 a", "       5: 5 b", "      ---    ", "14999996: 1 b", "14999997: 2 c", "14999998: 3 a", "14999999: 4 b", "15000000: 5 c"))
 rm(DT)
 
+# PR by @dselivanov
+# fixes #504 - handle nastring while reading (without coercion to character) 
+# Note: this doesn't address cases like na.strings="-999" yet. See https://github.com/Rdatatable/data.table/pull/1236 for those examples.
+K = 10L
+nastrings = c('null', 'NULL', 'na', '_NA', 'NA', 'nan', 'Nan', 'NAN', 'NaN')
+DT = data.table(int = 1:K, 
+                 char = sample(letters, size = K, replace = T), 
+                 float = 1:K + 0.1, 
+                 bool = sample( c(T, F), K, replace = T))
+
+DT_NA = DT
+for (j in seq_len( ncol(DT) )) {
+  set(x = DT_NA, i = j, j = j, value = NA)
+}
+
+for(k in seq_along(nastrings)) {
+  dt0 = copy(DT)
+  for (j in seq_len( ncol(DT) )) {
+    set(x = dt0, i = NULL, j = j, value = as.character(dt0[[j]]))
+    set(x = dt0, i = j, j = j, value = nastrings[[k]])
+  }
+  str = do.call(paste, c(dt0, collapse="\n", sep=","))
+  str = paste(paste(names(dt0), collapse=","), str, sep="\n")
+  DT_fread = fread(str, na.strings = nastrings, verbose = FALSE)
+  test(1550 + k * 0.1, DT_fread, DT_NA)
+}
 
 ##########################
 

diff --git a/man/fread.Rd b/man/fread.Rd
@@ -25,7 +25,7 @@ data.table=getOption("datatable.fread.datatable")   # default: TRUE
   \item{sep2}{ The separator \emph{within} columns. A \code{list} column will be returned where each cell is a vector of values. This is much faster using less working memory than \code{strsplit} afterwards or similar techniques. For each column \code{sep2} can be different and is the first character in the same set above [\code{,\\t |;:}], other than \code{sep}, that exists inside each field outside quoted regions on line \code{autostart}. NB: \code{sep2} is not yet implemented. }
   \item{nrows}{ The number of rows to read, by default -1 means all. Unlike \code{read.table}, it doesn't help speed to set this to the number of rows in the file (or an estimate), since the number of rows is automatically determined and is already fast. Only set \code{nrows} if you require the first 10 rows, for example. `nrows=0` is a special case that just returns the column names and types; e.g., a dry run for a large file or to quickly check format consistency of a set of files before starting to read any. }
   \item{header}{ Does the first data line contain column names? Defaults according to whether every non-empty field on the first data line is type character. If so, or TRUE is supplied, any empty column names are given a default name. }
-  \item{na.strings}{ A character vector of strings to convert to \code{NA_character_}. By default for columns read as type character \code{",,"} is read as a blank string (\code{""}) and \code{",NA,"} is read as \code{NA_character_}. Typical alternatives might be \code{na.strings=NULL} or perhaps \code{na.strings=c("NA","N/A","")}. }
+  \item{na.strings}{ A character vector of strings which are to be interpreted as \code{NA} values. By default \code{",,"} for columns read as type character is read as a blank string (\code{""}) and \code{",NA,"} is read as \code{NA}. Typical alternatives might be \code{na.strings=NULL} (no coercion to NA at all!) or perhaps \code{na.strings=c("NA","N/A","null")}. }
   \item{stringsAsFactors}{ Convert all character columns to factors? }
   \item{verbose}{ Be chatty and report timings? }
   \item{autostart}{ Any line number within the region of machine readable delimited text, by default 30. If the file is shorter or this line is empty (e.g. short files with trailing blank lines) then the last non empty line (with a non empty line above that) is used. This line and the lines above it are used to auto detect \code{sep}, \code{sep2} and the number of fields. It's extremely unlikely that \code{autostart} should ever need to be changed, we hope. }

diff --git a/src/fread.c b/src/fread.c
@@ -102,6 +102,101 @@ void STOP(const char *format, ...) {
     closeFile();  // some errors point to data in the file, hence via msg buffer first
     error(msg);
 }
+// ********************************************************************************************
+// NA handling.
+// algorithm is following
+// 1) Strto*() checks whether we can convert substring into given * type
+// 2) If not, we try to iteratively char-by-char starting from begining of the substring 
+// look forward for maximum max(nchar(nastrings)) symbols:
+// ********************************************************************************************
+// max_na_nchar = max(nchar(nastrings))
+// lch = pointer to begining of substring we want to check
+// for (i in 0:max_na_nchar) {
+//   ch = lch[i]
+//   if( any of nastrings contain ch at position i)
+//      continue
+//   else return FALSE
+// }
+// return TRUE
+// ********************************************************************************************
+// for checking "if" condition we manage mask "na_mask" with length na_len = length(nastrings). 
+// 1 on mask position i means that nastring[i] is still candidate for given substring.
+// 0 means this substring can't be casted into nastring[i], so nastring[i] is not candidate.
+int *NA_MASK;
+// means nastrings == 0; will do nothing with nastrings
+int FLAG_NA_STRINGS_NULL;
+const char **NA_STRINGS;
+int NA_MAX_NCHAR;
+int NASTRINGS_LEN;
+int *EACH_NA_STRING_LEN;
+// calculates maximum string length for a given R character vector
+int get_maxlen(SEXP char_vec) {
+  int maxlen = -1;
+  int cur_len;
+  for (int i=0; i< LENGTH(char_vec); i++) {
+    cur_len = strlen(CHAR(STRING_ELT(char_vec, i)));
+    maxlen = (cur_len > maxlen) ? cur_len : maxlen;
+  }
+  return maxlen;
+}
+// initialize mask. At the begining we assume any nastring can be candidate.
+static inline void init_mask() {
+  for(int i = 0; i < NASTRINGS_LEN; i++)
+    NA_MASK[i] = 1;
+}
+static inline int can_cast_to_na(const char* lch) {
+  const char *lch2 = lch;
+  // nastrings==NULL => do nothing
+  if(FLAG_NA_STRINGS_NULL) {
+    return 0;
+  }
+  init_mask();
+  // check whether mask contains any candidates which still potentially can be casted to NA
+  int non_zero_left = NASTRINGS_LEN;
+  //case when lch is empty string!
+  int na_found_flag = 1;
+  int pos = 0;
+  int j;
+  const char *nastring_iter;
+  // look for possible NA strings:
+  // max forward symbols = max length of the nastring template
+  while (pos < NA_MAX_NCHAR && lch2 != eof && *lch2 != sep && *lch2 != eol) {
+    j = 0;
+    na_found_flag = 0;
+    // check whether any of the nastrings template contains current (i-th forward) symbol at i-th forward positions
+    // iterate through nastrings
+    while( j < NASTRINGS_LEN && non_zero_left > 0 ) {
+      // not marked before
+      if( NA_MASK[j] != 0) {
+        nastring_iter = NA_STRINGS[j];
+        // nastring[j] candidate founded
+        if(EACH_NA_STRING_LEN[j] == pos + 1 && nastring_iter[pos] == *lch2) {
+          na_found_flag = 1;
+        }
+        // nastring[j] candidate has smaller length than we are looking
+        // or doesn't contain necessary symbol at position we are cheking
+        if(EACH_NA_STRING_LEN[j] < pos + 1 || nastring_iter[pos] != *lch2) {
+          NA_MASK[j] = 0;
+          non_zero_left--;
+        }
+      }
+      j++;
+    }
+    //all elements of mask == 0 means we can't convert tested substring to NA
+    if(non_zero_left == 0) {
+      return 0;
+    }
+    pos++;
+    lch2++;
+  }
+  // found delimiter after right NA candidate or empty string
+  if(na_found_flag && (lch2 == eof || *lch2 == sep || *lch2 == eol)) {
+    ch = lch2;
+    return 1;
+  }
+  else return 0;
+}
+// ********************************************************************************************
 
 static inline void Field(int err)
 {
@@ -200,16 +295,16 @@ static inline Rboolean Strtoll()
         acc += *lch-'0';      // have assumed compiler will optimize the constant expression (LLONG_MAX-10)/10
         lch++;                // TO DO can remove lch<eof when last row is specialized in case of no final eol
     }
-    int len = lch-start;
+    //int len = lch-start;
     //Rprintf("Strtoll field '%.*s' has len %d\n", lch-ch+1, ch, len);
     if (lch==eof || *lch==sep || *lch==eol) {   // only if field fully consumed (e.g. not ,123456A,)
         ch = lch;
         u.l = sign ? acc : -acc;
         return(TRUE);     // either int or int64 read ok, result in u.l.  INT_MIN and INT_MAX checked by caller, as appropriate
     }
-    if (len==0 && lch<eof-1 && *lch++=='N' && *lch++=='A' && (lch==eof || *lch==sep || *lch==eol)) {   // ',NA,'
-        ch = lch;       // advance over NA
-        return(TRUE);   // NA_INTEGER or NA_REAL (for SXP_INT64) was already set by caller
+    if(lch == start && can_cast_to_na(lch)) {
+      // ch pointer already set to end of nastring by can_cast_to_na() function
+      return(TRUE);
     }
     return(FALSE);  // invalid integer such as "3.14", "123ABC," or "12345678901234567890" (larger than even int64) => bump type.
 }
@@ -249,10 +344,10 @@ static inline Rboolean Strtod()
             return(TRUE);
         }
     }
-    if (lch==start && lch<eof-1 && *lch++=='N' && *lch++=='A' && (lch==eof || *lch==sep || *lch==eol)) {
-        ch = lch;
-        u.d = NA_REAL;
-        return(TRUE);
+    if(lch == start && can_cast_to_na(lch)) {
+      u.d = NA_REAL;
+      // ch pointer already set to end of nastring by can_cast_to_na() function
+      return(TRUE);
     }
     return(FALSE);     // invalid double, need to bump type.
 }
@@ -275,10 +370,10 @@ static inline Rboolean Strtob()
         ch = start+1;
         if (*ch=='a' && *++ch=='l' && *++ch=='s' && *++ch=='e' && (++ch==eof || *ch==sep || *ch==eol)) return(TRUE);
     }
-    else if (ch==eof || *ch==sep || *ch==eol ||
-             (ch<eof-1 && *ch=='N' && *++ch=='A' && (++ch==eof || *ch==sep || *ch==eol))) {
-        u.b = NA_LOGICAL;
-        return(TRUE);
+    else if(can_cast_to_na(ch)) {
+      u.b = NA_LOGICAL;
+      // ch pointer already set to end of nastring by can_cast_to_na() function
+      return(TRUE);
     }
     ch = start;
     return(FALSE);     // invalid boolean, need to bump type.
@@ -474,7 +569,23 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
         strcmp(CHAR(STRING_ELT(integer64,0)), "character")!=0)
         error("integer64='%s' which isn't 'integer64'|'double'|'numeric'|'character'", CHAR(STRING_ELT(integer64,0)));
     if (!isNull(select) && !isNull(drop)) error("Supply either 'select' or 'drop' but not both");
-
+    // ********************************************************************************************
+    //   NA handling preparations
+    // ********************************************************************************************
+    if( ! isNull(nastrings)) {
+      FLAG_NA_STRINGS_NULL = 0;
+      NASTRINGS_LEN = LENGTH(nastrings);
+      NA_MASK = (int *)malloc(NASTRINGS_LEN * sizeof(int));
+      NA_MAX_NCHAR = get_maxlen(nastrings);
+      NA_STRINGS = malloc(NASTRINGS_LEN * sizeof(char *));
+      EACH_NA_STRING_LEN = malloc(NASTRINGS_LEN * sizeof(int));
+      for (int i = 0; i < NASTRINGS_LEN; i++) {
+        NA_STRINGS[i] = CHAR(STRING_ELT(nastrings, i));
+        EACH_NA_STRING_LEN[i] = strlen(CHAR(STRING_ELT(nastrings, i)));
+      }
+    } else {
+      FLAG_NA_STRINGS_NULL = 1;
+    }
     // ********************************************************************************************
     //   Point to text input, or open and mmap file
     // ********************************************************************************************
@@ -743,7 +854,8 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
         } else {                              // if field reads as double ok then it's INT/INT64/REAL; i.e., not character (and so not a column name)
             if (*ch!=sep && *ch!=eol && Strtod())  // blank column names (,,) considered character and will get default names
                 allchar=FALSE;                     // considered testing at least one isalpha, but we want 1E9 to be a value not a column name
-            else while(ch<eof && *ch!=eol && *ch!=sep) ch++;  // skip over unquoted character field
+            else 
+                while(ch<eof && *ch!=eol && *ch!=sep) ch++;  // skip over unquoted character field
         }
         if (i<ncol-1) {   // not the last column (doesn't have a separator after it)
             if (ch<eof && *ch!=sep) STOP("Unexpected character ending field %d of line %d: %.*s", i+1, line, ch-pos+5, pos);
@@ -1133,9 +1245,14 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
         if (verbose) Rprintf("Read %d rows. Exactly what was estimated and allocated up front\n", i);
     }
     for (j=0; j<ncol-numNULL; j++) SETLENGTH(VECTOR_ELT(ans,j), nrow);
-
+    // release memory from NA handling operations
+    if(!FLAG_NA_STRINGS_NULL) {
+      free(NA_MASK);
+      free(NA_STRINGS);
+      free(EACH_NA_STRING_LEN);
+    }
     // ********************************************************************************************
-    //   Convert na.strings to NA
+    //   Convert na.strings to NA for character columns
     // ********************************************************************************************
     for (k=0; k<length(nastrings); k++) {
         thisstr = STRING_ELT(nastrings,k);