diff --git a/NEWS.md b/NEWS.md index 3abcdaed4..f5fe08f92 100644 --- a/NEWS.md +++ b/NEWS.md @@ -251,6 +251,8 @@ # 3: 3 NA # 4: 4 NA ``` + +32. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR. ## BUG FIXES diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 0b77d98fa..8982342bc 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -12498,9 +12498,12 @@ if (test_R.utils) { } # better colname detection by comparing potential column names to the whole sample not just the first row of the sample, #2526 -test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(A=NA, "100"=c(300L,500L), "200"=c(400L,600L))) -test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(A=NA, "100"=c(NA,500L), V3=c(NA,600L))) +test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,300L,500L), V3=c(200L,400L,600L))) +test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,NA,500L), V3=c(NA,NA,600L))) test(1870.3, fread("A,B,\n,,\n,500,3.4"), data.table(A=NA, B=c(NA,500L), V3=c(NA,3.4))) +test(1870.4, fread("A,B,200\n,300,400\n,500,600"), data.table(A=NA, B=c(300L,500L), "200"=c(400L,600L))) +test(1870.5, fread("A,B,\n,,\n,500,600"), data.table(A=NA, B=c(NA,500L), V3=c(NA,600L))) +test(1870.6, fread("A,,\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(NA,300L,500L), V3=c(NA,400L,600L))) # nrows= now ignores errors after those nrows as expected and skip= determines first row for sure, #1267 txt = "V1, V2, V3\n2,3,4\nV4, V5, V6, V7\n4,5,6,7\n8,9,10,11\n" diff --git a/src/fread.c b/src/fread.c index 2aa796037..04df88d9c 100644 --- a/src/fread.c +++ b/src/fread.c @@ -66,8 +66,8 @@ static int8_t *type = NULL, *tmpType = NULL, *size = NULL; static lenOff *colNames = NULL; static freadMainArgs args = {0}; // global for use by DTPRINT; static implies ={0} but include the ={0} anyway just in case for valgrind #4639 -const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; -int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; +const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"}; +int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 }; // In AIX, NAN and INFINITY don't qualify as constant literals. Refer: PR #3043 // So we assign them through below init function. @@ -1076,6 +1076,12 @@ static void parse_iso8601_timestamp(FieldParseContext *ctx) *target = NA_FLOAT64; } +static void parse_empty(FieldParseContext *ctx) +{ + int8_t *target = (int8_t*) ctx->targets[sizeof(int8_t)]; + *target = NA_BOOL8; +} + /* Parse numbers 0 | 1 as boolean and ,, as NA (fwrite's default) */ static void parse_bool_numeric(FieldParseContext *ctx) { @@ -1152,7 +1158,8 @@ static void parse_bool_lowercase(FieldParseContext *ctx) */ typedef void (*reader_fun_t)(FieldParseContext *ctx); static reader_fun_t fun[NUMTYPE] = { - (reader_fun_t) &Field, + (reader_fun_t) &Field, // CT_DROP + (reader_fun_t) &parse_empty, // CT_EMPTY (reader_fun_t) &parse_bool_numeric, (reader_fun_t) &parse_bool_uppercase, (reader_fun_t) &parse_bool_titlecase, @@ -1167,7 +1174,7 @@ static reader_fun_t fun[NUMTYPE] = { (reader_fun_t) &Field }; -static int disabled_parsers[NUMTYPE] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +static int disabled_parsers[NUMTYPE] = {0}; static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped) { // used in sampling column types and whether column names are present @@ -1883,8 +1890,7 @@ int freadMain(freadMainArgs _args) { bool bumped=false; detect_types(&ch, tmpType, ncol, &bumped); if (sampleLines>0) for (int j=0; jCT_EMPTY) { args.header=true; if (verbose) DTPRINT(_(" 'header' determined to be true due to column %d containing a string on row 1 and a lower type (%s) in the rest of the %d sample rows\n"), j+1, typeName[type[j]], sampleLines); diff --git a/src/fread.h b/src/fread.h index 446da18e4..7035615a5 100644 --- a/src/fread.h +++ b/src/fread.h @@ -20,7 +20,8 @@ typedef enum { NEG = -1, // dummy to force signed type; sign bit used for out-of-sample type bump management CT_DROP = 0, // skip column requested by user; it is navigated as a string column with the prevailing quoteRule - CT_BOOL8_N, // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1. + CT_EMPTY, // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1. EMPTY to help column heading guess, #5257 + CT_BOOL8_N, // int8_t CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, diff --git a/src/freadR.c b/src/freadR.c index 97fe691aa..82992aba3 100644 --- a/src/freadR.c +++ b/src/freadR.c @@ -24,9 +24,9 @@ Secondary separator for list() columns, such as columns 11 and 12 in BED (no nee #define NUT NUMTYPE+2 // +1 for "numeric" alias for "double"; +1 for CLASS fallback using as.class() at R level afterwards -static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP }; -static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" }; -static int typeEnum[NUT] = {CT_DROP, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING}; +static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP }; +static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" }; +static int typeEnum[NUT] = {CT_DROP, CT_EMPTY, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING}; static colType readInt64As=CT_INT64; static SEXP selectSxp; static SEXP dropSxp;