Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fread improve containing header guess #5257

Merged
merged 5 commits into from
Nov 30, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,8 @@
# 3: 3 NA
# 4: 4 NA
```

32. `fread()` already made a good guess as to whether column names are present by comparing the type of the fields in row 1 to the type of the fields in the sample. This guess is now improved when a column contains a string in row 1 (i.e. a potential column name) but all blank in the sample rows, [#2526](https://github.com/Rdatatable/data.table/issues/2526). Thanks @st-pasha for reporting, and @ben-schwen for the PR.

## BUG FIXES

Expand Down
7 changes: 5 additions & 2 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -12498,9 +12498,12 @@ if (test_R.utils) {
}

# better colname detection by comparing potential column names to the whole sample not just the first row of the sample, #2526
test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(A=NA, "100"=c(300L,500L), "200"=c(400L,600L)))
test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(A=NA, "100"=c(NA,500L), V3=c(NA,600L)))
test(1870.1, fread("A,100,200\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,300L,500L), V3=c(200L,400L,600L)))
test(1870.2, fread("A,100,\n,,\n,500,600"), data.table(V1=c("A","",""), V2=c(100L,NA,500L), V3=c(NA,NA,600L)))
test(1870.3, fread("A,B,\n,,\n,500,3.4"), data.table(A=NA, B=c(NA,500L), V3=c(NA,3.4)))
test(1870.4, fread("A,B,200\n,300,400\n,500,600"), data.table(A=NA, B=c(300L,500L), "200"=c(400L,600L)))
test(1870.5, fread("A,B,\n,,\n,500,600"), data.table(A=NA, B=c(NA,500L), V3=c(NA,600L)))
test(1870.6, fread("A,,\n,300,400\n,500,600"), data.table(V1=c("A","",""), V2=c(NA,300L,500L), V3=c(NA,400L,600L)))

# nrows= now ignores errors after those nrows as expected and skip= determines first row for sure, #1267
txt = "V1, V2, V3\n2,3,4\nV4, V5, V6, V7\n4,5,6,7\n8,9,10,11\n"
Expand Down
18 changes: 12 additions & 6 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ static int8_t *type = NULL, *tmpType = NULL, *size = NULL;
static lenOff *colNames = NULL;
static freadMainArgs args = {0}; // global for use by DTPRINT; static implies ={0} but include the ={0} anyway just in case for valgrind #4639

const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"};
int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 };
const char typeName[NUMTYPE][10] = {"drop", "bool8", "bool8", "bool8", "bool8", "bool8", "int32", "int64", "float64", "float64", "float64", "int32", "float64", "string"};
int8_t typeSize[NUMTYPE] = { 0, 1, 1, 1, 1, 1, 4, 8, 8, 8, 8, 4, 8 , 8 };

// In AIX, NAN and INFINITY don't qualify as constant literals. Refer: PR #3043
// So we assign them through below init function.
Expand Down Expand Up @@ -1076,6 +1076,12 @@ static void parse_iso8601_timestamp(FieldParseContext *ctx)
*target = NA_FLOAT64;
}

static void parse_empty(FieldParseContext *ctx)
{
int8_t *target = (int8_t*) ctx->targets[sizeof(int8_t)];
*target = NA_BOOL8;
}

/* Parse numbers 0 | 1 as boolean and ,, as NA (fwrite's default) */
static void parse_bool_numeric(FieldParseContext *ctx)
{
Expand Down Expand Up @@ -1152,7 +1158,8 @@ static void parse_bool_lowercase(FieldParseContext *ctx)
*/
typedef void (*reader_fun_t)(FieldParseContext *ctx);
static reader_fun_t fun[NUMTYPE] = {
(reader_fun_t) &Field,
(reader_fun_t) &Field, // CT_DROP
(reader_fun_t) &parse_empty, // CT_EMPTY
(reader_fun_t) &parse_bool_numeric,
(reader_fun_t) &parse_bool_uppercase,
(reader_fun_t) &parse_bool_titlecase,
Expand All @@ -1167,7 +1174,7 @@ static reader_fun_t fun[NUMTYPE] = {
(reader_fun_t) &Field
};

static int disabled_parsers[NUMTYPE] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
static int disabled_parsers[NUMTYPE] = {0};

static int detect_types( const char **pch, int8_t type[], int ncol, bool *bumped) {
// used in sampling column types and whether column names are present
Expand Down Expand Up @@ -1883,8 +1890,7 @@ int freadMain(freadMainArgs _args) {
bool bumped=false;
detect_types(&ch, tmpType, ncol, &bumped);
if (sampleLines>0) for (int j=0; j<ncol; j++) {
if (tmpType[j]==CT_STRING && type[j]<CT_STRING) {
// includes an all-blank column with a string at the top; e.g. test 1870.1 and 1870.2
if (tmpType[j]==CT_STRING && type[j]<CT_STRING && type[j]>CT_EMPTY) {
args.header=true;
if (verbose) DTPRINT(_(" 'header' determined to be true due to column %d containing a string on row 1 and a lower type (%s) in the rest of the %d sample rows\n"),
j+1, typeName[type[j]], sampleLines);
Expand Down
3 changes: 2 additions & 1 deletion src/fread.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@
typedef enum {
NEG = -1, // dummy to force signed type; sign bit used for out-of-sample type bump management
CT_DROP = 0, // skip column requested by user; it is navigated as a string column with the prevailing quoteRule
CT_BOOL8_N, // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1.
CT_EMPTY, // int8_t; first enum value must be 1 not 0(=CT_DROP) so that it can be negated to -1. EMPTY to help column heading guess, #5257
CT_BOOL8_N, // int8_t
CT_BOOL8_U,
CT_BOOL8_T,
CT_BOOL8_L,
Expand Down
6 changes: 3 additions & 3 deletions src/freadR.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ Secondary separator for list() columns, such as columns 11 and 12 in BED (no nee

#define NUT NUMTYPE+2 // +1 for "numeric" alias for "double"; +1 for CLASS fallback using as.class() at R level afterwards

static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP };
static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" };
static int typeEnum[NUT] = {CT_DROP, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING};
static int typeSxp[NUT] = {NILSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, LGLSXP, INTSXP, REALSXP, REALSXP, REALSXP, REALSXP, INTSXP, REALSXP, STRSXP, REALSXP, STRSXP };
static char typeRName[NUT][10]={"NULL", "logical", "logical", "logical", "logical", "logical", "integer", "integer64", "double", "double", "double", "IDate", "POSIXct", "character", "numeric", "CLASS" };
static int typeEnum[NUT] = {CT_DROP, CT_EMPTY, CT_BOOL8_N, CT_BOOL8_U, CT_BOOL8_T, CT_BOOL8_L, CT_INT32, CT_INT64, CT_FLOAT64, CT_FLOAT64_HEX, CT_FLOAT64_EXT, CT_ISO8601_DATE, CT_ISO8601_TIME, CT_STRING, CT_FLOAT64, CT_STRING};
static colType readInt64As=CT_INT64;
static SEXP selectSxp;
static SEXP dropSxp;
Expand Down