Skip to content

Commit

Permalink
Closes #530. fread gains 'blank.lines.skip' argument.
Browse files Browse the repository at this point in the history
  • Loading branch information
arunsrinivasan committed Oct 29, 2015
1 parent d3a06b7 commit 9b14ec7
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 6 deletions.
7 changes: 5 additions & 2 deletions R/fread.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.strings="NA",stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),autostart=1L,skip=0L,select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"),dec=if (sep!=".") "." else ",", col.names, check.names=FALSE, encoding="unknown", quote="\"", strip.white=TRUE, showProgress=getOption("datatable.showProgress"),data.table=getOption("datatable.fread.datatable")) {
fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.strings="NA",stringsAsFactors=FALSE,verbose=getOption("datatable.verbose"),autostart=1L,skip=0L,select=NULL,drop=NULL,colClasses=NULL,integer64=getOption("datatable.integer64"),dec=if (sep!=".") "." else ",", col.names, check.names=FALSE, encoding="unknown", quote="\"", strip.white=TRUE, blank.lines.skip=FALSE, showProgress=getOption("datatable.showProgress"),data.table=getOption("datatable.fread.datatable")) {
if (!is.character(dec) || length(dec)!=1L || nchar(dec)!=1) stop("dec must be a single character e.g. '.' or ','")
# handle encoding, #563
if (!encoding %in% c("unknown", "UTF-8", "Latin-1")) {
Expand All @@ -8,6 +8,9 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
if (!strip.white %in% c(TRUE, FALSE)) {
stop("Argument 'strip.white' must be logical TRUE/FALSE")
}
if (!blank.lines.skip %in% c(TRUE, FALSE)) {
stop("Argument 'blank.lines.skip' must be logical TRUE/FALSE")
}
if (getOption("datatable.fread.dec.experiment") && Sys.localeconv()["decimal_point"] != dec) {
oldlocale = Sys.getlocale("LC_NUMERIC")
if (verbose) cat("dec='",dec,"' but current locale ('",oldlocale,"') has dec='",Sys.localeconv()["decimal_point"],"'. Attempting to change locale to one that has the desired decimal point.\n",sep="")
Expand Down Expand Up @@ -82,7 +85,7 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
if (identical(header,"auto")) header=NA
if (identical(sep,"auto")) sep=NULL
if (is.atomic(colClasses) && !is.null(names(colClasses))) colClasses = tapply(names(colClasses),colClasses,c,simplify=FALSE)
ans = .Call(Creadfile,input,sep,as.integer(nrows),header,na.strings,verbose,as.integer(autostart),skip,select,drop,colClasses,integer64,dec,encoding,quote,strip.white,as.integer(showProgress))
ans = .Call(Creadfile,input,sep,as.integer(nrows),header,na.strings,verbose,as.integer(autostart),skip,select,drop,colClasses,integer64,dec,encoding,quote,strip.white,blank.lines.skip,as.integer(showProgress))
nr = length(ans[[1]])
if ( integer64=="integer64" && !exists("print.integer64") && any(sapply(ans,inherits,"integer64")) )
warning("Some columns have been read as type 'integer64' but package bit64 isn't loaded. Those columns will display as strange looking floating point data. There is no need to reload the data. Just require(bit64) to obtain the integer64 print method and print the data again.")
Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@

  7. `on=` joins now provide friendlier error messages when columns aren't found, [#1376](https://github.com/Rdatatable/data.table/issues/1376).

  8. `fread()` gains a `blank.lines.skip` argument. When `TRUE`, empty lines are skipped and reading continues instead of stopping at the first blank line. Default is `FALSE` for backwards compatibility, [#530](https://github.com/Rdatatable/data.table/issues/530). Thanks to @DirkJonker.

#### BUG FIXES

1. Now compiles and runs on IBM AIX gcc. Thanks to Vinh Nguyen for investigation and testing, [#1351](https://github.com/Rdatatable/data.table/issues/1351).
Expand Down
51 changes: 51 additions & 0 deletions inst/tests/530_fread.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
a,b,c,d
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2
1,2,3
a,b

1,3
2,4
12 changes: 12 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -7090,6 +7090,18 @@ X = fread("a|b|c|d
this|NA|row|4", colClasses="character", stringsAsFactors = TRUE)
test(1577.3, levels(X$b), character(0))

# FR #530, skip blank lines
# Header line "a,b", then one blank line, then two data rows.
input = "a,b\n\n1,3\n2,4"
# Default (blank.lines.skip=FALSE): expected columns are V1/V2, i.e. "a,b" is not used as the column names.
test(1578.1, fread(input), data.table(V1=1:2, V2=3:4))
# With blank.lines.skip=TRUE the expected result takes its column names from "a,b".
test(1578.2, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))
# Two consecutive blank lines after the header.
input = "a,b\n\n\n1,3\n2,4"
test(1578.3, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))
# Blank lines after the header, between the data rows, and a trailing blank line.
input = "a,b\n\n\n1,3\n\n2,4\n\n"
test(1578.4, fread(input, blank.lines.skip=TRUE), data.table( a=1:2, b=3:4))

# Same two expectations as 1578.1/1578.2, read from the fixture file.
# skip=47L is expected to start reading at the fixture's "a,b" line (line 48 of 51), which is followed by a blank line.
test(1578.5, fread("530_fread.txt", skip=47L), data.table(V1=1:2, V2=3:4))
test(1578.6, fread("530_fread.txt", skip=47L, blank.lines.skip=TRUE), data.table(a=1:2, b=3:4))

##########################


Expand Down
13 changes: 9 additions & 4 deletions src/fread.c
Original file line number Diff line number Diff line change
Expand Up @@ -528,14 +528,14 @@ static SEXP coerceVectorSoFar(SEXP v, int oldtype, int newtype, R_len_t sofar, R
return(newv);
}

SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastrings, SEXP verbosearg, SEXP autostart, SEXP skip, SEXP select, SEXP drop, SEXP colClasses, SEXP integer64, SEXP dec, SEXP encoding, SEXP quoteArg, SEXP stripWhiteArg, SEXP showProgressArg)
SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastrings, SEXP verbosearg, SEXP autostart, SEXP skip, SEXP select, SEXP drop, SEXP colClasses, SEXP integer64, SEXP dec, SEXP encoding, SEXP quoteArg, SEXP stripWhiteArg, SEXP skipEmptyLinesArg, SEXP showProgressArg)
// can't be named fread here because that's already a C function (from which the R level fread function took its name)
{
SEXP thiscol, ans, thisstr;
R_len_t i, resi, j, resj, k, protecti=0, nrow=0, ncol=0;
int thistype;
const char *pos, *ch2, *lineStart;
Rboolean header, allchar;
Rboolean header, allchar, skipEmptyLines;
verbose=LOGICAL(verbosearg)[0];
clock_t t0 = clock();
ERANGEwarning = FALSE; // just while detecting types, then TRUE before the read data loop
Expand All @@ -549,6 +549,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
else ienc = CE_NATIVE;

stripWhite = LOGICAL(stripWhiteArg)[0];
skipEmptyLines = LOGICAL(skipEmptyLinesArg)[0];

// quoteArg for those rare cases when default scenario doesn't cut it.., FR #568
if (!isString(quoteArg) || LENGTH(quoteArg)!=1 || strlen(CHAR(STRING_ELT(quoteArg,0))) > 1)
Expand Down Expand Up @@ -809,6 +810,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
i=0;
int thisLine=line, thisLen=0, thisNcol=-1; // this* = this run's starting *
while(ch<=eof && ++i<=30) {
if (*ch == eol && skipEmptyLines) {ch++; continue;}
lineStart = ch;
ncol = countfields();
if (ncol==-1) {
Expand Down Expand Up @@ -931,7 +933,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
// Count number of rows
// ********************************************************************************************
i = INTEGER(nrowsarg)[0];
if (pos==eof || *pos==eol) {
if (pos==eof || (*pos==eol && !skipEmptyLines)) {
nrow=0;
if (verbose) Rprintf("Byte after header row is eof or eol, 0 data rows present.\n");
} else if (i>-1) {
Expand All @@ -953,6 +955,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
nblank += (i==0);
ch -= eolLen-1;
}
// TODO: add in logic here for a 'fill=' argument to not skip / leave at blank line, rather to fill with NAs
// if (nblank==0) There is non white after the last eol. Ok and dealt with. TO DO: reference test id here in comment
if (ncol==1) tmp = neol-nblank;
else tmp = MIN( nsep / (ncol-1), neol-nblank ); // good quick estimate with embedded sep and eol in mind
Expand Down Expand Up @@ -1200,9 +1203,10 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
}
R_CheckUserInterrupt();
batchend = MIN(i+10000, nrow); // batched into 10k rows to save (expensive) calls to clock()
for (; i<batchend && ch<eof; i++) {
while(i<batchend && ch<eof) {
//Rprintf("Row %d : %.10s\n", i+1, ch);
if (*ch==eol) {
if (skipEmptyLines) { ch++; continue; }
// blank line causes early stop, unless skipEmptyLines is set (handled just above)
whileBreak = TRUE; // break the enclosing while too, without changing i
break; // break this for
Expand Down Expand Up @@ -1258,6 +1262,7 @@ SEXP readfile(SEXP input, SEXP separg, SEXP nrowsarg, SEXP headerarg, SEXP nastr
ch+=eolLen; // now that we error here, the if-statement isn't needed -> // if (ch<eof && *ch==eol) ch+=eolLen;
pos = ch; // start of line position only needed to include the whole line in any error message
line++;
i++;
}
if (whileBreak) break;
}
Expand Down

0 comments on commit 9b14ec7

Please sign in to comment.