Skip to content

Commit

Permalink
Closes #1446. data.table() gains stringsAsFactors argument.
Browse files Browse the repository at this point in the history
  • Loading branch information
arunsrinivasan committed Feb 9, 2016
1 parent 5de510a commit 3dbc493
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 16 deletions.
10 changes: 9 additions & 1 deletion R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ null.data.table <-function() {
alloc.col(ans)
}

data.table <-function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
data.table <-function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)
{
# NOTE: It may be faster in some circumstances to create a data.table by creating a list l first, and then setattr(l,"class",c("data.table","data.frame")) at the expense of checking.
# TO DO: rewrite data.table(), one of the oldest functions here. Many people use data.table() to convert data.frame rather than
Expand Down Expand Up @@ -346,6 +346,8 @@ data.table <-function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
&& !any(duplicated(names(value)[names(value) %in% ckey])))
setattr(value, "sorted", ckey)
}
# FR #1446, setfactor is an internal function in fread.R
if (isTRUE(stringsAsFactors)) setfactor(value, which(vapply(value, is.character, TRUE)), FALSE)
alloc.col(value) # returns a NAMED==0 object, unlike data.frame()
}

Expand Down Expand Up @@ -2520,6 +2522,12 @@ gsd <- function(x, na.rm=FALSE) .Call(Cgsd, x, na.rm)
# Thin R-level wrappers: each one hands its arguments, unchanged and in order,
# to the native routine referenced by the corresponding C* object via .Call().
gstart <- function(o, f, l, rows) {
  .Call(Cgstart, o, f, l, rows)
}
gend <- function() {
  .Call(Cgend)
}

# Row-wise summary helpers. Each wrapper forwards `x` and `na.rm` unchanged to a
# native routine that is looked up by its character name.
# NOTE(review): neighbouring wrappers pass a symbol object (e.g. Cgsd) to .Call()
# whereas these pass a string -- confirm the routines are registered under these names.
rowmeans <- function(x, na.rm=FALSE) {
  .Call("Crowmeans", x, na.rm)
}
rowsums <- function(x, na.rm=FALSE) {
  .Call("Crowsums", x, na.rm)
}
rowmins <- function(x, na.rm=FALSE) {
  .Call("Crowmins", x, na.rm)
}
rowmaxs <- function(x, na.rm=FALSE) {
  .Call("Crowmaxs", x, na.rm)
}

# Forwards `x` to the native routine referenced by CisReallyReal and returns its result as-is.
# NOTE(review): presumably tests whether a double vector holds non-integer values -- confirm in the C source.
isReallyReal <- function(x) .Call(CisReallyReal, x)
33 changes: 19 additions & 14 deletions R/fread.R
Original file line number Diff line number Diff line change
Expand Up @@ -104,15 +104,6 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
if (isTRUE(as.logical(check.names))) {
setattr(ans, 'names', make.names(names(ans), unique=TRUE))
}
as_factor <- function(x) {
lev = forderv(x, retGrp = TRUE, na.last = NA)
# get levels, also take care of all sorted condition
if (length(lev)) lev = x[lev[attributes(lev)$starts]]
else lev = x[attributes(lev)$starts]
ans = chmatch(x, lev)
setattr(ans, 'levels', lev)
setattr(ans, 'class', 'factor')
}
cols = NULL
if (isTRUE(as.logical(stringsAsFactors)))
cols = which(vapply(ans, is.character, TRUE))
Expand All @@ -122,11 +113,7 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
else if (is.atomic(colClasses) && "factor" %chin% colClasses)
cols = which(vapply(ans, is.character, TRUE))
}
if (length(cols)) {
if (verbose) cat("Converting column(s) [", paste(names(ans)[cols], collapse = ", "), "] from 'char' to 'factor'\n", sep = "")
for (j in cols)
set(ans, j = j, value = as_factor(.subset2(ans, j)))
}
setfactor(ans, cols, verbose)
# FR #768
if (!missing(col.names))
setnames(ans, col.names) # setnames checks and errors automatically
Expand All @@ -140,3 +127,21 @@ fread <- function(input="",sep="auto",sep2="auto",nrows=-1L,header="auto",na.str
}
ans
}

# Internal helper (not for end users): implements the 'stringsAsFactors' argument
# for both `fread` and `data.table`.
#   x       : the table whose columns are replaced via set()
#   cols    : column positions to convert; nothing is converted when this is empty
#   verbose : when TRUE, print the names of the columns being converted
# Returns `x` invisibly.
setfactor <- function(x, cols, verbose) {
  # Cut-down, faster replacement for `factor()`; only meant to be used here.
  to_factor <- function(v) {
    ord = forderv(v, retGrp = TRUE, na.last = NA)
    starts = attributes(ord)$starts
    # Levels are the first element of each group. An empty order vector is the
    # "all sorted" condition, in which case the group starts index `v` directly.
    lvls = if (length(ord)) v[ord[starts]] else v[starts]
    codes = chmatch(v, lvls)
    setattr(codes, 'levels', lvls)
    setattr(codes, 'class', 'factor')
  }
  # Nothing to convert: hand the input back untouched.
  if (!length(cols)) return(invisible(x))
  if (verbose) cat("Converting column(s) [", paste(names(x)[cols], collapse = ", "), "] from 'char' to 'factor'\n", sep = "")
  for (j in cols) {
    set(x, j = j, value = to_factor(.subset2(x, j)))
  }
  invisible(x)
}
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@

19. `dcast.data.table` now allows `drop = c(FALSE, TRUE)` and `drop = c(TRUE, FALSE)`. The former fills only all missing combinations of formula LHS, whereas the latter fills only all missing combinations of formula RHS. Thanks to Ananda Mahto for [this SO post](http://stackoverflow.com/q/34830908/559784) and to Jaap for filing [#1512](https://github.com/Rdatatable/data.table/issues/1512).

20. `data.table()` function gains `stringsAsFactors` argument with default `FALSE`, [#643](https://github.com/Rdatatable/data.table/issues/643). Thanks to @Jan for reviving this issue.

#### BUG FIXES

1. Now compiles and runs on IBM AIX gcc. Thanks to Vinh Nguyen for investigation and testing, [#1351](https://github.com/Rdatatable/data.table/issues/1351).
Expand Down
4 changes: 3 additions & 1 deletion man/data.table.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
Tip: one of the quickest ways to learn the features is to type \code{example(data.table)} and study the output at the prompt.
}
\usage{
data.table(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
data.table(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, stringsAsFactors=FALSE)

\method{[}{data.table}(x, i, j, by, keyby, with = TRUE,
nomatch = getOption("datatable.nomatch"), # default: NA_integer_
Expand Down Expand Up @@ -45,6 +45,8 @@ data.table(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
}
\item{key}{ Character vector of one or more column names which is passed to \code{\link{setkey}}. It may be a single comma separated string such as \code{key="x,y,z"}, or a vector of names such as \code{key=c("x","y","z")}.

}
\item{stringsAsFactors}{Logical (default is \code{FALSE}). Convert all \code{character} columns to \code{factor}s?
}
\item{x}{ A \code{data.table}.

Expand Down

0 comments on commit 3dbc493

Please sign in to comment.