Skip to content

Commit

Permalink
Better fix for encoding issues, #66, #69, #469, #1293.
Browse files Browse the repository at this point in the history
When marked non-utf8 encodings are detected, they are internally converted to utf-8 before comparing.
  • Loading branch information
arunsrinivasan committed Jan 27, 2016
1 parent c2c8387 commit 03cd45f
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 69 deletions.
7 changes: 3 additions & 4 deletions R/as.data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ as.data.table.matrix <- function(x, keep.rownames=FALSE, ...) {
setattr(value, "names", paste("V", ic, sep = ""))
setattr(value,"row.names",.set_row_names(nrows))
setattr(value,"class",c("data.table","data.frame"))
alloc.col(setencodingv(value))
alloc.col(value)
}

as.data.table.list <- function(x, keep.rownames=FALSE, ...) {
Expand Down Expand Up @@ -122,7 +122,7 @@ as.data.table.list <- function(x, keep.rownames=FALSE, ...) {
if (is.null(names(x))) setattr(x,"names",paste("V",seq_len(length(x)),sep=""))
setattr(x,"row.names",.set_row_names(max(n)))
setattr(x,"class",c("data.table","data.frame"))
alloc.col(setencodingv(x))
alloc.col(x)
}

# don't retain classes before "data.frame" while converting
Expand Down Expand Up @@ -153,11 +153,10 @@ as.data.table.data.frame <- function(x, keep.rownames=FALSE, ...) {

# fix for #1078 and #1128, see .resetclass() for explanation.
setattr(ans, "class", .resetclass(x, "data.frame"))
alloc.col(setencodingv(ans))
alloc.col(ans)
}

as.data.table.data.table <- function(x, ...) {
setencodingv(x)
# fix for #1078 and #1128, see .resetclass() for explanation.
setattr(x, 'class', .resetclass(x, "data.table"))
return(x)
Expand Down
41 changes: 0 additions & 41 deletions R/data.table.R
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,6 @@ data.table <-function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
setattr(value,"names",vnames)
setattr(value,"row.names",.set_row_names(nr))
setattr(value,"class",c("data.table","data.frame"))
setencodingv(value) # fix for all mixed encoding issues
if (!is.null(key)) {
if (!is.character(key)) stop("key argument of data.table() must be character")
if (length(key)==1L) {
Expand Down Expand Up @@ -2391,7 +2390,6 @@ setDT <- function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
}
}
if (is.data.table(x)) {
setencodingv(x)
# fix for #1078 and #1128, see .resetclass() for explanation.
setattr(x, 'class', .resetclass(x, 'data.table'))
if (!missing(key)) setkeyv(x, key) # fix for #1169
Expand All @@ -2403,7 +2401,6 @@ setDT <- function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE))
# fix for #1078 and #1128, see .resetclass() for explanation.
setattr(x, "class", .resetclass(x, 'data.frame'))
setencodingv(x)
alloc.col(x)
if (!is.null(rn)) {
nm = c(if (is.character(keep.rownames)) keep.rownames[1L] else "rn", names(x))
Expand Down Expand Up @@ -2431,7 +2428,6 @@ setDT <- function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
}
setattr(x,"row.names",.set_row_names(max(n)))
setattr(x,"class",c("data.table","data.frame"))
setencodingv(x)
alloc.col(x)
} else {
stop("Argument 'x' to 'setDT' should be a 'list', 'data.frame' or 'data.table'")
Expand Down Expand Up @@ -2533,40 +2529,3 @@ gend <- function() .Call(Cgend)
isReallyReal <- function(x) {
.Call(CisReallyReal, x)
}

# handle mixed encoding issues, attempting to fix #66, #69, #469 and #1293.
charcols <- function(x) {
if (!is.data.table(x))
stop("x must be a data.table")
which(vapply(x, is.character, TRUE))
}

setencoding <- function(x, ...) {
setencodingv(x, as.character(substitute(list(...))[-1]))
}

setencodingv <- function(x, cols=charcols(x)) {
if (!missing(cols)) {
if (!is.character(cols) || !length(cols) > 0L)
stop("cols must be a character vector of length > 0.")
if ( length(idx <- which(!cols %chin% names(x))) )
stop("Column(s) [", paste(cols[idx], collapse=","), "] not present in input data.table.")
notchars = cols[vapply(cols, function(x) !identical("character", mode(x[[cols]])), TRUE)]
if (length(notchars)) {
warning("Discarding column(s) [", paste(notchars, collapse=","), "] as they are not character type.")
cols = setdiff(cols, notchars)
}
cols = match(cols, names(x), nomatch=0L)
}
xkey = key(x)
if (length(cols)) {
for (col in cols) {
this = enc2native(x[[col]]) # does this not handle any case?
if ( !identical(Encoding(this), Encoding(x[[col]])) )
point(x, col, as_list(this), 1L)
}
}
if (!is.null(xkey))
setattr(x, 'sorted', xkey)
invisible(x)
}
11 changes: 7 additions & 4 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -7294,20 +7294,23 @@ test(1590.1, A[B], cbind(A, d=2:1))
# #69
a = c("a", "ä", "ß", "z")
au = iconv(a, "UTF8", "latin1")
dt = data.table(a,b=1:4)
dta = data.table(a,b=1:4)
dtb <- data.table(a=au)
df = data.frame(a,b=1:4, stringsAsFactors=FALSE)
rownames(df) = df$a
test(1590.2, setDT(merge(df, data.frame(a=au), by="a", sort=FALSE)),
merge(dt, data.table(a=au), by="a", sort=FALSE))
test(1590.2, merge(dta, dtb, by="a", sort=FALSE),
setDT(merge(df, setDF(data.table(a=au)), by="a", sort=FALSE))
)

# #469
A = data.table(
kom = rep(101L,20),
eje = 1L:20L,
gad = rep("A.C. Meyers Vænge",20),
num = rep(c('1','10','11','11A'),times=c(10,1,2,7)),
enc = sample(c('unkwown','UTF-8'), 20, replace=TRUE))
enc = sample(c('unknown','UTF-8'), 20, replace=TRUE))
Encoding(A$gad) = A$enc
setkey(A, kom, gad, num)
test(1590.3, unique(A, by=c("kom","gad","num")),
as.data.table(A[!duplicated(paste(A$kom, A$gad, A$num))]))

Expand Down
70 changes: 51 additions & 19 deletions src/bmerge.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Differences over standard binary search (e.g. bsearch in stdlib.h) :
static SEXP i, x;
static int ncol, *icols, *xcols, *o, *xo, *retFirst, *retLength, *allLen1, *rollends;
static double roll, rollabs;
static Rboolean rollToNearest=FALSE; //, enc_warn=TRUE; See setencodingv in data.table.R
static Rboolean rollToNearest=FALSE;
#define XIND(i) (xo ? xo[(i)]-1 : i)

void bmerge_r(int xlow, int xupp, int ilow, int iupp, int col, int lowmax, int uppmax);
Expand All @@ -32,7 +32,6 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE
int xN, iN, protecti=0;
roll = 0.0;
rollToNearest = FALSE;
// enc_warn = TRUE;
if (isString(rollarg)) {
if (strcmp(CHAR(STRING_ELT(rollarg,0)),"nearest") != 0) error("roll is character but not 'nearest'");
roll=1.0; rollToNearest=TRUE; // the 1.0 here is just any non-0.0, so roll!=0.0 can be used later
Expand Down Expand Up @@ -108,6 +107,16 @@ static union {
static int mid, tmplow, tmpupp; // global to save them being added to recursive stack. Maybe optimizer would do this anyway.
static SEXP ic, xc;

// If we find a non-ASCII, non-NA, non-UTF8 encoding, we try to convert it to UTF8. That is, marked non-ascii/non-UTF8 encodings will always be checked in UTF8 locale. This seems to be the best fix I could think of to put the encoding issues to rest..
// Since the if-statement will fail with the first condition check in "normal" ASCII cases, there shouldn't be huge penalty issues for default setup.
// Fix for #66, #69, #469 and #1293
// TODO: compare 1.9.6 performance with 1.9.7 with huge number of ASCII strings.
SEXP ENC2UTF8(SEXP s) {
if (!IS_ASCII(s) && s != NA_STRING && !IS_UTF8(s))
s = mkCharCE(translateCharUTF8(s), CE_UTF8);
return (s);
}

void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int lowmax, int uppmax)
// col is >0 and <=ncol-1 if this range of [xlow,xupp] and [ilow,iupp] match up to but not including that column
// lowmax=1 if xlowIn is the lower bound of this group (needed for roll)
Expand Down Expand Up @@ -166,31 +175,22 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int lowma
// ilow and iupp now surround the group in ic, too
break;
case STRSXP :
ival.s = STRING_ELT(ic,ir);
ival.s = ENC2UTF8(STRING_ELT(ic,ir));
while(xlow < xupp-1) {
mid = xlow + (xupp-xlow)/2;
xval.s = STRING_ELT(xc, XIND(mid));
// See setencodingv in data.table.R. Idea is to correct false encodings upfront while creating data.table (fread, data.table, as.data.table and setDT). This warning is therefore not necessary anymore.
// if (enc_warn && (ENC_KNOWN(ival.s) || ENC_KNOWN(xval.s))) {
// // The || is only done here to avoid the warning message being repeating in this code.
// warning("A known encoding (latin1 or UTF-8) was detected in a join column. data.table compares the bytes currently, so doesn't support *mixed* encodings well; i.e., using both latin1 and UTF-8, or if any unknown encodings are non-ascii and some of those are marked known and others not. But if either latin1 or UTF-8 is used exclusively, and all unknown encodings are ascii, then the result should be ok. In future we will check for you and avoid this warning if everything is ok. The tricky part is doing this without impacting performance for ascii-only cases.");
// // TO DO: check and warn in forder whether any strings are non-ascii (>127) but unknown encoding
// // check in forder whether both latin1 and UTF-8 have been used
// // See bugs #5159 and #5266 and related #5295 to revisit
// enc_warn = FALSE; // just warn once
// }
xval.s = ENC2UTF8(STRING_ELT(xc, XIND(mid)));
tmp = StrCmp(xval.s, ival.s); // uses pointer equality first, NA_STRING are allowed and joined to, then uses strcmp on CHAR().
if (tmp == 0) { // TO DO: deal with mixed encodings and locale optionally
tmplow = mid;
tmpupp = mid;
while(tmplow<xupp-1) {
mid = tmplow + (xupp-tmplow)/2;
xval.s = STRING_ELT(xc, XIND(mid));
if (ival.s == xval.s) tmplow=mid; else xupp=mid; // the == here assumes (within this column) no mixing of latin1 and UTF-8, and no unknown non-ascii
} // TO DO: add checks to forder, see above.
xval.s = ENC2UTF8(STRING_ELT(xc, XIND(mid)));
if (ival.s == xval.s) tmplow=mid; else xupp=mid; // the == here handles encodings as well. Marked non-utf8 encodings are converted to utf-8 using ENC2UTF8.
}
while(xlow<tmpupp-1) {
mid = xlow + (tmpupp-xlow)/2;
xval.s = STRING_ELT(xc, XIND(mid));
xval.s = ENC2UTF8(STRING_ELT(xc, XIND(mid)));
if (ival.s == xval.s) tmpupp=mid; else xlow=mid; // see above re ==
}
break;
Expand All @@ -204,12 +204,12 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int lowma
tmpupp = lir;
while(tmplow<iupp-1) {
mid = tmplow + (iupp-tmplow)/2;
xval.s = STRING_ELT(ic, o ? o[mid]-1 : mid);
xval.s = ENC2UTF8(STRING_ELT(ic, o ? o[mid]-1 : mid));
if (xval.s == ival.s) tmplow=mid; else iupp=mid; // see above re ==
}
while(ilow<tmpupp-1) {
mid = ilow + (tmpupp-ilow)/2;
xval.s = STRING_ELT(ic, o ? o[mid]-1 : mid);
xval.s = ENC2UTF8(STRING_ELT(ic, o ? o[mid]-1 : mid));
if (xval.s == ival.s) tmpupp=mid; else ilow=mid; // see above re ==
}
break;
Expand Down Expand Up @@ -323,4 +323,36 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int lowma
bmerge_r(xupp-1, xuppIn, iupp-1, iuppIn, col, lowmax && xupp-1==xlowIn, uppmax);
}

// HOPEFULLY NOT NEEDED ANYMORE BUT JUST IN CASE...
// // adapted from util.c::do_enc2, here so that we can directly update columns
// SEXP setencoding(SEXP s, SEXP enc_utf8, SEXP enc_latin1) {
// R_xlen_t i;
// SEXP el;
// for (i=0; i<XLENGTH(s); i++) {
// el = STRING_ELT(s, i);
// if (el == NA_STRING) continue;
// if (LOGICAL(enc_utf8)[0]) {
// if (IS_ASCII(el) || IS_UTF8(el)) continue;
// SET_STRING_ELT(s, i, mkCharCE(translateCharUTF8(el), CE_UTF8));
// } else if (ENC_KNOWN(el)) {
// if (IS_ASCII(el) || (LOGICAL(enc_latin1)[0] && IS_LATIN1(el))) continue;
// if (LOGICAL(enc_latin1)[0])
// SET_STRING_ELT(s, i, mkCharCE(translateChar(el), CE_LATIN1));
// else
// SET_STRING_ELT(s, i, mkChar(translateChar(el)));
// }
// }
// return (R_NilValue);
// }

// SEXP is_encoding_marked(SEXP x) {

// int i, n=XLENGTH(x);
// SEXP s;
// for (i=0; i<n; i++) {
// s = STRING_ELT(x, i);
// if (s != NA_STRING && !IS_ASCII(s))
// break;
// }
// return (ScalarLogical(i!=n));
// }
5 changes: 5 additions & 0 deletions src/data.table.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
// #include <signal.h> // the debugging machinery + breakpoint aidee
// raise(SIGINT);

#define IS_UTF8(x) (LEVELS(x) & 8)
#define IS_ASCII(x) (LEVELS(x) & 64)
#define IS_LATIN(x) (LEVELS(x) & 4)

#define SIZEOF(x) sizes[TYPEOF(x)]
#ifdef MIN
#undef MIN
Expand Down Expand Up @@ -70,6 +74,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX

// bmerge.c
SEXP bmerge(SEXP left, SEXP right, SEXP leftcols, SEXP rightcols, SEXP isorted, SEXP xoArg, SEXP rollarg, SEXP rollends, SEXP nomatch, SEXP retFirst, SEXP retLength, SEXP allLen1);
SEXP ENC2UTF8(SEXP s);

// fcast.c
SEXP coerce_to_char(SEXP s, SEXP env);
Expand Down
4 changes: 3 additions & 1 deletion src/uniqlist.c
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,9 @@ SEXP uniqlist(SEXP l, SEXP order)
case INTSXP : case LGLSXP :
b=INTEGER(v)[thisi]==INTEGER(v)[previ]; break;
case STRSXP :
b=STRING_ELT(v,thisi)==STRING_ELT(v,previ); break; // forder checks no non-ascii unknown, and either UTF-8 or Latin1 but not both. So == pointers is ok given that check.
// fix for #469, when key is set, duplicated calls uniqlist, where encoding
// needs to be taken care of.
b=ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); break; // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings.
case REALSXP :
ulv = (unsigned long long *)REAL(v);
b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup)
Expand Down

0 comments on commit 03cd45f

Please sign in to comment.