Better fix for encoding issues, #66, #69, #469, #1293.

When marked non-utf8 encodings are detected, they are internally converted to utf-8 before comparing.
Rdatatable · Jan 27, 2016 · 03cd45f · 03cd45f
1 parent c2c8387
commit 03cd45f
Show file tree

Hide file tree

Showing 6 changed files with 69 additions and 69 deletions.
diff --git a/R/as.data.table.R b/R/as.data.table.R
@@ -90,7 +90,7 @@ as.data.table.matrix <- function(x, keep.rownames=FALSE, ...) {
         setattr(value, "names", paste("V", ic, sep = ""))
     setattr(value,"row.names",.set_row_names(nrows))
     setattr(value,"class",c("data.table","data.frame"))
-    alloc.col(setencodingv(value))
+    alloc.col(value)
 }
 
 as.data.table.list <- function(x, keep.rownames=FALSE, ...) {
@@ -122,7 +122,7 @@ as.data.table.list <- function(x, keep.rownames=FALSE, ...) {
     if (is.null(names(x))) setattr(x,"names",paste("V",seq_len(length(x)),sep=""))
     setattr(x,"row.names",.set_row_names(max(n)))
     setattr(x,"class",c("data.table","data.frame"))
-    alloc.col(setencodingv(x))
+    alloc.col(x)
 }
 
 # don't retain classes before "data.frame" while converting 
@@ -153,11 +153,10 @@ as.data.table.data.frame <- function(x, keep.rownames=FALSE, ...) {
 
     # fix for #1078 and #1128, see .resetclass() for explanation.
     setattr(ans, "class", .resetclass(x, "data.frame"))
-    alloc.col(setencodingv(ans))
+    alloc.col(ans)
 }
 
 as.data.table.data.table <- function(x, ...) {
-    setencodingv(x)
     # fix for #1078 and #1128, see .resetclass() for explanation.
     setattr(x, 'class', .resetclass(x, "data.table"))
     return(x)

diff --git a/R/data.table.R b/R/data.table.R
@@ -329,7 +329,6 @@ data.table <-function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL)
     setattr(value,"names",vnames)
     setattr(value,"row.names",.set_row_names(nr))
     setattr(value,"class",c("data.table","data.frame"))
-    setencodingv(value) # fix for all mixed encoding issues
     if (!is.null(key)) {
       if (!is.character(key)) stop("key argument of data.table() must be character")
       if (length(key)==1L) {
@@ -2391,7 +2390,6 @@ setDT <- function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
         }
     }
     if (is.data.table(x)) {
-        setencodingv(x)
         # fix for #1078 and #1128, see .resetclass() for explanation.
         setattr(x, 'class', .resetclass(x, 'data.table'))
         if (!missing(key)) setkeyv(x, key) # fix for #1169
@@ -2403,7 +2401,6 @@ setDT <- function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
         if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE))
         # fix for #1078 and #1128, see .resetclass() for explanation.
         setattr(x, "class", .resetclass(x, 'data.frame'))
-        setencodingv(x)
         alloc.col(x)
         if (!is.null(rn)) {
             nm = c(if (is.character(keep.rownames)) keep.rownames[1L] else "rn", names(x))
@@ -2431,7 +2428,6 @@ setDT <- function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) {
         }
         setattr(x,"row.names",.set_row_names(max(n)))
         setattr(x,"class",c("data.table","data.frame"))
-        setencodingv(x)
         alloc.col(x)
     } else {
         stop("Argument 'x' to 'setDT' should be a 'list', 'data.frame' or 'data.table'")
@@ -2533,40 +2529,3 @@ gend <- function() .Call(Cgend)
 isReallyReal <- function(x) {
     .Call(CisReallyReal, x)
 }
-
-# handle mixed encoding issues, attempting to fix #66, #69, #469 and #1293.
-charcols <- function(x) {
-    if (!is.data.table(x))
-        stop("x must be a data.table")
-    which(vapply(x, is.character, TRUE))
-}
-
-setencoding <- function(x, ...) {
-    setencodingv(x, as.character(substitute(list(...))[-1]))
-}
-
-setencodingv <- function(x, cols=charcols(x)) {
-    if (!missing(cols)) {
-        if (!is.character(cols) || !length(cols) > 0L)
-            stop("cols must be a character vector of length > 0.")
-        if ( length(idx <- which(!cols %chin% names(x))) )
-            stop("Column(s) [", paste(cols[idx], collapse=","), "] not present in input data.table.")
-        notchars = cols[vapply(cols, function(x) !identical("character", mode(x[[cols]])), TRUE)]
-        if (length(notchars)) {
-            warning("Discarding column(s) [", paste(notchars, collapse=","), "] as they are not character type.")
-            cols = setdiff(cols, notchars)
-        }
-        cols = match(cols, names(x), nomatch=0L)
-    }
-    xkey = key(x)
-    if (length(cols)) {
-        for (col in cols) {
-            this = enc2native(x[[col]]) # does this not handle any case?
-            if ( !identical(Encoding(this), Encoding(x[[col]])) )
-                point(x, col, as_list(this), 1L)
-        }
-    }
-    if (!is.null(xkey)) 
-        setattr(x, 'sorted', xkey)
-    invisible(x)
-}
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -7294,20 +7294,23 @@ test(1590.1, A[B], cbind(A, d=2:1))
 # #69
 a  = c("a", "ä", "ß", "z")
 au = iconv(a, "UTF8", "latin1")
-dt = data.table(a,b=1:4)
+dta = data.table(a,b=1:4)
+dtb <- data.table(a=au)
 df = data.frame(a,b=1:4, stringsAsFactors=FALSE)
 rownames(df) = df$a
-test(1590.2, setDT(merge(df, data.frame(a=au), by="a", sort=FALSE)), 
-             merge(dt, data.table(a=au), by="a", sort=FALSE))
+test(1590.2, merge(dta, dtb, by="a", sort=FALSE), 
+             setDT(merge(df, setDF(data.table(a=au)), by="a", sort=FALSE))
+             )
 
 # #469
 A = data.table(
      kom = rep(101L,20),
      eje = 1L:20L,
      gad = rep("A.C. Meyers Vænge",20),
      num = rep(c('1','10','11','11A'),times=c(10,1,2,7)),
-     enc = sample(c('unkwown','UTF-8'), 20, replace=TRUE))
+     enc = sample(c('unknown','UTF-8'), 20, replace=TRUE))
 Encoding(A$gad) = A$enc
+setkey(A, kom, gad, num)
 test(1590.3, unique(A, by=c("kom","gad","num")), 
              as.data.table(A[!duplicated(paste(A$kom, A$gad, A$num))]))
 

diff --git a/src/bmerge.c b/src/bmerge.c
@@ -22,7 +22,7 @@ Differences over standard binary search (e.g. bsearch in stdlib.h) :
 static SEXP i, x;
 static int ncol, *icols, *xcols, *o, *xo, *retFirst, *retLength, *allLen1, *rollends;
 static double roll, rollabs;
-static Rboolean rollToNearest=FALSE; //, enc_warn=TRUE; See setencodingv in data.table.R
+static Rboolean rollToNearest=FALSE; 
 #define XIND(i) (xo ? xo[(i)]-1 : i)
 
 void bmerge_r(int xlow, int xupp, int ilow, int iupp, int col, int lowmax, int uppmax);
@@ -32,7 +32,6 @@ SEXP bmerge(SEXP iArg, SEXP xArg, SEXP icolsArg, SEXP xcolsArg, SEXP isorted, SE
     int xN, iN, protecti=0;
     roll = 0.0;
     rollToNearest = FALSE;
-    // enc_warn = TRUE;
     if (isString(rollarg)) {
         if (strcmp(CHAR(STRING_ELT(rollarg,0)),"nearest") != 0) error("roll is character but not 'nearest'");
         roll=1.0; rollToNearest=TRUE;       // the 1.0 here is just any non-0.0, so roll!=0.0 can be used later
@@ -108,6 +107,16 @@ static union {
 static int mid, tmplow, tmpupp;  // global to save them being added to recursive stack. Maybe optimizer would do this anyway.
 static SEXP ic, xc;
 
+// If we find a non-ASCII, non-NA, non-UTF8 encoding, we try to convert it to UTF8. That is, marked non-ascii/non-UTF8 encodings will always be checked in UTF8 locale. This seems to be the best fix I could think of to put the encoding issues to rest..
+// Since the if-statement will fail with the first condition check in "normal" ASCII cases, there shouldn't be huge penalty issues for default setup.
+// Fix for #66, #69, #469 and #1293
+// TODO: compare 1.9.6 performance with 1.9.7 with huge number of ASCII strings.
+SEXP ENC2UTF8(SEXP s) {
+    if (!IS_ASCII(s) && s != NA_STRING && !IS_UTF8(s))
+        s = mkCharCE(translateCharUTF8(s), CE_UTF8);
+    return (s);
+}
+
 void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int lowmax, int uppmax)
 // col is >0 and <=ncol-1 if this range of [xlow,xupp] and [ilow,iupp] match up to but not including that column
 // lowmax=1 if xlowIn is the lower bound of this group (needed for roll)
@@ -166,31 +175,22 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int lowma
         // ilow and iupp now surround the group in ic, too
         break;
     case STRSXP :
-        ival.s = STRING_ELT(ic,ir);
+        ival.s = ENC2UTF8(STRING_ELT(ic,ir));
         while(xlow < xupp-1) {
             mid = xlow + (xupp-xlow)/2;
-            xval.s = STRING_ELT(xc, XIND(mid));
-            // See setencodingv in data.table.R. Idea is to correct false encodings upfront while creating data.table (fread, data.table, as.data.table and setDT). This warning is therefore not necessary anymore.
-            // if (enc_warn && (ENC_KNOWN(ival.s) || ENC_KNOWN(xval.s))) {
-            //     // The || is only done here to avoid the warning message being repeating in this code.
-            //     warning("A known encoding (latin1 or UTF-8) was detected in a join column. data.table compares the bytes currently, so doesn't support *mixed* encodings well; i.e., using both latin1 and UTF-8, or if any unknown encodings are non-ascii and some of those are marked known and others not. But if either latin1 or UTF-8 is used exclusively, and all unknown encodings are ascii, then the result should be ok. In future we will check for you and avoid this warning if everything is ok. The tricky part is doing this without impacting performance for ascii-only cases.");
-            //     // TO DO: check and warn in forder whether any strings are non-ascii (>127) but unknown encoding
-            //     //        check in forder whether both latin1 and UTF-8 have been used
-            //     //        See bugs #5159 and #5266 and related #5295 to revisit
-            //     enc_warn = FALSE;  // just warn once
-            // }
+            xval.s = ENC2UTF8(STRING_ELT(xc, XIND(mid)));
             tmp = StrCmp(xval.s, ival.s);  // uses pointer equality first, NA_STRING are allowed and joined to, then uses strcmp on CHAR().
             if (tmp == 0) {                // TO DO: deal with mixed encodings and locale optionally
                 tmplow = mid;
                 tmpupp = mid;
                 while(tmplow<xupp-1) {
                     mid = tmplow + (xupp-tmplow)/2;
-                    xval.s = STRING_ELT(xc, XIND(mid));
-                    if (ival.s == xval.s) tmplow=mid; else xupp=mid;  // the == here assumes (within this column) no mixing of latin1 and UTF-8, and no unknown non-ascii
-                }                                                     // TO DO: add checks to forder, see above.
+                    xval.s = ENC2UTF8(STRING_ELT(xc, XIND(mid)));
+                    if (ival.s == xval.s) tmplow=mid; else xupp=mid;  // the == here handles encodings as well. Marked non-utf8 encodings are converted to utf-8 using ENC2UTF8.
+                }
                 while(xlow<tmpupp-1) {
                     mid = xlow + (tmpupp-xlow)/2;
-                    xval.s = STRING_ELT(xc, XIND(mid));
+                    xval.s = ENC2UTF8(STRING_ELT(xc, XIND(mid)));
                     if (ival.s == xval.s) tmpupp=mid; else xlow=mid;  // see above re ==
                 }
                 break;
@@ -204,12 +204,12 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int lowma
         tmpupp = lir;
         while(tmplow<iupp-1) {
             mid = tmplow + (iupp-tmplow)/2;
-            xval.s = STRING_ELT(ic, o ? o[mid]-1 : mid);
+            xval.s = ENC2UTF8(STRING_ELT(ic, o ? o[mid]-1 : mid));
             if (xval.s == ival.s) tmplow=mid; else iupp=mid;   // see above re ==
         }
         while(ilow<tmpupp-1) {
             mid = ilow + (tmpupp-ilow)/2;
-            xval.s = STRING_ELT(ic, o ? o[mid]-1 : mid);
+            xval.s = ENC2UTF8(STRING_ELT(ic, o ? o[mid]-1 : mid));
             if (xval.s == ival.s) tmpupp=mid; else ilow=mid;   // see above re == 
         }
         break;
@@ -323,4 +323,36 @@ void bmerge_r(int xlowIn, int xuppIn, int ilowIn, int iuppIn, int col, int lowma
         bmerge_r(xupp-1, xuppIn, iupp-1, iuppIn, col, lowmax && xupp-1==xlowIn, uppmax);
 }
 
+// HOPEFULLY NOT NEEDED ANYMORE BUT JUST IN CASE...
+// // adapted from util.c::do_enc2, here so that we can directly update columns
+// SEXP setencoding(SEXP s, SEXP enc_utf8, SEXP enc_latin1) {
+//     R_xlen_t i;
+//     SEXP el;
+//     for (i=0; i<XLENGTH(s); i++) {
+//         el = STRING_ELT(s, i);
+//         if (el == NA_STRING) continue;
+//         if (LOGICAL(enc_utf8)[0]) {
+//             if (IS_ASCII(el) || IS_UTF8(el)) continue;
+//             SET_STRING_ELT(s, i, mkCharCE(translateCharUTF8(el), CE_UTF8));
+//         } else if (ENC_KNOWN(el)) {
+//             if (IS_ASCII(el) || (LOGICAL(enc_latin1)[0] && IS_LATIN1(el))) continue;
+//             if (LOGICAL(enc_latin1)[0])
+//                 SET_STRING_ELT(s, i, mkCharCE(translateChar(el), CE_LATIN1));
+//             else 
+//                 SET_STRING_ELT(s, i, mkChar(translateChar(el)));
+//         }
+//     }
+//     return (R_NilValue);
+// }
+
+// SEXP is_encoding_marked(SEXP x) {
 
+//     int i, n=XLENGTH(x);
+//     SEXP s;
+//     for (i=0; i<n; i++) {
+//         s = STRING_ELT(x, i);
+//         if (s != NA_STRING && !IS_ASCII(s))
+//             break;
+//     }
+//     return (ScalarLogical(i!=n));
+// }
diff --git a/src/data.table.h b/src/data.table.h
@@ -5,6 +5,10 @@
 // #include <signal.h> // the debugging machinery + breakpoint aidee
 // raise(SIGINT);
 
+#define IS_UTF8(x)  (LEVELS(x) & 8)
+#define IS_ASCII(x) (LEVELS(x) & 64)
+#define IS_LATIN(x) (LEVELS(x) & 4)
+
 #define SIZEOF(x) sizes[TYPEOF(x)]
 #ifdef MIN
 #undef MIN
@@ -70,6 +74,7 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX
 
 // bmerge.c
 SEXP bmerge(SEXP left, SEXP right, SEXP leftcols, SEXP rightcols, SEXP isorted, SEXP xoArg, SEXP rollarg, SEXP rollends, SEXP nomatch, SEXP retFirst, SEXP retLength, SEXP allLen1);
+SEXP ENC2UTF8(SEXP s);
 
 // fcast.c
 SEXP coerce_to_char(SEXP s, SEXP env);

diff --git a/src/uniqlist.c b/src/uniqlist.c
@@ -37,7 +37,9 @@ SEXP uniqlist(SEXP l, SEXP order)
             case INTSXP : case LGLSXP :
                 b=INTEGER(v)[thisi]==INTEGER(v)[previ]; break;
             case STRSXP :
-                b=STRING_ELT(v,thisi)==STRING_ELT(v,previ); break;  // forder checks no non-ascii unknown, and either UTF-8 or Latin1 but not both. So == pointers is ok given that check.
+                // fix for #469, when key is set, duplicated calls uniqlist, where encoding 
+                // needs to be taken care of.
+                b=ENC2UTF8(STRING_ELT(v,thisi))==ENC2UTF8(STRING_ELT(v,previ)); break;  // marked non-utf8 encodings are converted to utf8 so as to match properly when inputs are of different encodings.
             case REALSXP :
                 ulv = (unsigned long long *)REAL(v);  
                 b = ulv[thisi] == ulv[previ]; // (gives >=2x speedup)