From 47ab524e0291b98f4c886d7b16e5ff3596cd442a Mon Sep 17 00:00:00 2001 From: "NORDEXAG\\BonschM" Date: Fri, 6 Oct 2017 08:23:29 +0200 Subject: [PATCH] Fixed index retainment in .shallow. --- R/data.table.R | 3 ++ inst/tests/tests.Rraw | 101 +++++++++++++++++++++++++++++------------- src/assign.c | 29 ++++++------ 3 files changed, 88 insertions(+), 45 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 1cd28c9f5..073313913 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2378,6 +2378,9 @@ point <- function(to, to_idx, from, from_idx) { ## Either reduced index already present or no columns of the original index remain. ## Drop the original index completely setattr(attr(ans, "index", exact = TRUE), index, NULL) + } else if(length(attr(attr(ans, "index"), index))){ + ## index is not length 0. Drop it since shortening could lead to spurious reordering in discarded columns (#2336) + setattr(attr(ans, "index", exact = TRUE), index, NULL) } else { ## rename index to reducedindex names(attributes(attr(ans, "index")))[names(attributes(attr(ans, "index"))) == index] <- reducedindex diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 15c39c115..3e112e807 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -5779,7 +5779,7 @@ test(1417, DT[2,baz:=10L,verbose=TRUE], output=".*Shortening index 'foo__bar__ba setindex(DT,bar,baz) test(1418.1, DT[2,c("foo","bar"):=10L,verbose=TRUE], output=".*Dropping index.* due to an update on a key column") # test 2nd to 1st setindex(DT,bar,baz) -test(1418.2, DT[2,c("foo","baz"):=10L,verbose=TRUE], output=".*Shortening index 'bar__baz' to 'bar' due to an update on a key column") # test 2nd to 2nd +test(1418.2, DT[2,c("foo","baz"):=10L,verbose=TRUE], output=".*Dropping index 'bar__baz' due to an update on a key column") # test 2nd to 2nd ## testing key retainment on assign (#2372) DT <- data.table(x1 = c(1,1,1,1,1,2,2,2,2,2), @@ -5826,11 +5826,8 @@ test(1419.18, key(thisDT), NULL) thisDT <- copy(DT)[, c("x2", "x3") := .(3,3)] test(1419.19, key(thisDT), "x1") -## testing secondary key retainment on assign (#2372) -DT <- data.table(a = c(1,1,1,2,1,2,2,2,2,2), - aaa = c(2,1,2,2,2,1,1,2,2,2), - b = c(1,2,1,1,2,1,1,1,1,2), - ab = rnorm(10)) +## testing secondary index retainment on assign (#2372) + allIndicesValid <- function(DT){ ## checks that the order of all indices is correct for(idx in seq_along(indices(DT))){ @@ -5847,6 +5844,13 @@ allIndicesValid <- function(DT){ } return(TRUE) } + +## on data.table where indices are not integer(0) +DT <- data.table(a = c(1,1,1,2,1,2,2,2,2,2), + aaa = c(2,1,2,2,2,1,1,2,2,2), + b = c(1,2,1,1,2,1,1,1,1,2), + ab = rnorm(10)) + test(1419.21, indices(copy(DT)[1, a:=1]), NULL) setindex(DT, a) setindex(DT, a, aaa) @@ -5860,36 +5864,67 @@ thisDT <- copy(DT)[, b := 2] test(1419.25, indices(thisDT), c("a", "a__aaa", "ab__aaa")) test(1419.26, allIndicesValid(thisDT), TRUE) thisDT <- copy(DT)[, ab := 2] -test(1419.27, indices(thisDT), c("a", "a__aaa", "a__aaa__b")) +test(1419.27, indices(thisDT), c("a", "a__aaa")) test(1419.28, allIndicesValid(thisDT), TRUE) thisDT <- copy(DT)[, aaa := 2] -test(1419.29, indices(thisDT), c("a", "ab")) +test(1419.29, indices(thisDT), c("a")) test(1419.31, allIndicesValid(thisDT), TRUE) thisDT <- copy(DT)[, c("aaa", "b") := 2] -test(1419.32, indices(thisDT), c("a", "ab")) +test(1419.32, indices(thisDT), c("a")) test(1419.33, allIndicesValid(thisDT), TRUE) -## same on empty DT + +## on data.table where indices are integer(0) +DT <- data.table(a = c(1,1,1,1,1,2,2,2,2,2), + aaa = c(1,1,2,2,2,1,1,2,2,2), + b = c(1,2,1,2,3,1,2,1,2,3), + ab = 1:10) + +test(1419.34, indices(copy(DT)[1, a:=1]), NULL) +setindex(DT, a) +setindex(DT, a, aaa) +setindex(DT, ab, aaa) +setindex(DT) +test(1419.35, allIndicesValid(DT), TRUE) +thisDT <- copy(DT)[1, a:=1][, aaa := 1][, ab := 1] +test(1419.36, indices(thisDT), NULL) +test(1419.37, allIndicesValid(thisDT), TRUE) +thisDT <- copy(DT)[, b := 2] +test(1419.38, indices(thisDT), c("a", "a__aaa", "ab__aaa")) +test(1419.39, allIndicesValid(thisDT), TRUE) +thisDT <- copy(DT)[, ab := 2] +test(1419.41, indices(thisDT), c("a", "a__aaa", "a__aaa__b")) +test(1419.42, allIndicesValid(thisDT), TRUE) +thisDT <- copy(DT)[, aaa := 2] +test(1419.43, indices(thisDT), c("a", "ab")) +test(1419.44, allIndicesValid(thisDT), TRUE) +thisDT <- copy(DT)[, c("aaa", "b") := 2] +test(1419.45, indices(thisDT), c("a", "ab")) +test(1419.46, allIndicesValid(thisDT), TRUE) + +## on empty DT DT <- DT[0] +setindex(DT, NULL) +test(1419.47, indices(copy(DT)[, a:=1]), NULL) setindex(DT, a) setindex(DT, a, aaa) setindex(DT, ab, aaa) setindex(DT) -test(1419.34, allIndicesValid(DT), TRUE) +test(1419.48, allIndicesValid(DT), TRUE) thisDT <- copy(DT)[, a:=1][, aaa := 1][, ab := 1] -test(1419.35, indices(thisDT), NULL) -test(1419.36, allIndicesValid(thisDT), TRUE) +test(1419.49, indices(thisDT), NULL) +test(1419.51, allIndicesValid(thisDT), TRUE) thisDT <- copy(DT)[, b := 2] -test(1419.37, indices(thisDT), c("a", "a__aaa", "ab__aaa")) -test(1419.38, allIndicesValid(thisDT), TRUE) +test(1419.52, indices(thisDT), c("a", "a__aaa", "ab__aaa")) +test(1419.53, allIndicesValid(thisDT), TRUE) thisDT <- copy(DT)[, ab := 2] -test(1419.39, indices(thisDT), c("a", "a__aaa", "a__aaa__b")) -test(1419.41, allIndicesValid(thisDT), TRUE) +test(1419.54, indices(thisDT), c("a", "a__aaa", "a__aaa__b")) +test(1419.55, allIndicesValid(thisDT), TRUE) thisDT <- copy(DT)[, aaa := 2] -test(1419.42, indices(thisDT), c("a", "ab")) -test(1419.43, allIndicesValid(thisDT), TRUE) +test(1419.56, indices(thisDT), c("a", "ab")) +test(1419.57, allIndicesValid(thisDT), TRUE) thisDT <- copy(DT)[, c("aaa", "b") := 2] -test(1419.44, indices(thisDT), c("a", "ab")) -test(1419.45, allIndicesValid(thisDT), TRUE) +test(1419.58, indices(thisDT), c("a", "ab")) +test(1419.59, allIndicesValid(thisDT), TRUE) # setnames updates secondary key @@ -7006,25 +7041,27 @@ test(1545.137, key(.shallow(x1, cols=c("a1", "a2"), retain.key=TRUE)), "a1") test(1545.138, key(.shallow(x1, cols=c("a3"), retain.key=TRUE)), NULL) # tests for #2336. .shallow now retains indices as well -x1 <- data.table(a1 = c('a', 'a', 'a', 'a', 'b', 'c'), a2 = c(2L, 2L, 1L, 2L, 3L, 2L), a3 = c(FALSE, TRUE, TRUE, FALSE, FALSE, TRUE), key="a1,a2") +x1 <- data.table(a1 = c('a', 'a', 'a', 'a', 'b', 'c'), + a2 = c(1L, 1L, 1L, 2L, 2L, 2L), + a3 = c(FALSE, TRUE, FALSE, FALSE, FALSE, TRUE)) setindex(x1, a1, a2, a3) setindex(x1, a1, a3) +setindex(x1, a1, a2) ## index with length 0 test(1545.15, indices(.shallow(x1, retain.key=FALSE)), NULL) -test(1545.16, indices(.shallow(x1, cols = "a1", retain.key=FALSE)), NULL) +test(1545.16, indices(.shallow(x1, cols = "a2", retain.key=FALSE)), NULL) test(1545.17, indices(.shallow(x1, retain.key=TRUE)), indices(x1)) test(1545.18, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a2__a3")], c("a1", "a2", "a3")), integer(0)) test(1545.19, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0)) -test(1545.20, forderv(.shallow(x1, retain.key=TRUE)[attr(attr(.shallow(x1, retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) +test(1545.20, forderv(.shallow(x1, retain.key=TRUE), c("a1", "a2")), integer(0)) test(1545.21, indices(.shallow(x1, cols = "a1", retain.key=TRUE)), c("a1")) -test(1545.22, forderv(.shallow(x1, cols = "a1", retain.key=TRUE)[attr(attr(.shallow(x1, cols = "a1", retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) +test(1545.22, forderv(.shallow(x1, cols = "a1", retain.key=TRUE), c("a1")), integer(0)) test(1545.23, attributes(attr(.shallow(x1, cols = c("a1", "a2"), retain.key = TRUE), "index", exact = TRUE)), attributes(attr(.shallow(x1, cols = c("a2", "a1"), retain.key = TRUE), "index", exact = TRUE))) -test(1545.24, indices(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)), c("a1__a2", "a1")) -test(1545.25, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) -test(1545.26, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), "index"), "__a1__a2")], c("a1", "a2")), integer(0)) -test(1545.27, indices(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)), c("a1", "a1__a3")) -test(1545.28, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1")], c("a1")), integer(0)) -test(1545.29, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0)) -test(1545.30, indices(.shallow(x1, cols = c("a2", "a3"), retain.key=TRUE)), NULL) +test(1545.24, indices(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE)), c("a1__a2")) +test(1545.25, forderv(.shallow(x1, cols = c("a1", "a2"), retain.key=TRUE), c("a1", "a2")), integer(0)) +test(1545.26, indices(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)), c("a1__a3", "a1")) +test(1545.27, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), c("a1")), integer(0)) +test(1545.28, forderv(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE)[attr(attr(.shallow(x1, cols = c("a1", "a3"), retain.key=TRUE), "index"), "__a1__a3")], c("a1", "a3")), integer(0)) +test(1545.29, indices(.shallow(x1, cols = c("a2", "a3"), retain.key=TRUE)), NULL) test(1545.31, indices(.shallow(x1, cols = c("a3"), retain.key=TRUE)), NULL) test(1545.32, .shallow(x1, cols = c("a1", "a2", "a3"), retain.key=TRUE), .shallow(x1, retain.key=TRUE)) diff --git a/src/assign.c b/src/assign.c index 88d0ec7f5..e9c63bc85 100644 --- a/src/assign.c +++ b/src/assign.c @@ -281,13 +281,13 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values, SEXP v // newcolnames : add these columns (if any) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign - R_len_t i, j, nrow, targetlen, vlen, r, oldncol, oldtncol, coln, protecti=0, newcolnum; + R_len_t i, j, nrow, targetlen, vlen, r, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; SEXP targetcol, RHS, names, nullint, thisvalue, thisv, targetlevels, newcol, s, colnam, class, tmp, colorder, key, index, a, assignedNames, indexNames; SEXP bindingIsLocked = getAttrib(dt, install(".data.table.locked")); Rboolean verbose = LOGICAL(verb)[0], anytodelete=FALSE, isDataTable=FALSE; char *s1, *s2, *s3, *s4, *s5; const char *c1, *tc1, *tc2; - int *buf, k=0, newKeyLength; + int *buf, k=0, newKeyLength, indexNo; size_t size; // must be size_t otherwise overflow later in memcpy if (isNull(dt)) error("assign has been passed a NULL dt"); if (TYPEOF(dt) != VECSXP) error("dt passed to assign isn't type VECSXP"); @@ -641,10 +641,10 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values, SEXP v index = getAttrib(dt, install("index")); if (index != R_NilValue) { s = ATTRIB(index); + indexNo = 0; // get a vector with all index names PROTECT(indexNames = allocVector(STRSXP, xlength(s))); protecti++; - int indexNo = 0; while(s != R_NilValue){ SET_STRING_ELT(indexNames, indexNo, PRINTNAME(TAG(s))); indexNo++; @@ -654,6 +654,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values, SEXP v indexNo = 0; while(s != R_NilValue) { a = TAG(s); + indexLength = xlength(CAR(s)); tc1 = c1 = CHAR(PRINTNAME(a)); // the index name; e.g. "__col1__col2" if (*tc1!='_' || *(tc1+1)!='_') { // fix for #1396 @@ -703,21 +704,23 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values, SEXP v setAttrib(index, a, R_NilValue); SET_STRING_ELT(indexNames, indexNo, NA_STRING); if (verbose) { - Rprintf("Dropping index '%s' due to an update on a key column\n", c1+2); + Rprintf("Dropping index '%s' due to an update on a key column\n", c1+2); } } else if(newKeyLength < strlen(c1)){ - if(LOGICAL(chmatch(mkString(s4), indexNames, 0, TRUE))[0]) { // index is shortened, but shortened version is already covered by other index. - setAttrib(index, a, R_NilValue); - SET_STRING_ELT(indexNames, indexNo, NA_STRING); - if (verbose) { - Rprintf("Dropping index '%s' due to an update on a key column\n", c1+2); - } - } else { - // rename index to shortened version + if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372) + LOGICAL(chmatch(mkString(s4), indexNames, 0, TRUE))[0] == 0 ){// index with shortened name not present yet SET_TAG(s, install(s4)); SET_STRING_ELT(indexNames, indexNo, mkChar(s4)); if (verbose) { - Rprintf("Shortening index '%s' to '%s' due to an update on a key column\n", c1+2, s4 + 2); + Rprintf("Shortening index '%s' to '%s' due to an update on a key column\n", c1+2, s4 + 2); + } + } else{ // indexLength > 0 || shortened name present already + // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) + // shortened anme already present indicates that index needs to be dropped to avoid duplicate indices. + setAttrib(index, a, R_NilValue); + SET_STRING_ELT(indexNames, indexNo, NA_STRING); + if (verbose) { + Rprintf("Dropping index '%s' due to an update on a key column\n", c1+2); } } } //else: index is not affected by assign: nothing to be done