Skip to content

Commit

Permalink
Implemented setdiff #547 task list 1. Not exported.
Browse files Browse the repository at this point in the history
First attempt at setdiff for DTs
  • Loading branch information
arunsrinivasan committed Aug 5, 2014
1 parent 9e8a716 commit d4445b6
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 0 deletions.
37 changes: 37 additions & 0 deletions R/setops.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# setdiff for data.tables, internal at the moment #547
#
# Returns the unique rows of x, restricted to the by.x columns, that do not
# occur among the rows of y restricted to the by.y columns. Columns are paired
# by position: by.x[i] is compared against by.y[i].
#
# Arguments:
#   x, y  data.tables.
#   by.x  character names or integer positions of the columns of x to compare;
#         defaults to every column of x.
#   by.y  likewise for y; must have the same length as by.x.
#
# Value: a data.table whose columns carry the names of the by.x columns of x.
# When y has no columns at all, unique(x) is returned unchanged.
#
# Errors: stops when x or y is not a data.table, when by.x and by.y differ in
# length or are empty, when a requested column does not exist, or when a pair
# of columns has incompatible types (see the checks in the loop below).
setdiff_ <- function(x, y, by.x=seq_along(x), by.y=seq_along(y)) {
  if (!is.data.table(x) || !is.data.table(y)) stop("x and y must be both data.tables")
  # y is known to be a data.table here, so only the zero-column case needs handling.
  if (ncol(y) == 0L) return(unique(x))
  if (length(by.x) != length(by.y)) stop("setdiff(x,y) requires same number of columns for both x and y. However, length(by.x) != length(by.y)")
  if (length(by.x) == 0L) stop("by.x and by.y must be character or integer vectors of length >= 1")
  if (is.character(by.x)) by.x = chmatch(by.x, names(x))
  if (is.character(by.y)) by.y = chmatch(by.y, names(y))
  by.x = as.integer(by.x); by.y = as.integer(by.y)
  # Unmatched names come back as NA; integer positions outside 1..ncol are
  # rejected here as well so they fail with a clear message rather than a
  # subscript error further down.
  if (anyNA(by.x) || any(by.x < 1L | by.x > ncol(x))) stop("Some column(s) specified in by.x are not present in x")
  if (anyNA(by.y) || any(by.y < 1L | by.y > ncol(y))) stop("Some column(s) specified in by.y are not present in y")
  # Type compatibility of each column pair:
  #   character or factor in x  -> y must be character or factor
  #   integer or double in x    -> y must not be logical or character
  for (a in seq_along(by.x)) {
    lc = by.y[a]
    rc = by.x[a]
    icnam = names(y)[lc]
    xcnam = names(x)[rc]
    xcol = x[[rc]]
    ycol = y[[lc]]
    y_is_stringlike = is.character(ycol) || is.factor(ycol)
    if ( is.character(xcol) && !y_is_stringlike ) {
      stop("When x's column ('",xcnam,"') is character, the corresponding column in y ('",icnam,"') should be factor or character, but found incompatible type '",typeof(ycol),"'.")
    } else if ( is.factor(xcol) && !y_is_stringlike ) {
      stop("When x's column ('",xcnam,"') is factor, the corresponding column in y ('",icnam,"') should be character or factor, but found incompatible type '",typeof(ycol),"'.")
    } else if ( (is.integer(xcol) || is.double(xcol)) && (is.logical(ycol) || is.character(ycol)) ) {
      stop("When x's column ('",xcnam,"') is integer or numeric, the corresponding column in y ('",icnam,"') can not be character or logical types, but found incompatible type '",typeof(ycol),"'.")
    }
  }
  # Build two tables holding just the compared columns.
  # NOTE(review): point() is an internal helper not shown here; presumably it
  # fills the list slots with the selected columns without copying - confirm.
  ux = vector("list", length(by.x))
  uy = vector("list", length(by.y))
  point(ux, seq_along(by.x), x, by.x)
  point(uy, seq_along(by.y), y, by.y)
  setDT(ux); setDT(uy)
  # Both tables get x's column names so rbind(use.names=TRUE) lines the
  # paired columns up and the result is named after x.
  setnames(ux, names(x)[by.x])
  setnames(uy, names(x)[by.x])
  # actual setdiff starts here...
  ux = unique(ux); uy = unique(uy)
  # Stack y's unique rows on top of x's: a row of ux flagged as duplicated
  # must therefore already occur in uy (ux itself holds no duplicates).
  dup = duplicated(rbind(uy, ux, use.names=TRUE, fill=FALSE))
  # Pick the ux part with a positive index. Dropping the uy part with
  # -seq_len(nrow(uy)) would select nothing when uy has zero rows, because
  # -integer(0) is integer(0), and the result would wrongly be empty.
  idx = dup[nrow(uy) + seq_len(nrow(ux))]
  .Call("CsubsetDT", ux, which(!idx), seq_along(ux))
}
21 changes: 21 additions & 0 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -4979,6 +4979,27 @@ test(1363.1, selfrefok(DT), 0L)
setDT(DT)
test(1363.2, selfrefok(DT), 1L)

# setdiff, partly #547. Internal as of now, and named setdiff_ because the prefix "set" could be confused with the set* functions.
# Maybe provide a %diff% operator that internally calls setdiff_? Usage: x %diff% y.
# Fixtures: X has columns a (double), b (factor), c (character), d (integer), e (integer 1:9);
# Y has columns a (double), b (factor), c (character), d (integer), e (logical), f (integer).
X = data.table(a=c(1,1,1,1,3,3,2,2,2))[, `:=`(b=factor(a), c=as.character(a), d = as.integer(a), e=1:9)]
Y = data.table(a=c(3,4), b=factor(3:4), c=c("3","4"), d=3:4, e=c(TRUE, FALSE), f=c(5L,7L))
# x column is double: single column, two columns, then an incompatible logical column in y
test(1364.1, setdiff_(X, Y, "a", "a"), data.table(a=c(1,2)))
test(1364.2, setdiff_(X, Y, c("a", "e"), c("a", "f")), X[!5, list(a,e)])
test(1364.3, setdiff_(X, Y, "a", "e"), error="When x's column ('a') is integer or numeric, the corresponding column in y ('e')")
# x column is factor: against factor, two columns, then against character
test(1364.4, setdiff_(X, Y, "b", "b"), data.table(b=factor(c(1,2), levels=c(1,2,3))))
test(1364.5, setdiff_(X, Y, c("b", "e"), c("b", "f")), X[!5, list(b,e)])
test(1364.6, setdiff_(X, Y, "b", "c"), data.table(b=factor(c(1,2), levels=c(1,2,3))))
# x column is character: against character, two columns, then against factor
test(1364.7, setdiff_(X, Y, "c", "c"), data.table(c=as.character(c(1,2))))
test(1364.8, setdiff_(X, Y, c("c", "e"), c("c", "f")), X[!5, list(c,e)])
test(1364.9, setdiff_(X, Y, "c", "b"), data.table(c=c("1", "2")))
# x column is integer (numbering jumps from .9 to .11; presumably because 1364.10 equals 1364.1 as a number)
test(1364.11, setdiff_(X, Y, "d", "d"), data.table(d=1:2))
test(1364.12, setdiff_(X, Y, c("d", "e"), c("d", "f")), X[!5, list(d,e)])
test(1364.13, setdiff_(X, Y, "d", "e"), error="When x's column ('d') is integer or numeric, the corresponding column in y ('e')")
# factor or character in x paired with a double column in y must error
test(1364.14, setdiff_(X, Y, "b", "a"), error="When x's column ('b') is factor, the corresponding column in y ('a')")
test(1364.15, setdiff_(X, Y, "c", "a"), error="When x's column ('c') is character, the corresponding column in y ('a') ")
# default by.x/by.y use all columns: X has 5 and Y has 6, so the lengths differ
test(1364.16, setdiff_(X, Y), error="setdiff(x,y) requires same number of columns for both x and y")
# defaults work when both inputs have the same number of columns
test(1364.17, setdiff_(X[, list(a)], Y[, list(a)]), data.table(a=c(1,2)))

##########################


Expand Down

0 comments on commit d4445b6

Please sign in to comment.