From 59a03025dcf7cd8b8fa8b8012e8a5de5dd3e32a9 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 21 Jun 2019 21:34:29 -0700 Subject: [PATCH 1/8] Added specialized findOverlaps methods for GRangesFactors. --- R/findOverlaps-methods.R | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/R/findOverlaps-methods.R b/R/findOverlaps-methods.R index aa608a01..7471e938 100644 --- a/R/findOverlaps-methods.R +++ b/R/findOverlaps-methods.R @@ -223,6 +223,52 @@ setMethod("findOverlaps", c("GenomicRanges", "GRangesList"), } ) +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### "findOverlaps" methods for GRangesFactor objects +### + +setMethod("findOverlaps", c("GRangesFactor", "GenomicRanges"), function(query, subject, + maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), + select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) +{ + idx <- as.integer(query) + query <- levels(query) + lev.hits <- callGeneric() + idx.hits <- findMatches(idx, queryHits(lev.hits)) + Hits(from=queryHits(idx.hits), to=subjectHits(lev.hits)[subjectHits(idx.hits)], + nLnode=length(idx), nRnode=length(subject), sort.by.query=TRUE) +}) + +setMethod("findOverlaps", c("GenomicRanges", "GRangesFactor"), function(query, subject, + maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), + select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) +{ + idx <- as.integer(subject) + subject <- levels(subject) + lev.hits <- callGeneric() + idx.hits <- findMatches(subjectHits(lev.hits), idx) + Hits(from=queryHits(lev.hits)[queryHits(idx.hits)], to=subjectHits(idx.hits), + nLnode=length(query), nRnode=length(idx), sort.by.query=TRUE) +}) + +setMethod("findOverlaps", c("GRangesFactor", "GRangesFactor"), function(query, subject, + maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), + select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) +{ + q.idx 
<- as.integer(query) + query <- levels(query) + s.idx <- as.integer(subject) + subject <- levels(subject) + + lev.hits <- callGeneric() + q.idx.hits <- findMatches(q.idx, queryHits(lev.hits)) + s.idx.hits <- findMatches(subjectHits(lev.hits), s.idx) + reconciler <- findMatches(subjectHits(q.idx.hits), queryHits(s.idx.hits)) + + Hits(from=queryHits(q.idx.hits)[queryHits(reconciler)], + to=subjectHits(s.idx.hits)[subjectHits(reconciler)], + nLnode=length(q.idx), nRnode=length(s.idx), sort.by.query=TRUE) +}) ### ========================================================================= ### findOverlaps-based methods From 4b770414aba1842375fdaacb53c920896043796a Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 21 Jun 2019 21:41:54 -0700 Subject: [PATCH 2/8] Tested new GRF findOverlaps methods. --- inst/unitTests/test_findOverlaps-methods.R | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/inst/unitTests/test_findOverlaps-methods.R b/inst/unitTests/test_findOverlaps-methods.R index 8cc87728..48ec2b35 100644 --- a/inst/unitTests/test_findOverlaps-methods.R +++ b/inst/unitTests/test_findOverlaps-methods.R @@ -325,3 +325,39 @@ test_findOverlaps_with_circular_sequences <- function() .checkHits(1:4, 1:4, 4, 4, current5, select="all") } +test_findOverlaps_with_GRangesFactors <- function() { + ir0 <- IRanges(c(5, 25, 20, 30, 45, 35, 10, 15), width=10) + gr0 <- GRanges("chrA", ir0) + F0 <- Factor(gr0[rep(seq_along(gr0), seq_along(gr0))]) + + ir1 <- IRanges(c(18, 8, 28, 38), width=5) + gr1 <- GRanges("chrA", ir1) + F1 <- Factor(gr1[rep(seq_along(gr1), rev(seq_along(gr1)))]) + + # findOverlaps works with a Factor as the query. + out <- findOverlaps(F0, gr1) + ref <- findOverlaps(unfactor(F0), gr1) + checkIdentical(out, ref) + + out <- findOverlaps(F0, gr1, minoverlap=4) + ref <- findOverlaps(unfactor(F0), gr1, minoverlap=4) + checkIdentical(out, ref) + + # findOverlaps works with a Factor as the subject. 
+ out <- findOverlaps(gr0, F1) + ref <- findOverlaps(gr0, unfactor(F1)) + checkIdentical(sort(out), sort(ref)) # hack to overcome lack of subject sorting guarantees. + + out <- findOverlaps(gr0, F1, maxgap=4) + ref <- findOverlaps(gr0, unfactor(F1), maxgap=4) + checkIdentical(sort(out), sort(ref)) # hack to overcome lack of subject sorting guarantees. + + # findOverlaps works with two Factors. + out <- findOverlaps(F0, F1) + ref <- findOverlaps(unfactor(F0), unfactor(F1)) + checkIdentical(out, ref) + + out <- findOverlaps(F0, F1, maxgap=2) + ref <- findOverlaps(unfactor(F0), unfactor(F1), maxgap=2) + checkIdentical(out, ref) +} From 72efa6a63fc9e8da09e84ef182acf0b31f9c02b4 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 21 Jun 2019 21:45:43 -0700 Subject: [PATCH 3/8] Mentioned new methods in docs. --- man/findOverlaps-methods.Rd | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/man/findOverlaps-methods.Rd b/man/findOverlaps-methods.Rd index be2e737b..7467f500 100644 --- a/man/findOverlaps-methods.Rd +++ b/man/findOverlaps-methods.Rd @@ -6,6 +6,9 @@ \alias{findOverlaps,GRangesList,GenomicRanges-method} \alias{findOverlaps,GenomicRanges,GRangesList-method} \alias{findOverlaps,GRangesList,GRangesList-method} +\alias{findOverlaps,GRangesFactor,GenomicRanges-method} +\alias{findOverlaps,GenomicRanges,GRangesFactor-method} +\alias{findOverlaps,GRangesFactor,GRangesFactor-method} \alias{countOverlaps} \alias{countOverlaps,GenomicRanges,GenomicRanges-method} @@ -97,6 +100,10 @@ For \code{type="equal"} with GRangesList objects, \code{query[[i]]} matches \code{subject[[j]]} iff for each range in \code{query[[i]]} there is an identical range in \code{subject[[j]]}, and vice versa. + + If \code{query}, \code{subject}, or both are \link{GRangesFactor} objects, + overlaps are computed on the unique levels and then expanded back to all elements. + This makes the function more efficient for large GRangesFactors with few levels. 
} \value{ From c2b519a96cd38a9ebb9d79ee91202968677be588 Mon Sep 17 00:00:00 2001 From: LTLA Date: Fri, 21 Jun 2019 21:53:17 -0700 Subject: [PATCH 4/8] Allow efficient bypass for small Factors with many levels. --- R/findOverlaps-methods.R | 64 ++++++++++++++-------- inst/unitTests/test_findOverlaps-methods.R | 25 +++++++++ 2 files changed, 66 insertions(+), 23 deletions(-) diff --git a/R/findOverlaps-methods.R b/R/findOverlaps-methods.R index 7471e938..9dd9b5e3 100644 --- a/R/findOverlaps-methods.R +++ b/R/findOverlaps-methods.R @@ -231,43 +231,61 @@ setMethod("findOverlaps", c("GRangesFactor", "GenomicRanges"), function(query, s maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) { - idx <- as.integer(query) - query <- levels(query) - lev.hits <- callGeneric() - idx.hits <- findMatches(idx, queryHits(lev.hits)) - Hits(from=queryHits(idx.hits), to=subjectHits(lev.hits)[subjectHits(idx.hits)], - nLnode=length(idx), nRnode=length(subject), sort.by.query=TRUE) + if (length(query) < length(levels(query))) { + query <- unfactor(query) + callGeneric() + } else { + idx <- as.integer(query) + query <- levels(query) + lev.hits <- callGeneric() + idx.hits <- findMatches(idx, queryHits(lev.hits)) + Hits(from=queryHits(idx.hits), to=subjectHits(lev.hits)[subjectHits(idx.hits)], + nLnode=length(idx), nRnode=length(subject), sort.by.query=TRUE) + } }) setMethod("findOverlaps", c("GenomicRanges", "GRangesFactor"), function(query, subject, maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) { - idx <- as.integer(subject) - subject <- levels(subject) - lev.hits <- callGeneric() - idx.hits <- findMatches(subjectHits(lev.hits), idx) - Hits(from=queryHits(lev.hits)[queryHits(idx.hits)], to=subjectHits(idx.hits), - nLnode=length(query), nRnode=length(idx), sort.by.query=TRUE) + if (length(subject) < 
length(levels(subject))) { + subject <- unfactor(subject) + callGeneric() + } else { + idx <- as.integer(subject) + subject <- levels(subject) + lev.hits <- callGeneric() + idx.hits <- findMatches(subjectHits(lev.hits), idx) + Hits(from=queryHits(lev.hits)[queryHits(idx.hits)], to=subjectHits(idx.hits), + nLnode=length(query), nRnode=length(idx), sort.by.query=TRUE) + } }) setMethod("findOverlaps", c("GRangesFactor", "GRangesFactor"), function(query, subject, maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) { - q.idx <- as.integer(query) - query <- levels(query) - s.idx <- as.integer(subject) - subject <- levels(subject) + if (length(query) < length(levels(query))) { + query <- unfactor(query) + callGeneric() + } else if (length(subject) < length(levels(subject))) { + subject <- unfactor(subject) + callGeneric() + } else { + q.idx <- as.integer(query) + query <- levels(query) + s.idx <- as.integer(subject) + subject <- levels(subject) - lev.hits <- callGeneric() - q.idx.hits <- findMatches(q.idx, queryHits(lev.hits)) - s.idx.hits <- findMatches(subjectHits(lev.hits), s.idx) - reconciler <- findMatches(subjectHits(q.idx.hits), queryHits(s.idx.hits)) + lev.hits <- callGeneric() + q.idx.hits <- findMatches(q.idx, queryHits(lev.hits)) + s.idx.hits <- findMatches(subjectHits(lev.hits), s.idx) + reconciler <- findMatches(subjectHits(q.idx.hits), queryHits(s.idx.hits)) - Hits(from=queryHits(q.idx.hits)[queryHits(reconciler)], - to=subjectHits(s.idx.hits)[subjectHits(reconciler)], - nLnode=length(q.idx), nRnode=length(s.idx), sort.by.query=TRUE) + Hits(from=queryHits(q.idx.hits)[queryHits(reconciler)], + to=subjectHits(s.idx.hits)[subjectHits(reconciler)], + nLnode=length(q.idx), nRnode=length(s.idx), sort.by.query=TRUE) + } }) ### ========================================================================= diff --git a/inst/unitTests/test_findOverlaps-methods.R 
b/inst/unitTests/test_findOverlaps-methods.R index 48ec2b35..119078fd 100644 --- a/inst/unitTests/test_findOverlaps-methods.R +++ b/inst/unitTests/test_findOverlaps-methods.R @@ -360,4 +360,29 @@ test_findOverlaps_with_GRangesFactors <- function() { out <- findOverlaps(F0, F1, maxgap=2) ref <- findOverlaps(unfactor(F0), unfactor(F1), maxgap=2) checkIdentical(out, ref) + + # All methods work correctly with small Factors that cause unfactor()ing. + out <- findOverlaps(F0[1:2], gr1) + ref <- findOverlaps(unfactor(F0[1:2]), gr1) + checkIdentical(out, ref) + + out <- findOverlaps(F0[3], gr1) + ref <- findOverlaps(unfactor(F0[3]), gr1) + checkIdentical(out, ref) + + out <- findOverlaps(gr0, F1[4:3]) + ref <- findOverlaps(gr0, unfactor(F1[4:3])) + checkIdentical(sort(out), sort(ref)) + + out <- findOverlaps(gr0, F1[1]) + ref <- findOverlaps(gr0, unfactor(F1[1])) + checkIdentical(sort(out), sort(ref)) + + out <- findOverlaps(F0[10:13], F1) + ref <- findOverlaps(unfactor(F0[10:13]), unfactor(F1)) + checkIdentical(out, ref) + + out <- findOverlaps(F0, F1[2]) + ref <- findOverlaps(unfactor(F0), unfactor(F1[2])) + checkIdentical(out, ref) } From a572e80e8f2159ed6c5f2c2e217836c507b7a50c Mon Sep 17 00:00:00 2001 From: LTLA Date: Sat, 22 Jun 2019 10:46:42 -0700 Subject: [PATCH 5/8] Accommodate other choices of select=. 
--- R/findOverlaps-methods.R | 19 +++++++++++--- inst/unitTests/test_findOverlaps-methods.R | 30 ++++++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/R/findOverlaps-methods.R b/R/findOverlaps-methods.R index 9dd9b5e3..f1ca0b62 100644 --- a/R/findOverlaps-methods.R +++ b/R/findOverlaps-methods.R @@ -237,10 +237,15 @@ setMethod("findOverlaps", c("GRangesFactor", "GenomicRanges"), function(query, s } else { idx <- as.integer(query) query <- levels(query) + + select0 <- match.arg(select) + select <- "all" lev.hits <- callGeneric() idx.hits <- findMatches(idx, queryHits(lev.hits)) - Hits(from=queryHits(idx.hits), to=subjectHits(lev.hits)[subjectHits(idx.hits)], + + hits <- Hits(from=queryHits(idx.hits), to=subjectHits(lev.hits)[subjectHits(idx.hits)], nLnode=length(idx), nRnode=length(subject), sort.by.query=TRUE) + selectHits(hits, select0) } }) @@ -254,10 +259,15 @@ setMethod("findOverlaps", c("GenomicRanges", "GRangesFactor"), function(query, s } else { idx <- as.integer(subject) subject <- levels(subject) + + select0 <- match.arg(select) + select <- "all" lev.hits <- callGeneric() idx.hits <- findMatches(subjectHits(lev.hits), idx) - Hits(from=queryHits(lev.hits)[queryHits(idx.hits)], to=subjectHits(idx.hits), + + hits <- Hits(from=queryHits(lev.hits)[queryHits(idx.hits)], to=subjectHits(idx.hits), nLnode=length(query), nRnode=length(idx), sort.by.query=TRUE) + selectHits(hits, select0) } }) @@ -277,14 +287,17 @@ setMethod("findOverlaps", c("GRangesFactor", "GRangesFactor"), function(query, s s.idx <- as.integer(subject) subject <- levels(subject) + select0 <- match.arg(select) + select <- "all" lev.hits <- callGeneric() q.idx.hits <- findMatches(q.idx, queryHits(lev.hits)) s.idx.hits <- findMatches(subjectHits(lev.hits), s.idx) reconciler <- findMatches(subjectHits(q.idx.hits), queryHits(s.idx.hits)) - Hits(from=queryHits(q.idx.hits)[queryHits(reconciler)], + hits <- Hits(from=queryHits(q.idx.hits)[queryHits(reconciler)], 
to=subjectHits(s.idx.hits)[subjectHits(reconciler)], nLnode=length(q.idx), nRnode=length(s.idx), sort.by.query=TRUE) + selectHits(hits, select0) } }) diff --git a/inst/unitTests/test_findOverlaps-methods.R b/inst/unitTests/test_findOverlaps-methods.R index 119078fd..23505d5d 100644 --- a/inst/unitTests/test_findOverlaps-methods.R +++ b/inst/unitTests/test_findOverlaps-methods.R @@ -334,6 +334,7 @@ test_findOverlaps_with_GRangesFactors <- function() { gr1 <- GRanges("chrA", ir1) F1 <- Factor(gr1[rep(seq_along(gr1), rev(seq_along(gr1)))]) + ###################### # findOverlaps works with a Factor as the query. out <- findOverlaps(F0, gr1) ref <- findOverlaps(unfactor(F0), gr1) @@ -361,6 +362,35 @@ test_findOverlaps_with_GRangesFactors <- function() { ref <- findOverlaps(unfactor(F0), unfactor(F1), maxgap=2) checkIdentical(out, ref) + ###################### + # All methods work with different settings for 'select'. + + out <- findOverlaps(F0, gr1, select="first") + ref <- findOverlaps(unfactor(F0), gr1, select="first") + checkIdentical(out, ref) + + out <- findOverlaps(F0, gr1, select="last") + ref <- findOverlaps(unfactor(F0), gr1, select="last") + checkIdentical(out, ref) + + out <- findOverlaps(gr0, F1, select="last") + ref <- findOverlaps(gr0, unfactor(F1), select="last") + checkIdentical(out, ref) + + out <- findOverlaps(gr0, F1, select="last") + ref <- findOverlaps(gr0, unfactor(F1), select="last") + checkIdentical(out, ref) + + out <- findOverlaps(F0, F1, select="first") + ref <- findOverlaps(unfactor(F0), unfactor(F1), select="first") + checkIdentical(out, ref) + + out <- findOverlaps(F0, F1, select="last") + ref <- findOverlaps(unfactor(F0), unfactor(F1), select="last") + checkIdentical(out, ref) + + + ###################### # All methods work correctly with small Factors that cause unfactor()ing. 
out <- findOverlaps(F0[1:2], gr1) ref <- findOverlaps(unfactor(F0[1:2]), gr1) From ddccec8fa0c918d2745057633f5c4d4b1bb521a6 Mon Sep 17 00:00:00 2001 From: LTLA Date: Sat, 22 Jun 2019 11:59:00 -0700 Subject: [PATCH 6/8] Allow GRFs to overlap GRLs. --- R/findOverlaps-methods.R | 52 ++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/R/findOverlaps-methods.R b/R/findOverlaps-methods.R index f1ca0b62..2be0523f 100644 --- a/R/findOverlaps-methods.R +++ b/R/findOverlaps-methods.R @@ -227,49 +227,49 @@ setMethod("findOverlaps", c("GenomicRanges", "GRangesList"), ### "findOverlaps" methods for GRangesFactor objects ### -setMethod("findOverlaps", c("GRangesFactor", "GenomicRanges"), function(query, subject, +.findOverlaps_Factor_other <- function(query, subject, maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) { if (length(query) < length(levels(query))) { - query <- unfactor(query) - callGeneric() + findOverlaps(unfactor(query), subject, maxgap=maxgap, minoverlap=minoverlap, + type=match.arg(type), select=match.arg(select), ignore.strand=ignore.strand) } else { - idx <- as.integer(query) - query <- levels(query) - - select0 <- match.arg(select) - select <- "all" - lev.hits <- callGeneric() - idx.hits <- findMatches(idx, queryHits(lev.hits)) + lev.hits <- findOverlaps(levels(query), subject, maxgap=maxgap, minoverlap=minoverlap, + type=match.arg(type), select="all", ignore.strand=ignore.strand) + idx.hits <- findMatches(as.integer(query), queryHits(lev.hits)) hits <- Hits(from=queryHits(idx.hits), to=subjectHits(lev.hits)[subjectHits(idx.hits)], - nLnode=length(idx), nRnode=length(subject), sort.by.query=TRUE) - selectHits(hits, select0) + nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE) + selectHits(hits, match.arg(select)) } -}) +} -setMethod("findOverlaps", c("GenomicRanges", "GRangesFactor"), function(query, 
subject, +setMethod("findOverlaps", c("GRangesFactor", "GenomicRanges"), .findOverlaps_Factor_other) + +setMethod("findOverlaps", c("GRangesFactor", "GRangesList"), .findOverlaps_Factor_other) + +.findOverlaps_other_Factor <- function(query, subject, maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) { if (length(subject) < length(levels(subject))) { - subject <- unfactor(subject) - callGeneric() + findOverlaps(query, unfactor(subject), maxgap=maxgap, minoverlap=minoverlap, + type=match.arg(type), select=match.arg(select), ignore.strand=ignore.strand) } else { - idx <- as.integer(subject) - subject <- levels(subject) - - select0 <- match.arg(select) - select <- "all" - lev.hits <- callGeneric() - idx.hits <- findMatches(subjectHits(lev.hits), idx) + lev.hits <- findOverlaps(query, levels(subject), maxgap=maxgap, minoverlap=minoverlap, + type=match.arg(type), select="all", ignore.strand=ignore.strand) + idx.hits <- findMatches(subjectHits(lev.hits), as.integer(subject)) hits <- Hits(from=queryHits(lev.hits)[queryHits(idx.hits)], to=subjectHits(idx.hits), - nLnode=length(query), nRnode=length(idx), sort.by.query=TRUE) - selectHits(hits, select0) + nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE) + selectHits(hits, match.arg(select)) } -}) +} + +setMethod("findOverlaps", c("GenomicRanges", "GRangesFactor"), .findOverlaps_other_Factor) + +setMethod("findOverlaps", c("GRangesList", "GRangesFactor"), .findOverlaps_other_Factor) setMethod("findOverlaps", c("GRangesFactor", "GRangesFactor"), function(query, subject, maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), From 2a4b8db1ea07cd3fc19ec539b5aa35732f09ec68 Mon Sep 17 00:00:00 2001 From: LTLA Date: Sun, 23 Jun 2019 09:46:30 -0700 Subject: [PATCH 7/8] Avoid creating Hits for efficiency when select!='all'. 
--- R/findOverlaps-methods.R | 103 ++++++++++++++++++++++++++------------- 1 file changed, 69 insertions(+), 34 deletions(-) diff --git a/R/findOverlaps-methods.R b/R/findOverlaps-methods.R index 2be0523f..6a7dbdea 100644 --- a/R/findOverlaps-methods.R +++ b/R/findOverlaps-methods.R @@ -231,17 +231,25 @@ setMethod("findOverlaps", c("GenomicRanges", "GRangesList"), maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE) { + select <- match.arg(select) + type <- match.arg(type) + FUN <- function(Query, Select) { + findOverlaps(Query, subject, maxgap=maxgap, minoverlap=minoverlap, + type=type, select=Select, ignore.strand=ignore.strand) + } + if (length(query) < length(levels(query))) { - findOverlaps(unfactor(query), subject, maxgap=maxgap, minoverlap=minoverlap, - type=match.arg(type), select=match.arg(select), ignore.strand=ignore.strand) + FUN(unfactor(query), Select=select) } else { - lev.hits <- findOverlaps(levels(query), subject, maxgap=maxgap, minoverlap=minoverlap, - type=match.arg(type), select="all", ignore.strand=ignore.strand) - idx.hits <- findMatches(as.integer(query), queryHits(lev.hits)) - - hits <- Hits(from=queryHits(idx.hits), to=subjectHits(lev.hits)[subjectHits(idx.hits)], - nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE) - selectHits(hits, match.arg(select)) + if (select=="all") { + lev.hits <- FUN(levels(query), "all") + idx.hits <- findMatches(as.integer(query), queryHits(lev.hits)) + Hits(from=queryHits(idx.hits), to=subjectHits(lev.hits)[subjectHits(idx.hits)], + nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE) + } else { + lev.hits <- FUN(levels(query), select) + lev.hits[as.integer(query)] + } } } @@ -253,17 +261,33 @@ setMethod("findOverlaps", c("GRangesFactor", "GRangesList"), .findOverlaps_Facto maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"), select=c("all", "first", "last", 
"arbitrary"), ignore.strand=FALSE) { + select <- match.arg(select) + type <- match.arg(type) + FUN <- function(Subject, Select) { + findOverlaps(query, Subject, maxgap=maxgap, minoverlap=minoverlap, + type=type, select=Select, ignore.strand=ignore.strand) + } + if (length(subject) < length(levels(subject))) { - findOverlaps(query, unfactor(subject), maxgap=maxgap, minoverlap=minoverlap, - type=match.arg(type), select=match.arg(select), ignore.strand=ignore.strand) + FUN(unfactor(subject), select) } else { - lev.hits <- findOverlaps(query, levels(subject), maxgap=maxgap, minoverlap=minoverlap, - type=match.arg(type), select="all", ignore.strand=ignore.strand) - idx.hits <- findMatches(subjectHits(lev.hits), as.integer(subject)) - - hits <- Hits(from=queryHits(lev.hits)[queryHits(idx.hits)], to=subjectHits(idx.hits), - nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE) - selectHits(hits, match.arg(select)) + if (select=="all") { + lev.hits <- FUN(levels(subject), "all") + idx.hits <- findMatches(subjectHits(lev.hits), as.integer(subject)) + Hits(from=queryHits(lev.hits)[queryHits(idx.hits)], to=subjectHits(idx.hits), + nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE) + } else { + s.idx <- as.integer(subject) + if (select=="first") { + # Get the index of the first range for each level. + u <- which(!duplicated(s.idx)) + } else { + # Get the index of the last range for each level. 
+ u <- which(!duplicated(s.idx, fromLast=TRUE)) + } + lev.hits <- FUN(levels(subject)[s.idx[u]], select) + u[lev.hits] + } } } @@ -282,22 +306,33 @@ setMethod("findOverlaps", c("GRangesFactor", "GRangesFactor"), function(query, s subject <- unfactor(subject) callGeneric() } else { - q.idx <- as.integer(query) - query <- levels(query) - s.idx <- as.integer(subject) - subject <- levels(subject) - - select0 <- match.arg(select) - select <- "all" - lev.hits <- callGeneric() - q.idx.hits <- findMatches(q.idx, queryHits(lev.hits)) - s.idx.hits <- findMatches(subjectHits(lev.hits), s.idx) - reconciler <- findMatches(subjectHits(q.idx.hits), queryHits(s.idx.hits)) - - hits <- Hits(from=queryHits(q.idx.hits)[queryHits(reconciler)], - to=subjectHits(s.idx.hits)[subjectHits(reconciler)], - nLnode=length(q.idx), nRnode=length(s.idx), sort.by.query=TRUE) - selectHits(hits, select0) + FUN <- function(Query, Subject, Select) { + findOverlaps(Query, Subject, maxgap=maxgap, minoverlap=minoverlap, + type=type, select=Select, ignore.strand=ignore.strand) + } + + select <- match.arg(select) + if (select=="all") { + lev.hits <- FUN(levels(query), levels(subject), "all") + q.idx.hits <- findMatches(as.integer(query), queryHits(lev.hits)) + s.idx.hits <- findMatches(subjectHits(lev.hits), as.integer(subject)) + reconciler <- findMatches(subjectHits(q.idx.hits), queryHits(s.idx.hits)) + + Hits(from=queryHits(q.idx.hits)[queryHits(reconciler)], + to=subjectHits(s.idx.hits)[subjectHits(reconciler)], + nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE) + } else { + s.idx <- as.integer(subject) + if (select=="first") { + # Get the index of the first range for each level. + u <- which(!duplicated(s.idx)) + } else { + # Get the index of the last range for each level. 
+ u <- which(!duplicated(s.idx, fromLast=TRUE)) + } + lev.hits <- FUN(levels(query), levels(subject)[s.idx[u]], select) + u[lev.hits][as.integer(query)] + } } }) From 6a4048197d946cb5a26d7b22a9c4799dceb1b018 Mon Sep 17 00:00:00 2001 From: LTLA Date: Sun, 23 Jun 2019 09:46:41 -0700 Subject: [PATCH 8/8] Minor testfixes. --- inst/unitTests/test_findOverlaps-methods.R | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/inst/unitTests/test_findOverlaps-methods.R b/inst/unitTests/test_findOverlaps-methods.R index 23505d5d..703560ed 100644 --- a/inst/unitTests/test_findOverlaps-methods.R +++ b/inst/unitTests/test_findOverlaps-methods.R @@ -373,8 +373,8 @@ test_findOverlaps_with_GRangesFactors <- function() { ref <- findOverlaps(unfactor(F0), gr1, select="last") checkIdentical(out, ref) - out <- findOverlaps(gr0, F1, select="last") - ref <- findOverlaps(gr0, unfactor(F1), select="last") + out <- findOverlaps(gr0, F1, select="first") + ref <- findOverlaps(gr0, unfactor(F1), select="first") checkIdentical(out, ref) out <- findOverlaps(gr0, F1, select="last") @@ -389,7 +389,6 @@ test_findOverlaps_with_GRangesFactors <- function() { ref <- findOverlaps(unfactor(F0), unfactor(F1), select="last") checkIdentical(out, ref) - ###################### # All methods work correctly with small Factors that cause unfactor()ing. out <- findOverlaps(F0[1:2], gr1)