Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Specialize findOverlaps for GRangesFactor objects #28

Open
wants to merge 9 commits into
base: devel
Choose a base branch
from
112 changes: 112 additions & 0 deletions R/findOverlaps-methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,118 @@ setMethod("findOverlaps", c("GenomicRanges", "GRangesList"),
}
)

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### "findOverlaps" methods for GRangesFactor objects
###

.findOverlaps_Factor_other <- function(query, subject,
maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"),
select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE)
{
select <- match.arg(select)
type <- match.arg(type)
FUN <- function(Query, Select) {
findOverlaps(Query, subject, maxgap=maxgap, minoverlap=minoverlap,
type=type, select=Select, ignore.strand=ignore.strand)
}

if (length(query) < length(levels(query))) {
FUN(unfactor(query), Select=select)
} else {
if (select=="all") {
lev.hits <- FUN(levels(query), "all")
idx.hits <- findMatches(as.integer(query), queryHits(lev.hits))
Hits(from=queryHits(idx.hits), to=subjectHits(lev.hits)[subjectHits(idx.hits)],
nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE)
} else {
lev.hits <- FUN(levels(query), select)
lev.hits[as.integer(query)]
}
}
}

setMethod("findOverlaps", c("GRangesFactor", "GenomicRanges"), .findOverlaps_Factor_other)

setMethod("findOverlaps", c("GRangesFactor", "GRangesList"), .findOverlaps_Factor_other)

.findOverlaps_other_Factor <- function(query, subject,
maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"),
select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE)
{
select <- match.arg(select)
type <- match.arg(type)
FUN <- function(Subject, Select) {
findOverlaps(query, Subject, maxgap=maxgap, minoverlap=minoverlap,
type=type, select=Select, ignore.strand=ignore.strand)
}

if (length(subject) < length(levels(subject))) {
FUN(unfactor(subject), select)
} else {
if (select=="all") {
lev.hits <- FUN(levels(subject), "all")
idx.hits <- findMatches(subjectHits(lev.hits), as.integer(subject))
Hits(from=queryHits(lev.hits)[queryHits(idx.hits)], to=subjectHits(idx.hits),
nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE)
} else {
s.idx <- as.integer(subject)
if (select=="first") {
# Get the index of the first range for each level.
u <- which(!duplicated(s.idx))
} else {
# Get the index of the last range for each level.
u <- which(!duplicated(s.idx, fromLast=TRUE))
}
lev.hits <- FUN(levels(subject)[s.idx[u]], select)
u[lev.hits]
}
}
}

setMethod("findOverlaps", c("GenomicRanges", "GRangesFactor"), .findOverlaps_other_Factor)

setMethod("findOverlaps", c("GRangesList", "GRangesFactor"), .findOverlaps_other_Factor)

setMethod("findOverlaps", c("GRangesFactor", "GRangesFactor"), function(query, subject,
maxgap=-1L, minoverlap=0L, type=c("any", "start", "end", "within", "equal"),
select=c("all", "first", "last", "arbitrary"), ignore.strand=FALSE)
{
if (length(query) < length(levels(query))) {
query <- unfactor(query)
callGeneric()
} else if (length(subject) < length(levels(subject))) {
subject <- unfactor(subject)
callGeneric()
} else {
FUN <- function(Query, Subject, Select) {
findOverlaps(Query, Subject, maxgap=maxgap, minoverlap=minoverlap,
type=type, select=Select, ignore.strand=ignore.strand)
}

select <- match.arg(select)
if (select=="all") {
lev.hits <- FUN(levels(query), levels(subject), "all")
q.idx.hits <- findMatches(as.integer(query), queryHits(lev.hits))
s.idx.hits <- findMatches(subjectHits(lev.hits), as.integer(subject))
reconciler <- findMatches(subjectHits(q.idx.hits), queryHits(s.idx.hits))

Hits(from=queryHits(q.idx.hits)[queryHits(reconciler)],
to=subjectHits(s.idx.hits)[subjectHits(reconciler)],
nLnode=length(query), nRnode=length(subject), sort.by.query=TRUE)
} else {
s.idx <- as.integer(subject)
if (select=="first") {
# Get the index of the first range for each level.
u <- which(!duplicated(s.idx))
} else {
# Get the index of the last range for each level.
u <- which(!duplicated(s.idx, fromLast=TRUE))
}
lev.hits <- FUN(levels(query), levels(subject)[s.idx[u]], select)
u[lev.hits][as.integer(query)]
}
}
})

### =========================================================================
### findOverlaps-based methods
Expand Down
91 changes: 91 additions & 0 deletions inst/unitTests/test_findOverlaps-methods.R
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,97 @@ test_findOverlaps_with_circular_sequences <- function()
.checkHits(1:4, 1:4, 4, 4, current5, select="all")
}

test_findOverlaps_with_GRangesFactors <- function() {
ir0 <- IRanges(c(5, 25, 20, 30, 45, 35, 10, 15), width=10)
gr0 <- GRanges("chrA", ir0)
F0 <- Factor(gr0[rep(seq_along(gr0), seq_along(gr0))])

ir1 <- IRanges(c(18, 8, 28, 38), width=5)
gr1 <- GRanges("chrA", ir1)
F1 <- Factor(gr1[rep(seq_along(gr1), rev(seq_along(gr1)))])

######################
# findOverlaps works with a Factor as the query.
out <- findOverlaps(F0, gr1)
ref <- findOverlaps(unfactor(F0), gr1)
checkIdentical(out, ref)

out <- findOverlaps(F0, gr1, minoverlap=4)
ref <- findOverlaps(unfactor(F0), gr1, minoverlap=4)
checkIdentical(out, ref)

# findOverlaps works with a Factor as the subject.
out <- findOverlaps(gr0, F1)
ref <- findOverlaps(gr0, unfactor(F1))
checkIdentical(sort(out), sort(ref)) # hack to overcome lack of subject sorting guarantees.

out <- findOverlaps(gr0, F1, maxgap=4)
ref <- findOverlaps(gr0, unfactor(F1), maxgap=4)
checkIdentical(sort(out), sort(ref)) # hack to overcome lack of subject sorting guarantees.

# findOverlaps works with two Factors.
out <- findOverlaps(F0, F1)
ref <- findOverlaps(unfactor(F0), unfactor(F1))
checkIdentical(out, ref)

out <- findOverlaps(F0, F1, maxgap=2)
ref <- findOverlaps(unfactor(F0), unfactor(F1), maxgap=2)
checkIdentical(out, ref)

######################
# All methods work with different settings for 'select'.

out <- findOverlaps(F0, gr1, select="first")
ref <- findOverlaps(unfactor(F0), gr1, select="first")
checkIdentical(out, ref)

out <- findOverlaps(F0, gr1, select="last")
ref <- findOverlaps(unfactor(F0), gr1, select="last")
checkIdentical(out, ref)

out <- findOverlaps(gr0, F1, select="first")
ref <- findOverlaps(gr0, unfactor(F1), select="first")
checkIdentical(out, ref)

out <- findOverlaps(gr0, F1, select="last")
ref <- findOverlaps(gr0, unfactor(F1), select="last")
checkIdentical(out, ref)

out <- findOverlaps(F0, F1, select="first")
ref <- findOverlaps(unfactor(F0), unfactor(F1), select="first")
checkIdentical(out, ref)

out <- findOverlaps(F0, F1, select="last")
ref <- findOverlaps(unfactor(F0), unfactor(F1), select="last")
checkIdentical(out, ref)

######################
# All methods work correctly with small Factors that cause unfactor()ing.
out <- findOverlaps(F0[1:2], gr1)
ref <- findOverlaps(unfactor(F0[1:2]), gr1)
checkIdentical(out, ref)

out <- findOverlaps(F0[3], gr1)
ref <- findOverlaps(unfactor(F0[3]), gr1)
checkIdentical(out, ref)

out <- findOverlaps(gr0, F1[4:3])
ref <- findOverlaps(gr0, unfactor(F1[4:3]))
checkIdentical(sort(out), sort(ref))

out <- findOverlaps(gr0, F1[1])
ref <- findOverlaps(gr0, unfactor(F1[1]))
checkIdentical(sort(out), sort(ref))

out <- findOverlaps(F0[10:13], F1)
ref <- findOverlaps(unfactor(F0[10:13]), unfactor(F1))
checkIdentical(out, ref)

out <- findOverlaps(F0, F1[2])
ref <- findOverlaps(unfactor(F0), unfactor(F1[2]))
checkIdentical(out, ref)
}

test_poverlaps <- function() {
ans <- poverlaps(GRanges(), GRanges())
checkIdentical(ans, Rle())
Expand Down
7 changes: 7 additions & 0 deletions man/findOverlaps-methods.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
\alias{findOverlaps,GRangesList,GenomicRanges-method}
\alias{findOverlaps,GenomicRanges,GRangesList-method}
\alias{findOverlaps,GRangesList,GRangesList-method}
\alias{findOverlaps,GRangesFactor,GenomicRanges-method}
\alias{findOverlaps,GenomicRanges,GRangesFactor-method}
\alias{findOverlaps,GRangesFactor,GRangesFactor-method}

\alias{countOverlaps}
\alias{countOverlaps,GenomicRanges,GenomicRanges-method}
Expand Down Expand Up @@ -97,6 +100,10 @@
For \code{type="equal"} with GRangesList objects, \code{query[[i]]}
matches \code{subject[[j]]} iff for each range in \code{query[[i]]}
there is an identical range in \code{subject[[j]]}, and vice versa.

If either or both \code{query} or \code{subject} are \link{GRangesFactor}
objects, overlaps are identified based on the unique levels. This improves
the efficiency of this function for large GRangesFactors with few levels.
}

\value{
Expand Down