Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

making changes to reduce size of giant interval lists #1309

Merged
merged 3 commits into from
Mar 4, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion src/main/java/htsjdk/samtools/util/IntervalTree.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,13 @@
*/
package htsjdk.samtools.util;

import htsjdk.utils.ValidationUtils;

import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.function.BiFunction;

/**
* A Red-Black tree with intervals for keys.
Expand Down Expand Up @@ -56,6 +60,8 @@ public void clear()
mRoot = null;
}


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

extra nls

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done


/**
* Put a new interval into the tree (or update the value associated with an existing interval).
* If the interval is novel, the special sentinel value is returned.
Expand All @@ -64,7 +70,6 @@ public void clear()
* @param value The associated value.
* @return The old value associated with that interval, or the sentinel.
*/
@SuppressWarnings("null")
public V put( final int start, final int end, final V value )
{
if ( start > end )
Expand Down Expand Up @@ -114,6 +119,34 @@ public V put( final int start, final int end, final V value )
return result;
}

/**
* If the specified start and end positions are not already associated with a value or are
* associated with the sentinel ( see {@link #getSentinel()}, associates it with the given (non-sentinel) value.
* Otherwise, replaces the associated value with the results of the given
* remapping function, or removes if the result is equal to the sentinel value. This
* method may be of use when combining multiple values that have the same start and end position.
*
* @param start interval start position
* @param end interval end position
* @param value value to merge into the tree, must not be equal to the sentinel value
* @param remappingFunction a function that merges the new value with the existing value for the same start and end position
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note that if function returns null the region will be left empty

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

* @return the updated value that is now stored in the tree or sentinel
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

now is not clear enough...(does it refer to before or after the application of the method?)
perhaps the updated value that is eventually stored in the tree or the sentinel value if nothing ends up being stored

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I went with a slight modification:

the updated value that is stored in the tree after the completion of this merge operation, this will be the sentinel value if nothing ended up being stored
``

*/
public V merge(int start, int end, V value, BiFunction<? super V, ? super V, ? extends V> remappingFunction) {
ValidationUtils.validateArg(!Objects.equals(value, mSentinel), "Values equal to the sentinel value may not be merged");
final V alreadyPresent = put(start, end, value);
if (!Objects.equals(alreadyPresent, mSentinel)) {
final V newComputedValue = remappingFunction.apply(value, alreadyPresent);
if(Objects.equals(newComputedValue, mSentinel)){
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

spaces

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

remove(start, end);
} else {
put(start, end, newComputedValue);
}
return newComputedValue;
}
return value;
}

/**
* Remove an interval from the tree. If the interval does not exist in the tree the
* special sentinel value is returned.
Expand Down
39 changes: 25 additions & 14 deletions src/main/java/htsjdk/samtools/util/OverlapDetector.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
package htsjdk.samtools.util;

import java.util.*;
import java.util.function.BiFunction;

/**
* Utility class to efficiently do in memory overlap detection between a large
Expand Down Expand Up @@ -80,25 +81,35 @@ public void addLhs(final T object, final Locatable interval) {
}

final int start = interval.getStart() + this.lhsBuffer;
final int end = interval.getEnd() - this.lhsBuffer;
final int end = interval.getEnd() - this.lhsBuffer;

final Set<T> newValue = Collections.singleton(object);
if (start <= end) { // Don't put in sequences that have no overlappable bases
final Set<T> alreadyThere = tree.put(start, end, newValue);
if (alreadyThere != null) {
if( alreadyThere.size() == 1){
Set<T> mutableSet = new HashSet<>(2);
mutableSet.addAll(alreadyThere);
mutableSet.add(object);
tree.put(start, end, mutableSet);
} else {
alreadyThere.add(object);
tree.put(start, end, alreadyThere);
}
}
tree.merge(start, end,
Collections.singleton(object),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wierd spacing

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

mergeSetsAccountingForSingletons());
}
}

/**
* merge two Sets, assumes sets of size 1 are immutale
*/
private static <T> BiFunction<Set<T>, Set<T>, Set<T>> mergeSetsAccountingForSingletons() {
return (newValue, oldValue) -> {
// Sets of size 1 are immutable SingletonSets so we have to make a new
// mutable one to add to
if (oldValue.size() == 1) {
Set<T> newMutableSet = new HashSet<>();
Copy link
Contributor

@yfarjoun yfarjoun Mar 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can replace entire if statement with:

final Set<T> newMutableSet = oldValue.size() == 1 ? new HashSet<>() : oldValue;
newMutableSet.addAll(oldValue);
newMutableSet.addAll(newValue);
return newMutableSet;

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh right, clever, it's a set so you can unconditionally add the same thing over again...

newMutableSet.addAll(oldValue);
newMutableSet.addAll(newValue);
return newMutableSet;
// otherwise it's already a HashSet and we can just add values to it
} else {
oldValue.addAll(newValue);
return oldValue;
}
};
}

/**
* Adds all items to the overlap detector.
*
Expand Down
32 changes: 32 additions & 0 deletions src/test/java/htsjdk/samtools/util/IntervalTreeTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,12 @@
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.UnaryOperator;

import static htsjdk.samtools.util.IntervalTree.Node.HAS_OVERLAPPING_PART;

Expand Down Expand Up @@ -310,4 +315,31 @@ public void testRemove() {
}


@DataProvider
public Object[][] getMergeTestCases(){
return new Object[][]{
{add, null, Arrays.asList(0, 1, 2 , 3), 6},
{add, null, Arrays.asList(0, 1, 2), 3},
{add, null, Arrays.asList(0, 1), 1},
{add, null, Collections.singletonList(0), 0},
{addButCountNullAs10, null, Arrays.asList(1, 2, 3), 6},
{addButCountNullAs10, 0, Arrays.asList(1, 2, null), 13},
{addButCountNullAs10, 0, Arrays.asList(null, null, null), 30}
};
}

private static final BiFunction<Integer, Integer, Integer> add = (a, b) -> a + b;
private static final BiFunction<Integer, Integer, Integer> addButCountNullAs10 = (a, b) -> (a == null ? 10 : a) + (b == null ? 10 : b);

@Test(dataProvider = "getMergeTestCases")
public void testMerge( BiFunction<Integer, Integer, Integer> function, Integer sentinel, List<Integer> values, Integer expected){
final IntervalTree<Integer> tree = new IntervalTree<>();
tree.setSentinel(sentinel);
values.forEach(value -> {
tree.merge(10, 20, value, function);
});
final Iterator<IntervalTree.Node<Integer>> iterator = tree.iterator();
Assert.assertEquals(iterator.next().getValue(), expected);
Assert.assertFalse(iterator.hasNext());
}
}
38 changes: 38 additions & 0 deletions src/test/java/htsjdk/samtools/util/OverlapDetectorTest.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package htsjdk.samtools.util;

import htsjdk.HtsjdkTest;
import htsjdk.tribble.SimpleFeature;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
Expand Down Expand Up @@ -238,4 +239,41 @@ public void testAddTwice() throws Exception {
Assert.assertEquals(overlaps.size(), 1);
Assert.assertEquals(overlaps, Collections.singleton(new Interval("1",10,100)));
}

@Test
public void testMultipleIntervalsSameCoordinates(){
final List<Locatable> input = Arrays.asList(
new IntervalWithNameInEquals("1", 10, 100, false, "same coords 1"),
new IntervalWithNameInEquals("1",10,100, true, "same coords 2"),
new IntervalWithNameInEquals("1", 10, 100, true, "same coords 3"),
new IntervalWithNameInEquals("1", 150, 250, false, "not same coordinates")
);
final OverlapDetector<Locatable> detector = OverlapDetector.create(input);
final Set<Locatable> overlaps = detector.getOverlaps(new Interval("1", 50, 200));
Assert.assertEquals(overlaps.size(), 4);
Assert.assertEquals(overlaps, new HashSet<>(input));
}

/**
* small class to allow creating intervals that have the same coordinates but don't evaluate as equal to each other
*/
private static class IntervalWithNameInEquals extends Interval{

public IntervalWithNameInEquals(String sequence, int start, int end, boolean negative, String name) {
super(sequence, start, end, negative, name);
}

@Override
public boolean equals(Object other){
if (super.equals(other)) {
return getName().equals(((Interval)other).getName());
}
return false;
}

@Override
public int hashCode(){
return super.hashCode() * getName().hashCode();
}
}
}