From e47374adf3b9f27690227be19724dcc870e54056 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Fri, 22 Feb 2019 17:14:59 -0500 Subject: [PATCH 1/3] making changes to reduce size of giant interval lists --- .../java/htsjdk/samtools/util/IntervalList.java | 8 +++++++- .../htsjdk/samtools/util/OverlapDetector.java | 16 +++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/src/main/java/htsjdk/samtools/util/IntervalList.java b/src/main/java/htsjdk/samtools/util/IntervalList.java index 31c0d552d1..88a0626695 100644 --- a/src/main/java/htsjdk/samtools/util/IntervalList.java +++ b/src/main/java/htsjdk/samtools/util/IntervalList.java @@ -531,6 +531,7 @@ public static IntervalList fromReader(final BufferedReader in) { // Then read in the intervals final FormatUtil format = new FormatUtil(); + String lastSeq = null; do { if (line.trim().isEmpty()) { continue; // skip over blank lines @@ -544,7 +545,12 @@ public static IntervalList fromReader(final BufferedReader in) { } // Then parse them out - final String seq = fields[SEQUENCE_POS]; + String seq = fields[SEQUENCE_POS]; + if (seq.equals(lastSeq)) { + seq = lastSeq; + } + lastSeq = seq; + final int start = format.parseInt(fields[START_POS]); final int end = format.parseInt(fields[END_POS]); if (start < 1) { diff --git a/src/main/java/htsjdk/samtools/util/OverlapDetector.java b/src/main/java/htsjdk/samtools/util/OverlapDetector.java index ba177b73ea..11fa40c86d 100644 --- a/src/main/java/htsjdk/samtools/util/OverlapDetector.java +++ b/src/main/java/htsjdk/samtools/util/OverlapDetector.java @@ -82,13 +82,19 @@ public void addLhs(final T object, final Locatable interval) { final int start = interval.getStart() + this.lhsBuffer; final int end = interval.getEnd() - this.lhsBuffer; - final Set objects = new HashSet<>(1); - objects.add(object); + final Set newValue = Collections.singleton(object); if (start <= end) { // Don't put in sequences that have no overlappable bases - final Set alreadyThere = tree.put(start, end, objects); + final Set alreadyThere = tree.put(start, end, newValue); if (alreadyThere != null) { - alreadyThere.add(object); - tree.put(start, end, alreadyThere); + if( alreadyThere.size() == 1){ + Set mutableSet = new HashSet<>(2); + mutableSet.addAll(alreadyThere); + mutableSet.add(object); + tree.put(start, end, mutableSet); + } else { + alreadyThere.add(object); + tree.put(start, end, alreadyThere); + } } } } From 8486fc0c0b872335c5c8d5ce8106fdcb86a61074 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Fri, 1 Mar 2019 14:22:41 -0500 Subject: [PATCH 2/3] responding to comments --- .../htsjdk/samtools/util/IntervalTree.java | 35 ++++++++++++++++- .../htsjdk/samtools/util/OverlapDetector.java | 39 ++++++++++++------- .../samtools/util/IntervalTreeTest.java | 32 +++++++++++++++ .../samtools/util/OverlapDetectorTest.java | 38 ++++++++++++++++++ 4 files changed, 129 insertions(+), 15 deletions(-) diff --git a/src/main/java/htsjdk/samtools/util/IntervalTree.java b/src/main/java/htsjdk/samtools/util/IntervalTree.java index 3efc4dfbbd..423c5ed0a7 100644 --- a/src/main/java/htsjdk/samtools/util/IntervalTree.java +++ b/src/main/java/htsjdk/samtools/util/IntervalTree.java @@ -23,9 +23,13 @@ */ package htsjdk.samtools.util; +import htsjdk.utils.ValidationUtils; + import java.util.ConcurrentModificationException; import java.util.Iterator; import java.util.NoSuchElementException; +import java.util.Objects; +import java.util.function.BiFunction; /** * A Red-Black tree with intervals for keys. @@ -56,6 +60,8 @@ public void clear() mRoot = null; } + + /** * Put a new interval into the tree (or update the value associated with an existing interval). * If the interval is novel, the special sentinel value is returned. @@ -64,7 +70,6 @@ public void clear() * @param value The associated value. * @return The old value associated with that interval, or the sentinel. */ - @SuppressWarnings("null") public V put( final int start, final int end, final V value ) { if ( start > end ) @@ -114,6 +119,34 @@ public V put( final int start, final int end, final V value ) return result; } + /** + * If the specified start and end positions are not already associated with a value or are + * associated with the sentinel ( see {@link #getSentinel()}, associates it with the given (non-sentinel) value. + * Otherwise, replaces the associated value with the results of the given + * remapping function, or removes if the result is equal to the sentinel value. This + * method may be of use when combining multiple values that have the same start and end position. + * + * @param start interval start position + * @param end interval end position + * @param value value to merge into the tree, must not be equal to the sentinel value + * @param remappingFunction a function that merges the new value with the existing value for the same start and end position + * @return the updated value that is now stored in the tree or sentinel + */ + public V merge(int start, int end, V value, BiFunction remappingFunction) { + ValidationUtils.validateArg(!Objects.equals(value, mSentinel), "Values equal to the sentinel value may not be merged"); + final V alreadyPresent = put(start, end, value); + if (!Objects.equals(alreadyPresent, mSentinel)) { + final V newComputedValue = remappingFunction.apply(value, alreadyPresent); + if(Objects.equals(newComputedValue, mSentinel)){ + remove(start, end); + } else { + put(start, end, newComputedValue); + } + return newComputedValue; + } + return value; + } + /** * Remove an interval from the tree. If the interval does not exist in the tree the * special sentinel value is returned. diff --git a/src/main/java/htsjdk/samtools/util/OverlapDetector.java b/src/main/java/htsjdk/samtools/util/OverlapDetector.java index 11fa40c86d..2663d1cb77 100644 --- a/src/main/java/htsjdk/samtools/util/OverlapDetector.java +++ b/src/main/java/htsjdk/samtools/util/OverlapDetector.java @@ -24,6 +24,7 @@ package htsjdk.samtools.util; import java.util.*; +import java.util.function.BiFunction; /** * Utility class to efficiently do in memory overlap detection between a large @@ -80,25 +81,35 @@ public void addLhs(final T object, final Locatable interval) { } final int start = interval.getStart() + this.lhsBuffer; - final int end = interval.getEnd() - this.lhsBuffer; + final int end = interval.getEnd() - this.lhsBuffer; - final Set newValue = Collections.singleton(object); if (start <= end) { // Don't put in sequences that have no overlappable bases - final Set alreadyThere = tree.put(start, end, newValue); - if (alreadyThere != null) { - if( alreadyThere.size() == 1){ - Set mutableSet = new HashSet<>(2); - mutableSet.addAll(alreadyThere); - mutableSet.add(object); - tree.put(start, end, mutableSet); - } else { - alreadyThere.add(object); - tree.put(start, end, alreadyThere); - } - } + tree.merge(start, end, + Collections.singleton(object), + mergeSetsAccountingForSingletons()); } } + /** + * merge two Sets, assumes sets of size 1 are immutale + */ + private static BiFunction, Set, Set> mergeSetsAccountingForSingletons() { + return (newValue, oldValue) -> { + // Sets of size 1 are immutable SingletonSets so we have to make a new + // mutable one to add to + if (oldValue.size() == 1) { + Set newMutableSet = new HashSet<>(); + newMutableSet.addAll(oldValue); + newMutableSet.addAll(newValue); + return newMutableSet; + // otherwise it's already a HashSet and we can just add values to it + } else { + oldValue.addAll(newValue); + return oldValue; + } + }; + } + /** * Adds all items to the overlap detector. * diff --git a/src/test/java/htsjdk/samtools/util/IntervalTreeTest.java b/src/test/java/htsjdk/samtools/util/IntervalTreeTest.java index dcd225ec06..ff35659c62 100644 --- a/src/test/java/htsjdk/samtools/util/IntervalTreeTest.java +++ b/src/test/java/htsjdk/samtools/util/IntervalTreeTest.java @@ -29,7 +29,12 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.Arrays; +import java.util.Collections; import java.util.Iterator; +import java.util.List; +import java.util.function.BiFunction; +import java.util.function.UnaryOperator; import static htsjdk.samtools.util.IntervalTree.Node.HAS_OVERLAPPING_PART; @@ -310,4 +315,31 @@ public void testRemove() { } + @DataProvider + public Object[][] getMergeTestCases(){ + return new Object[][]{ + {add, null, Arrays.asList(0, 1, 2 , 3), 6}, + {add, null, Arrays.asList(0, 1, 2), 3}, + {add, null, Arrays.asList(0, 1), 1}, + {add, null, Collections.singletonList(0), 0}, + {addButCountNullAs10, null, Arrays.asList(1, 2, 3), 6}, + {addButCountNullAs10, 0, Arrays.asList(1, 2, null), 13}, + {addButCountNullAs10, 0, Arrays.asList(null, null, null), 30} + }; + } + + private static final BiFunction add = (a, b) -> a + b; + private static final BiFunction addButCountNullAs10 = (a, b) -> (a == null ? 10 : a) + (b == null ? 10 : b); + + @Test(dataProvider = "getMergeTestCases") + public void testMerge( BiFunction function, Integer sentinel, List values, Integer expected){ + final IntervalTree tree = new IntervalTree<>(); + tree.setSentinel(sentinel); + values.forEach(value -> { + tree.merge(10, 20, value, function); + }); + final Iterator> iterator = tree.iterator(); + Assert.assertEquals(iterator.next().getValue(), expected); + Assert.assertFalse(iterator.hasNext()); + } } diff --git a/src/test/java/htsjdk/samtools/util/OverlapDetectorTest.java b/src/test/java/htsjdk/samtools/util/OverlapDetectorTest.java index d8adf2e2da..9f1fae162d 100644 --- a/src/test/java/htsjdk/samtools/util/OverlapDetectorTest.java +++ b/src/test/java/htsjdk/samtools/util/OverlapDetectorTest.java @@ -1,6 +1,7 @@ package htsjdk.samtools.util; import htsjdk.HtsjdkTest; +import htsjdk.tribble.SimpleFeature; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -238,4 +239,41 @@ public void testAddTwice() throws Exception { Assert.assertEquals(overlaps.size(), 1); Assert.assertEquals(overlaps, Collections.singleton(new Interval("1",10,100))); } + + @Test + public void testMultipleIntervalsSameCoordinates(){ + final List input = Arrays.asList( + new IntervalWithNameInEquals("1", 10, 100, false, "same coords 1"), + new IntervalWithNameInEquals("1",10,100, true, "same coords 2"), + new IntervalWithNameInEquals("1", 10, 100, true, "same coords 3"), + new IntervalWithNameInEquals("1", 150, 250, false, "not same coordinates") + ); + final OverlapDetector detector = OverlapDetector.create(input); + final Set overlaps = detector.getOverlaps(new Interval("1", 50, 200)); + Assert.assertEquals(overlaps.size(), 4); + Assert.assertEquals(overlaps, new HashSet<>(input)); + } + + /** + * small class to allow creating intervals that have the same coordinates but don't evaluate as equal to each other + */ + private static class IntervalWithNameInEquals extends Interval{ + + public IntervalWithNameInEquals(String sequence, int start, int end, boolean negative, String name) { + super(sequence, start, end, negative, name); + } + + @Override + public boolean equals(Object other){ + if (super.equals(other)) { + return getName().equals(((Interval)other).getName()); + } + return false; + } + + @Override + public int hashCode(){ + return super.hashCode() * getName().hashCode(); + } + } } From 05400bfdf6475e4c992fbd2278c50ecafee61d65 Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Mon, 4 Mar 2019 11:04:13 -0500 Subject: [PATCH 3/3] respond to comments again --- .../htsjdk/samtools/util/IntervalTree.java | 10 +++++----- .../htsjdk/samtools/util/OverlapDetector.java | 18 +++++------------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/src/main/java/htsjdk/samtools/util/IntervalTree.java b/src/main/java/htsjdk/samtools/util/IntervalTree.java index 423c5ed0a7..c79b99c1ec 100644 --- a/src/main/java/htsjdk/samtools/util/IntervalTree.java +++ b/src/main/java/htsjdk/samtools/util/IntervalTree.java @@ -60,8 +60,6 @@ public void clear() mRoot = null; } - - /** * Put a new interval into the tree (or update the value associated with an existing interval). * If the interval is novel, the special sentinel value is returned. @@ -129,15 +127,17 @@ public V put( final int start, final int end, final V value ) * @param start interval start position * @param end interval end position * @param value value to merge into the tree, must not be equal to the sentinel value - * @param remappingFunction a function that merges the new value with the existing value for the same start and end position - * @return the updated value that is now stored in the tree or sentinel + * @param remappingFunction a function that merges the new value with the existing value for the same start and end position, + * if the function returns the sentinel value then the mapping will be unset + * @return the updated value that is stored in the tree after the completion of this merge operation, this will + * be the sentinel value if nothing ended up being stored */ public V merge(int start, int end, V value, BiFunction remappingFunction) { ValidationUtils.validateArg(!Objects.equals(value, mSentinel), "Values equal to the sentinel value may not be merged"); final V alreadyPresent = put(start, end, value); if (!Objects.equals(alreadyPresent, mSentinel)) { final V newComputedValue = remappingFunction.apply(value, alreadyPresent); - if(Objects.equals(newComputedValue, mSentinel)){ + if (Objects.equals(newComputedValue, mSentinel)) { remove(start, end); } else { put(start, end, newComputedValue); diff --git a/src/main/java/htsjdk/samtools/util/OverlapDetector.java b/src/main/java/htsjdk/samtools/util/OverlapDetector.java index 2663d1cb77..3d2f662500 100644 --- a/src/main/java/htsjdk/samtools/util/OverlapDetector.java +++ b/src/main/java/htsjdk/samtools/util/OverlapDetector.java @@ -84,9 +84,7 @@ public void addLhs(final T object, final Locatable interval) { final int end = interval.getEnd() - this.lhsBuffer; if (start <= end) { // Don't put in sequences that have no overlappable bases - tree.merge(start, end, - Collections.singleton(object), - mergeSetsAccountingForSingletons()); + tree.merge(start, end, Collections.singleton(object), mergeSetsAccountingForSingletons()); } } @@ -97,16 +95,10 @@ private static BiFunction, Set, Set> mergeSetsAccountingForSing return (newValue, oldValue) -> { // Sets of size 1 are immutable SingletonSets so we have to make a new // mutable one to add to - if (oldValue.size() == 1) { - Set newMutableSet = new HashSet<>(); - newMutableSet.addAll(oldValue); - newMutableSet.addAll(newValue); - return newMutableSet; - // otherwise it's already a HashSet and we can just add values to it - } else { - oldValue.addAll(newValue); - return oldValue; - } + final Set mutableSet = oldValue.size() == 1 ? new HashSet<>() : oldValue; + mutableSet.addAll(oldValue); + mutableSet.addAll(newValue); + return mutableSet; }; }