-
-
Notifications
You must be signed in to change notification settings - Fork 70
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: add optical duplicate tagging to GroupReadsByUmi
- Loading branch information
Showing
12 changed files
with
1,438 additions
and
4 deletions.
There are no files selected for viewing
340 changes: 340 additions & 0 deletions
340
src/main/java/com/fulcrumgenomics/illumina/OpticalDuplicateFinder.java
Large diffs are not rendered by default.
Oops, something went wrong.
37 changes: 37 additions & 0 deletions
37
src/main/java/com/fulcrumgenomics/illumina/PhysicalLocation.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package com.fulcrumgenomics.illumina; | ||
|
||
import java.io.Serializable; | ||
|
||
/**
 * Minimal contract for reading and writing the physical location of a cluster.
 *
 * <p>Any value that is unavailable should be reported as {@link #NO_VALUE} (-1). When set,
 * read group and tile are expected to be non-zero positive integers, whereas the x and y
 * coordinates are permitted to be negative.
 */
public interface PhysicalLocation extends Serializable {

    /** Sentinel used for a value that has not been set. */
    int NO_VALUE = -1;

    /** @return the read group of the cluster. */
    short getReadGroup();

    /** @param rg the read group to record for the cluster. */
    void setReadGroup(short rg);

    /** @return the tile of the cluster. */
    short getTile();

    /** @param tile the tile to record for the cluster. */
    void setTile(short tile);

    /** @return the x coordinate of the cluster. */
    int getX();

    /** @param x the x coordinate to record for the cluster. */
    void setX(int x);

    /** @return the y coordinate of the cluster. */
    int getY();

    /** @param y the y coordinate to record for the cluster. */
    void setY(int y);

    /** @return the library identifier of the cluster. */
    short getLibraryId();

    /** @param libraryId the library identifier to record for the cluster. */
    void setLibraryId(short libraryId);

    /**
     * Default check for whether real location data has been set. Only the tile is consulted:
     * the location counts as present whenever the tile differs from {@link #NO_VALUE}.
     *
     * @return {@code true} if the tile is anything other than {@link #NO_VALUE}.
     */
    default boolean hasLocation() {
        final short currentTile = getTile();
        return NO_VALUE != currentTile;
    }
}
35 changes: 35 additions & 0 deletions
35
src/main/java/com/fulcrumgenomics/illumina/PhysicalLocationInt.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
package com.fulcrumgenomics.illumina; | ||
|
||
/** | ||
* Small class that provides access to the physical location information about a cluster. | ||
* All values should be defaulted to -1 if unavailable. Tile should only allow | ||
* non-zero positive integers, x and y coordinates must be non-negative. | ||
* This is different from PhysicalLocationShort in that the x and y positions are ints, not shorts | ||
* thus, they do not overflow within a HiSeqX tile. | ||
*/ | ||
public class PhysicalLocationInt implements PhysicalLocation { | ||
|
||
public short tile = -1; | ||
public int x = -1, y = -1; | ||
|
||
public short getReadGroup() { throw new RuntimeException("Not Implemented"); } | ||
|
||
public void setReadGroup(final short readGroup) { throw new RuntimeException("Not Implemented"); } | ||
|
||
public short getTile() { return tile; } | ||
|
||
public void setTile(final short tile) { this.tile = tile; } | ||
|
||
public int getX() { return x; } | ||
|
||
public void setX(final int x) { this.x = x; } | ||
|
||
public int getY() { return y; } | ||
|
||
public void setY(final int y) { this.y = y; } | ||
|
||
public short getLibraryId() { throw new RuntimeException("Not Implemented"); } | ||
|
||
public void setLibraryId(final short libraryId) { throw new RuntimeException("Not Implemented"); } | ||
|
||
} |
41 changes: 41 additions & 0 deletions
41
src/main/java/com/fulcrumgenomics/illumina/PhysicalLocationShort.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
/* | ||
* The MIT License | ||
* | ||
* Copyright (c) 2015 The Broad Institute | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
|
||
package com.fulcrumgenomics.illumina; | ||
|
||
/** | ||
* Small class that provides access to the physical location information about a cluster. | ||
* All values should be defaulted to -1 if unavailable. Tile should only allow | ||
* non-zero positive integers, x and y coordinates must be non-negative. | ||
* This is different from PhysicalLocationInt in that the x and y positions are shorts, not ints | ||
* thus, they may overflow within a HiSeqX tile. | ||
*/ | ||
public class PhysicalLocationShort extends PhysicalLocationInt { | ||
|
||
@Override | ||
public void setX(final int x) { super.setX((short)x); } | ||
|
||
@Override | ||
public void setY(final int y) { super.setY((short)y); } | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
/* | ||
* The MIT License | ||
* | ||
* Copyright (c) 2009 The Broad Institute | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
package com.fulcrumgenomics.illumina; | ||
|
||
|
||
/** Little struct-like class to hold read pair (and fragment) end data for duplicate marking. */ | ||
abstract public class ReadEnds extends PhysicalLocationShort { | ||
|
||
public static final byte F = 0, R = 1, FF = 2, FR = 3, RR = 4, RF = 5; | ||
|
||
public short libraryId; | ||
public byte orientation; | ||
public int read1ReferenceIndex = -1; | ||
public int read1Coordinate = -1; | ||
public int read2ReferenceIndex = -1; | ||
public int read2Coordinate = -1; // This field is overloaded for flow based processing as the end coordinate of read 1. (paired reads not supported) | ||
|
||
// Additional information used to detect optical dupes | ||
public short readGroup = -1; | ||
|
||
/** For optical duplicate detection the orientation matters regard to 1st or 2nd end of a mate */ | ||
public byte orientationForOpticalDuplicates = -1; | ||
|
||
/** A *transient* flag marking this read end as being an optical duplicate. */ | ||
public transient boolean isOpticalDuplicate = false; | ||
|
||
public boolean isPaired() { return this.read2ReferenceIndex != -1; } | ||
|
||
@Override | ||
public short getReadGroup() { return this.readGroup; } | ||
|
||
@Override | ||
public void setReadGroup(final short readGroup) { this.readGroup = readGroup; } | ||
|
||
@Override | ||
public short getLibraryId() { return this.libraryId; } | ||
|
||
@Override | ||
public void setLibraryId(final short libraryId) { this.libraryId = libraryId; } | ||
|
||
/** | ||
* Returns a single byte that encodes the orientation of the two reads in a pair. | ||
*/ | ||
public static byte getOrientationByte(final boolean read1NegativeStrand, final boolean read2NegativeStrand) { | ||
if (read1NegativeStrand) { | ||
if (read2NegativeStrand) return ReadEnds.RR; | ||
else return ReadEnds.RF; | ||
} else { | ||
if (read2NegativeStrand) return ReadEnds.FR; | ||
else return ReadEnds.FF; | ||
} | ||
} | ||
} |
108 changes: 108 additions & 0 deletions
108
src/main/java/com/fulcrumgenomics/illumina/ReadEndsForMarkDuplicates.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
/* | ||
* The MIT License | ||
* | ||
* Copyright (c) 2014 The Broad Institute | ||
* | ||
* Permission is hereby granted, free of charge, to any person obtaining a copy | ||
* of this software and associated documentation files (the "Software"), to deal | ||
* in the Software without restriction, including without limitation the rights | ||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
* copies of the Software, and to permit persons to whom the Software is | ||
* furnished to do so, subject to the following conditions: | ||
* | ||
* The above copyright notice and this permission notice shall be included in | ||
* all copies or substantial portions of the Software. | ||
* | ||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
|
||
package com.fulcrumgenomics.illumina; | ||
|
||
import htsjdk.samtools.SAMRecord; | ||
|
||
/** | ||
* Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar | ||
* | ||
* @author Nils Homer | ||
*/ | ||
public class ReadEndsForMarkDuplicates extends ReadEnds implements Cloneable { | ||
/* | ||
What do we need to store you ask? Well, we need to store: | ||
- byte: orientation | ||
- short: libraryId, readGroup, tile, x, y, score | ||
- int: read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate, duplicateSetSize | ||
- long: read1IndexInFile, read2IndexInFile | ||
*/ | ||
protected static final int SIZE_OF = (1 * 1) + (5 * 2) + (5 * 4) + (8 * 2) + 1 | ||
+ 8 + // last 8 == reference overhead | ||
13; // This is determined experimentally with JProfiler | ||
|
||
public static int getSizeOf() { | ||
return SIZE_OF; | ||
} | ||
|
||
public short score = 0; | ||
public long read1IndexInFile = -1; | ||
public long read2IndexInFile = -1; | ||
public int duplicateSetSize = -1; | ||
|
||
public ReadEndsForMarkDuplicates() {} | ||
|
||
public ReadEndsForMarkDuplicates(final ReadEndsForMarkDuplicates read) { | ||
this.libraryId = read.getLibraryId(); | ||
this.orientation = read.orientation; | ||
this.read1ReferenceIndex = read.read1ReferenceIndex; | ||
this.read1Coordinate = read.read1Coordinate; | ||
this.read2ReferenceIndex = read.read2ReferenceIndex; | ||
this.read2Coordinate = read.read2Coordinate; | ||
|
||
this.readGroup = read.getReadGroup(); | ||
this.tile = read.getTile(); | ||
this.x = read.x; | ||
this.y = read.y; | ||
|
||
this.orientationForOpticalDuplicates = read.orientationForOpticalDuplicates; | ||
|
||
this.score = read.score; | ||
|
||
this.read1IndexInFile = read.read1IndexInFile; | ||
this.read2IndexInFile = read.read2IndexInFile; | ||
} | ||
|
||
@Override | ||
public String toString() { | ||
return String.format("%d %d %d", read1IndexInFile, read1Coordinate, score); | ||
} | ||
|
||
@Override | ||
public ReadEndsForMarkDuplicates clone() { | ||
return new ReadEndsForMarkDuplicates(this); | ||
} | ||
|
||
/** | ||
* This method is used to generate the following two metrics: | ||
* UNPAIRED_DUPS_WITH_TLEN | ||
* UNPAIRED_DUPS_WITHOUT_TLEN | ||
* | ||
* It will return true if and only if the read is single ended and the exact fragment length is | ||
* known (i.e. it was not quality trimmed) | ||
*/ | ||
public static boolean isSingleEndReadKnownFragment(final SAMRecord rec) { | ||
if ( rec.getReadUnmappedFlag() || rec.getReadPairedFlag() ) { | ||
return false; | ||
// } else if ( MarkDuplicatesForFlowHelper.isAdapterClipped(rec) ) { | ||
// return true; | ||
} else if ( !rec.getReadNegativeStrandFlag() ) { | ||
return rec.getEnd() != rec.getUnclippedEnd(); | ||
} else { | ||
return rec.getStart() != rec.getUnclippedStart(); | ||
} | ||
} | ||
|
||
} |
Oops, something went wrong.