Skip to content

Commit

Permalink
feat: add optical duplicate tagging to GroupReadsByUmi
Browse files Browse the repository at this point in the history
  • Loading branch information
yfarjoun committed Feb 16, 2025
1 parent 6ca7733 commit e33c422
Show file tree
Hide file tree
Showing 12 changed files with 1,438 additions and 4 deletions.
340 changes: 340 additions & 0 deletions src/main/java/com/fulcrumgenomics/illumina/OpticalDuplicateFinder.java

Large diffs are not rendered by default.

37 changes: 37 additions & 0 deletions src/main/java/com/fulcrumgenomics/illumina/PhysicalLocation.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
package com.fulcrumgenomics.illumina;

import java.io.Serializable;

/**
 * Accessor contract for the physical location of a cluster: read group, tile,
 * x/y coordinates and library id.
 *
 * <p>Implementations should report {@link #NO_VALUE} (-1) for any value that is
 * unavailable. Read group and tile should only ever hold non-zero positive
 * integers; the x and y coordinates may be negative.
 */
public interface PhysicalLocation extends Serializable {

    /** Sentinel meaning "no value available". */
    int NO_VALUE = -1;

    /** @return the read group of this location */
    short getReadGroup();

    /** @param rg the read group to record */
    void setReadGroup(short rg);

    /** @return the tile of this location */
    short getTile();

    /** @param tile the tile to record */
    void setTile(short tile);

    /** @return the x coordinate */
    int getX();

    /** @param x the x coordinate to record */
    void setX(int x);

    /** @return the y coordinate */
    int getY();

    /** @param y the y coordinate to record */
    void setY(int y);

    /** @return the library id of this location */
    short getLibraryId();

    /** @param libraryId the library id to record */
    void setLibraryId(short libraryId);

    /**
     * Default check for whether real location data has been set. Only the tile is
     * inspected: the location counts as set as soon as the tile differs from
     * {@link #NO_VALUE}.
     *
     * @return {@code true} when {@link #getTile()} is not {@link #NO_VALUE}
     */
    default boolean hasLocation() {
        final short currentTile = getTile();
        return currentTile != NO_VALUE;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
package com.fulcrumgenomics.illumina;

/**
 * Small class that provides access to the physical location information about a cluster.
 * All values default to -1 when unavailable. Tile should only allow non-zero positive
 * integers; x and y coordinates must be non-negative.
 *
 * <p>This differs from PhysicalLocationShort in that the x and y positions are stored and
 * set as ints, not shorts, so they do not overflow within a HiSeqX tile.
 *
 * <p>Read group and library id are not stored by this class: the corresponding accessors
 * always throw {@link UnsupportedOperationException}.
 */
public class PhysicalLocationInt implements PhysicalLocation {

    /** Tile number; -1 until set. */
    public short tile = -1;

    /** Cluster coordinates; -1 until set. */
    public int x = -1, y = -1;

    /**
     * Not supported by this class.
     *
     * @throws UnsupportedOperationException always
     */
    public short getReadGroup() { throw new UnsupportedOperationException("Not Implemented"); }

    /**
     * Not supported by this class.
     *
     * @throws UnsupportedOperationException always
     */
    public void setReadGroup(final short readGroup) { throw new UnsupportedOperationException("Not Implemented"); }

    /** @return the stored tile, or -1 if none has been set */
    public short getTile() { return tile; }

    /** @param tile the tile to store */
    public void setTile(final short tile) { this.tile = tile; }

    /** @return the stored x coordinate, or -1 if none has been set */
    public int getX() { return x; }

    /** @param x the x coordinate to store */
    public void setX(final int x) { this.x = x; }

    /** @return the stored y coordinate, or -1 if none has been set */
    public int getY() { return y; }

    /** @param y the y coordinate to store */
    public void setY(final int y) { this.y = y; }

    /**
     * Not supported by this class.
     *
     * @throws UnsupportedOperationException always
     */
    public short getLibraryId() { throw new UnsupportedOperationException("Not Implemented"); }

    /**
     * Not supported by this class.
     *
     * @throws UnsupportedOperationException always
     */
    public void setLibraryId(final short libraryId) { throw new UnsupportedOperationException("Not Implemented"); }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
* The MIT License
*
* Copyright (c) 2015 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package com.fulcrumgenomics.illumina;

/**
 * Small class that provides access to the physical location information about a cluster.
 * All values should be defaulted to -1 if unavailable. Tile should only allow
 * non-zero positive integers, x and y coordinates must be non-negative.
 *
 * <p>Unlike PhysicalLocationInt, the x and y setters here narrow their argument to a
 * {@code short} before storing it, so coordinates outside the short range wrap around;
 * they may therefore overflow within a HiSeqX tile. Only the setters narrow: direct
 * writes to the inherited public fields are stored as given.
 */
public class PhysicalLocationShort extends PhysicalLocationInt {

    /**
     * Stores the x coordinate after narrowing it to a {@code short}.
     *
     * @param x the x coordinate; only its low 16 bits (sign-extended) are kept
     */
    @Override
    public void setX(final int x) {
        final short narrowedX = (short) x;
        super.setX(narrowedX);
    }

    /**
     * Stores the y coordinate after narrowing it to a {@code short}.
     *
     * @param y the y coordinate; only its low 16 bits (sign-extended) are kept
     */
    @Override
    public void setY(final int y) {
        final short narrowedY = (short) y;
        super.setY(narrowedY);
    }
}
74 changes: 74 additions & 0 deletions src/main/java/com/fulcrumgenomics/illumina/ReadEnds.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* The MIT License
*
* Copyright (c) 2009 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
package com.fulcrumgenomics.illumina;


/**
 * Struct-like holder for the end data of a read pair (or a single fragment) used during
 * duplicate marking. On top of the inherited tile/x/y location it stores the library id,
 * read group, orientation and the reference index/coordinate of each end.
 */
abstract public class ReadEnds extends PhysicalLocationShort {

    // Orientation codes. The two-letter codes are the values returned by
    // getOrientationByte(); F and R are the single-letter counterparts.
    public static final byte F = 0;
    public static final byte R = 1;
    public static final byte FF = 2;
    public static final byte FR = 3;
    public static final byte RR = 4;
    public static final byte RF = 5;

    public short libraryId;
    public byte orientation;
    public int read1ReferenceIndex = -1;
    public int read1Coordinate = -1;
    /** Stays at -1 when there is no second end; see {@link #isPaired()}. */
    public int read2ReferenceIndex = -1;
    /**
     * Coordinate of read 2. This field is overloaded for flow based processing as the end
     * coordinate of read 1 (paired reads not supported).
     */
    public int read2Coordinate = -1;

    /** Read group; additional information used to detect optical duplicates. */
    public short readGroup = -1;

    /**
     * Orientation as used for optical duplicate detection, where it matters whether an end
     * is the 1st or 2nd end of a mate.
     */
    public byte orientationForOpticalDuplicates = -1;

    /** A <em>transient</em> flag marking this read end as being an optical duplicate. */
    public transient boolean isOpticalDuplicate = false;

    /** @return {@code true} when a reference index has been recorded for read 2 */
    public boolean isPaired() {
        return read2ReferenceIndex != -1;
    }

    /** @return the stored read group */
    @Override
    public short getReadGroup() {
        return readGroup;
    }

    /** @param readGroup the read group to store */
    @Override
    public void setReadGroup(final short readGroup) {
        this.readGroup = readGroup;
    }

    /** @return the stored library id */
    @Override
    public short getLibraryId() {
        return libraryId;
    }

    /** @param libraryId the library id to store */
    @Override
    public void setLibraryId(final short libraryId) {
        this.libraryId = libraryId;
    }

    /**
     * Encodes the strands of the two reads of a pair as a single orientation byte.
     *
     * @param read1NegativeStrand whether read 1 is on the negative strand
     * @param read2NegativeStrand whether read 2 is on the negative strand
     * @return {@link #RR} when both are negative, {@link #RF} when only read 1 is,
     *         {@link #FR} when only read 2 is, and {@link #FF} when neither is
     */
    public static byte getOrientationByte(final boolean read1NegativeStrand, final boolean read2NegativeStrand) {
        if (read1NegativeStrand) {
            return read2NegativeStrand ? ReadEnds.RR : ReadEnds.RF;
        }
        return read2NegativeStrand ? ReadEnds.FR : ReadEnds.FF;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* The MIT License
*
* Copyright (c) 2014 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/

package com.fulcrumgenomics.illumina;

import htsjdk.samtools.SAMRecord;

/**
 * Little struct-like class to hold read pair (and fragment) end data for MarkDuplicatesWithMateCigar.
 * Extends {@link ReadEnds} with a score, the index in file of each read and the size of the
 * duplicate set.
 *
 * @author Nils Homer
 */
public class ReadEndsForMarkDuplicates extends ReadEnds implements Cloneable {
    /*
     * Storage accounted for by SIZE_OF:
     *   - byte:  orientation
     *   - short: libraryId, readGroup, tile, x, y, score
     *   - int:   read1ReferenceIndex, read1Coordinate, read2ReferenceIndex, read2Coordinate, duplicateSetSize
     *   - long:  read1IndexInFile, read2IndexInFile
     */
    protected static final int SIZE_OF =
            (1 * 1) + (5 * 2) + (5 * 4) + (8 * 2) + 1
            + 8     // reference overhead
            + 13;   // determined experimentally with JProfiler

    /** @return the per-instance size estimate, {@link #SIZE_OF} */
    public static int getSizeOf() {
        return SIZE_OF;
    }

    public short score = 0;
    public long read1IndexInFile = -1;
    public long read2IndexInFile = -1;
    public int duplicateSetSize = -1;

    /** Creates an instance with every field at its declared default. */
    public ReadEndsForMarkDuplicates() {}

    /**
     * Copy constructor. Copies the library id, orientations, reference indices and
     * coordinates of both reads, read group, tile, x, y, score and both indices in file.
     *
     * <p>{@code duplicateSetSize} and {@code isOpticalDuplicate} are not copied; the new
     * instance keeps their defaults (-1 and {@code false}).
     *
     * @param read the instance to copy from
     */
    public ReadEndsForMarkDuplicates(final ReadEndsForMarkDuplicates read) {
        // Library and alignment position of both ends.
        libraryId = read.getLibraryId();
        orientation = read.orientation;
        read1ReferenceIndex = read.read1ReferenceIndex;
        read1Coordinate = read.read1Coordinate;
        read2ReferenceIndex = read.read2ReferenceIndex;
        read2Coordinate = read.read2Coordinate;

        // Physical location. x and y are copied field-to-field, not through the setters.
        readGroup = read.getReadGroup();
        tile = read.getTile();
        x = read.x;
        y = read.y;

        orientationForOpticalDuplicates = read.orientationForOpticalDuplicates;

        score = read.score;

        read1IndexInFile = read.read1IndexInFile;
        read2IndexInFile = read.read2IndexInFile;
    }

    /** @return {@code read1IndexInFile}, {@code read1Coordinate} and {@code score}, space separated */
    @Override
    public String toString() {
        final String layout = "%d %d %d";
        return String.format(layout, read1IndexInFile, read1Coordinate, score);
    }

    /** @return a copy made with the copy constructor, so the same fields are left at defaults */
    @Override
    public ReadEndsForMarkDuplicates clone() {
        return new ReadEndsForMarkDuplicates(this);
    }

    /**
     * This method is used to generate the following two metrics:
     * UNPAIRED_DUPS_WITH_TLEN
     * UNPAIRED_DUPS_WITHOUT_TLEN
     *
     * <p>Unmapped or paired records always yield {@code false}. Otherwise the result is
     * whether the aligned end differs from the unclipped end for a forward-strand read, or
     * whether the aligned start differs from the unclipped start for a negative-strand read.
     *
     * <p>NOTE: the {@code MarkDuplicatesForFlowHelper.isAdapterClipped(rec)} shortcut that
     * would return {@code true} is commented out in this version and is not consulted.
     *
     * @param rec the record to inspect
     * @return {@code true} for a mapped, unpaired record whose aligned and unclipped
     *         positions differ at the end described above
     */
    public static boolean isSingleEndReadKnownFragment(final SAMRecord rec) {
        final boolean unmappedOrPaired = rec.getReadUnmappedFlag() || rec.getReadPairedFlag();
        if (unmappedOrPaired) {
            return false;
        }
        if (rec.getReadNegativeStrandFlag()) {
            return rec.getStart() != rec.getUnclippedStart();
        }
        return rec.getEnd() != rec.getUnclippedEnd();
    }

}
Loading

0 comments on commit e33c422

Please sign in to comment.