Add a predicate to Gff3Codec to allow filtering out unused attributes (#1575)

* When using a GFF3 reader with DecodeDepth == DEEP, it may use a large amount of memory on attributes that will never be used ("version", "tag", etc.). This adds an optional filter parameter when creating the codec, allowing the user to select which attributes to keep. All attributes are still read and parsed, but the returned objects are filtered down to the desired set.
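Below is a minimal usage sketch of the new constructor (not part of the commit; the attribute names, input path, and class name are illustrative, and the reader setup mirrors the test added in this change). The predicate receives each attribute key and returns true for keys that should be dropped from the decoded features:

```java
import htsjdk.tribble.AbstractFeatureReader;
import htsjdk.tribble.gff.Gff3Codec;
import htsjdk.tribble.gff.Gff3Feature;
import htsjdk.tribble.readers.LineIterator;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class Gff3FilterExample {
    public static void main(final String[] args) throws Exception {
        // attribute keys this application never reads; dropping them keeps the decoded features small
        final Set<String> unused = new HashSet<>(Arrays.asList("version", "tag", "biotype"));

        // the predicate returns true for keys to remove from each feature's attribute map
        final Gff3Codec codec = new Gff3Codec(Gff3Codec.DecodeDepth.DEEP, unused::contains);

        try (AbstractFeatureReader<Gff3Feature, LineIterator> reader =
                     AbstractFeatureReader.getFeatureReader("input.gff3", null, codec, false)) {
            int count = 0;
            for (final Gff3Feature feature : reader.iterator()) {
                // the filtered keys are absent here: feature.getAttribute("tag") is empty
                count++;
            }
            System.out.println("decoded " + count + " features");
        }
    }
}
```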
lindenb authored Dec 8, 2021
1 parent e63c34a commit e927064
Showing 4 changed files with 49 additions and 9 deletions.
9 changes: 3 additions & 6 deletions src/main/java/htsjdk/tribble/gff/Gff3BaseData.java
@@ -9,9 +9,6 @@
import java.util.Map;

public class Gff3BaseData {
private static final String ID_ATTRIBUTE_KEY = "ID";
private static final String NAME_ATTRIBUTE_KEY = "Name";
private static final String ALIAS_ATTRIBUTE_KEY = "Alias";
private final String contig;
private final String source;
private final String type;
@@ -38,9 +35,9 @@ public Gff3BaseData(final String contig, final String source, final String type,
this.phase = phase;
this.strand = strand;
this.attributes = copyAttributesSafely(attributes);
this.id = Gff3Codec.extractSingleAttribute(attributes.get(ID_ATTRIBUTE_KEY));
this.name = Gff3Codec.extractSingleAttribute(attributes.get(NAME_ATTRIBUTE_KEY));
this.aliases = attributes.getOrDefault(ALIAS_ATTRIBUTE_KEY, Collections.emptyList());
this.id = Gff3Codec.extractSingleAttribute(attributes.get(Gff3Constants.ID_ATTRIBUTE_KEY));
this.name = Gff3Codec.extractSingleAttribute(attributes.get(Gff3Constants.NAME_ATTRIBUTE_KEY));
this.aliases = attributes.getOrDefault(Gff3Constants.ALIAS_ATTRIBUTE_KEY, Collections.emptyList());
this.hashCode = computeHashCode();
}

27 changes: 24 additions & 3 deletions src/main/java/htsjdk/tribble/gff/Gff3Codec.java
@@ -22,6 +22,7 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;

@@ -71,20 +72,38 @@ public class Gff3Codec extends AbstractFeatureCodec<Gff3Feature, LineIterator> {

private int currentLine = 0;

/** filter used to remove keys from the EXTRA_FIELDS column */
private final Predicate<String> filterOutAttribute;

public Gff3Codec() {
this(DecodeDepth.DEEP);
}

public Gff3Codec(final DecodeDepth decodeDepth) {
this(decodeDepth, KEY -> false);
}

/**
* @param decodeDepth a value from DecodeDepth
* @param filterOutAttribute filter to remove keys from the EXTRA_FIELDS column
*/
public Gff3Codec(final DecodeDepth decodeDepth, final Predicate<String> filterOutAttribute) {
super(Gff3Feature.class);
this.decodeDepth = decodeDepth;
this.filterOutAttribute = filterOutAttribute;
/* check required keys are always kept */
for (final String key : new String[] {Gff3Constants.PARENT_ATTRIBUTE_KEY, Gff3Constants.ID_ATTRIBUTE_KEY, Gff3Constants.NAME_ATTRIBUTE_KEY}) {
if (filterOutAttribute.test(key)) {
throw new IllegalArgumentException("Predicate should always accept " + key);
}
}
}

public enum DecodeDepth {
DEEP,
SHALLOW
}

@Override
public Gff3Feature decode(final LineIterator lineIterator) throws IOException {
return decode(lineIterator, decodeDepth);
@@ -129,7 +148,7 @@ private Gff3Feature decode(final LineIterator lineIterator, final DecodeDepth de



final Gff3FeatureImpl thisFeature = new Gff3FeatureImpl(parseLine(line, currentLine));
final Gff3FeatureImpl thisFeature = new Gff3FeatureImpl(parseLine(line, currentLine, this.filterOutAttribute));
activeFeatures.add(thisFeature);
if (depth == DecodeDepth.DEEP) {
//link to parents/children/co-features
@@ -200,7 +219,7 @@ static private Map<String, List<String>> parseAttributes(final String attributes
return attributes;
}

static private Gff3BaseData parseLine(final String line, final int currentLine) {
private static Gff3BaseData parseLine(final String line, final int currentLine, final Predicate<String> filterOutAttribute) {
final List<String> splitLine = ParsingUtils.split(line, Gff3Constants.FIELD_DELIMITER);

if (splitLine.size() != NUM_FIELDS) {
@@ -217,6 +236,8 @@ static private Gff3BaseData parseLine(final String line, final int currentLine)
final int phase = splitLine.get(GENOMIC_PHASE_INDEX).equals(Gff3Constants.UNDEFINED_FIELD_VALUE) ? -1 : Integer.parseInt(splitLine.get(GENOMIC_PHASE_INDEX));
final Strand strand = Strand.decode(splitLine.get(GENOMIC_STRAND_INDEX));
final Map<String, List<String>> attributes = parseAttributes(splitLine.get(EXTRA_FIELDS_INDEX));
/* remove attributes matching 'filterOutAttribute' */
attributes.keySet().removeIf(filterOutAttribute);
return new Gff3BaseData(contig, source, type, start, end, score, strand, phase, attributes);
} catch (final NumberFormatException ex) {
throw new TribbleException("Cannot read integer value for start/end position from line " + currentLine + ". Line is: " + line, ex);
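For reference, a hypothetical snippet (not part of the commit; the class and method names are illustrative) showing the effect of the constructor guard above: a predicate that would filter out one of the required keys (Parent, ID, Name) is rejected when the codec is built.

```java
import htsjdk.tribble.gff.Gff3Codec;
import htsjdk.tribble.gff.Gff3Constants;

class FilterGuardExample {
    static void rejectRequiredKeyFilter() {
        try {
            // the predicate matches "ID", one of the keys the codec must always keep
            new Gff3Codec(Gff3Codec.DecodeDepth.DEEP, Gff3Constants.ID_ATTRIBUTE_KEY::equals);
        } catch (final IllegalArgumentException expected) {
            // thrown with the message "Predicate should always accept ID"
        }
    }
}
```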
3 changes: 3 additions & 0 deletions src/main/java/htsjdk/tribble/gff/Gff3Constants.java
@@ -10,4 +10,7 @@ public class Gff3Constants {
public static final String UNDEFINED_FIELD_VALUE = ".";
public static final String PARENT_ATTRIBUTE_KEY = "Parent";
public final static char END_OF_LINE_CHARACTER = '\n';
public static final String ID_ATTRIBUTE_KEY = "ID";
public static final String NAME_ATTRIBUTE_KEY = "Name";
public static final String ALIAS_ATTRIBUTE_KEY = "Alias";
}
19 changes: 19 additions & 0 deletions src/test/java/htsjdk/tribble/gff/Gff3CodecTest.java
@@ -69,6 +69,25 @@ public void basicDecodeTest(final Path inputGff3, final int expectedTotalFeature
Assert.assertEquals(countTotalFeatures, expectedTotalFeatures);
}

@Test(dataProvider = "basicDecodeDataProvider")
public void codecFilterOutFieldsTest(final Path inputGff3, final int expectedTotalFeatures) throws IOException {
final Set<String> skip_attributes = new HashSet<>(Arrays.asList("version","rank","biotype","transcript_support_level","mgi_id","havana_gene","tag"));
final Gff3Codec codec = new Gff3Codec(Gff3Codec.DecodeDepth.SHALLOW, S->skip_attributes.contains(S));
Assert.assertTrue(codec.canDecode(inputGff3.toAbsolutePath().toString()));
final AbstractFeatureReader<Gff3Feature, LineIterator> reader = AbstractFeatureReader.getFeatureReader(inputGff3.toAbsolutePath().toString(), null,codec, false);
int countTotalFeatures = 0;
for (final Gff3Feature feature : reader.iterator()) {
for(final String key : skip_attributes) {
Assert.assertTrue(feature.getAttribute(key).isEmpty());
}
countTotalFeatures++;
}

Assert.assertEquals(countTotalFeatures, expectedTotalFeatures);
}



@Test(dataProvider = "basicDecodeDataProvider")
public void basicShallowDecodeTest(final Path inputGff3, final int expectedTotalFeatures) throws IOException {
Assert.assertTrue((new Gff3Codec(Gff3Codec.DecodeDepth.SHALLOW)).canDecode(inputGff3.toAbsolutePath().toString()));
