From b12a3effd874f3e754f9086087ef0052e6c91072 Mon Sep 17 00:00:00 2001 From: Jonn Smith Date: Fri, 21 Jul 2017 14:07:19 -0400 Subject: [PATCH] Adding in codec to read from Gencode GTF files. Fixes #3277 Includes tests for both hg19 and hg38. --- .../codecs/GENCODE/GencodeGtfCDSFeature.java | 75 + .../utils/codecs/GENCODE/GencodeGtfCodec.java | 376 ++ .../codecs/GENCODE/GencodeGtfExonFeature.java | 145 + .../codecs/GENCODE/GencodeGtfFeature.java | 1320 +++++++ .../codecs/GENCODE/GencodeGtfGeneFeature.java | 121 + .../GencodeGtfSelenocysteineFeature.java | 74 + .../GENCODE/GencodeGtfStartCodonFeature.java | 75 + .../GENCODE/GencodeGtfStopCodonFeature.java | 74 + .../GENCODE/GencodeGtfTranscriptFeature.java | 144 + .../codecs/GENCODE/GencodeGtfUTRFeature.java | 74 + .../GENCODE/GencodeGtfCodecUnitTest.java | 3336 +++++++++++++++++ .../resources/large/gencode.v19.LargeFile.gtf | 3 + .../large/gencode.v19.LargeFile.gtf.idx | 3 + ...de.v26.primary_assembly.annotation.XYZ.gtf | 3 + ...26.primary_assembly.annotation.XYZ.gtf.idx | 3 + ...code.and.this.is.a.valid.one.too.table.gtf | 74 + .../gencode.invalid_malformed_header.gtf | 17 + .../gencode.invalid_malformed_header_cont.gtf | 17 + .../gencode.invalid_malformed_header_date.gtf | 17 + .../gencode.invalid_malformed_header_desc.gtf | 17 + .../gencode.invalid_malformed_header_form.gtf | 17 + .../gencode.invalid_malformed_header_prov.gtf | 17 + .../GENCODE/gencode.invalid_short_header.gtf | 14 + ...encode.v19.and.this.is.a.valid.one.too.gtf | 20 + .../codecs/GENCODE/gencode.v19.valid1.gtf | 8 + .../gencode.v19.valid_gencode_file2.gtf | 43 + .../utils/codecs/GENCODE/gencode.valid1.gtf | 8 + .../GENCODE/gencode.valid_gencode_file2.gtf | 64 + 28 files changed, 6159 insertions(+) create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCDSFeature.java create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCodec.java create mode 100644 
src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfExonFeature.java create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfFeature.java create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfGeneFeature.java create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfSelenocysteineFeature.java create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfStartCodonFeature.java create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfStopCodonFeature.java create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfTranscriptFeature.java create mode 100644 src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfUTRFeature.java create mode 100644 src/test/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCodecUnitTest.java create mode 100644 src/test/resources/large/gencode.v19.LargeFile.gtf create mode 100644 src/test/resources/large/gencode.v19.LargeFile.gtf.idx create mode 100644 src/test/resources/large/gencode.v26.primary_assembly.annotation.XYZ.gtf create mode 100644 src/test/resources/large/gencode.v26.primary_assembly.annotation.XYZ.gtf.idx create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.and.this.is.a.valid.one.too.table.gtf create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header.gtf create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_cont.gtf create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_date.gtf create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_desc.gtf create 
mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_form.gtf create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_prov.gtf create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_short_header.gtf create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.and.this.is.a.valid.one.too.gtf create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.valid1.gtf create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.valid_gencode_file2.gtf create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.valid1.gtf create mode 100644 src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.valid_gencode_file2.gtf diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCDSFeature.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCDSFeature.java new file mode 100644 index 00000000000..2fd12b09050 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCDSFeature.java @@ -0,0 +1,75 @@ +package org.broadinstitute.hellbender.utils.codecs.GENCODE; + +import java.util.ArrayList; + +/** + * A Gencode GTF Feature representing a CDS. + * + * A GTF Feature represents one row of a GTF File. + * The specification of a GTF file is defined here: + * http://mblab.wustl.edu/GTF22.html + * + * Created by jonn on 7/25/17. 
+ */
+// Feature types found in GENCODE GTF files: {gene,transcript,exon,CDS,UTR,start_codon,stop_codon,Selenocysteine}
+public final class GencodeGtfCDSFeature extends GencodeGtfFeature {
+
+    /**
+     * Builds a CDS feature from the column values of a single GTF row.
+     * All parsing is delegated to the {@link GencodeGtfFeature} constructor.
+     *
+     * @param gtfFields the column values of one GTF row
+     */
+    private GencodeGtfCDSFeature(final String[] gtfFields) {
+        super(gtfFields);
+    }
+
+    /**
+     * Static factory for a CDS feature built from the column values of a single GTF row.
+     *
+     * @param gtfFields the column values of one GTF row
+     * @return a new {@link GencodeGtfCDSFeature}, typed as its base class
+     */
+    public static GencodeGtfFeature create(final String[] gtfFields) {
+        return new GencodeGtfCDSFeature(gtfFields);
+    }
+
+    /**
+     * Builds a CDS feature from already-parsed values.
+     * Every argument is forwarded unchanged to the {@link GencodeGtfFeature} constructor.
+     */
+    private GencodeGtfCDSFeature(final long featureOrderNumber,
+                                 final String chromosomeName,
+                                 final AnnotationSource annotationSource,
+                                 final FeatureType featureType,
+                                 final int genomicStartLocation,
+                                 final int genomicEndLocation,
+                                 final GenomicStrand genomicStrand,
+                                 final GenomicPhase genomicPhase,
+                                 final String geneId,
+                                 final String transcriptId,
+                                 final GeneTranscriptType geneType,
+                                 final GeneTranscriptStatus geneStatus,
+                                 final String geneName,
+                                 final GeneTranscriptType transcriptType,
+                                 final GeneTranscriptStatus transcriptStatus,
+                                 final String transcriptName,
+                                 final int exonNumber,
+                                 final String exonId,
+                                 final LocusLevel locusLevel,
+                                 final ArrayList<OptionalField<?>> optionalFields,
+                                 final String anonymousOptionalFields) {
+
+        super(featureOrderNumber, chromosomeName, annotationSource, featureType,
+                genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase,
+                geneId, transcriptId, geneType, geneStatus, geneName,
+                transcriptType, transcriptStatus, transcriptName,
+                exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields);
+    }
+
+    /**
+     * Static factory for a CDS feature built from already-parsed values.
+     * The arguments are passed through, in the same order, to the private constructor.
+     *
+     * @return a new {@link GencodeGtfCDSFeature}, typed as its base class
+     */
+    public static GencodeGtfFeature create(final long featureOrderNumber,
+                                           final String chromosomeName,
+                                           final AnnotationSource annotationSource,
+                                           final FeatureType featureType,
+                                           final int genomicStartLocation,
+                                           final int genomicEndLocation,
+                                           final GenomicStrand genomicStrand,
+                                           final GenomicPhase genomicPhase,
+                                           final String geneId,
+                                           final String transcriptId,
+                                           final GeneTranscriptType geneType,
+                                           final GeneTranscriptStatus geneStatus,
+                                           final String geneName,
+                                           final GeneTranscriptType transcriptType,
+                                           final GeneTranscriptStatus transcriptStatus,
+                                           final String transcriptName,
+                                           final int exonNumber,
+                                           final String exonId,
+                                           final LocusLevel locusLevel,
+                                           final ArrayList<OptionalField<?>> optionalFields,
+                                           final String anonymousOptionalFields) {
+
+        return new GencodeGtfCDSFeature(featureOrderNumber, chromosomeName, annotationSource, featureType,
+                genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase,
+                geneId, transcriptId, geneType, geneStatus, geneName,
+                transcriptType, transcriptStatus, transcriptName,
+                exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields);
+    }
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCodec.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCodec.java
new file mode 100644
index 00000000000..ba4e9722c78
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCodec.java
@@ -0,0 +1,376 @@
+package org.broadinstitute.hellbender.utils.codecs.GENCODE;
+
+import htsjdk.samtools.util.CloserUtil;
+import htsjdk.samtools.util.LocationAware;
+import htsjdk.tribble.AbstractFeatureCodec;
+import htsjdk.tribble.FeatureCodecHeader;
+import htsjdk.tribble.readers.*;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.exceptions.UserException;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * {@link htsjdk.tribble.Tribble} Codec to read data from a GENCODE GTF file.
+ *
+ * GENCODE GTF Files are defined here: https://www.gencodegenes.org/data_format.html
+ *
+ * Created by jonn on 7/21/17.
+ */
+public final class GencodeGtfCodec extends AbstractFeatureCodec<GencodeGtfFeature, LineIterator> {
+
+    protected final Logger logger = LogManager.getLogger(this.getClass());
+
+    /** Prefix that marks a header / comment line. */
+    private static final String COMMENT_START = "##";
+
+    /** Number of tab-separated columns every feature line must have. */
+    static final int NUM_COLUMNS = 9;
+
+    /** Number of header lines that {@link #validateHeader(String[])} requires. */
+    private static final int HEADER_NUM_LINES = 5;
+
+    /** Line number assigned to the next feature that gets cached (stored as its feature order number). */
+    private long currentLineNum = 1;
+
+    /** Header lines collected by the most recent call to {@link #readHeader(LineIterator)}. */
+    protected List<String> header = new ArrayList<>();
+
+    // ============================================================================================================
+
+    public GencodeGtfCodec() {
+        super(GencodeGtfFeature.class);
+    }
+
+    // ============================================================================================================
+    // Trivial override methods that are pulled from AsciiFeatureCodec
+    // This was done to ensure that this was a reasonable Codec class (with good interfaces for reading features).
+
+    /** Closes the given line source. */
+    @Override
+    public void close(final LineIterator lineIterator) {
+        CloserUtil.close(lineIterator);
+    }
+
+    /** @return {@code true} when the given line source has no more lines. */
+    @Override
+    public boolean isDone(final LineIterator lineIterator) {
+        return !lineIterator.hasNext();
+    }
+
+    /** Wraps the given stream in a {@link LineIterator}. */
+    @Override
+    public LineIterator makeSourceFromStream(final InputStream bufferedInputStream) {
+        return new LineIteratorImpl(new SynchronousLineReader(bufferedInputStream));
+    }
+
+    /**
+     * Reads and validates the header lines from the given line source.
+     *
+     * @throws UserException.MalformedFile if the header is missing or invalid
+     */
+    @Override
+    public FeatureCodecHeader readHeader(final LineIterator lineIterator) throws IOException {
+        return new FeatureCodecHeader(readActualHeader(lineIterator), FeatureCodecHeader.NO_HEADER_END);
+    }
+
+    /** Wraps the given stream (reusing it when it is already positional) in a position-tracking line iterator. */
+    @Override
+    public LocationAware makeIndexableSourceFromStream(final InputStream bufferedInputStream) {
+        final PositionalBufferedStream pbs;
+        if (bufferedInputStream instanceof PositionalBufferedStream) {
+            pbs = (PositionalBufferedStream) bufferedInputStream;
+        } else {
+            pbs = new PositionalBufferedStream(bufferedInputStream);
+        }
+        return new AsciiLineReaderIterator(new AsciiLineReader(pbs));
+    }
+
+    // ============================================================================================================
+
+    /**
+     * Decodes the next gene, together with its transcripts, exons and leaf features, from the given line source.
+     *
+     * Lines are consumed until the next {@code gene} line (which is left unconsumed for the following call)
+     * or until the source is exhausted.
+     *
+     * @param lineIterator source of GTF lines
+     * @return the aggregated gene feature; {@code null} when a comment line was consumed instead, or when
+     *         nothing could be aggregated
+     * @throws UserException.MalformedFile if a line does not have exactly {@link #NUM_COLUMNS} columns
+     * @throws RuntimeException if exon / leaf records are left over that cannot be attached to a gene and transcript
+     */
+    @Override
+    public GencodeGtfFeature decode(final LineIterator lineIterator) {
+
+        GencodeGtfFeature decodedFeature = null;
+
+        // Create some caches for our data (as we need to group it):
+        GencodeGtfGeneFeature gene = null;
+        GencodeGtfTranscriptFeature transcript = null;
+        final ArrayList< GencodeGtfExonFeature > exonStore = new ArrayList<>();
+        final ArrayList< GencodeGtfFeature > leafFeatureStore = new ArrayList<>();
+
+        // Accumulate lines until we have a full gene and all of its internal features:
+        while ( lineIterator.hasNext() ) {
+
+            final String line = lineIterator.peek();
+
+            // We must assume we can get header lines.
+            // If we get a header line, we return null.
+            // This allows indexing to work.
+            if ( line.startsWith(COMMENT_START) ) {
+                lineIterator.next();
+                return null;
+            }
+
+            final String[] splitLine = line.split("\t", -1);
+
+            // Ensure the file is at least trivially well-formed:
+            if (splitLine.length != NUM_COLUMNS) {
+                throw new UserException.MalformedFile("Found an invalid number of columns in the given GENCODE file on line "
+                        + currentLineNum + " - Given: " + splitLine.length + " Expected: " + NUM_COLUMNS + " : " + line);
+            }
+
+            // We need to key off the feature type to collapse our accumulated records:
+            final String featureType = splitLine[2];
+
+            // Create a baseline feature to add into our data:
+            final GencodeGtfFeature feature = GencodeGtfFeature.create(splitLine);
+
+            // Make sure we keep track of the line number for if and when we need to write the file back out:
+            feature.setFeatureOrderNumber(currentLineNum);
+
+            // Once we see another gene or transcript, we take all accumulated records and combine them
+            // into the current gene.
+            if ((gene != null) && (transcript != null) && (featureType.equals("gene") || featureType.equals("transcript") )) {
+
+                aggregateRecordsIntoGeneFeature(gene, transcript, exonStore, leafFeatureStore);
+
+                if ( featureType.equals("gene") ) {
+                    // The new gene line is deliberately NOT consumed: it starts the next decode call.
+                    decodedFeature = gene;
+                    break;
+                }
+                else {
+                    // A new transcript of the same gene begins here:
+                    transcript = (GencodeGtfTranscriptFeature) feature;
+                    ++currentLineNum;
+                }
+            }
+            else {
+                // We have not reached the end of this set of gene / transcript records.
+                // We must cache these records together so we can create a meaningful data hierarchy from them all.
+                // Records are stored in their Feature form, not string form.
+
+                // Add the feature to the correct storage unit for easy assembly later:
+                switch (featureType) {
+                    case "gene":
+                        gene = (GencodeGtfGeneFeature)feature;
+                        break;
+                    case "transcript":
+                        transcript = (GencodeGtfTranscriptFeature)feature;
+                        break;
+                    case "exon":
+                        exonStore.add((GencodeGtfExonFeature)feature);
+                        break;
+                    default:
+                        leafFeatureStore.add(feature);
+                        break;
+                }
+
+                ++currentLineNum;
+            }
+
+            // Increment our iterator here so we don't accidentally miss any features from the following gene
+            lineIterator.next();
+        }
+
+        // Do we have some records leftover that we need to aggregate into a feature?
+        if ( !exonStore.isEmpty() || !leafFeatureStore.isEmpty() ) {
+
+            if ( (gene != null) && (transcript != null) ) {
+                aggregateRecordsIntoGeneFeature(gene, transcript, exonStore, leafFeatureStore);
+                decodedFeature = gene;
+            }
+            else {
+                // Records without a parent gene and transcript cannot be placed anywhere.
+                // The unparenthesized condition used previously let this case reach the aggregation
+                // with a null gene / transcript and fail with a NullPointerException instead.
+                if (exonStore.size() != 0) {
+                    logger.error("Gene Feature Aggregation: Exon store not empty: " + exonStore.toString());
+                }
+
+                if (leafFeatureStore.size() != 0) {
+                    logger.error("Gene Feature Aggregation: leaf feature store not empty: " + leafFeatureStore.toString());
+                }
+
+                final String msg = "Aggregated data left over after parsing complete: Exons: " + exonStore.size() + " ; LeafFeatures: " + leafFeatureStore.size();
+                throw new RuntimeException(msg);
+            }
+        }
+
+        return decodedFeature;
+    }
+
+    /**
+     * Read the {@code header} from the given {@link LineIterator} for the GENCODE GTF File.
+     * Will also validate this {@code header} for correctness before returning it.
+     * Throws a {@link UserException.MalformedFile} if the header is missing or malformed.
+     *
+     * Each call starts from an empty list, so reading a header more than once with the same codec
+     * instance does not accumulate lines from earlier reads.
+     *
+     * @param reader The {@link LineIterator} from which to read the header.
+     * @return The header as read from the {@code reader}
+     */
+    private List<String> readActualHeader(final LineIterator reader) {
+
+        final List<String> headerLines = new ArrayList<>();
+
+        // The file will start with commented out lines.
+        // Grab them until there are no more commented out lines.
+        while ( reader.hasNext() ) {
+            final String line = reader.peek();
+            if ( !line.startsWith(COMMENT_START) ) {
+                break;
+            }
+            headerLines.add(line);
+            reader.next();
+        }
+
+        if ( headerLines.isEmpty() ) {
+            throw new UserException.MalformedFile("GENCODE file does not have a header!");
+        }
+
+        // Validate our header:
+        if ( !validateHeader(headerLines.toArray(new String[0])) ) {
+            throw new UserException.MalformedFile("Invalid GENCODE GTF File - Header does not conform to GENCODE GTF Specifications!");
+        }
+
+        header = headerLines;
+
+        // Set our line number to be the line of the first actual Feature:
+        currentLineNum = header.size() + 1;
+
+        return header;
+    }
+
+    /**
+     * Checks whether the file at the given path looks like a GENCODE GTF file.
+     *
+     * The file name must start with {@code gencode} and end with {@code .gtf} (case-insensitively) and
+     * the first five lines of the file must pass {@link #validateHeader(String[])}.
+     *
+     * @param path path to the candidate file; a leading {@code file://} is stripped before opening it
+     * @return {@code true} if the name and header checks pass; {@code false} otherwise, including when
+     *         the file cannot be found or read
+     */
+    @Override
+    public boolean canDecode(final String path) {
+
+        // Simple file and name checks to start with:
+        final String fileName = new File(path).getName().toLowerCase();
+        if ( !(fileName.startsWith("gencode") && fileName.endsWith(".gtf")) ) {
+            logger.warn("Given file name does not conform to GENCODE GTF standards: " + path);
+            return false;
+        }
+
+        final String localPath = path.startsWith("file://") ? path.substring(7) : path;
+
+        // Crack open the file and look at the top of it:
+        try (BufferedReader br = new BufferedReader(new FileReader(localPath))) {
+            // Read the first 5 lines.
+            // They compose the header of a valid GTF File.
+            // A file shorter than that yields null entries, which validateHeader rejects.
+            final String[] headerLines = new String[HEADER_NUM_LINES];
+
+            for (int i = 0; i < headerLines.length; ++i) {
+                headerLines[i] = br.readLine();
+            }
+
+            // Validate our header:
+            return validateHeader(headerLines);
+        }
+        catch (FileNotFoundException ex) {
+            logger.warn("File does not exist! - " + path + " - returning can decode as failure.");
+            return false;
+        }
+        catch (IOException ex) {
+            logger.warn("Caught IOException on file: " + path + " - returning can decode as failure.");
+            return false;
+        }
+    }
+
+    // ============================================================================================================
+
+    /**
+     * Check if the given header of a tentative GENCODE GTF file is, in fact, the header to such a file.
+     * @param header Header lines to check for conformity to GENCODE GTF specifications.
+     *               A {@code null} array, a wrong number of lines, or any {@code null} line is treated as invalid.
+     * @return true if the given {@code header} is that of a GENCODE GTF file; false otherwise.
+     */
+    static boolean validateHeader(final String[] header) {
+
+        if ( (header == null) || (header.length != HEADER_NUM_LINES) ) {
+            return false;
+        }
+
+        // A null entry means the file ended before the header was complete:
+        for ( final String line : header ) {
+            if ( line == null ) {
+                return false;
+            }
+        }
+
+        // Check the normal commented fields:
+        return header[0].startsWith("##description:")
+                && header[1].startsWith("##provider: GENCODE")
+                && header[2].startsWith("##contact: gencode")
+                && header[2].endsWith("@sanger.ac.uk")
+                && header[3].startsWith("##format: gtf")
+                && header[4].startsWith("##date:");
+    }
+
+    /**
+     * Aggregates the given feature sets into a single gene feature.
+     *
+     * The given gene is updated using modifiers.
+     * {@code exonStore} and {@code leafFeatureStore} are cleared of all data.
+     * Leaf features that are not contained in any exon are discarded by that clearing.
+     *
+     * @param gene {@link GencodeGtfGeneFeature} into which to aggregate features.
+     * @param transcript {@link GencodeGtfTranscriptFeature} to insert into {@code gene}
+     * @param exonStore {@link List} of {@link GencodeGtfExonFeature}s to insert into corresponding {@link GencodeGtfTranscriptFeature} {@code transcript}
+     * @param leafFeatureStore {@link List} of {@link GencodeGtfFeature}s to insert into corresponding {@link GencodeGtfExonFeature} objects in {@code exonStore}
+     * @throws UserException.MalformedFile if a leaf feature inside an exon has an unexpected feature type
+     */
+    private void aggregateRecordsIntoGeneFeature(final GencodeGtfGeneFeature gene,
+                                                 final GencodeGtfTranscriptFeature transcript,
+                                                 final ArrayList< GencodeGtfExonFeature > exonStore,
+                                                 final ArrayList< GencodeGtfFeature > leafFeatureStore ) {
+
+        // OK, we go through the record and consolidate the sub parts of the record.
+        // We must consolidate these records through grouping by genomic position.
+
+        // Loop through the Exons and put the correct leaf features into each:
+        for ( final GencodeGtfExonFeature exon : exonStore ) {
+            for ( final Iterator<GencodeGtfFeature> iterator = leafFeatureStore.iterator(); iterator.hasNext(); ) {
+
+                final GencodeGtfFeature feature = iterator.next();
+
+                // Features that are within the extents of an exon belong in that exon:
+                if ( exon.contains(feature) ) {
+
+                    final GencodeGtfFeature.FeatureType featureType = feature.getFeatureType();
+
+                    // Add the feature to the correct place in the exon:
+                    switch (featureType) {
+                        case cds:
+                            exon.setCds((GencodeGtfCDSFeature) feature);
+                            break;
+                        case start_codon:
+                            exon.setStartCodon((GencodeGtfStartCodonFeature) feature);
+                            break;
+                        case stop_codon:
+                            exon.setStopCodon((GencodeGtfStopCodonFeature) feature);
+                            break;
+                        case utr:
+                            transcript.addUtr((GencodeGtfUTRFeature) feature);
+                            break;
+                        case selenocysteine:
+                            transcript.addSelenocysteine(((GencodeGtfSelenocysteineFeature) feature));
+                            break;
+                        default:
+                            throw new UserException.MalformedFile(
+                                    "Found unexpected Feature Type in GENCODE GTF File (line "
+                                            + feature.getFeatureOrderNumber() + "): "
+                                            + featureType.toString()
+                            );
+                    }
+
+                    // We have used this iterator item.
+                    // We should remove it now so we don't keep going through the list each exon.
+                    iterator.remove();
+                }
+            }
+
+            // Now insert this exon into the transcript:
+            transcript.addExon(exon);
+        }
+
+        // Add in the transcript:
+        gene.addTranscript(transcript);
+
+        // Clear the input data:
+        exonStore.clear();
+        leafFeatureStore.clear();
+    }
+
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfExonFeature.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfExonFeature.java
new file mode 100644
index 00000000000..3d555d6f251
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfExonFeature.java
@@ -0,0 +1,145 @@
+package org.broadinstitute.hellbender.utils.codecs.GENCODE;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * A Gencode GTF Feature representing an exon.
+ *
+ * A GTF Feature represents one row of a GTF File.
+ * The specification of a GTF file is defined here:
+ * http://mblab.wustl.edu/GTF22.html
+ *
+ * Created by jonn on 7/25/17.
+ */
+public final class GencodeGtfExonFeature extends GencodeGtfFeature {
+
+    /**
+     * Builds an exon feature from the column values of a single GTF row.
+     * All parsing is delegated to the {@link GencodeGtfFeature} constructor.
+     *
+     * @param gtfFields the column values of one GTF row
+     */
+    private GencodeGtfExonFeature(final String[] gtfFields) {
+        super(gtfFields);
+    }
+
+    /**
+     * Static factory for an exon feature built from the column values of a single GTF row.
+     *
+     * @param gtfFields the column values of one GTF row
+     * @return a new {@link GencodeGtfExonFeature}, typed as its base class
+     */
+    public static GencodeGtfFeature create(final String[] gtfFields) {
+        return new GencodeGtfExonFeature(gtfFields);
+    }
+
+    /**
+     * Builds an exon feature from already-parsed values.
+     * Every argument is forwarded unchanged to the {@link GencodeGtfFeature} constructor.
+     */
+    private GencodeGtfExonFeature(final long featureOrderNumber,
+                                  final String chromosomeName,
+                                  final AnnotationSource annotationSource,
+                                  final FeatureType featureType,
+                                  final int genomicStartLocation,
+                                  final int genomicEndLocation,
+                                  final GenomicStrand genomicStrand,
+                                  final GenomicPhase genomicPhase,
+                                  final String geneId,
+                                  final String transcriptId,
+                                  final GeneTranscriptType geneType,
+                                  final GeneTranscriptStatus geneStatus,
+                                  final String geneName,
+                                  final GeneTranscriptType transcriptType,
+                                  final GeneTranscriptStatus transcriptStatus,
+                                  final String transcriptName,
+                                  final int exonNumber,
+                                  final String exonId,
+                                  final LocusLevel locusLevel,
+                                  final ArrayList<OptionalField<?>> optionalFields,
+                                  final String anonymousOptionalFields) {
+
+        super(featureOrderNumber, chromosomeName, annotationSource, featureType,
+                genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase,
+                geneId, transcriptId, geneType, geneStatus, geneName,
+                transcriptType, transcriptStatus, transcriptName,
+                exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields);
+    }
+
+    /**
+     * Static factory for an exon feature built from already-parsed values.
+     * The arguments are passed through, in the same order, to the private constructor.
+     *
+     * @return a new {@link GencodeGtfExonFeature}, typed as its base class
+     */
+    public static GencodeGtfFeature create(final long featureOrderNumber,
+                                           final String chromosomeName,
+                                           final AnnotationSource annotationSource,
+                                           final FeatureType featureType,
+                                           final int genomicStartLocation,
+                                           final int genomicEndLocation,
+                                           final GenomicStrand genomicStrand,
+                                           final GenomicPhase genomicPhase,
+                                           final String geneId,
+                                           final String transcriptId,
+                                           final GeneTranscriptType geneType,
+                                           final GeneTranscriptStatus geneStatus,
+                                           final String geneName,
+                                           final GeneTranscriptType transcriptType,
+                                           final GeneTranscriptStatus transcriptStatus,
+                                           final String transcriptName,
+                                           final int exonNumber,
+                                           final String exonId,
+                                           final LocusLevel locusLevel,
+                                           final ArrayList<OptionalField<?>> optionalFields,
+                                           final String anonymousOptionalFields) {
+
+        return new GencodeGtfExonFeature(featureOrderNumber, chromosomeName, annotationSource, featureType,
+                genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase,
+                geneId, transcriptId, geneType, geneStatus, geneName,
+                transcriptType, transcriptStatus, transcriptName,
+                exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields);
+    }
+
+    // ============================================================================================================
+
+    // Sub-features attached to this exon; each stays null until the matching setter is called.
+    // Left package-private, as in the original, in case same-package code reads them directly.
+    GencodeGtfCDSFeature        cds        = null;
+    GencodeGtfStartCodonFeature startCodon = null;
+    GencodeGtfStopCodonFeature  stopCodon  = null;
+
+    // ============================================================================================================
+
+    /** @return the CDS attached to this exon, or {@code null} if none has been set */
+    public GencodeGtfCDSFeature getCds() {
+        return cds;
+    }
+
+    /** @return the start codon attached to this exon, or {@code null} if none has been set */
+    public GencodeGtfStartCodonFeature getStartCodon() {
+        return startCodon;
+    }
+
+    /** @return the stop codon attached to this exon, or {@code null} if none has been set */
+    public GencodeGtfStopCodonFeature getStopCodon() {
+        return stopCodon;
+    }
+
+    // ============================================================================================================
+
+    /** Replaces the CDS attached to this exon. */
+    public void setCds(final GencodeGtfCDSFeature cds) {
+        this.cds = cds;
+    }
+
+    /** Replaces the start codon attached to this exon. */
+    public void setStartCodon(final GencodeGtfStartCodonFeature startCodon) {
+        this.startCodon = startCodon;
+    }
+
+    /** Replaces the stop codon attached to this exon. */
+    public void setStopCodon(final GencodeGtfStopCodonFeature stopCodon) {
+        this.stopCodon = stopCodon;
+    }
+
+
+    /**
+     * @return a new list holding this exon followed by its CDS, start codon and stop codon,
+     *         in that order, skipping any that are {@code null}
+     */
+    @Override
+    protected List<GencodeGtfFeature> getAllFeatures() {
+        final ArrayList<GencodeGtfFeature> list = new ArrayList<>();
+        list.add(this);
+
+        if ( cds != null )        { list.add(cds) ; }
+        if ( startCodon != null ) { list.add(startCodon) ; }
+        if ( stopCodon != null )  { list.add(stopCodon) ; }
+
+        return list;
+    }
+
+    /**
+     * Two exons are equal when the base-class comparison succeeds and their CDS, start codon and
+     * stop codon are pairwise equal (two {@code null}s count as equal).
+     */
+    @Override
+    public boolean equals(final Object other) {
+        if (!(other instanceof GencodeGtfExonFeature)) {
+            return false;
+        }
+
+        final GencodeGtfExonFeature otherExon = (GencodeGtfExonFeature) other;
+
+        if ( !super.equals(otherExon) ) {
+            return false;
+        }
+
+        // java.util.Objects is fully qualified because this file does not import it.
+        return java.util.Objects.equals(cds, otherExon.cds)
+                && java.util.Objects.equals(startCodon, otherExon.startCodon)
+                && java.util.Objects.equals(stopCodon, otherExon.stopCodon);
+    }
+
+    /**
+     * Hash code built from the same components that {@link #equals(Object)} compares.
+     *
+     * NOTE(review): this is only consistent with {@code equals} if the base class also derives its
+     * {@code hashCode} from the fields its {@code equals} compares - confirm in GencodeGtfFeature.
+     */
+    @Override
+    public int hashCode() {
+        return 31 * super.hashCode() + java.util.Objects.hash(cds, startCodon, stopCodon);
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfFeature.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfFeature.java
new file mode 100644
index 00000000000..986bbedf6a7
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfFeature.java
@@ -0,0 +1,1320 @@
+package org.broadinstitute.hellbender.utils.codecs.GENCODE;
+
+import htsjdk.samtools.util.Locus;
+import htsjdk.tribble.Feature;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import scala.tools.nsc.Global;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * A GTF Feature represents one row of a GTF File.
+ * The specification of a GTF file is defined here:
+ * http://mblab.wustl.edu/GTF22.html
+ *
+ * Currently only supports version 26 or greater!
+ *
+ * Created by jonn on 7/21/17.
+ */
+public abstract class GencodeGtfFeature implements Feature, Comparable {
+
+    protected final Logger logger = LogManager.getLogger(this.getClass());
+
+    // Metadata fields:
+
+    /**
+     * The relative order of this Feature.
+     * Normally it is a line number indicating the position in the original data file of this feature.
+ */ + private long featureOrderNumber = -1; + + // Required base GTF Fields: + private String chromosomeName; + private AnnotationSource annotationSource; + private FeatureType featureType; + private int genomicStartLocation; + private int genomicEndLocation; + private GenomicStrand genomicStrand; + private GenomicPhase genomicPhase; + + // "Required" GENCODE GTF Fields: + private String geneId = null; + private String transcriptId = null; + private GeneTranscriptStatus geneStatus = null; + private GeneTranscriptType geneType = null; + private String geneName = null; + private GeneTranscriptType transcriptType = null; + private GeneTranscriptStatus transcriptStatus = null; + private String transcriptName = null; + private int exonNumber = -1; + private String exonId = null; + private LocusLevel locusLevel = null; + + // Optional GENCODE GTF Fields: + private ArrayList> optionalFields = new ArrayList<>(); + + // Optional General GTF Fields: + private String anonymousOptionalFields = null; + + // ================================================================================================ + + /** + * Populate this GencodeGtfFeature with the given data. 
+ */ + protected GencodeGtfFeature(String[] gtfFields) { + chromosomeName = gtfFields[0]; + annotationSource = AnnotationSource.valueOf( gtfFields[1] ); + featureType = GencodeGtfFeature.FeatureType.valueOf( gtfFields[2].toLowerCase() ); + genomicStartLocation = Integer.valueOf( gtfFields[3] ); + genomicEndLocation = Integer.valueOf( gtfFields[4] ); + genomicStrand = GenomicStrand.getEnum( gtfFields[6] ); + genomicPhase = GenomicPhase.getEnum( gtfFields[7] ); + + // Get the extra fields from the last column: + String[] extraFields = gtfFields[8].split(";"); + + StringBuilder anonymousOptionalFieldBuilder = new StringBuilder(); + + // Now there are "optional" fields to go through (some actually required, some actually optional), + // But we need to match up the field names to the fields themselves: + for ( String extraField : extraFields ) { + + String[] fieldParts = extraField.trim().split(" "); + + String fieldName = fieldParts[0].trim(); + + // The value of the field may be between two quotes. + // We remove them here. 
+ String fieldValue = fieldParts[1].trim().replaceAll("\"", ""); + + OptionalField optionalField = null; + + switch (fieldName) { + // Find the right field to set: + case "gene_id": + geneId = fieldValue; + break; + case "transcript_id": + transcriptId = fieldValue; + break; + case "gene_type": + geneType = GeneTranscriptType.getEnum(fieldValue); + break; + case "gene_status": + geneStatus = GeneTranscriptStatus.valueOf(fieldValue); + break; + case "gene_name": + geneName = fieldValue; + break; + case "transcript_type": + transcriptType = GeneTranscriptType.getEnum(fieldValue); + break; + case "transcript_status": + transcriptStatus = GeneTranscriptStatus.valueOf(fieldValue); + break; + case "transcript_name": + transcriptName = fieldValue; + break; + case "exon_number": + exonNumber = Integer.valueOf(fieldValue); + break; + case "exon_id": + exonId = fieldValue; + break; + case "level": + locusLevel = LocusLevel.getEnum(fieldValue); + break; + case "tag": + optionalField = new OptionalField<>(fieldName, FeatureTag.getEnum(fieldValue)); + break; + case "ccdsid": + optionalField = new OptionalField<>(fieldName, fieldValue); + break; + case "havana_gene": + optionalField = new OptionalField<>(fieldName, fieldValue); + break; + case "havana_transcript": + optionalField = new OptionalField<>(fieldName, fieldValue); + break; + case "protein_id": + optionalField = new OptionalField<>(fieldName, fieldValue); + break; + case "ont": + optionalField = new OptionalField<>(fieldName, fieldValue); + break; + case "transcript_support_level": + optionalField = new OptionalField<>(fieldName, TranscriptSupportLevel.getEnum(fieldValue)); + break; + case "remap_status": + optionalField = new OptionalField<>(fieldName, RemapStatus.valueOf(fieldValue)); + break; + case "remap_original_id": + optionalField = new OptionalField<>(fieldName, fieldValue); + break; + case "remap_original_location": + optionalField = new OptionalField<>(fieldName, Long.valueOf(fieldValue)); + break; + case 
"remap_num_mappings": + optionalField = new OptionalField<>(fieldName, Long.valueOf(fieldValue)); + break; + case "remap_target_status": + optionalField = new OptionalField<>(fieldName, RemapTargetStatus.getEnum(fieldValue)); + break; + case "remap_substituted_missing_target": + optionalField = new OptionalField<>(fieldName, fieldValue); + break; + default: + anonymousOptionalFieldBuilder.append(extraField + ";"); + } + + // If the optional field was good, we add it: + if ( optionalField != null ) { + optionalFields.add(optionalField); + } + } + + // Save our anonymous optional fields: + if ( anonymousOptionalFieldBuilder.length() != 0 ) { + anonymousOptionalFields = anonymousOptionalFieldBuilder.toString(); + } + } + + /** + * Populate this GencodeGtfFeature with the given data. + */ + protected GencodeGtfFeature(long featureOrderNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + this.featureOrderNumber = featureOrderNumber; + this.chromosomeName = chromosomeName; + this.annotationSource = annotationSource; + this.featureType = featureType; + this.genomicStartLocation = genomicStartLocation; + this.genomicEndLocation = genomicEndLocation; + this.genomicStrand = genomicStrand; + this.genomicPhase = genomicPhase; + this.geneId = geneId; + this.transcriptId = transcriptId; + this.geneType = geneType; + this.geneStatus = geneStatus; + this.geneName = geneName; + this.transcriptType = transcriptType; + this.transcriptStatus = transcriptStatus; + 
this.transcriptName = transcriptName; + this.exonNumber = exonNumber; + this.exonId = exonId; + this.locusLevel = locusLevel; + + if ( optionalFields != null ) { + this.optionalFields = optionalFields; + } + + this.anonymousOptionalFields = anonymousOptionalFields; + } + + // ================================================================================================ + + public static GencodeGtfFeature create(long featureOrderNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + GencodeGtfFeature feature; + + // Figure out which kind of feature to make: + switch (featureType) { + case gene: + feature = GencodeGtfGeneFeature.create(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + break; + case transcript: + feature = GencodeGtfTranscriptFeature.create(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + break; + case exon: + feature = GencodeGtfExonFeature.create(featureOrderNumber, chromosomeName, annotationSource, featureType, 
genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + break; + case cds: + feature = GencodeGtfCDSFeature.create(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + break; + case utr: + feature = GencodeGtfUTRFeature.create(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + break; + case start_codon: + feature = GencodeGtfStartCodonFeature.create(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + break; + case stop_codon: + feature = GencodeGtfStopCodonFeature.create(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + break; + case selenocysteine: + feature = GencodeGtfSelenocysteineFeature.create(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, 
genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + break; + default: + throw new UserException.MalformedFile("Unknown type of GencodeGtfFeature: " + featureType); + + } + + return feature; + + } + + /** + * Create a {@link GencodeGtfFeature} based on a line from a Gencode GTF File. + * @param gtfLine A line from a Gencode GTF File to convert into a {@link GencodeGtfFeature} object. + * @return The {@link GencodeGtfFeature} representing the information in {@code gtfLine} + */ + public static GencodeGtfFeature create(String gtfLine) { + return create(gtfLine.split("\t")); + } + + /** + * Create a {@link GencodeGtfFeature} based on a line from a Gencode GTF File. + * @param gtfFields A line from a Gencode GTF File split on the {@code \t} character. + * @return The {@link GencodeGtfFeature} representing the information in {@code gtfLine} + */ + public static GencodeGtfFeature create(String[] gtfFields) { + + // Ensure that the input data are superficially well-formed: + if ( gtfFields.length != GencodeGtfCodec.NUM_COLUMNS ) { + throw new UserException.MalformedFile("Invalid number of fields in the given GENCODE line " + + " - Given: " + gtfFields.length + " Expected: " + GencodeGtfCodec.NUM_COLUMNS); + } + + GencodeGtfFeature feature = null; + + String featureType = gtfFields[2].toLowerCase(); + + // Figure out which kind of feature to make: + if ( featureType.equals("gene") ) { + feature = GencodeGtfGeneFeature.create(gtfFields); + } + else if ( featureType.equals("transcript") ) { + feature = GencodeGtfTranscriptFeature.create(gtfFields); + } + else if ( featureType.equals("exon") ) { + feature = GencodeGtfExonFeature.create(gtfFields); + } + else if ( featureType.equals("cds") ) { + feature = GencodeGtfCDSFeature.create(gtfFields); + } + else if ( featureType.equals("utr") ) { + feature = 
GencodeGtfUTRFeature.create(gtfFields); + } + else if ( featureType.equals("start_codon") ) { + feature = GencodeGtfStartCodonFeature.create(gtfFields); + } + else if ( featureType.equals("stop_codon") ) { + feature = GencodeGtfStopCodonFeature.create(gtfFields); + } + else if ( featureType.equals("selenocysteine") ) { + feature = GencodeGtfSelenocysteineFeature.create(gtfFields); + } + else { + throw new UserException.MalformedFile("Unknown type of GencodeGtfFeature: " + featureType); + } + + return feature; + } + + // ================================================================================================ + + @Override + public String getContig() { + return chromosomeName; + } + + @Override + public int getStart() { + return genomicStartLocation; + } + + @Override + public int getEnd() { + return genomicEndLocation; + } + + // ================================================================================================ + + @Override + public boolean equals(Object other) { + + boolean isEqual = other instanceof GencodeGtfFeature; + + if ( isEqual ) { + + GencodeGtfFeature otherFeature = (GencodeGtfFeature) other; + + // It goes like this: + // If the field is a primitive, just check it. + // If the field is an object: + // check if both this and the other field is null + // if they are not, make sure this one is not null and do a comparison against the other. + // All compacted together in short-circuited comparisons. 
+ isEqual = + (featureOrderNumber == otherFeature.featureOrderNumber) && + (((chromosomeName == null) && (otherFeature.chromosomeName == null)) || ((chromosomeName != null) && chromosomeName.equals(otherFeature.chromosomeName))) && + (((annotationSource == null) && (otherFeature.annotationSource == null)) || ((annotationSource != null) && annotationSource.equals(otherFeature.annotationSource))) && + (((featureType == null) && (otherFeature.featureType == null)) || ((featureType != null) && featureType.equals(otherFeature.featureType))) && + (genomicStartLocation == otherFeature.genomicStartLocation) && + (genomicEndLocation == otherFeature.genomicEndLocation) && + (((genomicStrand == null) && (otherFeature.genomicStrand == null)) || ((genomicStrand != null) && genomicStrand.equals(otherFeature.genomicStrand))) && + (((genomicPhase == null) && (otherFeature.genomicPhase == null)) || ((genomicPhase != null) && genomicPhase.equals(otherFeature.genomicPhase))) && + (((geneId == null) && (otherFeature.geneId == null)) || ((geneId != null) && geneId.equals(otherFeature.geneId))) && + (((transcriptId == null) && (otherFeature.transcriptId == null)) || ((transcriptId != null) && transcriptId.equals(otherFeature.transcriptId))) && + (((geneType == null) && (otherFeature.geneType == null)) || ((geneType != null) && geneType.equals(otherFeature.geneType))) && + (geneStatus == otherFeature.geneStatus) && + (((geneName == null) && (otherFeature.geneName == null)) || ((geneName != null) && geneName.equals(otherFeature.geneName))) && + (((transcriptType == null) && (otherFeature.transcriptType == null)) || ((transcriptType != null) && transcriptType.equals(otherFeature.transcriptType))) && + (transcriptStatus == otherFeature.transcriptStatus) && + (((transcriptName == null) && (otherFeature.transcriptName == null)) || ((transcriptName != null) && transcriptName.equals(otherFeature.transcriptName))) && + (exonNumber == otherFeature.exonNumber) && + (((exonId == null) && 
(otherFeature.exonId == null)) || ((exonId != null) && exonId.equals(otherFeature.exonId))) && + (((locusLevel == null) && (otherFeature.locusLevel == null)) || ((locusLevel != null) && locusLevel.equals(otherFeature.locusLevel))) && + (((anonymousOptionalFields == null) && (otherFeature.anonymousOptionalFields == null)) || ((anonymousOptionalFields != null) && anonymousOptionalFields.equals(otherFeature.anonymousOptionalFields)) && + optionalFields.equals(otherFeature.optionalFields)); + } + + return isEqual; + } + + /** + * Get all the features from this {@link GencodeGtfFeature} itself. + * This is useful to get any subfeatures included in this {@link GencodeGtfFeature}. + * @return A {@link List} of the features represented in this {@link GencodeGtfFeature}. + */ + protected List getAllFeatures() { + ArrayList list = new ArrayList<>(); + list.add(this); + return list; + } + + /** + * Serializes this {@link GencodeGtfFeature} to a string. + * @return a {@link String} representing this {@link GencodeGtfFeature} + */ + private String serializeToString() { + + StringBuilder stringBuilder = new StringBuilder(); + + stringBuilder.append( chromosomeName ); + stringBuilder.append( '\t' ); + stringBuilder.append( annotationSource ); + stringBuilder.append( '\t' ); + stringBuilder.append( featureType ); + stringBuilder.append( '\t' ); + stringBuilder.append( genomicStartLocation ); + stringBuilder.append( '\t' ); + stringBuilder.append( genomicEndLocation ); + stringBuilder.append( "\t.\t" ); + stringBuilder.append( genomicStrand ); + stringBuilder.append( '\t' ); + stringBuilder.append( genomicPhase ); + stringBuilder.append( '\t' ); + + if ( geneId != null ) { + stringBuilder.append("gene_id \""); + stringBuilder.append(geneId); + stringBuilder.append( "\"; " ); + } + if ( transcriptId != null) { + stringBuilder.append("transcript_id \""); + stringBuilder.append(transcriptId); + stringBuilder.append( "\"; " ); + } + if ( geneType != null ) { + 
stringBuilder.append("gene_type \""); + stringBuilder.append(geneType); + stringBuilder.append( "\"; " ); + } + if ( geneStatus != null ) { + stringBuilder.append("gene_status \""); + stringBuilder.append(geneStatus); + stringBuilder.append( "\"; " ); + } + if ( geneName != null ) { + stringBuilder.append("gene_name \""); + stringBuilder.append(geneName); + stringBuilder.append( "\"; " ); + } + if ( transcriptType != null ) { + stringBuilder.append("transcript_type \""); + stringBuilder.append(transcriptType); + stringBuilder.append( "\"; " ); + } + if ( transcriptStatus != null ) { + stringBuilder.append("transcript_status \""); + stringBuilder.append(transcriptStatus); + stringBuilder.append( "\"; " ); + } + if ( transcriptName != null ) { + stringBuilder.append("transcript_name \""); + stringBuilder.append(transcriptName); + stringBuilder.append( "\"; " ); + } + if ( exonNumber != -1 ) { + stringBuilder.append("exon_number "); + stringBuilder.append(exonNumber); + stringBuilder.append( "; " ); + } + if ( exonId != null) { + stringBuilder.append("exon_id \""); + stringBuilder.append(exonId); + stringBuilder.append( "\"; "); + } + if (locusLevel != null) { + stringBuilder.append("level "); + stringBuilder.append(locusLevel); + stringBuilder.append("; "); + } + + // = = = = = = = = = = = = = = = = = = = = = = = + + // Output our optional fields: + stringBuilder.append( + optionalFields.stream().map(Object::toString).collect(Collectors.joining(" ")) + ); + + if ( anonymousOptionalFields != null ) { + stringBuilder.append(anonymousOptionalFields); + } + + return stringBuilder.toString().trim(); + } + + @Override + public String toString() { + StringBuilder stringBuilder = new StringBuilder(); + + List features = getAllFeatures(); + Collections.sort( features ); + + for ( GencodeGtfFeature feature : features ) { + stringBuilder.append( feature.serializeToString() ); + stringBuilder.append("\n"); + } + + return stringBuilder.toString().trim(); + } + + @Override + 
public int hashCode() { + return this.serializeToString().hashCode(); + } + + // ================================================================================================ + + public long getFeatureOrderNumber() { return featureOrderNumber; } + + public String getChromosomeName() { + return chromosomeName; + } + + public AnnotationSource getAnnotationSource() { + return annotationSource; + } + + public FeatureType getFeatureType() { + return featureType; + } + + public int getGenomicStartLocation() { + return genomicStartLocation; + } + + public int getGenomicEndLocation() { + return genomicEndLocation; + } + + public GenomicStrand getGenomicStrand() { + return genomicStrand; + } + + public GenomicPhase getGenomicPhase() { + return genomicPhase; + } + + public String getGeneId() { + return geneId; + } + + public String getTranscriptId() { + return transcriptId; + } + + public GeneTranscriptType getGeneType() { + return geneType; + } + + public String getGeneName() { + return geneName; + } + + public GeneTranscriptType getTranscriptType() { + return transcriptType; + } + + public String getTranscriptName() { + return transcriptName; + } + + public int getExonNumber() { + return exonNumber; + } + + public String getExonId() { + return exonId; + } + + public LocusLevel getLocusLevel() { + return locusLevel; + } + + public ArrayList> getOptionalFields() { + return optionalFields; + } + + public String getAnonymousOptionalFields() { + return anonymousOptionalFields; + } + + public OptionalField getOptionalField(String key) { + for (OptionalField optionalField : optionalFields) { + if ( optionalField.getName().equals(key) ) { + return optionalField; + } + } + return null; + } + + /** + * Comparable interface implementation for {@link GencodeGtfFeature}. 
+ * + * Order is determined by {@link #featureOrderNumber} + * + * @param other {@link GencodeGtfFeature} to which to compare + * @return -1 if this < other; 0 if this == other; 1 if this > other + */ + @Override + public int compareTo(GencodeGtfFeature other) { + return (int)(featureOrderNumber - other.featureOrderNumber); + } + + /** + * Checks if {@code other} is contained within this {@link GencodeGtfFeature}. + * Comparison is made using {@link #genomicStartLocation} and {@link #genomicEndLocation} (both ends inclusive). + * @param other {@link GencodeGtfFeature} of which to check the bounds. + * @return true if {@code other} is contained within the bounds of this {@link GencodeGtfFeature}, false otherwise. + */ + public boolean contains(GencodeGtfFeature other) { + return (other.getStart() >= getStart()) && (other.getEnd() <= getEnd()); + } + + public void setFeatureOrderNumber(long featureOrderNumber) { + this.featureOrderNumber = featureOrderNumber; + } + + // ================================================================================================ + + static public class OptionalField { + + private String name; + private T value; + + public OptionalField(String name, T value) { + this.name = name; + this.value = value; + } + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public T getValue() { + return value; + } + + public void setValue(T value) { + this.value = value; + } + + @Override + public String toString() { + + StringBuilder sb = new StringBuilder(); + + sb.append(name); + sb.append(" "); + + // We need to do some formatting for the numbers / non-numbers in the field: + String valueString = value.toString(); + if ( valueString.matches("\\d\\d*") ) { + sb.append(valueString); + sb.append(";"); + } + else { + sb.append("\""); + sb.append(valueString); + sb.append("\";"); + } + + return sb.toString(); + } + + @Override + public int hashCode() { + return 
this.toString().hashCode(); + } + + @Override + public boolean equals(Object other) { + + if ( !(other instanceof OptionalField) ) { + return false; + } + + OptionalField otherOptionalField = (OptionalField) other; + + return (name.equals(otherOptionalField.name)) && + (value.equals(otherOptionalField.value)); + } + } + + // ================================================================================================ + + public enum AnnotationSource { + ENSEMBL, + HAVANA + } + + public enum FeatureType { + gene("gene"), + transcript("transcript"), + selenocysteine("Selenocysteine"), + exon("exon"), + cds("CDS"), + start_codon("start_codon"), + stop_codon("stop_codon"), + utr("UTR"); + + private String serialized; + + FeatureType(String serializedValue) { serialized = serializedValue; } + + @Override + public String toString() { return serialized; } + + public static FeatureType getEnum(String s) { + for( FeatureType val : values() ) { + if(val.serialized.equalsIgnoreCase(s)) { + return val; + } + } + throw new IllegalArgumentException(); + } + } + + public enum GenomicStrand { + FORWARD("+"), + BACKWARD("-"); + + private String serialized; + + GenomicStrand(String serializedValue) { + serialized = serializedValue; + } + + @Override + public String toString() { + return serialized; + } + + public static GenomicStrand getEnum(String s) { + for( GenomicStrand val : values() ) { + if(val.serialized.equalsIgnoreCase(s)) { + return val; + } + } + throw new IllegalArgumentException(); + } + } + + public enum GenomicPhase { + ZERO("0"), + ONE ("1"), + TWO ("2"), + DOT ("."); + + private String serialized; + + GenomicPhase(String serializedValue) { + serialized = serializedValue; + } + + @Override + public String toString() { + return serialized; + } + + public static GenomicPhase getEnum(String s) { + for( GenomicPhase val : values() ) { + if(val.serialized.equalsIgnoreCase(s)) { + return val; + } + } + throw new IllegalArgumentException(); + } + } + + public enum 
GeneTranscriptType { + // Immunoglobulin (Ig) variable chain and T-cell receptor (TcR) genes imported or annotated according to the IMGT (http://www.imgt.org/) + IG_C_gene, + IG_D_gene, + IG_J_gene, + IG_LV_gene, + IG_V_gene, + TR_C_gene, + TR_J_gene, + TR_V_gene, + TR_D_gene, + + // Inactivated immunoglobulin gene. + IG_pseudogene, + IG_C_pseudogene, + IG_J_pseudogene, + IG_V_pseudogene, + TR_V_pseudogene, + TR_J_pseudogene, + + // Non-coding RNA predicted using sequences from Rfam (http://rfam.xfam.org/) and miRBase (http://www.mirbase.org/) + Mt_rRNA, + Mt_tRNA, + miRNA, + misc_RNA, + rRNA, + scRNA, + snRNA, + snoRNA, + ribozyme, + sRNA, + scaRNA, + + // Non-coding RNA predicted to be pseudogene by the Ensembl pipeline + Mt_tRNA_pseudogene, + tRNA_pseudogene, + snoRNA_pseudogene, + snRNA_pseudogene, + scRNA_pseudogene, + rRNA_pseudogene, + misc_RNA_pseudogene, + miRNA_pseudogene, + + // To be Experimentally Confirmed. This is used for non-spliced EST clusters that have polyA features. This category has been specifically created for the ENCODE project to highlight regions that could indicate the presence of protein coding genes that require experimental validation, either by 5' RACE or RT-PCR to extend the transcripts, or by confirming expression of the putatively-encoded peptide with specific antibodies. + TEC, + + // If the coding sequence (following the appropriate reference) of a transcript finishes >50bp from a downstream splice site then it is tagged as NMD. If the variant does not cover the full reference coding sequence then it is annotated as NMD if NMD is unavoidable i.e. no matter what the exon structure of the missing portion is the transcript will be subject to NMD. + nonsense_mediated_decay, + + // Transcript that has polyA features (including signal) without a prior stop codon in the CDS, i.e. a non-genomic polyA tail attached directly to the CDS without 3' UTR. These transcripts are subject to degradation. 
+ non_stop_decay, + + // Alternatively spliced transcript believed to contain intronic sequence relative to other, coding, variants. + retained_intron, + + // Contains an open reading frame (ORF). + protein_coding, + + // Doesn't contain an ORF. + processed_transcript, + + // Transcript which is known from the literature to not be protein coding. + non_coding, + + // Transcript believed to be protein coding, but with more than one possible open reading frame. + ambiguous_orf, + + // Long non-coding transcript in introns of a coding gene that does not overlap any exons. + sense_intronic, + + // Long non-coding transcript that contains a coding gene in its intron on the same strand. + sense_overlapping, + + // Has transcripts that overlap the genomic span (i.e. exon or introns) of a protein-coding locus on the opposite strand. + antisense, + + known_ncrna, + + // Have homology to proteins but generally suffer from a disrupted coding sequence and an active homologous gene can be found at another locus. Sometimes these entries have an intact coding sequence or an open but truncated ORF, in which case there is other evidence used (for example genomic polyA stretches at the 3' end) to classify them as a pseudogene. Can be further classified as one of the following. + pseudogene, + + // Pseudogene that lack introns and is thought to arise from reverse transcription of mRNA followed by reinsertion of DNA into the genome. + processed_pseudogene, + + // Pseudogene owing to a SNP/DIP but in other individuals/haplotypes/strains the gene is translated. + polymorphic_pseudogene, + + // Pseudogene owing to a reverse transcribed and re-inserted sequence. + retrotransposed, + + // Pseudogene where protein homology or genomic structure indicates a pseudogene, but the presence of locus-specific transcripts indicates expression. 
+ transcribed_processed_pseudogene, + transcribed_unprocessed_pseudogene, + transcribed_unitary_pseudogene, + + // Pseudogene that has mass spec data suggesting that it is also translated. + translated_processed_pseudogene, + translated_unprocessed_pseudogene, + + // A species specific unprocessed pseudogene without a parent gene, as it has an active orthologue in another species. + unitary_pseudogene, + + // Pseudogene that can contain introns since produced by gene duplication. + unprocessed_pseudogene, + + // Used to tag mistakes in the public databases (Ensembl/SwissProt/Trembl) + artifact, + + // Long, intervening noncoding (linc) RNA that can be found in evolutionarily conserved, intergenic regions. + lincRNA, + + // Unspliced lncRNA that is several kb in size. + macro_lncRNA, + + // Transcript where ditag and/or published experimental data strongly supports the existence of short non-coding transcripts transcribed from the 3'UTR. + three_prime_overlapping_ncRNA, + + // Otherwise viable coding region omitted from this alternatively spliced transcript because the splice variation affects a region coding for a protein domain. + disrupted_domain, + + // Short non coding RNA gene that forms part of the vault ribonucleoprotein complex. + vaultRNA, + + // A non-coding locus that originates from within the promoter region of a protein-coding gene, with transcription proceeding in the opposite direction on the other strand. + bidirectional_promoter_lncRNA; + + public static GeneTranscriptType getEnum(String s) { + + if (s.startsWith("3")) { + s = "three_" + s.substring(1); + } + + // Looks like sometimes RNA is spelled `rna`. 
+ // Here's a fix for that: + s = s.replace( "rna", "RNA" ); + + return GeneTranscriptType.valueOf(s); + } + + @Override + public String toString() { + String s = super.toString(); + + if ( s.startsWith("three_") ) { + s = "3" + s.substring(7); + } + return s; + } + + } + + public enum GeneTranscriptStatus { + KNOWN, + NOVEL, + PUTATIVE + } + + public enum LocusLevel { + // Verified locus + ONE("1"), + // Manually annotated locus + TWO("2"), + // Automatically annotated locus + THREE("3"); + + private String serialized; + + LocusLevel(String serializedValue) { + serialized = serializedValue; + } + + @Override + public String toString() { + return serialized; + } + + public static LocusLevel getEnum(String s) { + for( LocusLevel val : values() ) { + if(val.serialized.equalsIgnoreCase(s)) { + return val; + } + } + throw new IllegalArgumentException(); + } + } + + public enum FeatureTag { + // 3' end extended based on RNA-seq data. + three_nested_supported_extension, + + // 3' end extended based on RNA-seq data. + three_standard_supported_extension, + + // annotated based on RNA-seq data. + fourfivefour_RNA_Seq_supported, + + // 5' end extended based on RNA-seq data. + five_nested_supported_extension, + + // 5' end extended based on RNA-seq data. + five_standard_supported_extension, + + // shares an identical CDS but has alternative 5' UTR with respect to a reference variant. + alternative_3_UTR, + + // shares an identical CDS but has alternative 3' UTR with respect to a reference variant. 
+ alternative_5_UTR, + + // (This flag corresponds to the older flag "appris_principal") Where the transcript expected to code for the main + appris_principal_1, + + // (This flag corresponds to the older flag "appris_candidate_ccds") Where the APPRIS core modules are unable to choose a + appris_principal_2, + + // Where the APPRIS core modules are unable to choose a clear principal variant and there more than one of the variants + appris_principal_3, + + // (This flag corresponds to the Ensembl 78 flag "appris_candidate_longest_ccds") Where the APPRIS core modules are unable + appris_principal_4, + + // (This flag corresponds to the Ensembl 78 flag "appris_candidate_longest_seq") Where the APPRIS core modules are unable + appris_principal_5, + + // Candidate transcript(s) models that are conserved in at least three tested non-primate species. + appris_alternative_1, + + // Candidate transcript(s) models that appear to be conserved in fewer than three tested non-primate species. + appris_alternative_2, + + // ranscript expected to code for the main functional isoform based on a range of protein features (APPRIS pipeline). + appris_principal, + + // where there is no single 'appris_principal' variant the main functional isoform will be translated from one of the + appris_candidate, + + // he "appris_candidate" transcript that has an unique CCDS. + appris_candidate_ccds, + + // where there is no 'appris_principal' variant, the candidate with highest APPRIS score is selected as the primary + appris_candidate_highest_score, + + // where there is no 'appris_principal' variant, the longest of the 'appris_candidate' variants is selected as the primary + appris_candidate_longest, + + // he "appris_candidate" transcripts where there are several CCDS, in this case APPRIS labels the longest CCDS. 
+ appris_candidate_longest_ccds, + + // where there is no "appris_candidate_ccds" or "appris_candidate_longest_ccds" variant, the longest protein of the + appris_candidate_longest_seq, + + // identifies a subset of representative transcripts for each gene; prioritises full-length protein coding transcripts + basic, + + // ranscript contains two confidently annotated CDSs. Support may come from eg proteomic data, cross-species conservation + bicistronic, + + // ranscript 5' end overlaps ENCODE or Fantom CAGE cluster. + CAGE_supported_TSS, + + // member of the consensus CDS gene set, confirming coding regions between ENSEMBL, UCSC, NCBI and HAVANA. + CCDS, + + // he coding region end could not be confirmed. + cds_end_NF, + + // he coding region start could not be confirmed. + cds_start_NF, + + // ranscript QC checked using dotplot to identify features eg splice junctions, end of homology. + dotter_confirmed, + + // an upstream ATG is used where a downstream ATG seems more evolutionary conserved. + downstream_ATG, + + // ranscript was tested and confirmed experimentally. + exp_conf, + + // locus consists of non-overlapping transcript fragments either because of genome assembly issues (i.e., gaps or + fragmented_locus, + + // ranscript model contains all possible in-frame exons supported by homology, experimental evidence or conservation, but + inferred_exon_combination, + + // ranscript model is not supported by a single piece of transcript evidence. May be supported by multiple fragments of + inferred_transcript_model, + + // ranscript supported by transcript evidence that, while ampping best-in-genome, shows regions of poor sequence quality. + low_sequence_quality, + + // he mRNA end could not be confirmed. + mRNA_end_NF, + + // he mRNA start could not be confirmed. 
+ mRNA_start_NF, + + // in-frame type of variation where, at the acceptor site, some variants splice after the first AG and others after the + NAGNAG_splice_site, + + // he locus is a host for small non-coding RNAs. + ncRNA_host, + + // annotated based on RNA-seq data. + nested_454_RNA_Seq_supported, + + // he transcript looks like it is subject to NMD but publications, experiments or conservation support the translation of + NMD_exception, + + // codon if the transcript were longer but cannot currently be annotated as NMD as does not fulfil all criteria - most + NMD_likely_if_extended, + + // he CDS has a non-ATG start and its validity is supported by publication or conservation. + non_ATG_start, + + // he transcript has a non-canonical splice site conserved in other species. + non_canonical_conserved, + + // he transcript has a non-canonical splice site explained by a genomic sequencing error. + non_canonical_genome_sequence_error, + + // he transcript has a non-canonical splice site explained by other reasons. + non_canonical_other, + + // he transcript has a non-canonical splice site explained by a SNP. + non_canonical_polymorphism, + + // he transcript has a non-canonical splice site that needs experimental confirmation. + non_canonical_TEC, + + // he transcript has a non-canonical splice site explained by a U12 intron (i.e. AT-AC splice site). + non_canonical_U12, + + // a splice variant for which supporting evidence has not been submitted to databases, i.e. the model is based on + non_submitted_evidence, + + // a transcript is supported by evidence from same species paralogous loci. + not_best_in_genome_evidence, + + // evidence from other species was used to build model. + not_organism_supported, + + // protein-coding locus with no paralogues or orthologs. + orphan, + + // exon(s) of the locus overlap exon(s) of a readthrough transcript or a transcript belonging to another locus. 
+ overlapping_locus, + + // a low confidence upstream ATG existing in other coding variant would lead to NMD in this trancript, that uses the high + overlapping_uORF, + + // annotation in the pseudo-autosomal region, which is duplicated between chromosomes X and Y. + PAR, + + // member of the pseudogene set predicted by YALE, UCSC and HAVANA. + pseudo_consens, + + // a transcript that overlaps two or more independent loci but is considered to belong to a third, separate locus. + readthrough_transcript, + + // locus overlaps a sequence error or an assembly error in the reference genome that affects its annotation (e.g., 1 or + reference_genome_error, + + // internal intron of CDS portion of transcript is retained. + retained_intron_CDS, + + // final intron of CDS portion of transcript is retained. + retained_intron_final, + + // first intron of CDS portion of transcript is retained. + retained_intron_first, + + // protein-coding locus created via retrotransposition. + retrogene, + + // ranscript supported by RNAseq data and not supported by mRNA or EST evidence. + RNA_Seq_supported_only, + + // ranscript annotated based on mixture of RNA-seq data and EST/mRNA/protein evidence. + RNA_Seq_supported_partial, + + // ranscript that contains a CDS that has a translation initiation site supported by Ribosomal Profiling data. + RP_supported_TIS, + + // contains a selenocysteine. + seleno, + + // a processed pseudogene with one or more introns still present. These are likely formed through the retrotransposition + semi_processed, + + // ranscript contains at least 1 non-canonical splice junction that is associated with a known or novel genome sequence + sequence_error, + + // an upstream ATG exists when a downstream ATG is better supported. 
+        upstream_ATG,
+
+        // a low confidence upstream ATG existing in other coding variant would lead to NMD in this transcript, that uses the high
+        upstream_uORF;
+
+        /**
+         * Converts tag text into the corresponding {@link FeatureTag}.
+         *
+         * A leading {@code 3}, {@code 5} or {@code 454} is replaced by {@code three}, {@code five} or
+         * {@code fourfivefour} before the constant is looked up, because a Java identifier cannot begin with a digit.
+         *
+         * @param s the tag text to convert.
+         * @return the {@link FeatureTag} whose (rewritten) name equals {@code s}.
+         * @throws IllegalArgumentException if {@code valueOf} finds no constant with that name.
+         */
+        public static FeatureTag getEnum(String s) {
+
+            if ( s.startsWith("3") ){
+                s = "three" + s.substring(1);
+            }
+            else if ( s.startsWith("5") ){
+                s = "five" + s.substring(1);
+            }
+            else if ( s.startsWith("454") ){
+                s = "fourfivefour" + s.substring(3);
+            }
+
+            return FeatureTag.valueOf(s);
+        }
+
+        /**
+         * Returns the tag text for this constant; the inverse of {@link #getEnum(String)}.
+         *
+         * @return the constant name with a leading {@code three_}, {@code five_} or {@code fourfivefour_}
+         *         turned back into {@code 3_}, {@code 5_} or {@code 454_}.
+         */
+        @Override
+        public String toString() {
+            String s = super.toString();
+
+            // Only the spelled-out number is swapped for its digits. The underscore after it belongs to the
+            // tag itself (getEnum keeps it), so it must survive for getEnum(toString()) to round-trip.
+            if ( s.startsWith("three_") ){
+                s = "3" + s.substring("three".length());
+            }
+            else if ( s.startsWith("five_") ){
+                s = "5" + s.substring("five".length());
+            }
+            else if ( s.startsWith("fourfivefour_") ){
+                s = "454" + s.substring("fourfivefour".length());
+            }
+
+            return s;
+        }
+    }
+
+    public enum TranscriptSupportLevel {
+        /** all splice junctions of the transcript are supported by at least one non-suspect mRNA */
+        ONE("1"),
+
+        /** the best supporting mRNA is flagged as suspect or the support is from multiple ESTs */
+        TWO("2"),
+
+        /** the only support is from a single EST */
+        THREE("3"),
+
+        /** the best supporting EST is flagged as suspect */
+        FOUR("4"),
+
+        /** no single transcript supports the model structure */
+        FIVE("5"),
+
+        /** the transcript was not analyzed */
+        NA("NA");
+
+        /** Text form of this level, as returned by {@link #toString()} and matched by {@link #getEnum(String)}. */
+        private final String serialized;
+
+        TranscriptSupportLevel(String serializedValue) {
+            serialized = serializedValue;
+        }
+
+        @Override
+        public String toString() {
+            return serialized;
+        }
+
+        /**
+         * Finds the level whose text form equals {@code s}, ignoring case.
+         *
+         * @param s text to look up.
+         * @return the matching {@link TranscriptSupportLevel}.
+         * @throws IllegalArgumentException if no level matches.
+         */
+        public static TranscriptSupportLevel getEnum(String s) {
+            for( TranscriptSupportLevel val : values() ) {
+                if(val.serialized.equalsIgnoreCase(s)) {
+                    return val;
+                }
+            }
+            throw new IllegalArgumentException("Unexpected value: " + s);
+        }
+    }
+
+    public enum RemapStatus {
+        full_contig,
+        full_fragment,
+        partial,
+        deleted,
+        no_seq_map,
+        gene_conflict,
+        gene_size_change,
+        automatic_small_ncrna_gene,
+        automatic_gene,
+        pseudogene
+    }
+
+    public enum RemapTargetStatus {
+        NEW("new"),
+        LOST("lost"),
+        OVERLAP("overlap"),
+        NONOVERLAP("nonOverlap");
+
+        /** Text form of this status, as returned by {@link #toString()} and matched by {@link #getEnum(String)}. */
+        private final String serialized;
+
+        RemapTargetStatus(String serializedValue) {
+            serialized = serializedValue;
+        }
+
+        @Override
+        public String toString() {
+            return serialized;
+        }
+
+        /**
+         * Finds the status whose text form equals {@code s}, ignoring case.
+         *
+         * @param s text to look up.
+         * @return the matching {@link RemapTargetStatus}.
+         * @throws IllegalArgumentException if no status matches.
+         */
+        public static RemapTargetStatus getEnum(String s) {
+            for( RemapTargetStatus val : values() ) {
+                if(val.serialized.equalsIgnoreCase(s)) {
+                    return val;
+                }
+            }
+            throw new IllegalArgumentException("Unexpected value: " + s);
+        }
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfGeneFeature.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfGeneFeature.java
new file mode 100644
index 00000000000..10b8c75ae1b
--- /dev/null
+++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfGeneFeature.java
@@ -0,0 +1,121 @@
+package org.broadinstitute.hellbender.utils.codecs.GENCODE;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A Gencode GTF Feature representing a gene.
+ *
+ * A GTF Feature represents one row of a GTF File.
+ * The specification of a GTF file is defined here:
+ * http://mblab.wustl.edu/GTF22.html
+ *
+ * Created by jonn on 7/25/17.
+ */ +final public class GencodeGtfGeneFeature extends GencodeGtfFeature { + + private GencodeGtfGeneFeature(String[] gtfFields) { + super(gtfFields); + } + + public static GencodeGtfFeature create(String[] gtfFields) { + return new GencodeGtfGeneFeature(gtfFields); + } + + private GencodeGtfGeneFeature(long lineNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + super(lineNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + + public static GencodeGtfFeature create(long lineNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + return new GencodeGtfGeneFeature(lineNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, 
transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + + // ================================================================================================ + + private ArrayList transcripts = new ArrayList<>(); + + // ================================================================================================ + + public void addTranscript(GencodeGtfTranscriptFeature transcript) { transcripts.add(transcript); } + + public ArrayList getTranscripts() { + return transcripts; + } + + @Override + protected List getAllFeatures() { + ArrayList list = new ArrayList<>(); + list.add(this); + + for ( GencodeGtfTranscriptFeature transcript : transcripts ) { + list.addAll(transcript.getAllFeatures()); + } + + return list; + } + + @Override + public boolean equals(Object other) { + if ( (!(other instanceof GencodeGtfGeneFeature)) ) { + return false; + } + + GencodeGtfGeneFeature otherGene = (GencodeGtfGeneFeature) other; + + if ( !super.equals(otherGene) ) { + return false; + } + + for ( int i = 0 ; i < transcripts.size(); ++i ) { + if ( !transcripts.get(i).equals(otherGene.transcripts.get(i))) { + return false; + } + } + + + return true; + } + +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfSelenocysteineFeature.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfSelenocysteineFeature.java new file mode 100644 index 00000000000..c1fef720edd --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfSelenocysteineFeature.java @@ -0,0 +1,74 @@ +package org.broadinstitute.hellbender.utils.codecs.GENCODE; + +import java.util.ArrayList; + +/** + * A Gencode GTF Feature representing a selenocysteine. + * + * A GTF Feature represents one row of a GTF File. 
+ * The specification of a GTF file is defined here: + * http://mblab.wustl.edu/GTF22.html + * + * Created by jonn on 7/25/17. + */ +final public class GencodeGtfSelenocysteineFeature extends GencodeGtfFeature { + + private GencodeGtfSelenocysteineFeature(String[] gtfFields) { + super(gtfFields); + } + + public static GencodeGtfFeature create(String[] gtfFields) { + return new GencodeGtfSelenocysteineFeature(gtfFields); + } + + private GencodeGtfSelenocysteineFeature(long featureOrderNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + super(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + + public static GencodeGtfFeature create(long featureOrderNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String 
anonymousOptionalFields) { + + return new GencodeGtfSelenocysteineFeature(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfStartCodonFeature.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfStartCodonFeature.java new file mode 100644 index 00000000000..be1a7162a9d --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfStartCodonFeature.java @@ -0,0 +1,75 @@ +package org.broadinstitute.hellbender.utils.codecs.GENCODE; + +import java.util.ArrayList; + +/** + * A Gencode GTF Feature representing a start codon. + * + * A GTF Feature represents one row of a GTF File. + * The specification of a GTF file is defined here: + * http://mblab.wustl.edu/GTF22.html + * + * Created by jonn on 7/25/17. 
+ */ +// {gene,transcript,exon,CDS,UTR,start_codon,stop_codon,Selenocysteine}` +final public class GencodeGtfStartCodonFeature extends GencodeGtfFeature { + + private GencodeGtfStartCodonFeature(String[] gtfFields) { + super(gtfFields); + } + + public static GencodeGtfFeature create(String[] gtfFields) { + return new GencodeGtfStartCodonFeature(gtfFields); + } + + private GencodeGtfStartCodonFeature(long featureOrderNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + super(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + + public static GencodeGtfFeature create(long featureOrderNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + return new 
GencodeGtfStartCodonFeature(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfStopCodonFeature.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfStopCodonFeature.java new file mode 100644 index 00000000000..c4bb9ed3282 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfStopCodonFeature.java @@ -0,0 +1,74 @@ +package org.broadinstitute.hellbender.utils.codecs.GENCODE; + +import java.util.ArrayList; + +/** + * A Gencode GTF Feature representing a stop codon. + * + * A GTF Feature represents one row of a GTF File. + * The specification of a GTF file is defined here: + * http://mblab.wustl.edu/GTF22.html + * + * Created by jonn on 7/25/17. 
+ */ +final public class GencodeGtfStopCodonFeature extends GencodeGtfFeature { + + private GencodeGtfStopCodonFeature(String[] gtfFields) { + super(gtfFields); + } + + public static GencodeGtfFeature create(String[] gtfFields) { + return new GencodeGtfStopCodonFeature(gtfFields); + } + + private GencodeGtfStopCodonFeature(long featureOrderNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + super(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + + public static GencodeGtfFeature create(long featureOrderNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + return new GencodeGtfStopCodonFeature(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, 
genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + +} diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfTranscriptFeature.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfTranscriptFeature.java new file mode 100644 index 00000000000..a7e21691edf --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfTranscriptFeature.java @@ -0,0 +1,144 @@ +package org.broadinstitute.hellbender.utils.codecs.GENCODE; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +/** + * A Gencode GTF Feature representing a transcript. + * + * A GTF Feature represents one row of a GTF File. + * The specification of a GTF file is defined here: + * http://mblab.wustl.edu/GTF22.html + * + * Created by jonn on 7/25/17. 
+ */ +final public class GencodeGtfTranscriptFeature extends GencodeGtfFeature { + + private GencodeGtfTranscriptFeature(String[] gtfFields) { + super(gtfFields); + } + + public static GencodeGtfFeature create(String[] gtfFields) { + return new GencodeGtfTranscriptFeature(gtfFields); + } + + private GencodeGtfTranscriptFeature(long featureOrderNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + super(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + + public static GencodeGtfFeature create(long featureOrderNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + return new GencodeGtfTranscriptFeature(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, 
genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + + // ================================================================================================ + + private ArrayList exons = new ArrayList<>(); + private ArrayList selenocysteines = new ArrayList<>(); + private ArrayList utrs = new ArrayList<>(); + + // ================================================================================================ + + public ArrayList getExons() { + return exons; + } + + public void addExon( GencodeGtfExonFeature exon ) { + exons.add(exon); + } + + public ArrayList getSelenocysteines() { + return selenocysteines; + } + + public void addSelenocysteine( GencodeGtfSelenocysteineFeature selenocysteine ) { + selenocysteines.add(selenocysteine); + } + + public ArrayList getUtrs() { + return utrs; + } + + public void addUtr( GencodeGtfUTRFeature utr ) { utrs.add(utr); } + + @Override + protected List getAllFeatures() { + ArrayList list = new ArrayList<>(); + list.add(this); + + for ( GencodeGtfExonFeature feature : exons ) { + list.addAll(feature.getAllFeatures()); + } + + for ( GencodeGtfSelenocysteineFeature feature : selenocysteines ) { + list.addAll(feature.getAllFeatures()); + } + + for ( GencodeGtfUTRFeature feature : utrs ) { + list.addAll(feature.getAllFeatures()); + } + + return list; + } + + @Override + public boolean equals(Object other) { + if ( (!(other instanceof GencodeGtfTranscriptFeature)) ) { + return false; + } + + GencodeGtfTranscriptFeature otherTranscript = (GencodeGtfTranscriptFeature) other; + + if ( (!super.equals(otherTranscript)) || + (!exons.equals(otherTranscript.exons)) || + (!selenocysteines.equals(otherTranscript.selenocysteines)) || + (!utrs.equals(otherTranscript.utrs))) { + return false; + } + + return true; + } +} diff --git 
a/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfUTRFeature.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfUTRFeature.java new file mode 100644 index 00000000000..6863368d444 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfUTRFeature.java @@ -0,0 +1,74 @@ +package org.broadinstitute.hellbender.utils.codecs.GENCODE; + +import java.util.ArrayList; + +/** + * A Gencode GTF Feature representing an untranslated region. + * + * A GTF Feature represents one row of a GTF File. + * The specification of a GTF file is defined here: + * http://mblab.wustl.edu/GTF22.html + * + * Created by jonn on 7/25/17. + */ +final public class GencodeGtfUTRFeature extends GencodeGtfFeature { + + private GencodeGtfUTRFeature(String[] gtfFields) { + super(gtfFields); + } + + public static GencodeGtfFeature create(String[] gtfFields) { + return new GencodeGtfUTRFeature(gtfFields); + } + + private GencodeGtfUTRFeature(long featureOrderNumber, + String chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + super(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + + public static GencodeGtfFeature create(long featureOrderNumber, + String 
chromosomeName, + AnnotationSource annotationSource, + FeatureType featureType, + int genomicStartLocation, + int genomicEndLocation, + GenomicStrand genomicStrand, + GenomicPhase genomicPhase, + String geneId, + String transcriptId, + GeneTranscriptType geneType, + GeneTranscriptStatus geneStatus, + String geneName, + GeneTranscriptType transcriptType, + GeneTranscriptStatus transcriptStatus, + String transcriptName, + int exonNumber, + String exonId, + LocusLevel locusLevel, + ArrayList> optionalFields, + String anonymousOptionalFields) { + + return new GencodeGtfUTRFeature(featureOrderNumber, chromosomeName, annotationSource, featureType, genomicStartLocation, genomicEndLocation, genomicStrand, genomicPhase, geneId, transcriptId, geneType, geneStatus, geneName, transcriptType, transcriptStatus, transcriptName, exonNumber, exonId, locusLevel, optionalFields, anonymousOptionalFields); + } + +} diff --git a/src/test/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCodecUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCodecUnitTest.java new file mode 100644 index 00000000000..4f5d891c421 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/utils/codecs/GENCODE/GencodeGtfCodecUnitTest.java @@ -0,0 +1,3336 @@ +package org.broadinstitute.hellbender.utils.codecs.GENCODE; + +import htsjdk.tribble.Tribble; +import htsjdk.tribble.index.Index; +import htsjdk.tribble.index.IndexFactory; +import htsjdk.tribble.readers.LineIterator; +import org.broadinstitute.hellbender.engine.FeatureDataSource; +import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.test.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.*; +import java.nio.file.Files; +import java.util.*; + +import static java.nio.file.StandardCopyOption.REPLACE_EXISTING; + + +/** + * Test class for the GENCODE GTF Reader. 
+ * Modeled after the TableCodecUnitTest, with extras specific to this file format. + * Created by jonn on 7/27/17. + */ +public class GencodeGtfCodecUnitTest extends BaseTest { + + private static final String testResourceDir = publicTestDir + "org/broadinstitute/hellbender/utils/codecs/GENCODE/"; + private static final String xyzTestFile = largeFileTestDir + "gencode.v26.primary_assembly.annotation.XYZ.gtf"; + private static final String gencodeHg19TestFile = largeFileTestDir + "gencode.v19.LargeFile.gtf"; + + /** + * Checks the given feature and the given start and end positions for whether the regions overlap. + * @param feature {@link GencodeGtfFeature} to use for overlap checking + * @param contig Contig to check for overlap + * @param start Interval start position to check for overlap + * @param end Interval end position to check for overlap + * @return {@code true} if the region in the {@link GencodeGtfFeature} and the given interval overlap, {@code false} otherwise. + */ + private boolean checkForOverlap(final GencodeGtfFeature feature, + final String contig, + final int start, + final int end) { + + boolean overlaps = feature.getChromosomeName().equals(contig); + + // Check for any overlap. + // This includes overlapping on either end, as well as either region + // being contained within the other. + overlaps = overlaps && + ((start >= feature.getStart() ) && (start <= feature.getEnd())) || + ((end >= feature.getStart() ) && (end <= feature.getEnd())) || + ((start <= feature.getStart() ) && (end >= feature.getEnd())); + + return overlaps; + } + + /** + * Tests that a given query in a file returns the correct number of results + * @param contig Contiguous region in the genome in which to search. + * @param start Start position within {@code contig} in which to search. + * @param end End position within {@code contig} in which to search. + * @param numExpectedGenes The number of expected results from the query. 
+ * @param testFile A GENCODE GTF {@link File} to query against. + */ + private void testIndexHelper(String contig, int start, int end, int numExpectedGenes, File testFile) { + // Now we do our queries: + try (FeatureDataSource featureDataSource = new FeatureDataSource<>(testFile) ) + { + final Iterator it = featureDataSource.query( new SimpleInterval(contig, start, end) ); + + int geneCount = 0; + + for ( ; it.hasNext() ; ) { + + GencodeGtfFeature feature = it.next(); + + // Verify the bounds: + Assert.assertTrue( checkForOverlap(feature, contig, start, end) ); + + // Keep track of how many genes we've seen: + ++geneCount; + } + + Assert.assertEquals( geneCount, numExpectedGenes ); + } + } + + /** + * Creates a valid {@link GencodeGtfGeneFeature} that corresponds to the data in the file {@code gencode.valid1.gtf} + * @return a {@link GencodeGtfGeneFeature} representing the data in the file {@code gencode.valid1.gtf} + */ + private GencodeGtfGeneFeature createGencodeGtfGene_valid1() { + + // Create the Features as they exist in the test file: + GencodeGtfGeneFeature gene = (GencodeGtfGeneFeature)GencodeGtfFeature.create(6, "chr1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.gene, + 30366, 30503, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000284332.1", null, GencodeGtfFeature.GeneTranscriptType.miRNA, + null, "MIR1302-2", null, null, null, -1, null, GencodeGtfFeature.LocusLevel.THREE, null, null); + + GencodeGtfTranscriptFeature transcript = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(7, "chr1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.transcript, + 30366, 30503, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000284332.1", "ENST00000607096.1", GencodeGtfFeature.GeneTranscriptType.miRNA, + null, "MIR1302-2", GencodeGtfFeature.GeneTranscriptType.miRNA, null, "MIR1302-2-201", -1, null, 
GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.NA), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic) + ) + ), + null + ); + + GencodeGtfExonFeature exon = (GencodeGtfExonFeature) GencodeGtfFeature.create(8, "chr1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 30366, 30503, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000284332.1", "ENST00000607096.1", GencodeGtfFeature.GeneTranscriptType.miRNA, + null, "MIR1302-2", GencodeGtfFeature.GeneTranscriptType.miRNA, null, "MIR1302-2-201", 1, "ENSE00003695741.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.NA), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic) + ) + ), + null + ); + + // Aggregate the Features as they should be: + transcript.addExon(exon); + gene.addTranscript(transcript); + + return gene; + } + + /** + * Creates a valid {@link GencodeGtfGeneFeature} that corresponds to the data in the file {@code gencode.valid_gencode_file2.gtf} + * @return a {@link GencodeGtfGeneFeature} representing the data in the file {@code gencode.valid_gencode_file2.gtf} + */ + private GencodeGtfGeneFeature createGencodeGtfGene_file2() { + + // Let's define all our features up front: + + GencodeGtfGeneFeature gene1 = (GencodeGtfGeneFeature) GencodeGtfFeature.create(6, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.gene, + 50200979, 50217616, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", null, GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", null, null, null, -1, null, GencodeGtfFeature.LocusLevel.TWO, + new 
ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfTranscriptFeature transcript1 = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(7, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.transcript, + 50200979, 50217615, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", -1, null, GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfExonFeature exon1 = (GencodeGtfExonFeature) GencodeGtfFeature.create(8, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 50200979, 50201590, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 1, "ENSE00001541223.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", 
GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfCDSFeature cds1 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(9, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 50201037, 50201590, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 1, "ENSE00001541223.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfStartCodonFeature start_codon1 = (GencodeGtfStartCodonFeature) GencodeGtfFeature.create(10, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.start_codon, + 50201037, 50201039, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 1, "ENSE00001541223.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", 
GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfExonFeature exon2 = (GencodeGtfExonFeature) GencodeGtfFeature.create(11, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 50206317, 50206520, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 2, "ENSE00001129529.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfCDSFeature cds2 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(12, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 50206317, 50206520, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 2, "ENSE00001129529.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + 
new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfExonFeature exon3 = (GencodeGtfExonFeature) GencodeGtfFeature.create(13, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 50208536, 50208716, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 3, "ENSE00001129524.2", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfCDSFeature cds3 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(14, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 50208536, 50208716, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 3, "ENSE00001129524.2", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new 
GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfExonFeature exon4 = (GencodeGtfExonFeature) GencodeGtfFeature.create(15, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 50210181, 50210311, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 4, "ENSE00003473644.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfCDSFeature cds4 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(16, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 50210181, 50210311, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 4, "ENSE00003473644.1", 
GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfExonFeature exon5 = (GencodeGtfExonFeature) GencodeGtfFeature.create(17, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 50210631, 50210911, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 5, "ENSE00003503715.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfCDSFeature cds5 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(18, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 50210631, 50210911, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 5, "ENSE00003503715.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfExonFeature exon6 = (GencodeGtfExonFeature) GencodeGtfFeature.create(19, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 50215717, 50215867, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 6, "ENSE00003573348.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfCDSFeature cds6 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(20, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 50215717, 50215867, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.TWO, "ENSG00000073169.13", "ENST00000611222.1", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 6, "ENSE00003573348.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfExonFeature exon7 = (GencodeGtfExonFeature) GencodeGtfFeature.create(21, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 50216691, 50216876, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 7, "ENSE00003510005.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfCDSFeature cds7 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(22, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 50216691, 50216876, GencodeGtfFeature.GenomicStrand.FORWARD, 
GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 7, "ENSE00003510005.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfExonFeature exon8 = (GencodeGtfExonFeature) GencodeGtfFeature.create(23, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 50216972, 50217128, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 8, "ENSE00003591346.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfCDSFeature cds8 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(24, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, 
GencodeGtfFeature.FeatureType.cds, + 50216972, 50217128, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 8, "ENSE00003591346.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfExonFeature exon9 = (GencodeGtfExonFeature) GencodeGtfFeature.create(25, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 50217205, 50217357, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 9, "ENSE00003728455.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfCDSFeature cds9 = (GencodeGtfCDSFeature) 
GencodeGtfFeature.create(26, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 50217205, 50217357, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 9, "ENSE00003728455.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfExonFeature exon10 = (GencodeGtfExonFeature) GencodeGtfFeature.create(27, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 50217361, 50217615, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 10, "ENSE00003739808.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + 
), + null + ); + + GencodeGtfCDSFeature cds10 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(28, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 50217361, 50217366, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 10, "ENSE00003739808.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfStopCodonFeature stop_codon1 = (GencodeGtfStopCodonFeature) GencodeGtfFeature.create(29, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.stop_codon, + 50217367, 50217369, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 10, "ENSE00003739808.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", 
GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfUTRFeature utr1 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(30, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.utr, + 50200979, 50201036, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 1, "ENSE00001541223.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfUTRFeature utr2 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(31, "chr22", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.utr, + 50217367, 50217615, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000611222.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-201", 10, "ENSE00003739808.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000483593.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.FIVE), + new GencodeGtfFeature.OptionalField<>("tag", 
GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_alternative_2), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfTranscriptFeature transcript2 = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(32, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.transcript, + 50200979, 50217616, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfSelenocysteineFeature selenocysteine1 = (GencodeGtfSelenocysteineFeature) GencodeGtfFeature.create(33, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.selenocysteine, + 50217358, 50217360, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon11 = (GencodeGtfExonFeature) GencodeGtfFeature.create(34, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50200979, 50201590, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 1, "ENSE00001541223.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", 
GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds11 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(35, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50201037, 50201590, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 1, "ENSE00001541223.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfStartCodonFeature start_codon2 = (GencodeGtfStartCodonFeature) GencodeGtfFeature.create(36, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.start_codon, + 50201037, 50201039, GencodeGtfFeature.GenomicStrand.FORWARD, 
GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 1, "ENSE00001541223.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon12 = (GencodeGtfExonFeature) GencodeGtfFeature.create(37, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50206317, 50206520, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 2, "ENSE00001129529.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", 
GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds12 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(38, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50206317, 50206520, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 2, "ENSE00001129529.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon13 = (GencodeGtfExonFeature) GencodeGtfFeature.create(39, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 
50208536, 50208716, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 3, "ENSE00001129524.2", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds13 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(40, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50208536, 50208716, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 3, "ENSE00001129524.2", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", 
GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon14 = (GencodeGtfExonFeature) GencodeGtfFeature.create(41, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50210181, 50210311, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 4, "ENSE00003473644.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds14 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(42, "chr22", 
GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50210181, 50210311, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 4, "ENSE00003473644.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon15 = (GencodeGtfExonFeature) GencodeGtfFeature.create(43, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50210631, 50210911, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 5, "ENSE00003503715.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", 
GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds15 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(44, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50210631, 50210911, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 5, "ENSE00003503715.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + 
GencodeGtfExonFeature exon16 = (GencodeGtfExonFeature) GencodeGtfFeature.create(45, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50215717, 50215867, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 6, "ENSE00003573348.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds16 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(46, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50215717, 50215867, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.TWO, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 6, "ENSE00003573348.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new 
GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon17 = (GencodeGtfExonFeature) GencodeGtfFeature.create(47, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50216691, 50216876, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 7, "ENSE00003510005.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new 
GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds17 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(48, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50216691, 50216876, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 7, "ENSE00003510005.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon18 = (GencodeGtfExonFeature) GencodeGtfFeature.create(49, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50216972, 50217128, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 8, "ENSE00003591346.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + 
Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds18 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(50, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50216972, 50217128, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 8, "ENSE00003591346.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new 
GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon19 = (GencodeGtfExonFeature) GencodeGtfFeature.create(51, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50217205, 50217616, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 9, "ENSE00003512975.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds19 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(52, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50217205, 50217366, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 9, 
"ENSE00003512975.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfStopCodonFeature stop_codon2 = (GencodeGtfStopCodonFeature) GencodeGtfFeature.create(53, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.stop_codon, + 50217367, 50217369, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 9, "ENSE00003512975.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", 
GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfUTRFeature utr3 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(54, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.utr, + 50200979, 50201036, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 1, "ENSE00001541223.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfUTRFeature utr4 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(55, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.utr, + 50217367, 50217616, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000380903.6", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "SELENOO-001", 9, "ENSE00003512975.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000370288.2"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal_2), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfTranscriptFeature transcript3 = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(56, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.transcript, + 50206442, 50217616, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, null, "SELENOO-002", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon20 = (GencodeGtfExonFeature) GencodeGtfFeature.create(57, 
"chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50206442, 50206520, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, null, "SELENOO-002", 1, "ENSE00001890724.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon21 = (GencodeGtfExonFeature) GencodeGtfFeature.create(58, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50208488, 50208716, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, null, "SELENOO-002", 2, "ENSE00001952603.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon22 = (GencodeGtfExonFeature) GencodeGtfFeature.create(59, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50210181, 50210311, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000492092.1", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, null, "SELENOO-002", 3, "ENSE00003583919.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon23 = (GencodeGtfExonFeature) GencodeGtfFeature.create(60, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50210631, 50210911, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, null, "SELENOO-002", 4, "ENSE00003620115.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon24 = (GencodeGtfExonFeature) GencodeGtfFeature.create(61, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50215717, 50215867, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, null, "SELENOO-002", 5, "ENSE00003636069.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new 
GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon25 = (GencodeGtfExonFeature) GencodeGtfFeature.create(62, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50216691, 50216876, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, null, "SELENOO-002", 6, "ENSE00003579717.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon26 = (GencodeGtfExonFeature) GencodeGtfFeature.create(63, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50216972, 50217128, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, null, "SELENOO-002", 7, "ENSE00003650938.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", 
"OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon27 = (GencodeGtfExonFeature) GencodeGtfFeature.create(64, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50217205, 50217616, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.13", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "SELENOO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, null, "SELENOO-002", 8, "ENSE00003475904.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + // ====================== + + // Now let's collapse these objects into their correct structure: + exon1.setCds(cds1); + exon1.setStartCodon(start_codon1); + + exon2.setCds(cds2); + exon3.setCds(cds3); + exon4.setCds(cds4); + exon5.setCds(cds5); + exon6.setCds(cds6); + exon7.setCds(cds7); + exon8.setCds(cds8); + exon9.setCds(cds9); + + exon10.setCds(cds10); + exon10.setStopCodon(stop_codon1); + + transcript1.addExon(exon1); + transcript1.addExon(exon2); + transcript1.addExon(exon3); + transcript1.addExon(exon4); + transcript1.addExon(exon5); + transcript1.addExon(exon6); + transcript1.addExon(exon7); + transcript1.addExon(exon8); + transcript1.addExon(exon9); + transcript1.addExon(exon10); + + transcript1.addUtr(utr1); + transcript1.addUtr(utr2); + + gene1.addTranscript(transcript1); + + // ====================== + + transcript2.addSelenocysteine(selenocysteine1); + + exon11.setCds(cds11); + exon11.setStartCodon(start_codon2); + + exon12.setCds(cds12); + exon13.setCds(cds13); + exon14.setCds(cds14); + exon15.setCds(cds15); + exon16.setCds(cds16); + exon17.setCds(cds17); 
+ exon18.setCds(cds18); + + exon19.setCds(cds19); + exon19.setStopCodon(stop_codon2); + + transcript2.addExon(exon11); + transcript2.addExon(exon12); + transcript2.addExon(exon13); + transcript2.addExon(exon14); + transcript2.addExon(exon15); + transcript2.addExon(exon16); + transcript2.addExon(exon17); + transcript2.addExon(exon18); + transcript2.addExon(exon19); + + transcript2.addUtr(utr3); + transcript2.addUtr(utr4); + + gene1.addTranscript(transcript2); + + // ====================== + + transcript3.addExon(exon20); + transcript3.addExon(exon21); + transcript3.addExon(exon22); + transcript3.addExon(exon23); + transcript3.addExon(exon24); + transcript3.addExon(exon25); + transcript3.addExon(exon26); + transcript3.addExon(exon27); + + gene1.addTranscript(transcript3); + + // ====================== + + return gene1; + } + + /** + * Creates a valid {@link GencodeGtfGeneFeature} that corresponds to the data in the file {@code gencode.valid_gencode_file2.gtf} + * @return a {@link GencodeGtfGeneFeature} representing the data in the file {@code gencode.valid_gencode_file2.gtf} + */ + private GencodeGtfGeneFeature createGencodeGtfGene_file3() { + + GencodeGtfGeneFeature gene1 = (GencodeGtfGeneFeature) GencodeGtfFeature.create(6, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.gene, + 138082, 161852, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", null, GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", null, null, null, -1, null, GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>(), null + ); + + + GencodeGtfTranscriptFeature transcript1 = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(7, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.transcript, + 138082, 161750, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", -1, null, GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon1 = (GencodeGtfExonFeature) GencodeGtfFeature.create(8, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 161689, 161750, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 1, "ENSE00003735197.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon2 = (GencodeGtfExonFeature) GencodeGtfFeature.create(9, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 156289, 156497, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 2, "ENSE00003737280.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new 
GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds1 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(10, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 156289, 156446, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 2, "ENSE00003737280.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfStartCodonFeature start_codon1 = (GencodeGtfStartCodonFeature) GencodeGtfFeature.create(11, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.start_codon, + 156444, 156446, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 2, "ENSE00003737280.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon3 = (GencodeGtfExonFeature) GencodeGtfFeature.create(12, "KI270734.1", 
GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 150987, 151021, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 3, "ENSE00003731891.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds2 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(13, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 150987, 151021, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 3, "ENSE00003731891.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon4 = (GencodeGtfExonFeature) GencodeGtfFeature.create(14, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 150350, 150499, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 4, "ENSE00003724613.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds3 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(15, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 150350, 150499, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.TWO, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 4, "ENSE00003724613.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon5 = (GencodeGtfExonFeature) GencodeGtfFeature.create(16, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 148414, 148478, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 5, "ENSE00003732418.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", 
GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds4 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(17, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 148414, 148478, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.TWO, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 5, "ENSE00003732418.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon6 = (GencodeGtfExonFeature) GencodeGtfFeature.create(18, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 148116, 148232, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 6, "ENSE00003733960.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds5 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(19, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 148116, 148232, 
GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 6, "ENSE00003733960.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon7 = (GencodeGtfExonFeature) GencodeGtfFeature.create(20, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 147624, 147703, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 7, "ENSE00003727207.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds6 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(21, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 147624, 147703, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 7, "ENSE00003727207.1", GencodeGtfFeature.LocusLevel.THREE, 
+ new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon8 = (GencodeGtfExonFeature) GencodeGtfFeature.create(22, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 146640, 146721, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 8, "ENSE00003728972.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds7 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(23, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 146640, 146721, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 8, "ENSE00003728972.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature 
exon9 = (GencodeGtfExonFeature) GencodeGtfFeature.create(24, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 145004, 145096, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 9, "ENSE00003733844.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds8 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(25, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 145004, 145096, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 9, "ENSE00003733844.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon10 = (GencodeGtfExonFeature) GencodeGtfFeature.create(26, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 144749, 144895, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 10, "ENSE00003752738.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds9 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(27, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 144749, 144895, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 10, "ENSE00003752738.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon11 = (GencodeGtfExonFeature) GencodeGtfFeature.create(28, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 143614, 143789, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 11, "ENSE00003720006.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + 
new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds10 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(29, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 143614, 143789, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 11, "ENSE00003720006.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon12 = (GencodeGtfExonFeature) GencodeGtfFeature.create(30, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 142194, 142292, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 12, "ENSE00003719283.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds11 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(31, "KI270734.1", 
GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 142194, 142292, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 12, "ENSE00003719283.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon13 = (GencodeGtfExonFeature) GencodeGtfFeature.create(32, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 138743, 138831, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 13, "ENSE00003751415.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds12 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(33, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 138743, 138831, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 13, "ENSE00003751415.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon14 = (GencodeGtfExonFeature) GencodeGtfFeature.create(34, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 138082, 138667, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 14, "ENSE00003753010.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds13 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(35, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 138483, 138667, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.TWO, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 14, "ENSE00003753010.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", 
GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfStopCodonFeature stop_codon1 = (GencodeGtfStopCodonFeature) GencodeGtfFeature.create(36, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.stop_codon, + 138480, 138482, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 14, "ENSE00003753010.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfUTRFeature utr1 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(37, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.utr, + 161689, 161750, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 1, "ENSE00003735197.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfUTRFeature utr2 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(38, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.utr, + 
156447, 156497, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 2, "ENSE00003737280.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfUTRFeature utr3 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(39, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.utr, + 138082, 138482, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000615165.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-202", 14, "ENSE00003753010.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000482462.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfTranscriptFeature transcript2 = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(40, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.transcript, + 138082, 161852, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", -1, null, 
GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon15 = (GencodeGtfExonFeature) GencodeGtfFeature.create(41, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 161689, 161852, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 1, "ENSE00003746084.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon16 = (GencodeGtfExonFeature) GencodeGtfFeature.create(42, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 161314, 161626, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 2, "ENSE00003719550.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), 
+ null + ); + + GencodeGtfCDSFeature cds14 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(43, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 161314, 161586, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 2, "ENSE00003719550.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfStartCodonFeature start_codon2 = (GencodeGtfStartCodonFeature) GencodeGtfFeature.create(44, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.start_codon, + 161584, 161586, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 2, "ENSE00003719550.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon17 = (GencodeGtfExonFeature) GencodeGtfFeature.create(45, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 156289, 156497, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, 
"ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 3, "ENSE00003723757.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds15 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(46, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 156289, 156497, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 3, "ENSE00003723757.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon18 = (GencodeGtfExonFeature) GencodeGtfFeature.create(47, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 150987, 151021, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 4, "ENSE00003731891.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new 
GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds16 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(48, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 150987, 151021, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 4, "ENSE00003731891.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon19 = (GencodeGtfExonFeature) GencodeGtfFeature.create(49, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 150350, 150499, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 5, "ENSE00003724613.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds17 = (GencodeGtfCDSFeature) 
GencodeGtfFeature.create(50, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 150350, 150499, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.TWO, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 5, "ENSE00003724613.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon20 = (GencodeGtfExonFeature) GencodeGtfFeature.create(51, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 148414, 148478, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 6, "ENSE00003732418.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds18 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(52, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 148414, 148478, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.TWO, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, 
"AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 6, "ENSE00003732418.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon21 = (GencodeGtfExonFeature) GencodeGtfFeature.create(53, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 148116, 148232, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 7, "ENSE00003733960.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds19 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(54, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 148116, 148232, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 7, "ENSE00003733960.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", 
GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon22 = (GencodeGtfExonFeature) GencodeGtfFeature.create(55, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 147624, 147703, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 8, "ENSE00003727207.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds20 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(56, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 147624, 147703, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 8, "ENSE00003727207.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon23 = (GencodeGtfExonFeature) GencodeGtfFeature.create(57, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 146640, 146721, 
GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 9, "ENSE00003728972.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds21 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(58, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 146640, 146721, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 9, "ENSE00003728972.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon24 = (GencodeGtfExonFeature) GencodeGtfFeature.create(59, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 145004, 145096, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 10, "ENSE00003733844.1", 
GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds22 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(60, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 145004, 145096, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 10, "ENSE00003733844.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon25 = (GencodeGtfExonFeature) GencodeGtfFeature.create(61, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 144749, 144895, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 11, "ENSE00003752738.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), 
+ null + ); + + GencodeGtfCDSFeature cds23 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(62, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 144749, 144895, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 11, "ENSE00003752738.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon26 = (GencodeGtfExonFeature) GencodeGtfFeature.create(63, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 143614, 143789, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 12, "ENSE00003720006.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds24 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(64, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 143614, 143789, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000621424.4", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 12, "ENSE00003720006.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon27 = (GencodeGtfExonFeature) GencodeGtfFeature.create(65, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 142194, 142292, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 13, "ENSE00003719283.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds25 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(66, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 142194, 142292, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 13, "ENSE00003719283.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + 
new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon28 = (GencodeGtfExonFeature) GencodeGtfFeature.create(67, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 138743, 138831, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 14, "ENSE00003751415.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds26 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(68, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 138743, 138831, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 14, "ENSE00003751415.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon29 = (GencodeGtfExonFeature) GencodeGtfFeature.create(69, "KI270734.1", 
GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 138082, 138667, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 15, "ENSE00003753010.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds27 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(70, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 138483, 138667, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.TWO, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 15, "ENSE00003753010.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfStopCodonFeature stop_codon2 = (GencodeGtfStopCodonFeature) GencodeGtfFeature.create(71, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.stop_codon, + 138480, 138482, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 15, "ENSE00003753010.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfUTRFeature utr4 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(72, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.utr, + 161689, 161852, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 1, "ENSE00003746084.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfUTRFeature utr5 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(73, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.utr, + 161587, 161626, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 2, "ENSE00003719550.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", 
GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfUTRFeature utr6 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(74, "KI270734.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.utr, + 138082, 138482, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000277196.4", "ENST00000621424.4", GencodeGtfFeature.GeneTranscriptType.protein_coding, + null, "AC007325.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, null, "AC007325.2-201", 15, "ENSE00003753010.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("protein_id", "ENSP00000481127.1"), + new GencodeGtfFeature.OptionalField<>("transcript_support_level", GencodeGtfFeature.TranscriptSupportLevel.ONE), + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + // ====================== + // Now let's collapse these objects into their correct structure: + + exon2.setCds(cds1); + exon2.setStartCodon(start_codon1); + + exon3.setCds(cds2); + exon4.setCds(cds3); + exon5.setCds(cds4); + exon6.setCds(cds5); + exon7.setCds(cds6); + exon8.setCds(cds7); + exon9.setCds(cds8); + exon10.setCds(cds9); + exon11.setCds(cds10); + exon12.setCds(cds11); + exon13.setCds(cds12); + + exon14.setCds(cds13); + exon14.setStopCodon(stop_codon1); + + transcript1.addExon(exon1); + transcript1.addExon(exon2); + transcript1.addExon(exon3); + transcript1.addExon(exon4); + transcript1.addExon(exon5); + transcript1.addExon(exon6); + transcript1.addExon(exon7); + transcript1.addExon(exon8); + transcript1.addExon(exon9); + transcript1.addExon(exon10); + transcript1.addExon(exon11); + transcript1.addExon(exon12); + transcript1.addExon(exon13); + transcript1.addExon(exon14); + + transcript1.addUtr(utr1); + transcript1.addUtr(utr2); + transcript1.addUtr(utr3); + + gene1.addTranscript(transcript1); + + // 
====================== + + exon16.setCds(cds14); + exon16.setStartCodon(start_codon2); + + exon17.setCds(cds15); + exon18.setCds(cds16); + exon19.setCds(cds17); + exon20.setCds(cds18); + exon21.setCds(cds19); + exon22.setCds(cds20); + exon23.setCds(cds21); + exon24.setCds(cds22); + exon25.setCds(cds23); + exon26.setCds(cds24); + exon27.setCds(cds25); + exon28.setCds(cds26); + + exon29.setCds(cds27); + exon29.setStopCodon(stop_codon2); + + transcript2.addExon(exon15); + transcript2.addExon(exon16); + transcript2.addExon(exon17); + transcript2.addExon(exon18); + transcript2.addExon(exon19); + transcript2.addExon(exon20); + transcript2.addExon(exon21); + transcript2.addExon(exon22); + transcript2.addExon(exon23); + transcript2.addExon(exon24); + transcript2.addExon(exon25); + transcript2.addExon(exon26); + transcript2.addExon(exon27); + transcript2.addExon(exon28); + transcript2.addExon(exon29); + + transcript2.addUtr(utr4); + transcript2.addUtr(utr5); + transcript2.addUtr(utr6); + + gene1.addTranscript(transcript2); + + // ====================== + + return gene1; + } + + /** Builds the expected gene hierarchy for the v19 "valid1" fixture: one HAVANA gene on chr1 (11869-14412, name "DDX11L1", id "ENSG00000223972.4") holding one transcript (11869-14409, id "ENST00000456328.2"), which in turn holds one exon (11869-12227, id "ENSE00002234944.1"). Each feature is built with GencodeGtfFeature.create and cast to its concrete type; the exon is then added to the transcript and the transcript to the gene. NOTE(review): the leading 6/7/8 arguments look like the features' order / line numbers in the test GTF, and the "-1, null" pair passed for the gene and transcript looks like an unset exon number / exon ID -- confirm against GencodeGtfFeature.create. @return the gene feature with its transcript and exon attached. */ private GencodeGtfGeneFeature createGencodeGtfGene_v19_valid1() { + // Create the Features as they exist in the test file: + GencodeGtfGeneFeature gene = (GencodeGtfGeneFeature)GencodeGtfFeature.create(6, "chr1", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.gene, + 11869, 14412, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000223972.4", "ENSG00000223972.4", + GencodeGtfFeature.GeneTranscriptType.pseudogene, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "DDX11L1", GencodeGtfFeature.GeneTranscriptType.pseudogene, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "DDX11L1", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000000961.2") + ) + ), + null + ); + + GencodeGtfTranscriptFeature transcript = (GencodeGtfTranscriptFeature) 
GencodeGtfFeature.create(7, "chr1", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.transcript, + 11869, 14409, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000223972.4", "ENST00000456328.2", + GencodeGtfFeature.GeneTranscriptType.pseudogene, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "DDX11L1", GencodeGtfFeature.GeneTranscriptType.processed_transcript, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "DDX11L1-002", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", "basic"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000000961.2"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000362751.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon = (GencodeGtfExonFeature) GencodeGtfFeature.create(8, "chr1", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 11869, 12227, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000223972.4", "ENST00000456328.2", + GencodeGtfFeature.GeneTranscriptType.pseudogene, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "DDX11L1", GencodeGtfFeature.GeneTranscriptType.processed_transcript, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "DDX11L1-002", 1, "ENSE00002234944.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", "basic"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000000961.2"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000362751.1") + ) + ), + null + ); + + // Aggregate the Features as they should be: + transcript.addExon(exon); + gene.addTranscript(transcript); + + return gene; + } + + private ArrayList createGencodeGtfGene_v19_file2() { + + GencodeGtfGeneFeature gene1 = (GencodeGtfGeneFeature) GencodeGtfFeature.create(6, "chr22", 
GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.gene, + 50637519, 50638976, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000273253.1", "ENSG00000273253.1", GencodeGtfFeature.GeneTranscriptType.antisense, + GencodeGtfFeature.GeneTranscriptStatus.NOVEL, "RP3-402G11.26", GencodeGtfFeature.GeneTranscriptType.antisense, GencodeGtfFeature.GeneTranscriptStatus.NOVEL, "RP3-402G11.26", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000186123.2") + ) + ), + null + ); + + GencodeGtfTranscriptFeature transcript1 = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(7, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.transcript, + 50637519, 50638976, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000273253.1", "ENST00000608025.1", GencodeGtfFeature.GeneTranscriptType.antisense, + GencodeGtfFeature.GeneTranscriptStatus.NOVEL, "RP3-402G11.26", GencodeGtfFeature.GeneTranscriptType.antisense, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "RP3-402G11.26-001", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000186123.2"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000472292.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon1 = (GencodeGtfExonFeature) GencodeGtfFeature.create(8, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50638505, 50638976, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000273253.1", "ENST00000608025.1", GencodeGtfFeature.GeneTranscriptType.antisense, + GencodeGtfFeature.GeneTranscriptStatus.NOVEL, "RP3-402G11.26", 
GencodeGtfFeature.GeneTranscriptType.antisense, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "RP3-402G11.26-001", 1, "ENSE00003710600.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000186123.2"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000472292.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon2 = (GencodeGtfExonFeature) GencodeGtfFeature.create(9, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50637519, 50637757, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000273253.1", "ENST00000608025.1", GencodeGtfFeature.GeneTranscriptType.antisense, + GencodeGtfFeature.GeneTranscriptStatus.NOVEL, "RP3-402G11.26", GencodeGtfFeature.GeneTranscriptType.antisense, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "RP3-402G11.26-001", 2, "ENSE00003710731.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000186123.2"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000472292.2") + ) + ), + null + ); + + GencodeGtfGeneFeature gene2 = (GencodeGtfGeneFeature) GencodeGtfFeature.create(10, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.gene, + 50639408, 50656045, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENSG00000073169.9", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + 
Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3") + ) + ), + null + ); + + GencodeGtfTranscriptFeature transcript2 = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(11, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.transcript, + 50639408, 50656045, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfSelenocysteineFeature selenocysteine1 = (GencodeGtfSelenocysteineFeature) GencodeGtfFeature.create(12, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.selenocysteine, + 50655787, 50655789, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new 
ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon3 = (GencodeGtfExonFeature) GencodeGtfFeature.create(13, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50639408, 50640019, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 1, "ENSE00001541223.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds1 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(14, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, 
GencodeGtfFeature.FeatureType.cds, + 50639466, 50640019, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 1, "ENSE00001541223.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfStartCodonFeature start_codon1 = (GencodeGtfStartCodonFeature) GencodeGtfFeature.create(15, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.start_codon, + 50639466, 50639468, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 1, "ENSE00001541223.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", 
GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon4 = (GencodeGtfExonFeature) GencodeGtfFeature.create(16, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50644746, 50644949, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 2, "ENSE00001129529.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds2 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(17, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50644746, 50644949, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + 
GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 2, "ENSE00001129529.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon5 = (GencodeGtfExonFeature) GencodeGtfFeature.create(18, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50646965, 50647145, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 3, "ENSE00001129524.2", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new 
GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds3 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(19, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50646965, 50647145, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 3, "ENSE00001129524.2", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon6 = (GencodeGtfExonFeature) GencodeGtfFeature.create(20, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50648610, 50648740, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 4, "ENSE00003473644.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new 
GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds4 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(21, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50648610, 50648740, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 4, "ENSE00003473644.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon7 = (GencodeGtfExonFeature) GencodeGtfFeature.create(22, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 
50649060, 50649340, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 5, "ENSE00003503715.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds5 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(23, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50649060, 50649340, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 5, "ENSE00003503715.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", 
GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon8 = (GencodeGtfExonFeature) GencodeGtfFeature.create(24, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50654146, 50654296, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 6, "ENSE00003573348.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds6 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(25, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50654146, 50654296, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.TWO, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, 
GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 6, "ENSE00003573348.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon9 = (GencodeGtfExonFeature) GencodeGtfFeature.create(26, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50655120, 50655305, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 7, "ENSE00003510005.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + 
GencodeGtfCDSFeature cds7 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(27, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50655120, 50655305, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 7, "ENSE00003510005.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon10 = (GencodeGtfExonFeature) GencodeGtfFeature.create(28, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50655401, 50655557, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 8, "ENSE00003591346.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", 
GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds8 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(29, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50655401, 50655557, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 8, "ENSE00003591346.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfExonFeature exon11 = (GencodeGtfExonFeature) GencodeGtfFeature.create(30, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50655634, 50656045, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", 
"ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 9, "ENSE00003512975.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfCDSFeature cds9 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(31, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.cds, + 50655634, 50655795, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 9, "ENSE00003512975.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new 
GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfStopCodonFeature stop_codon1 = (GencodeGtfStopCodonFeature) GencodeGtfFeature.create(32, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.stop_codon, + 50655796, 50655798, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", 9, "ENSE00003512975.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfUTRFeature utr1 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(33, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.utr, + 50639408, 50639465, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", -1, null, 
GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfUTRFeature utr2 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(34, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.utr, + 50655796, 50656045, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000380903.2", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-001", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.appris_principal), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.CCDS), + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.seleno), + new GencodeGtfFeature.OptionalField<>("ccdsid", "CCDS43034.1"), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000075003.2") + ) + ), + null + ); + + GencodeGtfTranscriptFeature transcript3 = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(35, "chr22", 
GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.transcript, + 50644871, 50656045, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-002", -1, null, GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon12 = (GencodeGtfExonFeature) GencodeGtfFeature.create(36, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50644871, 50644949, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-002", 1, "ENSE00001890724.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon13 = (GencodeGtfExonFeature) GencodeGtfFeature.create(37, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50646917, 50647145, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, 
"ENSG00000073169.9", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-002", 2, "ENSE00001952603.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon14 = (GencodeGtfExonFeature) GencodeGtfFeature.create(38, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50648610, 50648740, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-002", 3, "ENSE00003583919.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon15 = (GencodeGtfExonFeature) GencodeGtfFeature.create(39, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50649060, 50649340, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", 
GencodeGtfFeature.GeneTranscriptType.processed_transcript, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-002", 4, "ENSE00003620115.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon16 = (GencodeGtfExonFeature) GencodeGtfFeature.create(40, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50654146, 50654296, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-002", 5, "ENSE00003636069.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon17 = (GencodeGtfExonFeature) GencodeGtfFeature.create(41, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50655120, 50655305, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-002", 6, "ENSE00003579717.1", GencodeGtfFeature.LocusLevel.TWO, + new 
ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon18 = (GencodeGtfExonFeature) GencodeGtfFeature.create(42, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50655401, 50655557, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-002", 7, "ENSE00003650938.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", "OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + GencodeGtfExonFeature exon19 = (GencodeGtfExonFeature) GencodeGtfFeature.create(43, "chr22", GencodeGtfFeature.AnnotationSource.HAVANA, GencodeGtfFeature.FeatureType.exon, + 50655634, 50656045, GencodeGtfFeature.GenomicStrand.FORWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000073169.9", "ENST00000492092.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO", GencodeGtfFeature.GeneTranscriptType.processed_transcript, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "SELO-002", 8, "ENSE00003475904.1", GencodeGtfFeature.LocusLevel.TWO, + new ArrayList<>( + Arrays.asList( + new GencodeGtfFeature.OptionalField<>("tag", GencodeGtfFeature.FeatureTag.basic), + new GencodeGtfFeature.OptionalField<>("havana_gene", 
"OTTHUMG00000044645.3"), + new GencodeGtfFeature.OptionalField<>("havana_transcript", "OTTHUMT00000316993.1") + ) + ), + null + ); + + // ====================== + // Now let's collapse these objects into their correct structure: + + transcript1.addExon(exon1); + transcript1.addExon(exon2); + gene1.addTranscript(transcript1); + + // ====================== + // ====================== + + transcript2.addSelenocysteine(selenocysteine1); + + exon3.setCds(cds1); + exon3.setStartCodon(start_codon1); + + exon4.setCds(cds2); + exon5.setCds(cds3); + exon6.setCds(cds4); + exon7.setCds(cds5); + exon8.setCds(cds6); + exon9.setCds(cds7); + exon10.setCds(cds8); + + exon11.setCds(cds9); + exon11.setStopCodon(stop_codon1); + + transcript2.addExon(exon3); + transcript2.addExon(exon4); + transcript2.addExon(exon5); + transcript2.addExon(exon6); + transcript2.addExon(exon7); + transcript2.addExon(exon8); + transcript2.addExon(exon9); + transcript2.addExon(exon10); + transcript2.addExon(exon11); + + transcript2.addUtr(utr1); + transcript2.addUtr(utr2); + + // ====================== + + transcript3.addExon(exon12); + transcript3.addExon(exon13); + transcript3.addExon(exon14); + transcript3.addExon(exon15); + transcript3.addExon(exon16); + transcript3.addExon(exon17); + transcript3.addExon(exon18); + transcript3.addExon(exon19); + + // ====================== + + gene2.addTranscript(transcript2); + gene2.addTranscript(transcript3); + + return new ArrayList<>( Arrays.asList(gene1, gene2) ); + } + + private GencodeGtfGeneFeature createGencodeGtfGene_v19_theOtherFile() { + + GencodeGtfGeneFeature gene1 = (GencodeGtfGeneFeature) GencodeGtfFeature.create(6, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.gene, + 38792, 97421, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000215615.1", "ENSG00000215615.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, 
"AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", -1, null, GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>(), null + ); + + + GencodeGtfTranscriptFeature transcript1 = (GencodeGtfTranscriptFeature) GencodeGtfFeature.create(7, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.transcript, + 38792, 97421, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", -1, null, GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon1 = (GencodeGtfExonFeature) GencodeGtfFeature.create(8, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 97368, 97421, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", 1, "ENSE00001544212.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon2 = (GencodeGtfExonFeature) GencodeGtfFeature.create(9, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 95174, 95232, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000215615.1", "ENST00000400681.1", 
GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", 2, "ENSE00001849396.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds1 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(10, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 95174, 95230, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", 2, "ENSE00001849396.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfStartCodonFeature start_codon1 = (GencodeGtfStartCodonFeature) GencodeGtfFeature.create(11, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.start_codon, + 95228, 95230, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", 2, "ENSE00001849396.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon3 = (GencodeGtfExonFeature) GencodeGtfFeature.create(12, 
"GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 40642, 40872, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", 3, "ENSE00001900862.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds2 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(13, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 40642, 40872, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", 3, "ENSE00001900862.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon4 = (GencodeGtfExonFeature) GencodeGtfFeature.create(14, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 39874, 40028, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", 4, "ENSE00001544206.1", GencodeGtfFeature.LocusLevel.THREE, + new 
ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds3 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(15, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 39874, 40028, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ZERO, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", 4, "ENSE00001544206.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfExonFeature exon5 = (GencodeGtfExonFeature) GencodeGtfFeature.create(16, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.exon, + 38792, 39019, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", 5, "ENSE00001544203.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfCDSFeature cds4 = (GencodeGtfCDSFeature) GencodeGtfFeature.create(17, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.cds, + 38794, 39019, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.ONE, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, 
"AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", 5, "ENSE00001544203.1", GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfUTRFeature utr1 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(18, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.utr, + 97368, 97421, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", -1, null, GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfUTRFeature utr2 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(19, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.utr, + 95231, 95232, GencodeGtfFeature.GenomicStrand.BACKWARD, GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", -1, null, GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + GencodeGtfUTRFeature utr3 = (GencodeGtfUTRFeature) GencodeGtfFeature.create(20, "GL000218.1", GencodeGtfFeature.AnnotationSource.ENSEMBL, GencodeGtfFeature.FeatureType.utr, + 38792, 38793, GencodeGtfFeature.GenomicStrand.BACKWARD, 
GencodeGtfFeature.GenomicPhase.DOT, "ENSG00000215615.1", "ENST00000400681.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, + GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1", GencodeGtfFeature.GeneTranscriptType.protein_coding, GencodeGtfFeature.GeneTranscriptStatus.KNOWN, "AL354822.1-201", -1, null, GencodeGtfFeature.LocusLevel.THREE, + new ArrayList<>( + Collections.singletonList( + new GencodeGtfFeature.OptionalField<>("tag", "basic") + ) + ), + null + ); + + // ====================== + // Now let's collapse these objects into their correct structure: + + exon2.setCds(cds1); + exon2.setStartCodon(start_codon1); + + exon3.setCds(cds2); + exon4.setCds(cds3); + exon5.setCds(cds4); + + transcript1.addExon(exon1); + transcript1.addExon(exon2); + transcript1.addExon(exon3); + transcript1.addExon(exon4); + transcript1.addExon(exon5); + + transcript1.addUtr(utr1); + transcript1.addUtr(utr2); + transcript1.addUtr(utr3); + + gene1.addTranscript(transcript1); + + return gene1; + } + + /** + * Helper method to create data for the {@link DataProvider} {@link #decodeTestProvider()}. + * @return An {@link Object} array of size 2 containing a file name and the object-representation of that file's data. + */ + private Object[] createTestData_valid1() { + return new Object[] { + "gencode.valid1.gtf", new ArrayList<>( Collections.singletonList( createGencodeGtfGene_valid1()) ) + }; + } + + /** + * Helper method to create data for the {@link DataProvider} {@link #decodeTestProvider()}. + * @return An {@link Object} array of size 2 containing a file name and the object-representation of that file's data.
+ */ + private Object[] createTestData_gencode_file2() { + + return new Object[] { + "gencode.valid_gencode_file2.gtf", + new ArrayList<>( Collections.singletonList( createGencodeGtfGene_file2()) ) + }; + } + + /** + * Helper method to create data for the {@link DataProvider} {@link #decodeTestProvider()}. + * @return An {@link Object} array of size 2: the test file name and a single-element list holding the gene built by {@code createGencodeGtfGene_file3()}. + */ + private Object[] createTestData_theOtherfile() { + return new Object[] { + "gencode.and.this.is.a.valid.one.too.table.gtf", + new ArrayList<>( Collections.singletonList( createGencodeGtfGene_file3()) ) + }; + } + + /** + * Helper method to create data for the {@link DataProvider} {@link #decodeTestProvider()}. + * @return An {@link Object} array of size 2: the test file name and a single-element list holding the gene built by {@code createGencodeGtfGene_v19_valid1()}. + */ + private Object[] createTestData_v19_valid1() { + return new Object[] { + "gencode.v19.valid1.gtf", + new ArrayList<>( Collections.singletonList( createGencodeGtfGene_v19_valid1() ) ) + }; + } + + /** + * Helper method to create data for the {@link DataProvider} {@link #decodeTestProvider()}. + * @return An {@link Object} array of size 2: the test file name and the value returned by {@code createGencodeGtfGene_v19_file2()}, passed through without extra wrapping. + */ + private Object[] createTestData_v19_file2() { + return new Object[] { + "gencode.v19.valid_gencode_file2.gtf", + createGencodeGtfGene_v19_file2() + }; + } + + /** + * Helper method to create data for the {@link DataProvider} {@link #decodeTestProvider()}. + * @return An {@link Object} array of size 2: the test file name and a single-element list holding the gene built by {@code createGencodeGtfGene_v19_theOtherFile()}. + */ + private Object[] createTestData_v19_theOtherFile() { + return new Object[] { + "gencode.v19.and.this.is.a.valid.one.too.gtf", + new ArrayList<>( Collections.singletonList( createGencodeGtfGene_v19_theOtherFile() ) ) + }; + } + + // ============================================================================================================ + // ============================================================================================================ + // ============================================================================================================ + + /** + * Rows for the tests declared with {@code dataProvider = "testIndexingProvider"}. + * Each row holds, in order: file, contig, start, end, expected number of genes. + */ + @DataProvider + private Object[][] testIndexingProvider() { + + return new Object[][] { + + // HG38 + { xyzTestFile, "chrX", 105958620, 106070470, 1, }, // gene fits in region + { xyzTestFile, "chrX", 106033442, 106034738, 1, }, // region fits in gene + { xyzTestFile, "chrX", 106033442, 106070470, 1, }, // region start in gene + { xyzTestFile, "chrX", 105958620, 106034738, 1, }, // region end in gene + { xyzTestFile, "chrX", 1, 200000000, 2366, }, // Many genes in region + { xyzTestFile, "chrX", 10000, 200000, 0, }, // no genes in region + + // HG19 + { gencodeHg19TestFile, "chr1", 32198, 41014, 1, }, // gene fits in region + {
gencodeHg19TestFile, "chr1", 35001, 35500, 1, }, // region fits in gene + { gencodeHg19TestFile, "chr1", 35500, 37000, 1, }, // region start in gene + { gencodeHg19TestFile, "chr1", 32198, 35500, 1, }, // region end in gene + { gencodeHg19TestFile, "chr1", 1, 200000000, 1982, }, // Many genes in region + { gencodeHg19TestFile, "chr1", 33001, 34091, 0, }, // no genes in region + + }; + } + + /** + * Single case for the test declared with {@code dataProvider = "toStringTestProvider"}: + * the gene built by {@code createGencodeGtfGene_valid1()} paired with a hand-written expected string + * (gene, transcript and exon records joined by newlines, with no trailing newline). + */ + @DataProvider + private Object[][] toStringTestProvider() { + + // Hand-done results: + GencodeGtfGeneFeature gene = createGencodeGtfGene_valid1(); + + String expected = "chr1\tENSEMBL\tgene\t30366\t30503\t.\t+\t.\tgene_id \"ENSG00000284332.1\"; gene_type \"miRNA\"; gene_name \"MIR1302-2\"; level 3;\n" + + "chr1\tENSEMBL\ttranscript\t30366\t30503\t.\t+\t.\tgene_id \"ENSG00000284332.1\"; transcript_id \"ENST00000607096.1\"; gene_type \"miRNA\"; gene_name \"MIR1302-2\"; transcript_type \"miRNA\"; transcript_name \"MIR1302-2-201\"; level 3; transcript_support_level \"NA\"; tag \"basic\";\n" + + "chr1\tENSEMBL\texon\t30366\t30503\t.\t+\t.\tgene_id \"ENSG00000284332.1\"; transcript_id \"ENST00000607096.1\"; gene_type \"miRNA\"; gene_name \"MIR1302-2\"; transcript_type \"miRNA\"; transcript_name \"MIR1302-2-201\"; exon_number 1; exon_id \"ENSE00003695741.1\"; level 3; transcript_support_level \"NA\"; tag \"basic\";"; + + return new Object[][] { + {gene, expected}, + }; + } + + /** + * Cases for the test declared with {@code dataProvider = "nameProvider"}: + * a file name and the boolean result expected for it; the trailing comment on each row gives the reason. + */ + @DataProvider + public Object[][] nameProvider() { + + return new Object[][] { + { "a.tsv" , false }, // Wrong File name / extension + { "a.table.gz", false }, // Wrong File name / extension + { "a.bed" , false }, // Wrong File name / extension + { "a.bcf" , false }, // Wrong File name / extension + { "a.hapmap" , false }, // Wrong File name / extension + { "a.refseq" , false }, // Wrong File name / extension + { "a.beagle" , false }, // Wrong File name / extension + { "a.table" , false }, // Wrong File name / extension + + { "gencode.v26.annotation.gtf.tsv", false}, // Wrong File name / extension + {
"gencode.v26.annotation.tgz" , false}, // Wrong File name / extension + { "gencode.v26.annotation.tar.gz" , false}, // Wrong File name / extension + + { "gencode.gtf" , false}, // File does not exist + { "gencode.v26.primary_assembly.annotation.gtf", false}, // File does not exist + { "gencode.v26.long_noncoding_RNAs.gtf" , false}, // File does not exist + + { "gencode.invalid_short_header.gtf" , false}, // File exists, has invalid header + { "gencode.invalid_malformed_header.gtf" , false}, // File exists, has invalid header + { "gencode.invalid_malformed_header_desc.gtf" , false}, // File exists, has invalid header + { "gencode.invalid_malformed_header_prov.gtf" , false}, // File exists, has invalid header + { "gencode.invalid_malformed_header_cont.gtf" , false}, // File exists, has invalid header + { "gencode.invalid_malformed_header_form.gtf" , false}, // File exists, has invalid header + { "gencode.invalid_malformed_header_date.gtf" , false}, // File exists, has invalid header + + { "gencode.valid1.gtf" , true}, // Valid file + { "gencode.valid_gencode_file2.gtf" , true}, // Valid file + { "gencode.and.this.is.a.valid.one.too.table.gtf", true}, // Valid file + }; + } + + @DataProvider + public Object[][] headerProvider() { + return new Object[][] { + + { new String[] {}, false }, // Wrong length header + { new String[] { "", + "", + "", + "", + "" }, + false }, // Bad content + { new String[] { "##descr", + "##provider: GENCODE", + "##contact: gencode-help@sanger.ac.uk", + "##format: gtf", + "##date: 2017-04-08" }, + false }, // Bad header - description + { new String[] { "##description: THIS IS A SAMPLE", + "##provider: GARBAGEDAY", + "##contact: gencode-help@sanger.ac.uk", + "##format: gtf", + "##date: 2017-04-08" }, + false }, // Bad header - provider + { new String[] { "##description: THIS IS A SAMPLE", + "##provider: GENCODE", + "##contact: gencode@NORTHPOLE.pl", + "##format: gtf", + "##date: 2017-04-08" }, + false }, // Bad header - contact + { new 
String[] { "##description: THIS IS A SAMPLE", + "##provider: GENCODE", + "##contact: SANTACLAUSE@sanger.ac.uk", + "##format: gtf", + "##date: 2017-04-08" }, + false }, // Bad header - contact + { new String[] { "##description: THIS IS A SAMPLE", + "##provider: GENCODE", + "##contact: gencode-help@sanger.ac.uk", + "##format: dumpy", + "##date: 2017-04-08" }, + false }, // Bad header - format + { new String[] { "##description: THIS IS A SAMPLE", + "##provider: GENCODE", + "##contact: gencode-help@sanger.ac.uk", + "##format: gtf", + "##doom: ID Software" }, + false }, // Bad header - date + { new String[] { "##description: evidence-based annotation of the human genome (GRCh37), version 19 (Ensembl 74)", + "##provider: GENCODE", + "##contact: gencode@sanger.ac.uk", + "##format: gtf", + "##date: 2014-07-25" }, + true }, // Good Header! + { new String[] { "##description: evidence-based annotation of the human genome (GRCh38), version 26 (Ensembl 88)", + "##provider: GENCODE", + "##contact: gencode-help@sanger.ac.uk", + "##format: gtf", + "##date: 2014-07-25" }, + true }, // Good Header! 
+ + }; + } + + /** + * Cases for {@link #testDecode}: one row per test file, each produced by one of the {@code createTestData_*} helpers. + */ + @DataProvider + public Object[][] decodeTestProvider() { + + return new Object[][] { + createTestData_valid1(), + createTestData_gencode_file2(), + createTestData_theOtherfile(), + createTestData_v19_valid1(), + createTestData_v19_file2(), + createTestData_v19_theOtherFile() + }; + } + + // ============================================================================================================= + + /** + * Checks that {@code canDecode} returns {@code expected} for the path {@code testResourceDir + name}. + */ + @Test(dataProvider = "nameProvider") + public void testCanDecode(final String name, final boolean expected) { + GencodeGtfCodec gencodeGtfCodec = new GencodeGtfCodec(); + Assert.assertEquals(gencodeGtfCodec.canDecode(testResourceDir + name), expected, name); + } + + /** + * Checks that {@code GencodeGtfCodec.validateHeader} returns {@code expected} for the given header lines. + */ + @Test(dataProvider = "headerProvider") + public void testValidateHeader( final String[] header, final boolean expected ) { + Assert.assertEquals( GencodeGtfCodec.validateHeader(header), expected ); + } + + /** + * Reads {@code testResourceDir + filePath} through a {@link GencodeGtfCodec} (header first, then one {@code decode} call per loop pass) + * and compares each decoded feature with the next element of {@code expected}. + * NOTE(review): the loop stops when the line iterator is exhausted and nothing afterwards checks that + * {@code expectedIterator} is also exhausted, so surplus expected features would go unnoticed - confirm whether that is intended. + */ + @Test(dataProvider = "decodeTestProvider") + public void testDecode( final String filePath, final List expected) throws IOException { + GencodeGtfCodec gencodeGtfCodec = new GencodeGtfCodec(); + + try (BufferedInputStream bufferedInputStream = + new BufferedInputStream( + new FileInputStream(testResourceDir + filePath) + ) + ) { + // Get the line iterator: + LineIterator lineIterator = gencodeGtfCodec.makeSourceFromStream(bufferedInputStream); + + // Get the header (required for the read to work correctly): + gencodeGtfCodec.readHeader(lineIterator); + + // Set up our expected data iterator: + Iterator expectedIterator = expected.iterator(); + + // Now read our features and make sure they're what we expect: + while ( lineIterator.hasNext() ) { + GencodeGtfFeature feature = gencodeGtfCodec.decode(lineIterator); + Assert.assertEquals(feature, expectedIterator.next()); + } + } + } + + /** + * Checks that {@code feature.toString()} equals {@code expected}. + */ + @Test(dataProvider = "toStringTestProvider") + public void testToString( final GencodeGtfFeature feature, final String expected ) { + Assert.assertEquals(feature.toString(), expected); + } + + @Test(dataProvider
= "testIndexingProvider") + public void testIndexing( final String fileName, final String contig, final int start, final int end, final int numExpectedGenes ) { + + final File gencodeTestFile = new File(fileName); + testIndexHelper(contig, start, end, numExpectedGenes, gencodeTestFile); + } + + /** + * Runs the same {@code testIndexHelper} check as {@code testIndexing}, but against a copy of the test file placed in a fresh temp directory, + * after building a dynamic index for that copy and writing it to the path given by {@code Tribble.indexFile(testFile)}. + */ + @Test(dataProvider = "testIndexingProvider") + public void testIndexingAndIndexCreation( final String fileName, + final String contig, + final int start, + final int end, + final int numExpectedGenes ) throws IOException { + + GencodeGtfCodec codec = new GencodeGtfCodec(); + + // Create a temp dir: + final File tmpDir = createTempDir("testIndexingAndIndexCreation_" + start + "_" + end); + tmpDir.deleteOnExit(); + + // Work out where the copy of our test file will live: + final File originalTestFile = new File(fileName); + final File testFile = new File(tmpDir.getAbsolutePath() + File.separator + originalTestFile.getName()); + + // Copy our file to the tmp dir: + Files.copy(originalTestFile.toPath(), testFile.toPath(), REPLACE_EXISTING); + + // Create our Index: + File indexFile = Tribble.indexFile(testFile); + Index index = IndexFactory.createDynamicIndex(testFile, codec, IndexFactory.IndexBalanceApproach.FOR_SEEK_TIME); + index.write(indexFile); + + // Make sure it works: + testIndexHelper(contig, start, end, numExpectedGenes, testFile); + } +} diff --git a/src/test/resources/large/gencode.v19.LargeFile.gtf b/src/test/resources/large/gencode.v19.LargeFile.gtf new file mode 100644 index 00000000000..f49fd17840a --- /dev/null +++ b/src/test/resources/large/gencode.v19.LargeFile.gtf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23463901adf7900e1bef49721e273811aa812e0cf70f652a25025e365a731516 +size 42419032 diff --git a/src/test/resources/large/gencode.v19.LargeFile.gtf.idx b/src/test/resources/large/gencode.v19.LargeFile.gtf.idx new file mode 100644 index 00000000000..15a61a9d983 --- /dev/null +++ b/src/test/resources/large/gencode.v19.LargeFile.gtf.idx @@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1 +oid sha256:ca4d117e97bc960956b90be14d920b00fe7d2208d85eb9fb6905fb7a2bd1a0a6 +size 1545 diff --git a/src/test/resources/large/gencode.v26.primary_assembly.annotation.XYZ.gtf b/src/test/resources/large/gencode.v26.primary_assembly.annotation.XYZ.gtf new file mode 100644 index 00000000000..39a96edf1a6 --- /dev/null +++ b/src/test/resources/large/gencode.v26.primary_assembly.annotation.XYZ.gtf @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb7c2c9ce8bc9d70984f9d66c7fa444a53ee6cb21b7221ffbe3b77839104d2e1 +size 39967198 diff --git a/src/test/resources/large/gencode.v26.primary_assembly.annotation.XYZ.gtf.idx b/src/test/resources/large/gencode.v26.primary_assembly.annotation.XYZ.gtf.idx new file mode 100644 index 00000000000..f71a8fb29cf --- /dev/null +++ b/src/test/resources/large/gencode.v26.primary_assembly.annotation.XYZ.gtf.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b47c0d4db0df9a199acbcc46c320bac46f72b32194078438cfb5d7ed7cc555 +size 6921 diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.and.this.is.a.valid.one.too.table.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.and.this.is.a.valid.one.too.table.gtf new file mode 100644 index 00000000000..acb3165c10a --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.and.this.is.a.valid.one.too.table.gtf @@ -0,0 +1,74 @@ +##description: evidence-based annotation of the human genome (GRCh38), version 26 (Ensembl 88) +##provider: GENCODE +##contact: gencode-help@sanger.ac.uk +##format: gtf +##date: 2017-03-14 +KI270734.1 ENSEMBL gene 138082 161852 . - . gene_id "ENSG00000277196.4"; gene_type "protein_coding"; gene_name "AC007325.2"; level 3; +KI270734.1 ENSEMBL transcript 138082 161750 . - . 
gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 161689 161750 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 1; exon_id "ENSE00003735197.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 156289 156497 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 2; exon_id "ENSE00003737280.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 156289 156446 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 2; exon_id "ENSE00003737280.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL start_codon 156444 156446 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 2; exon_id "ENSE00003737280.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 150987 151021 . - . 
gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 3; exon_id "ENSE00003731891.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 150987 151021 . - 1 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 3; exon_id "ENSE00003731891.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 150350 150499 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 4; exon_id "ENSE00003724613.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 150350 150499 . - 2 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 4; exon_id "ENSE00003724613.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 148414 148478 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 5; exon_id "ENSE00003732418.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 148414 148478 . 
- 2 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 5; exon_id "ENSE00003732418.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 148116 148232 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 6; exon_id "ENSE00003733960.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 148116 148232 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 6; exon_id "ENSE00003733960.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 147624 147703 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 7; exon_id "ENSE00003727207.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 147624 147703 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 7; exon_id "ENSE00003727207.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 146640 146721 . - . 
gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 8; exon_id "ENSE00003728972.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 146640 146721 . - 1 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 8; exon_id "ENSE00003728972.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 145004 145096 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 9; exon_id "ENSE00003733844.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 145004 145096 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 9; exon_id "ENSE00003733844.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 144749 144895 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 10; exon_id "ENSE00003752738.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 144749 144895 . 
- 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 10; exon_id "ENSE00003752738.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 143614 143789 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 11; exon_id "ENSE00003720006.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 143614 143789 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 11; exon_id "ENSE00003720006.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 142194 142292 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 12; exon_id "ENSE00003719283.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 142194 142292 . - 1 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 12; exon_id "ENSE00003719283.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 138743 138831 . - . 
gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 13; exon_id "ENSE00003751415.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 138743 138831 . - 1 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 13; exon_id "ENSE00003751415.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 138082 138667 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 14; exon_id "ENSE00003753010.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 138483 138667 . - 2 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 14; exon_id "ENSE00003753010.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL stop_codon 138480 138482 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 14; exon_id "ENSE00003753010.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL UTR 161689 161750 . - . 
gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 1; exon_id "ENSE00003735197.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL UTR 156447 156497 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 2; exon_id "ENSE00003737280.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL UTR 138082 138482 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000615165.1"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-202"; exon_number 14; exon_id "ENSE00003753010.1"; level 3; protein_id "ENSP00000482462.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL transcript 138082 161852 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 161689 161852 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 1; exon_id "ENSE00003746084.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 161314 161626 . - . 
gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 2; exon_id "ENSE00003719550.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 161314 161586 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 2; exon_id "ENSE00003719550.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL start_codon 161584 161586 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 2; exon_id "ENSE00003719550.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 156289 156497 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 3; exon_id "ENSE00003723757.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 156289 156497 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 3; exon_id "ENSE00003723757.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 150987 151021 . - . 
gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 4; exon_id "ENSE00003731891.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 150987 151021 . - 1 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 4; exon_id "ENSE00003731891.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 150350 150499 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 5; exon_id "ENSE00003724613.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 150350 150499 . - 2 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 5; exon_id "ENSE00003724613.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 148414 148478 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 6; exon_id "ENSE00003732418.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 148414 148478 . 
- 2 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 6; exon_id "ENSE00003732418.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 148116 148232 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 7; exon_id "ENSE00003733960.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 148116 148232 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 7; exon_id "ENSE00003733960.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 147624 147703 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 8; exon_id "ENSE00003727207.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 147624 147703 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 8; exon_id "ENSE00003727207.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 146640 146721 . - . 
gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 9; exon_id "ENSE00003728972.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 146640 146721 . - 1 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 9; exon_id "ENSE00003728972.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 145004 145096 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 10; exon_id "ENSE00003733844.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 145004 145096 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 10; exon_id "ENSE00003733844.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 144749 144895 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 11; exon_id "ENSE00003752738.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 144749 144895 . 
- 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 11; exon_id "ENSE00003752738.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 143614 143789 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 12; exon_id "ENSE00003720006.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 143614 143789 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 12; exon_id "ENSE00003720006.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 142194 142292 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 13; exon_id "ENSE00003719283.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 142194 142292 . - 1 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 13; exon_id "ENSE00003719283.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 138743 138831 . - . 
gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 14; exon_id "ENSE00003751415.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 138743 138831 . - 1 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 14; exon_id "ENSE00003751415.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL exon 138082 138667 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 15; exon_id "ENSE00003753010.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL CDS 138483 138667 . - 2 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 15; exon_id "ENSE00003753010.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL stop_codon 138480 138482 . - 0 gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 15; exon_id "ENSE00003753010.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL UTR 161689 161852 . - . 
gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 1; exon_id "ENSE00003746084.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL UTR 161587 161626 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 2; exon_id "ENSE00003719550.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; +KI270734.1 ENSEMBL UTR 138082 138482 . - . gene_id "ENSG00000277196.4"; transcript_id "ENST00000621424.4"; gene_type "protein_coding"; gene_name "AC007325.2"; transcript_type "protein_coding"; transcript_name "AC007325.2-201"; exon_number 15; exon_id "ENSE00003753010.1"; level 3; protein_id "ENSP00000481127.1"; transcript_support_level "1"; tag "basic"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header.gtf new file mode 100644 index 00000000000..3146697d244 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header.gtf @@ -0,0 +1,17 @@ +## +## +## +## +## +chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; havana_gene "OTTHUMG00000000961.2"; +chr1 HAVANA transcript 11869 14409 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 1; exon_id "ENSE00002234944.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 12613 12721 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 2; exon_id "ENSE00003582793.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 13221 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 3; exon_id "ENSE00002312635.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA transcript 12010 13670 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12010 12057 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 1; exon_id "ENSE00001948541.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12179 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 2; exon_id "ENSE00001671638.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12613 12697 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 3; exon_id "ENSE00001758273.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12975 13052 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 4; exon_id "ENSE00001799933.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13221 13374 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 5; exon_id "ENSE00001746346.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13453 13670 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 6; exon_id "ENSE00001863096.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_cont.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_cont.gtf new file mode 100644 index 00000000000..2eef2d31bbd --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_cont.gtf @@ -0,0 +1,17 @@ +##description: evidence-based annotation of the human genome (GRCh38), version 26 (Ensembl 88) +##provider: GENCODE +##contact: NO HELP FOR YOU +##format: gtf +##date: 2017-03-14 +chr1 HAVANA 
gene 11869 14409 . + . gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; havana_gene "OTTHUMG00000000961.2"; +chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 1; exon_id "ENSE00002234944.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 12613 12721 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 2; exon_id "ENSE00003582793.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 13221 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 3; exon_id "ENSE00002312635.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA transcript 12010 13670 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12010 12057 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 1; exon_id "ENSE00001948541.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12179 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 2; exon_id "ENSE00001671638.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12613 12697 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 3; exon_id "ENSE00001758273.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12975 13052 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 4; exon_id "ENSE00001799933.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13221 13374 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 5; exon_id "ENSE00001746346.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13453 13670 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 6; exon_id "ENSE00001863096.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_date.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_date.gtf new file mode 100644 index 00000000000..9ee6e95787a --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_date.gtf @@ -0,0 +1,17 @@ +##description: evidence-based annotation of the human genome (GRCh38), version 26 (Ensembl 88) +##provider: GENCODE +##contact: gencode-help@sanger.ac.uk +##format: gtf +##done: 2017-03-14 
+chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; havana_gene "OTTHUMG00000000961.2"; +chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 1; exon_id "ENSE00002234944.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 12613 12721 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 2; exon_id "ENSE00003582793.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 13221 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 3; exon_id "ENSE00002312635.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA transcript 12010 13670 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12010 12057 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 1; exon_id "ENSE00001948541.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12179 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 2; exon_id "ENSE00001671638.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12613 12697 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 3; exon_id "ENSE00001758273.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12975 13052 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 4; exon_id "ENSE00001799933.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13221 13374 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 5; exon_id "ENSE00001746346.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13453 13670 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 6; exon_id "ENSE00001863096.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_desc.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_desc.gtf new file mode 100644 index 00000000000..8757d061925 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_desc.gtf @@ -0,0 +1,17 @@ +##descriptin: evidence-based annotation of the human genome (GRCh38), version 26 (Ensembl 88) +##provider: GENCODE +##contact: gencode-help@sanger.ac.uk +##format: gtf +##date: 2017-03-14 +chr1 
HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; havana_gene "OTTHUMG00000000961.2"; +chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 1; exon_id "ENSE00002234944.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 12613 12721 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 2; exon_id "ENSE00003582793.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 13221 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 3; exon_id "ENSE00002312635.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA transcript 12010 13670 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12010 12057 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 1; exon_id "ENSE00001948541.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12179 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 2; exon_id "ENSE00001671638.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12613 12697 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 3; exon_id "ENSE00001758273.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12975 13052 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 4; exon_id "ENSE00001799933.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13221 13374 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 5; exon_id "ENSE00001746346.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13453 13670 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 6; exon_id "ENSE00001863096.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_form.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_form.gtf new file mode 100644 index 00000000000..fc3e9a904e8 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_form.gtf @@ -0,0 +1,17 @@ +##description: evidence-based annotation of the human genome (GRCh38), version 26 (Ensembl 88) +##provider: GENCODE +##contact: gencode-help@sanger.ac.uk +##format: GARBAGEDAY +##date: 
2017-03-14 +chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; havana_gene "OTTHUMG00000000961.2"; +chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 1; exon_id "ENSE00002234944.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 12613 12721 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 2; exon_id "ENSE00003582793.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 13221 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 3; exon_id "ENSE00002312635.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA transcript 12010 13670 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12010 12057 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 1; exon_id "ENSE00001948541.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12179 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 2; exon_id "ENSE00001671638.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12613 12697 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 3; exon_id "ENSE00001758273.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12975 13052 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 4; exon_id "ENSE00001799933.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13221 13374 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 5; exon_id "ENSE00001746346.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13453 13670 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 6; exon_id "ENSE00001863096.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_prov.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_prov.gtf new file mode 100644 index 00000000000..caca25893e7 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_malformed_header_prov.gtf @@ -0,0 +1,17 @@ +##description: evidence-based annotation of the human genome (GRCh38), version 26 (Ensembl 88) +##provider: Pangolins. Lots and lots of Pangolins. 
+##contact: gencode-help@sanger.ac.uk +##format: gtf +##date: 2017-03-14 +chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; havana_gene "OTTHUMG00000000961.2"; +chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 1; exon_id "ENSE00002234944.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 12613 12721 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 2; exon_id "ENSE00003582793.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 13221 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 3; exon_id "ENSE00002312635.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA transcript 12010 13670 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12010 12057 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 1; exon_id "ENSE00001948541.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12179 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 2; exon_id "ENSE00001671638.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12613 12697 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 3; exon_id "ENSE00001758273.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12975 13052 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 4; exon_id "ENSE00001799933.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13221 13374 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 5; exon_id "ENSE00001746346.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13453 13670 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 6; exon_id "ENSE00001863096.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_short_header.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_short_header.gtf new file mode 100644 index 00000000000..3162ee781fc --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.invalid_short_header.gtf @@ -0,0 +1,14 @@ +##description: evidence-based annotation of the human genome (GRCh38), version 26 (Ensembl 88) +##provider: GENCODE +chr1 HAVANA gene 11869 14409 . + . 
gene_id "ENSG00000223972.5"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; havana_gene "OTTHUMG00000000961.2"; +chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 1; exon_id "ENSE00002234944.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 12613 12721 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 2; exon_id "ENSE00003582793.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 13221 14409 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000456328.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_name "DDX11L1-002"; exon_number 3; exon_id "ENSE00002312635.1"; level 2; transcript_support_level "1"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA transcript 12010 13670 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12010 12057 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 1; exon_id "ENSE00001948541.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12179 12227 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 2; exon_id "ENSE00001671638.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12613 12697 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 3; exon_id "ENSE00001758273.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 12975 13052 . + . 
gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 4; exon_id "ENSE00001799933.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13221 13374 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 5; exon_id "ENSE00001746346.2"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; +chr1 HAVANA exon 13453 13670 . + . gene_id "ENSG00000223972.5"; transcript_id "ENST00000450305.2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; transcript_type "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-001"; exon_number 6; exon_id "ENSE00001863096.1"; level 2; transcript_support_level "NA"; ont "PGO:0000005"; ont "PGO:0000019"; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000002844.2"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.and.this.is.a.valid.one.too.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.and.this.is.a.valid.one.too.gtf new file mode 100644 index 00000000000..9ad3dd9cc9d --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.and.this.is.a.valid.one.too.gtf @@ -0,0 +1,20 @@ +##description: evidence-based annotation of the human genome (GRCh37), version 19 (Ensembl 74) +##provider: GENCODE +##contact: gencode@sanger.ac.uk +##format: gtf +##date: 2013-12-06 
+GL000218.1 ENSEMBL gene 38792 97421 . - . gene_id "ENSG00000215615.1"; transcript_id "ENSG00000215615.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1"; level 3; +GL000218.1 ENSEMBL transcript 38792 97421 . - . gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; level 3; tag "basic"; +GL000218.1 ENSEMBL exon 97368 97421 . - . gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; exon_number 1; exon_id "ENSE00001544212.1"; level 3; tag "basic"; +GL000218.1 ENSEMBL exon 95174 95232 . - . gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; exon_number 2; exon_id "ENSE00001849396.1"; level 3; tag "basic"; +GL000218.1 ENSEMBL CDS 95174 95230 . - 0 gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; exon_number 2; exon_id "ENSE00001849396.1"; level 3; tag "basic"; +GL000218.1 ENSEMBL start_codon 95228 95230 . - 0 gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; exon_number 2; exon_id "ENSE00001849396.1"; level 3; tag "basic"; +GL000218.1 ENSEMBL exon 40642 40872 . - . 
gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; exon_number 3; exon_id "ENSE00001900862.1"; level 3; tag "basic"; +GL000218.1 ENSEMBL CDS 40642 40872 . - 0 gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; exon_number 3; exon_id "ENSE00001900862.1"; level 3; tag "basic"; +GL000218.1 ENSEMBL exon 39874 40028 . - . gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; exon_number 4; exon_id "ENSE00001544206.1"; level 3; tag "basic"; +GL000218.1 ENSEMBL CDS 39874 40028 . - 0 gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; exon_number 4; exon_id "ENSE00001544206.1"; level 3; tag "basic"; +GL000218.1 ENSEMBL exon 38792 39019 . - . gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; exon_number 5; exon_id "ENSE00001544203.1"; level 3; tag "basic"; +GL000218.1 ENSEMBL CDS 38794 39019 . 
- 1 gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; exon_number 5; exon_id "ENSE00001544203.1"; level 3; tag "basic"; +GL000218.1 ENSEMBL UTR 97368 97421 . - . gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; level 3; tag "basic"; +GL000218.1 ENSEMBL UTR 95231 95232 . - . gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; level 3; tag "basic"; +GL000218.1 ENSEMBL UTR 38792 38793 . - . gene_id "ENSG00000215615.1"; transcript_id "ENST00000400681.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "AL354822.1"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "AL354822.1-201"; level 3; tag "basic"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.valid1.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.valid1.gtf new file mode 100644 index 00000000000..ae5ca88a6b8 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.valid1.gtf @@ -0,0 +1,8 @@ +##description: evidence-based annotation of the human genome (GRCh37), version 19 (Ensembl 74) +##provider: GENCODE +##contact: gencode@sanger.ac.uk +##format: gtf +##date: 2013-12-06 +chr1 HAVANA gene 11869 14412 . + . 
gene_id "ENSG00000223972.4"; transcript_id "ENSG00000223972.4"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "DDX11L1"; transcript_type "pseudogene"; transcript_status "KNOWN"; transcript_name "DDX11L1"; level 2; havana_gene "OTTHUMG00000000961.2"; +chr1 HAVANA transcript 11869 14409 . + . gene_id "ENSG00000223972.4"; transcript_id "ENST00000456328.2"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "DDX11L1-002"; level 2; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; +chr1 HAVANA exon 11869 12227 . + . gene_id "ENSG00000223972.4"; transcript_id "ENST00000456328.2"; gene_type "pseudogene"; gene_status "KNOWN"; gene_name "DDX11L1"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "DDX11L1-002"; exon_number 1; exon_id "ENSE00002234944.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000000961.2"; havana_transcript "OTTHUMT00000362751.1"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.valid_gencode_file2.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.valid_gencode_file2.gtf new file mode 100644 index 00000000000..ed830412a33 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.v19.valid_gencode_file2.gtf @@ -0,0 +1,43 @@ +##description: evidence-based annotation of the human genome (GRCh37), version 19 (Ensembl 74) +##provider: GENCODE +##contact: gencode@sanger.ac.uk +##format: gtf +##date: 2013-12-06 +chr22 HAVANA gene 50637519 50638976 . - . 
gene_id "ENSG00000273253.1"; transcript_id "ENSG00000273253.1"; gene_type "antisense"; gene_status "NOVEL"; gene_name "RP3-402G11.26"; transcript_type "antisense"; transcript_status "NOVEL"; transcript_name "RP3-402G11.26"; level 2; havana_gene "OTTHUMG00000186123.2"; +chr22 HAVANA transcript 50637519 50638976 . - . gene_id "ENSG00000273253.1"; transcript_id "ENST00000608025.1"; gene_type "antisense"; gene_status "NOVEL"; gene_name "RP3-402G11.26"; transcript_type "antisense"; transcript_status "KNOWN"; transcript_name "RP3-402G11.26-001"; level 2; tag "basic"; havana_gene "OTTHUMG00000186123.2"; havana_transcript "OTTHUMT00000472292.2"; +chr22 HAVANA exon 50638505 50638976 . - . gene_id "ENSG00000273253.1"; transcript_id "ENST00000608025.1"; gene_type "antisense"; gene_status "NOVEL"; gene_name "RP3-402G11.26"; transcript_type "antisense"; transcript_status "KNOWN"; transcript_name "RP3-402G11.26-001"; exon_number 1; exon_id "ENSE00003710600.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000186123.2"; havana_transcript "OTTHUMT00000472292.2"; +chr22 HAVANA exon 50637519 50637757 . - . gene_id "ENSG00000273253.1"; transcript_id "ENST00000608025.1"; gene_type "antisense"; gene_status "NOVEL"; gene_name "RP3-402G11.26"; transcript_type "antisense"; transcript_status "KNOWN"; transcript_name "RP3-402G11.26-001"; exon_number 2; exon_id "ENSE00003710731.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000186123.2"; havana_transcript "OTTHUMT00000472292.2"; +chr22 HAVANA gene 50639408 50656045 . + . gene_id "ENSG00000073169.9"; transcript_id "ENSG00000073169.9"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO"; level 2; havana_gene "OTTHUMG00000044645.3"; +chr22 HAVANA transcript 50639408 50656045 . + . 
gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA Selenocysteine 50655787 50655789 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50639408 50640019 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 1; exon_id "ENSE00001541223.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50639466 50640019 . + 0 gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 1; exon_id "ENSE00001541223.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA start_codon 50639466 50639468 . 
+ 0 gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 1; exon_id "ENSE00001541223.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50644746 50644949 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 2; exon_id "ENSE00001129529.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50644746 50644949 . + 1 gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 2; exon_id "ENSE00001129529.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50646965 50647145 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 3; exon_id "ENSE00001129524.2"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50646965 50647145 . 
+ 1 gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 3; exon_id "ENSE00001129524.2"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50648610 50648740 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 4; exon_id "ENSE00003473644.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50648610 50648740 . + 0 gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 4; exon_id "ENSE00003473644.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50649060 50649340 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 5; exon_id "ENSE00003503715.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50649060 50649340 . 
+ 1 gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 5; exon_id "ENSE00003503715.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50654146 50654296 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 6; exon_id "ENSE00003573348.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50654146 50654296 . + 2 gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 6; exon_id "ENSE00003573348.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50655120 50655305 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 7; exon_id "ENSE00003510005.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50655120 50655305 . 
+ 1 gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 7; exon_id "ENSE00003510005.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50655401 50655557 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 8; exon_id "ENSE00003591346.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50655401 50655557 . + 1 gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 8; exon_id "ENSE00003591346.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50655634 50656045 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 9; exon_id "ENSE00003512975.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50655634 50655795 . 
+ 0 gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 9; exon_id "ENSE00003512975.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA stop_codon 50655796 50655798 . + 0 gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; exon_number 9; exon_id "ENSE00003512975.1"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA UTR 50639408 50639465 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA UTR 50655796 50656045 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000380903.2"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "protein_coding"; transcript_status "KNOWN"; transcript_name "SELO-001"; level 2; tag "basic"; tag "appris_principal"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA transcript 50644871 50656045 . + . 
gene_id "ENSG00000073169.9"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SELO-002"; level 2; tag "basic"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50644871 50644949 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SELO-002"; exon_number 1; exon_id "ENSE00001890724.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50646917 50647145 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SELO-002"; exon_number 2; exon_id "ENSE00001952603.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50648610 50648740 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SELO-002"; exon_number 3; exon_id "ENSE00003583919.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50649060 50649340 . + . 
gene_id "ENSG00000073169.9"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SELO-002"; exon_number 4; exon_id "ENSE00003620115.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50654146 50654296 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SELO-002"; exon_number 5; exon_id "ENSE00003636069.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50655120 50655305 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SELO-002"; exon_number 6; exon_id "ENSE00003579717.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50655401 50655557 . + . gene_id "ENSG00000073169.9"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SELO-002"; exon_number 7; exon_id "ENSE00003650938.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50655634 50656045 . + . 
gene_id "ENSG00000073169.9"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_status "KNOWN"; gene_name "SELO"; transcript_type "processed_transcript"; transcript_status "KNOWN"; transcript_name "SELO-002"; exon_number 8; exon_id "ENSE00003475904.1"; level 2; tag "basic"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.valid1.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.valid1.gtf new file mode 100644 index 00000000000..2f873c2aa57 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.valid1.gtf @@ -0,0 +1,8 @@ +##description: evidence-based annotation of the human genome (GRCh38), version 26 (Ensembl 88) +##provider: GENCODE +##contact: gencode-help@sanger.ac.uk +##format: gtf +##date: 2017-03-14 +chr1 ENSEMBL gene 30366 30503 . + . gene_id "ENSG00000284332.1"; gene_type "miRNA"; gene_name "MIR1302-2"; level 3; +chr1 ENSEMBL transcript 30366 30503 . + . gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; level 3; transcript_support_level "NA"; tag "basic"; +chr1 ENSEMBL exon 30366 30503 . + . 
gene_id "ENSG00000284332.1"; transcript_id "ENST00000607096.1"; gene_type "miRNA"; gene_name "MIR1302-2"; transcript_type "miRNA"; transcript_name "MIR1302-2-201"; exon_number 1; exon_id "ENSE00003695741.1"; level 3; transcript_support_level "NA"; tag "basic"; diff --git a/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.valid_gencode_file2.gtf b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.valid_gencode_file2.gtf new file mode 100644 index 00000000000..8315fdd03f5 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/utils/codecs/GENCODE/gencode.valid_gencode_file2.gtf @@ -0,0 +1,64 @@ +##description: evidence-based annotation of the human genome (GRCh38), version 26 (Ensembl 88) +##provider: GENCODE +##contact: gencode-help@sanger.ac.uk +##format: gtf +##date: 2017-03-14 +chr22 HAVANA gene 50200979 50217616 . + . gene_id "ENSG00000073169.13"; gene_type "protein_coding"; gene_name "SELENOO"; level 2; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL transcript 50200979 50217615 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL exon 50200979 50201590 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 1; exon_id "ENSE00001541223.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL CDS 50201037 50201590 . 
+ 0 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 1; exon_id "ENSE00001541223.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL start_codon 50201037 50201039 . + 0 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 1; exon_id "ENSE00001541223.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL exon 50206317 50206520 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 2; exon_id "ENSE00001129529.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL CDS 50206317 50206520 . + 1 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 2; exon_id "ENSE00001129529.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL exon 50208536 50208716 . + . 
gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 3; exon_id "ENSE00001129524.2"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL CDS 50208536 50208716 . + 1 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 3; exon_id "ENSE00001129524.2"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL exon 50210181 50210311 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 4; exon_id "ENSE00003473644.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL CDS 50210181 50210311 . + 0 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 4; exon_id "ENSE00003473644.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL exon 50210631 50210911 . + . 
gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 5; exon_id "ENSE00003503715.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL CDS 50210631 50210911 . + 1 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 5; exon_id "ENSE00003503715.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL exon 50215717 50215867 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 6; exon_id "ENSE00003573348.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL CDS 50215717 50215867 . + 2 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 6; exon_id "ENSE00003573348.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL exon 50216691 50216876 . + . 
gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 7; exon_id "ENSE00003510005.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL CDS 50216691 50216876 . + 1 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 7; exon_id "ENSE00003510005.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL exon 50216972 50217128 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 8; exon_id "ENSE00003591346.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL CDS 50216972 50217128 . + 1 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 8; exon_id "ENSE00003591346.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL exon 50217205 50217357 . + . 
gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 9; exon_id "ENSE00003728455.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL CDS 50217205 50217357 . + 0 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 9; exon_id "ENSE00003728455.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL exon 50217361 50217615 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 10; exon_id "ENSE00003739808.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL CDS 50217361 50217366 . + 0 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 10; exon_id "ENSE00003739808.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL stop_codon 50217367 50217369 . 
+ 0 gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 10; exon_id "ENSE00003739808.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL UTR 50200979 50201036 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 1; exon_id "ENSE00001541223.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 ENSEMBL UTR 50217367 50217615 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000611222.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-201"; exon_number 10; exon_id "ENSE00003739808.1"; level 3; protein_id "ENSP00000483593.1"; transcript_support_level "5"; tag "basic"; tag "appris_alternative_2"; havana_gene "OTTHUMG00000044645.3"; +chr22 HAVANA transcript 50200979 50217616 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA Selenocysteine 50217358 50217360 . + . 
gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50200979 50201590 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 1; exon_id "ENSE00001541223.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50201037 50201590 . + 0 gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 1; exon_id "ENSE00001541223.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA start_codon 50201037 50201039 . + 0 gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 1; exon_id "ENSE00001541223.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50206317 50206520 . + . 
gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 2; exon_id "ENSE00001129529.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50206317 50206520 . + 1 gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 2; exon_id "ENSE00001129529.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50208536 50208716 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 3; exon_id "ENSE00001129524.2"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50208536 50208716 . 
+ 1 gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 3; exon_id "ENSE00001129524.2"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50210181 50210311 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 4; exon_id "ENSE00003473644.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50210181 50210311 . + 0 gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 4; exon_id "ENSE00003473644.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50210631 50210911 . + . 
gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 5; exon_id "ENSE00003503715.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50210631 50210911 . + 1 gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 5; exon_id "ENSE00003503715.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50215717 50215867 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 6; exon_id "ENSE00003573348.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50215717 50215867 . 
+ 2 gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 6; exon_id "ENSE00003573348.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50216691 50216876 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 7; exon_id "ENSE00003510005.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50216691 50216876 . + 1 gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 7; exon_id "ENSE00003510005.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50216972 50217128 . + . 
gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 8; exon_id "ENSE00003591346.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50216972 50217128 . + 1 gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 8; exon_id "ENSE00003591346.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA exon 50217205 50217616 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 9; exon_id "ENSE00003512975.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA CDS 50217205 50217366 . 
+ 0 gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 9; exon_id "ENSE00003512975.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA stop_codon 50217367 50217369 . + 0 gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 9; exon_id "ENSE00003512975.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA UTR 50200979 50201036 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 1; exon_id "ENSE00001541223.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA UTR 50217367 50217616 . + . 
gene_id "ENSG00000073169.13"; transcript_id "ENST00000380903.6"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "protein_coding"; transcript_name "SELENOO-001"; exon_number 9; exon_id "ENSE00003512975.1"; level 2; protein_id "ENSP00000370288.2"; transcript_support_level "1"; tag "basic"; tag "appris_principal_2"; tag "CCDS"; tag "seleno"; ccdsid "CCDS43034.1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000075003.2"; +chr22 HAVANA transcript 50206442 50217616 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "processed_transcript"; transcript_name "SELENOO-002"; level 2; transcript_support_level "1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50206442 50206520 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "processed_transcript"; transcript_name "SELENOO-002"; exon_number 1; exon_id "ENSE00001890724.1"; level 2; transcript_support_level "1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50208488 50208716 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "processed_transcript"; transcript_name "SELENOO-002"; exon_number 2; exon_id "ENSE00001952603.1"; level 2; transcript_support_level "1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50210181 50210311 . + . 
gene_id "ENSG00000073169.13"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "processed_transcript"; transcript_name "SELENOO-002"; exon_number 3; exon_id "ENSE00003583919.1"; level 2; transcript_support_level "1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50210631 50210911 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "processed_transcript"; transcript_name "SELENOO-002"; exon_number 4; exon_id "ENSE00003620115.1"; level 2; transcript_support_level "1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50215717 50215867 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "processed_transcript"; transcript_name "SELENOO-002"; exon_number 5; exon_id "ENSE00003636069.1"; level 2; transcript_support_level "1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50216691 50216876 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "processed_transcript"; transcript_name "SELENOO-002"; exon_number 6; exon_id "ENSE00003579717.1"; level 2; transcript_support_level "1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50216972 50217128 . + . gene_id "ENSG00000073169.13"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "processed_transcript"; transcript_name "SELENOO-002"; exon_number 7; exon_id "ENSE00003650938.1"; level 2; transcript_support_level "1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1"; +chr22 HAVANA exon 50217205 50217616 . + . 
gene_id "ENSG00000073169.13"; transcript_id "ENST00000492092.1"; gene_type "protein_coding"; gene_name "SELENOO"; transcript_type "processed_transcript"; transcript_name "SELENOO-002"; exon_number 8; exon_id "ENSE00003475904.1"; level 2; transcript_support_level "1"; havana_gene "OTTHUMG00000044645.3"; havana_transcript "OTTHUMT00000316993.1";