Skip to content

Commit

Permalink
Created AvroFileReader and unittest, Update ExtractCohort and Extract…
Browse files Browse the repository at this point in the history
…CohortEngine (#7174)

-Created AvroFileReader

Update ExtractCohort and ExtractCohortEngine to accept a AvroFile
Testing:
Created a test for AvroFileReader

Co-authored-by: Marianie-Simeon <msimeon@broadinstitue.org>
  • Loading branch information
2 people authored and mmorgantaylor committed Apr 6, 2021
1 parent 27af70f commit 4d83f4f
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ public class ExtractCohort extends ExtractTool {
)
private String cohortTable = null;

@Argument(
fullName = "cohort-avro-file-name",
doc = "Path of the cohort avro file",
optional = true
)
private String cohortAvroFileName = null;

@Argument(
fullName = "filter-set-name",
doc = "Name in filter_set_name column of filtering table to use. Which training set should be applied in extract.",
Expand Down Expand Up @@ -83,6 +90,7 @@ protected void onStartup() {
sampleNames,
mode,
cohortTable,
cohortAvroFileName,
minLocation,
maxLocation,
filteringFQTableName,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ public class ExtractCohortEngine {
private int totalNumberOfVariants = 0;
private int totalNumberOfSites = 0;

private final String cohortAvroFileName;
private final String filterSetName;

/**
Expand All @@ -81,6 +82,7 @@ public ExtractCohortEngine(final String projectID,
final Set<String> sampleNames,
final CommonCode.ModeEnum mode,
final String cohortTableName,
final String cohortAvroFileName,
final Long minLocation,
final Long maxLocation,
final String filteringTableName,
Expand All @@ -101,6 +103,7 @@ public ExtractCohortEngine(final String projectID,
this.mode = mode;

this.cohortTableRef = new TableReference(cohortTableName, SchemaUtils.COHORT_FIELDS);
this.cohortAvroFileName = cohortAvroFileName;
this.minLocation = minLocation;
this.maxLocation = maxLocation;
this.filteringTableRef = filteringTableName == null || "".equals(filteringTableName) ? null : new TableReference(filteringTableName, SchemaUtils.YNG_FIELDS);
Expand Down Expand Up @@ -164,8 +167,14 @@ public void traverse() {
logger.debug("using storage api with local sort");
}
logger.debug("Initializing Reader");
final StorageAPIAvroReader storageAPIAvroReader = new StorageAPIAvroReader(cohortTableRef, rowRestriction, projectID);
createVariantsFromUngroupedTableResult(storageAPIAvroReader, fullVqsLodMap, fullYngMap, noFilteringRequested);
if (cohortTableRef != null){
final StorageAPIAvroReader storageAPIAvroReader = new StorageAPIAvroReader(cohortTableRef, rowRestriction, projectID);
createVariantsFromUngroupedTableResult(storageAPIAvroReader, fullVqsLodMap, fullYngMap, noFilteringRequested);
}
else {
final AvroFileReader avroFileReader = new AvroFileReader(cohortAvroFileName);
createVariantsFromUngroupedTableResult(avroFileReader, fullVqsLodMap, fullYngMap, noFilteringRequested);
}
logger.debug("Finished Initializing Reader");
break;
case QUERY:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package org.broadinstitute.hellbender.utils.bigquery;

import org.apache.avro.file.DataFileStream;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DecoderFactory;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.bigquery.GATKAvroReader;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.apache.avro.Schema;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;

public class AvroFileReader implements GATKAvroReader {
private DatumReader<GenericRecord> datumReader;
private DataFileStream<GenericRecord> dataFileStream;
private org.apache.avro.Schema schema;

// Decoder object will be reused to avoid re-allocation and too much garbage collection.
private BinaryDecoder decoder = null;

// GenericRecord object will be reused.
private GenericRecord nextRow = null;

public AvroFileReader(final String avroFileURI ) {
try {
final Path avroFilePath = IOUtils.getPath(avroFileURI);

datumReader = new GenericDatumReader<>();
dataFileStream = new DataFileStream<>(Files.newInputStream(avroFilePath), datumReader);
schema = dataFileStream.getSchema();
} catch (IOException e ) {
throw new GATKException("I/O Error", e);
}
}

@Override
public Schema getSchema() {
return schema;
}

@Override
public boolean hasNext() {
return dataFileStream.hasNext();
}

@Override
public GenericRecord next() {
return dataFileStream.next();
}

@Override
public void close() {
try {
dataFileStream.close();
} catch (IOException e ) {
throw new GATKException("Error closing stream", e);
}
}

@Override
public Iterator<GenericRecord> iterator() {
return this;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
package org.broadinstitute.hellbender.utils.bigquery;


import org.apache.avro.generic.GenericRecord;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.testng.Assert;
import org.testng.annotations.Test;
import org.apache.avro.Schema;

import java.util.*;

/**
* A class to test the functionality of {@link AvroFileReader}.
*/
public class AvroFileReaderUnitTest extends GATKBaseTest {

private static final String avroFileName = "src/test/java/org/broadinstitute/hellbender/utils/bigquery/avro_test_avro_test_file.avro";
private static final AvroFileReader avroFile = new AvroFileReader(avroFileName);

@Test()
public void testGetSchema() {
Schema avroFileSchema = avroFile.getSchema();
String testAvroFileSchema = "{\"type\":\"record\",\"name\":\"Root\",\"fields\":[{\"name\":\"test_string\",\"type\":[\"null\",\"string\"]},{\"name\":\"test_float\",\"type\":[\"null\",\"double\"]},{\"name\":\"test_integer\",\"type\":[\"null\",\"long\"]}]}";
Assert.assertEquals(avroFileSchema.toString(), testAvroFileSchema, "AvroFileSchema did not match.");
}

@Test()
public void testAvroFileHasNext() {
boolean avroFileHasNext = avroFile.hasNext();
Assert.assertTrue(avroFileHasNext, "Arvo File Reader didn't detect");
}

@Test()
public void testAvroFileNext() {
GenericRecord avroFileNext = avroFile.next();
String testAvroFileNext = "{\"test_string\": \"one\", \"test_float\": 1111111.0, \"test_integer\": 1}";
Assert.assertEquals(avroFileNext.toString(), testAvroFileNext, "AvroFile Next did not match.");
}

}
Binary file not shown.

0 comments on commit 4d83f4f

Please sign in to comment.