Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix overflow with huge header in VCF file #12

Merged
merged 1 commit into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,16 @@
import java.util.List;
import java.util.Vector;

import genepi.io.text.LineReader;
import genepi.io.FileUtil;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.InputStreamReader;

import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;

public class FastVCFFileReader extends LineReader {
public class FastVCFFileReader {

private List<String> samples;

Expand All @@ -21,13 +26,19 @@ public class FastVCFFileReader extends LineReader {

private List<String> header = new Vector<>();

private VCFLineParser parser;
private String filename;

protected BufferedReader in;

private int lineNumber;

public FastVCFFileReader(String vcfFilename) throws IOException {
private String line;

super(vcfFilename);
private VCFLineParser parser;

public FastVCFFileReader(String filename) throws IOException {
// load header
VCFFileReader reader = new VCFFileReader(new File(vcfFilename), false);
VCFFileReader reader = new VCFFileReader(new File(filename), false);
VCFHeader header = reader.getFileHeader();
samples = header.getGenotypeSamples();
samplesCount = samples.size();
Expand All @@ -36,6 +47,11 @@ public FastVCFFileReader(String vcfFilename) throws IOException {

parser = new VCFLineParser(samplesCount);

this.filename = filename;
FileInputStream inputStream = new FileInputStream(filename);
InputStream in2 = FileUtil.decompressStream(inputStream);
this.in = new BufferedReader(new InputStreamReader(in2));

}

public List<String> getGenotypedSamples() {
Expand All @@ -54,27 +70,46 @@ public int getSamplesCount() {
return samplesCount;
}

@Override
protected void parseLine(String line) throws IOException {

// not a header line
if (line.charAt(0) != '#') {

variantContext = parser.parseLine(line);

if (variantContext.getNSamples() != samplesCount) {
throw new IOException("Line " + getLineNumber() + ": different number of samples.");
/**
 * Advances the reader to the next variant line.
 *
 * <p>Lines are read one at a time from the underlying reader. Every line read
 * increments the line counter, including blank and header lines. Blank lines
 * are skipped; lines starting with {@code '#'} are appended to the header list
 * and skipped. The first remaining line is handed to
 * {@link #parseLine(String)}.
 *
 * @return {@code true} if a variant line was parsed, {@code false} once the
 *         end of the input has been reached
 * @throws IOException if reading fails, or if parsing the line fails; in the
 *         latter case the message is prefixed with the filename and line
 *         number and the original exception is attached as the cause
 */
public boolean next() throws IOException {
    while ((this.line = this.in.readLine()) != null) {
        this.lineNumber++;

        if (this.line.trim().isEmpty()) {
            continue;
        }

        // Header lines are collected, not parsed.
        if (this.line.startsWith("#")) {
            header.add(this.line);
            continue;
        }

        try {
            this.parseLine(this.line);
        } catch (Exception e) {
            // Deliberately broad: any parser failure (checked or unchecked) is
            // reported as an IOException with file/line context. The cause is
            // kept so the original stack trace is not lost.
            throw new IOException(this.filename + ": Line " + this.lineNumber + ": " + e.getMessage(), e);
        }
        return true;
    }
    return false;
}

} else {
header.add(line);
next();
/**
 * Parses a single non-header line and stores the result as the current
 * variant context.
 *
 * <p>The parsed context is stored before the sample count is checked, so it
 * stays assigned even when the check fails. The variant counter is only
 * incremented when the check passes.
 *
 * @param line the raw line to parse
 * @throws IOException if the parsed line reports a number of samples that
 *         differs from the expected sample count
 */
protected void parseLine(String line) throws IOException {
    variantContext = parser.parseLine(line);

    final boolean sampleCountMatches = variantContext.getNSamples() == samplesCount;
    if (sampleCountMatches) {
        snpsCount++;
        return;
    }

    throw new IOException("Line " + lineNumber + ": different number of samples.");
}

/**
 * Closes the underlying line reader.
 *
 * @throws IOException if closing the reader fails
 */
public void close() throws IOException {
    this.in.close();
}


/**
 * Returns the header lines collected so far.
 *
 * <p>This is the same list that {@code next()} appends to whenever it reads a
 * line starting with {@code '#'}; it is returned directly, not copied.
 *
 * @return the list of header lines
 */
public List<String> getFileHeader() {
    return this.header;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,20 @@ public void testQcStatisticsFilterSampleCallrate() throws Exception {

}

/**
 * Runs the quality-control command on the {@code big-header} test data and
 * expects {@code call()} to return 1, then prints the Cloudgene log.
 */
@Test
public void testBigHeader() throws Exception {

    QualityControlCommand qualityControl = buildCommand("test-data/data/big-header");
    qualityControl.setReference("test-data/configs/hapmap-chr1/hapmap2-region-simple.json");

    assertEquals(1, (int) qualityControl.call());

    new OutputReader(CLOUDGENE_LOG).view();

}

@Test
public void testChr23PipelineLifting() throws Exception {

Expand Down
Binary file added test-data/data/big-header/minimac_test.50.vcf.gz
Binary file not shown.
Loading