Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update SpliceAI GPU #983

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
5 changes: 5 additions & 0 deletions modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ sub merge_vcf_files {

# Sort final file
$self->run_system_command("bcftools sort -o $final_file_sorted $final_file");
$self->run_system_command("bgzip $final_file_sorted");
$self->run_system_command("tabix -p vcf $final_file_sorted.gz");

# Remove unsorted file
$self->run_system_command("rm $final_file");
}

1;
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,12 @@ sub run_spliceai {
die("Directory ($output_vcf_files_dir) doesn't exist");
}

my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation";
my $tmp_dir = $main_dir . "/tmp";

# activate conda env
$self->run_system_command("source activate spliceai");

my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation -B 4096 -T 256 -t $tmp_dir";

# Add option to calculate masked scores
if($masked_scores) {
Expand Down
32 changes: 21 additions & 11 deletions modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
Original file line number Diff line number Diff line change
Expand Up @@ -54,21 +54,21 @@ sub default_options {
pipeline_name => 'spliceai_scores',
main_dir => $self->o('main_dir'), # main directory where all files and directories are going to be stored
input_directory => $self->o('main_dir') . '/input_vcf_files', # input files
split_vcf_no_header_dir => $self->o('main_dir') . '/split_vcf_no_header', # contains the input files after being splitted (files without headers)
split_vcf_no_header_dir => $self->o('main_dir') . '/split_vcf_no_header', # contains the input files after being splitted (files without headers)
split_vcf_input_dir => $self->o('main_dir') . '/split_vcf_input', # contains the splitted input vcf files with headers, these are the files used to run SpliceAI
split_vcf_output_dir => $self->o('main_dir') . '/split_vcf_output', # temporary output files, still splitted
output_dir => $self->o('main_dir') . '/output', # final output files already merged by chromosome
fasta_file => $self->o('fasta_file'),
gene_annotation => $self->o('gene_annotation'),
step_size => 4000, # number of variants used to split the main vcf files
step_size => 500000, # number of variants used to split the main vcf files
check_transcripts => 0, # if set to 1 checks which are the new MANE Select transcripts for the last months and only calculates SpliceAI scores for these variants overlapping these transcripts
transcripts_from_file => undef,
time_interval => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db
masked_scores => 1, # calculate masked scores
time_interval => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db
masked_scores => 1, # calculate masked scores
registry => undef, # database where new MANE transcripts are going to be checked; only used if check_transcripts = 1
output_file_name => 'spliceai_final_scores_',
output_file_name => 'spliceai_final_scores_' . $self->o('ensembl_release') . '_',

pipeline_wide_analysis_capacity => 500,
pipeline_wide_analysis_capacity => 80,

pipeline_db => {
-host => $self->o('hive_db_host'),
Expand All @@ -85,8 +85,18 @@ sub resource_classes {
my ($self) = @_;
return {
%{$self->SUPER::resource_classes},
'8Gb_8c_job' => {'LSF' => '-n 8 -q production -R"select[mem>8000] rusage[mem=8000]" -M8000' },
'4Gb_job' => {'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000'},
'gpu' => {
'LSF' => '-q gpu-a100 -gpu "num=1:gmem=80000"', # the queue gpu-a100 can access /hps/nobackup
'SLURM' => '--time=5:00:00 --gres=gpu:a100:1 --mem=24G'
},
'4Gb_job' => {
'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000',
'SLURM' => "--partition=standard --time=4:00:00 --mem=4G"
},
'default' => {
'LSF' => '-R"select[mem>1000] rusage[mem=1000]" -M1000',
'SLURM' => "--partition=standard --time=1:00:00 --mem=2G"
}
};
}

Expand All @@ -99,11 +109,11 @@ sub pipeline_analyses {
-input_ids => [{}],
-parameters => {
'input_directory' => $self->o('input_directory'),
'inputcmd' => 'find #input_directory# -type f -name "all_snps_ensembl_*.vcf" -printf "%f\n"',
'inputcmd' => 'find #input_directory# -type f -name "all_snps_ensembl_*.vcf.gz" -printf "%f\n"',
},
-flow_into => {
'2->A' => {'split_files' => {'vcf_file' => '#_0#'}},
'A->1' => ['get_chr_dir'],
'A->1' => ['get_chr_dir']
},
},
{ -logic_name => 'split_files',
Expand Down Expand Up @@ -151,7 +161,7 @@ sub pipeline_analyses {
-hive_capacity => $self->o('pipeline_wide_analysis_capacity'),
-analysis_capacity => $self->o('pipeline_wide_analysis_capacity'),
-input_ids => [],
-rc_name => '8Gb_8c_job',
-rc_name => 'gpu',
-parameters => {
'main_dir' => $self->o('main_dir'),
'split_vcf_input_dir' => $self->o('split_vcf_input_dir'),
Expand Down
45 changes: 30 additions & 15 deletions modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,15 @@ sub run {
$self->check_split_vcf_file();
}
$self->split_vcf_file();

# create tmp dir where spliceai tmp files will be stored
$self->create_dir($self->param_required('main_dir') . '/tmp');
}

sub set_chr_from_filename {
my $self = shift;
my $vcf_file = $self->param_required('vcf_file');
$vcf_file =~ /.*_chr(.*)\.vcf$/;
$vcf_file =~ /.*_chr(.*)\.vcf.gz$/;
my $chr = $1;
if (!$chr) {
die("Could not get chromosome name from file name ($vcf_file).");
Expand All @@ -82,8 +85,19 @@ sub split_vcf_file {
my $split_vcf_output_dir = $self->param_required('split_vcf_output_dir');
my $step_size = $self->param_required('step_size');

# If we are checking the transcripts then the input file is a VCF
# Remove .gz
if($check) {
$vcf_file =~ s/vcf.gz/vcf/;
}

my $vcf_file_path = $input_dir . '/' . $vcf_file;

if(!$check && $vcf_file =~ /vcf.gz/) {
$self->run_system_command("gunzip $vcf_file_path");
$vcf_file_path =~ s/vcf.gz/vcf/;
}

if (! -d $input_dir) {
die("Directory ($input_dir) doesn't exist");
}
Expand Down Expand Up @@ -173,8 +187,13 @@ sub check_split_vcf_file {
}

# prepare input vcf file for tabix
$self->run_system_command("bgzip $vcf_file_path");
$self->run_system_command("tabix -p vcf $vcf_file_path.gz");
if($vcf_file !~ /vcf.gz/) {
$self->run_system_command("bgzip $vcf_file_path");
$self->run_system_command("tabix -p vcf $vcf_file_path.gz");
}
else {
$self->run_system_command("tabix -p vcf $vcf_file_path");
}

# Create directory to store the input file only with the variants of interest (overlapping new transcripts)
my $vcf_file_path_subset = $main_dir . '/input_vcf_files_subset';
Expand All @@ -184,7 +203,10 @@ sub check_split_vcf_file {

$self->param('input_dir_subset', $vcf_file_path_subset);

open(my $write, '>', $vcf_file_path_subset . '/' . $vcf_file) or die $!;
my $vcf_file_sub = $vcf_file;
$vcf_file_sub =~ s/.gz//;

open(my $write, '>', $vcf_file_path_subset . '/' . $vcf_file_sub) or die $!;

my $positions_of_interest = $transcripts->{$chr};
foreach my $position (@$positions_of_interest) {
Expand All @@ -194,8 +216,8 @@ sub check_split_vcf_file {

my $pos_string = sprintf("%s:%i-%i", $chr, $transcript_start, $transcript_end);

(open my $read, "tabix " . $input_dir . "/" . $vcf_file . ".gz $pos_string |")
|| die "Failed to read from input vcf file " . $input_dir . "/" . $vcf_file . ".gz \n" ;
(open my $read, "tabix " . $input_dir . "/" . $vcf_file . " $pos_string |")
or die "Failed to read from input vcf file " . $input_dir . "/" . $vcf_file . " \n" ;

while (my $row = <$read>) {
chomp $row;
Expand All @@ -212,8 +234,8 @@ sub check_split_vcf_file {
close($write);

# Sort new vcf file
my $vcf_file_subset = $vcf_file_path_subset . '/' . $vcf_file;
my $vcf_file_subset_sorted = $vcf_file_path_subset . '/sorted_' . $vcf_file;
my $vcf_file_subset = $vcf_file_path_subset . '/' . $vcf_file_sub;
my $vcf_file_subset_sorted = $vcf_file_path_subset . '/sorted_' . $vcf_file_sub;
$self->run_system_command("sort -o $vcf_file_subset_sorted -k1,1 -k2,2n $vcf_file_subset");
$self->run_system_command("mv $vcf_file_subset_sorted $vcf_file_subset");
}
Expand Down Expand Up @@ -291,11 +313,4 @@ sub get_new_transcripts_file {
$self->param('transcripts', \%new_transcripts);
}

sub count_lines {
my $self = shift;
my $filename = shift;


}

1;
187 changes: 187 additions & 0 deletions scripts/python/spliceai_annotation_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
#!/usr/bin/env python3

"""
Script to generate the gene annotation file for SpliceAI.
This file contains data about the transcript that is being used for each gene.
SpliceAI only annotates variants overlapping these transcripts.

Template provided by SpliceAI: https://github.com/Illumina/SpliceAI/blob/master/spliceai/annotations/grch38.txt

Gene annotation file format:
#NAME CHROM STRAND TX_START TX_END EXON_START EXON_END
KRTAP27-1 21 - 30337013 30337694 30337013, 30337694,

Options:
--output_file gene annotation output file (Optional. Default: gene_annotation.txt)
--species species name (Optional. Default: homo_sapiens)
--assembly assembly version (Optional. Default: 38)
--host core database host (Mandatory)
--port host port (Mandatory)
--user host user (Mandatory)
--release core database version (Mandatory)
"""

import argparse
import mysql.connector
from mysql.connector import Error


def fetch_transcripts(species, assembly, release, host, port, user):
gene_annotation = {}
database = f"{species}_core_{release}_{assembly}"

# For human we select 'MANE_Select' transcripts and the gene name is a gene attrib
if species == "homo_sapiens":
sql_select = """
SELECT ga.value,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end,
e.seq_region_start,e.seq_region_end FROM transcript t
JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id
JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id
JOIN seq_region s ON t.seq_region_id = s.seq_region_id
JOIN gene g ON g.gene_id = t.gene_id
JOIN gene_attrib ga ON g.gene_id = ga.gene_id
JOIN exon_transcript et ON t.transcript_id = et.transcript_id
JOIN exon e ON e.exon_id = et.exon_id
WHERE t.stable_id like 'ENST%' and t.biotype = 'protein_coding'
and ga.attrib_type_id = 4 and atr.code = 'MANE_Select'
order by ga.value,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end
"""
else:
# For other species we select the canonical transcripts and the gene name is in xref
sql_select = """
SELECT DISTINCT g.stable_id,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end,
e.seq_region_start,e.seq_region_end FROM transcript t
JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id
JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id
JOIN seq_region s ON t.seq_region_id = s.seq_region_id
JOIN gene g ON g.gene_id = t.gene_id
JOIN exon_transcript et ON t.transcript_id = et.transcript_id
JOIN exon e ON e.exon_id = et.exon_id
JOIN xref xr ON g.display_xref_id = xr.xref_id
WHERE t.stable_id like 'ENS%' and t.biotype = 'protein_coding' and atr.code = 'is_canonical'
order by xr.display_label,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end
"""

connection = mysql.connector.connect(host=host,
database=database,
user=user,
password='',
port=port)

try:
if connection.is_connected():
cursor = connection.cursor()
cursor.execute(sql_select)
data = cursor.fetchall()
for row in data:
strand = row[2]
if strand == 1:
strand = "+"
else:
strand = "-"

if row[0] not in gene_annotation:
exons_start = []
exons_end = []
exons_start.append(str(row[5]))
exons_end.append(str(row[6]))

gene_annotation[row[0]] = {
"chr": row[1],
"strand": strand,
"start": row[3],
"end": row[4],
"exons_start": exons_start,
"exons_end": exons_end
}
else:
gene_annotation[row[0]]["exons_start"].append(str(row[5]))
gene_annotation[row[0]]["exons_end"].append(str(row[6]))

except Error as e:
print(f"Error while connecting to MySQL {database}", e)
finally:
if connection.is_connected():
cursor.close()
connection.close()

return gene_annotation

def sanity_checks(transcripts_list):
fail_list = []

for gene, data in transcripts_list.items():
check = 1

# transcript start > end
if data["start"] >= data["end"]:
check = 0

# the exon start has to be bigger than the last exon end
i = 0
for exon_start in data["exons_start"]:
if i > 0:
last_exon_end = data["exons_end"][i - 1]
if(int(exon_start) < int(last_exon_end)):
check = 0

i += 1

if check == 0:
fail_list.append(gene)

return fail_list

def write_output(transcripts_list, output_file):
# Write to output file
with open(output_file, "w") as f:
f.write("#NAME\tCHROM\tSTRAND\tTX_START\tTX_END\tEXON_START\tEXON_END\n")

for gene, data in transcripts_list.items():
chr = data["chr"]
strand = data["strand"]
start = data["start"]
end = data["end"]
exons_start = (",").join(data["exons_start"])
exons_end = (",").join(data["exons_end"])

f.write(f"{gene}\t{chr}\t{strand}\t{start}\t{end}\t{exons_start},\t{exons_end},\n")


def main():
parser = argparse.ArgumentParser(description="Generate the gene annotation file for SpliceAI")
parser.add_argument("-o", "--output_file",
default="gene_annotation.txt",
help="output file (default: gene_annotation.txt)")
parser.add_argument("-sp", "--species",
default="homo_sapiens",
help="species (default: homo_sapiens)")
parser.add_argument("-a", "--assembly",
default="38",
help="species assembly (default: 38)")
parser.add_argument("-r", "--release", required=True)
parser.add_argument("--host", required=True)
parser.add_argument("--port", required=True)
parser.add_argument("--user", required=True)
args = parser.parse_args()

output_file = args.output_file
species = args.species
assembly = args.assembly
release = args.release
host = args.host
port = args.port
user = args.user

transcripts_list = fetch_transcripts(species, assembly, release, host, port, user)

check = sanity_checks(transcripts_list)

if not check:
write_output(transcripts_list, output_file)
else:
print("Sanity checks failed for the following genes: ", (", ").join(check))


if __name__ == '__main__':
main()
Loading