Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove redundant symbolic alt to SO term hash #1075

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ before_install:
- git clone --branch $ENSEMBL_BRANCH --depth 1 https://github.com/Ensembl/ensembl.git
- git clone --branch main --depth 1 https://github.com/Ensembl/ensembl-hive.git
- git clone --branch $ENSEMBL_BRANCH --depth 1 https://github.com/Ensembl/ensembl-io.git
- git clone --branch $ENSEMBL_BRANCH --depth 1 https://github.com/Ensembl/ensembl-vep.git
- git clone --branch $ENSEMBL_BRANCH --depth 1 https://github.com/Ensembl/ensembl-funcgen.git
- git clone --branch release-1-6-924 --depth 1 https://github.com/bioperl/bioperl-live.git
- git clone --branch 1.9 --depth 1 https://github.com/samtools/htslib.git
Expand Down
25 changes: 25 additions & 0 deletions modules/Bio/EnsEMBL/Variation/Utils/Config.pm
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ our @EXPORT_OK = qw(
$OVERLAP_CONSEQUENCE_CLASS
$MAX_ATTRIB_CODE_LENGTH
%SO_ACC_MAPPER
%SO_TERMS
);

our $OVERLAP_CONSEQUENCE_CLASS = 'Bio::EnsEMBL::Variation::OverlapConsequence';
Expand Down Expand Up @@ -1382,5 +1383,29 @@ our %SO_ACC_MAPPER = (
}
);

# Maps the abbreviation of a symbolic alternative allele to its SO term.
# Keys use "_" as separator between the allele type and its subtype
# (e.g. INS_ME, DEL_ALU).
# Values must stay unique: the table is also inverted with
# `reverse %SO_TERMS` to translate an SO term back to its abbreviation, and
# duplicate values would silently collapse into a single key there.
our %SO_TERMS = (
  # insertions
  INS       => 'insertion',
  INS_ME    => 'mobile_element_insertion',
  INS_ALU   => 'Alu_insertion',
  INS_HERV  => 'HERV_insertion',
  INS_LINE1 => 'LINE1_insertion',
  INS_SVA   => 'SVA_insertion',

  # deletions
  DEL       => 'deletion',
  DEL_ME    => 'mobile_element_deletion',
  DEL_ALU   => 'Alu_deletion',
  DEL_HERV  => 'HERV_deletion',
  DEL_LINE1 => 'LINE1_deletion',
  DEL_SVA   => 'SVA_deletion',

  # remaining symbolic allele types
  TREP      => 'tandem_repeat',
  TDUP      => 'tandem_duplication',
  DUP       => 'duplication',
  CNV       => 'copy_number_variation',
  INV       => 'inversion',
  BND       => 'chromosome_breakpoint',
);


1;
98 changes: 13 additions & 85 deletions modules/Bio/EnsEMBL/Variation/Utils/VEP.pm
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,13 @@ use Bio::EnsEMBL::Variation::DBSQL::VariationFeatureAdaptor;
use Bio::EnsEMBL::Variation::Utils::VariationEffect qw(overlap);
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
use Bio::EnsEMBL::Variation::Utils::Sequence qw(unambiguity_code SO_variation_class);
use Bio::EnsEMBL::Variation::Utils::Config qw(%SO_TERMS);
use Bio::EnsEMBL::Variation::Utils::EnsEMBL2GFF3;
use Bio::EnsEMBL::Variation::StructuralVariationFeature;
use Bio::EnsEMBL::Variation::DBSQL::StructuralVariationFeatureAdaptor;
use Bio::EnsEMBL::Variation::TranscriptStructuralVariation;
use Bio::EnsEMBL::Variation::Source;
use Bio::EnsEMBL::VEP::Parser qw(get_SO_term &check_format);

# we need to manually include all these modules for caching to work
use Bio::EnsEMBL::CoordSystem;
Expand All @@ -113,8 +115,6 @@ use vars qw(@ISA @EXPORT_OK);
@ISA = qw(Exporter);

@EXPORT_OK = qw(
&_valid_region_regex
&check_format
&detect_format
&parse_line
&vf_to_consequences
Expand Down Expand Up @@ -384,68 +384,6 @@ sub parse_line {
return $vfs;
}

# Returns the compiled, case-insensitive pattern that recognises
# region-style input such as "chr21:10-10:1/A".
# Capture groups: 1 = text before the first colon, 2 and 3 = the two
# integers around the dash, 4 = optional ":1" / ":+1" / ":-1" suffix,
# 5 = trailing part after "/" or ":" (three or more letters, digits or
# colons, or a run of A/C/G/T/N/- characters).
sub _valid_region_regex {
  my $region_re = qr/^([^:]+):(\d+)-(\d+)(:[-\+]?1)?[\/:]([a-z0-9:]{3,}|[ACGTN-]+)$/i;
  return $region_re;
}

# Sub-routine to check the format of an input line.
#
# Arguments: the fields of the line as a list; a single-element list is
#            treated as one unsplit token.
# Returns:   'region', 'spdi', 'caid', 'hgvs', 'id', 'vcf' or 'ensembl',
#            or undef when no test matches (including an empty field list).
#
# The tests are order-dependent: a single token is tried as region, SPDI,
# CAID and HGVS in that order and falls back to 'id', so the VCF and
# ensembl tests are only reached for input with more than one field.
sub check_format {
  my @line = @_;
  my $format;

  # any changes here must be copied to the JavaScript file to run instant VEP:
  # public-plugins/tools/htdocs/components/20_VEPForm.js

  # nothing to classify: return before $line[0] is read so that the tests
  # below do not match against an undefined value
  return $format unless @line;

  if ( scalar @line == 1 ) {
    my $input = $line[0];

    # region: chr21:10-10:1/A
    if ( $input =~ _valid_region_regex() ) {
      $format = 'region';
    }

    # SPDI: NC_000016.10:68684738:G:A
    elsif ( $input =~ /^(.*?\:){2}([^\:]+|)$/i ) {
      $format = 'spdi';
    }

    # CAID: CA9985736
    elsif ( $input =~ /^CA\d{1,}$/i ) {
      $format = 'caid';
    }

    # HGVS: ENST00000285667.3:c.1047_1048insC
    elsif ( $input =~ /^([^\:]+)\:.*?([cgmrp]?)\.?([\*\-0-9]+.*)$/i ) {
      $format = 'hgvs';
    }

    # variant identifier: rs123456
    else {
      $format = 'id';
    }
  }

  # VCF: 20 14370 rs6054257 G A 29 0 NS=58;DP=258;AF=0.786;DB;H2 GT:GQ:DP:HQ
  elsif (
    $line[0] =~ /(chr)?\w+/ &&
    $line[1] =~ /^\d+$/ &&
    exists $line[3] && $line[3] =~ /^[ACGTN\-\.]+$/i &&
    exists $line[4]
  ) {
    $format = 'vcf';
  }

  # ensembl: 20 14370 14370 A/G +
  elsif (
    $line[0] =~ /\w+/ &&
    $line[1] =~ /^\d+$/ &&
    exists $line[2] && $line[2] =~ /^\d+$/ &&
    exists $line[3] && $line[3] =~ /([a-z]{2,})|([ACGTN-]+\/[ACGTN-]+)/i
  ) {
    $format = 'ensembl';
  }

  return $format;
}

# sub-routine to detect format of input
sub detect_format {
my $line = shift;
Expand Down Expand Up @@ -478,12 +416,7 @@ sub parse_ensembl {
my $so_term;

# convert to SO term
my %terms = (
INS => 'insertion',
DEL => 'deletion',
TDUP => 'tandem_duplication',
DUP => 'duplication'
);
my %terms = %SO_TERMS;

$so_term = defined $terms{$allele_string} ? $terms{$allele_string} : $allele_string;

Expand Down Expand Up @@ -631,14 +564,7 @@ sub parse_vcf {

if(defined($type)) {
# convert to SO term
my %terms = (
INS => 'insertion',
DEL => 'deletion',
TDUP => 'tandem_duplication',
DUP => 'duplication'
);

$so_term = defined $terms{$type} ? $terms{$type} : $type;
$so_term = get_SO_term(undef, $type) || $type;
}

my $svf = Bio::EnsEMBL::Variation::StructuralVariationFeature->new_fast({
Expand Down Expand Up @@ -933,14 +859,16 @@ sub convert_to_vcf {
else {

# convert to SO term
my %terms = (
'insertion' => 'INS',
'deletion' => 'DEL',
'tandem_duplication' => 'TDUP',
'duplication' => 'DUP'
);
my %terms = reverse %SO_TERMS;
my $abbrev = $terms{$vf->class_SO_term} || $vf->class_SO_term;

$abbrev = "DUP:TANDEM" if $abbrev eq "TDUP";
$abbrev = "CNV:TR" if $abbrev eq "TREP";
$abbrev =~ s/_/:/ if $abbrev =~ /^(INS|DEL)_ME$/;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is correct as of now, but is not future-proof in case other INS/DEL subtypes are supported in the future. Maybe we could change the INS/DEL to contain the ME, such as INS_ME_ALU instead of INS_ALU to be more precise.

This is a bit nitpick-y so maybe just ignore this comment. 😛

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Noted! But I think that requires a bigger change and is not within the scope/objective of this PR.

$abbrev =~ s/_/:ME:/ if $abbrev =~ /^(INS|DEL)_[A-Z0-9]+$/;

my $alt = '<'.($terms{$vf->class_SO_term} || $vf->class_SO_term).'>';
my $alt = '<'.$abbrev.'>';
$alt = ( split(/\//, $vf->allele_string, 2) )[1] if ($alt eq "BND");

return [
$vf->{chr} || $vf->seq_region_name,
Expand Down
2 changes: 1 addition & 1 deletion travisci/harness.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

export PERL5LIB=$PWD/bioperl-live:$PWD/ensembl-test/modules:$PWD/ensembl/modules:$PWD/ensembl-hive/modules:$PWD/modules:$PWD/scripts/import/:$PWD/ensembl-io/modules:$PWD/ensembl-funcgen/modules
export PERL5LIB=$PWD/bioperl-live:$PWD/ensembl-test/modules:$PWD/ensembl/modules:$PWD/ensembl-hive/modules:$PWD/modules:$PWD/scripts/import/:$PWD/ensembl-io/modules:$PWD/ensembl-vep/modules:$PWD/ensembl-funcgen/modules

export PATH=$PATH:$PWD/C_code:$PWD/htslib

Expand Down