Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding check for population sets for variation sets script #1020

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
94 changes: 83 additions & 11 deletions scripts/import/dbSNP_v2/update_new_sets.pl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
use DBI;
use Socket;
use Bio::EnsEMBL::Registry;
use Data::Dumper;
use Getopt::Long;
use POSIX qw(strftime);
use Cwd qw(cwd);
Expand Down Expand Up @@ -89,6 +90,10 @@
dump_new_variation_sets($dbh, $tmp_num, $chunk, $max_id);
}


# Read the merged file, add the parent sets of each variation set and write the
# concatenated per-variation set list to $tmp_vs_file.
debug($config, "Recalculating the variation sets");
recalculate($tmp_merged, $tmp_vs_file);

# Apply the recalculated set lists from $tmp_vs_file to the variation_feature table.
debug($config, "Updating the variation feature table");
update_variation_feature_table($dbh, $tmp_vs_file);

Expand All @@ -105,6 +110,7 @@
ALTER TABLE variation_set_variation ENABLE keys;
}) or die "Failed to alter variation_set_variation keys";



sub temp_table {
my $dbhvar = shift;
Expand All @@ -124,9 +130,9 @@ sub create_merged_file {
my $new_vf_file = shift;
my $tmp_merged = shift;

open FILE1, "<", "$file" or die "Cannot open $file: $!";
open FILE2, "<", "$second_file" or die "Cannot open $second_file: $!";
open OUTPUT, ">", "$third_file" or die "Cannot open $third_file: $!";
open FILE1, "<", "$tmp_vset" or die "Cannot open $tmp_vset: $!";
open FILE2, "<", "$new_vf_file" or die "Cannot open $new_vf_file: $!";
open OUTPUT, ">", "$tmp_merged" or die "Cannot open $tmp_merged: $!";
my %data;

while (<FILE1>) {
Expand Down Expand Up @@ -179,25 +185,36 @@ sub load_all_variation_sets {
}

sub update_variation_feature_table {
# After the variation_feature_backup table has been populated by inserting from the original table, this function updates the variation_set_id column using the file from
# dump_new_variation_sets
# After the variation_feature_backup table has been populated by inserting from the original table, this function updates the variation_set_id column using the file written by recalculate,
# i.e. it uses the set ids and their parent set ids to update the variation feature table
my $dbhvar = shift;
my $load_file = shift;


my $update_temp_vf = $dbhvar->prepare(q{ UPDATE variation_feature SET variation_set_id = ?
WHERE variation_id = ? AND variation_set_id = ''});
WHERE variation_id = ? });

#my %var_data;
open FH, "<", "$load_file" or die "Can not open $load_file: $!";
while (<FH>) {
chomp;
my $var_id = (split)[0];
my $var_set_id = (split)[1];
use Data::Dumper;
print Dumper($var_set_id, $var_id);
my @fields = split("\t");
my $var_id = $fields[0];
my $var_set_id = $fields[1];


my @sets_array;
# to make sure only unique numbers are in the array
foreach my $x (split(',', $var_set_id)){
push @sets_array, $x if !grep{$_ eq $x}@sets_array;
olaaustine marked this conversation as resolved.
Show resolved Hide resolved
}

my @sorted_array = sort { $a<=>$b } @sets_array;
my $values = join(',', @sorted_array);
$values =~ s/\s*,\s*/,/g; # to eliminate spaces and stuff
$values =~ s/^\s+//; #to eliminate spaces and stuff

$update_temp_vf->execute($var_set_id, $var_id); # creating a hash which has the var_id has the key and the set var_set_id has the values
$update_temp_vf->execute($values, $var_id); # creating a hash which has the var_id has the key and the set var_set_id has the values
}

close FH;
Expand Down Expand Up @@ -295,6 +312,61 @@ sub dump_new_variation_feature {

}

sub get_structure {
    # Load the variation set hierarchy from the variation_set_structure table.
    #
    # Arguments:
    #   $dbhvar - database handle used to prepare and run the query
    # Returns:
    #   hash reference mapping variation_set_sub => variation_set_super.
    #   If the same sub set occurs in several rows, the last row fetched wins.
    # Dies with the driver's error text when the query cannot be prepared or run.
    my $dbhvar = shift;

    my $get_struc_sth = $dbhvar->prepare(qq[ select variation_set_sub, variation_set_super from variation_set_structure])
        or die "Failed to prepare variation_set_structure query: " . ($dbhvar->errstr || 'unknown error');

    $get_struc_sth->execute()
        or die "Failed to read variation_set_structure: " . ($get_struc_sth->errstr || 'unknown error');

    # Guard against an undef result so the dereference below cannot die.
    my $rows = $get_struc_sth->fetchall_arrayref() || [];

    my %parent;
    foreach my $row (@{$rows}) {
        my ($sub_set, $super_set) = @{$row};
        $parent{$sub_set} = $super_set;
    }

    return \%parent;
}

sub recalculate {
    # Rebuild the list of variation set ids for every variation, adding the
    # ancestor sets of each set.
    #
    # Reads $input_file (tab-separated: variation id, variation set id) and, for
    # every variation, collects each set id followed by its parent and
    # grandparent as returned by get_structure(). One line per variation is then
    # written to $output_file as "<variation id>\t<set id>, <set id>, ...", with
    # duplicate set ids removed.
    #
    # Arguments:
    #   $input_file  - path of the merged file to read
    #   $output_file - path of the file to write
    # Dies if either file cannot be opened or the output cannot be closed cleanly.
    my $input_file  = shift;
    my $output_file = shift;

    # $dbh is the file-level database handle.
    my $parent = get_structure($dbh);

    # Parent and grandparent only, matching the depth looked up before.
    my $ancestor_levels = 2;

    my %concat_sets;

    open(my $in_fh, '<', $input_file) or die "Can not open $input_file: $!";
    while (my $line = <$in_fh>) {
        chomp $line;
        my ($var_id, $var_set_ids) = split(/\t/, $line);

        # Skip blank or incomplete lines instead of storing undef values.
        next unless defined $var_id      && length $var_id;
        next unless defined $var_set_ids && length $var_set_ids;

        $var_set_ids =~ s/^\s+|\s+$//g;

        # The field normally holds one set id; a comma-separated list is
        # accepted as well so that every id gets its own parent lookup.
        foreach my $set_id (split(/\s*,\s*/, $var_set_ids)) {
            next unless length $set_id;

            # Always store an array reference, including for the first line of
            # a variation, so no set id (or its parents) is ever dropped.
            push @{ $concat_sets{$var_id} }, $set_id;

            my $ancestor = $set_id;
            for (1 .. $ancestor_levels) {
                $ancestor = $parent->{$ancestor};
                last unless defined $ancestor;
                push @{ $concat_sets{$var_id} }, $ancestor;
            }
        }
    }
    close $in_fh;

    open(my $out_fh, '>', $output_file) or die "Could not open file '$output_file': $!";
    foreach my $var_id (keys %concat_sets) {
        # Keep first occurrences only; shared parents would otherwise repeat.
        my %seen;
        my $values_str = join(", ", grep { !$seen{$_}++ } @{ $concat_sets{$var_id} });
        print $out_fh "$var_id\t$values_str\n";
    }

    # Buffered write errors only surface at close time.
    close $out_fh or die "Could not close file '$output_file': $!";

    return;
}

sub usage {

die "\n\tUsage: update_new_sets.pl -registry [registry file] -release [release number] \tOptional: -tmp [temp folder] or gets set based on current directory
Expand Down