-
Notifications
You must be signed in to change notification settings - Fork 2
/
ali-apos-to-uapos.pl
executable file
·194 lines (173 loc) · 6.91 KB
/
ali-apos-to-uapos.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/usr/bin/env perl
# EPN, Fri May 25 12:55:03 2018
# ali-apos-to-uapos.pl
# Given an alignment position return the unaligned position that aligns to that alignment column
#
use warnings;
use strict;
use Getopt::Long;
# Usage message; printed (via die) whenever the command line is invalid.
my $usage;
$usage = "perl ali-apos-to-uapos.pl [OPTIONS] <alignment file> <alignment RF position (or just alignment position if --notrf)>\n\n";
$usage .= "\tOPTIONS:\n";
$usage .= "\t\t--notrf: alignment position is overall position, not nongap RF (reference) position\n";
$usage .= "\t\t--after: for sequences in which requested position is a gap, output information\n";
$usage .= "\t\t about first nongap alignment position *after* requested position [default: *before*]\n";
$usage .= "\t\t--protein: alignment is protein (default DNA/RNA)\n";

my $do_notrf = 0; # '1' to not operate in RF coordinate space
my $do_protein = 0; # '1' to specify protein alignment
my $do_after = 0; # '1' if --after, changes output

# GetOptions() returns false if it saw an unknown or malformed option;
# stop with the usage message instead of silently continuing.
GetOptions( "notrf" => \$do_notrf,
            "protein" => \$do_protein,
            "after" => \$do_after) or die $usage;

# exactly two positional arguments are required: alignment file and position
if(scalar(@ARGV) != 2) { die $usage; }
my ($aln_file, $pos) = (@ARGV);

# the position is used in arithmetic and interpolated into shell commands,
# so insist on a positive integer before going any further
if(($pos !~ /^\d+$/) || ($pos < 1)) {
  die "ERROR, alignment position must be a positive integer, got: $pos\n\n$usage";
}
# run esl-alistat --list to get list of all sequences:
# NOTE: $line is kept at file scope so that subroutines defined later in this
# file that read into $line keep compiling under 'use strict'.
my $line;

# Quote a string so the shell passes it through as one literal word; the
# alignment file name comes from the command line and may contain spaces or
# shell metacharacters.
my $sh_quote = sub {
  my ($word) = @_;
  $word =~ s/'/'\\''/g;
  return "'" . $word . "'";
};
my $aln_file_sh = $sh_quote->($aln_file);

my $list_file = $aln_file . ".list";
runCommand("esl-alistat --list " . $sh_quote->($list_file) . " $aln_file_sh > /dev/null", 0);
open(my $list_fh, "<", $list_file) || die "ERROR unable to open $list_file: $!";
my @seq_A = ();                      # sequence names, in alignment order
my $seq_width = length("#seqname");  # width of the name column in the output
while($line = <$list_fh>) {
  chomp $line;
  push(@seq_A, $line);
  if(length($line) > $seq_width) {
    $seq_width = length($line);
  }
}
close($list_fh);
unlink $list_file;

my $nseq = scalar(@seq_A);
my @len_A = ();   # per sequence: number of residues aligned before position $pos
my $alphabet_option = ($do_protein) ? "--amino" : "--dna";
my $rf_option = ($do_notrf) ? "" : "--t-rf";
my $i;

# get full lengths of all sequences (actually only used if --after)
my $seqstat_file = $aln_file . ".a.seqstat.0";
my @full_len_A = ();
runCommand("esl-seqstat -a --informat stockholm $alphabet_option $aln_file_sh | grep '^=' > " . $sh_quote->($seqstat_file), 0);
parse_seqstat_a_file($seqstat_file, \@seq_A, \@full_len_A);
unlink $seqstat_file;

# run esl-alimask to truncate alignment ending at position $pos-1
if($pos == 1) { # special case: nothing precedes position 1
  for($i = 0; $i < $nseq; $i++) {
    $len_A[$i] = 0;
  }
}
else {
  $seqstat_file = $aln_file . ".a.seqstat.1";
  # format only the number; the command itself is never used as a format
  # string, so a '%' in a file name cannot corrupt it
  my $prev_pos = sprintf("%d", $pos - 1);
  runCommand("esl-alimask -t $rf_option $alphabet_option $aln_file_sh 1..$prev_pos | esl-seqstat -a --informat stockholm $alphabet_option - | grep '^=' > " . $sh_quote->($seqstat_file), 0);
  parse_seqstat_a_file($seqstat_file, \@seq_A, \@len_A);
  unlink $seqstat_file;
}

# run esl-alimask to truncate alignment to only position $pos to determine if it is a gap or not
my @nongap_A = ();   # per sequence: 1 if position $pos is a residue, 0 if it is a gap
$seqstat_file = $aln_file . ".a.seqstat.2";
runCommand("esl-alimask -t $rf_option $alphabet_option $aln_file_sh $pos..$pos | esl-seqstat -a --informat stockholm $alphabet_option - | grep '^=' > " . $sh_quote->($seqstat_file), 0);
parse_seqstat_a_file($seqstat_file, \@seq_A, \@nongap_A);
unlink $seqstat_file;
# output
# Header lines without printf arguments are emitted with print() so that
# nothing interpolated into them can be misread as a format directive.
print("# Explanation of columns:\n");
print("# seqname: name of sequence\n");
printf("# uapos: unaligned sequence position that aligns at alignment %s %d\n", ($do_notrf ? "position" : "RF position"), $pos);
print("# if 'gap?' column is 'gap' then alignment is a gap for this sequence at position $pos\n");
if($do_after) {
  print("# and so 'uapos' column is the first sequence position aligned after position $pos\n");
  print("# or '-' if no residues exist after position $pos\n");
}
else {
  print("# and so 'uapos' column is the final sequence position aligned before position $pos\n");
  print("# or '-' if no residues exist before position $pos\n");
}
print("# gap?: 'nongap' if position $pos is not a gap in sequence, 'gap' if it is\n");
printf("%-*s %6s %-6s\n", $seq_width, "#seqname", "uapos", "gap?");

for my $seq_idx (0 .. ($nseq - 1)) {
  # sanity check: with --after, a sequence whose residues all lie before
  # position $pos cannot also have a residue at position $pos
  if(($do_after) && ($len_A[$seq_idx] == $full_len_A[$seq_idx]) && ($nongap_A[$seq_idx])) {
    die "ERROR found nongap after full sequence for sequence $seq_A[$seq_idx]\n";
  }
  my $posn2print = uapos_to_print($len_A[$seq_idx], $full_len_A[$seq_idx], $nongap_A[$seq_idx], $do_after);
  printf("%-*s %6s %-6s\n", $seq_width, $seq_A[$seq_idx], $posn2print, ($nongap_A[$seq_idx] ? "nongap" : "gap"));
}

#################################################################
# Subroutine: uapos_to_print()
#
# Purpose:    Decide what goes in the 'uapos' output column for one
#             sequence.
#
# Arguments:
#   $len_before: number of residues aligned before the requested position
#   $full_len:   total number of residues in the sequence
#   $is_nongap:  true if the sequence has a residue at the requested position
#   $want_after: true if --after was used
#
# Returns:    $len_before + 1 if the requested position is a residue;
#             otherwise (gap) with --after: $len_before + 1, the first
#             residue after the position, or '-' if there is none;
#             otherwise (gap) without --after: $len_before, the final
#             residue before the position, or '-' if there is none.
#################################################################
sub uapos_to_print {
  my ($len_before, $full_len, $is_nongap, $want_after) = @_;

  # a residue at the requested position is always residue number len_before + 1
  if($is_nongap) { return $len_before + 1; }

  if($want_after) {
    # gap: report the first residue after the position, which is also
    # residue number len_before + 1 (not len_before, which precedes it)
    return ($len_before == $full_len) ? "-" : ($len_before + 1);
  }

  # gap, default mode: report the final residue before the position
  return ($len_before == 0) ? "-" : $len_before;
}
#################################################################
# Subroutine: runCommand()
# Incept: EPN, Thu Feb 11 13:32:34 2016
#
# Purpose: Hands a command string to system() and dies if the
#          command finishes with a nonzero status. Optionally
#          echoes the command to stdout first.
#
# Arguments:
# $cmd: the command string passed to system()
# $be_verbose: true to print the command to stdout before running it
#
# Returns: void
#
# Dies: if $cmd leaves $? nonzero; exits with status 1 (after a
#       message on stderr) if called with the wrong number of
#       arguments
#################################################################
sub runCommand {
  my $sub_name       = "runCommand()";
  my $nargs_expected = 2;
  my $nargs_actual   = scalar(@_);
  unless($nargs_actual == $nargs_expected) {
    printf STDERR ("ERROR, $sub_name entered with %d != %d input arguments.\n", $nargs_actual, $nargs_expected);
    exit(1);
  }
  my ($cmd, $be_verbose) = @_;

  print("Running cmd: $cmd\n") if($be_verbose);

  system($cmd);
  # $? holds the wait status of the command that system() just ran
  return if($? == 0);

  die "ERROR, the following command failed:\n$cmd\n";
}
#################################################################
# Subroutine: parse_seqstat_a_file()
# Incept: EPN, Fri May 25 14:04:48 2018
#
# Purpose: Parses a file of '=' lines from esl-seqstat -a output and
#          stores the length found on each line, in file order, in
#          @{$len_AR}. Lines have the form
#            = <seqname> <length> [...]
#          or, for a length 0 sequence printed without a name,
#            = 0
#          For every sequence of nonzero length the name is checked
#          against the name at the same index in @{$name2check_AR}.
#
# Arguments:
# $seqstat_file: path of the file holding the esl-seqstat -a '=' lines
# $name2check_AR: ref to array to check names against
# $len_AR: ref to array of sequence lengths to fill
#
# Returns: void
#
# Dies: if $seqstat_file cannot be opened, if a line cannot be
#       parsed, if a sequence name does not match the expected
#       name, or if the number of lines differs from the number
#       of expected names
#################################################################
sub parse_seqstat_a_file {
  my $sub_name = "parse_seqstat_a_file()";
  my $nargs_expected = 3;
  if(scalar(@_) != $nargs_expected) { printf STDERR ("ERROR, $sub_name entered with %d != %d input arguments.\n", scalar(@_), $nargs_expected); exit(1); }
  my ($seqstat_file, $name2check_AR, $len_AR) = @_;

  open(my $in_fh, "<", $seqstat_file) || die "ERROR unable to open $seqstat_file: $!";

  my $nexpected = scalar(@{$name2check_AR});
  my $i = 0; # index of the sequence the current line belongs to
  #= SSU_rRNA_bacteria-sample8 1
  while(my $line = <$in_fh>) {
    chomp $line;
    my ($seqname, $length);
    if($line =~ /^=\s+(\S+)\s+(\d+)(?:\s.*)?$/) {
      ($seqname, $length) = ($1, $2);
    }
    elsif($line =~ /^=\s+0\s*$/) {
      # length 0, no sequence name on the line
      $length = "0";
    }
    else {
      die "ERROR unable to parse esl-seqstat -a line $line";
    }

    if($i >= $nexpected) {
      die sprintf("ERROR read more sequence lines from %s than the %d expected\n", $seqstat_file, $nexpected);
    }
    if($length ne "0") {
      # esl-seqstat -a does not print sequence names for sequences of length 0
      if($name2check_AR->[$i] ne $seqname) { die sprintf("ERROR dealing with sequence #%d, name mismatch %s != %s\n", ($i+1), $name2check_AR->[$i], $seqname); }
    }
    $len_AR->[$i] = $length;
    $i++;
  }
  close($in_fh);

  # a short file would otherwise leave trailing lengths undefined for the caller
  if($i != $nexpected) {
    die sprintf("ERROR read %d sequence lines from %s but expected %d\n", $i, $seqstat_file, $nexpected);
  }
  return;
}