-
Notifications
You must be signed in to change notification settings - Fork 2
/
ali-pfam-sindi2dot-bracket.pl
executable file
·183 lines (171 loc) · 7.31 KB
/
ali-pfam-sindi2dot-bracket.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env perl
# EPN, Tue Jul 3 06:59:56 2018
# ali-pfam-sindi2dot-bracket.pl
#
# Given a pfam formatted alignment with per-sequence secondary structure annotation (#=GR <seqname> SS)
# convert it to unaligned fasta dot-bracket notation.
#
use warnings;
use strict;
use Getopt::Long;
# Usage message; printed via die() whenever the command line is malformed.
my $usage;
$usage  = "ali-pfam-sindi2dot-bracket.pl\n\n";
$usage .= "Usage:\n\n";
$usage .= "ali-pfam-sindi2dot-bracket.pl [OPTIONS] <alignment file in pfam format>\n";
$usage .= "\tOPTIONS:\n";
$usage .= "\t\t-a: keep sequences in aligned format (do not remove gaps) [default: do remove gaps]\n";
$usage .= "\t\t-l: do not convert sequences to all uppercase [default: do]\n";
$usage .= "\t\t-w: leave SS in current format (possibly WUSS)\n";
$usage .= "\t\t-c: include consensus structure as additional 'sequence' [default: do not]\n";
$usage .= "\t\t-d: do not include pknots in dot-bracket notation (convert to '.')\n";
$usage .= "\t\t-n: name individual secondary structures [default: do not]\n";

# One flag per command-line switch; each stays 0 unless the switch is given.
my $opt_a = 0; # set to '1' if -a used
my $opt_l = 0; # set to '1' if -l used
my $opt_w = 0; # set to '1' if -w used
my $opt_c = 0; # set to '1' if -c used
my $opt_d = 0; # set to '1' if -d used
my $opt_n = 0; # set to '1' if -n used

# GetOptions() returns false (after warning on STDERR) when it meets an option
# it does not know; treat that as a usage error instead of silently continuing.
GetOptions( "a" => \$opt_a,
            "l" => \$opt_l,
            "w" => \$opt_w,
            "c" => \$opt_c,
            "d" => \$opt_d,
            "n" => \$opt_n) || die $usage;

# exactly one positional argument is required: the alignment file
if(scalar(@ARGV) != 1) { die $usage; }
my ($aln_file) = (@ARGV);

# translate the raw option flags into the behaviour switches used by the rest of the script
my $do_gapless    = ($opt_a) ? 0 : 1; # remove gap columns from sequence and SS unless -a
my $do_upper      = ($opt_l) ? 0 : 1; # uppercase the sequence unless -l
my $do_dotbracket = ($opt_w) ? 0 : 1; # rewrite SS characters to dot-bracket unless -w
my $do_sscons     = ($opt_c) ? 1 : 0; # also output SS_cons if -c
my $do_nopknot    = ($opt_d) ? 1 : 0; # replace pseudoknot letters with '.' if -d
my $do_name       = ($opt_n) ? 1 : 0; # print a '>' name line before each SS string if -n

# -d only has meaning when the SS is being rewritten to dot-bracket
if($do_nopknot && (! $do_dotbracket)) {
  die "ERROR -d does not make sense in combination with -w";
}
# ---- file-scope state shared by the read loop and the final sanity checks ----

# Bookkeeping hashes, both keyed on sequence name; a value of 1 means "seen".
# They are compared against each other after the file has been read.
my (
  %seen_H,     # sequence names for which a sequence line was read
  %seen_ss_H,  # sequence names for which a '#=GR <seqname> SS' line was read
);

# Per-position arrays for the line currently being processed (index 0..alen-1).
my (
  @notgap_A,   # '0' if the position is a gap in the current sequence, '1' if it is not
  @seq_A,      # current sequence, one element per position
  @ss_A,       # current SS line, one element per position
);

# Scalars describing the current sequence / SS line; all start out undefined.
my (
  $seqname,      # current sequence name
  $seqname_ss,   # sequence name read from the current SS line
  $seq,          # current sequence, with gaps
  $gapless_seq,  # current sequence as it will be printed (gaps removed unless -a)
  $ss,           # current SS string, with gaps
  $gapless_ss,   # current SS string as it will be printed (gaps removed unless -a)
  $line,         # a line of the file
  $ss_name,      # name line printed before an SS string, "" unless -n used
);

# Counters and flags, all starting at 0.
my ($i,          # counter over alignment positions
    $left_ct,    # number of left basepair halves seen in the current SS string
    $right_ct,   # number of right basepair halves seen in the current SS string
    $is_sscons,  # 1 if the current SS string is SS_cons, 0 if it is an individual SS
   ) = (0, 0, 0, 0);
# Read the alignment one line at a time and print each sequence followed by its
# secondary structure as soon as the structure line is seen.
#
# The file is read through an explicitly opened lexical handle (3-arg open, so the
# name is never interpreted as a mode or a command). A file name of "-" still
# means standard input.
#
# Line types handled:
#   <seqname> <aligned seq>        : sequence line, records gap positions in @notgap_A
#   #=GR <seqname> SS <ss string>  : individual SS, must directly follow its sequence
#   #=GC SS_cons <ss string>       : consensus SS, only printed if -c was used
# Every other line (other '#' lines, blank lines, '//') is ignored.
my $in_fh;
if($aln_file eq "-") {
  $in_fh = \*STDIN;
}
else {
  open($in_fh, "<", $aln_file) || die "ERROR unable to open $aln_file: $!";
}

while(my $line = <$in_fh>) {
  if($line !~ /^\#/) {
    if($line =~ /(\S+)\s+(\S+)/) {
      ($seqname, $seq) = ($1, $2);
      # in Pfam (non-interleaved) format each sequence name appears exactly once
      if((exists $seen_H{$seqname}) && ($seen_H{$seqname} == 1)) {
        die "ERROR saw sequence $seqname twice, did you convert to pfam format (e.g. esl-reformat pfam ...)";
      }
      $seen_H{$seqname} = 1;

      # determine where the gaps in the sequence are; gap characters are only
      # copied to the output sequence when gaps are being kept (-a)
      $gapless_seq = "";
      @seq_A = split("", $seq);
      @notgap_A = ();
      for($i = 0; $i < scalar(@seq_A); $i++) {
        if($seq_A[$i] =~ m/[\.\-\_\~]/) {
          $notgap_A[$i] = 0;
          if(! $do_gapless) { $gapless_seq .= $seq_A[$i]; }
        }
        else {
          $gapless_seq .= $seq_A[$i];
          $notgap_A[$i] = 1;
        }
      }

      # convert lowercase to uppercase
      if($do_upper) { $gapless_seq =~ tr/a-z/A-Z/; }
    }
  }
  elsif($line =~ m/^\#=/) { # check all #= lines to see if they're either indi SS or SS_cons
    ($ss, $seqname_ss) = (undef, undef);
    if($line =~ /^\#=GR\s+(\S+)\s+SS\s+(\S+)/) { # indi SS line
      ($seqname_ss, $ss) = ($1, $2);
      $seen_ss_H{$seqname_ss} = 1;
      # the SS line must belong to the sequence read immediately before it;
      # also covers an SS line that shows up before any sequence line at all
      if((! defined $seqname) || ($seqname_ss ne $seqname)) {
        die "ERROR did not read SS line for " . ((defined $seqname) ? $seqname : "(no sequence read yet)") . " in expected order, read SS for $seqname_ss instead";
      }
      $is_sscons = 0;
    }
    elsif($line =~ /^\#=GC\s+SS_cons\s+(\S+)/) { # SS_cons line
      $ss = ($1);
      $seqname_ss = "SS_cons";
      $is_sscons = 1;
    }

    if(defined $ss) { # only true if either an indi SS line or SS_cons line
      # remove gaps, creating SS as we go, we need @notgap_A to do this because
      # the SS string itself does not indicate where the sequence gaps are
      $left_ct = 0;
      $right_ct = 0;
      $gapless_ss = "";
      @ss_A = split("", $ss);
      for($i = 0; $i < scalar(@ss_A); $i++) {
        # keep the position if gaps are kept, or if it is a non-gap in the most
        # recently read sequence; the defined() test avoids an uninitialized-value
        # warning when the SS string is longer than that sequence
        if((! $do_gapless) || ((defined $notgap_A[$i]) && ($notgap_A[$i] == 1))) {
          if($ss_A[$i] =~ m/[\{\[\<\(]/) { # left half of a basepair
            $left_ct++;
            $gapless_ss .= ($do_dotbracket) ? "(" : $ss_A[$i];
          }
          elsif($ss_A[$i] =~ m/[\}\]\>\)]/) { # right half of a basepair
            $right_ct++;
            $gapless_ss .= ($do_dotbracket) ? ")" : $ss_A[$i];
          }
          elsif($ss_A[$i] =~ m/[A-Z]/) { # pknotted bp (left half)
            $left_ct++;
            $gapless_ss .= ($do_nopknot) ? "." : $ss_A[$i];
          }
          elsif($ss_A[$i] =~ m/[a-z]/) { # pknotted bp (right half)
            $right_ct++;
            $gapless_ss .= ($do_nopknot) ? "." : $ss_A[$i];
          }
          else { # anything else is single stranded
            $gapless_ss .= ($do_dotbracket) ? "." : $ss_A[$i];
          }
        }
      }

      # output
      if(! $is_sscons) {
        # sanity checks:
        if(length($gapless_seq) != length($gapless_ss)) {
          die "ERROR problem removing gaps from SS for $seqname, unexpected length " . length($gapless_seq) . " != " . length($gapless_ss) . "\n";
        }
        if($left_ct != $right_ct) {
          die "ERROR problem with SS for $seqname, num left parentheses ($left_ct) not equal to num right parentheses ($right_ct). Did you convert to pfam (e.g. esl-reformat pfam ...)\n";
        }
        $ss_name = ($do_name) ? ">$seqname-SS\n" : "";
        print(">$seqname\n$gapless_seq\n$ss_name$gapless_ss\n");
      }
      elsif($do_sscons) { # SS_cons, only output if -c used
        # sanity checks (gap positions are those of the most recently read sequence):
        if(length($gapless_seq) != length($gapless_ss)) {
          die "ERROR problem removing gaps from SS_cons, unexpected length " . length($gapless_seq) . " != " . length($gapless_ss) . "\n";
        }
        if($left_ct != $right_ct) {
          die "ERROR problem with SS_cons, num left parentheses ($left_ct) not equal to num right parentheses ($right_ct), maybe you want to also use -a, or you did not convert to pfam format (e.g. esl-reformat pfam ...)?\n";
        }
        $ss_name = ($do_name) ? ">SS_cons\n" : "";
        print("$ss_name$gapless_ss\n");
      }
    } # end of 'if(defined $ss)'
  } # end of 'elsif($line =~ m/^\#=/) {'
}
if($aln_file ne "-") { close($in_fh); }
# Final cross-check once the whole file has been read: the set of names with a
# sequence line and the set of names with an SS line must be identical.

# every sequence needs an SS annotation ...
for my $seqname (sort keys %seen_H) {
  my $has_ss = (exists $seen_ss_H{$seqname}) && ($seen_ss_H{$seqname} == 1);
  die "ERROR did not read SS annotation for $seqname\n" unless $has_ss;
}

# ... and every SS annotation needs a sequence
for my $seqname (sort keys %seen_ss_H) {
  my $has_seq = (exists $seen_H{$seqname}) && ($seen_H{$seqname} == 1);
  die "ERROR did not read sequence, but did read SS annotation for $seqname\n" unless $has_seq;
}