-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_reference_products.pl
executable file
·65 lines (55 loc) · 1.5 KB
/
get_reference_products.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/perl
## Pombert Lab, IIT, 2020
## Script identity; these three values are interpolated into the usage message
my ($name, $version, $updated) = (
    'get_reference_products.pl',
    '0.1',
    '2021-04-08',
);
use strict; use warnings; use PerlIO::gzip; use Getopt::Long qw(GetOptions);
## Help text, built once from the script metadata
my $usage = <<"END_OF_USAGE";
NAME ${name}
VERSION ${version}
UPDATED ${updated}
SYNOPSIS Creates a single tab-delimited list of products NCBI protein (.faa) FASTA files
USAGE ${name} \\
-f *.gz \\
-l reference.list
OPTIONS:
-f (--fasta) NCBI protein FASTA files
-l (--list) Desired output list name
END_OF_USAGE

## Called without any argument: show the help text and stop
if (!@ARGV){
    die "\n$usage\n";
}
## Command-line options
my $list;     ## -l / --list : name of the tab-delimited output file
my @fasta;    ## -f / --fasta: one or more FASTA files to read

## GetOptions returns false when the command line is malformed (unknown
## switch, missing value, ...); stop there instead of running half-configured
GetOptions(
    'l|list=s' => \$list,
    'f|fasta=s@{1,}' => \@fasta
) or die "\n$usage\n";

## Both options are mandatory: without -l there is nowhere to write to, and
## without -f there is nothing to read
die "\n[E] No output list name provided (-l)\n\n$usage\n"
    unless defined $list && length $list;
die "\n[E] No FASTA file provided (-f)\n\n$usage\n"
    unless @fasta;
## Write one "locus<TAB>product" line to the output list for every FASTA
## header found in the input files.

## Refuse an undefined/empty output name up front rather than letting open()
## fail with a confusing message
die "Can't create output list: no name provided (-l)\n"
    unless defined $list && length $list;

## Lexical output handle, shared by all input files and closed explicitly
## after the loop so that buffered write errors are reported
open my $out, ">", $list or die "Can't create $list: $!\n";

## Iterating (instead of shifting in a while condition) so that a file
## literally named "0" does not end the loop prematurely
for my $file (@fasta){

    my $stime = time;
    my $fh;

    ## Name used in the progress messages: the file name minus its extension(s).
    ## Kept separate from $file so the real path stays available for errors.
    my $label = $file;

    ## Autodetecting if file is gzipped from the file extension. The dots are
    ## escaped so that only a literal ".gz" suffix selects the gzip layer.
    if ($file =~ /\.gz$/){
        open $fh, "<:gzip", $file or die "Can't open $file: $!\n";
        ## Drops ".faa.gz", ".fasta.gz" or a bare ".gz"
        $label =~ s/(?:\.faa|\.fasta)?\.gz$//;
    }
    else {
        open $fh, "<", $file or die "Can't open $file: $!\n";
        ## With the dot escaped, a name that has no extension is left as is
        ## instead of being erased entirely
        $label =~ s/\.\w+$//;
    }

    print "Extracting information from $label. This might take a while...\n";

    while (my $line = <$fh>){
        chomp $line;
        ## Header lines only: the first token after '>' is the locus, and
        ## everything up to the last " [" is the product. Headers without a
        ## " [" section do not match and are skipped.
        if (my ($locus, $product) = $line =~ /^>(\S+)\s(.*)\s\[/){
            print {$out} "$locus\t$product\n";
        }
    }

    ## Release the input handle now rather than pushing another layer on it
    close $fh;

    my $runtime = time - $stime;
    print "Time to extract products from file $label: $runtime seconds.\n";
}

close $out or die "Can't close $list: $!\n";