-
Notifications
You must be signed in to change notification settings - Fork 0
/
splitGFF3.pl
executable file
·86 lines (74 loc) · 2.1 KB
/
splitGFF3.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/perl
## Pombert Lab, IIT, 2020
## Script metadata; interpolated into the usage message below
my $name = 'splitGFF3.pl';
my $version = '0.3a';
my $updated = '2021-03-27';
use strict; use warnings; use File::Basename; use Getopt::Long qw(GetOptions);
## Defining options
## The usage text is printed (via die) when the script is run without
## arguments, with invalid switches, or without the mandatory -g option.
my $usage = <<"OPTIONS";
NAME ${name}
VERSION ${version}
UPDATED ${updated}
SYNOPSIS Splits an Apollo-like GFF3 file into distinct GFF3 (.gff3) and FASTA (.fsa) files, one per contig/chromosome
USAGE ${name} \\
-g file.gff3 \\
-d splitGFF3/
OPTIONS:
-g (--gff3) GFF3 file generated by Apollo
-d (--dir) Output directory (Optional)
OPTIONS
die "\n$usage\n" unless @ARGV;
my $gff3;   ## Input GFF3 file (-g / --gff3); mandatory
my $odir;   ## Output directory (-d / --dir); optional
## GetOptions returns false when it meets an unknown or malformed switch;
## stop with the usage text instead of silently carrying on.
GetOptions(
    'g|gff3=s' => \$gff3,
    'd|dir=s' => \$odir
) or die "\n$usage\n";
## Without -g the rest of the script would operate on an undefined file name
unless (defined $gff3){
    die "\nNo GFF3 file specified with -g (--gff3)\n\n$usage\n";
}
## Checking output directory
## Defaults to the current directory when -d was not provided, and creates
## the requested directory when it is not already present.
$odir = './' if !defined $odir;
if (!-d $odir){
    mkdir($odir, 0755) or die "Can't create folder $odir: $!\n";
}
print "\nOutput files will be located in directory $odir\n";
## Parsing GFF3 files
## %contigs is keyed by contig/chromosome name; each value is an array with
##   [0] = reference to the list of GFF3 feature lines for that contig
##   [1] = concatenated FASTA sequence for that contig (only when present)
my %contigs;
my $fasta;      ## undef while reading annotations; 1 once ##FASTA is seen; then the current sequence name
my $in_record;  ## true once a FASTA header (>name) has been read
my ($gff, $dir) = fileparse($gff3);
print "Working on file $gff located in $dir\n\n";
open my $in_fh, "<", $gff3 or die "Can't read $gff3: $!\n";
while (my $line = <$in_fh>){
    chomp $line;
    $line =~ s/\r$//; ## Tolerating CRLF line endings so that no \r ends up inside sequences
    if ($line =~ /^##FASTA/){ ## Annotations are listed before FASTA sequences in Apollo GFF3 files
        $fasta = 1;
        $in_record = 0;
    }
    elsif ($line =~ /^#/){ ## Skipping comments
        next;
    }
    elsif ($line =~ /^>(\S+)/){ ## Checking for presence of FASTA sequences at the end of Apollo GFF3 files
        $fasta = $1;
        $in_record = 1;
    }
    elsif (!defined $fasta){
        ## Annotation section: the first column holds the contig name.
        ## Blank or whitespace-led lines carry no contig name and are skipped,
        ## instead of being filed under an undefined key.
        if ($line =~ /^(\S+)/){
            push (@{$contigs{$1}[0]}, $line);
        }
    }
    elsif ($in_record){
        ## Sequence line belonging to the most recent FASTA header
        $contigs{$fasta}[1] .= $line;
    }
    ## Remaining case: text between ##FASTA and the first header. It has no
    ## sequence name to attach to, so it is ignored.
}
close $in_fh;
## Writing one GFF3 file per contig, plus one FASTA file when sequence data
## was found for that contig. Keys are sorted so that files are written in a
## reproducible order.
for my $key (sort keys %contigs){
    my $features = $contigs{$key}[0]; ## array ref of GFF3 lines, or undef
    my $sequence = $contigs{$key}[1]; ## concatenated sequence, or undef

    ## The GFF3 file is always created, even when the contig has no features
    my $gff_file = "$odir/$key.gff3";
    open my $gff_fh, ">", $gff_file or die "Can't create file $gff_file: $!\n";
    if (defined $features){
        ## Iterating with for (not while/shift) so that a false value such as
        ## "0" cannot end the output early
        for my $feature (@{$features}){
            print {$gff_fh} "$feature\n";
        }
    }
    ## Buffered write errors only surface at close, so it is checked
    close $gff_fh or die "Can't close file $gff_file: $!\n";

    if (defined $sequence){ ## Prints only if FASTA is present in the GFF3 file
        my $fasta_file = "$odir/$key.fsa";
        open my $fasta_fh, ">", $fasta_file or die "Can't create file $fasta_file: $!\n";
        print {$fasta_fh} ">$key\n";
        ## Wrapping the sequence at 60 characters per line
        for my $chunk (unpack ("(A60)*", $sequence)){
            print {$fasta_fh} "$chunk\n";
        }
        close $fasta_fh or die "Can't close file $fasta_file: $!\n";
    }
}
exit;