-
Notifications
You must be signed in to change notification settings - Fork 0
/
NGS_1_Demultiplex.pl
173 lines (127 loc) · 4.56 KB
/
NGS_1_Demultiplex.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/perl -w
use warnings;
use strict;
use 5.18.0;
use autodie;
system("clear");
##### User input #####
# Edit the values in this section before running the script.
# File names are appended directly to $full_path, so it has to end with a slash.
my $full_path      = '/home/user/';
my $file_name      = 'NGS.fastq';
my $five_const     = 'GGGCAACTCCAAGCTAGATCTACCGGT';          ## 5' constant region of SELEX pool (5'->3')
my $three_const    = 'AAAATGGCTAGCAAAGGAGAAGAACTTTTCACT';    ## 3' constant region of SELEX pool (5'->3')
# Number of bases taken from the end of the sequence that precedes the
# 5' constant region; these bases are compared against the barcode table.
my $length_barcode = 7;
my $barcode_file   = 'barcodes.txt';                         # Placed in $full_path
# Barcode => library table, loaded once before any read is processed.
my %barcodes = get_barcodes($full_path . $barcode_file);
# Run statistics; both counters are incremented inside assign_library().
my $barcodes_recognized = 0;
my $barcode_correct     = 0;
######################
##### Expected input #####
#
# @title and optional description
# sequence line
# +optional repeat of title line --> in the output here the variable region will be written after the '+'-sign
# quality line
#
# EXAMPLE:
#
# @NS500786:89:HCV7MBGX2:1:11101:24717:1089 1:N:0:ATTACTCG+AGGCTATA
# TATAGTGGATCCGACCGTGGTGCCGTGATCACGGTATCGGATTAGGCCCATACTTATCGCTTTTCTACCTACGTCG
# +
# AAAAAEEEEEEEEAEEEEEEEEEEEEEEEEAEEEEEEEEEEEEEEE/EEEEEEEEEE</EEEEEAEAEEEEE/EEE
#
##########################
## Reading fastq file
#
# Streams the input four lines at a time (identifier, sequence, '+' line,
# quality line). The sequence line is passed to qualitycontrol(), which
# returns the library the read belongs to and its random region. Once the
# fourth line of a record has been read, the record is appended to
# "<library>.fastq" in $full_path, with the random region written after the
# '+' sign. Records whose random region is "---" are not written.
#
# open/close failures abort the script through autodie (enabled at the top
# of the file).
my ($read_counter, $line_counter) = (0, 0);
my ($identifier, $sequence, $library, $rnd_region, $optional_information, $quality_score);

# One append-mode handle per library, opened on first use and reused for all
# later reads, instead of reopening the output file for every single read.
my %output_fh_for;

open(my $fastq_fh, '<', $full_path . $file_name);
say "Processing file: $file_name";
while (my $line = <$fastq_fh>) {
    chomp $line;

    if ($line_counter == 0) {
        ## First, short quality control: every record has to start with '@'
        die "Lineissue $line" if substr($line, 0, 1) ne '@';
        $identifier = $line;
    }
    elsif ($line_counter == 1) {
        $sequence = $line;
        ($library, $rnd_region) = qualitycontrol($line);
    }
    elsif ($line_counter == 2) {
        $optional_information = $line;
    }
    else {
        # $line_counter only ever takes the values 0..3, so this is line 4.
        $quality_score = $line;
    }

    # Everything below runs once per complete record only.
    next if ++$line_counter < 4;
    $line_counter = 0;
    ++$read_counter;

    ## Writing output to HD according to barcode
    if ($rnd_region ne "---") {
        my $out_fh = $output_fh_for{$library} //= do {
            open(my $new_fh, '>>', $full_path . $library . ".fastq");
            $new_fh;
        };
        print {$out_fh} $identifier . "\n" . $sequence . "\n+" . $rnd_region . "\n" . $quality_score . "\n";
    }

    #last if $read_counter == 10_000; ## Remove in final run. For debugging only.

    # Progress report; the timestamp is only computed when a line is printed.
    say scalar(localtime()) . ": " . $read_counter if $read_counter % 100_000 == 0;
}
close($fastq_fh);

# Closing the write handles flushes buffered output; autodie reports failures.
for my $out_fh (values %output_fh_for) {
    close($out_fh);
}

# A leftover partial record means the input was truncated or malformed.
warn "Warning: $file_name ended in the middle of a record; "
   . "the last $line_counter line(s) were ignored.\n"
    if $line_counter != 0;
# Final run statistics.
# Both percentages divide by a counter that can legitimately be zero (empty
# input file, or no barcode recognized at all). In that case "n/a" is printed
# instead of aborting with "Illegal division by zero".
say "FastQ file read & processed.\n\n";
say "Total number of barcodes recognized: " . $barcodes_recognized;
say "Total number of correct barcodes: " . $barcode_correct;

# Share of recognized barcodes that were not an exact match.
my $pct_corrected = $barcodes_recognized
    ? (1 - ($barcode_correct / $barcodes_recognized)) * 100
    : "n/a";

# Recognized barcodes relative to the number of reads processed.
my $pct_recognized = $read_counter
    ? $barcodes_recognized / $read_counter * 100
    : "n/a";

say "Percentage of corrected barcodes: " . $pct_corrected . " %";
say "Percentage sequences recognized: " . $pct_recognized . " %\n\n\n";
#######################
## Subroutines:
## Quality control
#
# Searches a read for the 5' constant region, first in the orientation given
# and, if that fails, in its reverse complement.
#
# Argument: the sequence line of one FASTQ record.
# Returns:  a two-element list (library, random region).
#           - Constant region found: the text before the match goes to
#             assign_library(), the text after it to extract_rnd_region().
#           - Not found in either orientation: ("0", <unchanged sequence>).
sub qualitycontrol {
    my ($read) = @_;
    use re::engine::TRE (max_cost => 2); # Allow up to two mismatches while recognizing the constant regions

    # The flanks are copied out of $` and $' straight after the successful
    # match, before any helper sub runs regex matches of its own.
    my ($upstream, $downstream);
    if ($read =~ m/$five_const/) {
        ($upstream, $downstream) = ($`, $');
    }
    elsif (rc($read) =~ m/$five_const/) {
        ($upstream, $downstream) = ($`, $');
    }
    else {
        # If constant region can not be detected, return "0"
        return ("0", $read);
    }

    return (assign_library($upstream), extract_rnd_region($downstream));
    no re::engine::TRE;
}
## Assigning libraries using error-correctable barcodes
#
# Takes the last $length_barcode characters of the given sequence and compares
# them against every barcode in %barcodes.
#
# Argument: the sequence found in front of the 5' constant region.
# Returns:  the library stored for the matching barcode, or 0 if none matches.
#
# Side effects on the file-level statistics counters:
#   $barcode_correct      +1 for every barcode matched by the first test
#   $barcodes_recognized  +1 for every barcode matched by the TRE test
#
# Every barcode is tested, so if more than one barcode matches within the
# allowed cost, each one is counted and the last one visited wins. The
# barcode set should be designed so that this cannot happen.
sub assign_library {
    my ($upstream) = @_;
    my $barcode_to_identify = substr($upstream, -$length_barcode);
    my $assigned_library    = 0;

    for my $barcode (keys %barcodes) {
        # Next line just for statistics. This test sits above the TRE pragma
        # below, so it is presumably run by Perl's built-in regex engine,
        # i.e. it only succeeds on an error-free barcode.
        if ($barcode_to_identify =~ m/$barcode/) {
            $barcode_correct++;
        }

        use re::engine::TRE (max_cost => 1); ## Max_cost depends on the designed barcodes
        if ($barcode_to_identify =~ m/$barcode/) {
            $assigned_library = $barcodes{$barcode};
            $barcodes_recognized++;
        }
        no re::engine::TRE;
    }

    return ($assigned_library);
}
## Extract rnd_region
#
# Looks for the 3' constant region in the given sequence.
#
# Argument: the sequence that follows the 5' constant region.
# Returns:  everything in front of the 3' constant region, or the marker
#           string "---" when the 3' constant region is not found.
sub extract_rnd_region {
    my ($downstream) = @_;

    # For matching the 3' constant region, max_cost is set to a higher value.
    # This increases recovery rate
    use re::engine::TRE (max_cost => 4, max_ins => 1, max_del => 1);

    return ("---") unless $downstream =~ m/$three_const/;
    return ($`);
    no re::engine::TRE;
}
## Reverse complement
#
# Argument: a nucleotide sequence.
# Returns:  the sequence reversed, with A<->T and G<->C swapped (upper and
#           lower case). N/n and any other character are kept as they are.
sub rc {
    my ($seq) = @_;
    my $reversed = reverse $seq;    # scalar assignment, so the string itself is reversed
    return $reversed =~ tr/ATGCatgcNn/TACGtacgNn/r;
}
## Get barcodes from file and store in hash
#
# Reads the barcode table and returns it as a (barcode => library) list that
# can be assigned to a hash.
#
# Expected file format, one assignment per line:
#     "CTAAGTC" => "1"
# Quotes (single or double) around either value and a trailing comma or
# semicolon are optional. Blank lines and lines starting with '#' are
# skipped. Any other line is reported with a warning and ignored.
#
# Argument: path of the barcode file.
# Returns:  list of barcode/library pairs (empty if the file has no entries).
# Dies if the file cannot be opened.
sub get_barcodes {
    my ($path) = @_;
    my %barcodes;

    open(my $barcode_fh, '<', $path)
        or die "Cannot open barcode file '$path': $!";

    while (my $line = <$barcode_fh>) {
        $line =~ s/\s+\z//;                   # strips the line ending (LF or CRLF) and trailing blanks
        next if $line =~ m/\A\s*(?:#|\z)/;    # blank line or comment

        if ($line =~ m{\A \s* (["']?) ([^"'\s=,;]+) \1 \s* => \s* (["']?) ([^"'\s,;]+) \3 \s* [,;]? \z}x) {
            $barcodes{$2} = $4;
        }
        else {
            warn "Skipping unrecognized line $. in barcode file '$path': $line\n";
        }
    }
    close($barcode_fh);

    return (%barcodes);
}
## End of file