forked from douglasgscofield/sparseMEM-big
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmummer.cpp
250 lines (219 loc) · 8.1 KB
/
mummer.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
#include <iostream>
#include <iomanip>
#include <fstream>
#include <vector>
#include "sparseSA.hpp"
#include "fasta.hpp"
#include <getopt.h>
#include <time.h>
#include <sys/time.h>
#include <cctype> // std::tolower(), uppercase/lowercase conversion
#include <boost/archive/binary_iarchive.hpp>
// NOTE use of special characters ~, `, and $ !!!!!!!!
using namespace std;
// Prints the command-line help to stderr and exits with status 1 (defined below).
void usage(string prog);
// Kind of match to compute, selected on the command line:
//   MUM - "-mum": maximal matches unique in both sequences
//   MAM - "-mumreference" / "-mumcand": unique in the reference only
//   MEM - "-maxmatch": all maximal matches regardless of uniqueness
enum mum_t { MUM, MAM, MEM };
// Minimum match length ("-l"); forwarded to the sparseSA search routines.
int min_len = 20;
// Active match type; MAM is the default when no type option is given.
mum_t type = MAM;
// rev_comp ("-b"): also search the reverse complement of every query.
// nucleotides_only ("-n"): query characters other than a/c/g/t are replaced by '~'.
bool rev_comp = false, nucleotides_only = false;
// K ("-k"): suffix sampling step; main() only accepts K != 1 together with -maxmatch.
// num_threads ("-threads"): thread count forwarded to sparseSA::MEM.
// query_threads ("-qthreads"): number of query_thread() workers started by main().
int K = 1, num_threads = 1, query_threads = 1;
// Index used by every query thread; points at the sparseSA object that
// main() deserializes, so it is only valid while main() is running.
sparseSA *sa;
// Path of the query FASTA file (second positional argument); each query
// thread opens its own stream on it.
string query_fasta;
// Work assignment for one query thread, passed to query_thread() as its
// pthread argument. A thread handles the sequences whose running sequence
// counter satisfies (counter % skip) == skip0.
struct query_arg {
// Remainder owned by this thread; main() sets it to the thread's index.
int skip0;
// Modulus of the partition; main() sets it to query_threads. query_thread()
// also enables direct printing only when skip == 1.
int skip;
};
void *query_thread(void *arg_) {
query_arg *arg = (query_arg *)arg_;
string meta, line;
ifstream data(query_fasta.c_str());
vector<match_t> matches;
bool print = arg->skip == 1;
long seq_cnt = 0;
if(!data.is_open()) { cerr << "unable to open " << query_fasta << endl; exit(1); }
// Collect meta data.
while(!data.eof()) {
getline(data, line); // Load one line at a time.
if(line.length() == 0) continue;
if(line[0] == '>') {
long start = 1, end = line.length() - 1;
trim(line, start, end);
for(long i = start; i <= end; i++) {
if( line[i] == ' ') break; // Behave like MUMmer 3 cut off meta after first space.
meta += line[i];
}
cerr << "# " << meta << endl;
break;
}
}
string *P = new string;
while(!data.eof()) {
getline(data, line); // Load one line at a time.
if(line.length() == 0) continue;
long start = 0, end = line.length() - 1;
// Meta tag line and start of a new sequence.
// Collect meta data.
if(line[0] == '>') {
if(meta != "") {
if(seq_cnt % arg->skip == arg->skip0) {
// Process P.
cerr << "# P.length()=" << P->length() << endl;
if(print) printf("> %s\n", meta.c_str());
if(type == MAM) sa->MAM(*P, matches, min_len, print);
else if(type == MUM) sa->MUM(*P, matches, min_len, print);
else if(type == MEM) sa->MEM(*P, matches, min_len, print, num_threads);
if(!print) sa->print_match(meta, matches, false);
if(rev_comp) {
reverse_complement(*P, nucleotides_only);
if(print) printf("> %s Reverse\n", meta.c_str());
if(type == MAM) sa->MAM(*P, matches, min_len, print);
else if(type == MUM) sa->MUM(*P, matches, min_len, print);
else if(type == MEM) sa->MEM(*P, matches, min_len, print, num_threads);
if(!print) sa->print_match(meta, matches, true);
}
}
seq_cnt++;
delete P; P = new string; meta = "";
}
start = 1;
trim(line, start, end);
for(long i = start; i <= end; i++) {
if(line[i] == ' ') break; // Behave like MUMmer 3 cut of meta after first space.
meta += line[i];
}
cerr << "# " << meta << endl;
}
else { // Collect sequence data.
trim(line, start,end);
for(long i = start; i <= end; i++) {
char c = std::tolower(line[i]);
if(nucleotides_only) {
switch(c) {
case 'a': case 't': case 'g': case 'c': break;
default:
c = '~';
}
}
*P += c;
}
}
}
// Handle very last sequence.
if(meta != "") {
if(seq_cnt % arg->skip == arg->skip0) {
cerr << "# P.length()=" << P->length() << endl;
if(print) printf("> %s\n", meta.c_str());
if(type == MAM) sa->MAM(*P, matches, min_len, print);
else if(type == MUM) sa->MUM(*P, matches, min_len, print);
else if(type == MEM) sa->MEM(*P, matches, min_len, print, num_threads);
if(!print) sa->print_match(meta, matches, false);
if(rev_comp) {
reverse_complement(*P, nucleotides_only);
if(print) printf("> %s Reverse\n", meta.c_str());
if(type == MAM) sa->MAM(*P, matches, min_len, print);
else if(type == MUM) sa->MUM(*P, matches, min_len, print);
else if(type == MEM) sa->MEM(*P, matches, min_len, print, num_threads);
if(!print) sa->print_match(meta, matches, true);
}
}
}
delete P;
pthread_exit(NULL);
}
// Program entry point.
//   1. Parses the single-dash long options listed in long_options.
//   2. Loads the serialized sparseSA index named by the first positional
//      argument and publishes it through the global `sa`.
//   3. Starts query_threads joinable workers running query_thread() on the
//      query file named by the second positional argument and waits for
//      all of them.
// Exits with status 1 on invalid options or arguments, when the index file
// cannot be opened, or when a worker thread cannot be created.
int main(int argc, char* argv[]) {
  // Collect parameters from the command line.
  while (1) {
    // The position of an entry in this table is the value getopt reports in
    // longindex, which the switch below branches on.
    static struct option long_options[] = {
      {"l", 1, 0, 0}, // 0
      {"mumreference", 0, 0, 0}, // 1
      {"b", 0, 0, 0}, // 2
      {"maxmatch", 0, 0, 0}, // 3
      {"mum", 0, 0, 0}, // 4
      {"mumcand", 0, 0, 0}, // 5
      {"k", 1, 0, 0}, // 6
      {"threads", 1, 0, 0}, // 7
      {"n", 0, 0, 0}, // 8
      {"qthreads", 1, 0, 0}, // 9
      {0, 0, 0, 0}
    };
    int longindex = -1;
    int c = getopt_long_only(argc, argv, "", long_options, &longindex);
    if(c == -1) break; // Done parsing flags.
    else if(c == '?') { // If the user entered junk, let him know.
      cerr << "Invalid parameters." << endl;
      usage(argv[0]); // Does not return.
    }
    else {
      // Branch on long options.
      switch(longindex) {
        case 0: min_len = atol(optarg); break;
        case 1: type = MAM; break;
        case 2: rev_comp = true; break;
        case 3: type = MEM; break;
        case 4: type = MUM; break;
        case 5: type = MAM; break;
        case 6: K = atoi(optarg); break;
        case 7: num_threads = atoi(optarg); break;
        case 8: nucleotides_only = true; break;
        case 9: query_threads = atoi(optarg) ; break;
        default: break;
      }
    }
  }

  // Exactly two positional arguments are required: index file and query file.
  if (argc - optind != 2) usage(argv[0]);
  if(K != 1 && type != MEM) { cerr << "-k option valid only for -maxmatch" << endl; exit(1); }
  if(num_threads <= 0) { cerr << "invalid number of threads specified" << endl; exit(1); }
  // A non-positive value would either start no worker at all or, when
  // negative, be converted to a huge size for the vectors created below.
  if(query_threads <= 0) { cerr << "invalid number of query threads specified" << endl; exit(1); }

  string ref_fasta_index = argv[optind];
  query_fasta = argv[optind+1];

  // Load the serialized index. Check the stream first so that a missing
  // file gives a readable message instead of an archive error.
  std::ifstream infile(ref_fasta_index.c_str(), std::ios::binary | std::ios::in);
  if(!infile.is_open()) { cerr << "unable to open " << ref_fasta_index << endl; exit(1); }
  boost::archive::binary_iarchive ia(infile);
  sparseSA ssa;
  ia >> ssa;
  sa = &ssa; // ssa outlives the workers because they are joined below.

  pthread_attr_t attr; pthread_attr_init(&attr);
  pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
  vector<query_arg> args(query_threads);
  vector<pthread_t> thread_ids(query_threads);

  // Initialize additional thread data: worker i handles every
  // query_threads-th sequence starting at offset i.
  for(int i = 0; i < query_threads; i++) {
    args[i].skip = query_threads;
    args[i].skip0 = i;
  }

  // Create joinable threads to find MEMs.
  for(int i = 0; i < query_threads; i++) {
    int rc = pthread_create(&thread_ids[i], &attr, query_thread, (void *)&args[i]);
    // Joining an id that was never started is undefined, so stop here.
    if(rc != 0) { cerr << "unable to create query thread " << i << endl; exit(1); }
  }
  pthread_attr_destroy(&attr);

  // Wait for all threads to terminate.
  for(int i = 0; i < query_threads; i++)
    pthread_join(thread_ids[i], NULL);

  return 0;
}
// Writes the command-line help to stderr and terminates the process with
// exit status 1; this function never returns.
//
// prog : program name shown on the "Usage:" line (main() passes argv[0]).
//
// The example commands list the reference before the query, matching the
// synopsis and the order in which main() reads its positional arguments.
void usage(string prog) {
  cerr << "Usage: " << prog << " [options] <reference-file-index> <query-file>" << endl;
  cerr << "Implemented MUMmer v3 options:" << endl;
  cerr << "-mum compute maximal matches that are unique in both sequences" << endl;
  cerr << "-mumreference compute maximal matches that are unique in the reference-" << endl;
  cerr << " sequence but not necessarily in the query-sequence (default)" << endl;
  cerr << "-mumcand same as -mumreference" << endl;
  cerr << "-maxmatch compute all maximal matches regardless of their uniqueness" << endl;
  cerr << "-l set the minimum length of a match" << endl;
  cerr << " if not set, the default value is 20" << endl;
  cerr << "-b compute forward and reverse complement matches" << endl;
  cerr << "-n match only the characters a, c, g, or t" << endl;
  cerr << endl;
  cerr << "Additional options:" << endl;
  cerr << "-k sampled suffix positions (one by default)" << endl;
  cerr << "-threads number of threads to use for -maxmatch, only valid k > 1 " << endl;
  cerr << "-qthreads number of threads to use for queries " << endl;
  cerr << endl;
  cerr << "Example usage:" << endl;
  cerr << endl;
  cerr << "./mummer -maxmatch -l 20 -b -n -k 3 -threads 3 ref.fa query.fa" << endl;
  cerr << "Find all maximal matches on forward and reverse strands" << endl;
  cerr << "of length 20 or greater, matching only a, c, t, or g." << endl;
  cerr << "Index every 3rd position in the ref.fa and use 3 threads to find MEMs." << endl;
  cerr << "Fastest method for one long query sequence." << endl;
  cerr << endl;
  cerr << "./mummer -maxmatch -l 20 -b -n -k 3 -qthreads 3 ref.fa query.fa" << endl;
  cerr << "Same as above, but now use a single thread for every query sequence in" << endl;
  cerr << "query.fa. Fastest for many small query sequences." << endl;
  exit(1);
}