#!/usr/bin/env nextflow
/*
========================================================================================
LAVA
========================================================================================
Longitudinal Analysis of Viral Alleles
#### Homepage / Documentation
https://github.com/greninger-lab/lava
----------------------------------------------------------------------------------------
*/
// This workflow uses Nextflow DSL2 so its logic can be composed from imported module processes
nextflow.preview.dsl=2
def helpMessage() {
log.info """
LAVA: Longitudinal Analysis of Viral Alleles
Usage:
An example command for running the pipeline is as follows:
nextflow run greninger-lab/lava \\
--CONTROL_FASTQ    The fastq reads for the first sample in your longitudinal
                   analysis. [REQUIRED]
--METADATA         A two-column csv: the first column is the path to each fastq
                   you wish to include in your analysis. All fastqs that you
                   want to include must be listed in this file AND be located
                   in the folder from which you are running lava. The second
                   column is the temporal separation between the samples. This
                   is unitless, so you can input passage number, days, or
                   whatever condition your experiment happens to have. By
                   default this column must contain numbers unless the pipeline
                   is run with --CATEGORICAL. [REQUIRED]
--OUTDIR           Output directory. [REQUIRED]
--FASTA            Specify a reference fasta with the majority consensus of the
                   control fastq. This option must be used with --GFF to
                   specify the protein annotations relative to the start of
                   this fasta. [REQUIRED IF NOT --GENBANK]
--GFF              Specify a reference gff file with the protein annotations
                   for the reference fasta supplied with --FASTA. This option
                   must be paired with --FASTA. [REQUIRED IF NOT --GENBANK]
--GENBANK          Provide a GenBank accession number. This record will be used
                   to generate a majority consensus from the control fastq, and
                   this consensus will be annotated from the downloaded GenBank
                   record as well. [REQUIRED IF NOT --FASTA + --GFF]
--ALLELE_FREQ      Specify an allele frequency percentage cutoff, in whole
                   numbers, with a minimum of 1 percent.
--NUC              Report results as nucleotide changes rather than amino acid
                   changes. Do not use with --PNG.
--PNG              Output results as a png. Do not use with --NUC.
--DEDUPLICATE      Optional flag; performs automatic removal of PCR duplicates
                   via DeDup.
--CATEGORICAL      Optional flag; treats the second metadata column ("Passage")
                   as a categorical variable. Make sure the category names
                   contain no spaces.
""".stripIndent()
}
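// A hypothetical example of the metadata csv described above; the header names
// "Sample" and "Passage" match the columns read from this file further down:
//
//   Sample,Passage
//   control.fastq,0
//   sample_day5.fastq,5
//   sample_day10.fastq,10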
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
/* */
/* SET UP CONFIGURATION VARIABLES */
/* */
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
// Show help message
params.help = false
if (params.help){
helpMessage()
exit 0
}
// Default values for optional and required-but-checked parameters
params.OUTDIR = false
params.CONTROL_FASTQ = false
params.GENBANK = 'False'
params.GFF = 'False'
params.FASTA = 'false'
params.DEDUPLICATE = 'false'
params.ALLELE_FREQ = 'NO_VAL'
params.CATEGORICAL = 'false'
METADATA_FILE = file(params.METADATA)
/*
* Import the processes used in this workflow
*/
include { CreateGFF_Genbank } from './Modules.nf'
include { CreateGFF } from './Modules.nf'
include { Alignment_prep } from './Modules.nf'
include { Align_samples } from './Modules.nf'
include { Pipeline_prep } from './Modules.nf'
include { Create_VCF } from './Modules.nf'
include { Ref_done } from './Modules.nf'
include { Extract_variants } from './Modules.nf'
include { Annotate_complex } from './Modules.nf'
include { Annotate_complex_first_passage } from './Modules.nf'
include { Generate_output } from './Modules.nf'
// Staging python scripts
PULL_ENTREZ = file("$workflow.projectDir/bin/pull_entrez.py")
WRITE_GFF = file("$workflow.projectDir/bin/write_gff.py")
INITIALIZE_PROTEINS_CSV = file("$workflow.projectDir/bin/initialize_proteins_csv.py")
ANNOTATE_COMPLEX_MUTATIONS = file("$workflow.projectDir/bin/Annotate_complex_mutations.py")
MAT_PEPTIDE_ADDITION = file("$workflow.projectDir/bin/mat_peptide_addition.py")
RIBOSOMAL_SLIPPAGE = file("$workflow.projectDir/bin/ribosomal_slippage.py")
GENOME_PROTEIN_PLOTS = file("$workflow.projectDir/bin/genome_protein_plots.py")
PALETTE = file("$workflow.projectDir/bin/palette.py")
//FASTA = file(params.FASTA)
//input_read_ch = Channel
// Error handling for input flags
// Exit with a clear message if CONTROL_FASTQ was not set
if (!params.CONTROL_FASTQ){
println("Must provide a control FASTQ with --CONTROL_FASTQ")
exit(1)
}
// Throws an exception if the CONTROL_FASTQ path does not exist
CONTROL_FASTQ = file(params.CONTROL_FASTQ, checkIfExists:true)
// Exit if OUTDIR was not set
if (params.OUTDIR == false) {
println("Must provide an output directory with --OUTDIR")
exit(1)
}
// If --GENBANK and --FASTA or --GFF are specified at the same time
// if(((params.GENBANK != "False") && (params.FASTA != "NO_FILE"))){
// println("--GENBANK cannot be used with --FASTA or --GFF")
// exit(1)
// }
// if(((params.GENBANK != "False") && (params.GFF != "False"))){
// println("--GENBANK cannot be used with --FASTA or --GFF")
// exit(1)
// }
// // If --FASTA without --GENBANK or vice versa
// if( (params.FASTA != "NO_FILE") && params.GFF == 'False'){
// println('--GFF needs to be specified with --FASTA')
// exit(1)
// }
// if( (params.GFF != "False") && params.FASTA == 'NO_FILE'){
// println('--FASTA needs to be specified with --GFF')
// exit(1)
// }
// A reference must be supplied either as --GENBANK or as --FASTA + --GFF
if(params.GFF == "False" && params.FASTA == 'false' && params.GENBANK == "False"){
println('Either --GENBANK or --FASTA + --GFF are required flags')
exit(1)
}
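// Two example invocations (file names and accession are hypothetical) showing the
// alternative ways of supplying a reference:
//   nextflow run greninger-lab/lava --CONTROL_FASTQ control.fastq --METADATA metadata.csv \
//       --OUTDIR results/ --GENBANK <accession>
//   nextflow run greninger-lab/lava --CONTROL_FASTQ control.fastq --METADATA metadata.csv \
//       --OUTDIR results/ --FASTA reference.fasta --GFF reference.gff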
// Make sure OUTDIR ends with a trailing slash
if (!params.OUTDIR.endsWith("/")){
params.OUTDIR = "${params.OUTDIR}/"
}
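// Build the input channel from the metadata csv: one (fastq file, passage) tuple per row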
input_read_ch = Channel
.fromPath(METADATA_FILE)
.splitCsv(header:true)
.map{ row-> tuple(file(row.Sample), (row.Passage)) }
// Throws exception if paths in METADATA are not valid
Channel
.fromPath(METADATA_FILE)
.splitCsv(header:true)
.map{row-> (file(row.Sample, checkIfExists:true))}.ifEmpty{error "Check metadata file"}
//.map{row-> (file(row.Sample).isEmpty())}
//.filter{ it == false}.subscribe{println it}
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
/* */
/* RUN THE WORKFLOW */
/* */
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
workflow {
log.info nfcoreHeader()
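// Reference preparation: build the reference from a GenBank accession when no
// --FASTA was given, otherwise use the user-supplied --FASTA + --GFF pair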
if(params.FASTA == 'false') {
CreateGFF_Genbank (
params.GENBANK,
CONTROL_FASTQ,
PULL_ENTREZ,
WRITE_GFF
)
Alignment_prep (
CreateGFF_Genbank.out[0],
CreateGFF_Genbank.out[1],
CreateGFF_Genbank.out[2],
CreateGFF_Genbank.out[3],
CreateGFF_Genbank.out[4],
CreateGFF_Genbank.out[5]
)
}
// When a --FASTA is supplied, the reference and annotations come from the
// user-provided --FASTA + --GFF pair instead of GenBank
else {
CreateGFF (
params.GENBANK,
CONTROL_FASTQ,
PULL_ENTREZ,
WRITE_GFF,
file(params.FASTA),
file(params.GFF)
)
Alignment_prep (
CreateGFF.out[0],
CreateGFF.out[1],
CreateGFF.out[2],
CreateGFF.out[3],
CreateGFF.out[4],
CreateGFF.out[5]
)
}
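// Align each fastq listed in the metadata against the prepared reference;
// PCR-duplicate removal is applied when --DEDUPLICATE is set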
Align_samples (
input_read_ch,
Alignment_prep.out[0],
input_read_ch.first(),
params.DEDUPLICATE
)
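// Collect the aligned samples and set up the merged files and proteins csv
// (initialize_proteins_csv.py) used by the downstream steps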
Pipeline_prep (
Align_samples.out[0].collect(),
Alignment_prep.out[5],
Alignment_prep.out[6],
Alignment_prep.out[0],
INITIALIZE_PROTEINS_CSV
)
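// Call variants and produce a VCF for each aligned sample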
Create_VCF (
Alignment_prep.out[6],
Pipeline_prep.out[2],
Align_samples.out[0],
Alignment_prep.out[1],
input_read_ch.first(),
Alignment_prep.out[2]
)
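// Handle the control (first) sample, applying the --ALLELE_FREQ cutoff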
Ref_done (
input_read_ch.first(),
params.ALLELE_FREQ,
Create_VCF.out[0],
Alignment_prep.out[6],
Pipeline_prep.out[3],
Align_samples.out[1],
METADATA_FILE
)
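// Extract the variants from each sample's VCF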
Extract_variants (
input_read_ch.first(),
Create_VCF.out[1],
METADATA_FILE
)
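// Annotate complex mutations in each sample with Annotate_complex_mutations.py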
Annotate_complex(
Extract_variants.out[0],
ANNOTATE_COMPLEX_MUTATIONS
)
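// The control / first passage sample gets the same complex-mutation annotation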
Annotate_complex_first_passage(
Ref_done.out[0],
ANNOTATE_COMPLEX_MUTATIONS
)
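// Merge everything and generate the final LAVA outputs, including the genome
// and protein plots (genome_protein_plots.py)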
Generate_output(
Annotate_complex_first_passage.out,
Annotate_complex.out[0].collect(),
Annotate_complex.out[1].collect(),
Annotate_complex.out[2].collect(),
Annotate_complex.out[3].collect(),
Pipeline_prep.out[0],
Pipeline_prep.out[1],
Align_samples.out[2].collect(),
Create_VCF.out[2].collect(),
Alignment_prep.out[7],
Alignment_prep.out[8],
MAT_PEPTIDE_ADDITION,
RIBOSOMAL_SLIPPAGE,
GENOME_PROTEIN_PLOTS,
PALETTE
)
}
def nfcoreHeader() {
return """
ooO
ooOOOo
oOOOOOOoooo
ooOOOooo oooo
/vvv\\
/V V V\\
/V V V\\
/ \\ oh wow look at these alleles
/ \\ / /
/ \\ o o
__ / \\ /- o /-
/\\ / \\ /\\ -/- /\\
/\\
___ _______ __ __ _______
| | | _ || | | || _ |
| | | |_| || |_| || |_| |
| | | || || |
| |___ | || || |
| || _ | | | | _ |
|_______||__| |__| |___| |__| |__| Version 2
""".stripIndent()
}