-
Notifications
You must be signed in to change notification settings - Fork 6
/
main.nf
904 lines (665 loc) · 27.9 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
params.fragments = null // CR fragments file.
params.posbam = null // CR possorted bam file.
params.chromlen = null // File with names of chromosomes.
params.ncell = 0 // Can be set for testing purposes; suggest using a
// sampled fragment file, e.g.
// zcat fragments.tsv.gz | perl -ne 'print if $. % 11 == 0;' | gzip > fragments11.tsv.gz
params.cellcsv = null // CR singlecell.csv file
params.cellbatchsize = 500 // for demuxing and for masterpeak coverage
params.outdir = 'results'
params.sampleid = 'thesamp' // Used in some names of output files
params.winsize = 5000 // bin size for genome
params.nbcest = 10000 // estimated number of non-empty barcodes.
params.nclades = 10
params.ntfs = 20000
params.ntfs_ofs = 1
params.npcs = 20
params.sumrank = 'true'
params.clip_SVD = true
params.mermul = false // merge multiplets; only supported for single samples
params.usecls = '__seurat__'
params.mergepeaks = true
params.perclusterpeaks = false
params.stopatcluster = false
params.o_clusdef = false // Output cluster definitions (bed files) yes/no
params.o_winmcx = false // Output cluster definitions (bed files) yes/no
params.muxfile = null // analyse multiple samples
params.macs2_genomesize = '2.7e9'
if ((!params.fragments || !params.cellcsv || !params.posbam) && !params.muxfile) {
exit 1, "Please supply --fragments <CR-fragment-file> --cellcsv <CR-cellcsv-file> --posbam <CR-posbam-file>"
}
if (!params.chromlen) {
exit 1, "Please supply --chromlen <chr-names-file>"
}
ch_usercls = params.usecls =~ /^__.*__$/ ? Channel.empty() : Channel.fromPath(params.usecls)
ch_mux = params.muxfile && !params.mermul ? Channel.fromPath(params.muxfile) : Channel.empty()
ch_cellfile = params.cellcsv ? Channel.fromPath(params.cellcsv) : Channel.empty()
ch_fragfile_cr = params.fragments ? Channel.fromPath(params.fragments) : Channel.empty()
ch_bamfile = params.posbam ? Channel.fromPath(params.posbam) : Channel.empty()
ch_chrlen = Channel.fromPath(params.chromlen)
ch_cellfile.into { ch_cellfile_param; ch_cellfile_param2; }
ch_bamfile.into { ch_bamfile_param; ch_bamfile_param2 }
/* below umm yes is unsightly. DSL2 will help, should we port this to it */
ch_chrlen.into { ch_chrom_length_cr_single; ch_chrom_length_cr_mux; ch_chrom_length_mm; ch_chrom_length; ch_chrom_length2; ch_chrom_length3; ch_chrom_length4 }
/*
thefragfile = params.fragments ? file(params.fragments) : null
thecellfile = params.cellcsv ? file(params.cellcsv) : null
thebamfile = params.posbam ? file(params.posbam) : null
*/
def just_name(full_file_path) {
full_file_path.toString().split('/')[-1] // fixme: use generic path separator
}
process prepare_cr_single {
tag "cr-prep $cellbatchsize"
publishDir "${params.outdir}", pattern: 'cellmetadata', mode: 'copy'
when: !params.mermul && !params.muxfile
input:
file(f_cells) from ch_cellfile_param.collect()
file(posbam) from ch_bamfile_param.collect()
file(f_chromlen) from ch_chrom_length_cr_single.collect()
val cellbatchsize from params.cellbatchsize
val winsize from params.winsize
output:
file('cellmetadata/cells.tab') into ch_celltab_cr
file('cellmetadata/win.tab') into ch_wintab_cr
file('cellmetadata/singlecell.tsv') into ch_cellfile_single
set val("crsingle"), file('c_c.*') into ch_demux_cr
shell:
filter = params.ncell > 0 ? "head -n ${params.ncell}" : "cat"
'''
mkdir -p cellmetadata
# Names + info of selected cells.
# fixme '10th field (nine with zero-offset)' is very brittle;
cat !{f_cells} | tr ',' '\\t' | perl -ane 'print if $F[9] == 1' \\
| (sort -rnk 2 || true) | !{filter} > cellmetadata/chosen_cells.info
# Just the names of selected cells.
cut -f 1 cellmetadata/chosen_cells.info | sort > cellmetadata/cell.names
# Tab file for mcx matrix loading
nl -v0 -nln -w1 < cellmetadata/cell.names > cellmetadata/cells.tab
# Batch lists for demuxing
split -l !{cellbatchsize} cellmetadata/cell.names c_c.
# Tab file for windows
ca_make_chromtab.pl !{winsize} !{f_chromlen} > cellmetadata/win.tab
# Subselect what we need from singlecell.tsv file.
# fixme brittle header/data coupling.
echo -e "barcode\\ttotal\\tduplicate\\tchimeric\\tunmapped\\tlowmapq\\tmitochondrial\\tpassed_filters\\tcell_id\\tis__cell_barcode\\tTSS_fragments\\tDNase_sensitive_region_fragments\\tenhancer_region_fragments\\tpromoter_region_fragments\\ton_target_fragments\\tblacklist_region_fragments\\tpeak_region_fragments\\tpeak_region_cutsites" > cellmetadata/singlecell.tsv
cat cellmetadata/chosen_cells.info >> cellmetadata/singlecell.tsv
'''
}
process prepare_cr_mux { // integrate multiple fragment files
tag "cr-mux-prep $sampleid $sampletag"
container 'quay.io/cellgeni/cellclusterer'
when: (!params.mermul) && (!params.posbam) && (params.muxfile)
input:
file(f_chromlen) from ch_chrom_length_cr_mux.collect()
set val(sampletag), val(sampleid), val(root) from ch_mux
.splitCsv(sep: '\t')
.view{"manifest: $it"}
val cellbatchsize from params.cellbatchsize
val winsize from params.winsize
output:
file("cellmetadata/${sampleid}.names") into ch_cellnames_many_cr
file('cellmetadata/*-win.tab') into ch_wintab_many_cr
file("cellmetadata/${sampleid}.info_tagged") into ch_cellinfo_many_cr
file("cellmetadata/${sampleid}.map") into ch_tagmap_many_cr
set val(sampletag), file("*.fragments.gz"), file('c_c.*') into ch_demux_many_cr
shell:
filter = params.ncell > 0 ? "head -n ${params.ncell}" : "cat"
'''
# make fragments, bam, manifest available
fragfile=!{sampleid}.fragments.gz # output; not used below
cellfile=!{sampleid}.singlecell.csv
ln -s !{root}/fragments.tsv.gz $fragfile
ln -s !{root}/singlecell.csv $cellfile
mkdir -p cellmetadata
# Tab file for windows
ca_make_chromtab.pl !{winsize} !{f_chromlen} > cellmetadata/!{sampletag}-win.tab
# Names + info of selected cells.
get-col.py -i $cellfile -s ',' -c is__cell_barcode -f 1 -H -o $'\t' \\
| (sort -rnk 2 || true) | !{filter} > cellmetadata/!{sampleid}.base
perl -ne 'print "!{sampletag}-$_"' cellmetadata/!{sampleid}.base > cellmetadata/!{sampleid}.info_tagged
perl -ane 'chomp; print "!{sampletag}-$F[0]\\t!{sampleid}-$F[0]\\n"' cellmetadata/!{sampleid}.base > cellmetadata/!{sampleid}.map
# Just the names of selected cells.
cut -f 1 cellmetadata/!{sampleid}.info_tagged | sort > cellmetadata/!{sampleid}.names
# Batch lists for demuxing
split -l !{cellbatchsize} cellmetadata/!{sampleid}.names c_c.
'''
}
/* fixme: this process also needs to provide a singlecell.csv file after changes
* to prepare_cr_single
*/
process prepare_mm { // merge multiplets
tag "mm-prep $cellbatchsize"
container 'quay.io/cellgeni/cellclusterer'
publishDir "${params.outdir}", pattern: 'cellmetadata_mm', mode: 'copy'
when: params.mermul && !params.muxfile
input:
file(f_cells) from ch_cellfile_param2.collect()
file(posbam) from ch_bamfile_param2.collect()
file(f_chromlen) from ch_chrom_length_mm.collect()
val cellbatchsize from params.cellbatchsize
val winsize from params.winsize
output:
file('cellmetadata/cells.tab') into ch_celltab_mm
file('cellmetadata/win.tab') into ch_wintab_mm
set val("mm"), file('fragmints2.tsv.gz'), file('c_c.*') into ch_demux_mm
shell:
'''
mkdir -p cellmetadata
#
extract-fragments !{posbam} fragmints.tsv.gz !{task.cpus}
#
merge-multiplets --chrom !{f_chromlen} --max-n !{params.nbcest} --debug --outfrg fragmints2.tsv.gz fragmints.tsv.gz cellmetadata/bc-mm.map
# Just the names of selected cells.
cut -f 2 cellmetadata/bc-mm.map | sort -u > cellmetadata/cell.names
# Tab file for mcx matrix loading
nl -v0 -nln -w1 < cellmetadata/cell.names > cellmetadata/cells.tab
# Batch lists for demuxing
split -l !{cellbatchsize} cellmetadata/cell.names c_c.
# Tab file for windows
ca_make_chromtab.pl !{winsize} !{f_chromlen} > cellmetadata/win.tab
'''
}
process join_muxfiles {
// todo fixme this process is activated somehow in single-sample mode,
// even with a draconian `when: false` directive.
// it then sends a tab file and this leads to disaster.
// Need to understand `when` + process triggering (asked Paolo about that once).
// IIRC the issue then was a child or grandchild popping up unexpectedly.
// In this case join_muxfiles is the (grand)child.
// Does toSortedList make it empty and acceptable?
// fixme: I've duplicated the when: from prepare_cr_mux here, but need proper understanding.
publishDir "${params.outdir}/cellmetadata", pattern: 'singlecell.tsv', mode: 'link'
publishDir "${params.outdir}/cellmetadata", pattern: 'tagmap.txt', mode: 'link'
when: (!params.mermul) && (!params.posbam) && (params.muxfile)
input:
file(fnames) from ch_cellnames_many_cr.toSortedList { just_name(it) }
file(fninfo) from ch_cellinfo_many_cr.toSortedList { just_name(it) }
file(fmap) from ch_tagmap_many_cr.toSortedList { just_name(it) }
output:
file('merged.tab') into ch_celltab_manymerged_cr
file('singlecell.tsv') into ch_cellfile_mux
file('tagmap.txt')
shell:
'''
cat !{fnames} > merged.names
nl -v0 -nln -w1 < merged.names > merged.tab
# fixme brittle header/data coupling.
echo -e "barcode\\ttotal\\tduplicate\\tchimeric\\tunmapped\\tlowmapq\\tmitochondrial\\tpassed_filters\\tcell_id\\tis__cell_barcode\\tTSS_fragments\\tDNase_sensitive_region_fragments\\tenhancer_region_fragments\\tpromoter_region_fragments\\ton_target_fragments\\tblacklist_region_fragments\\tpeak_region_fragments\\tpeak_region_cutsites" > singlecell.tsv
cat !{fninfo} >> singlecell.tsv
cat !{fmap} >> tagmap.txt
'''
}
/*
The merge-multiplet code was inserted in a clone of the preparation process.
For further excitement, we also cater for merging multiple 10x experiments.
below are all the merging Y-junctions for plumbing the channels to the same destination.
prepare_cr_single - pure cr code
prepare_mm - multiplet merging code
prepare_cr_mux - cr code supporting multiple experiments (not yet multipletted)
In these three processes there is quite a bit of duplication of code.
prepare_cr_mux is the main input mode;
for now prepare_cr_single is also maintained;
prepare_mm was tested a few times but not heavily used (cannot be used in mux mode).
*/
ch_celltab_manymerged_cr
.mix(ch_celltab_cr, ch_celltab_mm)
.into { ch_celltab; ch_celltab2; ch_celltab3; ch_celltab4; ch_celltab5 }
ch_wintab_many_cr.toSortedList{ just_name(it) }
.flatten().first()
.mix(ch_wintab_cr, ch_wintab_mm)
.into { ch_wintab; ch_wintab2; ch_wintab3; ch_wintab4 }
ch_fragfile_cr.map { fragf -> [ "crsingle", fragf ] }
.join(ch_demux_cr)
.mix(ch_demux_mm, ch_demux_many_cr) // fixme this block is a bit convoluted
.transpose()
.set { ch_demux_batch }
process sample_demux {
tag "sample-$sampletag batch-$batchtag"
input:
set val(sampletag), file(frags), file(cells) from ch_demux_batch.view{"sample_demux: $it"}
file(f_chromlen) from ch_chrom_length4.collect()
output:
file('*celldata/[ACGT][ACGT][ACGT][ACGT]/*.bed') into ch_demuxed
file('*celldata/mtx*.edges') into ch_all_edges
set val(sampletag), file('*celldata/mtx*.edges') into ch_sample_edges
shell:
batchtag = cells.toString() - 'c_c.'
thesampletag = sampletag
'''
dir=celldata
if [[ -n "!{thesampletag}" ]]; then
dir=!{thesampletag}-celldata
fi
mkdir -p $dir
chromfile=my.!{sampletag}.chromo.txt
cut -f 1 !{f_chromlen} > $chromfile
zcat !{frags} | samdemux.pl --chromofile=$chromfile --barcodefile=!{cells} --outdir=$dir --bucket --fragments --ntest=0 --fnedges=mtx.!{thesampletag}-!{batchtag}.edges --tag=!{thesampletag}
cd $dir
ls | grep -v "mtx" | while read beddir; do
cd $beddir
for bedfile in *; do
sort-bedfiles.sh $bedfile
done
cd ..
done
'''
}
// fixme noteme; this depends on bed suffix;
// does samdemux outputs sam suffix in possorted bam mode.
ch_demuxed
.flatMap()
.map { it -> it.toString() - ~/.*celldata\/[ACGT]{4}\// - ~/\.bed/ + '\t' + it.toString() }
.into { ch_cellpaths_clusterindex; ch_cellpaths_masterpeakcov }
process make_sample_matrix {
tag "$sampleid"
container 'quay.io/cellgeni/cellclusterer'
output:
file("${sampleid}.w2c.mcx") into ch_sample_join_matrix
file("${sampleid}.stats") into ch_sample_join_window
input:
set val(sampleid), file(fnedges) from ch_sample_edges.groupTuple(sort: true).view()
file('allcelltab') from ch_celltab4.collect()
file('wintab') from ch_wintab3.collect()
// fixme: write w2c directly (swap fields)
shell:
'''
export MCLXIOFORMAT=8
mtx=!{sampleid}.c2w.mcx
mtxtp=!{sampleid}.w2c.mcx
cat !{fnedges} | mcxload --stream-split -abc - -strict-tabc allcelltab -restrict-tabr wintab --write-binary -o $mtx
mcxi /$mtx lm tp /$mtxtp wm
mcx query -imx $mtxtp -tab wintab | tail -n +2 > !{sampleid}.stats
'''
}
process join_sample_matrix {
container 'quay.io/cellgeni/cellclusterer'
publishDir "${params.outdir}/win_matrix", pattern: 'window.tab', mode: 'link'
input:
file(w2c) from ch_sample_join_matrix.toSortedList { just_name(it) }
file(winstats) from ch_sample_join_window.toSortedList { just_name(it) }
file(wintab) from ch_wintab4.collect()
file(celltab) from ch_celltab5.collect()
val ntfs from params.ntfs
val ntfs_ofs from params.ntfs_ofs
output:
file('mmtx') into ch_load_mmtx2
file('window.tab')
/*
Need to create these outputs, as they are currently hardcoded in seurat script (fixme).
f_binary_mat
regions.names
cell.names
*/
shell:
'''
## ca_winsect rank-transforms the data.
ca_winsect.pl -1 !{wintab} !{winstats} > __winsel.stats
# All approaches: use ranks to mitigate sample-level expression differences.
# first approach: sum ranks, take top N lowest sumrank windows.
# This implies taking windows with high expression across samples.
if !{params.sumrank}; then
( set +o pipefail; cut -f 1,2 __winsel.stats | tail -n +!{ntfs_ofs} | head -n !{ntfs} | sort -nk 1 ) > window.tab
else
# Second approach try to find windows that are variably expressed.
# We use IQR approach (on the ranks) skewed towards the higher expressed (== lower rank) windows.
# This is experimental code. Will be either removed or made clean.
R --no-save <<EOC
wintab <- read.table("!{wintab}", as.is=T, colClasses=c("character", "character"))
N <- !{ntfs}
rd <- read.table("winsect.stats", header=T, row.names=1, as.is=T) # rank data
sumrank <- rd[, "Sumrank"]
names(sumrank) <- rownames(rd)
srk_names <- names(sort(sumrank))[1:N]
rd2 <- rd[,-1] # remove sumrank column
rd2[rd2 > 1*N] <- 1*N # truncate counts at 1N; to avoid high-rank windows dominating; point of interest/debate
# Given skewed quantile this probably does not have any impact.
win_choice <- apply(rd2, 1, function (x) { quantile(x, 0.6) - quantile(x, 0.1) })
iqr2_names <- names(sort(win_choice, decreasing=TRUE))[1:N]
sum(srk_names %in% iqr2_names)
write.table(wintab[ wintab[,"V2"] %in% iqr2_names, ], "window.tab", quote=F, row.names=F, col.names=F, sep="\t")
write.table(as.data.frame(win_choice), "window.quant", quote=F, row.names=T, col.names=F, sep="\t")
EOC
fi
for m in !{w2c}; do
# input has format <tag>.w2c.mcx
subfile=__${m%.w2c.mcx}.sub.mcx
mcxsubs -imx $m --from-disk -tab window.tab "dom(c, t()), out($subfile,wb)"
echo "done $m"
done
export MCLXIOFORMAT=8
# below constructs an mcxi command to add up a bunch of matrices.
# The construction looks horrible, the redeeming feature is that it is really fast
# and the constructed file is pretty simple.
(
echo 'mcxi <<EOC'
echo __*.sub.mcx | perl -ane '@G = map { "/$_ lm add\\n" } @F; $G[0] =~ s/ add//; print @G;'
echo tp /c2w.mcx wm pop /w2c.mcx wm
echo EOC
) > makematrix.sh
bash -e makematrix.sh
# ri: re-indexed.
mcxmap -imx w2c.mcx -make-mapc win.map -o w2c.ri.mcx
mcxmap -tab window.tab -map win.map -o win.ri.tab
cut -f 2 win.ri.tab > win.names
cut -f 2 !{celltab} > cell.names
>&2 echo "Producing matrixmarket format"
n_entries=$(mcx query -imx w2c.ri.mcx | tail -n +2 | datamash sum 2)
mkdir mmtx
ca_make_mmtx.sh -r win.names -c cell.names -m w2c.ri.mcx \\
-e $n_entries -t integer -o mmtx/filtered_window_bc_matrix.mmtx.gz
ln win.names filtered_win.txt
ln win.names mmtx/regions.names
ln cell.names filtered_bc.txt
ln cell.names mmtx/cell.names
'''
}
process make_big_matrix {
container 'quay.io/cellgeni/cellclusterer'
publishDir "${params.outdir}/cellmetadata", mode: 'link', pattern: 'cell2win.mcx', enabled: params.o_winmcx
input:
file all_edges from ch_all_edges.toSortedList{ just_name(it) }
file celltab from ch_celltab
file wintab from ch_wintab
output:
file 'cell2win.mcx' into ch_cell2win
set file('cell2win.mcx'), file(celltab), file(wintab) into ch_dump_big_matrix
shell:
'''
cat !{all_edges} | mcxload --stream-split -abc - -strict-tabc !{celltab} -restrict-tabr !{wintab} --write-binary -o cell2win.mcx
sleep 3
'''
}
process mmtx_big_matrix {
tag "raw_window_bc_matrix"
container 'quay.io/cellgeni/cellclusterer'
publishDir "${params.outdir}/win_matrix", mode: 'link'
when: !params.stopatcluster
output:
file 'raw_window_bc_matrix.mmtx.gz'
file 'raw_*.txt'
input:
set file('cell2win.mcx'), file('celltab'), file('wintab') from ch_dump_big_matrix
script:
'''
# fixme build new image with datamash
# n_entries=$(mcx query -imx cell2win.mcx | tail -n +2 | datamash sum 2
n_entries=$(mcx query -imx cell2win.mcx | tail -n +2 | perl -ane 'chomp; $sum+=$_; END { print $sum }')
ca_make_mmtx.sh -r wintab -c celltab -m cell2win.mcx -e $n_entries -t integer -o raw_window_bc_matrix.mmtx.gz
cp celltab raw_bc.txt
cp wintab raw_window.txt
'''
}
process filter_big_matrix {
tag "bed-mcx-mmtx"
container = 'quay.io/cellgeni/cellclusterer'
publishDir "$params.outdir/win_matrix", mode: 'link', pattern: 'other_publish/filtered*',
saveAs: { fname -> fname - ~/other_publish\// }
//when: false
input:
file('cells.tab') from ch_celltab2
file('win.tab') from ch_wintab2
file('cell2win.mcx') from ch_cell2win
val ntfs from params.ntfs
output:
file('outputs') into ch_load_mmtx
file('other_publish/filtered*')
shell:
'''
# fixme, filtered_window_bc_matrix is still an implicit output from ca_top_region.sh
mkdir matrix
mkdir outputs
mkdir other_publish
cd matrix
ca_top_region.sh \\
-c ../cells.tab \\
-w ../win.tab \\
-m ../cell2win.mcx \\
-n !{ntfs} \\
-C ../outputs/filtered_cell.stats \\
-W ../outputs/win.stats \\
-R ../outputs/regions.names \\
-X ../outputs/filtered_window_bc_matrix.mmtx.gz \\
-N ../outputs/cell.names
ln ../outputs/regions.names ../other_publish/filtered_win.txt
ln ../outputs/cell.names ../other_publish/filtered_bc.txt
ln ../outputs/filtered_window_bc_matrix.mmtx.gz ../other_publish
ln ../outputs/filtered_cell.stats ../other_publish
'''
}
process seurat_clustering {
tag "seurat2020"
container = 'quay.io/cellgeni/cellclusterer'
publishDir "$params.outdir/qc", mode: 'link'
when: params.usecls == '__seurat__'
input:
val nclades from params.nclades
val sampleid from params.sampleid
val npcs from params.npcs
file('singlecell.csv') from ch_cellfile_single.mix(ch_cellfile_mux).collect()
file('mmtx') from ch_load_mmtx2
output:
file('seurat-clades.tsv') into ch_seurat_clades
file('seurat.pdf')
file('seurat*.rds')
shell:
noclip = params.clip_SVD ? "" : "--noclip"
'''
R --no-save --args !{noclip} < !{baseDir}/bin/ca_seurat_clades.R
'''
}
process episcanpy_clustering {
tag "episcanpy2021"
container = '/nfs/cellgeni/singularity/images/cellatac-episcanpy_v0.7.sif'
publishDir "$params.outdir/qc", mode: 'link'
when: params.usecls == '__episcanpy__'
input:
val nclades from params.nclades
val sampleid from params.sampleid
val npcs from params.npcs
file('singlecell.csv') from ch_cellfile_single.mix(ch_cellfile_mux).collect()
file('mmtx') from ch_load_mmtx2
output:
file('Leiden.tsv') into ch_episcanpy_clades
file('episcanpy_*.pdf')
file('figures/*.png')
file('adata*.h5ad')
shell:
'''
ca_episcanpy.py -m 0.515 -f 120000 -p !{task.cpus}
'''
}
process cusanovich_clustering {
tag "cusanovich2018"
container = 'quay.io/cellgeni/cellclusterer'
publishDir "$params.outdir/qc", mode: 'link', pattern: 'cus.*'
when: params.usecls == '__cusanovich__'
input:
val nclades from params.nclades
val sampleid from params.sampleid
val npcs from params.npcs
file('inputs') from ch_load_mmtx
output:
file('cus.obj.*.clades.tsv') into ch_cusanovich_clades
file('cus.qc.*.pdf')
file('cus.obj.*')
shell:
'''
ln -s !{baseDir}/bin/cusanovich2018_lib.r .
R --slave --quiet --no-save --args \\
--nclades=!{nclades} \\
--npcs=!{npcs} \\
--matrix=inputs/filtered_window_bc_matrix.mmtx.gz \\
--regionnames=inputs/regions.names \\
--cellnames=inputs/cell.names \\
--winstats=inputs/win.stats \\
--cellstats=inputs/filtered_cell.stats \\
--sampleid=!{sampleid} \\
< !{baseDir}/bin/cluster_cells_cusanovich2018.R
'''
}
ch_usercls.mix(ch_cusanovich_clades, ch_seurat_clades, ch_episcanpy_clades).set{ ch_clustering }
process clusters_index {
tag "${cladefile}"
when: !params.stopatcluster
input:
file metafile from ch_cellpaths_clusterindex.collectFile(name: 'cov.inputs', newLine: true)
file cladefile from ch_clustering
output:
file('clusinfo.cl*') into ch_clusinfo
shell:
'''
splitbyclus.pl !{cladefile} !{metafile} clusinfo.cl
'''
}
ch_clusinfo.flatMap().map { [ it.baseName - 'clusinfo.cl', it ] }.into { ch_cat_cluster_inputs; ch_per_cluster_inputs }
// This process derives the cluster tag from the file name. We only need to do this once,
// in subsequent processes we can pass this tag on.
process clusters_merge_inputs {
tag "${clustag}"
publishDir "${params.outdir}/clusdef", mode: 'link', enabled: params.o_clusdef
input:
set val(clustag), file(clusmetafile) from ch_cat_cluster_inputs
output:
set val(clustag), file("cluster.${clustag}.b??") into ch_clustermacs
// Fixme noteme b?? captures both bed and bam. A bit fragile; but bam is
// probably not needed anymore, and a new format unlikely. Still, noted.
shell:
'''
cut -f 2 !{clusmetafile} | tr '\\n' '\\0' > clusmetafile0
sort -m -k 1,1V -k 2,2n -k 3,3n --files0-from=clusmetafile0 | perl -pe 'chomp;$_.="\\t+\\n";' > cluster.!{clustag}.bed
'''
}
process clusters_macs2 {
tag "${clustag}"
container = 'fooliu/macs2'
publishDir "${params.outdir}/macs2", mode: 'link'
when: !params.stopatcluster
input:
set val(clustag), file(clusregionfile) from ch_clustermacs
output:
set file('*.xls'), file('*.bed')
set val(clustag), file('*.narrowPeak') into ch_combine_clusterpeaks, ch_per_cluster_analysis
shell:
'''
# source /nfs/cellgeni/miniconda3/bin/activate py2
outdir=macs2.!{clustag}.out
mkdir -p $outdir
macs2 callpeak -t !{clusregionfile} -g !{params.macs2_genomesize} -n !{clustag} --outdir . -f AUTO \\
--nomodel \\
--shift -100 \\
--extsize 200 \\
2> macs2.!{clustag}.log
'''
}
process peaks_make_masterlist {
tag "masterlist"
container = 'quay.io/cellgeni/cellclusterer'
publishDir "${params.outdir}/peaks", mode: 'link'
when: params.mergepeaks
input:
file np_files from ch_combine_clusterpeaks.map { it[1] }.toSortedList { just_name(it) }
file f_chromlen from ch_chrom_length.collect()
output:
file('allclusters_peaks_sorted.bed')
file('allclusters_masterlist_sps.bed') into (ch_masterbed_sps, ch_masterbed_sps2)
// NOTE may want to encode some cluster parameters in the file name? Also preceding processes
shell:
'''
cat !{np_files} | cut -f 1-3 | sort -k1,1V -k2,2n -k3,3n > allclusters_peaks_sorted.bed
# use -d -1 to avoid mergeing regions overlapping only 1bp
# use -d 0 (default) so that overlapping and/or book-ended features are merged.
bedtools merge -i allclusters_peaks_sorted.bed -d 0 > allclusters_masterlist.bed
# Note moved this step from P6 to here. sps == sample-pos-sorted, sorted according to sample bam.
bedtools sort -faidx !{f_chromlen} -i allclusters_masterlist.bed > allclusters_masterlist_sps.bed
'''
}
process cells_masterlist_coverage {
tag "${celldef_list}"
container = 'quay.io/cellgeni/cellclusterer'
cache 'lenient'
// publishDir "${params.outdir}/mp_counts"
// A lot of files. This is simply the raw input for the cell/peak matrix.
input:
file(masterbed_sps) from ch_masterbed_sps.collect()
file f_chromlen from ch_chrom_length2.collect()
file(celldef_list) from ch_cellpaths_masterpeakcov
.toSortedList { just_name(it) }
.flatMap()
.collate(params.cellbatchsize)
.map { it.join('\n') + '\n' }
output:
file('*.mp.txt') into ch_cellpeak
shell:
'''
while read cellname celldef; do
# celldef2=ttt.$cellname
# sort -k 1,1V -k 2,2n -k 3,3n $celldef > celldef2
bedtools coverage \\
-a !{masterbed_sps} \\
-b $celldef -sorted -header \\
-g !{f_chromlen} | awk -F"\t" '{if($4>0) print $0}' > $cellname.mp.txt
done < !{celldef_list}
'''
}
process make_subset_peakmatrix {
tag "${clustag} subset-peak-cell-matrix"
container = 'quay.io/cellgeni/cellclusterer'
publishDir "${params.outdir}/clus_peak_matrix", mode: 'link'
when: params.perclusterpeaks
input:
set val(clustag), file(clusmetafile), file(npeakfile) from ch_per_cluster_inputs.join(ch_per_cluster_analysis)
file f_chromlen from ch_chrom_length3.collect()
output:
// file('cell2peak.gz')
file('*.peaks_bc_matrix.mmtx.gz') optional true
file('*.bc_peaks_matrix.mmtx.gz') optional true
file('*.peaks.txt') optional true
file('*.bc.txt') optional true
file('*.macs2-empty') optional true
shell:
'''
if [[ ! -s !{npeakfile} ]]; then
> !{clustag}.macs2-empty
exit 0
fi
cut -f 1-3 !{npeakfile} | sort -k1,1V -k2,2n -k 3,3n > clusterpeak.bed
bedtools sort -faidx !{f_chromlen} -i clusterpeak.bed > clusterpeak_sps.bed
# ^ similar to peaks_makemasterlist; we only need the selection of columns and sorting
> peak.inputs
> cellnames.txt
while read cellname celldef; do
bedtools coverage \\
-a clusterpeak_sps.bed \\
-b $celldef -sorted -header \\
-g !{f_chromlen} | awk -F"\t" '{if($4>0) print $0}' > $cellname.mp.txt
echo "$cellname.mp.txt" >> peak.inputs
echo "$cellname" >> cellnames.txt
# construct this file within-process.
done < !{clusmetafile}
ca_peak_matrix.sh -c cellnames.txt -p clusterpeak_sps.bed -i peak.inputs -x "!{clustag}."
# ^ similar to make_master_peakmatrix
'''
}
process make_master_peakmatrix {
tag "peak-cell-matrix"
container = 'quay.io/cellgeni/cellclusterer'
publishDir "${params.outdir}/peak_matrix", mode: 'link'
when: !params.stopatcluster
input:
file metafile from ch_cellpeak.flatMap { ls -> ls.collect{ it.toString() } }.collectFile(name: 'peak.inputs', newLine: true)
file('masterpeak.bed') from ch_masterbed_sps2.collect()
file('cells.tab') from ch_celltab3.collect()
output:
// file('cell2peak.gz')
file('peaks_bc_matrix.mmtx.gz')
file('bc_peaks_matrix.mmtx.gz')
file('peaks.txt')
file('bc.txt')
shell:
'''
cut -f 2 cells.tab > cellnames.txt
# fixme: hardcoded output names.
ca_peak_matrix.sh -c cellnames.txt -p masterpeak.bed -i peak.inputs
'''
}