-
Notifications
You must be signed in to change notification settings - Fork 3
/
split_scalebio_atac_fastqs.sh
executable file
·363 lines (295 loc) · 17.3 KB
/
split_scalebio_atac_fastqs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
#!/bin/bash
#
# Copyright (C) 2020-2024 - Gert Hulselmans
#
# Purpose:
# Split ScaleBio ATAC FASTQ R1 and R2 files and corrected tagmentation
# barcode file per tagmentation barcode.
set -e
set -o pipefail
decompress_fastq_cat_cmd='cat';
decompress_fastq_zcat_cmd='zcat';
decompress_fastq_igzip_cmd='igzip -c -d';
decompress_fastq_zstd_cmd='zstd -c -d';
# Number of threads to use to compress each FASTQ output file.
compress_fastq_threads="${compress_fastq_threads:-4}";
# Gzip compression level for bgzip, pigz and gzip.
compress_fastq_level="${compress_fastq_level:-6}";
# Gzip compression level for igzip (3 is maximum).
compress_fastq_igzip_level="3";
compress_fastq_bgzip_cmd="bgzip -@ ${compress_fastq_threads} -l ${compress_fastq_level} -c";
compress_fastq_pigz_cmd="pigz -p ${compress_fastq_threads} -${compress_fastq_level} -c";
compress_fastq_igzip_cmd="igzip -${compress_fastq_igzip_level} -c";
compress_fastq_gzip_cmd="gzip -${compress_fastq_level} -c";
split_scalebio_atac_fastqs () {
local fastq_R1_filename="${1}";
local fastq_R2_filename="${2}";
local corrected_bc_filename="${3}";
local fastq_output_dir="${4}";
local compress_fastq_cmd="${5:-bgzip}";
if [ ${#@} -lt 4 ] ; then
printf '\nUsage:\n';
printf ' split_scalebio_atac_fastqs \\\n';
printf ' fastq_R1 \\\n';
printf ' fastq_R2 \\\n';
printf ' corrected_bc_filename \\\n';
printf ' fastq_output_dir \\\n';
printf ' <compress_fastq_cmd [bgzip|pigz|gzip|uncompressed]> \\\n\n';
printf 'Purpose: Split ScaleBio ATAC FASTQ R1 and R2 files and corrected tagmentation\n'
printf ' barcode file per tagmentation barcode.\n\n';
printf 'Parameters:\n';
printf ' - fastq_R1: FASTQ R1 filename.\n';
printf ' - fastq_R2: FASTQ R2 filename.\n';
printf ' - corrected_bc_filename: File with corrected barcodes.\n';
printf ' - fastq_output_dir: Output dir for FASTQ output file(s).\n';
printf ' - compress_fastq_cmd:\n';
printf ' - Compression program to use for output FASTQ files:\n';
printf " - \"bgzip\": '%s' (default)\n" "${compress_fastq_bgzip_cmd}";
printf " - \"pigz\": '%s'\n" "${compress_fastq_pigz_cmd}";
printf " - \"igzip\": '%s' (very fast, low compression)\n" "${compress_fastq_igzip_cmd}";
printf " - \"gzip\": '%s'\n" "${compress_fastq_gzip_cmd}";
printf ' - "uncompressed": Write uncompressed FASTQ files.\n';
printf ' - full custom command\n\n';
printf ' To change number of compression threads:\n';
printf ' - export compress_fastq_threads="%s"\n\n' "${compress_fastq_threads}";
printf ' To change compression level:\n';
printf ' - export compress_fastq_level="%s"\n\n' "${compress_fastq_level}";
return 1;
fi
local fastq_R1_basename="${fastq_R1_filename##*/}";
# Get current extglob setting.
local extglob_setting=$(shopt -p extglob);
# Enable extglob setting.
shopt -s extglob;
# Try to extract the suffix after the sample name from the FASTQ file name,
# so the same suffix can be used for the output FASTQ files.
local fastq_output_basename_prefix="${fastq_R1_basename%_S+([0-9])_R1?(_00[1-9]).fastq?(.gz)}"
local fastq_output_basename_prefix="${fastq_output_basename_prefix%_R1?(_00[1-9]).fastq?(.gz)}"
# Restore extglob setting.
eval ${extglob_setting};
if [ "${#fastq_output_basename_prefix}" -eq "${#fastq_R1_basename}" ] ; then
# FASTQ sample name could not be extracted. Set FASTQ R1 output prefix manually.
local fastq_output_R1_suffix="_R1.fastq.gz";
else
# FASTQ sample name and suffix could be extracted.
local fastq_output_R1_suffix="${fastq_R1_basename:${#fastq_output_basename_prefix}}";
fi
# Create output suffixes for FASTQ R2 and R3, based on FASTQ R1 suffix.
local fastq_output_R2_suffix="${fastq_output_R1_suffix/_R1/_R2}";
local fastq_output_R3_suffix="${fastq_output_R1_suffix/_R1/_R3}";
if type igzip > /dev/null 2>&1 ; then
# Decompress gzipped FASTQ files with igzip if installed (6x faster than gzip).
local decompress_fastq_gzipped_cmd="${decompress_fastq_igzip_cmd}";
else
# Decompress gzipped FASTQ files with gzip.
local decompress_fastq_gzipped_cmd="${decompress_fastq_zcat_cmd}";
fi
# Detect if input FASTQ files are compressed (gzip/zstd) or not.
if [ "${fastq_R1_filename}" != "${fastq_R1_filename%.gz}" ] ; then
local decompress_R1_fastq_cmd="${decompress_fastq_gzipped_cmd}";
elif [ "${fastq_R1_filename}" != "${fastq_R1_filename%.zst}" ] ; then
local decompress_R1_fastq_cmd="${decompress_fastq_zstd_cmd}";
else
local decompress_R1_fastq_cmd="${decompress_fastq_cat_cmd}";
fi
if [ "${fastq_R2_filename}" != "${fastq_R2_filename%.gz}" ] ; then
local decompress_R2_fastq_cmd="${decompress_fastq_gzipped_cmd}";
elif [ "${fastq_R2_filename}" != "${fastq_R2_filename%.zst}" ] ; then
local decompress_R2_fastq_cmd="${decompress_fastq_zstd_cmd}";
else
local decompress_R2_fastq_cmd="${decompress_fastq_cat_cmd}";
fi
# Detect if input corrected barcode file is compressed (gzip/zstd) or not.
if [ "${corrected_bc_filename}" != "${corrected_bc_filename%.gz}" ] ; then
local decompress_corrected_bc_file_cmd="${decompress_fastq_gzipped_cmd}";
elif [ "${corrected_bc_filename}" != "${corrected_bc_filename%.zst}" ] ; then
local decompress_corrected_bc_file_cmd="${decompress_fastq_zstd_cmd}";
else
local decompress_corrected_bc_file_cmd="${decompress_fastq_cat_cmd}";
fi
case "${compress_fastq_cmd}" in
bgzip)
local compress_fastq_cmd="${compress_fastq_bgzip_cmd}";;
pigz)
local compress_fastq_cmd="${compress_fastq_pigz_cmd}";;
igzip)
local compress_fastq_cmd="${compress_fastq_igzip_cmd}";;
gzip)
local compress_fastq_cmd="${compress_fastq_gzip_cmd}";;
uncompressed)
# Write uncompressed FASTQ files.
fastq_output_R1_suffix="${fastq_output_R1_suffix%.gz}";
fastq_output_R2_suffix="${fastq_output_R2_suffix%.gz}";
fastq_output_R3_suffix="${fastq_output_R3_suffix%.gz}";
local compress_fastq_cmd="cat";;
esac
if ! type mawk > /dev/null 2>&1 ; then
printf 'Error: "mawk" not found or executable.\n';
return 1;
fi
if ! type "${decompress_corrected_bc_file_cmd%% *}" > /dev/null 2>&1 ; then
printf 'Error: "%s" not found or executable.\n' "${decompress_corrected_bc_file_cmd%% *}";
return 1;
fi
if ! type "${compress_fastq_cmd%% *}" > /dev/null 2>&1 ; then
printf 'Error: "%s" not found or executable.\n' "${compress_fastq_cmd%% *}";
return 1;
fi
mawk \
-v fastq_R1_filename="${fastq_R1_filename}" \
-v fastq_R2_filename="${fastq_R2_filename}" \
-v corrected_bc_filename="${corrected_bc_filename}" \
-v fastq_output_prefix="${fastq_output_dir%/}/${fastq_output_basename_prefix}" \
-v fastq_output_R1_suffix="${fastq_output_R1_suffix}" \
-v fastq_output_R2_suffix="${fastq_output_R2_suffix}" \
-v fastq_output_R3_suffix="${fastq_output_R3_suffix}" \
-v decompress_R1_fastq_cmd="${decompress_R1_fastq_cmd}" \
-v decompress_R2_fastq_cmd="${decompress_R2_fastq_cmd}" \
-v decompress_corrected_bc_file_cmd="${decompress_corrected_bc_file_cmd}" \
-v compress_fastq_cmd="${compress_fastq_cmd}" \
'
BEGIN {
read_fastq_R1_cmd = decompress_R1_fastq_cmd " " fastq_R1_filename;
read_fastq_R2_cmd = decompress_R2_fastq_cmd " " fastq_R2_filename;
read_corrected_bc_file_cmd = decompress_corrected_bc_file_cmd " " corrected_bc_filename;
partial_write_fastq_cmd = compress_fastq_cmd " > " fastq_output_prefix;
read_name_corrected_bc = "";
corrected_bc_line = "";
fastq_line_number = 0;
corrected_bc_line_number = 0;
read_len_R1_prev = -1;
read_len_R2_prev = -1;
read_len_corrected_bc_prev = -1;
# Read FASTQ R1 file (which contains read 1).
while ( (read_fastq_R1_cmd | getline fastq_R1_line) > 0 ) {
fastq_line_number += 1;
fastq_part = fastq_line_number % 4;
# Read FASTQ R2 file (which contains read 2).
if ( (read_fastq_R2_cmd | getline fastq_R2_line) > 0 ) {
if ( fastq_part == 1 ) {
# Find first space position (0 if no comment found) in read name from all input FASTQ files.
read_name_R1_space_pos = index(fastq_R1_line, " ");
read_name_R2_space_pos = index(fastq_R2_line, " ");
# Extract read name from all input FASTQ files.
if (read_name_R1_space_pos > 0) {
read_name_R1 = substr(fastq_R1_line, 2, read_name_R1_space_pos - 2);
} else {
read_name_R1 = substr(fastq_R1_line, 2);
}
if (read_name_R2_space_pos > 0) {
read_name_R2 = substr(fastq_R2_line, 2, read_name_R2_space_pos - 2);
} else {
read_name_R2 = substr(fastq_R2_line, 2);
}
read_len_R1 = length(read_name_R1);
if ( read_len_R1 != read_len_R1_prev ) {
# Try to remove CB and UMI from end of read name, if current
# read name length is different from the previous one.
sub(/:[ACGTN+]+$/, "", read_name_R1);
read_len_R1 = length(read_name_R1);
read_len_R1_prev = read_len_R1;
}
read_len_R2 = length(read_name_R2);
if ( read_len_R2 != read_len_R2_prev ) {
# Try to remove CB and UMI from end of read name, if current
# read name length is different from the previous one.
sub(/:[ACGTN+]+$/, "", read_name_R2);
read_len_R2 = length(read_name_R2);
read_len_R2_prev = read_len_R2;
}
# Check if read names match between both FASTQ files.
if ( read_name_R1 != read_name_R2 ) {
print "Error: Read name R1 (\"" read_name_R1 "\") and read name R2 (\"" read_name_R2 "\") are not paired properly (line number: " fastq_line_number ").";
exit(1);
}
# Keep getting the next corrected barcode line till the read name matches with the
# read name in the FASTQ files.
# This allows to add corrected barcodes after quality trimming R1 and R2 FASTQ files,
# while the barcode correction was done on the original barcode FASTQ file (which
# can have more reads as none were removed as can happen during quality trimming).
while ( read_name_corrected_bc != read_name_R1 ) {
if ( (read_corrected_bc_file_cmd | getline corrected_bc_line) > 0 ) {
corrected_bc_line_number += 1;
# Split corrected barcode line on spaces:
# - read name: strip "@" from the start.
# - corrected barcode info as SAM tags separated by "\t":
# CR:Z:raw_bc\tCY:Z:raw_bc_qual\tCB:Z:corrected_bc_seq
# Find first space position (0 if no comment found) in read name from all input FASTQ files.
read_name_corrected_bc_space_pos = index(corrected_bc_line, " ");
# Extract read name from all input FASTQ files.
if (read_name_corrected_bc_space_pos > 0) {
read_name_corrected_bc = substr(corrected_bc_line, 2, read_name_corrected_bc_space_pos - 2);
corrected_bc_sam_tags = substr(corrected_bc_line, read_name_corrected_bc_space_pos + 1);
} else {
read_name_corrected_bc = substr(corrected_bc_line, 2);
corrected_bc_sam_tags = "";
}
read_len_corrected_bc = length(read_name_corrected_bc);
if ( read_len_corrected_bc != read_len_corrected_bc_prev ) {
# Try to remove CB and UMI from end of read name, if current
# read name length is different from the previous one.
sub(/:[ACGTN+]+$/, "", read_name_corrected_bc);
read_len_corrected_bc = length(read_name_corrected_bc);
read_len_corrected_bc_prev = read_len_corrected_bc;
}
# Split corrected barcode info on "\t".
n_corrected_bc_sam_tags = split(corrected_bc_sam_tags, corrected_bc_sam_tags_array, "\t");
if (n_corrected_bc_sam_tags == 3) {
cr_sam_tag = corrected_bc_sam_tags_array[1];
cy_sam_tag = corrected_bc_sam_tags_array[2];
tb_sam_tag = corrected_bc_sam_tags_array[3];
if ( cr_sam_tag !~ /^cr:Z:/ ) {
print "Error: Expected \"cr:Z:\" instead of \"" substr(cr_sam_tag, 1, 5) "\" for uncorrected barcode sequence with read name (\"" read_name_corrected_bc "\") (line number: " corrected_bc_line_number ").";
exit(1);
}
if ( cy_sam_tag !~ /^cy:Z:/ ) {
print "Error: Expected \"cy:Z:\" instead of \"" substr(cy_sam_tag, 1, 5) "\" for uncorrected barcode quality with read name (\"" read_name_corrected_bc "\") (line number: " corrected_bc_line_number ").";
exit(1);
}
if ( cr_sam_tag !~ /^cr:Z:/ ) {
print "Error: Expected \"tb:Z:\" instead of \"" substr(cr_sam_tag, 1, 5) "\" for corrected tagmentation barcode with read name (\"" read_name_corrected_bc "\") (line number: " corrected_bc_line_number ").";
exit(1);
}
# Extract 10x/HyDrop ATAC sequence and quality and ScaleBio ATAC tagmentation barcode.
sequence_CB = substr(cr_sam_tag, 6);
quality_CB = substr(cy_sam_tag, 6);
tagmentation_barcode = substr(tb_sam_tag, 6);
} else {
# No corrected tagmentation barcode found.
tagmentation_barcode = "";
}
}
}
} else if ( fastq_part == 2 ) {
# Sequence lines.
# Store sequence info from R1 and R2 for later use.
sequence_R1 = fastq_R1_line;
sequence_R2 = fastq_R2_line;
} else if ( fastq_part == 0 ) {
# Quality lines.
# Only write read if a corrected tagmentation barcode was found.
if ( tagmentation_barcode != "" ) {
# Keep track of seen tagmentation barcodes, so filehandles can be closed later.
tagmentation_barcodes[tagmentation_barcode] += 1;
# Write the full FASTQ record to the R1 and R2 output FASTQ file with barcode info in the read name comments.
# When write_fastq_R1_cmd and write_fastq_R2_cmd are the same, an interleaved FASTQ fille will be written.
print "@" read_name_R1 "\n" sequence_R1 "\n+\n" fastq_R1_line | partial_write_fastq_cmd "_" tagmentation_barcode fastq_output_R1_suffix;
print "@" read_name_corrected_bc "\n" sequence_CB "\n+\n" quality_CB | partial_write_fastq_cmd "_" tagmentation_barcode fastq_output_R2_suffix;
print "@" read_name_R2 "\n" sequence_R2 "\n+\n" fastq_R2_line | partial_write_fastq_cmd "_" tagmentation_barcode fastq_output_R3_suffix;
}
}
}
}
# Close open file handles.
close(read_fastq_R1_cmd);
close(read_fastq_R2_cmd);
close(read_corrected_bc_file_cmd);
for ( tagmentation_barcode in tagmentation_barcodes ) {
close(partial_write_fastq_cmd "_" tagmentation_barcode fastq_output_R1_suffix);
close(partial_write_fastq_cmd "_" tagmentation_barcode fastq_output_R2_suffix);
close(partial_write_fastq_cmd "_" tagmentation_barcode fastq_output_R3_suffix);
}
}'
return $?
}
split_scalebio_atac_fastqs "${@}";