-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtemplate.config.yaml
421 lines (382 loc) · 20.4 KB
/
template.config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
###### BEERS CONFIGURATION ######
# The following file configures BEERS 2.
#
# To set up this config for your BEERS2 run, please go through
# all items marked with a TODO value. These contain file paths
# that have to be set to use your data.
# See also the config/baby.config.yaml file for a runnable example.
# The seed value initializes the random number generator
# so that runs are reproducible. Any integer works.
seed: 100
output:
# TODO: configure which output types you wish to have generated
# Output types, set to True to generate these data.
# The results appear in the 'results' directory
# FastQ Files are named as S1_L8_R2.fastq for example
# for sample 1, lane 8, read 2.
# Moreover, an 'unidentified' file is produced for each sample
# that contains reads that originated from that sample but whose
# demultiplexing step failed due to errors in the barcode reads.
output_fastq: true
# Set to true for sam/bam files containing the true alignments to
# the reference genome. Note that such alignments may no longer
# be replicable even in theory from just the reads, for exampe if
# most or all of the read is a PolyA tail and so cannot be aligned.
# This file still indicates where it came from
# Output is in the 'results' directory and named like S1_L8.sam
# for sample 1, lane 8. Like fastq, also has the potential for
# unidentified reads, stored in separate files.
output_sam: true
output_bam: false
# Set to true to receive full logs of intermediate steps
# These can be very large for a full-sized BEERS run (100s of GB)
# since they document every intermediate molecule throughout library prep
full_logs: false
global_config:
# global configuration contains configuration values that are available to all
# steps of the Library Prep and Sequencing Pipeline.
samples:
# Specify sample-specific information
# Samples are given identfiers, which are strings like '1' or '2'
#
# TODO: for each sample you want to generate data from, enter it below
# and provide barcodes for it.
'1':
# Sample one configuration
barcodes:
# Barcodes are used by AdapterLigationStep
# Each sample should have unique (i5, i7) pair of barcodes
# These get sequenced to determine which sample the read came from
# Typical examples of these can be founda at:
# https://support-docs.illumina.com/SHARE/AdapterSeq/Content/SHARE/AdapterSeq/TruSeq/UDIndexes.htm
i5: AGCGCTAG
i7: AACCGCGG
'2':
# Sample 2 configuration, same as above
barcodes:
i5: GATATCGA
i7: TTATAACC
molecule_maker_parameters:
# BEERS has two different methods for generating molecules
# Either they can be provided molecule-by-molecule in the molecule file
# Or they can be generated on demand using CAMPAREE output sample directories
# that contain all the information to synthesize random molecules
# These options affect this second on-the-fly molecules.
# The range of polyA tails to generate. Selected uniformly within this range.
min_polyA_tail_length: 50
max_polyA_tail_length: 250
resources:
# Resoruces contain general-use information
# Adapters are added to the molecules in the AdapterLigationStep
# They are also read in the sequencing by synthesis step where they
# are used to initiate transcription.
# Each adapter flanks the corresponding sample barcode, which all flanks the original molecule
# So the molecule ends up looking like (5' to 3'):
# (pre_i5_adapter) (i5) (post_i5_adapter) (molecule sequence) (pre_i7_adapter) (i7) (post_i7_adapter)
# These ones have been obtained from:
# https://support-docs.illumina.com/SHARE/AdapterSeq/Content/SHARE/AdapterSeq/TruSeq/UDIndexes.htm
pre_i5_adapter: AATGATACGGCGACCACCGAGATCTACAC
post_i5_adapter: ACACTCTTTCCCTACACGACGCTCTTCCGATCT
pre_i7_adapter: GATCGGAAGAGCACACGTCTGAACTCCAGTCAC
post_i7_adapter: ATCTCGTATGCCGTCTTCTGCTTG
# Reference genome FASTA is used to generate output BAM files, which require chromosome lengths.
# Use the same FASTA as the BEERS input (such as from CAMPAREE) was
# generated from.
# TODO: this needs to be set to a fasta of the genome you generate data for,
# if generating SAM/BAM files (default)
reference_genome_fasta: /path/to/reference/genome.fa
library_prep_pipeline:
# The library prep pipeline takes input molecules and walks them through a library prep
# to get them ready for sequencing.
input:
# Input the Library Prep step, which is also the input to BEERS in general
# The results of this will be fed to the sequencing pipeline to generate the output
# There are TWO ways to provide input to BEERS.
# First, we can provide molecule files, such as output by CAMPAREE
# These specify exactly the molecules to prepare and sequence
# Specify these by a path to a directory containing the molecule packets
# in either .txt format (see README for details) or a pickle file from CAMPAREE
#
# /my/directory/path:
# - sample1:
# molecule_file1.txt
# molecule_file2.pickle
# ...
# - sample2:
# molecule_file1.txt
# molecule_file2.pickle
# ...
directory_path: /path/to/molecule_packetes/ # TODO: set this value
# Second, we can provide CAMPAREE sample directories
# Which instead contain the information to generate new molecules
# based off a sample's variants, gene, isoform, and allele distributions
from_distribution_data:
# Specify, for each sample, what molecule data to generate (if any)
# Samples here should correspond to the 'samples' entry above
#
# TODO: for each sample specified in the 'samples' section above
# indicate how many packets and molecules to generate for it
# from the CAMPAREE output distributions. If only using
# molecule files as input, set num_packets: 0 for all samples
'1':
# Number of packets to generate.
# If 0, don't generate any and just use the provided molecule files
num_packets: 1
# Number of molecules to generate per packet.
# Increasing this number increases the amount of work done per job.
# Generally want to keep this under 200000 to not run out of memory.
# If memory usage errors occur, reducing this can help.
num_molecules_per_packet: 10000
# Directory containing CAMPAREE output for the sample
# This data is used to generate the resulting molecules
sample_data_directory: /path/to/CAMPAREE_output/sample1/ # TODO: set this path
'2':
num_molecules_per_packet: 10000
num_packets: 1
sample_data_directory: /path/to/CAMPAREE_output/sample2/ # TODO: set this path
steps:
# The 'steps' list specifies all the steps that the Library Prep Pipeline
# will run. To specify a custom step, include it here.
# Python files with the corresponding code are looked up according to
# the pattern {module_name}.{class_name} pattern, so that the
# {module_name}.py file will be checked for {class_name} class
- step_name: polya_step.PolyAStep
# PolyA selection step removes RNA that does not have an poly A tail
# of sufficient length. Enriches for mRNA.
parameters:
# Chance per base of fragmentation prior to selection
# Increasing this above 0 induces a 3' bias.
# A value of 0.001 induces a reasonably high 3' bias
breakpoint_prob_per_base: 0.0
# Probability of retention is computed as a value between
# min_retention_prob and max_retention_prob
# For every base of the polyA tail beyond min_polya_tail_length
# the probability increases linearly from min_retention_prob
# by length_retention_prob, up to a max of max_polya_tail_length.
max_retention_prob: 1.0
min_retention_prob: 0.0
min_polya_tail_length: 40
length_retention_prob: 0.05
- step_name: fragment_step.FragmentStep
# Fragmentation step breaks each molecule into pieces
parameters:
# Fragmentation methods are available.
# 'uniform' fragments at each base equally and is the default
method: uniform
# This 'rate' is the rate parameter for an exponential distribution
# which determines the time it takes until a molecule to fragments.
# Molecules which would take longer then 'runtime' to fragment, do not fragment.
# Molecules may also fragment multiple times if rate is high enough.
rate: 0.005
runtime: 1
# Since fragmentation generates many very small fragments that will
# be quickly lost in the following steps, we have the option to
# drop those fragments immediately. Setting this value can significantly
# decrease runtime and memory requirements.
min_frag_size: 20
# If method == 'beta', then the fragmentation sites can be biased within
# the molecule, according to a beta distribution.
# NOTE: using 'beta' can generate unusual coverage plots since there are
# significant edge effects around the transcript. However, it can be used
# to give a more realistic fragment distribution.
#
# If using 'beta', you must also specify the following:
#
# The parameters for the beta distribution. Set these so that
# A = B > 0 to bias towards fragmentation in the middle of the fragment,
# with larger values biasing further towards the middle.
# If A > B, then would bias towards the 5' end instead.
# beta_A: 3.0
# beta_B: 3.0
#
# And the N factor allows a non-linear fragmentation rate depending
# upon the length of the molecule. Values >1 indicate that longer molecules
# are more likely to fragment than smaller ones, biasing towards larger
# fragment sizes.
# beta_N: 2.0
- step_name: first_strand_synthesis_step.FirstStrandSynthesisStep
# First Strand Synthesis performs (hexamer) priming on the RNA
# and then generates the cDNA from this. Priming sites can be
# biased according the position probability matrix (PPM) that
# specifies the weighting of each base according to the position
# relative to the first base of the primer.
# The 5' most primer is then used to extend the DNA and create
# the cDNA, thereby potentially losing some of the 5' end of
# the molecule.
parameters:
# If 'perfect_priming' is true, then we always prime
# exactly on the 5' most end of the molecule and the
# cDNA matches perfectly the original molecule
# If true, all other parameters are ignored.
perfect_priming: false
# PPM matrix describing biases of the priming sites
# Values must sum to 1 in each base. Length of the PPM
# determines the primer lengths.
# See Kasper et all, 2010 PMID: 20395217 for example
# observation of these biases in fragment start positions.
# If no bias is desired, set all values to 0.25.
position_probability_matrix:
A: [0.50, 0.1, 0.40, 0.30, 0.25, 0.15]
C: [0.20, 0.5, 0.3 , 0.25, 0.25, 0.15]
G: [0.15, 0.1, 0.15, 0.25, 0.25, 0.20]
T: [0.15, 0.3, 0.15, 0.20, 0.25, 0.50]
# Rate of priming: approximate number of primers that
# will bind (if bias-free) per kilobase of the fragment
# Higher numbers will loose less of the 5' side.
primes_per_kb: 50
- step_name: second_strand_synthesis_step.SecondStrandSynthesisStep
# Second Strand synthesis copies the single-stranded cDNA from
# the First Strand Synthesis step into double-stranded cDNA.
# All parameters are same as the First Strand Synthesis step.
# If not perfect_priming, will loose some of the 5' end of the
# first strand of cDNA, which is the 3' end of the original fragment
parameters:
perfect_priming: false
position_probability_matrix:
A: [0.50, 0.1, 0.40, 0.30, 0.25, 0.15]
C: [0.20, 0.5, 0.3 , 0.25, 0.25, 0.15]
G: [0.15, 0.1, 0.15, 0.25, 0.25, 0.20]
T: [0.15, 0.3, 0.15, 0.20, 0.25, 0.50]
primes_per_kb: 50
- step_name: sizing_step.SizingStep
# Sizing step throws out molecules that do not fit the
# desired size distribution
parameters:
# Probability of retention by length
# __________________
# 1| ___/ \___
# p | ____/ \____
# 0|___/ \____
# -----------------------------------------------
# | | | |
# min_length | | max_length
# | select_all |
# start end
#
# All molecules with lengths outside of this range
# will be discarded:
min_length: 100
max_length: 400
# Molecules whose length is inside this range will
# always be retained. Retention probability raises
# linearly from 0 to 1 between the min/max values
# above and this range here.
select_all_start_length: 200
select_all_end_length: 300
- step_name: adapter_ligation_step.AdapterLigationStep
# Adpater Ligation attaches adapters to each end of the molecules
# These are used for the PCR step to initiate PCR and include the
# sample identifying barcodes.
#
# This uses the adapters specified in the resources section
# as well as the sample i5/i7 barcodes in the samples section
parameters: {}
- step_name: pcr_amplification_step.PCRAmplificationStep
# PCR amplification step creates many copies of each molecule
# Since the number of molecules generated is quite large and
# since in real sequencing, only a small fraction of the molecules
# prepped end up forming clusters on the flowcell and being
# sequenced, we give the option to remove most of the PCR
# generated molecules prior to the end of the step.
# This provides a very large increase in speed and decrease
# in memory usage.
parameters:
# Number of cycles to use
# Generates 2^n fragments for each input fragment
number_cycles: 10
# Retain only this percent of the data
# NOTE: scale is 0-100, not 0-1, so 0.08 means 0.0008 of the generated molecules are kep
# This value can be approximated from real data with UMI tags by the following:
# Compute the PCR duplication rate using the UMI tags.
# Let N be the number of PCR steps. Assuming that each molecule generates 2^N
# PCR descendants, if all were sequenced, we would expect all molecules to be duped
# 2^N times. Instead, choose retention_percentage with the following:
#
# '''
# import scipy.stats
# pcr_rate = scipy.stats.binom(p=retention_percentage / 100, n=2^N).sf(1)
# # Choose retention_percentage so that pcr_rate is approximately the observed PCR rate
# '''
#
# For example, retention_percentage = 0.08 and number_cycles=10 gives dupe rate
# of about 20%, which is pretty typical. If number_cycles changes, this should
# be modified too to maintain dupe rate.
retention_percentage: 0.08
# Induce a GC bias by discarding some PCR duplicates
# according to their GC bias. All fragments overall
# GC content is computed and then the following three
# parameters are used to compute a probability of retention
# gc_bias_constant + gc_bias_linear*(gc - 0.5) + gc_bias_quadratic*(gc - 0.5)^2
# clipped to always be within 0 and 1
# For no bias, set to 1, 0, and 0 for const, linear, and quadratic, respectively.
# To bias towards GC=0.5 content, set gc_bias_quadratic to be negative,
# such as -100 for a large GC bias.
gc_bias_constant: 1.0
gc_bias_linear: 0.0
gc_bias_quadratic: -100
# During PCR, we have some chance of mis-copying the molecules
# These are specified here as probability per-base, so 0.001 is one error per 1kb
deletion_rate: 0.0001
insertion_rate: 0.0001
substitution_rate: 0.001
sequence_pipeline:
# The Sequence Pipeline takes clusters bound to the flowcell and turns them into
# final, sequenced results, including fastq files or SAM files with the 'true'
# alignments of the simulated molecules to the reference genome.
steps:
- step_name: bridge_amplification_step.BridgeAmplificationStep
# Bridge amplification takes a seed of a cluster and generates
# many copies of the molecule to form a cluster.
parameters:
# Number of cycles to perform. Generates 2^N molecules in the cluster
# via a PCR-like bridge amplification process.
cycles: 10
# Substitution rate per base in the cluster formation
# No indels are generated here, for computation simplicity.
substitution_rate: 0.01
- step_name: sequence_by_synthesis_step.SequenceBySynthesisStep
# Sequencing by Synthesis then synthesizes complementary strands to each
# molecule in the clusters with flourescence readings at each base.
# If some of the molecules have errors in them (from bridge amplification),
# the flourescence will be imperfect. Moreover, errors also happen in this step
# including phasing, where some molecules get ahead or behind in the sequencing
# and so are displaying the wrong base.
# From these readings, the final sequenced values are generated.
parameters:
# Determines which read is the forward and which is the reverse read
# NOTE: currently only 'true' is implemented
forward_is_5_prime: true
# Whether to sequence both ends or just one
# NOTE: currently only 'true' is implemented
paired_ends: true
# Length of the reads to generate, in bases
read_length: 100
# The rate of phasing, either forward (skip) or backwards (drop)
# per base per molecule
skip_rate: 0.002
drop_rate: 0.002
flowcell:
# BEERS simulates one flowcell
# Layout of the flowcell is defined here
# Coordinate strategy defines how coordinates are assigned
# Currently only 'random' is supported
coordinate_strategy: random
# Flowcell geometry defines the dimensions of the flowcell
# Example is given for a typical HiSeq 2500 flowcell
flowcell_geometry:
# Range of the lane numbers
min_lane: 1
max_lane: 8
# Range of the tile values
max_tile: 2228
min_tile: 1101
# Range of the x-coordinates
min_x: 1012
max_x: 32816
# Range of the y-coordinates
min_y: 998
max_y: 49247
# List of lanes that samples will be written into
# Molecules are distributed evenly across lanes
# These must all be within [min_lane, max_lane] range
lanes_to_use: [1, 2]