-
Notifications
You must be signed in to change notification settings - Fork 158
/
biodata.yaml
129 lines (124 loc) · 5.76 KB
/
biodata.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
---
# Configuration file defining biological data to retrieve and install.
# These are stored in Amazon S3 buckets:
# https://s3.amazonaws.com/biodata
# https://s3.amazonaws.com/bcbio-nextgen
# and retrieved using the data_fabfile Fabric script.
# ## Genome data -- Next generation sequencing and Galaxy
# Details about the genomes you want to include.
# Required genome fields (corresponding to Galaxy's tool_data_table_conf.xml columns):
# dbkey - globally unique identifier for the genome (e.g., hg19)
# name - descriptive name for the given genome (to be displayed in Galaxy, e.g., Hsapiens)
# Optional genome fields (corresponding to Galaxy's tool_data_table_conf.xml columns):
# formats, species, dbkey1, dbkey2, value, path, index
# Additional genome fields specific to data deployment:
# indexes - list of tool indexes specific to the associated genome (overrides global 'genome_indexes')
genomes:
  # One entry per genome build. `dbkey` and `name` are always present;
  # `indexes`, `annotations`, `annotations_available` and `validation` are
  # optional per-genome lists. Entries without `indexes` presumably fall back
  # to the top-level `genome_indexes` list -- confirm in the installer.
  - dbkey: phix
    name: phiX174

  # Human builds
  - dbkey: hg19
    name: Human (hg19)
    indexes: [seq, twobit]
    annotations: [GA4GH_problem_regions, capture_regions, rmsk,
      MIG, prioritize, dbsnp, hapmap, 1000g_omni_snps, 1000g_snps,
      mills_indels, cosmic, ancestral, clinvar, qsignature, ACMG56_genes, transcripts, RADAR, mirbase,
      genesplicer, effects_transcripts, vcfanno, viral, exac, gnomad_exome, esp, 1000g, varpon, topmed,
      genotype2phenotype, fusion-blacklist, simple_repeat, purecn_mappability, af_only_gnomad]
    annotations_available: [battenberg, dbnsfp, dbscsnv, ericscript, gnomad, topmed]
    validation: [giab-NA12878, platinum-genome-NA12878, giab-NA24385, giab-NA24631, giab-NA24143, giab-NA24149]
  - dbkey: GRCh37
    name: Human (GRCh37)
    indexes: [seq, twobit]
    annotations: [GA4GH_problem_regions, capture_regions,
      MIG, prioritize, dbsnp, hapmap, 1000g_omni_snps, 1000g_snps,
      mills_indels, cosmic, ancestral, clinvar, qsignature, ACMG56_genes, transcripts, RADAR, mirbase,
      genesplicer, effects_transcripts, vcfanno, viral, exac, gnomad_exome, esp, 1000g, varpon, topmed,
      genotype2phenotype, fusion-blacklist]
    annotations_available: [battenberg, dbnsfp, dbscsnv, ericscript, gnomad, topmed]
    validation: [giab-NA12878, giab-NA24385, giab-NA24631, dream-syn3, dream-syn4, giab-NA12878-NA24385-somatic,
      giab-NA24143, giab-NA24149, giab-NA24694, giab-NA24695]
  - dbkey: hg38
    name: Human (hg38) full
    indexes: [seq, twobit, bwa, hisat2]
    annotations: [ccds, coverage, capture_regions, rmsk, prioritize, dbsnp, hapmap_snps,
      1000g_omni_snps, 1000g_snps, 1000g_indels, mills_indels, clinvar, qsignature,
      ACMG56_genes, transcripts, genesplicer, effects_transcripts, vcfanno, esp,
      exac, gnomad_exome, viral, RADAR, mirbase, varpon, topmed, genotype2phenotype,
      salmon-decoys, fusion-blacklist, purecn_mappability, simple_repeat, af_only_gnomad]
    annotations_available: [dbnsfp, dbscsnv, ericscript, gnomad, topmed]
    validation: [giab-NA12878, giab-NA24385, giab-NA24631,
      platinum-genome-NA12878, giab-NA12878-remap, giab-NA12878-crossmap,
      dream-syn4-crossmap, dream-syn3-crossmap, giab-NA12878-NA24385-somatic,
      giab-NA24143, giab-NA24149, giab-NA24694, giab-NA24695]
  - dbkey: hg38-noalt
    name: Human (hg38) without alternative alleles
    annotations: [coverage, dbsnp, hapmap_snps, 1000g_omni_snps, 1000g_snps,
      1000g_indels, mills_indels, clinvar, transcripts, mirbase, genotype2phenotype]
    annotations_available: [dbnsfp, dbscsnv]

  # Other vertebrates
  - dbkey: mm9
    name: Mouse (mm9)
  - dbkey: mm10
    name: Mouse (mm10)
    indexes: [seq, twobit]
    annotations: [problem_regions, dbsnp, transcripts, mirbase, rmsk, vcfanno, prioritize]
  - dbkey: rn5
    name: Rat (rn5)
  - dbkey: rn6
    name: Rat (rn6)
    indexes: [seq, twobit]
    annotations: [transcripts, mirbase]
  - dbkey: canFam3
    name: Dog (canFam3)
    indexes: [twobit]
    annotations: [dbsnp, transcripts, mirbase]
  - dbkey: galGal4
    name: Chicken (galGal4)
  - dbkey: Sscrofa11.1
    name: Pig (Sscrofa11.1)
    indexes: [seq, twobit]
    annotations: [transcripts]

  # Remaining organisms
  - dbkey: dm3
    name: D melanogaster (dm3)
  - dbkey: BDGP6
    name: D melanogaster (BDGP6)
    indexes: [seq]
    annotations: [transcripts, mirbase]
  - dbkey: TAIR10
    name: Arabidopsis thaliana (TAIR10)
    annotations: [mirbase]
  - dbkey: xenTro3
    name: X tropicalis (xenTro3)
  - dbkey: GRCz11
    name: Zebrafish (GRCz11)
    indexes: [seq, twobit]
    annotations: [transcripts]
  - dbkey: GRCz10
    name: Zebrafish (GRCz10)
  - dbkey: Zv9
    name: Zebrafish (Zv9)
  - dbkey: sacCer3
    name: S cerevisiae (sacCer3)
    indexes: [seq]
    annotations: [transcripts]
  - dbkey: WBcel235
    name: C elegans (WBcel235)
  - dbkey: pseudomonas_aeruginosa_ucbpp_pa14
    name: Pseudomonas aeruginosa UCBPP-PA14
# Named bundles of annotation targets; each group name expands to the
# individual annotations listed beneath it.
annotation_groups:
  variation:
    - ccds
    - problem_regions
    - GA4GH_problem_regions
    - capture_regions
    - MIG
    - coverage
    - prioritize
    - dbsnp
    - hapmap
    - hapmap_snps
    - 1000g_omni_snps
    - ACMG56_genes
    - 1000g_snps
    - mills_indels
    - 1000g_indels
    - clinvar
    - cosmic
    - ancestral
    - qsignature
    - genesplicer
    - effects_transcripts
    - varpon
    - vcfanno
    - viral
    - purecn_mappability
    - simple_repeat
    - af_only_gnomad
  rnaseq:
    - transcripts
    - RADAR
    - rmsk
    - salmon-decoys
    - fusion-blacklist
  smallrna:
    - mirbase
  gemini:
    - esp
    - exac
    - gnomad_exome
    - 1000g
# Default indexes applied to every genome.
# Valid names are listed in GENOME_INDEXES_SUPPORTED (cloudbio/biodata/genomes.py).
genome_indexes: [bwa, twobit]
# Additional data targets
# Both flags are disabled here. NOTE(review): presumably each one toggles
# installation of the named extra dataset (liftOver files, UniRef) -- confirm
# against the installer code before enabling.
install_liftover: false
install_uniref: false