# config/biodata.yaml

---
# Configuration file defining biological data to retrieve and install.
# These are stored in Amazon S3 buckets:
# https://s3.amazonaws.com/biodata
# https://s3.amazonaws.com/bcbio-nextgen
# and retrieved using the data_fabfile Fabric script.

# ## Genome data -- Next generation sequencing and Galaxy

# Details about the genomes you want to include.
#  Required genome fields (corresponding to Galaxy's tool_data_table_conf.xml columns):
#    dbkey   - globally unique identifier for the genome (e.g., hg19)
#    name    - descriptive name for the given genome (to be displayed in Galaxy, e.g., Hsapiens)
#  Optional genome fields (corresponding to Galaxy's tool_data_table_conf.xml columns):
#    formats, species, dbkey1, dbkey2, value, path, index
#  Additional genome fields specific to data deployment:
#    indexes - list of tool indexes specific to the associated genome (overrides global 'genome_indexes')
genomes:
  # Every entry carries `dbkey` and `name`, in that order; an entry may then
  # add `indexes`, `annotations`, `annotations_available` and `validation`.
  # Entries that list none of the optional keys provide only the two
  # required fields.

  # phiX174
  - dbkey: phix
    name: phiX174

  # Human
  - dbkey: hg19
    name: Human (hg19)
    indexes: [seq, twobit]
    annotations: [GA4GH_problem_regions, capture_regions, rmsk,
                  MIG, prioritize, dbsnp, hapmap, 1000g_omni_snps, 1000g_snps,
                  mills_indels, cosmic, ancestral, clinvar, qsignature, ACMG56_genes, transcripts, RADAR, mirbase,
                  genesplicer, effects_transcripts, vcfanno, viral, exac, gnomad_exome, esp, 1000g, varpon, topmed,
                  genotype2phenotype, fusion-blacklist, simple_repeat, purecn_mappability, af_only_gnomad]
    annotations_available: [battenberg, dbnsfp, dbscsnv, ericscript, gnomad, topmed]
    validation: [giab-NA12878, platinum-genome-NA12878, giab-NA24385, giab-NA24631, giab-NA24143, giab-NA24149]
  - dbkey: GRCh37
    name: Human (GRCh37)
    indexes: [seq, twobit]
    annotations: [GA4GH_problem_regions, capture_regions,
                  MIG, prioritize, dbsnp, hapmap, 1000g_omni_snps, 1000g_snps,
                  mills_indels, cosmic, ancestral, clinvar, qsignature, ACMG56_genes, transcripts, RADAR, mirbase,
                  genesplicer, effects_transcripts, vcfanno, viral, exac, gnomad_exome, esp, 1000g, varpon, topmed,
                  genotype2phenotype, fusion-blacklist]
    annotations_available: [battenberg, dbnsfp, dbscsnv, ericscript, gnomad, topmed]
    validation: [giab-NA12878, giab-NA24385, giab-NA24631, dream-syn3, dream-syn4, giab-NA12878-NA24385-somatic,
                 giab-NA24143, giab-NA24149, giab-NA24694, giab-NA24695]
  - dbkey: hg38
    name: Human (hg38) full
    indexes: [seq, twobit, bwa, hisat2]
    annotations: [ccds, coverage, capture_regions, rmsk, prioritize, dbsnp, hapmap_snps,
                  1000g_omni_snps, 1000g_snps, 1000g_indels, mills_indels, clinvar, qsignature,
                  ACMG56_genes, transcripts, genesplicer, effects_transcripts, vcfanno, esp,
                  exac, gnomad_exome, viral, RADAR, mirbase, varpon, topmed, genotype2phenotype,
                  salmon-decoys, fusion-blacklist, purecn_mappability, simple_repeat, af_only_gnomad]
    annotations_available: [dbnsfp, dbscsnv, ericscript, gnomad, topmed]
    validation: [giab-NA12878, giab-NA24385, giab-NA24631,
                 platinum-genome-NA12878, giab-NA12878-remap, giab-NA12878-crossmap,
                 dream-syn4-crossmap, dream-syn3-crossmap, giab-NA12878-NA24385-somatic,
                 giab-NA24143, giab-NA24149, giab-NA24694, giab-NA24695]
  - dbkey: hg38-noalt
    name: Human (hg38) without alternative alleles
    annotations: [coverage, dbsnp, hapmap_snps, 1000g_omni_snps, 1000g_snps,
                  1000g_indels, mills_indels, clinvar, transcripts, mirbase, genotype2phenotype]
    annotations_available: [dbnsfp, dbscsnv]

  # Mouse
  - dbkey: mm9
    name: Mouse (mm9)
  - dbkey: mm10
    name: Mouse (mm10)
    indexes: [seq, twobit]
    annotations: [problem_regions, dbsnp, transcripts, mirbase, rmsk, vcfanno, prioritize]

  # Rat
  - dbkey: rn5
    name: Rat (rn5)
  - dbkey: rn6
    name: Rat (rn6)
    indexes: [seq, twobit]
    annotations: [transcripts, mirbase]

  # Dog, chicken, pig
  - dbkey: canFam3
    name: Dog (canFam3)
    indexes: [twobit]
    annotations: [dbsnp, transcripts, mirbase]
  - dbkey: galGal4
    name: Chicken (galGal4)
  - dbkey: Sscrofa11.1
    name: Pig (Sscrofa11.1)
    indexes: [seq, twobit]
    annotations: [transcripts]

  # D melanogaster
  - dbkey: dm3
    name: D melanogaster (dm3)
  - dbkey: BDGP6
    name: D melanogaster (BDGP6)
    indexes: [seq]
    annotations: [transcripts, mirbase]

  # Arabidopsis thaliana
  - dbkey: TAIR10
    name: Arabidopsis thaliana (TAIR10)
    annotations: [mirbase]

  # X tropicalis
  - dbkey: xenTro3
    name: X tropicalis (xenTro3)

  # Zebrafish
  - dbkey: GRCz11
    name: Zebrafish (GRCz11)
    indexes: [seq, twobit]
    annotations: [transcripts]
  - dbkey: GRCz10
    name: Zebrafish (GRCz10)
  - dbkey: Zv9
    name: Zebrafish (Zv9)

  # S cerevisiae, C elegans, Pseudomonas aeruginosa
  - dbkey: sacCer3
    name: S cerevisiae (sacCer3)
    indexes: [seq]
    annotations: [transcripts]
  - dbkey: WBcel235
    name: C elegans (WBcel235)
  - dbkey: pseudomonas_aeruginosa_ucbpp_pa14
    name: Pseudomonas aeruginosa UCBPP-PA14

# High level targets for specifying annotations
annotation_groups:
  # Each key below is a group name; its value is the ordered list of
  # annotation targets belonging to that group.
  variation:
    - ccds
    - problem_regions
    - GA4GH_problem_regions
    - capture_regions
    - MIG
    - coverage
    - prioritize
    - dbsnp
    - hapmap
    - hapmap_snps
    - 1000g_omni_snps
    - ACMG56_genes
    - 1000g_snps
    - mills_indels
    - 1000g_indels
    - clinvar
    - cosmic
    - ancestral
    - qsignature
    - genesplicer
    - effects_transcripts
    - varpon
    - vcfanno
    - viral
    - purecn_mappability
    - simple_repeat
    - af_only_gnomad
  rnaseq:
    - transcripts
    - RADAR
    - rmsk
    - salmon-decoys
    - fusion-blacklist
  smallrna:
    - mirbase
  gemini:
    - esp
    - exac
    - gnomad_exome
    - 1000g

# Global set of indexes to include for each genome.
# Available choices are in GENOME_INDEXES_SUPPORTED in cloudbio/biodata/genomes.py
genome_indexes: [bwa, twobit]  # same flow-list form as the per-genome `indexes`

# Additional data targets
# Both optional targets are switched off in this configuration.
# NOTE(review): presumably liftOver chain files and the UniRef protein
# database respectively -- confirm against the data_fabfile before enabling.
install_liftover: false
install_uniref: false