# my_profiles/example_multiple_inputs/builds.yaml


## This YAML file is sparsely commented, with a focus on the parts relevant to multiple inputs
## See my_profiles/example/builds.yaml for more general comments
## See docs/multiple_inputs.md for a walkthrough of this config.

# custom_rules:
#   - my_profiles/example_multiple_inputs/rules.smk

# Two input datasets; each entry pairs a name with its metadata and sequence files.
# The name `aus` is reused below as the key for input-specific `filter` settings.
inputs:
  - name: "aus"
    metadata: "data/example_metadata_aus.tsv"
    sequences: "data/example_sequences_aus.fasta"
  - name: "worldwide"
    metadata: "data/example_metadata_worldwide.tsv"
    sequences: "data/example_sequences_worldwide.fasta"

builds:
  multiple-inputs:
    # Use the custom subsampling scheme defined under `subsampling` below.
    subsampling_scheme: "custom-scheme"

# STAGE 1: Input-specific filtering parameters
# Keys under `filter` are input names (see `inputs`); these settings apply to `aus` only.
filter:
  aus:
    # Allow shorter genomes. Parameter used to filter alignment.
    min_length: 5000
    # Skip diagnostics (which can remove genomes) for this input.
    skip_diagnostics: true

# STAGE 2: Subsampling parameters
subsampling:
  custom-scheme:
    # Use the `aus` metadata key to include ALL sequences from input `aus`.
    allFromAus:
      exclude: "--exclude-where 'aus!=yes'"  # subset to sequences from input `aus`
    # Proximity subsampling from the `worldwide` input to provide context.
    worldwideContext:
      exclude: "--exclude-where 'aus=yes'"  # i.e. subset to sequences _not_ from input `aus`
      group_by: "year"  # NOTE: `augur filter` needs this to use `max_sequences` (TODO)
      max_sequences: 100
      # Prioritise by proximity to the `allFromAus` sample defined above.
      priorities:
        type: "proximity"
        focus: "allFromAus"
    # Background sample of sequences _not_ from input `aus`, grouped by year and month.
    worldwideBackground:
      exclude: "--exclude-where 'aus=yes'"
      group_by: "year month"
      seq_per_group: 5

# Profile-specific files, referenced by path.
files:
  # JSON config file for auspice.
  auspice_config: "my_profiles/example_multiple_inputs/my_auspice_config.json"
  # Markdown description file.
  description: "my_profiles/example_multiple_inputs/my_description.md"

# Trait settings keyed by build name (`multiple-inputs` matches the entry under `builds`).
traits:
  multiple-inputs:
    sampling_bias_correction: 2.5
    # NOTE(review): presumably the metadata columns traits are inferred for; confirm against the workflow.
    columns:
      - "country"