Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate subsampling config with a script #1102

Closed
wants to merge 10 commits into from
2 changes: 2 additions & 0 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ jobs:
uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@master
with:
build-args: all_regions -j 2 --profile nextstrain_profiles/nextstrain-ci
env: |
NEXTSTRAIN_DOCKER_IMAGE: nextstrain/base:branch-victorlin-augur-subsample

test-cram:
runs-on: ubuntu-latest
Expand Down
22 changes: 0 additions & 22 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -47,28 +47,6 @@ validate(config, schema="workflow/schemas/config.schema.yaml")
# When `inputs` is given as a list, re-key it as an ordered mapping from each
# input's `name` to the input entry itself.
if isinstance(config.get("inputs"), list):
    config["inputs"] = OrderedDict((v["name"], v) for v in config["inputs"])

# Check for overlapping subsampling schemes in user and default
# configurations. For now, issue a deprecation warning, so users know they
# should rename their subsampling schemes. In the future, this reuse of the same
# name will cause an error.
# NOTE(review): `user_subsampling` is defined outside this excerpt; presumably
# it holds the user-supplied schemes captured before defaults were merged.
subsampling_config = config.get("subsampling", {})
overlapping_schemes = []
for scheme_name, scheme in user_subsampling.items():
    # A scheme only counts as overlapping when the name matches AND the
    # definition differs; identical redefinitions are not reported.
    if scheme_name in subsampling_config and subsampling_config.get(scheme_name) != scheme:
        overlapping_schemes.append(scheme_name)

if len(overlapping_schemes) > 0:
    logger.warning(f"WARNING: The following subsampling scheme(s) have the same name as a default scheme in this workflow but different definitions:")
    logger.warning("")
    for scheme in overlapping_schemes:
        logger.warning(f" - {scheme}")
    logger.warning("")
    logger.warning(" This means Snakemake will merge your scheme with the default scheme and may produce unexpected behavior.")
    # The suggested name is built from the first overlapping scheme only.
    logger.warning(f" To avoid errors in your workflow, rename your schemes with unique names (e.g., 'custom_{overlapping_schemes[0]}')")
    logger.warning(" In future versions of this workflow, overlapping subsampling scheme names will produce an error.")
    logger.warning("")
    # Pause for five seconds after emitting the warning (presumably so it is
    # noticed before further output scrolls past).
    time.sleep(5)

# Assign a default build if none are specified in the config. Users can define a
# `default_build_name` in their builds config without assigning any other build
# information. Otherwise, we use a generic name for the default build.
Expand Down
134 changes: 134 additions & 0 deletions extract-subsampling-config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
import argparse
from copy import deepcopy
import difflib
from pathlib import Path
import tempfile
# pip install "ruamel.yaml<0.18.0"
import ruamel.yaml

# Shared ruamel.yaml handler (default settings) used for every load and dump
# in this script.
yaml=ruamel.yaml.YAML()


# Directory that receives the generated subsampling config files. It is a
# relative path, so it resolves against the current working directory, and it
# is created as a side effect of loading this module if it does not exist yet.
SUBSAMPLING_CONFIG_DIR = 'subsampling/'
Path(SUBSAMPLING_CONFIG_DIR).mkdir(exist_ok=True)


# Adapted from https://stackoverflow.com/a/60099750
def recursive_delete_comments(d):
    """Remove the comment attribute from ``d`` and from everything nested in it.

    Dicts are walked through both their keys and their values, lists through
    their elements. The object itself is handled last. Objects that do not
    carry the attribute are left untouched.
    """
    if isinstance(d, dict):
        for key, value in d.items():
            recursive_delete_comments(key)
            recursive_delete_comments(value)
    elif isinstance(d, list):
        for item in d:
            recursive_delete_comments(item)

    try:
        # Scalar strings may keep a comment under a plain 'comment' attribute;
        # every other node uses the attribute named by Comment.attrib.
        if isinstance(d, ruamel.yaml.scalarstring.ScalarString):
            attr_name = 'comment'
        else:
            attr_name = ruamel.yaml.comments.Comment.attrib
        delattr(d, attr_name)
    except AttributeError:
        # No such attribute on this object: nothing to delete.
        pass


def recursive_replace(d, old, new):
    """Replace ``old`` with ``new`` in every string value nested inside ``d``.

    ``d`` is modified in place and nothing is returned. Only dicts and lists
    are traversed; string values stored in them are swapped for the result of
    ``str.replace``. Dictionary keys are visited but never rewritten, and any
    other kind of object (including a bare string passed as ``d``) is left
    as it is.
    """
    if isinstance(d, list):
        for index, item in enumerate(d):
            if not isinstance(item, str):
                # Containers are updated in place by the recursive call.
                recursive_replace(item, old, new)
                continue
            d[index] = item.replace(old, new)
        return

    if not isinstance(d, dict):
        return

    for key, value in d.items():
        recursive_replace(key, old, new)
        if not isinstance(value, str):
            recursive_replace(value, old, new)
            continue
        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over its items.
        d[key] = value.replace(old, new)


def resolve_template(config, old, new):
    """Fill in the placeholder ``old`` with ``new`` throughout ``config``, in place."""
    recursive_replace(config, old=old, new=new)


def write_subsampling_config(path, scheme):
    """Dump ``scheme`` to ``path`` as YAML, nested under a top-level ``samples`` key.

    Any existing file at ``path`` is overwritten.
    """
    with open(path, 'w') as handle:
        yaml.dump({'samples': scheme}, handle)


def _diff_against_existing(output_path, scheme):
    """Return the unified-diff lines between ``output_path`` and a fresh dump of ``scheme``.

    An empty list means the existing file already matches what would be written.
    """
    # Render the candidate into a private temporary directory that is removed
    # when the block exits. This avoids reusing the name of an already-deleted
    # NamedTemporaryFile and leaves no stray files behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        new_config_path = Path(tmp_dir, output_path.name)
        write_subsampling_config(new_config_path, scheme)
        with open(output_path) as existing_f, open(new_config_path) as new_f:
            return list(difflib.unified_diff(
                existing_f.readlines(),
                new_f.readlines(),
            ))


def extract_from_workflow_config_builds(input_path, use_scheme_name_for_filename=False):
    """Write one subsampling config file per build defined in a workflow config.

    For every entry under ``builds`` in the YAML file at ``input_path``, the
    scheme named by the build's ``subsampling_scheme`` is copied from the
    file's ``subsampling`` section, its ``{region}`` and ``{country}``
    placeholders are filled in from the build, and the result is written into
    ``SUBSAMPLING_CONFIG_DIR``.

    Parameters:
        input_path: Path of the workflow config file to read.
        use_scheme_name_for_filename: When true, name each output file after
            the subsampling scheme instead of after the build.

    Raises:
        KeyError: If the file has no ``builds`` or ``subsampling`` section, a
            build has no ``subsampling_scheme``, or the named scheme is not
            defined in this file.
        SystemExit: With status 1 if an output file already exists and its
            content differs from what would be written. The diff is printed
            first.
    """
    print(f"Reading subsampling schemes from {input_path}. Configs that are already present and identical will be ignored.")

    with open(input_path) as f:
        workflow_config = yaml.load(f)
    recursive_delete_comments(workflow_config)

    # For each build entry, write the subsampling scheme as a file.
    for build_name, build_config in workflow_config['builds'].items():
        scheme_name = build_config['subsampling_scheme']
        filename_stem = scheme_name if use_scheme_name_for_filename else build_name
        output_path = Path(SUBSAMPLING_CONFIG_DIR, f"{filename_stem}.yaml")

        # deepcopy so that resolving placeholders for this build does not
        # alter the shared scheme definition used by other builds.
        scheme = deepcopy(workflow_config['subsampling'][scheme_name])

        if 'region' in build_config:
            resolve_template(scheme, '{region}', build_config['region'])
        if 'country' in build_config:
            resolve_template(scheme, '{country}', build_config['country'])
        # TODO: add other templates

        if not output_path.exists():
            print(f"Writing new config to {output_path}...")
            write_subsampling_config(output_path, scheme)
            continue

        # The file exists already: it must be identical to what would be
        # written now, otherwise two sources disagree about this config.
        diff = _diff_against_existing(output_path, scheme)
        if diff:
            print(f"ERROR: Subsampling config for {build_name} exists and differs.")
            for line in diff:
                print(line, end="")
            # Same effect as exit(1), without relying on the `site` builtin.
            raise SystemExit(1)


def extract_from_workflow_config_subsampling(input_path):
    """Write every scheme in the ``subsampling`` section of ``input_path`` to its own file.

    Each scheme is written to ``SUBSAMPLING_CONFIG_DIR`` under the scheme's
    name, overwriting any file already there. Placeholders are not resolved.
    """
    with open(input_path) as config_file:
        workflow_config = yaml.load(config_file)
    recursive_delete_comments(workflow_config)

    schemes = workflow_config['subsampling']
    for scheme_name, scheme_definition in schemes.items():
        destination = Path(SUBSAMPLING_CONFIG_DIR, f"{scheme_name}.yaml")
        write_subsampling_config(destination, scheme_definition)


def main():
    """Generate the subsampling config files from the known workflow configs."""
    # Config files from which one subsampling config per build is extracted,
    # processed in this order.
    per_build_config_files = (
        'nextstrain_profiles/100k/config-gisaid.yaml',
        'nextstrain_profiles/100k/config-open.yaml',
        'nextstrain_profiles/nextstrain-country/builds.yaml',
        'nextstrain_profiles/nextstrain-open/builds.yaml',
        'nextstrain_profiles/nextstrain-gisaid/builds.yaml',
        'nextstrain_profiles/nextstrain-gisaid-21L/builds.yaml',
    )
    for config_file in per_build_config_files:
        extract_from_workflow_config_builds(config_file)

    # The CI profile's build is named "europe", which would not identify the
    # output as CI-specific, so its file is named after the scheme instead.
    extract_from_workflow_config_builds(
        'nextstrain_profiles/nextstrain-ci/builds.yaml',
        use_scheme_name_for_filename=True,
    )

    # The defaults have no build definitions; extract their schemes directly.
    extract_from_workflow_config_subsampling('defaults/parameters.yaml')



# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Loading
Loading