From c2d78f8797965d001f4642fe9b81f213b8cb92bb Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Fri, 19 Jan 2024 10:47:26 -0500
Subject: [PATCH 01/32] feat: accept inputs for both dna and rna

---
 metamorph              | 19 ++++++++++++++++++-
 src/run.py             | 25 +++++++++++++++++++++++--
 workflow/rules/RNA.smk | 24 ++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 3 deletions(-)
 create mode 100644 workflow/rules/RNA.smk

diff --git a/metamorph b/metamorph
index 6145d1c..5df25d7 100755
--- a/metamorph
+++ b/metamorph
@@ -103,10 +103,16 @@ def run(sub_args):
     # copy over required resources to run
     # the pipeline
     git_repo = __home__
+
+    fastq_inputs = [sub_args.input]
+
+    if sub_args.rna:
+        fastq_inputs.append(sub_args.rna)
+
     input_files = init(
         repo_path = git_repo,
         output_path = sub_args.output,
-        links = sub_args.input
+        links = fastq_inputs
     )
 
     # Step 2. Setup pipeline for execution,
@@ -191,6 +197,7 @@ def run(sub_args):
 
 
 def cache(sub_args):
+
     """Caches remote resources or reference files stored on DockerHub and S3.
     Local SIFs will be created from images defined in 'config/containers/images.json'.
     @TODO: add option to cache other shared S3 resources (i.e. kraken db and fqscreen indices)
@@ -394,6 +401,16 @@ def parsed_arguments(name, description):
         help = argparse.SUPPRESS
     )
 
+    subparser_run.add_argument(
+        '--rna',
+        # Check if the file exists and if it is readable
+        type = lambda file: permissions(parser, file, os.R_OK),
+        required = False,
+        nargs = '+',
+        help = argparse.SUPPRESS
+    )
+
+
     # Output Directory, i.e
     # working directory
     subparser_run.add_argument(
diff --git a/src/run.py b/src/run.py
index 9c06915..89616c0 100644
--- a/src/run.py
+++ b/src/run.py
@@ -4,6 +4,9 @@
 # Python standard library
 from __future__ import print_function
 from shutil import copytree
+from uuid import uuid4
+from datetime import datetime
+from itertools import chain
 import os, re, json, sys, subprocess
 
 # Local imports
@@ -50,7 +53,17 @@ def init(repo_path, output_path, links=[], required=['workflow', 'resources', 'c
 
     # Create renamed symlinks for each rawdata
     # file provided as input to the pipeline
-    inputs = sym_safe(input_data = links, target = output_path)
+    try:
+        os.mkdir(os.path.join(output_path, 'dna'))
+    except FileExistsError:
+        pass
+    inputs = dict(dna=sym_safe(input_data = links[0], target = os.path.join(output_path, 'dna')))
+    if len(links) == 2 and links[1]:
+        try:
+            os.mkdir(os.path.join(output_path, 'rna'))
+        except FileExistsError:
+            pass
+        inputs['rna'] = sym_safe(input_data = links[1], target = os.path.join(output_path, 'rna'))
 
     return inputs
 
@@ -446,8 +459,11 @@ def add_rawdata_information(sub_args, config, ifiles):
     # or single-end
     # Updates config['project']['nends'] where
     # 1 = single-end, 2 = paired-end, -1 = bams
+
     convert = {1: 'single-end', 2: 'paired-end', -1: 'bam'}
-    nends = get_nends(ifiles)  # Checks PE data for both mates (R1 and R2)
+    import ipdb; ipdb.set_trace()
+
+    nends = get_nends(ifiles['dna'])  # Checks PE data for both mates (R1 and R2)
     config['project']['nends'] = nends
     config['project']['filetype'] = convert[nends]
 
@@ -455,6 +471,11 @@ def add_rawdata_information(sub_args, config, ifiles):
     rawdata_paths = get_rawdata_bind_paths(input_files = sub_args.input)
     config['project']['datapath'] = ','.join(rawdata_paths)
 
+    if 'rna' in ifiles and ifiles['rna']:
+        config['project']['rnapath'] = ifiles['rna']
+
+    import ipdb; ipdb.set_trace()
+
     # Add each sample's basename
     config = add_sample_metadata(input_files = ifiles, config = config)
 
diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk
new file mode 100644
index 0000000..ca780b2
--- /dev/null
+++ b/workflow/rules/RNA.smk
@@ -0,0 +1,24 @@
+# ~~~~~~~~~~
+# Metawrap metagenome assembly and analysis rules
+# ~~~~~~~~~~
+from os.path import join
+from itertools import chain
+
+# ~~~~~~~~~~
+# Constants and paths
+# ~~~~~~~~~~
+workpath = config["project"]["workpath"]
+datapath = config["project"]["datapath"]
+
+rule concat
+
+rule map_rna_to_metagenome:
+    input:
+        concat_rna_read =
+    output:
+    params:
+    shell:
+        """
+        humann --threads 16 --input $(ANALYSIS)/READ_QC_RNA/$${sample}_concat.fastq --remove-temp-output --input-format fastq --output-basename $${sample} --output ./
+        """
+
\ No newline at end of file
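
[Review note] This patch leaves two `import ipdb; ipdb.set_trace()` breakpoints inside add_rawdata_information(); those will halt any non-interactive run until removed. Separately, the try/except around os.mkdir can be collapsed with os.makedirs(..., exist_ok=True). A minimal sketch of the same dna/rna staging under that assumption — stage_inputs is a hypothetical name, not the committed function; sym_safe is the real helper from src/run.py:

    import os

    def stage_inputs(links, output_path, sym_safe):
        # links[0] = dna fastqs; links[1] (optional) = rna fastqs
        inputs = {}
        for label, files in zip(('dna', 'rna'), links):
            if not files:
                continue
            subdir = os.path.join(output_path, label)
            os.makedirs(subdir, exist_ok=True)  # no FileExistsError handling needed
            inputs[label] = sym_safe(input_data=files, target=subdir)
        return inputs
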
From f155f1a86ae314173881168da70b9cc528422b38 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Mon, 22 Jan 2024 16:41:18 -0500
Subject: [PATCH 02/32] wip: adding rna inputs ingestion partial

---
 config/images.json          |  2 +-
 config/resources.json       |  7 +---
 metamorph                   |  3 ++
 src/run.py                  | 81 +++++++++++++++++++------------------
 workflow/rules/metawrap.smk |  4 +-
 5 files changed, 48 insertions(+), 49 deletions(-)

diff --git a/config/images.json b/config/images.json
index 8fdfa95..8c1e3f5 100644
--- a/config/images.json
+++ b/config/images.json
@@ -4,7 +4,7 @@
         "metagenome": "docker://rroutsong/metamorph_metagenome:0.0.1"
     },
     "containers": {
-        "metawrap": "/data/OpenOmics/SIFs/metamorph_metawrap_1.3.2.sif",
+        "metawrap": "/data/OpenOmics/SIFs/metamorph_metawrap_0.0.2.sif",
         "metagenome": "/data/OpenOmics/SIFs/metamorph_metagenome_0.0.1.sif"
     }
 }
diff --git a/config/resources.json b/config/resources.json
index c8932e4..e56d2a5 100644
--- a/config/resources.json
+++ b/config/resources.json
@@ -1,5 +1,5 @@
 {
-    "binds": [
+    "databases": [
         {
             "name": "KRAKEN_DB2",
             "to": "$HOME/KRAKEN_DB2",
@@ -30,11 +30,6 @@
             "to": "$HOME/checkm",
             "from": "/data/OpenOmics/references/metamorph/checkm",
             "mode": "rw"
-        }, {
-            "name": "CHECKM_CONFIG",
-            "to": "/opt/conda/envs/metawrap-env/lib/python2.7/site-packages/checkm/DATA_CONFIG",
-            "from": "/data/OpenOmics/references/metamorph/checkm/DATA_CONFIG",
-            "mode": "rw"
         }
     ]
 }
\ No newline at end of file
diff --git a/metamorph b/metamorph
index 9f49a83..a403c4a 100755
--- a/metamorph
+++ b/metamorph
@@ -165,6 +165,9 @@ def run(sub_args):
     log = os.path.join(sub_args.output, 'logfiles', 'master.log')
     logfh = open(log, 'w')
 
+    if 'databases' in config:
+        bindpaths.extend([mount['from']+':'+mount['to']+':'+mount['mode'] for mount in config['databases']])
+
     if sub_args.coa:
         cjob = run_coa_pipeline(sub_args.mode,
                                 sub_args.output,
diff --git a/src/run.py b/src/run.py
index cc4e991..31b3521 100644
--- a/src/run.py
+++ b/src/run.py
@@ -225,27 +225,6 @@ def setup(sub_args, ifiles, repo_path, output_path):
     return config
 
 
-def unpacked(nested_dict):
-    """Generator to recursively retrieves all values in a nested dictionary.
-    @param nested_dict dict[]:
-        Nested dictionary to unpack
-    @yields value in dictionary
-    """
-    # Iterate over all values of
-    # given dictionary
-    for value in nested_dict.values():
-        # Check if value is of dict type
-        if isinstance(value, dict):
-            # If value is dict then iterate
-            # over all its values recursively
-            for v in unpacked(value):
-                yield v
-        else:
-            # If value is not dict type
-            # then yield the value
-            yield value
-
-
 def get_fastq_screen_paths(fastq_screen_confs, match = 'DATABASE', file_index = -1):
     """Parses fastq_screen.conf files to get the paths of each fastq_screen database.
     This path contains bowtie2 indices for reference genome to screen against.
@@ -332,19 +311,30 @@ def bind(sub_args, config):
         List of singularity/docker bind paths
     """
     bindpaths = []
-    for value in unpacked(config):
-        if not isinstance(value, str):
-            continue
-        if exists(value):
-            if os.path.isfile(value):
-                value = os.path.dirname(value)
-            if value not in bindpaths:
-                bindpaths.append(value)
+
+    if 'databases' in config:
+        dbs = config.pop('databases')
+        bindpaths.extend([mount['from']+':'+mount['to']+':'+mount['mode'] for mount in dbs])
+
+    if 'options' in config and 'input' in config['options']:
+        inrents = list(set([os.path.abspath(os.path.dirname(p)) for p in config['options']['input'] if os.path.exists(os.path.dirname(p)) and os.path.isdir(os.path.dirname(p))]))
+        bindpaths.extend(inrents)
+
+    if 'options' in config and 'rna' in config['options']:
+        rnarents = list(set([os.path.abspath(os.path.dirname(p)) for p in config['options']['rna'] if os.path.exists(os.path.dirname(p)) and os.path.isdir(os.path.dirname(p))]))
+        bindpaths.extend(rnarents)
+
+    if 'options' in config and 'output' in config['options']:
+        if os.path.exists(config['options']['output']) and os.path.isdir(config['options']['output']):
+            bindpaths.append(os.path.abspath(config['options']['output']))
+
+    if 'tmp_dir' in config:
+        bindpaths.append(config['tmp_dir'])
 
     rawdata_bind_paths = [os.path.abspath(p) for p in config['project']['datapath'].split(',')]
     working_directory = os.path.realpath(config['project']['workpath'])
-    return bindpaths
+    return list(set(bindpaths))
 
 
 def mixed_inputs(ifiles):
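
[Review note] The rewritten bind() above drops the recursive unpacked() walk over every config value and binds only known path groups, deduplicating at return with list(set(bindpaths)) — which also scrambles order. If mount ordering ever matters to singularity, dict.fromkeys gives the same dedup while keeping first-seen order; a small sketch of that alternative (not the committed code):

    def dedup(paths):
        # order-preserving de-duplication (dict keys keep insertion order)
        return list(dict.fromkeys(paths))

    print(dedup(['/data/a', '/data/b', '/data/a']))  # ['/data/a', '/data/b']
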
@@ -410,12 +400,27 @@ def add_user_information(config):
 def add_sample_metadata(input_files, config, rna_files=None, group=None):
-    """
+    """Adds sample metadata such as sample basename, label, and group information.
+    If sample sheet is provided, it will default to using information in that file.
+    If no sample sheet is provided, it will only add sample basenames and labels.
+    @params input_files list[]:
+        List containing pipeline input fastq files
+    @params config :
+        Config dictionary containing metadata to run pipeline
+    @params group :
+        Sample sheet containing basename, group, and label for each sample
+    @return config :
+        Updated config with basenames, labels, and groups (if provided)
     """
     added = []
     config['samples'] = []
-
-
+    for file in input_files:
+        # Split sample name on file extension
+        sample = re.split('\.R[12]\.fastq\.gz', os.path.basename(file))[0]
+        if sample not in added:
+            # Only add PE sample information once
+            added.append(sample)
+            config['samples'].append(sample)
     return config
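
[Review note] The split pattern above works, but a backslash escape in a non-raw string ('\.R[12]\.fastq\.gz') is fragile and draws DeprecationWarnings on newer Pythons; a raw string is the usual idiom. A quick check with a made-up filename:

    import os
    import re

    fq = '/data/sample1.R1.fastq.gz'  # hypothetical input path
    stem = re.split(r'\.R[12]\.fastq\.gz', os.path.basename(fq))[0]
    assert stem == 'sample1'
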
@@ -446,15 +451,11 @@ def add_rawdata_information(sub_args, config, ifiles):
     config['project']['filetype'] = convert[nends]
 
     # Finds the set of rawdata directories to bind
-    rawdata_paths = get_rawdata_bind_paths(input_files = sub_args.input + sub_args.rna)
+    rawdata_paths = get_rawdata_bind_paths(input_files = sub_args.input)
     config['project']['datapath'] = ','.join(rawdata_paths)
 
     # Add each sample's basename
-
-    if 'rna' in ifiles and ifiles['rna']:
-        config = add_sample_metadata(ifiles['dna'], config, rna_files=ifiles['rna'])
-    else:
-        config = add_sample_metadata(ifiles['dna'], config)
+    config = add_sample_metadata(ifiles['dna'], config)
 
     return config
 
@@ -813,7 +814,7 @@ def runner(
         # Add Bind PATHs for outdir and tmp dir
         if bindpaths:
             bindpaths = ",{}".format(bindpaths)
-        bindpaths = "{}{}".format(additional_bind_paths,bindpaths)
+        bindpaths = "{}{}".format(additional_bind_paths, bindpaths)
 
     if not exists(os.path.join(outdir, 'logfiles')):
         # Create directory for logfiles
diff --git a/workflow/rules/metawrap.smk b/workflow/rules/metawrap.smk
index e10d5d0..7973957 100644
--- a/workflow/rules/metawrap.smk
+++ b/workflow/rules/metawrap.smk
@@ -60,8 +60,8 @@ rule metawrap_read_qc:
         - FastQC html report and zip file on trimmed data
     """
     input:
-        R1 = join(datapath, "{name}_R1.fastq.gz"),
-        R2 = join(datapath, "{name}_R2.fastq.gz"),
+        R1 = expand(join(datapath, "{name}_R1.fastq.gz"), name=config['samples']),
+        R2 = expand(join(datapath, "{name}_R2.fastq.gz"), name=config['samples']),
     output:
         R1_bmtagger_report = join(top_readqc_dir, "{name}", "{name}.R1_bmtagger_report.html"),
        R2_bmtagger_report = join(top_readqc_dir, "{name}", "{name}.R2_bmtagger_report.html"),
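
[Review note] Wrapping the read-qc inputs in expand() while the outputs still use the {name} wildcard makes every per-sample job depend on every sample's fastqs. expand() is plain string substitution, easy to confirm in isolation, assuming snakemake is installed (PATCH 19 below reworks these inputs, though the same pattern reappears there via start_r1/start_r2):

    from snakemake.io import expand

    print(expand('fastqs/{name}_R1.fastq.gz', name=['s1', 's2']))
    # ['fastqs/s1_R1.fastq.gz', 'fastqs/s2_R1.fastq.gz']
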
From 14090fd35b88f29d2b9b2a3dc9e6d4abd7629867 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Mon, 22 Jan 2024 16:47:27 -0500
Subject: [PATCH 03/32] fix: remove weird characters in config

---
 docker/metawrap/config-metawrap_1.3.2 | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/docker/metawrap/config-metawrap_1.3.2 b/docker/metawrap/config-metawrap_1.3.2
index 8d2e8bd..17d9f41 100755
--- a/docker/metawrap/config-metawrap_1.3.2
+++ b/docker/metawrap/config-metawrap_1.3.2
@@ -1,19 +1,14 @@
-#!/bin/bash
 # Paths to custon pipelines and scripts of metaWRAP
 mw_path=$(which metawrap)
 bin_path=${mw_path%/*}
 SOFT=${bin_path}/metawrap-scripts
 PIPES=${bin_path}/metawrap-modules
-
 # OPTIONAL databases (see 'Databases' section of metaWRAP README for details)
 # path to kraken standard databases
 KRAKEN_DB=~/KRAKEN_DB
 KRAKEN2_DB=~/KRAKEN_DB2
-
 # path to indexed human (or other host) genome (see metaWRAP website for guide). This includes .bitmask and .srprism files
 BMTAGGER_DB=~/BMTAGGER_DB
-
 # paths to BLAST databases
 BLASTDB=~/NCBI_NT_DB
 TAXDUMP=~/NCBI_TAX_DB
-

From 25e993e573c68b3c940636ac61a626311fb3d725 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Mon, 22 Jan 2024 15:49:53 -0600
Subject: [PATCH 04/32] fix: update metawrap docker 1.3.2 with additional
 applications

---
 docker/metawrap/docker_1.3.2 | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docker/metawrap/docker_1.3.2 b/docker/metawrap/docker_1.3.2
index 02620a5..ec087d6 100644
--- a/docker/metawrap/docker_1.3.2
+++ b/docker/metawrap/docker_1.3.2
@@ -1,18 +1,18 @@
 FROM condaforge/miniforge3:latest
+RUN apt-get update; apt-get install -y -qq curl vim
 RUN mamba create -y -n metawrap-env
 RUN conda config --add channels defaults; conda config --add channels conda-forge; \
     conda config --add channels bioconda;conda config --add channels ursky
-RUN echo "export PATH=\"$PATH:/home/metaWRAP/bin\"" >> /etc/bash.bashrc
 # RUN mamba install -y -n metawrap-env biopython blas=2.5 blast=2.6.0 bmtagger bowtie2 bwa checkm-genome \
 #     fastqc kraken=1.1 kraken=2.0 krona=2.7 matplotlib maxbin2 megahit metabat2 pandas \
 #     prokka quast r-ggplot2 r-recommended salmon samtools=1.9 seaborn spades trim-galore
 RUN mamba install -y --only-deps -c ursky -n metawrap-env metawrap-mg==1.3.2
 RUN cd /home; git clone https://github.com/bxlab/metaWRAP.git; chmod -R 777 metaWRAP
-RUN sed '$d' /etc/skel/.bashrc; sed '$d' ~/.bashrc
 RUN echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/skel/.bashrc && \
     echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate metawrap-env" >> ~/.bashrc
-ENV PATH="/home/metawrap/bin:$PATH"
+ENV PATH="/home/metaWRAP/bin:$PATH"
 COPY docker/metawrap/config-metawrap_1.3.2 /home/metaWRAP/bin/config-metawrap
 RUN chmod u+x /home/metaWRAP/bin/config-metawrap
 RUN mamba run -n metawrap-env pip3 install drep
+ENV BASH_ENV=/etc/skel/.bashrc
 ENTRYPOINT ["tini", "--", "/bin/bash", "--rcfile", "/etc/skel/.bashrc"]
From 20925508ed800881be28580abcba686099f5a6b5 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Tue, 23 Jan 2024 09:30:15 -0600
Subject: [PATCH 05/32] fix: add dos2unix conversion of config

---
 docker/metawrap/{config-metawrap_1.3.2 => config-metawrap} | 0
 docker/metawrap/docker_1.3.2                               | 5 +++--
 2 files changed, 3 insertions(+), 2 deletions(-)
 rename docker/metawrap/{config-metawrap_1.3.2 => config-metawrap} (100%)

diff --git a/docker/metawrap/config-metawrap_1.3.2 b/docker/metawrap/config-metawrap
similarity index 100%
rename from docker/metawrap/config-metawrap_1.3.2
rename to docker/metawrap/config-metawrap
diff --git a/docker/metawrap/docker_1.3.2 b/docker/metawrap/docker_1.3.2
index ec087d6..08666f5 100644
--- a/docker/metawrap/docker_1.3.2
+++ b/docker/metawrap/docker_1.3.2
@@ -1,5 +1,5 @@
 FROM condaforge/miniforge3:latest
-RUN apt-get update; apt-get install -y -qq curl vim
+RUN apt-get update; apt-get install -y -qq curl vim dos2unix
 RUN mamba create -y -n metawrap-env
 RUN conda config --add channels defaults; conda config --add channels conda-forge; \
     conda config --add channels bioconda;conda config --add channels ursky
@@ -11,7 +11,8 @@ RUN cd /home; git clone https://github.com/bxlab/metaWRAP.git; chmod -R 777 meta
 RUN echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/skel/.bashrc && \
     echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate metawrap-env" >> ~/.bashrc
 ENV PATH="/home/metaWRAP/bin:$PATH"
-COPY docker/metawrap/config-metawrap_1.3.2 /home/metaWRAP/bin/config-metawrap
+COPY docker/metawrap/config-metawrap /home/metaWRAP/bin/config-metawrap
+RUN dos2unix /home/metaWRAP/bin/config-metawrap
 RUN chmod u+x /home/metaWRAP/bin/config-metawrap
 RUN mamba run -n metawrap-env pip3 install drep
 ENV BASH_ENV=/etc/skel/.bashrc

From 3106bacb5ebb339bff98375bd5e2372ef0193248 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Tue, 23 Jan 2024 14:18:21 -0500
Subject: [PATCH 06/32] fix: remove tini from docker ep

---
 docker/metawrap/docker_1.3.2 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/metawrap/docker_1.3.2 b/docker/metawrap/docker_1.3.2
index 08666f5..a255707 100644
--- a/docker/metawrap/docker_1.3.2
+++ b/docker/metawrap/docker_1.3.2
@@ -16,4 +16,4 @@ RUN dos2unix /home/metaWRAP/bin/config-metawrap
 RUN chmod u+x /home/metaWRAP/bin/config-metawrap
 RUN mamba run -n metawrap-env pip3 install drep
 ENV BASH_ENV=/etc/skel/.bashrc
-ENTRYPOINT ["tini", "--", "/bin/bash", "--rcfile", "/etc/skel/.bashrc"]
+ENTRYPOINT ["/bin/bash"]
From 01f15037173177cb7f502c836d3d15b7db87bb67 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Tue, 23 Jan 2024 15:18:59 -0500
Subject: [PATCH 07/32] fix: metawrap add wrapper script, remove 1.3.0 docker,
 add dockerfile and md5sum to container

---
 docker/Dockerfile            | 17 -----------------
 docker/metawrap/Dockerfile   | 15 +++++++++++++++
 docker/metawrap/docker_1.3.0 |  1 -
 docker/metawrap/docker_1.3.2 | 12 +++++-------
 docker/metawrap/mw           |  5 +++++
 5 files changed, 25 insertions(+), 25 deletions(-)
 delete mode 100644 docker/Dockerfile
 create mode 100644 docker/metawrap/Dockerfile
 delete mode 100644 docker/metawrap/docker_1.3.0
 create mode 100755 docker/metawrap/mw

diff --git a/docker/Dockerfile b/docker/Dockerfile
deleted file mode 100644
index 7f40036..0000000
--- a/docker/Dockerfile
+++ /dev/null
@@ -1,17 +0,0 @@
-FROM mambaorg/micromamba
-
-# install sudo, update system pkgs
-USER root
-RUN apt-get update; apt-get upgrade; apt-get install -y sudo vim debianutils bash
-RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
-RUN echo 'routsongrm ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers
-
-# $MAMBA_USER from inherited image
-RUN usermod -a -G sudo $MAMBA_USER
-RUN passwd -d $MAMBA_USER
-
-# back to mamba user
-USER $MAMBA_USER
-
-# setup up metagenome environment
-COPY assets /assets
diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile
new file mode 100644
index 0000000..86b0b73
--- /dev/null
+++ b/docker/metawrap/Dockerfile
@@ -0,0 +1,15 @@
+FROM condaforge/miniforge3:latest
+RUN apt-get update; apt-get install -y -qq curl vim dos2unix bash
+RUN mamba create -y -n metawrap-env
+RUN conda config --add channels defaults; conda config --add channels conda-forge; \
+    conda config --add channels bioconda;conda config --add channels ursky
+RUN mamba install -y --only-deps -c ursky -n metawrap-env metawrap-mg==1.3.2
+RUN cd /home; git clone https://github.com/bxlab/metaWRAP.git; chmod -R 777 metaWRAP
+ENV PATH="/home/metaWRAP/bin:$PATH"
+COPY docker/metawrap/config-metawrap /home/metaWRAP/bin/config-metawrap
+ADD docker/metawrap/mw /home/metaWRAP/bin/mw
+RUN dos2unix /home/metaWRAP/bin/config-metawrap
+RUN chmod a+rx /home/metaWRAP/bin/config-metawrap /home/metaWRAP/bin/mw
+RUN mamba run -n metawrap-env pip3 install drep
+ENV BASH_ENV=/etc/skel/.bashrc
+ENTRYPOINT ["/bin/bash"]
diff --git a/docker/metawrap/docker_1.3.0 b/docker/metawrap/docker_1.3.0
deleted file mode 100644
index 99f9d3d..0000000
--- a/docker/metawrap/docker_1.3.0
+++ /dev/null
@@ -1 +0,0 @@
-FROM quay.io/biocontainers/metawrap-mg:1.3.0--hdfd78af_1
\ No newline at end of file
diff --git a/docker/metawrap/docker_1.3.2 b/docker/metawrap/docker_1.3.2
index a255707..202104c 100644
--- a/docker/metawrap/docker_1.3.2
+++ b/docker/metawrap/docker_1.3.2
@@ -1,19 +1,17 @@
 FROM condaforge/miniforge3:latest
-RUN apt-get update; apt-get install -y -qq curl vim dos2unix
+RUN apt-get update; apt-get install -y -qq curl vim dos2unix bash
 RUN mamba create -y -n metawrap-env
 RUN conda config --add channels defaults; conda config --add channels conda-forge; \
     conda config --add channels bioconda;conda config --add channels ursky
-# RUN mamba install -y -n metawrap-env biopython blas=2.5 blast=2.6.0 bmtagger bowtie2 bwa checkm-genome \
-#     fastqc kraken=1.1 kraken=2.0 krona=2.7 matplotlib maxbin2 megahit metabat2 pandas \
-#     prokka quast r-ggplot2 r-recommended salmon samtools=1.9 seaborn spades trim-galore
 RUN mamba install -y --only-deps -c ursky -n metawrap-env metawrap-mg==1.3.2
 RUN cd /home; git clone https://github.com/bxlab/metaWRAP.git; chmod -R 777 metaWRAP
-RUN echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/skel/.bashrc && \
-    echo ". ${CONDA_DIR}/etc/profile.d/conda.sh && conda activate metawrap-env" >> ~/.bashrc
 ENV PATH="/home/metaWRAP/bin:$PATH"
 COPY docker/metawrap/config-metawrap /home/metaWRAP/bin/config-metawrap
+COPY docker/metawrap/Dockerfile /Dockerfile
+RUN md5sum Dockerfile > /Dockerfile.md5
+ADD docker/metawrap/mw /home/metaWRAP/bin/mw
 RUN dos2unix /home/metaWRAP/bin/config-metawrap
-RUN chmod u+x /home/metaWRAP/bin/config-metawrap
+RUN chmod a+rx /home/metaWRAP/bin/config-metawrap /home/metaWRAP/bin/mw
 RUN mamba run -n metawrap-env pip3 install drep
 ENV BASH_ENV=/etc/skel/.bashrc
 ENTRYPOINT ["/bin/bash"]
diff --git a/docker/metawrap/mw b/docker/metawrap/mw
new file mode 100755
index 0000000..1f3e023
--- /dev/null
+++ b/docker/metawrap/mw
@@ -0,0 +1,5 @@
+#!/bin/bash
+set +eu
+source /opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env
+set -eu
+/home/metaWRAP/bin/metawrap "$@"
\ No newline at end of file
From 033b61b83c2013da7cccb895190f36758021a9dd Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Tue, 23 Jan 2024 17:55:53 -0500
Subject: [PATCH 08/32] fix: run dos2unix on mw executable

---
 docker/metawrap/docker_1.3.2 | 6 ++++--
 docker/metawrap/mw           | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/docker/metawrap/docker_1.3.2 b/docker/metawrap/docker_1.3.2
index 202104c..29d00ce 100644
--- a/docker/metawrap/docker_1.3.2
+++ b/docker/metawrap/docker_1.3.2
@@ -11,7 +11,9 @@ COPY docker/metawrap/Dockerfile /Dockerfile
 RUN md5sum Dockerfile > /Dockerfile.md5
 ADD docker/metawrap/mw /home/metaWRAP/bin/mw
 RUN dos2unix /home/metaWRAP/bin/config-metawrap
+RUN dos2unix /home/metaWRAP/bin/mw
 RUN chmod a+rx /home/metaWRAP/bin/config-metawrap /home/metaWRAP/bin/mw
 RUN mamba run -n metawrap-env pip3 install drep
-ENV BASH_ENV=/etc/skel/.bashrc
-ENTRYPOINT ["/bin/bash"]
+RUN echo ". /opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/bash.bashrc
+ENV BASH_ENV=/etc/bash.bashrc
+ENTRYPOINT ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/metawrap/mw b/docker/metawrap/mw
index 1f3e023..c5fda92 100755
--- a/docker/metawrap/mw
+++ b/docker/metawrap/mw
@@ -2,4 +2,4 @@
 set +eu
 source /opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env
 set -eu
-/home/metaWRAP/bin/metawrap "$@"
\ No newline at end of file
+/home/metaWRAP/bin/metawrap "$@"
From aaf18463c72e80a19274e479e7c6f2491ed19670 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Tue, 23 Jan 2024 18:10:27 -0500
Subject: [PATCH 09/32] fix: add new dockerfile remove old one

---
 docker/metawrap/Dockerfile   |  8 ++++++--
 docker/metawrap/docker_1.3.2 | 19 -------------------
 2 files changed, 6 insertions(+), 21 deletions(-)
 delete mode 100644 docker/metawrap/docker_1.3.2

diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile
index 86b0b73..29d00ce 100644
--- a/docker/metawrap/Dockerfile
+++ b/docker/metawrap/Dockerfile
@@ -7,9 +7,13 @@ RUN mamba install -y --only-deps -c ursky -n metawrap-env metawrap-mg==1.3.2
 RUN cd /home; git clone https://github.com/bxlab/metaWRAP.git; chmod -R 777 metaWRAP
 ENV PATH="/home/metaWRAP/bin:$PATH"
 COPY docker/metawrap/config-metawrap /home/metaWRAP/bin/config-metawrap
+COPY docker/metawrap/Dockerfile /Dockerfile
+RUN md5sum Dockerfile > /Dockerfile.md5
 ADD docker/metawrap/mw /home/metaWRAP/bin/mw
 RUN dos2unix /home/metaWRAP/bin/config-metawrap
+RUN dos2unix /home/metaWRAP/bin/mw
 RUN chmod a+rx /home/metaWRAP/bin/config-metawrap /home/metaWRAP/bin/mw
 RUN mamba run -n metawrap-env pip3 install drep
-ENV BASH_ENV=/etc/skel/.bashrc
-ENTRYPOINT ["/bin/bash"]
+RUN echo ". /opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/bash.bashrc
+ENV BASH_ENV=/etc/bash.bashrc
+ENTRYPOINT ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/metawrap/docker_1.3.2 b/docker/metawrap/docker_1.3.2
deleted file mode 100644
index 29d00ce..0000000
--- a/docker/metawrap/docker_1.3.2
+++ /dev/null
@@ -1,19 +0,0 @@
-FROM condaforge/miniforge3:latest
-RUN apt-get update; apt-get install -y -qq curl vim dos2unix bash
-RUN mamba create -y -n metawrap-env
-RUN conda config --add channels defaults; conda config --add channels conda-forge; \
-    conda config --add channels bioconda;conda config --add channels ursky
-RUN mamba install -y --only-deps -c ursky -n metawrap-env metawrap-mg==1.3.2
-RUN cd /home; git clone https://github.com/bxlab/metaWRAP.git; chmod -R 777 metaWRAP
-ENV PATH="/home/metaWRAP/bin:$PATH"
-COPY docker/metawrap/config-metawrap /home/metaWRAP/bin/config-metawrap
-COPY docker/metawrap/Dockerfile /Dockerfile
-RUN md5sum Dockerfile > /Dockerfile.md5
-ADD docker/metawrap/mw /home/metaWRAP/bin/mw
-RUN dos2unix /home/metaWRAP/bin/config-metawrap
-RUN dos2unix /home/metaWRAP/bin/mw
-RUN chmod a+rx /home/metaWRAP/bin/config-metawrap /home/metaWRAP/bin/mw
-RUN mamba run -n metawrap-env pip3 install drep
-RUN echo ". /opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/bash.bashrc
-ENV BASH_ENV=/etc/bash.bashrc
-ENTRYPOINT ["/bin/bash"]
\ No newline at end of file

From dade6442bc42a52bb78abdc5231a0f0f5a6cc4f3 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Tue, 23 Jan 2024 18:24:03 -0500
Subject: [PATCH 10/32] fix: move drep install up

---
 docker/metawrap/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile
index 29d00ce..c4532c4 100644
--- a/docker/metawrap/Dockerfile
+++ b/docker/metawrap/Dockerfile
@@ -5,6 +5,7 @@ RUN conda config --add channels defaults; conda config --add channels conda-forg
     conda config --add channels bioconda;conda config --add channels ursky
 RUN mamba install -y --only-deps -c ursky -n metawrap-env metawrap-mg==1.3.2
 RUN cd /home; git clone https://github.com/bxlab/metaWRAP.git; chmod -R 777 metaWRAP
+RUN mamba run -n metawrap-env pip3 install drep
 ENV PATH="/home/metaWRAP/bin:$PATH"
 COPY docker/metawrap/config-metawrap /home/metaWRAP/bin/config-metawrap
 COPY docker/metawrap/Dockerfile /Dockerfile
@@ -13,7 +14,6 @@ ADD docker/metawrap/mw /home/metaWRAP/bin/mw
 RUN dos2unix /home/metaWRAP/bin/config-metawrap
 RUN dos2unix /home/metaWRAP/bin/mw
 RUN chmod a+rx /home/metaWRAP/bin/config-metawrap /home/metaWRAP/bin/mw
-RUN mamba run -n metawrap-env pip3 install drep
 RUN echo ". /opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/bash.bashrc
 ENV BASH_ENV=/etc/bash.bashrc
 ENTRYPOINT ["/bin/bash"]
\ No newline at end of file
From 0d7816b108f3ca45982e839db4f601b081dd34bf Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Tue, 23 Jan 2024 20:05:18 -0500
Subject: [PATCH 11/32] fix: remove BASH_ENV

---
 docker/metawrap/Dockerfile | 1 -
 docker/metawrap/mw         | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile
index c4532c4..1d61f68 100644
--- a/docker/metawrap/Dockerfile
+++ b/docker/metawrap/Dockerfile
@@ -15,5 +15,4 @@ RUN dos2unix /home/metaWRAP/bin/config-metawrap
 RUN dos2unix /home/metaWRAP/bin/mw
 RUN chmod a+rx /home/metaWRAP/bin/config-metawrap /home/metaWRAP/bin/mw
 RUN echo ". /opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/bash.bashrc
-ENV BASH_ENV=/etc/bash.bashrc
 ENTRYPOINT ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/metawrap/mw b/docker/metawrap/mw
index c5fda92..85b0833 100755
--- a/docker/metawrap/mw
+++ b/docker/metawrap/mw
@@ -1,5 +1,5 @@
 #!/bin/bash
-set +eu
-source /opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env
 set -eu
+source /etc/bash.bashrc
 /home/metaWRAP/bin/metawrap "$@"
+set +eu
\ No newline at end of file

From 7d8f0b1163ba3cd71fec3da60cbe7c8c70e6dc70 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Tue, 23 Jan 2024 22:18:08 -0500
Subject: [PATCH 12/32] fix: add python2.7 to docker

---
 docker/metawrap/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile
index 1d61f68..39dc134 100644
--- a/docker/metawrap/Dockerfile
+++ b/docker/metawrap/Dockerfile
@@ -1,5 +1,5 @@
 FROM condaforge/miniforge3:latest
-RUN apt-get update; apt-get install -y -qq curl vim dos2unix bash
+RUN apt-get update; apt-get install -y -qq curl vim dos2unix bash python2.7
 RUN mamba create -y -n metawrap-env
 RUN conda config --add channels defaults; conda config --add channels conda-forge; \
     conda config --add channels bioconda;conda config --add channels ursky
From f880d66f63a81da7d6074f3e9d634ba699777e24 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Wed, 24 Jan 2024 10:41:43 -0500
Subject: [PATCH 13/32] fix: set +u instead of -u

---
 docker/metawrap/mw | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/metawrap/mw b/docker/metawrap/mw
index 85b0833..281888d 100755
--- a/docker/metawrap/mw
+++ b/docker/metawrap/mw
@@ -1,5 +1,5 @@
 #!/bin/bash
-set -eu
+set +eu
 source /etc/bash.bashrc
 /home/metaWRAP/bin/metawrap "$@"
-set +eu
\ No newline at end of file
+set -eu
\ No newline at end of file

From 327ef2b3045f38da1f72ac6a134dec08cdfb32ac Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Wed, 24 Jan 2024 10:44:42 -0500
Subject: [PATCH 14/32] fix: add metawrap bin to default bashrc

---
 docker/metawrap/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile
index 39dc134..5aaa99b 100644
--- a/docker/metawrap/Dockerfile
+++ b/docker/metawrap/Dockerfile
@@ -15,4 +15,5 @@ RUN dos2unix /home/metaWRAP/bin/config-metawrap
 RUN dos2unix /home/metaWRAP/bin/mw
 RUN chmod a+rx /home/metaWRAP/bin/config-metawrap /home/metaWRAP/bin/mw
 RUN echo ". /opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/bash.bashrc
+RUN echo 'export PATH="/home/metaWRAP/bin:$PATH"' >> /etc/bash.bashrc
 ENTRYPOINT ["/bin/bash"]
\ No newline at end of file
From e9b2865e29a30edc5b225dd1f6e972637013a9df Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Wed, 24 Jan 2024 12:02:48 -0500
Subject: [PATCH 15/32] fix: add gnu-which to docker

---
 docker/metawrap/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile
index 5aaa99b..5f3ef6a 100644
--- a/docker/metawrap/Dockerfile
+++ b/docker/metawrap/Dockerfile
@@ -1,5 +1,5 @@
 FROM condaforge/miniforge3:latest
-RUN apt-get update; apt-get install -y -qq curl vim dos2unix bash python2.7
+RUN apt-get update; apt-get install -y -qq curl vim dos2unix bash python2.7 gnu-which
 RUN mamba create -y -n metawrap-env
 RUN conda config --add channels defaults; conda config --add channels conda-forge; \
     conda config --add channels bioconda;conda config --add channels ursky

From fd79193588c6cd7d8efa0f58543b9770db83ae43 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Wed, 24 Jan 2024 12:27:57 -0500
Subject: [PATCH 16/32] fix: update binding paths for singularity

---
 docker/metawrap/Dockerfile      |  2 +-
 docker/metawrap/config-metawrap | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile
index 5f3ef6a..5aaa99b 100644
--- a/docker/metawrap/Dockerfile
+++ b/docker/metawrap/Dockerfile
@@ -1,5 +1,5 @@
 FROM condaforge/miniforge3:latest
-RUN apt-get update; apt-get install -y -qq curl vim dos2unix bash python2.7 gnu-which
+RUN apt-get update; apt-get install -y -qq curl vim dos2unix bash python2.7
 RUN mamba create -y -n metawrap-env
 RUN conda config --add channels defaults; conda config --add channels conda-forge; \
     conda config --add channels bioconda;conda config --add channels ursky
diff --git a/docker/metawrap/config-metawrap b/docker/metawrap/config-metawrap
index 17d9f41..0f5484b 100755
--- a/docker/metawrap/config-metawrap
+++ b/docker/metawrap/config-metawrap
@@ -5,10 +5,10 @@ SOFT=${bin_path}/metawrap-scripts
 PIPES=${bin_path}/metawrap-modules
 # OPTIONAL databases (see 'Databases' section of metaWRAP README for details)
 # path to kraken standard databases
-KRAKEN_DB=~/KRAKEN_DB
-KRAKEN2_DB=~/KRAKEN_DB2
+KRAKEN_DB=/data2/KRAKEN_DB
+KRAKEN2_DB=/data2/KRAKEN_DB2
 # path to indexed human (or other host) genome (see metaWRAP website for guide). This includes .bitmask and .srprism files
-BMTAGGER_DB=~/BMTAGGER_DB
+BMTAGGER_DB=/data2/BMTAGGER_DB
 # paths to BLAST databases
-BLASTDB=~/NCBI_NT_DB
-TAXDUMP=~/NCBI_TAX_DB
+BLASTDB=/data2/NCBI_NT_DB
+TAXDUMP=/data2/NCBI_TAX_DB
From 73d319365661a4c88f999b8608161646a1c5ea0c Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Wed, 24 Jan 2024 14:49:16 -0500
Subject: [PATCH 17/32] fix: export PATH at dockerfile

---
 docker/metawrap/Dockerfile      | 2 +-
 docker/metawrap/config-metawrap | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile
index 5aaa99b..e3a7414 100644
--- a/docker/metawrap/Dockerfile
+++ b/docker/metawrap/Dockerfile
@@ -15,5 +15,5 @@ RUN dos2unix /home/metaWRAP/bin/config-metawrap
 RUN dos2unix /home/metaWRAP/bin/mw
 RUN chmod a+rx /home/metaWRAP/bin/config-metawrap /home/metaWRAP/bin/mw
 RUN echo ". /opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/bash.bashrc
-RUN echo 'export PATH="/home/metaWRAP/bin:$PATH"' >> /etc/bash.bashrc
+RUN echo 'export PATH="/home/metaWRAP/bin:/opt/conda/envs/metawrap-env/bin:$PATH"' >> /etc/bash.bashrc
 ENTRYPOINT ["/bin/bash"]
\ No newline at end of file
diff --git a/docker/metawrap/config-metawrap b/docker/metawrap/config-metawrap
index 0f5484b..62bdc74 100755
--- a/docker/metawrap/config-metawrap
+++ b/docker/metawrap/config-metawrap
@@ -11,4 +11,4 @@ KRAKEN2_DB=/data2/KRAKEN_DB2
 BMTAGGER_DB=/data2/BMTAGGER_DB
 # paths to BLAST databases
 BLASTDB=/data2/NCBI_NT_DB
-TAXDUMP=/data2/NCBI_TAX_DB
+TAXDUMP=/data2/NCBI_TAX_DB
\ No newline at end of file

From f2f468adff40f2420c91b531323bd1407702e8f2 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Wed, 24 Jan 2024 16:12:11 -0500
Subject: [PATCH 18/32] fix: add conda bin to path envvar

---
 docker/metawrap/Dockerfile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile
index e3a7414..8ce1300 100644
--- a/docker/metawrap/Dockerfile
+++ b/docker/metawrap/Dockerfile
@@ -6,7 +6,7 @@ RUN conda config --add channels defaults; conda config --add channels conda-forg
 RUN mamba install -y --only-deps -c ursky -n metawrap-env metawrap-mg==1.3.2
 RUN cd /home; git clone https://github.com/bxlab/metaWRAP.git; chmod -R 777 metaWRAP
 RUN mamba run -n metawrap-env pip3 install drep
-ENV PATH="/home/metaWRAP/bin:$PATH"
+ENV PATH="/home/metaWRAP/bin:/opt/conda/envs/metawrap-env/bin:$PATH"
 COPY docker/metawrap/config-metawrap /home/metaWRAP/bin/config-metawrap
 COPY docker/metawrap/Dockerfile /Dockerfile
 RUN md5sum Dockerfile > /Dockerfile.md5
@@ -14,6 +14,7 @@ ADD docker/metawrap/mw /home/metaWRAP/bin/mw
 RUN dos2unix /home/metaWRAP/bin/config-metawrap
 RUN dos2unix /home/metaWRAP/bin/mw
 RUN chmod a+rx /home/metaWRAP/bin/config-metawrap /home/metaWRAP/bin/mw
+ENV BASH_ENV="/etc/bash.bashrc"
 RUN echo ". /opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/bash.bashrc
 RUN echo 'export PATH="/home/metaWRAP/bin:/opt/conda/envs/metawrap-env/bin:$PATH"' >> /etc/bash.bashrc
 ENTRYPOINT ["/bin/bash"]
\ No newline at end of file
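
[Review note] Context for the resources.json changes in the next patch: metamorph turns each entry under "databases" into a singularity bind spec of the form from:to:mode (the list comprehension added to metamorph in PATCH 02 above). Roughly, and assuming the repo-relative path:

    import json

    with open('config/resources.json') as fh:
        dbs = json.load(fh)['databases']
    bindpaths = [f"{m['from']}:{m['to']}:{m['mode']}" for m in dbs]
    print(','.join(bindpaths))
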
From 735eb966c6f899840e43e6a37137187943971082 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Mon, 29 Jan 2024 15:17:59 -0500
Subject: [PATCH 19/32] fix: fix docker + workflow issues, refactor coassembly
 rules

---
 config/cluster.json                      |  12 +-
 config/resources.json                    |  22 ++-
 metamorph                                |  31 ++--
 src/run.py                               | 140 ++----------------
 src/run.sh                               |   7 +-
 workflow/Snakefile                       |  40 ++---
 workflow/coa/Snakefile                   |  40 -----
 workflow/rules/{metawrap.smk => DNA.smk} | 178 ++++++++++++++---------
 workflow/rules/RNA.smk                   |   5 +-
 9 files changed, 175 insertions(+), 300 deletions(-)
 delete mode 100644 workflow/coa/Snakefile
 rename workflow/rules/{metawrap.smk => DNA.smk} (61%)

diff --git a/config/cluster.json b/config/cluster.json
index 02ef370..22906ed 100644
--- a/config/cluster.json
+++ b/config/cluster.json
@@ -3,14 +3,14 @@
         "threads": 4,
         "mem": "8g",
         "partition": "norm",
-        "time": "0-04:00:00"
+        "time": "0-04:00:00",
+        "gres": "lscratch:64"
     },
     "metawrap_read_qc": {
-        "threads": 8,
-        "mem": "16g",
-        "partition": "norm",
-        "time": "0-04:00:00",
-        "gres": "None"
+        "threads": 16,
+        "mem": "32g",
+        "partition": "quick",
+        "time": "0-04:00:00"
     },
     "metawrap_genome_assembly": {
         "threads": 24,
diff --git a/config/resources.json b/config/resources.json
index e56d2a5..27a89ac 100644
--- a/config/resources.json
+++ b/config/resources.json
@@ -2,34 +2,44 @@
     "databases": [
         {
             "name": "KRAKEN_DB2",
-            "to": "$HOME/KRAKEN_DB2",
+            "to": "/data2/KRAKEN_DB2",
             "from": "/data/OpenOmics/references/metamorph/kraken2/k2_pluspfp_08gb_20230605",
             "mode": "ro"
         }, {
             "name": "KRAKEN_DB",
-            "to": "$HOME/KRAKEN_DB",
+            "to": "/data2/KRAKEN_DB",
             "from": "/data/OpenOmics/references/metamorph/kraken/20171019_minikraken_8GB",
             "mode": "ro"
         }, {
             "name": "BMTAGGER_INDEX",
-            "to": "$HOME/BMTAGGER_DB",
+            "to": "/data2/BMTAGGER_DB",
             "from": "/data/OpenOmics/references/metamorph/BMTAGGER/BMTAGGER_INDEX",
             "mode": "ro"
         }, {
             "name": "BMTAGGER_INDEX",
-            "to": "$HOME/GTDBTK_DB",
+            "to": "/data2/GTDBTK_DB",
             "from": "/data/OpenOmics/references/metamorph/GTDBtk/release207_v2",
             "mode": "ro"
         }, {
             "name": "GUNC_DB",
-            "to": "$HOME/GUNC_DB",
+            "to": "/data2/GUNC_DB",
             "from": "/data/OpenOmics/references/metamorph/GUNC/gunc_1.0.5db",
             "mode": "ro"
         }, {
             "name": "CHECKM_DB",
-            "to": "$HOME/checkm",
+            "to": "/data2/checkm",
             "from": "/data/OpenOmics/references/metamorph/checkm",
             "mode": "rw"
+        }, {
+            "name": "NCBI_NT",
+            "to": "/data2/NCBI_NT_DB",
+            "from": "/data/OpenOmics/references/metamorph/NCBI_nt",
+            "mode": "rw"
+        }, {
+            "name": "NCBI_TAX",
+            "to": "/data2/NCBI_TAX_DB",
+            "from": "/data/OpenOmics/references/metamorph/NCBI_tax",
+            "mode": "rw"
         }
     ]
 }
\ No newline at end of file
diff --git a/metamorph b/metamorph
index a403c4a..2795f03 100755
--- a/metamorph
+++ b/metamorph
@@ -27,13 +27,21 @@ in any work or product based on this material.
 USAGE:
   $ metamorph [OPTIONS]
 
-EXAMPLE:
-  co-assembly:
-    $ metamorph run --coa --input *.R?.fastq.gz --output output/
-    $ metamorph run -C --input *.R?.fastq.gz --output output/
-  per-sample assembly:
-    $ metamorph run --input *.R?.fastq.gz --output output/
+EXAMPLES:
+  co-assembly dna-only:
+    $ metamorph run --coa --input *.R?.fastq.gz --output output
+    $ metamorph run -C --input *.R?.fastq.gz --output output
+
+  per-sample assembly dna-only:
+    $ metamorph run --input *.R?.fastq.gz --output output
+
+  co-assembly rna & dna:
+    $ metamorph run --coa --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output
+    $ metamorph run -C --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output
+
+  per-sample assembly rna & dna:
+    $ metamorph run --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output
 """
 
 # Python standard library
@@ -45,7 +53,7 @@ import argparse  # potential python3 3rd party package, added in python/3.5
 
 # Local imports
 from src import version
-from src.run import init, setup, bind, dryrun, runner, run_coa_pipeline
+from src.run import init, setup, bind, dryrun, runner
 from src.utils import (
     Colors,
     err,
@@ -168,14 +176,6 @@ def run(sub_args):
     if 'databases' in config:
         bindpaths.extend([mount['from']+':'+mount['to']+':'+mount['mode'] for mount in config['databases']])
 
-    if sub_args.coa:
-        cjob = run_coa_pipeline(sub_args.mode,
-                                sub_args.output,
-                                sub_args.singularity_cache,
-                                logfh,
-                                sub_args.tmp_dir,
-                                ",".join(bindpaths))
-
     mjob = runner(mode = sub_args.mode,
                   outdir = sub_args.output,
                   # additional_bind_paths = all_bind_paths,
@@ -215,7 +215,6 @@ def run(sub_args):
 
 
 def cache(sub_args):
-
     """Caches remote resources or reference files stored on DockerHub and S3.
     Local SIFs will be created from images defined in 'config/containers/images.json'.
     @TODO: add option to cache other shared S3 resources (i.e. kraken db and fqscreen indices)
diff --git a/src/run.py b/src/run.py
index 31b3521..c6df6c2 100644
--- a/src/run.py
+++ b/src/run.py
@@ -4,8 +4,8 @@
 # Python standard library
 from __future__ import print_function
 from shutil import copytree
-from uuid import uuid4
 from datetime import datetime
+from pathlib import Path
 import os, re, json, sys, subprocess
 
 # Local imports
@@ -311,10 +311,11 @@ def bind(sub_args, config):
         List of singularity/docker bind paths
     """
     bindpaths = []
+    resolve = lambda x: str(Path(x).resolve())
 
     if 'databases' in config:
         dbs = config.pop('databases')
-        bindpaths.extend([mount['from']+':'+mount['to']+':'+mount['mode'] for mount in dbs])
+        bindpaths.extend([resolve(mount['from'])+':'+resolve(mount['to'])+':'+mount['mode'] for mount in dbs])
 
     if 'options' in config and 'input' in config['options']:
         inrents = list(set([os.path.abspath(os.path.dirname(p)) for p in config['options']['input'] if os.path.exists(os.path.dirname(p)) and os.path.isdir(os.path.dirname(p))]))
@@ -393,8 +394,9 @@ def add_user_information(config):
     # username
     config['project']['userhome'] = home
     config['project']['username'] = username
-    dt = datetime.now().strftime("%m/%d/%Y")
-    config['project']['id'] = f"{uuid4()}_{dt}_metagenome"
+
+    dt = datetime.now().strftime("%m_%d_%Y")
+    config['project']['id'] = f"{dt}_metagenome_results"
 
     return config
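
[Review note] Replacing the uuid4-based project id with a date stamp makes result directories predictable and lets a rerun on the same day reuse paths (at the cost of uniqueness across same-day runs). The new format, as a quick illustration:

    from datetime import datetime

    dt = datetime.now().strftime('%m_%d_%Y')
    print(f'{dt}_metagenome_results')  # e.g. 01_29_2024_metagenome_results
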
@@ -619,130 +621,6 @@ def dryrun(outdir, config='config.json', snakefile=os.path.join('workflow', 'Sna
     return dryrun_output
 
 
-def run_coa_pipeline(mode, outdir, alt_cache, logger, tmp_dir, additional_bind_paths):
-    # gzip compression speed: ~10.5 MB/s
-    # see: https://tukaani.org/lzma/benchmarks.html
-    # large fastq ~20GB
-    # 96 samples x 2 (R1 + R2) = 192 fastqs
-    # total size (high estimate): 192 fastqs * 20GB/fastq = 3,840 GB = 3,840,000 MB
-    # total compression time: 3,840,000 MB / (10.5 MB/s) = 365714 s = ~ 4 days
-    # pigz ~6.5 times faster than gzip
-    # https://github.com/neurolabusc/pigz-bench
-    # 3,840,000 MB / (10.5 MB/s * 6.5) = ~ 15.6 hours
-    # ~~~~~~~~~~~~~~
-
-    # Add additional singularity bind PATHs
-    # to mount the local filesystem to the
-    # containers filesystem, NOTE: these
-    # PATHs must be an absolute PATHs
-    outdir = os.path.abspath(outdir).strip()
-    # Add any default PATHs to bind to
-    # the container's filesystem, like
-    # tmp directories, /lscratch
-    addpaths = []
-    temp = os.path.dirname(tmp_dir.rstrip('/'))
-    if temp == os.sep:
-        temp = tmp_dir.rstrip('/')
-    if outdir not in additional_bind_paths.split(','):
-        addpaths.append(outdir)
-    if temp not in additional_bind_paths.split(','):
-        addpaths.append(temp)
-    bindpaths = ','.join(addpaths)
-
-    # Set ENV variable 'SINGULARITY_CACHEDIR'
-    # to output directory
-    my_env = {}; my_env.update(os.environ)
-    cache = os.path.join(outdir, ".singularity")
-    my_env['SINGULARITY_CACHEDIR'] = cache
-    if alt_cache:
-        # Override the pipeline's default
-        # cache location
-        my_env['SINGULARITY_CACHEDIR'] = alt_cache
-        cache = alt_cache
-
-    if additional_bind_paths:
-        # Add Bind PATHs for outdir and tmp dir
-        if bindpaths:
-            bindpaths = ",{}".format(bindpaths)
-        bindpaths = "{}{}".format(additional_bind_paths,bindpaths)
-
-    if not exists(os.path.join(outdir, 'logfiles')):
-        # Create directory for logfiles
-        os.makedirs(os.path.join(outdir, 'logfiles'))
-
-    # Create .singularity directory for
-    # installations of snakemake without
-    # setuid which creates a sandbox in
-    # the SINGULARITY_CACHEDIR
-    if not exists(cache):
-        # Create directory for sandbox
-        # and image layers
-        os.makedirs(cache, mode=0o755)
-
-    snakefile = os.path.abspath(os.path.join(__file__, '..', 'workflow', 'coa', 'Snakefile'))
-    slurm_dir = os.path.abspath(os.path.join(outdir, 'slurm'))
-    if not os.path.exists(slurm_dir):
-        os.mkdir(slurm_dir, mode=0o755)
-
-    CLUSTER_OPTS = "sbatch --gres {cluster.gres}" + \
-                   " --cpus-per-task {cluster.threads}" + \
-                   " -p {cluster.partition}" + \
-                   " -t {cluster.time}" + \
-                   " --mem {cluster.mem}" + \
-                   " --job-name={params.rname}" + \
-                   " -e $SLURM_DIR/slurm-%j_{params.rname}.out" + \
-                   " -o $SLURM_DIR/slurm-%j_{params.rname}.out"
-
-    sbatch_params = [
-        "#SBATCH --cpus-per-task=28",
-        "#SBATCH --mem=64g",
-        "#SBATCH --time=10-00:00:00",
-        "#SBATCH -p norm",
-        "#SBATCH --parsable",
-        "#SBATCH -J \"metagenome_coa\"",
-        "#SBATCH --mail-type=BEGIN,END,FAIL",
-        "#SBATCH --output \"" + outdir + "/logfiles/snakemake.log\"",
-        "#SBATCH --error \"" + outdir + "/logfiles/snakemake.log\"",
-    ]
-
-    jobscript = [
-        "#!/usr/bin/env bash",
-        "module load snakemake singularity",
-        "snakemake \\",
-        "--latency-wait 120 \\",
-        "-s " + snakefile + " \\",
-        "-d \"{outdir}\" \\",
-        "--use-singularity \\",
-        "--singularity-args \"'-B " + bindpaths + "'\" \\",
-        "--configfile=\"" + outdir + "/config.json\" \\",
-        "--printshellcmds \\",
-        "--cluster-config \"" + outdir + "/resources/cluster.json\" \\",
-        "--cluster \"" + CLUSTER_OPTS + "\" \\",
-        "--keep-going \\",
-        "--restart-times 3 \\",
-        "-j 500 \\",
-        "--rerun-incomplete --stats \"" + outdir + "/logfiles/runtime_statistics.json\" \\",
-        "--keep-remote \\",
-        "--local-cores 28 2>&1 | tee -a \"" + outdir + "/logfiles/master.log\"",
-    ]
-
-    exec_sh = 'bash'
-    if mode == 'slurm':
-        exec_sh = 'sbatch'
-        jobscript = [jobscript[0], *sbatch_params, *jobscript[1:]]
-
-    coa_jobscript = os.path.join(slurm_dir, 'jobscript.sh')
-    with open(coa_jobscript, 'w') as fo:
-        fo.write("\n".join(jobscript))
-
-    coajob = subprocess.Popen([
-        exec_sh, str(coa_jobscript)
-    ], cwd = outdir, stderr=subprocess.STDOUT, stdout=logger, env=my_env)
-
-    coajob.wait()
-    return coajob.returncode
-
-
 try:
     __job_name__ = 'metamorph_' + os.getlogin() + ':master'
 except OSError:
@@ -837,9 +715,11 @@ def runner(
     # replacing Popen subprocess with a direct
     # snakemake API call: https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html
     masterjob = subprocess.Popen([
-        'snakemake', '-pr', '--rerun-incomplete',
+        'snakemake', '-pr',
+        #'--rerun-incomplete',
+        '--verbose',
         '--use-singularity',
-        '--singularity-args', "'-B {}'".format(bindpaths),
+        '--singularity-args', "\\-C \\-B '{}'".format(bindpaths),
         '--cores', str(threads),
         '--configfile=config.json'
     ], cwd = outdir, stderr=subprocess.STDOUT, stdout=logger, env=my_env)
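
[Review note] The backslashes in the new --singularity-args value appear to keep the leading dash from being parsed as a snakemake option, and -C (singularity's --containall) isolates the container from the host environment; both readings are inferred from the diff, not documented in it. What snakemake actually receives, with an illustrative bindpaths value:

    bindpaths = '/data/in:/data/in:ro,/scratch'  # made-up example
    print("\\-C \\-B '{}'".format(bindpaths))
    # \-C \-B '/data/in:/data/in:ro,/scratch'
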
"Snakemake_Report.html" diff --git a/workflow/Snakefile b/workflow/Snakefile index 617cf7d..f8a0c3a 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -5,9 +5,10 @@ from os import listdir from scripts.common import allocated, provided, references, str_bool -workpath = config["project"]["workpath"] # Pipeline"s output directory -tmpdir = config["options"]["tmp_dir"] # Temporary directory -samples = config["samples"] # List containing basename of samples +workpath = config["project"]["workpath"] # Pipeline"s output directory +tmpdir = config["options"]["tmp_dir"] # Temporary directory +samples = config["samples"] # List containing basename of samples +top_readqc_dir = join(workpath, config['project']['id'], "metawrap_read_qc") # Read in resource information, containing information about threads, mem, walltimes, etc. @@ -25,28 +26,16 @@ rule all: input: # read qc and filtering # ~~~~~~~~~~~~~~~ - expand( - join( - workpath, config['project']['id'], "metawrap_qc", "{name}", "{name}.R1_bmtagger_report.html" - ), - name=samples, - ), - expand( - join( - workpath, config['project']['id'], "metawrap_qc", "{name}", "{name}.R2_bmtagger_report.html" - ), - name=samples, - ), + expand(join(top_readqc_dir, "{name}", "{name}_R1_pretrim_report.html"), name=samples), + expand(join(top_readqc_dir, "{name}", "{name}_R2_pretrim_report.html"), name=samples), + expand(join(top_readqc_dir, "{name}", "{name}_R1_postrim_report.html"), name=samples), + expand(join(top_readqc_dir, "{name}", "{name}_R2_postrim_report.html"), name=samples), + expand(join(top_readqc_dir, "{name}", "{name}_R1_trimmed.fastq.gz"), name=samples), + expand(join(top_readqc_dir, "{name}", "{name}_R2_trimmed.fastq.gz"), name=samples), # genome assembly # ~~~~~~~~~~~~~~~ - expand( - join(workpath, config['project']['id'], "metawrap_assembly", "{name}", "final_assembly.fasta"), - name=samples, - ), - expand( - join(workpath, config['project']['id'], "metawrap_assembly", "{name}", "assembly_report.html"), - name=samples, - ), + expand(join(workpath, config['project']['id'], "metawrap_assembly", "{name}", "final_assembly.fasta"), name=samples), + expand(join(workpath, config['project']['id'], "metawrap_assembly", "{name}", "assembly_report.html"), name=samples), # taxonomic classification # ~~~~~~~~~~~~~~~ expand(join(workpath, config['project']['id'], "metawrap_kmer", "{name}", "final_assembly.krona"), name=samples), @@ -67,6 +56,7 @@ rule all: expand(join(workpath, config['project']['id'], "metawrap_bin_refine", "{name}", "dereplicated_bins"), name=samples), - # Import rules -include: join("rules", "metawrap.smk") \ No newline at end of file +include: join("rules", "DNA.smk") +include: join("rules", "RNA.smk") +include: join("rules", "hooks.smk") \ No newline at end of file diff --git a/workflow/coa/Snakefile b/workflow/coa/Snakefile deleted file mode 100644 index f125d53..0000000 --- a/workflow/coa/Snakefile +++ /dev/null @@ -1,40 +0,0 @@ -# ~~~~~~~~~~ -# Metagenome read concatenation -# ~~~~~~~~~~ -ruleorder: - cat_reads < decompress_reads - - -rule all: - input: - expand(join(workpath, "{name}.R{rnum}.fastq.gz", name=samples)), - - - -rule decompress_reads: - input: - all_reads = join(workpath, "{name}.R{rnum}.fastq.gz") - output: - all_uncompressed_read = join(workpath, "{name}.R{rnum}.fastq.gz") - shell: - "gunzip {input.all_reads}" - - -rule cat_reads_and_compress: - input: - all_r1_reads = expand(join(workpath, "{name}.R1.fastq")) - all_r2_reads = expand(join(workpath, "{name}.R2.fastq")) - output: - big_read_r1 = join(workpath, 
"coa.R1.fastq.gz") - big_read_r2 = join(workpath, "coa.R1.fastq.gz") - params: - uncompressed_r1 = lambda wc, thisout: Path(thisout.big_read_r1).name[:-3] - uncompressed_r2 = lambda wc, thisout: Path(thisout.big_read_r2).name[:-3] - threads: 28 - shell: - """ - cat {input.all_r1_reads} > {params.uncompressed_r1} - cat {input.all_r2_reads} > {params.uncompressed_r2} - pigz -9 {params.uncompressed_r1} - pigz -9 {params.uncompressed_r2} - """ \ No newline at end of file diff --git a/workflow/rules/metawrap.smk b/workflow/rules/DNA.smk similarity index 61% rename from workflow/rules/metawrap.smk rename to workflow/rules/DNA.smk index 7973957..fd5bc76 100644 --- a/workflow/rules/metawrap.smk +++ b/workflow/rules/DNA.smk @@ -12,8 +12,9 @@ workpath = config["project"]["workpath"] datapath = config["project"]["datapath"] default_threads = cluster["__default__"]['threads'] default_memory = cluster["__default__"]['mem'] -top_log_dir = join(workpath, config['project']['id'], "logs") -top_readqc_dir = join(workpath, config['project']['id'], "metawrap_qc") +top_log_dir = join(workpath, "logs") +top_readqc_dir = join(workpath, config['project']['id'], "metawrap_read_qc") +top_trim_dir = join(workpath, config['project']['id'], "trimmed_reads") top_assembly_dir = join(workpath, config['project']['id'], "metawrap_assembly") top_tax_dir = join(workpath, config['project']['id'], "metawrap_kmer") top_binning_dir = join(workpath, config['project']['id'], "metawrap_binning") @@ -22,6 +23,7 @@ metawrap_container = config["containers"]["metawrap"] """ + 0. concat or no concat 1. read qc 2. assembly 3. binning @@ -30,66 +32,115 @@ metawrap_container = config["containers"]["metawrap"] 6. annotate bins 7. index depreplicated genomes 8. align DNA to assembly -""" +""" + +if config['coassembly']: + start_r1 = join(workpath, "concatenated.R1.fastq") + start_r2 = join(workpath, "concatenated.R1.fastq") + start_dc = [] +else: + start_r1 = expand(join(workpath, "{name}_R1.fastq"), name=config['samples']) + start_r2 = expand(join(workpath, "{name}_R2.fastq"), name=config['samples']) + start_dc = join(datapath, "{stem}.fastq.gz") + + +rule decompress_reads: + input: + this_compressed_read = start_dc + output: + this_uncompressed_read = join(workpath, "{stem}.fastq"), + params: + rname = "decompress_reads", + shell: + "gunzip -c {input.this_compressed_read} > {output.this_uncompressed_read}" + + +rule concat_reads: + input: + all_r1_reads = expand(join(workpath, "{name}.R1.fastq"), name=config['samples']), + all_r2_reads = expand(join(workpath, "{name}.R2.fastq"), name=config['samples']), + output: + big_read_r1 = join(workpath, "concatenated.R1.fastq"), + big_read_r2 = join(workpath, "concatenated.R2.fastq"), + big_compressed_read_r1 = join(workpath, "concatenated.R1.fastq.gz"), + big_compressed_read_r2 = join(workpath, "concatenated.R2.fastq.gz"), + big_read1_hash = join(workpath, "concatenated.R1.md5"), + big_read2_hash = join(workpath, "concatenated.R2.md5"), + params: + rname = "concat_reads", + input_dir = workpath, + threads: 28 + shell: + """ + for fastq in {params.input_dir}\*R1*fastq; do cat $fastq > {output.big_read_r1}; done + for fastq in {params.input_dir}\*R2*fastq; do cat $fastq > {output.big_read_r2}; done + pigz -9 -p 28 -c {params.uncompressed_r1} > {output.big_compressed_read_r1} + pigz -9 -p 28 -c {params.uncompressed_r2} > {output.big_compressed_read_r2} + md5sum {output.big_compressed_read_r1} > {output.big_read1_hash} + md5sum {output.big_compressed_read_r2} > {output.big_read2_hash} + """ rule 
 rule metawrap_read_qc:
     """
-    Metawrap read quality control rule for producing high quality reads for assembly.
+    Metawrap read quality control rule for producing high quality reads for assembly.
 
-    Quality-control step accomplishes three tasks:
-        - Trims adapter sequences from input read pairs (.fastq.gz)
-        - Quality filters from input read pairs
-        - Removes human contamination from input read pairs
-    @Container requirements
-        - [MetaWrap](https://github.com/bxlab/metaWRAP) orchestrates execution of
-          these tools in a psuedo-pipeline:
-            - [FastQC](https://github.com/s-andrews/FastQC)
-            - [bmtagger](ftp://ftp.ncbi.nlm.nih.gov/pub/agarwala/bmtagger/)
-            - [Trim-Galore](https://github.com/FelixKrueger/TrimGalore)
-            - [Cutadapt](https://github.com/marcelm/cutadapt)
+    Quality-control step accomplishes three tasks:
+        - Trims adapter sequences from input read pairs (.fastq.gz)
+        - Quality filters from input read pairs
+        - Removes human contamination from input read pairs
+    @Container requirements
+        - [MetaWrap](https://github.com/bxlab/metaWRAP) orchestrates execution of
+          these tools in a psuedo-pipeline:
+            - [FastQC](https://github.com/s-andrews/FastQC)
+            - [bmtagger](ftp://ftp.ncbi.nlm.nih.gov/pub/agarwala/bmtagger/)
+            - [Trim-Galore](https://github.com/FelixKrueger/TrimGalore)
+            - [Cutadapt](https://github.com/marcelm/cutadapt)
+
+    @Environment specifications
+        - minimum 16gb memory
+        - will load bmtagger index in memory (8gb)
+    @Input:
+        - Raw fastq.gz reads (R1 & R2 per-sample) from the instrument
 
-    @Environment specifications
-        - minimum 16gb memory
-        - will load bmtagger index in memory (8gb)
-    @Input:
-        - Raw fastq.gz reads (R1 & R2 per-sample) from the instrument
-
-    @Outputs:
-        - Trimmed, quality filtered reads (R1 & R2)
-        - FastQC html report and zip file on trimmed data
+    @Outputs:
+        - Trimmed, quality filtered reads (R1 & R2)
+        - FastQC html report and zip file on trimmed data
     """
     input:
-        R1 = expand(join(datapath, "{name}_R1.fastq.gz"), name=config['samples']),
-        R2 = expand(join(datapath, "{name}_R2.fastq.gz"), name=config['samples']),
+        R1 = start_r1,
+        R2 = start_r2,
     output:
-        R1_bmtagger_report = join(top_readqc_dir, "{name}", "{name}.R1_bmtagger_report.html"),
-        R2_bmtagger_report = join(top_readqc_dir, "{name}", "{name}.R2_bmtagger_report.html"),
-        R1_fastqc_report = join(top_readqc_dir, "{name}", "{name}.R1_fastqc_report.html"),
-        R2_fastqc_report = join(top_readqc_dir, "{name}", "{name}.R2_fastqc_report.html"),
-        R1_qc_reads = join(workpath, "{name}", "{name}.R1_readqc.fastq"),
-        R2_qc_reads = join(workpath, "{name}", "{name}.R2_readqc.fastq"),
+        R1_pretrim_report = join(top_readqc_dir, "{name}", "{name}_R1_pretrim_report.html"),
+        R2_pretrim_report = join(top_readqc_dir, "{name}", "{name}_R2_pretrim_report.html"),
+        R1_postrim_report = join(top_readqc_dir, "{name}", "{name}_R1_postrim_report.html"),
+        R2_postrim_report = join(top_readqc_dir, "{name}", "{name}_R2_postrim_report.html"),
+        R1_trimmed = join(top_readqc_dir, "{name}", "{name}_R1_trimmed.fastq"),
+        R2_trimmed = join(top_readqc_dir, "{name}", "{name}_R2_trimmed.fastq"),
+        R1_trimmed_gz = join(top_readqc_dir, "{name}", "{name}_R1_trimmed.fastq.gz"),
+        R2_trimmed_gz = join(top_readqc_dir, "{name}", "{name}_R2_trimmed.fastq.gz"),
     params:
         rname = "metawrap_read_qc",
+        sid = "{name}",
         this_qc_dir = join(top_readqc_dir, "{name}"),
-        R1_mw_named = join(top_readqc_dir, "{name}", "{name}_1.fastq"),
-        R2_mw_named = join(top_readqc_dir, "{name}", "{name}_2.fastq"),
+        trim_out = join(top_trim_dir, "{name}.fastq"),
     containerized: metawrap_container,
-    log: join(top_log_dir, "read_qc", "{name}")
+    log: join(top_log_dir, "read_qc", "{name}"),
     threads: int(cluster["metawrap_genome_assembly"].get('threads', default_threads)),
     shell:
         """
-        gunzip -c {input.R1} > {params.R1_mw_named}
-        gunzip -c {input.R2} > {params.R2_mw_named}
-        metawrap read_qc -1 {params.R1_mw_named} -2 {params.R2_mw_named} -t {threads} -o {params.this_qc_dir}
-        mv {params.this_qc_dir}/final_pure_reads_1.fastq {output.R1_qc_reads}
-        rm -f {params.R1_mw_named}
-        cp {params.this_qc_dir}/post-QC_report/final_pure_reads_1_fastqc.html {output.R1_bmtagger_report}
-        cp {params.this_qc_dir}/pre-QC_report/{params.R1_mw_named}_fastqc.html {output.R1_fastqc_report}
-        mv {params.this_qc_dir}/final_pure_reads_2.fastq {output.R2_qc_reads}
-        rm -f {params.R2_mw_named}
-        cp {params.this_qc_dir}/post-QC_report/final_pure_reads_2_fastqc.html {output.R2_bmtagger_report}
-        cp {params.this_qc_dir}/pre-QC_report/{params.R2_mw_named}_fastqc.html {output.R2_fastqc_report}
+        mw read_qc -1 {input.R1} -2 {input.R2} -t {threads} -o {params.this_qc_dir}
+        mv {params.this_qc_dir}/final_pure_reads_1.fastq {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq
+        mv {params.this_qc_dir}/final_pure_reads_2.fastq {params.this_qc_dir}/{params.sid}_R2_trimmed.fastq
+        pigz -9 -p {threads} -c {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq > {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq.gz
+        pigz -9 -p {threads} -c {params.this_qc_dir}/{params.sid}_R2_trimmed.fastq > {params.this_qc_dir}/{params.sid}_R2_trimmed.fastq.gz
+        mkdir -p {params.trim_out}
+        ln -s {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq.gz {params.trim_out}/{params.sid}_R1.fastq.gz
+        ln -s {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq.gz {params.trim_out}/{params.sid}_R2.fastq.gz
+        ln -s {params.this_qc_dir}/post-QC_report/final_pure_reads_1_fastqc.html {params.this_qc_dir}/{params.sid}_R1_postrim_report.html
+        ln -s {params.this_qc_dir}/post-QC_report/final_pure_reads_2_fastqc.html {params.this_qc_dir}/{params.sid}_R2_postrim_report.html
+        ln -s {params.this_qc_dir}/pre-QC_report/{params.sid}_1_fastqc.html {params.this_qc_dir}/{params.sid}_R1_pretrim_report.html
+        ln -s {params.this_qc_dir}/pre-QC_report/{params.sid}_2_fastqc.html {params.this_qc_dir}/{params.sid}_R2_pretrim_report.html
         """
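
[Review note] In the rewritten shell above, both `ln -s` lines into {params.trim_out} use the _R1_trimmed.fastq.gz source, so the R2 link points at R1 data; and trim_out is built from join(top_trim_dir, "{name}.fastq") yet used with mkdir -p as a directory name. A hedged sketch of the presumably intended linking (link_trimmed is a hypothetical helper, not pipeline code):

    import os

    def link_trimmed(qc_dir, trim_dir, sid):
        os.makedirs(trim_dir, exist_ok=True)
        for mate in ('R1', 'R2'):  # R2 must link the R2 file
            src = os.path.join(qc_dir, f'{sid}_{mate}_trimmed.fastq.gz')
            dst = os.path.join(trim_dir, f'{sid}_{mate}.fastq.gz')
            if not os.path.islink(dst):
                os.symlink(src, dst)
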
metawrap_container, - log: join(top_log_dir, "read_qc", "{name}") + log: join(top_log_dir, "read_qc", "{name}"), threads: int(cluster["metawrap_genome_assembly"].get('threads', default_threads)), shell: """ - gunzip -c {input.R1} > {params.R1_mw_named} - gunzip -c {input.R2} > {params.R2_mw_named} - metawrap read_qc -1 {params.R1_mw_named} -2 {params.R2_mw_named} -t {threads} -o {params.this_qc_dir} - mv {params.this_qc_dir}/final_pure_reads_1.fastq {output.R1_qc_reads} - rm -f {params.R1_mw_named} - cp {params.this_qc_dir}/post-QC_report/final_pure_reads_1_fastqc.html {output.R1_bmtagger_report} - cp {params.this_qc_dir}/pre-QC_report/{params.R1_mw_named}_fastqc.html {output.R1_fastqc_report} - mv {params.this_qc_dir}/final_pure_reads_2.fastq {output.R2_qc_reads} - rm -f {params.R2_mw_named} - cp {params.this_qc_dir}/post-QC_report/final_pure_reads_2_fastqc.html {output.R2_bmtagger_report} - cp {params.this_qc_dir}/pre-QC_report/{params.R2_mw_named}_fastqc.html {output.R2_fastqc_report} + mw read_qc -1 {input.R1} -2 {input.R2} -t {threads} -o {params.this_qc_dir} + mv {params.this_qc_dir}/final_pure_reads_1.fastq {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq + mv {params.this_qc_dir}/final_pure_reads_2.fastq {params.this_qc_dir}/{params.sid}_R2_trimmed.fastq + pigz -9 -p {threads} -c {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq > {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq.gz + pigz -9 -p {threads} -c {params.this_qc_dir}/{params.sid}_R2_trimmed.fastq > {params.this_qc_dir}/{params.sid}_R2_trimmed.fastq.gz + mkdir -p {params.trim_out} + ln -s {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq.gz {params.trim_out}/{params.sid}_R1.fastq.gz + ln -s {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq.gz {params.trim_out}/{params.sid}_R2.fastq.gz + ln -s {params.this_qc_dir}/post-QC_report/final_pure_reads_1_fastqc.html {params.this_qc_dir}/{params.sid}_R1_postrim_report.html + ln -s {params.this_qc_dir}/post-QC_report/final_pure_reads_2_fastqc.html {params.this_qc_dir}/{params.sid}_R2_postrim_report.html + ln -s {params.this_qc_dir}/pre-QC_report/{params.sid}_1_fastqc.html {params.this_qc_dir}/{params.sid}_R1_pretrim_report.html + ln -s {params.this_qc_dir}/pre-QC_report/{params.sid}_2_fastqc.html {params.this_qc_dir}/{params.sid}_R2_pretrim_report.html """ @@ -104,8 +155,8 @@ rule metawrap_genome_assembly: Ensemble assembled contigs and reports """ input: - R1 = expand(join(workpath, "{name}", "{name}.R1_readqc.fastq"), name=samples), - R2 = expand(join(workpath, "{name}", "{name}.R2_readqc.fastq"), name=samples), + R1 = expand(join(top_readqc_dir, "{name}", "{name}_R1_trimmed.fastq"), name=samples), + R2 = expand(join(top_readqc_dir, "{name}", "{name}_R2_trimmed.fastq"), name=samples), output: # megahit outputs megahit_assembly = expand(join(top_assembly_dir, "{name}", "megahit", "final.contigs.fa"), name=samples), @@ -136,7 +187,7 @@ rule metawrap_genome_assembly: ln -s {input.R1} {output.assembly_R1} ln -s {input.R2} {output.assembly_R2} # run genome assembler - metawrap assembly \ + mw assembly \ --megahit \ --metaspades \ -m {params.memlimit} \ @@ -153,7 +204,7 @@ rule metawrap_tax_classification: TODO: docstring """ input: - reads = expand(join(workpath, "{name}", "{name}.R{pair}_readqc.fastq"), name=samples, pair=['1', '2']), + reads = expand(join(top_readqc_dir, "{name}", "{name}_R{pair}_trimmed.fastq"), name=samples, pair=['1', '2']), final_assembly = expand(join(top_assembly_dir, "{name}", "final_assembly.fasta"), name=samples), output: krak2_asm = 
expand(join(top_tax_dir, "{name}", "final_assembly.krak2"), name=samples), @@ -168,7 +219,7 @@ rule metawrap_tax_classification: threads: cluster["metawrap_tax_classification"].get("threads", default_threads), shell: """ - metawrap kraken2 \ + mw kraken2 \ -t {threads} \ -s {params.tax_subsample} \ -o {output.tax_dir} \ @@ -177,26 +228,10 @@ rule metawrap_tax_classification: """ -rule metawrap_setup_binning: - input: - R1_from_qc = join(workpath, "{name}", "{name}.R1_readqc.fastq"), - R2_from_qc = join(workpath, "{name}", "{name}.R2_readqc.fastq"), - output: - R1_bin_name = join(workpath, "{name}", "{name}_1.fastq"), - R2_bin_name = join(workpath, "{name}", "{name}_2.fastq"), - params: - rname = "metawrap_setup_binning", - shell: - """ - ln -s {input.R1_from_qc} {output.R1_bin_name} - ln -s {input.R2_from_qc} {output.R2_bin_name} - """ - - rule metawrap_binning: input: - r1_read = join(workpath, "{name}", "{name}_1.fastq"), - r2_read = join(workpath, "{name}", "{name}_2.fastq"), + R1 = expand(join(top_readqc_dir, "{name}", "{name}_R1_trimmed.fastq"), name=samples), + R2 = expand(join(top_readqc_dir, "{name}", "{name}_R2_trimmed.fastq"), name=samples), assembly = join(top_assembly_dir, "{name}", "final_assembly.fasta"), output: maxbin_bins = directory(join(top_binning_dir, "{name}", "maxbin2_bins")), @@ -220,14 +255,13 @@ rule metawrap_binning: threads: cluster["metawrap_tax_classification"].get("threads", default_threads), shell: """ - # metawrap binning - metawrap binning \ + mw binning \ --metabat2 --maxbin2 --concoct \ -m {params.bin_mem} \ -t {threads} \ -a {input.assembly} \ -o {params.bin_dir} \ - {input.r1_read} {input.r2_read} + {input.R1} {input.R2} # metawrap bin refinement metawrap bin_refinement \ -o {params.refine_dir} \ diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk index ca780b2..d3bb15c 100644 --- a/workflow/rules/RNA.smk +++ b/workflow/rules/RNA.smk @@ -10,13 +10,14 @@ from itertools import chain workpath = config["project"]["workpath"] datapath = config["project"]["datapath"] -rule concat rule map_rna_to_metagenome: input: - concat_rna_read = + concat_rna_read = "" output: + concat_rna_read = "" params: + concat_rna_read = "" shell: """ humann --threads 16 --input $(ANALYSIS)/READ_QC_RNA/$${sample}_concat.fastq --remove-temp-output --input-format fastq --output-basename $${sample} --output ./ From 6c91890c0ddf35cf35c134817bd2836ec8442576 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 29 Jan 2024 15:20:56 -0500 Subject: [PATCH 20/32] fix: increase read_qc walltime and change partition --- config/cluster.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/cluster.json b/config/cluster.json index 22906ed..148a57d 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -9,8 +9,8 @@ "metawrap_read_qc": { "threads": 16, "mem": "32g", - "partition": "quick", - "time": "0-04:00:00" + "partition": "norm", + "time": "1-00:00:00" }, "metawrap_genome_assembly": { "threads": 24, From 0ec3527f1ba3fa1d3899ad328c5e36f4aed203f1 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 21 Feb 2024 10:20:07 -0500 Subject: [PATCH 21/32] feat: update rules for new configuration --- workflow/Snakefile | 20 +++++- workflow/rules/DNA.smk | 155 ++++++++++++++++++++++++++++------------- workflow/rules/RNA.smk | 46 +++++++++++- 3 files changed, 167 insertions(+), 54 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index f8a0c3a..47a4480 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -5,10 +5,12 @@ from os import 
listdir from scripts.common import allocated, provided, references, str_bool +datapath = config["project"]["datapath"] workpath = config["project"]["workpath"] # Pipeline"s output directory tmpdir = config["options"]["tmp_dir"] # Temporary directory samples = config["samples"] # List containing basename of samples top_readqc_dir = join(workpath, config['project']['id'], "metawrap_read_qc") +top_trim_dir = join(workpath, config['project']['id'], "trimmed_reads") # Read in resource information, containing information about threads, mem, walltimes, etc. @@ -20,18 +22,30 @@ with open(join(workpath, "config", "cluster.json")) as fh: # Global workflow variables configfile: "config.json" # Generated from user input and config/*.json +if config['coassembly']: + start_r1 = join(workpath, "concatenated.R1.fastq") + start_r2 = join(workpath, "concatenated.R1.fastq") + start_dc = [] +else: + start_r1 = expand(join(workpath, "dna", "{name}_R1.fastq"), name=config['samples']) + start_r2 = expand(join(workpath, "dna", "{name}_R2.fastq"), name=config['samples']) + start_dc = join(datapath, "{stem}.fastq.gz") # Final ouput files of the pipeline rule all: input: + # dna read decompress + # ~~~~~~~~~~~~~~~ + start_r1, + start_r2, # read qc and filtering # ~~~~~~~~~~~~~~~ expand(join(top_readqc_dir, "{name}", "{name}_R1_pretrim_report.html"), name=samples), expand(join(top_readqc_dir, "{name}", "{name}_R2_pretrim_report.html"), name=samples), expand(join(top_readqc_dir, "{name}", "{name}_R1_postrim_report.html"), name=samples), expand(join(top_readqc_dir, "{name}", "{name}_R2_postrim_report.html"), name=samples), - expand(join(top_readqc_dir, "{name}", "{name}_R1_trimmed.fastq.gz"), name=samples), - expand(join(top_readqc_dir, "{name}", "{name}_R2_trimmed.fastq.gz"), name=samples), + expand(join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq.gz"), name=samples), + expand(join(top_trim_dir, "{name}", "{name}_R2_trimmed.fastq.gz"), name=samples), # genome assembly # ~~~~~~~~~~~~~~~ expand(join(workpath, config['project']['id'], "metawrap_assembly", "{name}", "final_assembly.fasta"), name=samples), @@ -58,5 +72,5 @@ rule all: # Import rules include: join("rules", "DNA.smk") -include: join("rules", "RNA.smk") +# include: join("rules", "RNA.smk") include: join("rules", "hooks.smk") \ No newline at end of file diff --git a/workflow/rules/DNA.smk b/workflow/rules/DNA.smk index fd5bc76..a4e5392 100644 --- a/workflow/rules/DNA.smk +++ b/workflow/rules/DNA.smk @@ -3,6 +3,7 @@ # ~~~~~~~~~~ from os.path import join from itertools import chain +from uuid import uuid4 # ~~~~~~~~~~ @@ -12,7 +13,7 @@ workpath = config["project"]["workpath"] datapath = config["project"]["datapath"] default_threads = cluster["__default__"]['threads'] default_memory = cluster["__default__"]['mem'] -top_log_dir = join(workpath, "logs") +top_log_dir = join(workpath, "logfiles") top_readqc_dir = join(workpath, config['project']['id'], "metawrap_read_qc") top_trim_dir = join(workpath, config['project']['id'], "trimmed_reads") top_assembly_dir = join(workpath, config['project']['id'], "metawrap_assembly") @@ -20,18 +21,21 @@ top_tax_dir = join(workpath, config['project']['id'], "metawrap_k top_binning_dir = join(workpath, config['project']['id'], "metawrap_binning") top_refine_dir = join(workpath, config['project']['id'], "metawrap_bin_refine") metawrap_container = config["containers"]["metawrap"] +pairedness = list(range(1, config['project']['nends']+1)) +mem2int = lambda x: int(str(x).lower().replace('gb', '').replace('g', '')) """ - 0. 
concat or no concat - 1. read qc - 2. assembly - 3. binning - 4. bin refinement - 5. depreplicate bins - 6. annotate bins - 7. index depreplicated genomes - 8. align DNA to assembly + Step-wise pipeline outline: + 0. concat or no concat + 1. read qc + 2. assembly + 3. binning + 4. bin refinement + 5. depreplicate bins + 6. annotate bins + 7. index depreplicated genomes + 8. align DNA to assembly """ if config['coassembly']: @@ -39,8 +43,8 @@ if config['coassembly']: start_r2 = join(workpath, "concatenated.R1.fastq") start_dc = [] else: - start_r1 = expand(join(workpath, "{name}_R1.fastq"), name=config['samples']) - start_r2 = expand(join(workpath, "{name}_R2.fastq"), name=config['samples']) + start_r1 = expand(join(workpath, "dna", "{name}_R1.fastq"), name=config['samples']) + start_r2 = expand(join(workpath, "dna", "{name}_R2.fastq"), name=config['samples']) start_dc = join(datapath, "{stem}.fastq.gz") @@ -48,7 +52,7 @@ rule decompress_reads: input: this_compressed_read = start_dc output: - this_uncompressed_read = join(workpath, "{stem}.fastq"), + this_uncompressed_read = join(workpath, "dna", "{stem}.fastq"), params: rname = "decompress_reads", shell: @@ -115,28 +119,36 @@ rule metawrap_read_qc: R2_pretrim_report = join(top_readqc_dir, "{name}", "{name}_R2_pretrim_report.html"), R1_postrim_report = join(top_readqc_dir, "{name}", "{name}_R1_postrim_report.html"), R2_postrim_report = join(top_readqc_dir, "{name}", "{name}_R2_postrim_report.html"), - R1_trimmed = join(top_readqc_dir, "{name}", "{name}_R1_trimmed.fastq"), - R2_trimmed = join(top_readqc_dir, "{name}", "{name}_R2_trimmed.fastq"), - R1_trimmed_gz = join(top_readqc_dir, "{name}", "{name}_R1_trimmed.fastq.gz"), - R2_trimmed_gz = join(top_readqc_dir, "{name}", "{name}_R2_trimmed.fastq.gz"), + R1_trimmed = join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq"), + R2_trimmed = join(top_trim_dir, "{name}", "{name}_R2_trimmed.fastq"), + R1_trimmed_gz = join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq.gz"), + R2_trimmed_gz = join(top_trim_dir, "{name}", "{name}_R2_trimmed.fastq.gz"), params: rname = "metawrap_read_qc", sid = "{name}", this_qc_dir = join(top_readqc_dir, "{name}"), - trim_out = join(top_trim_dir, "{name}.fastq"), + trim_out = join(top_trim_dir, "{name}"), + tmpr1 = lambda _, output, input: str(input.R1).replace('_R1.', '_1.'), + tmpr2 = lambda _, output, input: str(input.R2).replace('_R2.', '_2.') containerized: metawrap_container, - log: join(top_log_dir, "read_qc", "{name}"), threads: int(cluster["metawrap_genome_assembly"].get('threads', default_threads)), shell: """ - mw read_qc -1 {input.R1} -2 {input.R2} -t {threads} -o {params.this_qc_dir} - mv {params.this_qc_dir}/final_pure_reads_1.fastq {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq - mv {params.this_qc_dir}/final_pure_reads_2.fastq {params.this_qc_dir}/{params.sid}_R2_trimmed.fastq - pigz -9 -p {threads} -c {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq > {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq.gz - pigz -9 -p {threads} -c {params.this_qc_dir}/{params.sid}_R2_trimmed.fastq > {params.this_qc_dir}/{params.sid}_R2_trimmed.fastq.gz - mkdir -p {params.trim_out} - ln -s {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq.gz {params.trim_out}/{params.sid}_R1.fastq.gz - ln -s {params.this_qc_dir}/{params.sid}_R1_trimmed.fastq.gz {params.trim_out}/{params.sid}_R2.fastq.gz + # metawrap input needs to be renamed [stem]_1.fastq and [stem]_2.fastq + ln -s {input.R1} {params.tmpr1} + ln -s {input.R2} {params.tmpr2} + + # read quality control, host 
removal + # TODO: add support for mouse reads (mm10 genome prefix, "-x mm10") + mw read_qc -1 {params.tmpr1} -2 {params.tmpr2} -t {threads} -o {params.this_qc_dir} + + # collate fastq outputs to facilitate workflow, compress + ln -s {params.this_qc_dir}/final_pure_reads_1.fastq {params.trim_out}/{params.sid}_R1_trimmed.fastq + ln -s {params.this_qc_dir}/final_pure_reads_2.fastq {params.trim_out}/{params.sid}_R2_trimmed.fastq + pigz -9 -p {threads} -c {params.this_qc_dir}/final_pure_reads_1.fastq > {params.trim_out}/{params.sid}_R1_trimmed.fastq.gz + pigz -9 -p {threads} -c {params.this_qc_dir}/final_pure_reads_2.fastq > {params.trim_out}/{params.sid}_R2_trimmed.fastq.gz + + # collate outputs to facilitate ln -s {params.this_qc_dir}/post-QC_report/final_pure_reads_1_fastqc.html {params.this_qc_dir}/{params.sid}_R1_postrim_report.html ln -s {params.this_qc_dir}/post-QC_report/final_pure_reads_2_fastqc.html {params.this_qc_dir}/{params.sid}_R2_postrim_report.html ln -s {params.this_qc_dir}/pre-QC_report/{params.sid}_1_fastqc.html {params.this_qc_dir}/{params.sid}_R1_pretrim_report.html @@ -146,17 +158,19 @@ rule metawrap_read_qc: rule metawrap_genome_assembly: """ - TODO + + @Input: Clean trimmed fastq.gz reads (R1 & R2 per sample) + @Output: Megahit assembled contigs and reports Metaspades assembled contigs and reports Ensemble assembled contigs and reports """ input: - R1 = expand(join(top_readqc_dir, "{name}", "{name}_R1_trimmed.fastq"), name=samples), - R2 = expand(join(top_readqc_dir, "{name}", "{name}_R2_trimmed.fastq"), name=samples), + R1 = expand(join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq"), name=samples), + R2 = expand(join(top_trim_dir, "{name}", "{name}_R2_trimmed.fastq"), name=samples), output: # megahit outputs megahit_assembly = expand(join(top_assembly_dir, "{name}", "megahit", "final.contigs.fa"), name=samples), @@ -167,22 +181,25 @@ rule metawrap_genome_assembly: metaspades_graph = expand(join(top_assembly_dir, "{name}", "metaspades", "assembly_graph.fastg"), name=samples), metaspades_longscaffolds = expand(join(top_assembly_dir, "{name}", "metaspades", "long_scaffolds.fasta"), name=samples), metaspades_scaffolds = expand(join(top_assembly_dir, "{name}", "metaspades", "scaffolds.fasta"), name=samples), - metaspades_cor_readsr1 = expand(join(top_assembly_dir, "{name}", "metaspades", "{name}_1.fastq.00.0_0.cor.fastq.gz"), name=samples), - metaspades_cor_readsr2 = expand(join(top_assembly_dir, "{name}", "metaspades", "corrected", "{name}_2.fastq.00.0_0.cor.fastq.gz"), name=samples), + metaspades_cor_readsr1 = expand(join(top_assembly_dir, "{name}", "metaspades", "corrected", "{name}_1.00.0_0.cor.fastq.gz"), name=samples), + metaspades_cor_readsr2 = expand(join(top_assembly_dir, "{name}", "metaspades", "corrected", "{name}_2.00.0_0.cor.fastq.gz"), name=samples), # ensemble outputs final_assembly = expand(join(top_assembly_dir, "{name}", "final_assembly.fasta"), name=samples), final_assembly_report = expand(join(top_assembly_dir, "{name}", "assembly_report.html"), name=samples), assembly_R1 = expand(join(top_assembly_dir, "{name}", "{name}_1.fastq"), name=samples), assembly_R2 = expand(join(top_assembly_dir, "{name}", "{name}_2.fastq"), name=samples), - assembly_dir = expand(join(top_assembly_dir, "{name}"), name=samples), singularity: metawrap_container, params: rname = "metawrap_genome_assembly", - memlimit = cluster["metawrap_genome_assembly"].get('mem', default_memory), + memlimit = mem2int(cluster["metawrap_genome_assembly"].get('mem', default_memory)), + mh_dir = 
expand(join(top_assembly_dir, "{name}", "megahit"), name=samples), + assembly_dir = expand(join(top_assembly_dir, "{name}"), name=samples), contig_min_len = "1000", threads: int(cluster["metawrap_genome_assembly"].get('threads', default_threads)), shell: """ + # remove empty directories by snakemake, to prevent metawrap error + rm -rf {params.mh_dir} # link to the file names metawrap expects ln -s {input.R1} {output.assembly_R1} ln -s {input.R2} {output.assembly_R2} @@ -195,16 +212,20 @@ rule metawrap_genome_assembly: -l {params.contig_min_len} \ -1 {output.assembly_R1} \ -2 {output.assembly_R2} \ - -o {output.assembly_dir} + -o {params.assembly_dir} """ rule metawrap_tax_classification: """ - TODO: docstring + The metaWRAP::Kraken module takes in any number of fastq or fasta files, classifies the contained sequences with KRAKEN, + and reports the taxonomy distribution in a kronagram using KronaTools. If the sequences passed to the module belong to an + assembly and follow the contig naming convention of the Assembly module, the taxonomy of each contig is weighted based on + its length and coverage [weight=coverage*length]. The classifications of the sequences are then summarized in a format + that KronaTools' ktImportText function recognizes, and a final kronagram in html format is made with all the samples. """ input: - reads = expand(join(top_readqc_dir, "{name}", "{name}_R{pair}_trimmed.fastq"), name=samples, pair=['1', '2']), + reads = expand(join(top_trim_dir, "{name}", "{name}_R{pair}_trimmed.fastq"), name=samples, pair=['1', '2']), final_assembly = expand(join(top_assembly_dir, "{name}", "final_assembly.fasta"), name=samples), output: krak2_asm = expand(join(top_tax_dir, "{name}", "final_assembly.krak2"), name=samples), @@ -216,9 +237,10 @@ rule metawrap_tax_classification: rname = "metawrap_tax_classification", tax_subsample = str(int(1e6)), singularity: metawrap_container, - threads: cluster["metawrap_tax_classification"].get("threads", default_threads), + threads: int(cluster["metawrap_tax_classification"].get("threads", default_threads)), shell: """ + mkdir -p """+top_tax_dir+""" mw kraken2 \ -t {threads} \ -s {params.tax_subsample} \ @@ -229,9 +251,33 @@ rule metawrap_tax_classification: rule metawrap_binning: + """ + Metawrap wrapper for binning of reads/contigs to assemblies. + + Orchestrates execution of this ensemble of metagenomic binning software: + - MaxBin2 + - metaBAT2 + - CONCOCT + + Stepwise algorithm for metawrap genome binning: + 1. Alignment + a. metagenomic assembly is indexed with bwa-index + b. paired end reads from any number of samples are aligned + 2. Collate, sort, stage alignment outputs + a. Alignments are sorted and compressed with samtools, and library insert size statistics are also gathered at the same time (insert size average and standard deviation). + b. metaBAT2's `jgi_summarize_bam_contig_depths` function is used to generate contig adundance table + i. It is converted into the correct format for each of the three binners to take as input. + 3. Ensemble binning + a. MaxBin2, metaBAT2, CONCOCT are run with default settings + b. Final bins directories are created with formatted bin fasta files for easy inspection. + 4. CheckM is run on the bin output from step 3 to determine the success of the binning. + a. CheckM's `lineage_wf` function is used to predict essential genes and estimate the completion and contamination of each bin. + b. Outputs are formatted and collected for better viewing. 
+ + """ input: - R1 = expand(join(top_readqc_dir, "{name}", "{name}_R1_trimmed.fastq"), name=samples), - R2 = expand(join(top_readqc_dir, "{name}", "{name}_R2_trimmed.fastq"), name=samples), + R1 = expand(join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq"), name=samples), + R2 = expand(join(top_trim_dir, "{name}", "{name}_R2_trimmed.fastq"), name=samples), assembly = join(top_assembly_dir, "{name}", "final_assembly.fasta"), output: maxbin_bins = directory(join(top_binning_dir, "{name}", "maxbin2_bins")), @@ -247,24 +293,38 @@ rule metawrap_binning: params: rname = "metawrap_binning", bin_dir = join(top_binning_dir, "{name}"), - refine_dir = join(top_refine_dir, "{name}"), - bin_mem = cluster.get("metawrap_assembly_binning", default_memory), + bin_mem = mem2int(cluster['metawrap_binning'].get("mem", default_memory)), + mw_trim_linker_R1 = join(top_trim_dir, "{name}", "{name}_1.fastq"), + mw_trim_linker_R2 = join(top_trim_dir, "{name}", "{name}_2.fastq"), min_perc_complete = "50", max_perc_contam = "5", singularity: metawrap_container, - threads: cluster["metawrap_tax_classification"].get("threads", default_threads), + threads: int(cluster["metawrap_binning"].get("threads", default_threads)), shell: """ + # set checkm data + export CHECKM_DATA_PATH="/data2/CHECKM_DB" + + # make base dir if not exists + mkdir -p """+top_binning_dir+""" + if [ -d "{params.bin_dir}" ]; then rm -rf {params.bin_dir}; fi + + # setup links for metawrap input + [[ -f "{params.mw_trim_linker_R1}" ]] || ln -s {input.R1} {params.mw_trim_linker_R1} + [[ -f "{params.mw_trim_linker_R2}" ]] || ln -s {input.R2} {params.mw_trim_linker_R2} + + # metawrap binning mw binning \ --metabat2 --maxbin2 --concoct \ -m {params.bin_mem} \ -t {threads} \ -a {input.assembly} \ -o {params.bin_dir} \ - {input.R1} {input.R2} + {params.mw_trim_linker_R1} {params.mw_trim_linker_R2} + # metawrap bin refinement - metawrap bin_refinement \ - -o {params.refine_dir} \ + mw bin_refinement \ + -o {params.bin_dir} \ -t {threads} \ -A {params.bin_dir}/metabat2_bins \ -B {params.bin_dir}/maxbin2_bins \ @@ -283,7 +343,6 @@ rule derep_bins: metawrap_stats = join(top_binning_dir, "{name}", "metawrap_50_5_bins.stats"), output: dereplicated_bins = directory(join(top_refine_dir, "{name}", "dereplicated_bins")), - search_rep_bc = join(top_binning_dir, "{name}", "maxbin2_bins"), singularity: metawrap_container, threads: 32 params: @@ -304,10 +363,10 @@ rule derep_bins: coverage_method = 'larger', shell: """ + mkdir -p """+top_refine_dir+""" sed -i 's/^bin./{name}_bin./g' {input.maxbin_stats} && \ sed -i 's/^bin./{name}_bin./g' {input.metabat2_stats} && \ sed -i 's/^bin./{name}_bin./g' {input.metawrap_stats} && \ - touch {output.search_rep_bc} dRep dereplicate \ -p {threads} \ -l {params.minimum_genome_length} \ diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk index d3bb15c..46a9c9d 100644 --- a/workflow/rules/RNA.smk +++ b/workflow/rules/RNA.smk @@ -9,15 +9,55 @@ from itertools import chain # ~~~~~~~~~~ workpath = config["project"]["workpath"] datapath = config["project"]["datapath"] +rna_coasm = config["options"]["rnacoa"] +rna_sample_stems = config["rna"] +rule decompress_rna_reads: + input: + this_compressed_read = start_dc + output: + this_uncompressed_read = join(workpath, "{stem}.fastq"), + params: + rname = "decompress_rna_reads", + shell: + """ + """ + + +rule concat_rna_reads: + input: + output: + params: + rname = "concat_rna_reads", + shell: + """ + """ + + +rule rna_read_qc: + input: + output: + params: + rname = "rna_read_qc", + 
shell: + """ + mw read_qc \ + -1 $(RAWDATA.RNA)/$${sample}_1.fastq \ + -2 $(RAWDATA.RNA)/$${sample}_2.fastq \ + --skip-trimming --skip-pre-qc-report --skip-post-qc-report \ + -x hg38 \ + -t {threads} \ + -o $(ANALYSIS)/READ_QC_RNA/$${sample} + fastqc -o $(ANALYSIS)/FASTQC_RNA/PRE -t {threads} -f fastq $(RAWDATA.RNA)/$${sample}_1.fastq $(RAWDATA.RNA)/$${sample}_2.fastq + fastqc -o $(ANALYSIS)/FASTQC_RNA/POST -t {threads} -f fastq $(ANALYSIS)/READ_QC_RNA/$${sample}_1.fastq $(ANALYSIS)/READ_QC_RNA/$${sample}_2.fastq + """ + rule map_rna_to_metagenome: input: - concat_rna_read = "" output: - concat_rna_read = "" params: - concat_rna_read = "" + rname = "map_rna_to_metagenome", shell: """ humann --threads 16 --input $(ANALYSIS)/READ_QC_RNA/$${sample}_concat.fastq --remove-temp-output --input-format fastq --output-basename $${sample} --output ./ From a55430f71c9c6c6459acb1ba154bb7118f4f1306 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 26 Feb 2024 17:28:38 -0600 Subject: [PATCH 22/32] feat: install gnu-which 2.21 --- docker/metawrap/Dockerfile | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile index 8ce1300..df2e7b1 100644 --- a/docker/metawrap/Dockerfile +++ b/docker/metawrap/Dockerfile @@ -1,14 +1,16 @@ FROM condaforge/miniforge3:latest -RUN apt-get update; apt-get install -y -qq curl vim dos2unix bash python2.7 +RUN apt-get update; apt-get install -y -qq curl build-essential vim dos2unix bash python2.7 RUN mamba create -y -n metawrap-env RUN conda config --add channels defaults; conda config --add channels conda-forge; \ conda config --add channels bioconda;conda config --add channels ursky RUN mamba install -y --only-deps -c ursky -n metawrap-env metawrap-mg==1.3.2 -RUN cd /home; git clone https://github.com/bxlab/metaWRAP.git; chmod -R 777 metaWRAP +RUN cd /home; git clone https://github.com/rroutsong/metaWRAP.git; chmod -R 777 metaWRAP RUN mamba run -n metawrap-env pip3 install drep ENV PATH="/home/metaWRAP/bin:/opt/conda/envs/metawrap-env/bin:$PATH" COPY docker/metawrap/config-metawrap /home/metaWRAP/bin/config-metawrap COPY docker/metawrap/Dockerfile /Dockerfile +RUN mkdir /install; cd /install; wget https://carlowood.github.io/which/which-2.21.tar.gz; tar xvf which-2.21.tar.gz +RUN cd /install/which-2.21; ./configure; make && make install RUN md5sum Dockerfile > /Dockerfile.md5 ADD docker/metawrap/mw /home/metaWRAP/bin/mw RUN dos2unix /home/metaWRAP/bin/config-metawrap @@ -17,4 +19,4 @@ RUN chmod a+rx /home/metaWRAP/bin/config-metawrap /home/metaWRAP/bin/mw ENV BASH_ENV="/etc/bash.bashrc" RUN echo ". 
/opt/conda/etc/profile.d/conda.sh && conda activate metawrap-env" >> /etc/bash.bashrc RUN echo 'export PATH="/home/metaWRAP/bin:/opt/conda/envs/metawrap-env/bin:$PATH"' >> /etc/bash.bashrc -ENTRYPOINT ["/bin/bash"] \ No newline at end of file +ENTRYPOINT ["/bin/bash"] From 6a3c80a06aa11e3a616199af4a6f4254fa5ea5cf Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 26 Feb 2024 18:29:50 -0500 Subject: [PATCH 23/32] fix: add snakemake pluging to docs --- docs/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/requirements.txt b/docs/requirements.txt index dbe98a1..6c6ace2 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -31,4 +31,5 @@ tornado==6.0.4 tqdm==4.48.2 zipp==3.1.0 mkdocs-git-revision-date-plugin +mkdocs-snakemake-rule-plugin mike From 3b51a1917bf7bdb159c997b1d6762eb4b0c852dd Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Mon, 26 Feb 2024 18:51:04 -0600 Subject: [PATCH 24/32] fix: replace old which with new which --- docker/metawrap/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/metawrap/Dockerfile b/docker/metawrap/Dockerfile index df2e7b1..620e1c1 100644 --- a/docker/metawrap/Dockerfile +++ b/docker/metawrap/Dockerfile @@ -11,6 +11,7 @@ COPY docker/metawrap/config-metawrap /home/metaWRAP/bin/config-metawrap COPY docker/metawrap/Dockerfile /Dockerfile RUN mkdir /install; cd /install; wget https://carlowood.github.io/which/which-2.21.tar.gz; tar xvf which-2.21.tar.gz RUN cd /install/which-2.21; ./configure; make && make install +RUN rm /usr/bin/which; ln -s /usr/local/bin/which /usr/bin/which RUN md5sum Dockerfile > /Dockerfile.md5 ADD docker/metawrap/mw /home/metaWRAP/bin/mw RUN dos2unix /home/metaWRAP/bin/config-metawrap From d6dcf75f5f31fa17523f2c66ab29c164d8b5dc76 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Tue, 27 Feb 2024 15:50:57 -0500 Subject: [PATCH 25/32] feat: expand into DNA and RNA workflows, build out workflow to MAG creation --- workflow/Snakefile | 65 +++++++++--- workflow/rules/DNA.smk | 200 +++++++++++++++++++++++++------------ workflow/rules/RNA.smk | 133 +++++++++++++++++------- workflow/rules/hooks.smk | 7 +- workflow/scripts/common.py | 15 ++- 5 files changed, 303 insertions(+), 117 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 47a4480..e568367 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -2,15 +2,25 @@ import json import os, sys from os.path import join from os import listdir -from scripts.common import allocated, provided, references, str_bool +from scripts.common import allocated, provided, references, str_bool, list_bool +# Global workflow variables +configfile: "config.json" # Generated from user input and config/*.json + datapath = config["project"]["datapath"] -workpath = config["project"]["workpath"] # Pipeline"s output directory -tmpdir = config["options"]["tmp_dir"] # Temporary directory -samples = config["samples"] # List containing basename of samples +rna_datapath = config["project"].get("rna_datapath", []) +workpath = config["project"]["workpath"] +tmpdir = config["options"]["tmp_dir"] +coassemble = config['coassembly'] is True +rna_included = list_bool(config.get("rna", 'false')) +rna_coasm = str_bool(config["options"].get("rnacoa", 'False')) +rna_sample_stems = config.get("rna", []) +samples = config["samples"] if not coassemble else ['concatenated'] top_readqc_dir = join(workpath, config['project']['id'], "metawrap_read_qc") top_trim_dir = join(workpath, config['project']['id'], "trimmed_reads") +top_readqc_dir_rna = join(workpath, 
config['project']['id'], "metawrap_read_qc_RNA") +top_trim_dir_rna = join(workpath, config['project']['id'], "trimmed_reads_RNA") # Read in resource information, containing information about threads, mem, walltimes, etc. @@ -18,22 +28,30 @@ top_trim_dir = join(workpath, config['project']['id'], "trimmed_re with open(join(workpath, "config", "cluster.json")) as fh: cluster = json.load(fh) +if coassemble: + start_r1 = expand(join(workpath, "dna", "{name}_R1.fastq.gz"), name=['concatenated']) + start_r2 = expand(join(workpath, "dna", "{name}_R2.fastq.gz"), name=['concatenated']) +else: + start_r1 = expand(join(workpath, "dna", "{name}_R1.fastq.gz"), name=samples) + start_r2 = expand(join(workpath, "dna", "{name}_R2.fastq.gz"), name=samples) -# Global workflow variables -configfile: "config.json" # Generated from user input and config/*.json - -if config['coassembly']: - start_r1 = join(workpath, "concatenated.R1.fastq") - start_r2 = join(workpath, "concatenated.R1.fastq") - start_dc = [] +if rna_included: + if rna_coasm: + start_r1_rna = expand(join(workpath, "rna", "{name}_R1.fastq.gz"), name=['concatenated']) + start_r2_rna = expand(join(workpath, "rna", "{name}_R2.fastq.gz"), name=['concatenated']) + else: + start_r1_rna = expand(join(workpath, "rna", "{rname}_R1.fastq.gz"), rname=rna_sample_stems) + start_r2_rna = expand(join(workpath, "rna", "{rname}_R2.fastq.gz"), rname=rna_sample_stems) else: - start_r1 = expand(join(workpath, "dna", "{name}_R1.fastq"), name=config['samples']) - start_r2 = expand(join(workpath, "dna", "{name}_R2.fastq"), name=config['samples']) - start_dc = join(datapath, "{stem}.fastq.gz") + start_r1_rna, start_r2_rna = [], [] + # Final ouput files of the pipeline rule all: input: + ################# + # DNA outputs # + ################# # dna read decompress # ~~~~~~~~~~~~~~~ start_r1, @@ -68,9 +86,24 @@ rule all: # bin refinement # ~~~~~~~~~~~~~~~ expand(join(workpath, config['project']['id'], "metawrap_bin_refine", "{name}", "dereplicated_bins"), name=samples), - + ################# + # RNA outputs # + ################# + # rna read decompress + # ~~~~~~~~~~~~~~~ + start_r1_rna, + start_r1_rna, + # read qc and filtering + # ~~~~~~~~~~~~~~~ + expand(join(top_readqc_dir_rna, "{rname}", "{rname}_R1_pretrim_report.html"), rname=rna_sample_stems), + expand(join(top_readqc_dir_rna, "{rname}", "{rname}_R2_pretrim_report.html"), rname=rna_sample_stems), + expand(join(top_readqc_dir_rna, "{rname}", "{rname}_R1_postrim_report.html"), rname=rna_sample_stems), + expand(join(top_readqc_dir_rna, "{rname}", "{rname}_R2_postrim_report.html"), rname=rna_sample_stems), + expand(join(top_trim_dir_rna, "{rname}", "{rname}_R1_trimmed.fastq.gz"), rname=rna_sample_stems), + expand(join(top_trim_dir_rna, "{rname}", "{rname}_R2_trimmed.fastq.gz"), rname=rna_sample_stems), + # Import rules include: join("rules", "DNA.smk") -# include: join("rules", "RNA.smk") +include: join("rules", "RNA.smk") include: join("rules", "hooks.smk") \ No newline at end of file diff --git a/workflow/rules/DNA.smk b/workflow/rules/DNA.smk index a4e5392..db211d9 100644 --- a/workflow/rules/DNA.smk +++ b/workflow/rules/DNA.smk @@ -1,7 +1,7 @@ # ~~~~~~~~~~ # Metawrap metagenome assembly and analysis rules # ~~~~~~~~~~ -from os.path import join +from os.path import join, basename from itertools import chain from uuid import uuid4 @@ -9,10 +9,13 @@ from uuid import uuid4 # ~~~~~~~~~~ # Constants and paths # ~~~~~~~~~~ -workpath = config["project"]["workpath"] -datapath = config["project"]["datapath"] +# resource 
defaults default_threads = cluster["__default__"]['threads'] default_memory = cluster["__default__"]['mem'] +# directories +workpath = config["project"]["workpath"] +datapath = config["project"]["datapath"] +samples = config["samples"] if not coassemble else ['concatenated'] top_log_dir = join(workpath, "logfiles") top_readqc_dir = join(workpath, config['project']['id'], "metawrap_read_qc") top_trim_dir = join(workpath, config['project']['id'], "trimmed_reads") @@ -20,11 +23,11 @@ top_assembly_dir = join(workpath, config['project']['id'], "metawrap_a top_tax_dir = join(workpath, config['project']['id'], "metawrap_kmer") top_binning_dir = join(workpath, config['project']['id'], "metawrap_binning") top_refine_dir = join(workpath, config['project']['id'], "metawrap_bin_refine") +# workflow flags metawrap_container = config["containers"]["metawrap"] pairedness = list(range(1, config['project']['nends']+1)) mem2int = lambda x: int(str(x).lower().replace('gb', '').replace('g', '')) - """ Step-wise pipeline outline: 0. concat or no concat @@ -38,61 +41,71 @@ mem2int = lambda x: int(str(x).lower().replace('gb', '').repl 8. align DNA to assembly """ -if config['coassembly']: - start_r1 = join(workpath, "concatenated.R1.fastq") - start_r2 = join(workpath, "concatenated.R1.fastq") - start_dc = [] -else: - start_r1 = expand(join(workpath, "dna", "{name}_R1.fastq"), name=config['samples']) - start_r2 = expand(join(workpath, "dna", "{name}_R2.fastq"), name=config['samples']) - start_dc = join(datapath, "{stem}.fastq.gz") - - -rule decompress_reads: - input: - this_compressed_read = start_dc - output: - this_uncompressed_read = join(workpath, "dna", "{stem}.fastq"), - params: - rname = "decompress_reads", - shell: - "gunzip -c {input.this_compressed_read} > {output.this_uncompressed_read}" +start_r1 = expand(join(workpath, "dna", "{name}_R1.fastq.gz"), name=samples) +start_r2 = expand(join(workpath, "dna", "{name}_R2.fastq.gz"), name=samples) rule concat_reads: input: - all_r1_reads = expand(join(workpath, "{name}.R1.fastq"), name=config['samples']), - all_r2_reads = expand(join(workpath, "{name}.R2.fastq"), name=config['samples']), + all_r1_reads = expand(join(workpath, "dna", "{sid}_R1.fastq.gz"), sid=config['samples'] if config['coassembly'] else []), + all_r2_reads = expand(join(workpath, "dna", "{sid}_R2.fastq.gz"), sid=config['samples'] if config['coassembly'] else []), output: - big_read_r1 = join(workpath, "concatenated.R1.fastq"), - big_read_r2 = join(workpath, "concatenated.R2.fastq"), - big_compressed_read_r1 = join(workpath, "concatenated.R1.fastq.gz"), - big_compressed_read_r2 = join(workpath, "concatenated.R2.fastq.gz"), - big_read1_hash = join(workpath, "concatenated.R1.md5"), - big_read2_hash = join(workpath, "concatenated.R2.md5"), + big_compressed_read_r1 = join(workpath, "dna", "concatenated_R1.fastq.gz"), + big_compressed_read_r2 = join(workpath, "dna", "concatenated_R2.fastq.gz"), + big_read1_hash = join(workpath, "dna", "concatenated_R1.md5"), + big_read2_hash = join(workpath, "dna", "concatenated_R2.md5"), params: + big_read_r1 = join(workpath, "dna", "concatenated_R1.fastq"), + big_read_r2 = join(workpath, "dna", "concatenated_R2.fastq"), rname = "concat_reads", - input_dir = workpath, - threads: 28 + input_dir = join(workpath, "dna"), + threads: int(cluster["concat_reads"].get('threads', default_threads)), shell: """ - for fastq in {params.input_dir}\*R1*fastq; do cat $fastq > {output.big_read_r1}; done - for fastq in {params.input_dir}\*R2*fastq; do cat $fastq > 
{output.big_read_r2}; done - pigz -9 -p 28 -c {params.uncompressed_r1} > {output.big_compressed_read_r1} - pigz -9 -p 28 -c {params.uncompressed_r2} > {output.big_compressed_read_r2} + shopt -s extglob + # concat r1 + for fastq in {params.input_dir}/*R1*f?(ast)q*; do + ext=$(echo "${{fastq: -2}}" | tr '[:upper:]' '[:lower:]') + if [[ "$ext" == "gz" ]]; then + zcat $fastq >> {params.big_read_r1} + else + cat $fastq >> {params.big_read_r1} + fi; + done + + # concat r2 + for fastq in {params.input_dir}/*R2*f?(ast)q*; do + ext=$(echo "${{fastq: -2}}" | tr '[:upper:]' '[:lower:]') + if [[ "$ext" == "gz" ]]; then + zcat $fastq > {params.big_read_r2} + else + cat $fastq > {params.big_read_r2} + fi; + done + shopt -u extglob + + pigz -9 -p 28 -c {params.big_read_r1} > {output.big_compressed_read_r1} + pigz -9 -p 28 -c {params.big_read_r2} > {output.big_compressed_read_r2} md5sum {output.big_compressed_read_r1} > {output.big_read1_hash} md5sum {output.big_compressed_read_r2} > {output.big_read2_hash} + rm {params.big_read_r1} {params.big_read_r2} """ rule metawrap_read_qc: """ - Metawrap read quality control rule for producing high quality reads for assembly. + The metaWRAP::Read_qc module is meant to pre-process raw Illumina sequencing reads in preparation for assembly and alignment. + The raw reads are trimmed based on adapted content and PHRED scored with the default setting of Trim-galore, ensuring that only high-quality + sequences are left. Then reads are then aligned to the host genome (e.g. human) with bmtagger, and any host reads are removed from the + metagenomic data to remove host contamination. Read pairs where only one read was aligned to the host genome are also removed. + Finally, FASTQC is used to generate quality reports of the raw and final read sets in order to assess read quality improvement. + The users have control over which of the above features they wish to use. 
Quality-control step accomplishes three tasks: - Trims adapter sequences from input read pairs (.fastq.gz) - Quality filters from input read pairs - Removes human contamination from input read pairs + @Container requirements - [MetaWrap](https://github.com/bxlab/metaWRAP) orchestrates execution of these tools in a psuedo-pipeline: @@ -103,9 +116,10 @@ rule metawrap_read_qc: @Environment specifications - minimum 16gb memory - - will load bmtagger index in memory (8gb) + - will load bmtagger index in memory (8gb) + @Input: - - Raw fastq.gz reads (R1 & R2 per-sample) from the instrument + - Raw fastq reads (R1 & R2 per-sample) from the instrument @Outputs: - Trimmed, quality filtered reads (R1 & R2) @@ -128,15 +142,28 @@ rule metawrap_read_qc: sid = "{name}", this_qc_dir = join(top_readqc_dir, "{name}"), trim_out = join(top_trim_dir, "{name}"), - tmpr1 = lambda _, output, input: str(input.R1).replace('_R1.', '_1.'), - tmpr2 = lambda _, output, input: str(input.R2).replace('_R2.', '_2.') + tmpr1 = lambda _, output, input: join(config['options']['tmp_dir'], str(basename(str(input.R1))).replace('_R1.', '_1.').replace('.gz', '')), + tmpr2 = lambda _, output, input: join(config['options']['tmp_dir'], str(basename(str(input.R2))).replace('_R2.', '_2.').replace('.gz', '')), containerized: metawrap_container, threads: int(cluster["metawrap_genome_assembly"].get('threads', default_threads)), shell: """ - # metawrap input needs to be renamed [stem]_1.fastq and [stem]_2.fastq - ln -s {input.R1} {params.tmpr1} - ln -s {input.R2} {params.tmpr2} + # uncompress to lscratch + rone="{input.R1}" + ext=$(echo "${{rone: -2}}" | tr '[:upper:]' '[:lower:]') + if [[ "$ext" == "gz" ]]; then + zcat {input.R1} > {params.tmpr1} + else + ln -s {input.R1} {params.tmpr1} + fi; + + rtwo="{input.R2}" + ext=$(echo "${{rtwo: -2}}" | tr '[:upper:]' '[:lower:]') + if [[ "$ext" == "gz" ]]; then + zcat {input.R2} > {params.tmpr2} + else + ln -s {input.R2} {params.tmpr2} + fi; # read quality control, host removal # TODO: add support for mouse reads (mm10 genome prefix, "-x mm10") @@ -148,7 +175,8 @@ rule metawrap_read_qc: pigz -9 -p {threads} -c {params.this_qc_dir}/final_pure_reads_1.fastq > {params.trim_out}/{params.sid}_R1_trimmed.fastq.gz pigz -9 -p {threads} -c {params.this_qc_dir}/final_pure_reads_2.fastq > {params.trim_out}/{params.sid}_R2_trimmed.fastq.gz - # collate outputs to facilitate + # collate outputs to facilitate + mkdir -p {params.this_qc_dir}/dna ln -s {params.this_qc_dir}/post-QC_report/final_pure_reads_1_fastqc.html {params.this_qc_dir}/{params.sid}_R1_postrim_report.html ln -s {params.this_qc_dir}/post-QC_report/final_pure_reads_2_fastqc.html {params.this_qc_dir}/{params.sid}_R2_postrim_report.html ln -s {params.this_qc_dir}/pre-QC_report/{params.sid}_1_fastqc.html {params.this_qc_dir}/{params.sid}_R1_pretrim_report.html @@ -158,10 +186,16 @@ rule metawrap_read_qc: rule metawrap_genome_assembly: """ - + The metaWRAP::Assembly module allows the user to assemble a set of metagenomic reads with either + metaSPAdes or MegaHit (default). While metaSPAdes results in a superior assembly in most samples, + MegaHit is much more memory efficient, faster, and scales well with large datasets. + In addition to simplifying parameter selection for the user, this module also sorts and formats + the MegaHit assembly in a way that makes it easier to inspect. The contigs are sorted by length + and their naming is changed to resemble that of SPAdes, including the contig ID, length, and coverage. 
+ Finally, short scaffolds are discarded (<1000bp), and an assembly report is generated with QUAST. @Input: - Clean trimmed fastq.gz reads (R1 & R2 per sample) + Clean trimmed fastq reads (R1 & R2 per sample) @Output: Megahit assembled contigs and reports @@ -218,11 +252,21 @@ rule metawrap_genome_assembly: rule metawrap_tax_classification: """ - The metaWRAP::Kraken module takes in any number of fastq or fasta files, classifies the contained sequences with KRAKEN, - and reports the taxonomy distribution in a kronagram using KronaTools. If the sequences passed to the module belong to an - assembly and follow the contig naming convention of the Assembly module, the taxonomy of each contig is weighted based on - its length and coverage [weight=coverage*length]. The classifications of the sequences are then summarized in a format - that KronaTools' ktImportText function recognizes, and a final kronagram in html format is made with all the samples. + The metaWRAP::Taxonomic Classification module takes in any number of fastq or fasta files, classifies the contained sequences + with KRAKEN, and reports the taxonomy distribution in a kronagram using KronaTools. If the sequences passed to the module + belong to an assembly and follow the contig naming convention of the Assembly module, the taxonomy of each contig is weighted based on + its length and coverage [weight=coverage*length]. The classifications of the sequences are then summarized in a format that + KronaTools' ktImportText function recognizes, and a final kronagram in html format is made with all the samples. + + @Input: + - clean & trimmed reads (R1 & R2) + - ensemble genome assembly + + @Output: + - kraken2 kmer classification reports and tabular data outputs + - krona tabular outputs + - krona plot (interactive circular pie charts) of classified taxonomies + """ input: reads = expand(join(top_trim_dir, "{name}", "{name}_R{pair}_trimmed.fastq"), name=samples, pair=['1', '2']), @@ -232,8 +276,8 @@ rule metawrap_tax_classification: kraken2_asm = expand(join(top_tax_dir, "{name}", "final_assembly.kraken2"), name=samples), krona_asm = expand(join(top_tax_dir, "{name}", "final_assembly.krona"), name=samples), kronagram = expand(join(top_tax_dir, "{name}", "kronagram.html"), name=samples), - tax_dir = expand(join(top_tax_dir, "{name}"), name=samples), params: + tax_dir = expand(join(top_tax_dir, "{name}"), name=samples), rname = "metawrap_tax_classification", tax_subsample = str(int(1e6)), singularity: metawrap_container, @@ -244,7 +288,7 @@ rule metawrap_tax_classification: mw kraken2 \ -t {threads} \ -s {params.tax_subsample} \ - -o {output.tax_dir} \ + -o {params.tax_dir} \ {input.final_assembly} \ {input.reads} """ @@ -252,7 +296,21 @@ rule metawrap_tax_classification: rule metawrap_binning: """ - Metawrap wrapper for binning of reads/contigs to assemblies. + The metaWRAP::Binning module is meant to be a convenient wrapper around three metagenomic binning software: MaxBin2, metaBAT2, and CONCOCT. + + First the metagenomic assembly is indexed with bwa-index, and then paired end reads from any number of samples are aligned to it. + The alignments are sorted and compressed with samtools, and library insert size statistics are also gathered at the same time + (insert size average and standard deviation). + + metaBAT2's `jgi_summarize_bam_contig_depths` function is used to generate contig adundance table, and it is then converted + into the correct format for each of the three binners to take as input. 
After MaxBin2, metaBAT2, and CONCOCT finish binning + the contigs with default settings (the user can specify which software he wants to bin with), the final bins folders are + created with formatted bin fasta files for easy inspection. + + Optionally, the user can chose to immediately run CheckM on the bins to determine the success of the binning. + + CheckM's `lineage_wf` function is used to predict essential genes and estimate the completion and + contamination of each bin, and a custom script is used to generate an easy to view report on each bin set. Orchestrates execution of this ensemble of metagenomic binning software: - MaxBin2 @@ -264,7 +322,8 @@ rule metawrap_binning: a. metagenomic assembly is indexed with bwa-index b. paired end reads from any number of samples are aligned 2. Collate, sort, stage alignment outputs - a. Alignments are sorted and compressed with samtools, and library insert size statistics are also gathered at the same time (insert size average and standard deviation). + a. Alignments are sorted and compressed with samtools, and library insert size statistics are also + gathered at the same time (insert size average and standard deviation). b. metaBAT2's `jgi_summarize_bam_contig_depths` function is used to generate contig adundance table i. It is converted into the correct format for each of the three binners to take as input. 3. Ensemble binning @@ -274,6 +333,14 @@ rule metawrap_binning: a. CheckM's `lineage_wf` function is used to predict essential genes and estimate the completion and contamination of each bin. b. Outputs are formatted and collected for better viewing. + @Input: + Clean trimmed fastq.gz reads (R1 & R2 per sample) + + @Output: + Megahit assembled draft-genomes (bins) and reports + Metaspades assembled draft-genomes (bins) and reports + Ensemble assembled draft-genomes (bins) and reports + """ input: R1 = expand(join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq"), name=samples), @@ -334,6 +401,16 @@ rule metawrap_binning: rule derep_bins: + """ + dRep is a step which further refines draft-quality genomes (bins) by using a + fast, inaccurate estimation of genome distance, and a slow, accurate measure + of average nucleotide identity. 
+ + @Input: + + @Output: + + """ input: maxbin_contigs = join(top_binning_dir, "{name}", "maxbin2_bins.contigs"), maxbin_stats = join(top_binning_dir, "{name}", "maxbin2_bins.stats"), @@ -344,7 +421,7 @@ rule derep_bins: output: dereplicated_bins = directory(join(top_refine_dir, "{name}", "dereplicated_bins")), singularity: metawrap_container, - threads: 32 + threads: int(cluster["derep_bins"].get("threads", default_threads)), params: rname = "derep_bins", sid = "{name}", @@ -363,17 +440,14 @@ rule derep_bins: coverage_method = 'larger', shell: """ - mkdir -p """+top_refine_dir+""" - sed -i 's/^bin./{name}_bin./g' {input.maxbin_stats} && \ - sed -i 's/^bin./{name}_bin./g' {input.metabat2_stats} && \ - sed -i 's/^bin./{name}_bin./g' {input.metawrap_stats} && \ + mkdir -p {output.dereplicated_bins} dRep dereplicate \ + -g $(ls {params.metawrap_bins}/* | tr '\\n' ' ') \ -p {threads} \ -l {params.minimum_genome_length} \ -pa {params.ani_primary_threshold} \ -sa {params.ani_secondary_threshold} \ -nc {params.min_overlap} \ -cm {params.coverage_method} \ - -g {params.metawrap_bins}/* \ {output.dereplicated_bins} """ diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk index 46a9c9d..eae8547 100644 --- a/workflow/rules/RNA.smk +++ b/workflow/rules/RNA.smk @@ -3,63 +3,126 @@ # ~~~~~~~~~~ from os.path import join from itertools import chain +from scripts.common import str_bool, list_bool # ~~~~~~~~~~ # Constants and paths # ~~~~~~~~~~ workpath = config["project"]["workpath"] -datapath = config["project"]["datapath"] -rna_coasm = config["options"]["rnacoa"] -rna_sample_stems = config["rna"] +rna_datapath = config["project"].get("rna_datapath", "/dev/null") +rna_included = list_bool(config.get("rna", 'false')) +# rna_coasm = str_bool(config["options"].get("rnacoa", 'False')) +rna_coasm = False +rna_sample_stems = config.get("rna", []) +rna_compressed = True # if accepting uncompressed fastq input +top_readqc_dir_rna = join(workpath, config['project']['id'], "metawrap_read_qc_RNA") +top_trim_dir_rna = join(workpath, config['project']['id'], "trimmed_reads_RNA") +metawrap_container = config["containers"]["metawrap"] +pairedness = list(range(1, config['project']['nends']+1)) - -rule decompress_rna_reads: - input: - this_compressed_read = start_dc - output: - this_uncompressed_read = join(workpath, "{stem}.fastq"), - params: - rname = "decompress_rna_reads", - shell: - """ - """ +if rna_included: + start_r1_rna = expand(join(workpath, "rna", "{rname}_R1.fastq.gz"), rname=rna_sample_stems) + start_r2_rna = expand(join(workpath, "rna", "{rname}_R2.fastq.gz"), rname=rna_sample_stems) +else: + start_r1_rna, start_r2_rna = [], [] rule concat_rna_reads: input: + all_r1_reads = expand(join(workpath, "rna", "{rname}_R1.fastq.gz"), rname=rna_sample_stems if rna_coasm else []), + all_r2_reads = expand(join(workpath, "rna", "{rname}_R2.fastq.gz"), rname=rna_sample_stems if rna_coasm else []), output: + big_compressed_read_r1 = join(workpath, "rna", "concatenated_R1.fastq.gz"), + big_compressed_read_r2 = join(workpath, "rna", "concatenated_R2.fastq.gz"), + big_read1_hash = join(workpath, "rna", "concatenated_R1.md5"), + big_read2_hash = join(workpath, "rna", "concatenated_R2.md5"), params: rname = "concat_rna_reads", - shell: + big_read_r1 = join(workpath, "dna", "concatenated_R1.fastq"), + big_read_r2 = join(workpath, "dna", "concatenated_R2.fastq"), + input_dir = workpath, + threads: int(cluster["concat_rna_reads"].get('threads', default_threads)), + shell: """ + # concat r1 + for fastq in 
{params.input_dir}/*R1*fastq; do + ext=$(echo "${{fastq: -2}}" | tr '[:upper:]' '[:lower:]') + if [[ "$ext" == "gz" ]]; then + zcat $fastq >> {params.big_read_r1} + else + cat $fastq >> {params.big_read_r1} + fi; + done + + # concat r2 + for fastq in {params.input_dir}/*R2*fastq; do + ext=$(echo "${{fastq: -2}}" | tr '[:upper:]' '[:lower:]') + if [[ "$ext" == "gz" ]]; then + zcat $fastq > {params.big_read_r2} + else + cat $fastq >> {params.big_read_r2} + fi; + done + pigz -9 -p 28 -c {output.big_read_r1} > {output.big_compressed_read_r1} + pigz -9 -p 28 -c {output.big_read_r2} > {output.big_compressed_read_r2} + md5sum {output.big_compressed_read_r1} > {output.big_read1_hash} + md5sum {output.big_compressed_read_r2} > {output.big_read2_hash} """ rule rna_read_qc: input: + R1 = start_r1_rna, + R2 = start_r2_rna, output: + R1_pretrim_report = join(top_readqc_dir_rna, "{rname}", "{rname}_R1_pretrim_report.html"), + R2_pretrim_report = join(top_readqc_dir_rna, "{rname}", "{rname}_R2_pretrim_report.html"), + R1_postrim_report = join(top_readqc_dir_rna, "{rname}", "{rname}_R1_postrim_report.html"), + R2_postrim_report = join(top_readqc_dir_rna, "{rname}", "{rname}_R2_postrim_report.html"), + R1_trimmed = join(top_trim_dir_rna, "{rname}", "{rname}_R1_trimmed.fastq"), + R2_trimmed = join(top_trim_dir_rna, "{rname}", "{rname}_R2_trimmed.fastq"), + R1_trimmed_gz = join(top_trim_dir_rna, "{rname}", "{rname}_R1_trimmed.fastq.gz"), + R2_trimmed_gz = join(top_trim_dir_rna, "{rname}", "{rname}_R2_trimmed.fastq.gz"), params: - rname = "rna_read_qc", + rname = "rna_read_qc", + sid = "{rname}", + this_qc_dir = join(top_readqc_dir_rna, "{rname}"), + trim_out = join(top_trim_dir_rna, "{rname}"), + tmpr1 = lambda _, output, input: join(config['options']['tmp_dir'], str(basename(str(input.R1))).replace('_R1.', '_1.').replace('.gz', '')), + tmpr2 = lambda _, output, input: join(config['options']['tmp_dir'], str(basename(str(input.R2))).replace('_R2.', '_2.').replace('.gz', '')), + containerized: metawrap_container, shell: """ - mw read_qc \ - -1 $(RAWDATA.RNA)/$${sample}_1.fastq \ - -2 $(RAWDATA.RNA)/$${sample}_2.fastq \ - --skip-trimming --skip-pre-qc-report --skip-post-qc-report \ - -x hg38 \ - -t {threads} \ - -o $(ANALYSIS)/READ_QC_RNA/$${sample} - fastqc -o $(ANALYSIS)/FASTQC_RNA/PRE -t {threads} -f fastq $(RAWDATA.RNA)/$${sample}_1.fastq $(RAWDATA.RNA)/$${sample}_2.fastq - fastqc -o $(ANALYSIS)/FASTQC_RNA/POST -t {threads} -f fastq $(ANALYSIS)/READ_QC_RNA/$${sample}_1.fastq $(ANALYSIS)/READ_QC_RNA/$${sample}_2.fastq - """ + # uncompress to lscratch + rone="{input.R1}" + ext=$(echo "${{rone: -2}}" | tr '[:upper:]' '[:lower:]') + if [[ "$ext" == "gz" ]]; then + zcat {input.R1} > {params.tmpr1} + else + ln -s {input.R1} {params.tmpr1} + fi; -rule map_rna_to_metagenome: - input: - output: - params: - rname = "map_rna_to_metagenome", - shell: - """ - humann --threads 16 --input $(ANALYSIS)/READ_QC_RNA/$${sample}_concat.fastq --remove-temp-output --input-format fastq --output-basename $${sample} --output ./ + rtwo="{input.R2}" + ext=$(echo "${{rtwo: -2}}" | tr '[:upper:]' '[:lower:]') + if [[ "$ext" == "gz" ]]; then + zcat {input.R2} > {params.tmpr2} + else + ln -s {input.R2} {params.tmpr2} + fi; + + # read quality control, host removal + # TODO: add support for mouse reads (mm10 genome prefix, "-x mm10") + mw read_qc -1 {params.tmpr1} -2 {params.tmpr2} -t {threads} -o {params.this_qc_dir} + + # collate fastq outputs to facilitate workflow, compress + ln -s {params.this_qc_dir}/final_pure_reads_1.fastq 
{params.trim_out}/{params.sid}_R1_trimmed.fastq + ln -s {params.this_qc_dir}/final_pure_reads_2.fastq {params.trim_out}/{params.sid}_R2_trimmed.fastq + pigz -9 -p {threads} -c {params.this_qc_dir}/final_pure_reads_1.fastq > {params.trim_out}/{params.sid}_R1_trimmed.fastq.gz + pigz -9 -p {threads} -c {params.this_qc_dir}/final_pure_reads_2.fastq > {params.trim_out}/{params.sid}_R2_trimmed.fastq.gz + + # collate outputs to facilitate + ln -s {params.this_qc_dir}/post-QC_report/final_pure_reads_1_fastqc.html {params.this_qc_dir}/{params.sid}_R1_postrim_report.html + ln -s {params.this_qc_dir}/post-QC_report/final_pure_reads_2_fastqc.html {params.this_qc_dir}/{params.sid}_R2_postrim_report.html + ln -s {params.this_qc_dir}/pre-QC_report/{params.sid}_1_fastqc.html {params.this_qc_dir}/{params.sid}_R1_pretrim_report.html + ln -s {params.this_qc_dir}/pre-QC_report/{params.sid}_2_fastqc.html {params.this_qc_dir}/{params.sid}_R2_pretrim_report.html """ - \ No newline at end of file diff --git a/workflow/rules/hooks.smk b/workflow/rules/hooks.smk index 3a68818..647624b 100644 --- a/workflow/rules/hooks.smk +++ b/workflow/rules/hooks.smk @@ -56,10 +56,13 @@ if config["options"]["mode"] == "slurm": # is run and no child jobs are submitted touch failed_jobs_${{timestamp}}.tsv }} - touch COMPLETED + failed_wc=$(wc -l failed_jobs_${{timestamp}}.tsv); + if [ "$failed_wc" -le "1" ]; then + rm failed_jobs_${{timestamp}}.tsv; + fi + touch COMPLETED """ ) - onerror: shell( """ diff --git a/workflow/scripts/common.py b/workflow/scripts/common.py index 92f9d23..5e660a2 100644 --- a/workflow/scripts/common.py +++ b/workflow/scripts/common.py @@ -65,6 +65,8 @@ def allocated(resource, rule, lookup, default="__default__"): def str_bool(s): """ + Deprecated from stdlib in 3.10, see: https://peps.python.org/pep-0632/ + Converts a string to boolean. It is dangerous to try to typecast a string into a boolean value using the built-in `bool()` function. 
This function avoids any issues that can @@ -74,6 +76,8 @@ def str_bool(s): boolean('False') returns False boolean('asdas') raises TypeError """ + if not isinstance(s, str): + s = str(s) val = s.lower() if val in ['true', '1', 'y', 'yes']: return True @@ -82,4 +86,13 @@ def str_bool(s): else: # Provided value could not be # type casted into a boolean - raise TypeError('Fatal: cannot type cast {} into a boolean'.format(val)) \ No newline at end of file + raise TypeError('Fatal: cannot type cast {} into a boolean'.format(val)) + + +def list_bool(l): + # some lists are strings of "None" if no files input + # type checking instead of string manip + if isinstance(l, list): + if len(l) > 0: + return True + return False \ No newline at end of file From 07ea12b572523089c977cba0b3a5d76c1f4390e4 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Tue, 27 Feb 2024 15:53:50 -0500 Subject: [PATCH 26/32] feat: expand cluster resourcing for new rna rules and better tailor dna ones --- config/cluster.json | 56 ++++++++++++++++++++++++++++++++----------- config/images.json | 4 ++-- config/resources.json | 25 +++++++++++-------- 3 files changed, 59 insertions(+), 26 deletions(-) diff --git a/config/cluster.json b/config/cluster.json index 148a57d..45bce7a 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -1,32 +1,60 @@ { "__default__": { "threads": 4, - "mem": "8g", + "mem": "8G", "partition": "norm", "time": "0-04:00:00", "gres": "lscratch:64" }, + "concat_reads": { + "threads": 28, + "mem": "16G", + "partition": "norm", + "time": "1-00:00:00", + "gres": "lscratch:400" + }, + "concat_rna_reads": { + "threads": 28, + "mem": "16G", + "partition": "norm", + "time": "1-00:00:00", + "gres": "lscratch:400" + }, "metawrap_read_qc": { "threads": 16, - "mem": "32g", + "mem": "32G", + "partition": "norm", + "time": "1-00:00:00", + "gres": "lscratch:400" + }, + "rna_read_qc": { + "threads": 16, + "mem": "32G", "partition": "norm", - "time": "1-00:00:00" + "time": "1-00:00:00", + "gres": "lscratch:400" }, "metawrap_genome_assembly": { - "threads": 24, - "mem": "128g", + "threads": 48, + "mem": "128G", "partition": "norm", - "time": "2-00:00:00" + "time": "5-00:00:00" }, "metawrap_tax_classification": { - "threads": 12, - "mem": "32g", - "partition": "quick", - "time": "0-04:00:00" - }, - "metawrap_assembly_binning": { - "threads": 16, - "mem": "64g", + "threads": 32, + "mem": "64G", + "partition": "norm", + "time": "5-00:00:00" + }, + "metawrap_binning": { + "threads": 32, + "mem": "64G", + "partition": "norm", + "time": "2-00:00:00" + }, + "derep_bins": { + "threads": 32, + "mem": "32G", "partition": "norm", "time": "2-00:00:00" } diff --git a/config/images.json b/config/images.json index 8c1e3f5..428943f 100644 --- a/config/images.json +++ b/config/images.json @@ -1,10 +1,10 @@ { "images": { - "metawrap": "docker://rroutsong/metamorph_metawrap:0.0.2", + "metawrap": "docker://rroutsong/metamorph_metawrap:0.0.4", "metagenome": "docker://rroutsong/metamorph_metagenome:0.0.1" }, "containers": { - "metawrap": "/data/OpenOmics/SIFs/metamorph_metawrap_0.0.2.sif", + "metawrap": "/data/OpenOmics/SIFs/metamorph_metawrap_0.0.4.sif", "metagenome": "/data/OpenOmics/SIFs/metamorph_metagenome_0.0.1.sif" } } diff --git a/config/resources.json b/config/resources.json index 27a89ac..284ec44 100644 --- a/config/resources.json +++ b/config/resources.json @@ -1,10 +1,10 @@ { "databases": [ { - "name": "KRAKEN_DB2", + "name": "KRAKEN2_DB", "to": "/data2/KRAKEN_DB2", - "from": 
"/data/OpenOmics/references/metamorph/kraken2/k2_pluspfp_08gb_20230605", - "mode": "ro" + "from": "/data/OpenOmics/references/metamorph/kraken2/k2_pluspfp_08gb_20240112", + "mode": "rw" }, { "name": "KRAKEN_DB", "to": "/data2/KRAKEN_DB", @@ -26,19 +26,24 @@ "from": "/data/OpenOmics/references/metamorph/GUNC/gunc_1.0.5db", "mode": "ro" }, { - "name": "CHECKM_DB", - "to": "/data2/checkm", - "from": "/data/OpenOmics/references/metamorph/checkm", - "mode": "rw" - }, { "name": "NCBI_NT", "to": "/data2/NCBI_NT_DB", "from": "/data/OpenOmics/references/metamorph/NCBI_nt", "mode": "rw" - }, { + }, { "name": "NCBI_TAX", "to": "/data2/NCBI_TAX_DB", - "from": "/data/OpenOmics/references/metamorph/NCBI_tax", + "from": "/data/OpenOmics/references/metamorph/taxonomy", + "mode": "rw" + }, { + "name": "CHECKM_DB", + "to": "/data2/CHECKM_DB", + "from": "/data/OpenOmics/references/metamorph/checkm", + "mode": "rw" + }, { + "name": "CHECKM_CONFIG", + "to": "/opt/conda/envs/metawrap-env/lib/python2.7/site-packages/checkm/DATA_CONFIG", + "from": "/data/OpenOmics/references/metamorph/checkm/DATA_CONFIG", "mode": "rw" } ] From e49c5ab544d228554baf5dcaae8d5f35dc0ef0a5 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Tue, 27 Feb 2024 15:54:38 -0500 Subject: [PATCH 27/32] feat: python execution infrastructure for snakemake execution with or without rna --- metamorph | 108 +++++++++++++++++++++++++++++++++-------------------- src/run.py | 61 ++++++++++++++++++------------ src/run.sh | 36 ++++++++++++------ 3 files changed, 128 insertions(+), 77 deletions(-) diff --git a/metamorph b/metamorph index 2795f03..17a1f55 100755 --- a/metamorph +++ b/metamorph @@ -47,9 +47,10 @@ EXAMPLES: # Python standard library from __future__ import print_function import sys, os, subprocess, re, json, textwrap +from datetime import timezone, datetime # 3rd party imports from pypi -import argparse # potential python3 3rd party package, added in python/3.5 +import argparse # Local imports from src import version @@ -70,6 +71,7 @@ __version__ = version __authors__ = 'Neelam Redekar, Skyler Kuhn' __email__ = 'neelam.redekar nih.gov, skyler.kuhn nih.gov' __home__ = os.path.dirname(os.path.abspath(__file__)) +_datetime = int(datetime.now(tz=timezone.utc).timestamp() * 1000) _name = os.path.basename(sys.argv[0]) _description = 'An awesome metagenomics and metatranscriptomics pipeline' @@ -87,7 +89,10 @@ def unlock(sub_args): try: unlock_output = subprocess.check_output([ - 'snakemake', '--unlock', + 'snakemake', + '--unlock', + '--force', + '-s', os.path.abspath(f'{outdir}/workflow/Snakefile'), '--cores', '1', '--configfile=config.json' ], cwd = outdir, @@ -146,7 +151,8 @@ def run(sub_args): ) config['bindpaths'] = bindpaths - config['coassembly'] = sub_args.coa + # config['coassembly'] = sub_args.coa + config['coassembly'] = False # Step 4. Save config to output directory with open(os.path.join(sub_args.output, 'config.json'), 'w') as fh: @@ -185,7 +191,7 @@ def run(sub_args): submission_script=os.path.join(__home__, 'src', 'run.sh'), logger = logfh, additional_bind_paths = ",".join(bindpaths), - tmp_dir = sub_args.tmp_dir, + tmp_dir = sub_args.tmp_dir, ) # Step 6. Wait for subprocess to complete, @@ -292,9 +298,7 @@ def parsed_arguments(name, description): provided working directory has not been initialized, it will be created automatically. Example: --output /data/$USER/output - - {3}{4}Analysis options:{5} - ...coming soon! 
+ {3}{4}Orchestration options:{5} --mode {{slurm,local}} @@ -372,35 +376,50 @@ def parsed_arguments(name, description): # Display example usage in epilog run_epilog = textwrap.dedent("""\ - {2}{3}Example:{4} - # Step 1.) Grab an interactive node, - # do not run on head node! - srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash - module purge - module load singularity snakemake - - # Step 2A.) Dry-run the pipeline - ./{0} run --input .tests/*.R?.fastq.gz \\ - --output /data/$USER/output \\ - --mode slurm \\ - --dry-run - - # Step 3A.) Run the {0} pipeline in per-sample fashion - # The slurm mode will submit jobs to - # the cluster. It is recommended running - # the pipeline in this mode. - ./{0} run --input .tests/*.R?.fastq.gz \\ - --output /data/$USER/output \\ - --mode slurm - - # Step 3B.) Run the {0} pipeline in co-assembly fashion - # with slurm - ./{0} run --coa --input .tests/*.R?.fastq.gz \\ - --output /data/$USER/output \\ - --mode slurm + {2}{3}INPUT MODES:{4} + # Step 1.) Grab an interactive node, + # do not run on head node! + srun -N 1 -n 1 --time=1:00:00 --mem=8gb --cpus-per-task=2 --pty bash + module purge + module load singularity snakemake + + # Step 2A.) Dry-run the pipeline + ./{0} run --input .tests/*.R?.fastq.gz \\ + --output /data/$USER/output \\ + --mode slurm \\ + --dry-run + + # Step 3A.) Run the {0} pipeline in per-sample fashion + # The slurm mode will submit jobs to + # the cluster. It is recommended running + # the pipeline in this mode. + ./{0} run --input .tests/*.R?.fastq.gz \\ + --output /data/$USER/output \\ + --mode slurm + + # Step 3B.) Run the {0} pipeline in co-assembly fashion + # with slurm + ./{0} run --coa --input .tests/*.R?.fastq.gz \\ + --output /data/$USER/output \\ + --mode slurm + + {2}{3}EXAMPLES:{4} + co-assembly dna-only: + $ metamorph run --coa --input *.R?.fastq.gz --output output + $ metamorph run -C --input *.R?.fastq.gz --output output + + per-sample assembly dna-only: + $ metamorph run --input *.R?.fastq.gz --output output + + co-assembly rna & dna: + $ metamorph run --coa --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output + $ metamorph run -C --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output + + per-sample assembly rna & dna: + $ metamorph run --input *.R?.fastq.gz --rna rna/*.R?.fastq.gz --output output - {2}{3}Version:{4} + {2}{3}VERSION:{4} {1} """.format(name, __version__, c.bold, c.url, c.end)) @@ -469,12 +488,19 @@ def parsed_arguments(name, description): ) # a supported job scheduler, etc. - subparser_run.add_argument( - '-C', '--coa', - action="store_true", - required = False, - help = argparse.SUPPRESS - ) + # subparser_run.add_argument( + # '-C', '--coa', + # action="store_true", + # required = False, + # help = argparse.SUPPRESS + # ) + + # subparser_run.add_argument( + # '-R', '--rnacoa', + # action="store_true", + # required = False, + # help = argparse.SUPPRESS + # ) # Name of master job subparser_run.add_argument( @@ -529,7 +555,7 @@ def parsed_arguments(name, description): '--tmp-dir', type = str, required = False, - default = '/lscratch/$SLURM_JOBID/', + default = '/lscratch/$SLURM_JOB_ID/', help = argparse.SUPPRESS ) diff --git a/src/run.py b/src/run.py index c6df6c2..96b7391 100644 --- a/src/run.py +++ b/src/run.py @@ -18,6 +18,10 @@ from . 
import version as __version__

+FASTQ_INPUT_EXT = ".fastq.gz"
+FASTQ_R1_POSTFIX = f"_R1{FASTQ_INPUT_EXT}"
+FASTQ_R2_POSTFIX = f"_R2{FASTQ_INPUT_EXT}"
+

 def init(repo_path, output_path, links=[], required=['workflow', 'resources', 'config']):
     """Initialize the output directory. If user provides a output
@@ -125,19 +129,19 @@ def rename(filename):
     # key = regex to match string and value = how it will be renamed
     extensions = {
         # Matches: _R[12]_fastq.gz, _R[12].fastq.gz, _R[12]_fq.gz, etc.
-        ".R1.f(ast)?q.gz$": ".R1.fastq.gz",
-        ".R2.f(ast)?q.gz$": ".R2.fastq.gz",
+        ".R1.f(ast)?q.gz$": FASTQ_R1_POSTFIX,
+        ".R2.f(ast)?q.gz$": FASTQ_R2_POSTFIX,
         # Matches: _R[12]_001_fastq_gz, _R[12].001.fastq.gz, _R[12]_001.fq.gz, etc.
         # Capture lane information as named group
-        ".R1.(?P<lane>...).f(ast)?q.gz$": ".R1.fastq.gz",
-        ".R2.(?P<lane>...).f(ast)?q.gz$": ".R2.fastq.gz",
+        ".R1.(?P<lane>...).f(ast)?q.gz$": FASTQ_R1_POSTFIX,
+        ".R2.(?P<lane>...).f(ast)?q.gz$": FASTQ_R2_POSTFIX,
         # Matches: _[12].fastq.gz, _[12].fq.gz, _[12]_fastq_gz, etc.
-        "_1.f(ast)?q.gz$": ".R1.fastq.gz",
-        "_2.f(ast)?q.gz$": ".R2.fastq.gz"
+        "_1.f(ast)?q.gz$": FASTQ_R1_POSTFIX,
+        "_2.f(ast)?q.gz$": FASTQ_R2_POSTFIX
     }

-    if (filename.endswith('.R1.fastq.gz') or
-        filename.endswith('.R2.fastq.gz')):
+    if (filename.endswith(FASTQ_R1_POSTFIX) or
+        filename.endswith(FASTQ_R2_POSTFIX)):
         # Filename is already in the correct format
         return filename

@@ -349,7 +353,7 @@ def mixed_inputs(ifiles):
     fastqs = False
     bams = False
     for file in ifiles:
-        if file.endswith('.R1.fastq.gz') or file.endswith('.R2.fastq.gz'):
+        if file.endswith(FASTQ_R1_POSTFIX) or file.endswith(FASTQ_R2_POSTFIX):
             fastqs = True
             fq_files.append(file)
         elif file.endswith('.bam'):
             bams = True
@@ -395,13 +399,17 @@ def add_user_information(config):
     config['project']['userhome'] = home
     config['project']['username'] = username

-    dt = datetime.now().strftime("%m_%d_%Y")
-    config['project']['id'] = f"{dt}_metagenome_results"
+    # dt = datetime.now().strftime("%m_%d_%Y")
+    # config['project']['id'] = f"{dt}_metagenome_results"
+
+    # TODO: figure out a way to uniquely ID results, engineering out
+    # the problem of misidentifying results files
+    config['project']['id'] = "metagenome_results"

     return config


-def add_sample_metadata(input_files, config, rna_files=None, group=None):
+def add_sample_metadata(input_files, config, group_key='samples'):
     """Adds sample metadata such as sample basename, label,
     and group information. If sample sheet is provided, it will default
     to using information in that file. If no sample sheet is provided,
     it will only add sample basenames and labels.
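For illustration, here is a minimal standalone sketch of the `rename()` extension mapping shown above, using the same `_R[12].fastq.gz` postfix convention; the `normalize` helper and the sample filenames are hypothetical, not part of the patch:

import re

# Mirrors the shape of the patch's extensions mapping: keys are regexes
# over raw FASTQ names, values are the normalized mate postfixes.
EXTENSIONS = {
    r"\.R1\.f(ast)?q\.gz$": "_R1.fastq.gz",
    r"\.R2\.f(ast)?q\.gz$": "_R2.fastq.gz",
    # A three-character lane token (e.g. 001) is captured as a named
    # group and dropped during normalization.
    r"\.R1\.(?P<lane>...)\.f(ast)?q\.gz$": "_R1.fastq.gz",
    r"\.R2\.(?P<lane>...)\.f(ast)?q\.gz$": "_R2.fastq.gz",
    r"_1\.f(ast)?q\.gz$": "_R1.fastq.gz",
    r"_2\.f(ast)?q\.gz$": "_R2.fastq.gz",
}

def normalize(filename):
    # Rewrite the first matching mate/lane suffix; leave filenames that
    # match no pattern untouched.
    for pattern, postfix in EXTENSIONS.items():
        if re.search(pattern, filename):
            return re.sub(pattern, postfix, filename)
    return filename

assert normalize("sampleA.R1.001.fastq.gz") == "sampleA_R1.fastq.gz"
assert normalize("sampleB_2.fq.gz") == "sampleB_R2.fastq.gz"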
@@ -415,14 +423,14 @@ def add_sample_metadata(input_files, config, rna_files=None, group=None): Updated config with basenames, labels, and groups (if provided) """ added = [] - config['samples'] = [] + config[group_key] = [] for file in input_files: # Split sample name on file extension - sample = re.split('\.R[12]\.fastq\.gz', os.path.basename(file))[0] + sample = re.split('[\S]R[12]', os.path.basename(file))[0] if sample not in added: # Only add PE sample information once added.append(sample) - config['samples'].append(sample) + config[group_key].append(sample) return config @@ -453,11 +461,15 @@ def add_rawdata_information(sub_args, config, ifiles): config['project']['filetype'] = convert[nends] # Finds the set of rawdata directories to bind - rawdata_paths = get_rawdata_bind_paths(input_files = sub_args.input) - config['project']['datapath'] = ','.join(rawdata_paths) + config['project']['datapath'] = ','.join(get_rawdata_bind_paths(input_files = sub_args.input)) + if sub_args.rna: + config["project"]["rna_datapath"] = ','.join(get_rawdata_bind_paths(input_files = sub_args.rna)) # Add each sample's basename config = add_sample_metadata(ifiles['dna'], config) + + if 'rna' in ifiles: + config = add_sample_metadata(ifiles['rna'], config, group_key='rna') return config @@ -517,7 +529,7 @@ def get_nends(ifiles): bam_files = True nends_status = -1 break - elif file.endswith('.R2.fastq.gz'): + elif file.endswith(FASTQ_R2_POSTFIX): paired_end = True nends_status = 2 break # dataset is paired-end @@ -528,7 +540,7 @@ def get_nends(ifiles): nends = {} # keep count of R1 and R2 for each sample for file in ifiles: # Split sample name on file extension - sample = re.split('\.R[12]\.fastq\.gz', os.path.basename(file))[0] + sample = re.split('\_R[12]\.fastq\.gz', os.path.basename(file))[0] if sample not in nends: nends[sample] = 0 @@ -542,8 +554,8 @@ def get_nends(ifiles): both mates (R1 and R2) for the following samples:\n\t\t{}\n Please check that the basename for each sample is consistent across mates. Here is an example of a consistent basename across mates: - consistent_basename.R1.fastq.gz - consistent_basename.R2.fastq.gz + consistent_basename_R1.fastq.gz + consistent_basename_R2.fastq.gz Please do not run the pipeline with a mixture of single-end and paired-end samples. This feature is currently not supported within {}, and it is @@ -635,7 +647,7 @@ def runner( threads=2, jobname=__job_name__, submission_script='run.sh', - tmp_dir = '/lscratch/$SLURM_JOBID/' + tmp_dir = '/lscratch/$SLURM_JOB_ID/' ): """Runs the pipeline via selected executor: local or slurm. If 'local' is selected, the pipeline is executed locally on a compute node/instance. 
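To make the mate-pairing check concrete, here is a small self-contained sketch of the counting that `get_nends` performs once filenames follow the `_R[12].fastq.gz` convention; the sample names and the `unpaired_samples` helper are hypothetical:

import re
from collections import Counter

def unpaired_samples(fastqs):
    # Strip the mate suffix to recover each sample's basename, then
    # count files per basename; paired-end data should contribute
    # exactly two files (R1 and R2) per sample.
    counts = Counter(
        re.split(r"_R[12]\.fastq\.gz", name)[0] for name in fastqs
    )
    return [sample for sample, n in counts.items() if n != 2]

# One orphaned mate: B has an R1 but no R2
fastqs = ["A_R1.fastq.gz", "A_R2.fastq.gz", "B_R1.fastq.gz"]
assert unpaired_samples(fastqs) == ["B"]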
@@ -716,10 +728,11 @@ def runner(
     # snakemake API call: https://snakemake.readthedocs.io/en/stable/api_reference/snakemake.html
     masterjob = subprocess.Popen([
                 'snakemake', '-pr',
-                #'--rerun-incomplete',
+                '--rerun-incomplete',
+                '--rerun-triggers=input',
                 '--verbose',
                 '--use-singularity',
-                '--singularity-args', "\\-C \\-B '{}'".format(bindpaths),
+                '--singularity-args', "\\-c \\-B '{}'".format(bindpaths),
                 '--cores', str(threads),
                 '--configfile=config.json'
             ], cwd = outdir, stderr=subprocess.STDOUT, stdout=logger, env=my_env)
diff --git a/src/run.sh b/src/run.sh
index b01df3a..6006904 100755
--- a/src/run.sh
+++ b/src/run.sh
@@ -209,11 +209,10 @@ function submit(){
         if [[ ${6#\'} != /lscratch* ]]; then
             CLUSTER_OPTS="sbatch --cpus-per-task {cluster.threads} -p {cluster.partition} -t {cluster.time} --mem {cluster.mem} --job-name={params.rname} -e $SLURM_DIR/slurm-%j_{params.rname}.out -o $SLURM_DIR/slurm-%j_{params.rname}.out"
         fi
-        # Create sbacth script to build index
         cat << EOF > kickoff.sh
 #!/usr/bin/env bash
-#SBATCH --cpus-per-task=16
-#SBATCH --mem=96g
+#SBATCH --cpus-per-task=16
+#SBATCH --mem=32g
 #SBATCH --time=5-00:00:00
 #SBATCH --parsable
 #SBATCH -J "$2"
@@ -222,16 +221,29 @@
 #SBATCH --error "$3/logfiles/snakemake.log"
 set -euo pipefail
 # Main process of pipeline
-snakemake --latency-wait 120 -s "$3/workflow/Snakefile" -d "$3" \\
-    --use-singularity --singularity-args "\\-C \\-B '$4'" \\
-    --use-envmodules --verbose --configfile="$3/config.json" \\
-    --printshellcmds --cluster-config "$3/config/cluster.json" \\
-    --cluster "${CLUSTER_OPTS}" --keep-going -j 500 \\
-    --rerun-incomplete --stats "$3/logfiles/runtime_statistics.json" \\
-    --keep-incomplete --restart-times 0 \\
-    --keep-remote --local-cores 14 2>&1
+snakemake \\
+    -p \\
+    --latency-wait 120 \\
+    -s "$3/workflow/Snakefile" \\
+    -d "$3" \\
+    --use-singularity \\
+    --singularity-args "\\-c \\-B '$4'" \\
+    --use-envmodules \\
+    --verbose \\
+    --configfile "$3/config.json" \\
+    --printshellcmds \\
+    --cluster-config "$3/config/cluster.json" \\
+    --cluster "${CLUSTER_OPTS}" \\
+    --keep-going \\
+    --rerun-incomplete \\
+    --jobs 500 \\
+    --keep-remote \\
+    --stats "$3/logfiles/runtime_statistics.json" \\
+    --restart-times 0 \\
+    --keep-incomplete \\
+    --local-cores "14" 2>&1
 # Create summary report
-snakemake -d "$3" --report "Snakemake_Report.html"
+snakemake -s "$3/workflow/Snakefile" -d "$3" --configfile="$3/config.json" --report "Snakemake_Report.html"
 EOF
 chmod +x kickoff.sh
 job_id=$(sbatch kickoff.sh | tee -a "$3"/logfiles/master.log)

From 9a13f7c51641741adecade5f378e302921f329a3 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Tue, 27 Feb 2024 17:06:30 -0500
Subject: [PATCH 28/32] fix: run read qc rules per-sample

---
 workflow/rules/DNA.smk | 20 ++++++++++++--------
 workflow/rules/RNA.smk |  7 +++----
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/workflow/rules/DNA.smk b/workflow/rules/DNA.smk
index db211d9..227388f 100644
--- a/workflow/rules/DNA.smk
+++ b/workflow/rules/DNA.smk
@@ -41,10 +41,6 @@ mem2int = lambda x: int(str(x).lower().replace('gb', '').repl
align DNA to assembly """ -start_r1 = expand(join(workpath, "dna", "{name}_R1.fastq.gz"), name=samples) -start_r2 = expand(join(workpath, "dna", "{name}_R2.fastq.gz"), name=samples) - - rule concat_reads: input: all_r1_reads = expand(join(workpath, "dna", "{sid}_R1.fastq.gz"), sid=config['samples'] if config['coassembly'] else []), @@ -126,8 +122,8 @@ rule metawrap_read_qc: - FastQC html report and zip file on trimmed data """ input: - R1 = start_r1, - R2 = start_r2, + R1 = join(workpath, "dna", "{name}_R1.fastq.gz"), + R2 = join(workpath, "dna", "{name}_R2.fastq.gz"), output: R1_pretrim_report = join(top_readqc_dir, "{name}", "{name}_R1_pretrim_report.html"), R2_pretrim_report = join(top_readqc_dir, "{name}", "{name}_R2_pretrim_report.html"), @@ -359,7 +355,9 @@ rule metawrap_binning: bin_figure = join(top_binning_dir, "{name}", "figures", "binning_results.png"), params: rname = "metawrap_binning", + bin_parent_dir = top_binning_dir bin_dir = join(top_binning_dir, "{name}"), + bin_summary_dir = join(bin_dir, "summary"), bin_mem = mem2int(cluster['metawrap_binning'].get("mem", default_memory)), mw_trim_linker_R1 = join(top_trim_dir, "{name}", "{name}_1.fastq"), mw_trim_linker_R2 = join(top_trim_dir, "{name}", "{name}_2.fastq"), @@ -373,9 +371,9 @@ rule metawrap_binning: export CHECKM_DATA_PATH="/data2/CHECKM_DB" # make base dir if not exists - mkdir -p """+top_binning_dir+""" + mkdir -p {params.bin_parent_dir} if [ -d "{params.bin_dir}" ]; then rm -rf {params.bin_dir}; fi - + # setup links for metawrap input [[ -f "{params.mw_trim_linker_R1}" ]] || ln -s {input.R1} {params.mw_trim_linker_R1} [[ -f "{params.mw_trim_linker_R2}" ]] || ln -s {input.R2} {params.mw_trim_linker_R2} @@ -397,6 +395,8 @@ rule metawrap_binning: -B {params.bin_dir}/maxbin2_bins \ -c {params.min_perc_complete} \ -x {params.max_perc_contam} + + """ @@ -407,8 +407,12 @@ rule derep_bins: of average nucleotide identity. 
@Input:
+        maxbin2 assembly bins, contigs, stat summaries
+        metabat2 assembly bins, contigs, stat summaries
+        metawrap assembly bins, contigs, stat summaries
     @Output:
+        directory of consensus ensemble bins (deterministic output)
     """
     input:
diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk
index eae8547..44a94e2 100644
--- a/workflow/rules/RNA.smk
+++ b/workflow/rules/RNA.smk
@@ -21,8 +21,7 @@ metawrap_container = config["containers"]["metawrap"]
 pairedness = list(range(1, config['project']['nends']+1))

 if rna_included:
-    start_r1_rna = expand(join(workpath, "rna", "{rname}_R1.fastq.gz"), rname=rna_sample_stems)
-    start_r2_rna = expand(join(workpath, "rna", "{rname}_R2.fastq.gz"), rname=rna_sample_stems)
+
 else:
     start_r1_rna, start_r2_rna = [], []

@@ -72,8 +71,8 @@ rule concat_rna_reads:

 rule rna_read_qc:
     input:
-        R1 = start_r1_rna,
-        R2 = start_r2_rna,
+        R1 = join(workpath, "rna", "{rname}_R1.fastq.gz")
+        R2 = join(workpath, "rna", "{rname}_R2.fastq.gz")
     output:
         R1_pretrim_report = join(top_readqc_dir_rna, "{rname}", "{rname}_R1_pretrim_report.html"),
         R2_pretrim_report = join(top_readqc_dir_rna, "{rname}", "{rname}_R2_pretrim_report.html"),

From b030269c484568c48bcfdab994149ce02c7c2fe5 Mon Sep 17 00:00:00 2001
From: Ryan Routsong
Date: Tue, 27 Feb 2024 17:53:43 -0500
Subject: [PATCH 29/32] fix: PR feedback

---
 config/cluster.json    | 2 +-
 src/run.sh             | 2 +-
 workflow/rules/DNA.smk | 7 ++-----
 workflow/rules/RNA.smk | 9 ++-------
 4 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/config/cluster.json b/config/cluster.json
index 45bce7a..91bfdb7 100644
--- a/config/cluster.json
+++ b/config/cluster.json
@@ -38,7 +38,7 @@
         "threads": 48,
         "mem": "128G",
         "partition": "norm",
-        "time": "5-00:00:00"
+        "time": "10-00:00:00"
     },
     "metawrap_tax_classification": {
diff --git a/src/run.sh b/src/run.sh
index 6006904..790c8bc 100755
--- a/src/run.sh
+++ b/src/run.sh
@@ -213,7 +213,7 @@
 #!/usr/bin/env bash
 #SBATCH --cpus-per-task=16
 #SBATCH --mem=32g
-#SBATCH --time=5-00:00:00
+#SBATCH --time=10-00:00:00
 #SBATCH --parsable
 #SBATCH -J "$2"
 #SBATCH --mail-type=BEGIN,END,FAIL
diff --git a/workflow/rules/DNA.smk b/workflow/rules/DNA.smk
index 227388f..92e6901 100644
--- a/workflow/rules/DNA.smk
+++ b/workflow/rules/DNA.smk
@@ -229,7 +229,7 @@ rule metawrap_genome_assembly:
     shell:
         """
         # remove empty directories by snakemake, to prevent metawrap error
-        rm -rf {params.mh_dir}
+        rm -rf {params.mh_dir:q}
         # link to the file names metawrap expects
         ln -s {input.R1} {output.assembly_R1}
         ln -s {input.R2} {output.assembly_R2}
@@ -355,9 +355,8 @@ rule metawrap_binning:
         bin_figure = join(top_binning_dir, "{name}", "figures", "binning_results.png"),
     params:
         rname = "metawrap_binning",
-        bin_parent_dir = top_binning_dir
+        bin_parent_dir = top_binning_dir,
         bin_dir = join(top_binning_dir, "{name}"),
-        bin_summary_dir = join(bin_dir, "summary"),
         bin_mem = mem2int(cluster['metawrap_binning'].get("mem", default_memory)),
         mw_trim_linker_R1 = join(top_trim_dir, "{name}", "{name}_1.fastq"),
         mw_trim_linker_R2 = join(top_trim_dir, "{name}", "{name}_2.fastq"),
@@ -395,8 +394,6 @@ rule metawrap_binning:
         -B {params.bin_dir}/maxbin2_bins \
         -c {params.min_perc_complete} \
         -x {params.max_perc_contam}
-
-
         """
diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk
index 44a94e2..cd79655 100644
--- a/workflow/rules/RNA.smk
+++ b/workflow/rules/RNA.smk
@@ -20,11 +20,6 @@ top_trim_dir_rna = join(workpath, config['project']['id'], "t
 metawrap_container = config["containers"]["metawrap"]
pairedness = list(range(1, config['project']['nends']+1)) -if rna_included: - -else: - start_r1_rna, start_r2_rna = [], [] - rule concat_rna_reads: input: @@ -71,8 +66,8 @@ rule concat_rna_reads: rule rna_read_qc: input: - R1 = join(workpath, "rna", "{rname}_R1.fastq.gz") - R2 = join(workpath, "rna", "{rname}_R2.fastq.gz") + R1 = join(workpath, "rna", "{rname}_R1.fastq.gz"), + R2 = join(workpath, "rna", "{rname}_R2.fastq.gz"), output: R1_pretrim_report = join(top_readqc_dir_rna, "{rname}", "{rname}_R1_pretrim_report.html"), R2_pretrim_report = join(top_readqc_dir_rna, "{rname}", "{rname}_R2_pretrim_report.html"), From c3870c5922de4c559b20c6eaad1b995e25ec38e6 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 28 Feb 2024 12:57:51 -0500 Subject: [PATCH 30/32] fix: PR requested changes --- config/cluster.json | 2 +- workflow/rules/DNA.smk | 11 ++++++++--- workflow/rules/RNA.smk | 11 ++++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/config/cluster.json b/config/cluster.json index 91bfdb7..45bce7a 100644 --- a/config/cluster.json +++ b/config/cluster.json @@ -38,7 +38,7 @@ "threads": 48, "mem": "128G", "partition": "norm", - "time": "10-00:00:00" + "time": "5-00:00:00" }, "metawrap_tax_classification": { "threads": 32, diff --git a/workflow/rules/DNA.smk b/workflow/rules/DNA.smk index 92e6901..a3fc2dd 100644 --- a/workflow/rules/DNA.smk +++ b/workflow/rules/DNA.smk @@ -138,12 +138,18 @@ rule metawrap_read_qc: sid = "{name}", this_qc_dir = join(top_readqc_dir, "{name}"), trim_out = join(top_trim_dir, "{name}"), - tmpr1 = lambda _, output, input: join(config['options']['tmp_dir'], str(basename(str(input.R1))).replace('_R1.', '_1.').replace('.gz', '')), - tmpr2 = lambda _, output, input: join(config['options']['tmp_dir'], str(basename(str(input.R2))).replace('_R2.', '_2.').replace('.gz', '')), + tmp_safe_dir = join(config['options']['tmp_dir'], 'read_qc'), + tmpr1 = lambda _, output, input: join(config['options']['tmp_dir'], 'read_qc', str(basename(str(input.R1))).replace('_R1.', '_1.').replace('.gz', '')), + tmpr2 = lambda _, output, input: join(config['options']['tmp_dir'], 'read_qc', str(basename(str(input.R2))).replace('_R2.', '_2.').replace('.gz', '')), containerized: metawrap_container, threads: int(cluster["metawrap_genome_assembly"].get('threads', default_threads)), shell: """ + # safe temp directory + if [ ! 
-d "{params.tmp_safe_dir}" ]; then mkdir -p "{params.tmp_safe_dir}"; fi + tmp=$(mktemp -d -p "{params.tmp_safe_dir}") + trap 'rm -rf "{params.tmp_safe_dir}"' EXIT + # uncompress to lscratch rone="{input.R1}" ext=$(echo "${{rone: -2}}" | tr '[:upper:]' '[:lower:]') @@ -152,7 +158,6 @@ rule metawrap_read_qc: else ln -s {input.R1} {params.tmpr1} fi; - rtwo="{input.R2}" ext=$(echo "${{rtwo: -2}}" | tr '[:upper:]' '[:lower:]') if [[ "$ext" == "gz" ]]; then diff --git a/workflow/rules/RNA.smk b/workflow/rules/RNA.smk index cd79655..67a7475 100644 --- a/workflow/rules/RNA.smk +++ b/workflow/rules/RNA.smk @@ -82,11 +82,17 @@ rule rna_read_qc: sid = "{rname}", this_qc_dir = join(top_readqc_dir_rna, "{rname}"), trim_out = join(top_trim_dir_rna, "{rname}"), - tmpr1 = lambda _, output, input: join(config['options']['tmp_dir'], str(basename(str(input.R1))).replace('_R1.', '_1.').replace('.gz', '')), - tmpr2 = lambda _, output, input: join(config['options']['tmp_dir'], str(basename(str(input.R2))).replace('_R2.', '_2.').replace('.gz', '')), + tmp_safe_dir = join(config['options']['tmp_dir'], 'read_qc'), + tmpr1 = lambda _, output, input: join(config['options']['tmp_dir'], 'read_qc', str(basename(str(input.R1))).replace('_R1.', '_1.').replace('.gz', '')), + tmpr2 = lambda _, output, input: join(config['options']['tmp_dir'], 'read_qc', str(basename(str(input.R2))).replace('_R2.', '_2.').replace('.gz', '')), containerized: metawrap_container, shell: """ + # safe temp directory + if [ ! -d "{params.tmp_safe_dir}" ]; then mkdir -p "{params.tmp_safe_dir}"; fi + tmp=$(mktemp -d -p "{params.tmp_safe_dir}") + trap 'rm -rf "{params.tmp_safe_dir}"' EXIT + # uncompress to lscratch rone="{input.R1}" ext=$(echo "${{rone: -2}}" | tr '[:upper:]' '[:lower:]') @@ -95,7 +101,6 @@ rule rna_read_qc: else ln -s {input.R1} {params.tmpr1} fi; - rtwo="{input.R2}" ext=$(echo "${{rtwo: -2}}" | tr '[:upper:]' '[:lower:]') if [[ "$ext" == "gz" ]]; then From 479c82ef5315d66e2246b2cc849123877b84d21d Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 28 Feb 2024 14:29:41 -0500 Subject: [PATCH 31/32] fix: remove expand from binning step --- workflow/rules/DNA.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/DNA.smk b/workflow/rules/DNA.smk index a3fc2dd..3f0fb64 100644 --- a/workflow/rules/DNA.smk +++ b/workflow/rules/DNA.smk @@ -344,8 +344,8 @@ rule metawrap_binning: """ input: - R1 = expand(join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq"), name=samples), - R2 = expand(join(top_trim_dir, "{name}", "{name}_R2_trimmed.fastq"), name=samples), + R1 = join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq") + R2 = join(top_trim_dir, "{name}", "{name}_R2_trimmed.fastq") assembly = join(top_assembly_dir, "{name}", "final_assembly.fasta"), output: maxbin_bins = directory(join(top_binning_dir, "{name}", "maxbin2_bins")), From 181909879962c85ddbda95deb14fd46cbae7b800 Mon Sep 17 00:00:00 2001 From: Ryan Routsong Date: Wed, 28 Feb 2024 14:31:22 -0500 Subject: [PATCH 32/32] fix: commas --- workflow/rules/DNA.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/rules/DNA.smk b/workflow/rules/DNA.smk index 3f0fb64..f485e25 100644 --- a/workflow/rules/DNA.smk +++ b/workflow/rules/DNA.smk @@ -344,8 +344,8 @@ rule metawrap_binning: """ input: - R1 = join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq") - R2 = join(top_trim_dir, "{name}", "{name}_R2_trimmed.fastq") + R1 = join(top_trim_dir, "{name}", "{name}_R1_trimmed.fastq"), + R2 = join(top_trim_dir, "{name}", 
"{name}_R2_trimmed.fastq"), assembly = join(top_assembly_dir, "{name}", "final_assembly.fasta"), output: maxbin_bins = directory(join(top_binning_dir, "{name}", "maxbin2_bins")),