diff --git a/.travis.yml b/.travis.yml index b3e7a99..da79ee6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,13 +8,12 @@ matrix: fast_finish: true before_install: - # PRs to master are only ok if coming from dev branch - - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])' + - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && ([ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ] || [ $TRAVIS_PULL_REQUEST_BRANCH = "patch" ]))' # Pull the docker image first so the test doesn't wait for this - docker pull nfcore/hic:dev # Fake the tag locally so that the pipeline runs properly # Looks weird when this is :dev to :dev, but makes sense when testing code for a release (:dev to :1.0.1) - - docker tag nfcore/hic:dev nfcore/hic:1.0.0 + - docker tag nfcore/hic:dev nfcore/hic:1.1.0 install: # Install Nextflow @@ -30,7 +29,7 @@ install: - sudo apt-get install npm && npm install -g markdownlint-cli env: - - NXF_VER='0.32.0' # Specify a minimum NF version that should be tested and work + - NXF_VER='19.04.0' # Specify a minimum NF version that should be tested and work - NXF_VER='' # Plus: get the latest NF version and check that it works script: diff --git a/CHANGELOG.md b/CHANGELOG.md index 08f3ef0..aac5146 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,14 +1,26 @@ # nf-core/hic: Changelog +## v1.1.0 - 2019-10-15 + +* Support 'N' base motif in restriction/ligation sites +* Support multiple restriction enzymes/ligation sites (comma separated) ([#31](https://github.com/nf-core/hic/issues/31)) +* Add --saveInteractionBAM option +* Add DOI ([#29](https://github.com/nf-core/hic/issues/29)) +* Fix bug for reads extension _1/_2 ([#30](https://github.com/nf-core/hic/issues/30)) +* Update manual ([#28](https://github.com/nf-core/hic/issues/28)) + ## v1.0 - 2019-05-06 -First version of nf-core Hi-C 
pipeline which is a Nextflow implementation of the [HiC-Pro pipeline](https://github.com/nservant/HiC-Pro/). +First version of nf-core Hi-C pipeline which is a Nextflow implementation of +the [HiC-Pro pipeline](https://github.com/nservant/HiC-Pro/). Note that all HiC-Pro functionalities are not yet all implemented. -The current version supports most protocols including Hi-C, in situ Hi-C, DNase Hi-C, Micro-C, capture-C or HiChip data. +The current version supports most protocols including Hi-C, in situ Hi-C, +DNase Hi-C, Micro-C, capture-C or HiChip data. In summary, this version allows : -* Automatic detection and generation of annotation files based on igenomes if not provided. +* Automatic detection and generation of annotation files based on igenomes +if not provided. * Two-steps alignment of raw sequencing reads * Reads filtering and detection of valid interaction products * Generation of raw contact matrices for a set of resolutions diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 09226d0..a977481 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -2,11 +2,17 @@ ## Our Pledge -In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project +and our community a harassment-free experience for everyone, regardless of +age, body size, disability, ethnicity, gender identity and expression, level +of experience, nationality, personal appearance, race, religion, or sexual +identity and orientation. 
## Our Standards -Examples of behavior that contributes to creating a positive environment include: +Examples of behavior that contributes to creating a positive environment +include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences @@ -16,31 +22,55 @@ Examples of behavior that contributes to creating a positive environment include Examples of unacceptable behavior by participants include: -* The use of sexualized language or imagery and unwelcome sexual attention or advances +* The use of sexualized language or imagery and unwelcome sexual attention +or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment -* Publishing others' private information, such as a physical or electronic address, without explicit permission -* Other conduct which could reasonably be considered inappropriate in a professional setting +* Publishing others' private information, such as a physical or electronic +address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting ## Our Responsibilities -Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. -Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 
+Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. ## Scope -This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an +appointed representative at an online or offline event. Representation of a +project may be further defined and clarified by project maintainers. ## Enforcement -Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team on [Slack](https://nf-core-invite.herokuapp.com/). The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team on +[Slack](https://nf-core-invite.herokuapp.com/). 
The project team will review +and investigate all complaints, and will respond in a way that it deems +appropriate to the circumstances. The project team is obligated to maintain +confidentiality with regard to the reporter of an incident. Further details +of specific enforcement policies may be posted separately. -Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. ## Attribution -This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 1.4, available at +[http://contributor-covenant.org/version/1/4][version] [homepage]: http://contributor-covenant.org [version]: http://contributor-covenant.org/version/1/4/ diff --git a/Dockerfile b/Dockerfile index 06374cf..8b6ee9b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM nfcore/base +FROM nfcore/base:1.7 LABEL authors="Nicolas Servant" \ description="Docker image containing all requirements for nf-core/hic pipeline" @@ -7,4 +7,4 @@ RUN apt-get update && apt-get install -y gcc g++ && apt-get clean -y COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/nf-core-hic-1.0.0/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-hic-1.1.0/bin:$PATH diff --git a/README.md b/README.md index 37692cf..c2ae60f 100644 --- a/README.md +++ b/README.md @@ -3,23 +3,35 @@ **Analysis of Chromosome Conformation Capture data (Hi-C)**. 
[![Build Status](https://travis-ci.com/nf-core/hic.svg?branch=master)](https://travis-ci.com/nf-core/hic) -[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.32.0-brightgreen.svg)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A519.04.0-brightgreen.svg)](https://www.nextflow.io/) [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/) [![Docker](https://img.shields.io/docker/automated/nfcore/hic.svg)](https://hub.docker.com/r/nfcore/hic) -![Singularity Container available]( -https://img.shields.io/badge/singularity-available-7E4C74.svg) +![Singularity Container available](https://img.shields.io/badge/singularity-available-7E4C74.svg) -### Introduction -This pipeline is based on the [HiC-Pro workflow](https://github.com/nservant/HiC-Pro). -It was designed to process Hi-C data from raw fastq files (paired-end Illumina data) to normalized contact maps. -The current version supports most protocols, including digestion protocols as well as protocols that do not require restriction enzymes such as DNase Hi-C. -In practice, this workflow was successfully applied to many data-sets including dilution Hi-C, in situ Hi-C, DNase Hi-C, Micro-C, capture-C, capture Hi-C or HiChip data. +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.2669513.svg)](https://doi.org/10.5281/zenodo.2669513) -The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker / singularity containers making installation trivial and results highly reproducible. +## Introduction -### Pipeline summary -1. Mapping using a two steps strategy to rescue reads spanning the ligation sites (bowtie2) +This pipeline is based on the +[HiC-Pro workflow](https://github.com/nservant/HiC-Pro). 
+It was designed to process Hi-C data from raw fastq files (paired-end Illumina +data) to normalized contact maps. +The current version supports most protocols, including digestion protocols as +well as protocols that do not require restriction enzymes such as DNase Hi-C. +In practice, this workflow was successfully applied to many data-sets including +dilution Hi-C, in situ Hi-C, DNase Hi-C, Micro-C, capture-C, capture Hi-C or +HiChip data. + +The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool +to run tasks across multiple compute infrastructures in a very portable manner. +It comes with docker / singularity containers making installation trivial and +results highly reproducible. + +## Pipeline summary + +1. Mapping using a two steps strategy to rescue reads spanning the ligation +sites (bowtie2) 2. Detection of valid interaction products 3. Duplicates removal 4. Create genome-wide contact maps at various resolution @@ -27,17 +39,63 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool 6. Quality controls and report (MultiQC) 7. Addition export for visualisation and downstream analysis (cooler) -### Documentation -The nf-core/hic pipeline comes with documentation about the pipeline, found in the `docs/` directory: +## Quick Start + +i. Install [`nextflow`](https://nf-co.re/usage/installation) + +ii. Install one of [`docker`](https://docs.docker.com/engine/installation/), +[`singularity`](https://www.sylabs.io/guides/3.0/user-guide/) or +[`conda`](https://conda.io/miniconda.html) + +iii. Download the pipeline and test it on a minimal dataset with a single command + +```bash +nextflow run hic -profile test,<docker/singularity/conda> +``` + +iv. Start running your own analysis! + +```bash +nextflow run hic -profile <docker/singularity/conda> --reads '*_R{1,2}.fastq.gz' --genome GRCh37 +``` + +See [usage docs](docs/usage.md) for all of the available options when running the pipeline. -1. 
[Installation](docs/installation.md) +## Documentation + +The nf-core/hic pipeline comes with documentation about the pipeline, found in +the `docs/` directory: + +1. [Installation](https://nf-co.re/usage/installation) 2. Pipeline configuration - * [Local installation](docs/configuration/local.md) - * [Adding your own system](docs/configuration/adding_your_own.md) - * [Reference genomes](docs/configuration/reference_genomes.md) + * [Local installation](https://nf-co.re/usage/local_installation) + * [Adding your own system config](https://nf-co.re/usage/adding_own_config) + * [Reference genomes](https://nf-co.re/usage/reference_genomes) 3. [Running the pipeline](docs/usage.md) 4. [Output and how to interpret the results](docs/output.md) -5. [Troubleshooting](docs/troubleshooting.md) +5. [Troubleshooting](https://nf-co.re/usage/troubleshooting) + +## Contributions and Support + +If you would like to contribute to this pipeline, please see the +[contributing guidelines](.github/CONTRIBUTING.md). + +For further information or help, don't hesitate to get in touch on +[Slack](https://nfcore.slack.com/channels/hic). +You can join with [this invite](https://nf-co.re/join/slack). + + +## Credits -### Credits nf-core/hic was originally written by Nicolas Servant. + +## Citation + +If you use nf-core/hic for your analysis, please cite it using the following +doi: [10.5281/zenodo.2669513](https://doi.org/10.5281/zenodo.2669513) + +You can cite the `nf-core` pre-print as follows: +Ewels PA, Peltzer A, Fillinger S, Alneberg JA, Patel H, Wilm A, Garcia MU, Di +Tommaso P, Nahnsen S. **nf-core: Community curated bioinformatics pipelines**. +*bioRxiv*. 2019. p. 610741. +[doi: 10.1101/610741](https://www.biorxiv.org/content/10.1101/610741v1). 
diff --git a/bin/digest_genome.py b/bin/digest_genome.py index db2d151..ac6d8da 100755 --- a/bin/digest_genome.py +++ b/bin/digest_genome.py @@ -47,6 +47,7 @@ def find_re_sites(filename, sequences, offset): indices.sort() all_indices.append(indices) indices = [] + # This is a new chromosome. Empty the sequence string, and add the # correct chrom id big_str = "" @@ -67,6 +68,7 @@ def find_re_sites(filename, sequences, offset): for m in re.finditer(pattern, big_str)] indices.sort() all_indices.append(indices) + return contig_names, all_indices @@ -87,6 +89,22 @@ def find_chromsomose_lengths(reference_filename): return chromosome_names, np.array(chromosome_lengths) +def replaceN(cs): + npos = int(cs.find('N')) + cseql = [] + if npos!= -1: + for nuc in ["A","C","G","T"]: + tmp = cs.replace('N', nuc, 1) + tmpl = replaceN(tmp) + if type(tmpl)==list: + cseql = cseql + tmpl + else: + cseql.append(tmpl) + else: + cseql.append(cs) + return cseql + + if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('fastafile') @@ -102,8 +120,13 @@ def find_chromsomose_lengths(reference_filename): filename = args.fastafile out = args.out - cutsites = args.res_sites - + + # Split restriction sites if comma-separated + cutsites=[] + for s in args.res_sites: + for m in s.split(','): + cutsites.append(m) + # process args and get restriction enzyme sequences sequences = [] offset = [] @@ -112,15 +135,34 @@ def find_chromsomose_lengths(reference_filename): cseq = ''.join(RE_cutsite[cs.lower()]) else: cseq = cs + offpos = int(cseq.find('^')) if offpos == -1: print "Unable to detect offset for", cseq print "Please, use '^' to specified the cutting position,", print "i.e A^GATCT for HindIII digestion" sys.exit(-1) + + for nuc in list(set(cs)): + if nuc != 'A' and nuc != 'C' and nuc != 'G' and nuc != 'T' and nuc != 'N' and nuc != '^': + print "Find unexpected character ['",nuc,"']in restriction motif" + print "Note that multiple motifs should be separated by a space 
(not a comma !)" + sys.exit(-1) + offset.append(offpos) sequences.append(re.sub('\^', '', cseq)) + # replace all N in restriction motif + sequences_without_N = [] + offset_without_N = [] + for rs in range(len(sequences)): + nrs = replaceN(sequences[rs]) + sequences_without_N = sequences_without_N + nrs + offset_without_N = offset_without_N + [offset[rs]] * len(nrs) + + sequences = sequences_without_N + offset = offset_without_N + if out is None: out = os.path.splitext(filename)[0] + "_fragments.bed" @@ -129,8 +171,7 @@ def find_chromsomose_lengths(reference_filename): print "Offset(s)", ','.join(str(x) for x in offset) # Read fasta file and look for rs per chromosome - contig_names, all_indices = find_re_sites(filename, sequences, - offset=offset) + contig_names, all_indices = find_re_sites(filename, sequences, offset=offset) _, lengths = find_chromsomose_lengths(filename) valid_fragments = [] diff --git a/bin/mapped_2hic_fragments.py b/bin/mapped_2hic_fragments.py index efa32e6..d4790ee 100755 --- a/bin/mapped_2hic_fragments.py +++ b/bin/mapped_2hic_fragments.py @@ -53,7 +53,7 @@ def get_args(): "minInsertSize=", "maxInsertSize", "minFragSize", "maxFragSize", "minDist", - "gatg", "samOut", "verbose", "all", "help"]) + "gatg", "sam", "verbose", "all", "help"]) except getopt.GetoptError: usage() sys.exit(-1) @@ -442,7 +442,7 @@ def get_interaction_type(read1, read1_chrom, resfrag1, read2, # If returned InteractionType=None -> Same restriction fragment # and same strand = Dump interactionType = None - + if not read1.is_unmapped and not read2.is_unmapped and resfrag1 is not None and resfrag2 is not None: # same restriction fragment if resfrag1 == resfrag2: @@ -501,9 +501,9 @@ def get_read_tag(read, tag): minInsertSize = arg elif opt in ("-l", "--longestInsertSize"): maxInsertSize = arg - elif opt in ("-t", "--shortestFragmentSize"): + elif opt in ("-t", "--shortestFragmentLength"): minFragSize = arg - elif opt in ("-m", "--longestFragmentSize"): + elif opt in ("-m", 
"--longestFragmentLength"): maxFragSize = arg elif opt in ("-d", "--minCisDist"): minDist = arg @@ -613,6 +613,7 @@ def get_read_tag(read, tag): r2_chrom = None if r1_resfrag is not None or r2_resfrag is not None: + interactionType = get_interaction_type(r1, r1_chrom, r1_resfrag, r2, r2_chrom, r2_resfrag, verbose) dist = get_PE_fragment_size(r1, r2, r1_resfrag, r2_resfrag, interactionType) cdist = get_cis_dist(r1, r2) @@ -724,10 +725,14 @@ def get_read_tag(read, tag): if or1_resfrag is not None: or1_fragname = or1_resfrag.value['name'] - + else: + or1_fragname = 'None' + if or2_resfrag is not None: or2_fragname = or2_resfrag.value['name'] - + else: + or2_fragname = 'None' + cur_handler.write( or1.qname + "\t" + or1_chrom + "\t" + diff --git a/conf/curie.config b/conf/curie.config deleted file mode 100644 index ab85a2d..0000000 --- a/conf/curie.config +++ /dev/null @@ -1,16 +0,0 @@ -singularity { - enabled = false -} - -process { - executor = 'pbs' - queue = params.queue - //beforeScript = 'export PATH=/bioinfo/pipelines/sandbox/dev/nfcore/rnaseq/modules/conda/envs/nf-core-rnaseq-1.2/bin:$PATH' -} - -params { - clusterOptions = false - max_memory = 128.GB - max_cpus = 4 - max_time = 240.h -} diff --git a/conf/hicpro.config b/conf/hicpro.config index 0a2c9b9..01b755a 100644 --- a/conf/hicpro.config +++ b/conf/hicpro.config @@ -38,5 +38,6 @@ params { saveReference = false saveAlignedIntermediates = false + saveInteractionBAM = false } diff --git a/conf/multiqc_config.yaml b/conf/multiqc_config.yaml deleted file mode 100644 index f2a738c..0000000 --- a/conf/multiqc_config.yaml +++ /dev/null @@ -1,7 +0,0 @@ -report_comment: > - This report has been generated by the nf-core/hic - analysis pipeline. For information about how to interpret these results, please see the - documentation. 
-report_section_order: - nf-core/hic-software-versions: - order: -1000 diff --git a/conf/test.config b/conf/test.config index 592e3a4..00c47f8 100644 --- a/conf/test.config +++ b/conf/test.config @@ -29,5 +29,5 @@ params { min_mapq = 0 // Options - skip_cool = true + skipCool = true } diff --git a/docs/README.md b/docs/README.md index d7dbdac..e160867 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,11 +2,11 @@ The nf-core/hic documentation is split into the following files: -1. [Installation](installation.md) +1. [Installation](https://nf-co.re/usage/installation) 2. Pipeline configuration - * [Local installation](configuration/local.md) - * [Adding your own system](configuration/adding_your_own.md) - * [Reference genomes](configuration/reference_genomes.md) + * [Local installation](https://nf-co.re/usage/local_installation) + * [Adding your own system config](https://nf-co.re/usage/adding_own_config) + * [Reference genomes](https://nf-co.re/usage/reference_genomes) 3. [Running the pipeline](usage.md) 4. [Output and how to interpret the results](output.md) -5. [Troubleshooting](troubleshooting.md) +5. [Troubleshooting](https://nf-co.re/usage/troubleshooting) diff --git a/docs/configuration/adding_your_own.md b/docs/configuration/adding_your_own.md deleted file mode 100644 index bf7f808..0000000 --- a/docs/configuration/adding_your_own.md +++ /dev/null @@ -1,86 +0,0 @@ -# nf-core/hic: Configuration for other clusters - -It is entirely possible to run this pipeline on other clusters, though you will need to set up your own config file so that the pipeline knows how to work with your cluster. - -> If you think that there are other people using the pipeline who would benefit from your configuration (eg. other common cluster setups), please let us know. We can add a new configuration and profile which can used by specifying `-profile ` when running the pipeline. 
The config file will then be hosted at `nf-core/configs` and will be pulled automatically before the pipeline is executed. - -If you are the only person to be running this pipeline, you can create your config file as `~/.nextflow/config` and it will be applied every time you run Nextflow. Alternatively, save the file anywhere and reference it when running the pipeline with `-c path/to/config` (see the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more). - -A basic configuration comes with the pipeline, which loads the [`conf/base.config`](../../conf/base.config) by default. This means that you only need to configure the specifics for your system and overwrite any defaults that you want to change. - -## Cluster Environment -By default, pipeline uses the `local` Nextflow executor - in other words, all jobs are run in the login session. If you're using a simple server, this may be fine. If you're using a compute cluster, this is bad as all jobs will run on the head node. - -To specify your cluster environment, add the following line to your config file: - -```nextflow -process.executor = 'YOUR_SYSTEM_TYPE' -``` - -Many different cluster types are supported by Nextflow. For more information, please see the [Nextflow documentation](https://www.nextflow.io/docs/latest/executor.html). - -Note that you may need to specify cluster options, such as a project or queue. To do so, use the `clusterOptions` config option: - -```nextflow -process { - executor = 'SLURM' - clusterOptions = '-A myproject' -} -``` - - -## Software Requirements -To run the pipeline, several software packages are required. How you satisfy these requirements is essentially up to you and depends on your system. If possible, we _highly_ recommend using either Docker or Singularity. - -Please see the [`installation documentation`](../installation.md) for how to run using the below as a one-off. These instructions are about configuring a config file for repeated use. 
- -### Docker -Docker is a great way to run nf-core/hic, as it manages all software installations and allows the pipeline to be run in an identical software environment across a range of systems. - -Nextflow has [excellent integration](https://www.nextflow.io/docs/latest/docker.html) with Docker, and beyond installing the two tools, not much else is required - nextflow will automatically fetch the [nfcore/hic](https://hub.docker.com/r/nfcore/hic/) image that we have created and is hosted at dockerhub at run time. - -To add docker support to your own config file, add the following: - -```nextflow -docker.enabled = true -process.container = "nfcore/hic" -``` - -Note that the dockerhub organisation name annoyingly can't have a hyphen, so is `nfcore` and not `nf-core`. - - -### Singularity image -Many HPC environments are not able to run Docker due to security issues. -[Singularity](http://singularity.lbl.gov/) is a tool designed to run on such HPC systems which is very similar to Docker. - -To specify singularity usage in your pipeline config file, add the following: - -```nextflow -singularity.enabled = true -process.container = "shub://nf-core/hic" -``` - -If you intend to run the pipeline offline, nextflow will not be able to automatically download the singularity image for you. -Instead, you'll have to do this yourself manually first, transfer the image file and then point to that. - -First, pull the image file where you have an internet connection: - -```bash -singularity pull --name nf-core-hic.simg shub://nf-core/hic -``` - -Then transfer this file and point the config file to the image: - -```nextflow -singularity.enabled = true -process.container = "/path/to/nf-core-hic.simg" -``` - - -### Conda -If you're not able to use Docker or Singularity, you can instead use conda to manage the software requirements. 
-To use conda in your own config file, add the following: - -```nextflow -process.conda = "$baseDir/environment.yml" -``` diff --git a/docs/configuration/local.md b/docs/configuration/local.md deleted file mode 100644 index d4530fa..0000000 --- a/docs/configuration/local.md +++ /dev/null @@ -1,47 +0,0 @@ -# nf-core/hic: Local Configuration - -If running the pipeline in a local environment, we highly recommend using either Docker or Singularity. - -## Docker -Docker is a great way to run `nf-core/hic`, as it manages all software installations and allows the pipeline to be run in an identical software environment across a range of systems. - -Nextflow has [excellent integration](https://www.nextflow.io/docs/latest/docker.html) with Docker, and beyond installing the two tools, not much else is required. The `nf-core/hic` profile comes with a configuration profile for docker, making it very easy to use. This also comes with the required presets to use the AWS iGenomes resource, meaning that if using common reference genomes you just specify the reference ID and it will be automatically downloaded from AWS S3. - -First, install docker on your system: [Docker Installation Instructions](https://docs.docker.com/engine/installation/) - -Then, simply run the analysis pipeline: - -```bash -nextflow run nf-core/hic -profile docker --genome '' -``` - -Nextflow will recognise `nf-core/hic` and download the pipeline from GitHub. The `-profile docker` configuration lists the [nf-core/hic](https://hub.docker.com/r/nfcore/hic/) image that we have created and is hosted at dockerhub, and this is downloaded. - -For more information about how to work with reference genomes, see [`docs/configuration/reference_genomes.md`](reference_genomes.md). - -### Pipeline versions -The public docker images are tagged with the same version numbers as the code, which you can use to ensure reproducibility. When running the pipeline, specify the pipeline version with `-r`, for example `-r 1.0`. 
This uses pipeline code and docker image from this tagged version. - - -## Singularity image -Many HPC environments are not able to run Docker due to security issues. [Singularity](http://singularity.lbl.gov/) is a tool designed to run on such HPC systems which is very similar to Docker. Even better, it can use create images directly from dockerhub. - -To use the singularity image for a single run, use `-with-singularity`. This will download the docker container from dockerhub and create a singularity image for you dynamically. - -If you intend to run the pipeline offline, nextflow will not be able to automatically download the singularity image for you. Instead, you'll have to do this yourself manually first, transfer the image file and then point to that. - -First, pull the image file where you have an internet connection: - -> NB: The "tag" at the end of this command corresponds to the pipeline version. -> Here, we're pulling the docker image for version 1.0 of the nf-core/hic pipeline -> Make sure that this tag corresponds to the version of the pipeline that you're using - -```bash -singularity pull --name nf-core-hic-1.0.img docker://nf-core/hic:1.0 -``` - -Then transfer this file and run the pipeline with this path: - -```bash -nextflow run /path/to/nf-core-hic -with-singularity /path/to/nf-core-hic-1.0.img -``` diff --git a/docs/configuration/reference_genomes.md b/docs/configuration/reference_genomes.md deleted file mode 100644 index c52faf8..0000000 --- a/docs/configuration/reference_genomes.md +++ /dev/null @@ -1,50 +0,0 @@ -# nf-core/hic: Reference Genomes Configuration - -The nf-core/hic pipeline needs a reference genome for alignment and annotation. - -These paths can be supplied on the command line at run time (see the [usage docs](../usage.md)), -but for convenience it's often better to save these paths in a nextflow config file. -See below for instructions on how to do this. 
-Read [Adding your own system](adding_your_own.md) to find out how to set up custom config files. - -## Adding paths to a config file -Specifying long paths every time you run the pipeline is a pain. -To make this easier, the pipeline comes configured to understand reference genome keywords which correspond to preconfigured paths, meaning that you can just specify `--genome ID` when running the pipeline. - -Note that this genome key can also be specified in a config file if you always use the same genome. - -To use this system, add paths to your config file using the following template: - -```nextflow -params { - genomes { - 'YOUR-ID' { - fasta = '/genome.fa' - } - 'OTHER-GENOME' { - // [..] - } - } - // Optional - default genome. Ignored if --genome 'OTHER-GENOME' specified on command line - genome = 'YOUR-ID' -} -``` - -You can add as many genomes as you like as long as they have unique IDs. - -## illumina iGenomes -To make the use of reference genomes easier, illumina has developed a centralised resource called [iGenomes](https://support.illumina.com/sequencing/sequencing_software/igenome.html). -Multiple reference index types are held together with consistent structure for multiple genomes. - -We have put a copy of iGenomes up onto AWS S3 hosting and this pipeline is configured to use this by default. -The hosting fees for AWS iGenomes are currently kindly funded by a grant from Amazon. -The pipeline will automatically download the required reference files when you run the pipeline. -For more information about the AWS iGenomes, see [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) - -Downloading the files takes time and bandwidth, so we recommend making a local copy of the iGenomes resource. -Once downloaded, you can customise the variable `params.igenomes_base` in your custom configuration file to point to the reference location. 
-For example: - -```nextflow -params.igenomes_base = '/path/to/data/igenomes/' -``` diff --git a/docs/installation.md b/docs/installation.md deleted file mode 100644 index 9ac66d5..0000000 --- a/docs/installation.md +++ /dev/null @@ -1,110 +0,0 @@ -# nf-core/hic: Installation - -To start using the nf-core/hic pipeline, follow the steps below: - -1. [Install Nextflow](#1-install-nextflow) -2. [Install the pipeline](#2-install-the-pipeline) - * [Automatic](#21-automatic) - * [Offline](#22-offline) - * [Development](#23-development) -3. [Pipeline configuration](#3-pipeline-configuration) - * [Software deps: Docker and Singularity](#31-software-deps-docker-and-singularity) - * [Software deps: Bioconda](#32-software-deps-bioconda) - * [Configuration profiles](#33-configuration-profiles) -4. [Reference genomes](#4-reference-genomes) - -## 1) Install NextFlow -Nextflow runs on most POSIX systems (Linux, Mac OSX etc). It can be installed by running the following commands: - -```bash -# Make sure that Java v8+ is installed: -java -version - -# Install Nextflow -curl -fsSL get.nextflow.io | bash - -# Add Nextflow binary to your PATH: -mv nextflow ~/bin/ -# OR system-wide installation: -# sudo mv nextflow /usr/local/bin -``` - -See [nextflow.io](https://www.nextflow.io/) for further instructions on how to install and configure Nextflow. - -## 2) Install the pipeline - -#### 2.1) Automatic -This pipeline itself needs no installation - NextFlow will automatically fetch it from GitHub if `nf-core/hic` is specified as the pipeline name. - -#### 2.2) Offline -The above method requires an internet connection so that Nextflow can download the pipeline files. 
If you're running on a system that has no internet connection, you'll need to download and transfer the pipeline files manually: - -```bash -wget https://github.com/nf-core/hic/archive/master.zip -mkdir -p ~/my-pipelines/nf-core/ -unzip master.zip -d ~/my-pipelines/nf-core/ -cd ~/my_data/ -nextflow run ~/my-pipelines/nf-core/hic-master -``` - -To stop nextflow from looking for updates online, you can tell it to run in offline mode by specifying the following environment variable in your ~/.bashrc file: - -```bash -export NXF_OFFLINE='TRUE' -``` - -#### 2.3) Development - -If you would like to make changes to the pipeline, it's best to make a fork on GitHub and then clone the files. Once cloned you can run the pipeline directly as above. - - -## 3) Pipeline configuration -By default, the pipeline loads a basic server configuration [`conf/base.config`](../conf/base.config) -This uses a number of sensible defaults for process requirements and is suitable for running -on a simple (if powerful!) local server. - -Be warned of two important points about this default configuration: - -1. The default profile uses the `local` executor - * All jobs are run in the login session. If you're using a simple server, this may be fine. If you're using a compute cluster, this is bad as all jobs will run on the head node. - * See the [nextflow docs](https://www.nextflow.io/docs/latest/executor.html) for information about running with other hardware backends. Most job scheduler systems are natively supported. -2. Nextflow will expect all software to be installed and available on the `PATH` - * It's expected to use an additional config profile for docker, singularity or conda support. See below. - -#### 3.1) Software deps: Docker -First, install docker on your system: [Docker Installation Instructions](https://docs.docker.com/engine/installation/) - -Then, running the pipeline with the option `-profile docker` tells Nextflow to enable Docker for this run. 
An image containing all of the software requirements will be automatically fetched and used from [dockerhub](https://hub.docker.com/r/nfcore/hic). - -#### 3.1) Software deps: Singularity -If you're not able to use Docker then [Singularity](http://singularity.lbl.gov/) is a great alternative. -The process is very similar: running the pipeline with the option `-profile singularity` tells Nextflow to enable singularity for this run. An image containing all of the software requirements will be automatically fetched and used from singularity hub. - -If running offline with Singularity, you'll need to download and transfer the Singularity image first: - -```bash -singularity pull --name nf-core-hic.simg shub://nf-core/hic -``` - -Once transferred, use `-with-singularity` and specify the path to the image file: - -```bash -nextflow run /path/to/nf-core-hic -with-singularity nf-core-hic.simg -``` - -Remember to pull updated versions of the singularity image if you update the pipeline. - - -#### 3.2) Software deps: conda -If you're not able to use Docker _or_ Singularity, you can instead use conda to manage the software requirements. -This is slower and less reproducible than the above, but is still better than having to install all requirements yourself! -The pipeline ships with a conda environment file and nextflow has built-in support for this. -To use it first ensure that you have conda installed (we recommend [miniconda](https://conda.io/miniconda.html)), then follow the same pattern as above and use the flag `-profile conda` - -#### 3.3) Configuration profiles - -See [`docs/configuration/adding_your_own.md`](configuration/adding_your_own.md) - -## 4) Reference genomes - -See [`docs/configuration/reference_genomes.md`](configuration/reference_genomes.md) diff --git a/docs/output.md b/docs/output.md index 53c9c0c..a83d0da 100644 --- a/docs/output.md +++ b/docs/output.md @@ -1,8 +1,11 @@ # nf-core/hic: Output -This document describes the output produced by the pipeline. 
Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +This document describes the output produced by the pipeline. Most of the plots +are taken from the MultiQC report, which summarises results at the end of the +pipeline. ## Pipeline overview + The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: @@ -10,27 +13,38 @@ and processes data using the following steps: * [Valid pairs detection](#valid-pairs-detection) * [Duplicates removal](#duplicates-removal) * [Contact maps](#contact-maps) -* [MultiQC](#multiqc) - aggregate report and quality controls, describing results of the whole pipeline -* [Export](#exprot) - additionnal export for compatibility with downstream analysis tool and visualization +* [MultiQC](#multiqc) - aggregate report and quality controls, describing +results of the whole pipeline +* [Export](#export) - additional export for compatibility with downstream +analysis tool and visualization -The current version is mainly based on the [HiC-Pro](https://github.com/nservant/HiC-Pro) pipeline. -For details about the workflow, see [Servant et al. 2015](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0831-x) +The current version is mainly based on the +[HiC-Pro](https://github.com/nservant/HiC-Pro) pipeline. +For details about the workflow, see +[Servant et al. 2015](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0831-x) ## Reads alignment -Using Hi-C data, each reads mate has to be independantly aligned on the reference genome. -The current workflow implements a two steps mapping strategy. First, the reads are aligned using an end-to-end aligner. -Second, reads spanning the ligation junction are trimmmed from their 3' end, and aligned back on the genome. -Aligned reads for both fragment mates are then paired in a single paired-end BAM file.
-Singletons are discarded, and multi-hits are filtered according to the configuration parameters (`--rm-multi`). -Note that if the `--dnase` mode is activated, HiC-Pro will skip the second mapping step. +Using Hi-C data, each reads mate has to be independently aligned on the +reference genome. +The current workflow implements a two steps mapping strategy. First, the reads +are aligned using an end-to-end aligner. +Second, reads spanning the ligation junction are trimmed from their 3' end, +and aligned back on the genome. +Aligned reads for both fragment mates are then paired in a single paired-end +BAM file. +Singletons are discarded, and multi-hits are filtered according to the +configuration parameters (`--rm-multi`). +Note that if the `--dnase` mode is activated, HiC-Pro will skip the second +mapping step. **Output directory: `results/mapping`** * `*bwt2pairs.bam` - final BAM file with aligned paired data * `*.pairstat` - mapping statistics -if `--saveAlignedIntermediates` is specified, additional mapping file results are available ; +if `--saveAlignedIntermediates` is specified, additional mapping file results +are available ; * `*.bam` - Aligned reads (R1 and R2) from end-to-end alignment * `*_unmap.fastq` - Unmapped reads after end-to-end alignment @@ -39,68 +53,117 @@ if `--saveAlignedIntermediates` is specified, additional mapping file results ar * `*bwt2merged.bam` - merged BAM file after the two-steps alignment * `*.mapstat` - mapping statistics per read mate -Usually, a high fraction of reads is expected to be aligned on the genome (80-90%). Among them, we usually observed a few percent (around 10%) of step 2 aligned reads. Those reads are chimeric fragments for which we detect a ligation junction. An abnormal level of chimeric reads can reflect a ligation issue during the library preparation. -The fraction of singleton or multi-hits depends on the genome complexity and the fraction of unmapped reads.
The fraction of singleton is usually close to the sum of unmapped R1 and R2 reads, as it is unlikely that both mates from the same pair were unmapped. +Usually, a high fraction of reads is expected to be aligned on the genome +(80-90%). Among them, we usually observed a few percent (around 10%) of step 2 +aligned reads. Those reads are chimeric fragments for which we detect a +ligation junction. An abnormal level of chimeric reads can reflect a ligation +issue during the library preparation. +The fraction of singleton or multi-hits depends on the genome complexity and +the fraction of unmapped reads. The fraction of singleton is usually close to +the sum of unmapped R1 and R2 reads, as it is unlikely that both mates from the +same pair were unmapped. ## Valid pairs detection -Each aligned reads can be assigned to one restriction fragment according to the reference genome and the digestion protocol. +Each aligned reads can be assigned to one restriction fragment according to the +reference genome and the digestion protocol. Invalid pairs are classified as follow: -* Dangling end, i.e. unligated fragments (both reads mapped on the same restriction fragment) -* Self circles, i.e. fragments ligated on themselves (both reads mapped on the same restriction fragment in inverted orientation) -* Religation, i.e. ligation of juxtaposed fragments -* Filtered pairs, i.e. any pairs that do not match the filtering criteria on inserts size, restriction fragments size -* Dumped pairs, i.e. any pairs for which we were not able to reconstruct the ligation product. - -Only valid pairs involving two different restriction fragments are used to build the contact maps. -Duplicated valid pairs associated to PCR artefacts are discarded (see `--rm_dup`. -In case of Hi-C protocols that do not require a restriction enzyme such as DNase Hi-C or micro Hi-C, the assignment to a restriction is not possible (see `--dnase`). 
-Short range interactions that are likely to be spurious ligation products can thus be discarded using the `--min_cis_dist` parameter. +* Dangling end, i.e. unligated fragments (both reads mapped on the same +restriction fragment) +* Self circles, i.e. fragments ligated on themselves (both reads mapped on the +same restriction fragment in inverted orientation) +* Religation, i.e. ligation of juxtaposed fragments +* Filtered pairs, i.e. any pairs that do not match the filtering criteria on +inserts size, restriction fragments size +* Dumped pairs, i.e. any pairs for which we were not able to reconstruct the +ligation product. + +Only valid pairs involving two different restriction fragments are used to +build the contact maps. +Duplicated valid pairs associated to PCR artefacts are discarded +(see `--rm_dup`). + +In case of Hi-C protocols that do not require a restriction enzyme such as +DNase Hi-C or micro Hi-C, the assignment to a restriction is not possible +(see `--dnase`). +Short range interactions that are likely to be spurious ligation products +can thus be discarded using the `--min_cis_dist` parameter. 
* `*.validPairs` - List of valid ligation products +* `*.DEpairs` - List of dangling-end products +* `*.SCPairs` - List of self-circle products +* `*.REPairs` - List of religation products +* `*.FiltPairs` - List of filtered pairs * `*RSstat` - Statitics of number of read pairs falling in each category The validPairs are stored using a simple tab-delimited text format ; ```bash -read name / chr_reads1 / pos_reads1 / strand_reads1 / chr_reads2 / pos_reads2 / strand_reads2 / fragment_size / res frag name R1 / res frag R2 / mapping qual R1 / mapping qual R2 [/ allele_specific_tag] +read name / chr_reads1 / pos_reads1 / strand_reads1 / chr_reads2 / pos_reads2 / +strand_reads2 / fragment_size / res frag name R1 / res frag R2 / mapping qual R1 +/ mapping qual R2 [/ allele_specific_tag] ``` -The ligation efficiency can be assessed using the filtering of valid and invalid pairs. As the ligation is a random process, 25% of each valid ligation class is expected. In the same way, a high level of dangling-end or self-circle read pairs is associated with a low quality experiment, and reveals a problem during the digestion, fill-in or ligation steps. +The ligation efficiency can be assessed using the filtering of valid and +invalid pairs. As the ligation is a random process, 25% of each valid ligation +class is expected. In the same way, a high level of dangling-end or self-circle +read pairs is associated with a low quality experiment, and reveals a problem +during the digestion, fill-in or ligation steps. -In the context of Hi-C protocol without restriction enzyme, this analysis step is skipped. The aligned pairs are therefore directly used to generate the contact maps. A filter of the short range contact (typically <1kb) is recommanded as this pairs are likely to be self ligation products. +In the context of Hi-C protocol without restriction enzyme, this analysis step +is skipped. The aligned pairs are therefore directly used to generate the +contact maps. 
A filter of the short range contact (typically <1kb) is +recommended as these pairs are likely to be self ligation products. ## Duplicates removal Note that validPairs file are generated per reads chunck. -These files are then merged in the allValidPairs file, and duplicates are removed if the `--rm_dup` parameter is used. +These files are then merged in the allValidPairs file, and duplicates are +removed if the `--rm_dup` parameter is used. * `*allValidPairs` - combined valid pairs from all read chunks * `*mergestat` - statistics about duplicates removal and valid pairs information -Additional quality controls such as fragment size distribution can be extracted from the list of valid interaction products. -We usually expect to see a distribution centered around 300 pb which correspond to the paired-end insert size commonly used. -The fraction of dplicates is also presented. A high level of duplication indicates a poor molecular complexity and a potential PCR bias. -Finaly, an important metric is to look at the fraction of intra and inter-chromosomal interactions, as well as long range (>20kb) versus short range (<20kb) intra-chromosomal interactions. +Additional quality controls such as fragment size distribution can be extracted +from the list of valid interaction products. +We usually expect to see a distribution centered around 300 bp which corresponds +to the paired-end insert size commonly used. +The fraction of duplicates is also presented. A high level of duplication +indicates a poor molecular complexity and a potential PCR bias. +Finally, an important metric is to look at the fraction of intra and +inter-chromosomal interactions, as well as long range (>20kb) versus short +range (<20kb) intra-chromosomal interactions. ## Contact maps Intra et inter-chromosomal contact maps are build for all specified resolutions. -The genome is splitted into bins of equal size. Each valid interaction is associated with the genomic bins to generate the raw maps.
-In addition, Hi-C data can contain several sources of biases which has to be corrected. -The current workflow uses the [ìced](https://github.com/hiclib/iced) and [Varoquaux and Servant, 2018](http://joss.theoj.org/papers/10.21105/joss.01286) python package which proposes a fast implementation of the original ICE normalization algorithm (Imakaev et al. 2012), making the assumption of equal visibility of each fragment. +The genome is split into bins of equal size. Each valid interaction is +associated with the genomic bins to generate the raw maps. +In addition, Hi-C data can contain several sources of biases which have to be +corrected. +The current workflow uses the [iced](https://github.com/hiclib/iced) and +[Varoquaux and Servant, 2018](http://joss.theoj.org/papers/10.21105/joss.01286) +python package which proposes a fast implementation of the original ICE +normalization algorithm (Imakaev et al. 2012), making the assumption of equal +visibility of each fragment. * `*.matrix` - genome-wide contact maps * `*_iced.matrix` - genome-wide iced contact maps -The contact maps are generated for all specified resolution (see `--bin_size` argument) +The contact maps are generated for all specified resolutions +(see `--bin_size` argument) A contact map is defined by : + * A list of genomic intervals related to the specified resolution (BED format). * A matrix, stored as standard triplet sparse format (i.e. list format). -Based on the observation that a contact map is symmetric and usually sparse, only non-zero values are stored for half of the matrix. The user can specified if the 'upper', 'lower' or 'complete' matrix has to be stored. The 'asis' option allows to store the contacts as they are observed from the valid pairs files. +Based on the observation that a contact map is symmetric and usually sparse, +only non-zero values are stored for half of the matrix. The user can specify +if the 'upper', 'lower' or 'complete' matrix has to be stored.
The 'asis' +option allows to store the contacts as they are observed from the valid pairs +files. ```bash A B 10 @@ -109,19 +172,27 @@ Based on the observation that a contact map is symmetric and usually sparse, onl (...) ``` -This format is memory efficient, and is compatible with several software for downstream analysis. +This format is memory efficient, and is compatible with several software for +downstream analysis. ## MultiQC -[MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in within the report data directory. +[MultiQC](http://multiqc.info) is a visualisation tool that generates a single +HTML report summarising all samples in your project. Most of the pipeline QC +results are visualised in the report and further statistics are available +within the report data directory. -The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability. +The pipeline has special steps which allow the software versions used to be +reported in the MultiQC output for future traceability.
**Output directory: `results/multiqc`** * `Project_multiqc_report.html` - * MultiQC report - a standalone HTML file that can be viewed in your web browser + * MultiQC report - a standalone HTML file that can be viewed in your +web browser * `Project_multiqc_data/` - * Directory containing parsed statistics from the different tools used in the pipeline + * Directory containing parsed statistics from the different tools used +in the pipeline -For more information about how to use MultiQC reports, see [http://multiqc.info](http://multiqc.info) +For more information about how to use MultiQC reports, see +[http://multiqc.info](http://multiqc.info) diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md deleted file mode 100644 index e0f2d07..0000000 --- a/docs/troubleshooting.md +++ /dev/null @@ -1,28 +0,0 @@ -# nf-core/hic: Troubleshooting - -## Input files not found - -If only no file, only one input file , or only read one and not read two is picked up then something is wrong with your input file declaration - -1. The path must be enclosed in quotes (`'` or `"`) -2. The path must have at least one `*` wildcard character. This is even if you are only running one paired end sample. -3. When using the pipeline with paired end data, the path must use `{1,2}` or `{R1,R2}` notation to specify read pairs. -4. If you are running Single end data make sure to specify `--singleEnd` - -If the pipeline can't find your files then you will get the following error - -```bash -ERROR ~ Cannot find any reads matching: *{1,2}.fastq.gz -``` - -Note that if your sample name is "messy" then you have to be very particular with your glob specification. A file name like `L1-1-D-2h_S1_L002_R1_001.fastq.gz` can be difficult enough for a human to read. Specifying `*{1,2}*.gz` wont work give you what you want Whilst `*{R1,R2}*.gz` will. - - -## Data organization -The pipeline can't take a list of multiple input files - it takes a glob expression. 
If your input files are scattered in different paths then we recommend that you generate a directory with symlinked files. If running in paired end mode please make sure that your files are sensibly named so that they can be properly paired. See the previous point. - -## Extra resources and getting help -If you still have an issue with running the pipeline then feel free to contact us. -Have a look at the [pipeline website](https://github.com/nf-core/hic) to find out how. - -If you have problems that are related to Nextflow and not our pipeline then check out the [Nextflow gitter channel](https://gitter.im/nextflow-io/nextflow) or the [google group](https://groups.google.com/forum/#!forum/nextflow). diff --git a/docs/usage.md b/docs/usage.md index d166cf6..f1cd3a5 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -51,11 +51,12 @@ * [`--splitFastq`](#--splitFastq) * [`--saveReference`](#--saveReference) * [`--saveAlignedIntermediates`](#--saveAlignedIntermediates) + * [`--saveInteractionBAM`](#--saveInteractionBAM) * [Skip options](#skip-options) - * [--skip_maps](#--skip_maps) - * [--skip_ice](#--skip_ice) - * [--skip_cool](#--skip_cool) - * [--skip_multiqc](#--skip_multiqc) + * [--skipMaps](#--skipMaps) + * [--skipIce](#--skipIce) + * [--skipCool](#--skipCool) + * [--skipMultiQC](#--skipMultiQC) * [Job resources](#job-resources) * [Automatic resubmission](#automatic-resubmission) * [Custom resource requests](#custom-resource-requests) @@ -76,24 +77,32 @@ * [`--plaintext_email`](#--plaintext_email) * [`--multiqc_config`](#--multiqc_config) - ## General Nextflow info -Nextflow handles job submissions on SLURM or other environments, and supervises running the jobs. Thus the Nextflow process must run until the pipeline is finished. We recommend that you put the process running in the background through `screen` / `tmux` or similar tool. Alternatively you can run nextflow within a cluster job submitted your job scheduler. 
-It is recommended to limit the Nextflow Java virtual machines memory. We recommend adding the following line to your environment (typically in `~/.bashrc` or `~./bash_profile`): +Nextflow handles job submissions on SLURM or other environments, and supervises +running the jobs. Thus the Nextflow process must run until the pipeline is +finished. We recommend that you put the process running in the background +through `screen` / `tmux` or similar tool. Alternatively you can run nextflow +within a cluster job submitted to your job scheduler. + +It is recommended to limit the Nextflow Java virtual machines memory. +We recommend adding the following line to your environment (typically +in `~/.bashrc` or `~/.bash_profile`): ```bash NXF_OPTS='-Xms1g -Xmx4g' ``` ## Running the pipeline + The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/hic --reads '*_R{1,2}.fastq.gz' -genome GRCh37 -profile docker +nextflow run nf-core/hic --reads '*_R{1,2}.fastq.gz' --genome GRCh37 -profile docker ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +This will launch the pipeline with the `docker` configuration profile. +See below for more information about profiles. Note that the pipeline will create the following files in your working directory: @@ -105,26 +114,46 @@ results # Finished results (configurable, see below) ``` ### Updating the pipeline -When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version.
When running the pipeline after +this, it will always use the cached version if available - even if the pipeline +has been updated since. To make sure that you're running the latest version of +the pipeline, make sure that you regularly update the cached version of the +pipeline: ```bash nextflow pull nf-core/hic ``` ### Reproducibility -It's a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -First, go to the [nf-core/hic releases page](https://github.com/nf-core/hic/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. +It's a good idea to specify a pipeline version when running the pipeline on +your data. This ensures that a specific version of the pipeline code and +software are used when you run your pipeline. If you keep using the same tag, +you'll be running the same version of the pipeline, even if there have been +changes to the code since. -This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. +First, go to the +[nf-core/hic releases page](https://github.com/nf-core/hic/releases) and find +the latest version number - numeric only (eg. `1.3.1`). +Then specify this when running the pipeline with `-r` (one hyphen) +eg. `-r 1.3.1`. +This version number will be logged in reports when you run the pipeline, so +that you'll know what you used when you look back in the future. ## Main arguments ### `-profile` -Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. 
Note that multiple profiles can be loaded, for example: `-profile docker` - the order of arguments is important! -If `-profile` is not specified at all the pipeline will be run locally and expects all software to be installed and available on the `PATH`. +Use this parameter to choose a configuration profile. Profiles can give +configuration presets for different compute environments. Note that multiple +profiles can be loaded, for example: `-profile docker` - the order of arguments +is important! + +If `-profile` is not specified at all the pipeline will be run locally and +expects all software to be installed and available on the `PATH`. * `awsbatch` * A generic configuration profile to be used with AWS Batch. @@ -142,6 +171,7 @@ If `-profile` is not specified at all the pipeline will be run locally and expec * Includes links to test data so needs no other parameters ### `--reads` + Use this to specify the location of your input FastQ files. For example: ```bash @@ -152,18 +182,26 @@ Please note the following requirements: 1. The path must be enclosed in quotes 2. The path must have at least one `*` wildcard character -3. When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs. +3. When using the pipeline with paired end data, the path must use `{1,2}` +notation to specify read pairs. If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz` ## Reference genomes and annotation files -The pipeline config files come bundled with paths to the illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. +The pipeline config files come bundled with paths to the illumina iGenomes +reference index files. If running with docker or AWS, the configuration is +set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) +resource. 
### `--genome` (using iGenomes) -There are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag. -You can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are: +There are 31 different species supported in the iGenomes references. To run +the pipeline, you must specify which to use with the `--genome` flag. + +You can find the keys to specify the genomes in the +[iGenomes config file](../conf/igenomes.config). +Common genomes that are supported are: * Human * `--genome GRCh37` @@ -176,11 +214,13 @@ You can find the keys to specify the genomes in the [iGenomes config file](../co > There are numerous others - check the config file for more. -Note that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file. +Note that you can use the same configuration setup to save sets of reference +files for your own use, even if they are not part of the iGenomes resource. +See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) +for instructions on where to save such a file. The syntax for this reference configuration is as follows: - ```nextflow params { genomes { @@ -194,18 +234,26 @@ params { ``` ### `--fasta` -If you prefer, you can specify the full path to your reference genome when you run the pipeline: + +If you prefer, you can specify the full path to your reference genome when you +run the pipeline: ```bash --fasta '[path to Fasta reference]' ``` ### `--igenomesIgnore` -Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`. 
+ +Do not load `igenomes.config` when running the pipeline. You may choose this +option if you observe clashes between custom parameters and those supplied +in `igenomes.config`. ### `--bwt2_index` -The bowtie2 indexes are required to run the Hi-C pipeline. If the `--bwt2_index` is not specified, the pipeline will either use the igenome bowtie2 indexes (see `--genome` option) or build the indexes on-the-fly (see `--fasta` option) +The bowtie2 indexes are required to run the Hi-C pipeline. If the +`--bwt2_index` is not specified, the pipeline will either use the igenome +bowtie2 indexes (see `--genome` option) or build the indexes on-the-fly +(see `--fasta` option) ```bash --bwt2_index '[path to bowtie2 index (with basename)]' @@ -213,8 +261,10 @@ The bowtie2 indexes are required to run the Hi-C pipeline. If the `--bwt2_index` ### `--chromosome_size` -The Hi-C pipeline will also requires a two-columns text file with the chromosome name and its size (tab separated). -If not specified, this file will be automatically created by the pipeline. In the latter case, the `--fasta` reference genome has to be specified. +The Hi-C pipeline will also requires a two-columns text file with the +chromosome name and its size (tab separated). +If not specified, this file will be automatically created by the pipeline. +In the latter case, the `--fasta` reference genome has to be specified. ```bash chr1 249250621 @@ -236,7 +286,8 @@ If not specified, this file will be automatically created by the pipeline. In th ### `--restriction_fragments` -Finally, Hi-C experiments based on restriction enzyme digestion requires a BED file with coordinates of restriction fragments. +Finally, Hi-C experiments based on restriction enzyme digestion requires a BED +file with coordinates of restriction fragments. ```bash chr1 0 16007 HIC_chr1_1 0 + @@ -252,22 +303,30 @@ Finally, Hi-C experiments based on restriction enzyme digestion requires a BED f (...) 
``` -If not specified, this file will be automatically created by the pipline. In this case, the `--fasta` reference genome will be used. +If not specified, this file will be automatically created by the pipeline. +In this case, the `--fasta` reference genome will be used. Note that the `--restriction_site` parameter is mandatory to create this file. ## Hi-C specific options -The following options are defined in the `hicpro.config` file, and can be updated either using a custom configuration file (see `-c` option) or using command line parameter. +The following options are defined in the `hicpro.config` file, and can be +updated either using a custom configuration file (see `-c` option) or using +command line parameters. ### Reads mapping -The reads mapping is currently based on the two-steps strategy implemented in the HiC-pro pipeline. The idea is to first align reads from end-to-end. -Reads that do not aligned are then trimmed at the ligation site, and their 5' end is re-aligned to the reference genome. -Note that the default option are quite stringent, and can be updated according to the reads quality or the reference genome. +The reads mapping is currently based on the two-step strategy implemented in +the HiC-pro pipeline. The idea is to first align reads from end-to-end. +Reads that do not align are then trimmed at the ligation site, and their 5' +end is re-aligned to the reference genome. +Note that the default options are quite stringent, and can be updated according +to the reads quality or the reference genome. #### `--bwt2_opts_end2end` -Bowtie2 alignment option for end-to-end mapping. Default: '--very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end --reorder' +Bowtie2 alignment option for end-to-end mapping. +Default: '--very-sensitive -L 30 --score-min L,-0.6,-0.2 --end-to-end +--reorder' ```bash --bwt2_opts_end2end '[Options for bowtie2 step1 mapping on full reads]' @@ -275,7 +334,9 @@ Bowtie2 alignment option for end-to-end mapping.
Default: '--very-sensitive -L 3 #### `--bwt2_opts_trimmed` -Bowtie2 alignment option for trimmed reads mapping (step 2). Default: '--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end --reorder' +Bowtie2 alignment option for trimmed reads mapping (step 2). +Default: '--very-sensitive -L 20 --score-min L,-0.6,-0.2 --end-to-end +--reorder' ```bash --bwt2_opts_trimmed '[Options for bowtie2 step2 mapping on trimmed reads]' @@ -293,15 +354,20 @@ Minimum mapping quality. Reads with lower quality are discarded. Default: 10 #### `--restriction_site` -Restriction motif(s) for Hi-C digestion protocol. The restriction motif(s) is(are) used to generate the list of restriction fragments. -The precise cutting site of the restriction enzyme has to be specified using the '^' character. Default: 'A^AGCTT' +Restriction motif(s) for Hi-C digestion protocol. The restriction motif(s) +is(are) used to generate the list of restriction fragments. +The precise cutting site of the restriction enzyme has to be specified using +the '^' character. Default: 'A^AGCTT' Here are a few examples: -* MboI: '^GATC' -* DpnII: '^GATC' -* BglII: 'A^GATCT' -* HindIII: 'A^AGCTT' -Note that multiples restriction motifs can be provided (comma-separated). +* MboI: ^GATC +* DpnII: ^GATC +* BglII: A^GATCT +* HindIII: A^AGCTT +* ARIMA kit: ^GATC,^GANT + +Note that multiple restriction motifs can be provided (comma-separated) and +that 'N' bases are supported. ```bash --restriction_size '[Cutting motif]' @@ -309,16 +375,22 @@ Note that multiples restriction motifs can be provided (comma-separated). #### `--ligation_site` -Ligation motif after reads ligation. This motif is used for reads trimming and depends on the fill in strategy. -Note that multiple ligation sites can be specified. Default: 'AAGCTAGCTT' +Ligation motif after reads ligation. This motif is used for reads trimming and +depends on the fill in strategy.
+Note that multiple ligation sites can be specified (comma separated) and that +'N' base is interpreted and replaced by 'A','C','G','T'. +Default: 'AAGCTAGCTT' ```bash --ligation_site '[Ligation motif]' ``` +Example for the ARIMA kit: GATCGATC,GATCGANT,GANTGATC,GANTGANT + #### `--min_restriction_fragment_size` -Minimum size of restriction fragments to consider for the Hi-C processing. Default: '' +Minimum size of restriction fragments to consider for the Hi-C processing. +Default: '' ```bash --min_restriction_fragment_size '[numeric]' @@ -326,7 +398,8 @@ Minimum size of restriction fragments to consider for the Hi-C processing. Defau #### `--max_restriction_fragment_size` -Maximum size of restriction fragments to consider for the Hi-C processing. Default: '' +Maximum size of restriction fragments to consider for the Hi-C processing. +Default: '' ```bash --max_restriction_fragment_size '[numeric]' @@ -334,7 +407,8 @@ Maximum size of restriction fragments to consider for the Hi-C processing. Defau #### `--min_insert_size` -Minimum reads insert size. Shorter 3C products are discarded. Default: '' +Minimum reads insert size. Shorter 3C products are discarded. +Default: '' ```bash --min_insert_size '[numeric]' @@ -342,7 +416,8 @@ Minimum reads insert size. Shorter 3C products are discarded. Default: '' #### `--max_insert_size` -Maximum reads insert size. Longer 3C products are discarded. Default: '' +Maximum reads insert size. Longer 3C products are discarded. +Default: '' ```bash --max_insert_size '[numeric]' @@ -352,8 +427,10 @@ Maximum reads insert size. Longer 3C products are discarded. Default: '' #### `--dnase` -In DNAse Hi-C mode, all options related to digestion Hi-C (see previous section) are ignored. -In this case, it is highly recommanded to use the `--min_cis_dist` parameter to remove spurious ligation products. +In DNase Hi-C mode, all options related to digestion Hi-C +(see previous section) are ignored.
+In this case, it is highly recommended to use the `--min_cis_dist` parameter +to remove spurious ligation products. ```bash --dnase' @@ -363,7 +440,8 @@ In this case, it is highly recommanded to use the `--min_cis_dist` parameter to #### `--min_cis_dist` -Filter short range contact below the specified distance. Mainly useful for DNase Hi-C. Default: '' +Filter short-range contacts below the specified distance. +Mainly useful for DNase Hi-C. Default: '' ```bash --min_cis_dist '[numeric]' @@ -387,7 +465,9 @@ If specified, duplicates reads are discarded before building contact maps. #### `--rm_multi` -If specified, reads that aligned multiple times on the genome are discarded. Note the default mapping options are based on random hit assignment, meaning that only one position is kept per read. +If specified, reads that aligned multiple times on the genome are discarded. +Note the default mapping options are based on random hit assignment, meaning +that only one position is kept per read. ```bash --rm_multi @@ -395,41 +475,46 @@ If specified, reads that aligned multiple times on the genome are discarded. Not ## Genome-wide contact maps -#### `--bin_size` +### `--bin_size` -Resolution of contact maps to generate (space separated). Default:'1000000,500000' +Resolution of contact maps to generate (comma separated). +Default: '1000000,500000' ```bash --bins_size '[numeric]' ``` -#### `--ice_max_iter` +### `--ice_max_iter` -Maximum number of iteration for ICE normalization. Default: 100 +Maximum number of iterations for ICE normalization. +Default: 100 ```bash --ice_max_iter '[numeric]' ``` -#### `--ice_filer_low_count_perc` +### `--ice_filter_low_count_perc` -Define which pourcentage of bins with low counts should be force to zero. Default: 0.02 +Define which percentage of bins with low counts should be forced to zero.
+Default: 0.02 ```bash --ice_filter_low_count_perc '[numeric]' ``` -#### `--ice_filer_high_count_perc` +### `--ice_filter_high_count_perc` -Define which pourcentage of bins with low counts should be discarded before normalization. Default: 0 +Define which percentage of bins with high counts should be discarded before +normalization. Default: 0 ```bash --ice_filter_high_count_perc '[numeric]' ``` -#### `--ice_eps` +### `--ice_eps` -The relative increment in the results before declaring convergence for ICE normalization. Default: 0.1 +The relative increment in the results before declaring convergence for ICE +normalization. Default: 0.1 ```bash --ice_eps '[numeric]' @@ -437,108 +522,166 @@ The relative increment in the results before declaring convergence for ICE norma ## Inputs/Outputs -#### `--splitFastq` +### `--splitFastq` -By default, the nf-core Hi-C pipeline expects one read pairs per sample. However, for large Hi-C data processing single fastq files can be very time consuming. -The `--splitFastq` option allows to automatically split input read pairs into chunks of reads. In this case, all chunks will be processed in parallel and merged before generating the contact maps, thus leading to a significant increase of processing performance. +By default, the nf-core Hi-C pipeline expects one read pair per sample. +However, for large Hi-C data, processing single fastq files can be very +time consuming. +The `--splitFastq` option automatically splits input read pairs +into chunks of reads. In this case, all chunks will be processed in parallel +and merged before generating the contact maps, thus leading to a significant +increase of processing performance. ```bash --splitFastq '[Number of reads per chunk]' ``` -#### `--saveReference` +### `--saveReference` -If specified, annotation files automatically generated from the `--fasta` file are exported in the results folder.
Default: false +If specified, annotation files automatically generated from the `--fasta` file +are exported in the results folder. Default: false ```bash --saveReference ``` -#### `--saveAlignedIntermediates` +### `--saveAlignedIntermediates` -If specified, all intermediate mapping files are saved and exported in the results folder. Default: false +If specified, all intermediate mapping files are saved and exported in the +results folder. Default: false ```bash --saveReference ``` +### `--saveInteractionBAM` + +If specified, write a BAM file with all classified reads (valid pairs, +dangling end, self-circle, etc.) and their tags. + ## Skip options -#### `--skip_maps` +### `--skipMaps` -If defined, the workflow stops with the list of valid interactions, and the genome-wide maps are not built. Usefult for capture-C analysis. Default: false +If defined, the workflow stops with the list of valid interactions, and the +genome-wide maps are not built. Useful for capture-C analysis. Default: false ```bash ---skip_maps +--skipMaps ``` -#### `--skip_ice` +### `--skipIce` -If defined, the ICE normalization is not run on the raw contact maps. Default: false +If defined, the ICE normalization is not run on the raw contact maps. +Default: false ```bash ---skip_ice +--skipIce ``` -#### `--skip_cool` +### `--skipCool` If defined, cooler files are not generated. Default: false ```bash ---skip_cool +--skipCool ``` -#### `--skip_multiqc` +### `--skipMultiQC` If defined, the MultiQC report is not generated. Default: false ```bash ---skip_multiqc +--skipMultiQC ``` ## Job resources + ### Automatic resubmission -Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with an error code of `143` (exceeded requested resources) it will automatically resubmit with higher requests (2 x original, then 3 x original). If it still fails after three times then the pipeline is stopped.
-### Custom resource requests -Wherever process-specific requirements are set in the pipeline, the default value can be changed by creating a custom config file. See the files hosted at [`nf-core/configs`](https://github.com/nf-core/configs/tree/master/conf) for examples. +Each step in the pipeline has a default set of requirements for number of CPUs, +memory and time. For most of the steps in the pipeline, if the job exits with +an error code of `143` (exceeded requested resources) it will automatically +resubmit with higher requests (2 x original, then 3 x original). If it still +fails after three times then the pipeline is stopped. -If you are likely to be running `nf-core` pipelines regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter (see definition below). You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. +### Custom resource requests -If you have any questions or issues please send us a message on [`Slack`](https://nf-core-invite.herokuapp.com/). +Wherever process-specific requirements are set in the pipeline, the default +value can be changed by creating a custom config file. +See the files hosted at +[`nf-core/configs`](https://github.com/nf-core/configs/tree/master/conf) +for examples. + +If you are likely to be running `nf-core` pipelines regularly it may be a good +idea to request that your custom config file is uploaded to the +`nf-core/configs` git repository. 
Before you do this please can you test that +the config file works with your pipeline of choice using the `-c` parameter +(see definition below). You can then create a pull request to the +`nf-core/configs` repository with the addition of your config file, associated +documentation file (see examples in +[`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), +and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) +to include your custom profile. + +If you have any questions or issues please send us a message on +[`Slack`](https://nf-core-invite.herokuapp.com/). ## AWS Batch specific parameters -Running the pipeline on AWS Batch requires a couple of specific parameters to be set according to your AWS Batch configuration. Please use the `-awsbatch` profile and then specify all of the following parameters. + +Running the pipeline on AWS Batch requires a couple of specific parameters to +be set according to your AWS Batch configuration. Please use the `-awsbatch` +profile and then specify all of the following parameters. + ### `--awsqueue` + The JobQueue that you intend to use on AWS Batch. + ### `--awsregion` -The AWS region to run your job in. Default is set to `eu-west-1` but can be adjusted to your needs. -Please make sure to also set the `-w/--work-dir` and `--outdir` parameters to a S3 storage bucket of your choice - you'll get an error message notifying you if you didn't. +The AWS region to run your job in. Default is set to `eu-west-1` but can be +adjusted to your needs. + +Please make sure to also set the `-w/--work-dir` and `--outdir` parameters to +a S3 storage bucket of your choice - you'll get an error message notifying you +if you didn't. ## Other command line parameters ### `--outdir` + The output directory where the results will be saved. ### `--email` -Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. 
If set in your user config file (`~/.nextflow/config`) then you don't need to speicfy this on the command line for every run. + +Set this parameter to your e-mail address to get a summary e-mail with details +of the run sent to you when the workflow exits. If set in your user config file +(`~/.nextflow/config`) then you don't need to specify this on the command line +for every run. ### `-name` -Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. -This is used in the MultiQC report (if not default) and in the summary HTML / e-mail (always). +Name for the pipeline run. If not specified, Nextflow will automatically generate +a random mnemonic. + +This is used in the MultiQC report (if not default) and in the summary HTML / +e-mail (always). **NB:** Single hyphen (core Nextflow option) ### `-resume` -Specify this when restarting a pipeline. Nextflow will used cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. -You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. +Specify this when restarting a pipeline. Nextflow will use cached results from +any pipeline steps where the inputs are the same, continuing from where it got +to previously. + +You can also supply a run name to resume a specific run: `-resume [run-name]`. +Use the `nextflow log` command to show previous run names. **NB:** Single hyphen (core Nextflow option) ### `-c` + Specify the path to a specific config file (this is a core NextFlow command). **NB:** Single hyphen (core Nextflow option) @@ -546,7 +689,10 @@ Specify the path to a specific config file (this is a core NextFlow command). Note - you can use this to override pipeline defaults. ### `--custom_config_version` -Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes.
Default is set to `master`. + +Provide git commit id for custom Institutional configs hosted at +`nf-core/configs`. This was implemented for reproducibility purposes. +Default is set to `master`. ```bash ## Download and use config file with following git commid id @@ -554,19 +700,24 @@ Provide git commit id for custom Institutional configs hosted at `nf-core/config ``` ### `--max_memory` + Use to set a top-limit for the default memory requirement for each process. Should be a string in the format integer-unit. eg. `--max_memory '8.GB'` ### `--max_time` + Use to set a top-limit for the default time requirement for each process. Should be a string in the format integer-unit. eg. `--max_time '2.h'` ### `--max_cpus` + Use to set a top-limit for the default CPU requirement for each process. Should be a string in the format integer-unit. eg. `--max_cpus 1` ### `--plaintext_email` + Set to receive plain-text e-mails instead of HTML formatted. ### `--multiqc_config` + Specify a path to a custom MultiQC configuration file. diff --git a/environment.yml b/environment.yml index 34958b7..4d9c20c 100644 --- a/environment.yml +++ b/environment.yml @@ -1,6 +1,6 @@ # You can use this file to create a conda environment for this pipeline: # conda env create -f environment.yml -name: nf-core-hic-1.0.0 +name: nf-core-hic-1.1.0 channels: - conda-forge - bioconda diff --git a/main.nf b/main.nf index ce29fd5..a0f7ac1 100644 --- a/main.nf +++ b/main.nf @@ -22,57 +22,62 @@ def helpMessage() { nextflow run nf-core/hic --reads '*_R{1,2}.fastq.gz' -profile conda Mandatory arguments: - --reads Path to input data (must be surrounded with quotes) - -profile Configuration profile to use. Can use multiple (comma separated) - Available: conda, docker, singularity, awsbatch, test and more. - - References: If not specified in the configuration file or you wish to overwrite any of the references. 
- --genome Name of iGenomes reference - --bwt2_index Path to Bowtie2 index - --fasta Path to Fasta reference - --chromosome_size Path to chromosome size file - --restriction_fragments Path to restriction fragment file (bed) - - Options: - --bwt2_opts_end2end Options for bowtie2 end-to-end mappinf (first mapping step) - --bwt2_opts_trimmed Options for bowtie2 mapping after ligation site trimming - --min_mapq Minimum mapping quality values to consider - - --restriction_site Cutting motif(s) of restriction enzyme(s) (comma separated) - --ligation_site Ligation motifs to trim (comma separated) - --min_restriction_fragment_size Minimum size of restriction fragments to consider - --max_restriction_framgnet_size Maximum size of restriction fragmants to consider - --min_insert_size Minimum insert size of mapped reads to consider - --max_insert_size Maximum insert size of mapped reads to consider - - --dnase Run DNase Hi-C mode. All options related to restriction fragments are not considered - - --min_cis_dist Minimum intra-chromosomal distance to consider - --rm_singleton Remove singleton reads - --rm_multi Remove multi-mapped reads - --rm_dup Remove duplicates - - --bin_size Bin size for contact maps (comma separated) - --ice_max_iter Maximum number of iteration for ICE normalization - --ice_filter_low_count_perc Percentage of low counts columns/rows to filter before ICE normalization - --ice_filter_high_count_perc Percentage of high counts columns/rows to filter before ICE normalization - --ice_eps Convergence criteria for ICE normalization - - Other options: - --splitFastq Size of read chuncks to use to speed up the workflow - --outdir The output directory where the results will be saved - --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits - -name Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. 
- - Step options: - --skip_maps Skip generation of contact maps. Useful for capture-C - --skip_ice Skip ICE normalization - --skip_cool Skip generation of cooler files - --skip_multiQC Skip MultiQC - - AWSBatch options: - --awsqueue The AWSBatch JobQueue that needs to be set when running on AWSBatch - --awsregion The AWS Region for your AWS Batch job to run on + --reads Path to input data (must be surrounded with quotes) + -profile Configuration profile to use. Can use multiple (comma separated) + Available: conda, docker, singularity, awsbatch, test and more. + + References If not specified in the configuration file or you wish to overwrite any of the references. + --genome Name of iGenomes reference + --bwt2_index Path to Bowtie2 index + --fasta Path to Fasta reference + --chromosome_size Path to chromosome size file + --restriction_fragments Path to restriction fragment file (bed) + --saveReference Save reference genome to output folder. Default: False + --saveAlignedIntermediates Save intermediates alignment files. Default: False + + Alignments + --bwt2_opts_end2end Options for bowtie2 end-to-end mappinf (first mapping step). See hic.config for default. + --bwt2_opts_trimmed Options for bowtie2 mapping after ligation site trimming. See hic.config for default. + --min_mapq Minimum mapping quality values to consider. Default: 10 + --restriction_site Cutting motif(s) of restriction enzyme(s) (comma separated). Default: 'A^AGCTT' + --ligation_site Ligation motifs to trim (comma separated). Default: 'AAGCTAGCTT' + --rm_singleton Remove singleton reads. Default: true + --rm_multi Remove multi-mapped reads. Default: true + --rm_dup Remove duplicates. Default: true + + Contacts calling + --min_restriction_fragment_size Minimum size of restriction fragments to consider. Default: None + --max_restriction_framgnet_size Maximum size of restriction fragmants to consider. Default: None + --min_insert_size Minimum insert size of mapped reads to consider. 
Default: None + --max_insert_size Maximum insert size of mapped reads to consider. Default: None + --saveInteractionBAM Save BAM file with interaction tags (dangling-end, self-circle, etc.). Default: False + + --dnase Run DNase Hi-C mode. All options related to restriction fragments are not considered. Default: False + --min_cis_dist Minimum intra-chromosomal distance to consider. Default: None + + Contact maps + --bin_size Bin size for contact maps (comma separated). Default: '1000000,500000' + --ice_max_iter Maximum number of iteration for ICE normalization. Default: 100 + --ice_filter_low_count_perc Percentage of low counts columns/rows to filter before ICE normalization. Default: 0.02 + --ice_filter_high_count_perc Percentage of high counts columns/rows to filter before ICE normalization. Default: 0 + --ice_eps Convergence criteria for ICE normalization. Default: 0.1 + + + Workflow + --skipMaps Skip generation of contact maps. Useful for capture-C. Default: False + --skipIce Skip ICE normalization. Default: False + --skipCool Skip generation of cool files. Default: False + --skipMultiQC Skip MultiQC. Default: False + + Other + --splitFastq Size of read chuncks to use to speed up the workflow. Default: None + --outdir The output directory where the results will be saved. Default: './results' + --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. Default: None + -name Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. 
Default: None + + AWSBatch + --awsqueue The AWSBatch JobQueue that needs to be set when running on AWSBatch + --awsregion The AWS Region for your AWS Batch job to run on """.stripIndent() } @@ -80,7 +85,7 @@ def helpMessage() { * SET UP CONFIGURATION VARIABLES */ -// Show help emssage +// Show help message if (params.help){ helpMessage() exit 0 @@ -154,7 +159,7 @@ if ( params.splitFastq ){ raw_reads_full = raw_reads.concat( raw_reads_2 ) raw_reads = raw_reads_full.splitFastq( by: params.splitFastq , file: true) }else{ - raw_reads = raw_reads.concat( raw_reads_2 ) + raw_reads = raw_reads.concat( raw_reads_2 ).dump(tag: "data") } @@ -240,6 +245,12 @@ summary['Restriction Motif']= params.restriction_site summary['Ligation Motif'] = params.ligation_site summary['DNase Mode'] = params.dnase summary['Remove Dup'] = params.rm_dup +summary['Min MAPQ'] = params.min_mapq +summary['Min Fragment Size']= params.min_restriction_fragment_size +summary['Max Fragment Size']= params.max_restriction_framgnet_size +summary['Min Insert Size'] = params.min_insert_size +summary['Max Insert Size'] = params.max_insert_size +summary['Min CIS dist'] = params.min_cis_dist summary['Maps resolution'] = params.bin_size summary['Max Memory'] = params.max_memory @@ -270,7 +281,7 @@ if(params.email) { summary['MultiQC maxsize'] = params.maxMultiqcEmailFileSize } log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") -log.info "\033[2m----------------------------------------------------\033[0m" +log.info "-\033[2m--------------------------------------------------\033[0m-" // Check the hostnames against configured profiles checkHostname() @@ -400,36 +411,36 @@ process bowtie2_end_to_end { saveAs: { params.saveAlignedIntermediates ? 
it : null }, mode: 'copy' input: - set val(sample), file(reads) from raw_reads - file index from bwt2_index_end2end.collect() + set val(sample), file(reads) from raw_reads + file index from bwt2_index_end2end.collect() output: - set val(prefix), file("${prefix}_unmap.fastq") into unmapped_end_to_end - set val(prefix), file("${prefix}.bam") into end_to_end_bam + set val(prefix), file("${prefix}_unmap.fastq") into unmapped_end_to_end + set val(prefix), file("${prefix}.bam") into end_to_end_bam script: - prefix = reads.toString() - ~/(\.fq)?(\.fastq)?(\.gz)?$/ - def bwt2_opts = params.bwt2_opts_end2end - - if (!params.dnase){ - """ - bowtie2 --rg-id BMG --rg SM:${prefix} \\ - ${bwt2_opts} \\ - -p ${task.cpus} \\ - -x ${index}/${bwt2_base} \\ - --un ${prefix}_unmap.fastq \\ - -U ${reads} | samtools view -F 4 -bS - > ${prefix}.bam - """ - }else{ - """ - bowtie2 --rg-id BMG --rg SM:${prefix} \\ - ${bwt2_opts} \\ - -p ${task.cpus} \\ - -x ${index}/${bwt2_base} \\ - --un ${prefix}_unmap.fastq \\ - -U ${reads} > ${prefix}.bam - """ - } + prefix = reads.toString() - ~/(\.fq)?(\.fastq)?(\.gz)?$/ + def bwt2_opts = params.bwt2_opts_end2end + + if (!params.dnase){ + """ + bowtie2 --rg-id BMG --rg SM:${prefix} \\ + ${bwt2_opts} \\ + -p ${task.cpus} \\ + -x ${index}/${bwt2_base} \\ + --un ${prefix}_unmap.fastq \\ + -U ${reads} | samtools view -F 4 -bS - > ${prefix}.bam + """ + }else{ + """ + bowtie2 --rg-id BMG --rg SM:${prefix} \\ + ${bwt2_opts} \\ + -p ${task.cpus} \\ + -x ${index}/${bwt2_base} \\ + --un ${prefix}_unmap.fastq \\ + -U ${reads} > ${prefix}.bam + """ + } } process trim_reads { @@ -438,20 +449,20 @@ process trim_reads { saveAs: { params.saveAlignedIntermediates ? 
it : null }, mode: 'copy' when: - !params.dnase + !params.dnase input: - set val(prefix), file(reads) from unmapped_end_to_end + set val(prefix), file(reads) from unmapped_end_to_end output: - set val(prefix), file("${prefix}_trimmed.fastq") into trimmed_reads + set val(prefix), file("${prefix}_trimmed.fastq") into trimmed_reads script: - """ - cutsite_trimming --fastq $reads \\ - --cutsite ${params.ligation_site} \\ - --out ${prefix}_trimmed.fastq - """ + """ + cutsite_trimming --fastq $reads \\ + --cutsite ${params.ligation_site} \\ + --out ${prefix}_trimmed.fastq + """ } process bowtie2_on_trimmed_reads { @@ -460,24 +471,24 @@ process bowtie2_on_trimmed_reads { saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy' when: - !params.dnase + !params.dnase input: - set val(prefix), file(reads) from trimmed_reads - file index from bwt2_index_trim.collect() + set val(prefix), file(reads) from trimmed_reads + file index from bwt2_index_trim.collect() output: - set val(prefix), file("${prefix}_trimmed.bam") into trimmed_bam + set val(prefix), file("${prefix}_trimmed.bam") into trimmed_bam script: - prefix = reads.toString() - ~/(_trimmed)?(\.fq)?(\.fastq)?(\.gz)?$/ - """ - bowtie2 --rg-id BMG --rg SM:${prefix} \\ - ${params.bwt2_opts_trimmed} \\ - -p ${task.cpus} \\ - -x ${index}/${bwt2_base} \\ - -U ${reads} | samtools view -bS - > ${prefix}_trimmed.bam - """ + prefix = reads.toString() - ~/(_trimmed)?(\.fq)?(\.fastq)?(\.gz)?$/ + """ + bowtie2 --rg-id BMG --rg SM:${prefix} \\ + ${params.bwt2_opts_trimmed} \\ + -p ${task.cpus} \\ + -x ${index}/${bwt2_base} \\ + -U ${reads} | samtools view -bS - > ${prefix}_trimmed.bam + """ } if (!params.dnase){ @@ -487,39 +498,39 @@ if (!params.dnase){ saveAs: { params.saveAlignedIntermediates ? 
it : null }, mode: 'copy' input: - set val(prefix), file(bam1), file(bam2) from end_to_end_bam.join( trimmed_bam ) + set val(prefix), file(bam1), file(bam2) from end_to_end_bam.join( trimmed_bam ) output: - set val(sample), file("${prefix}_bwt2merged.bam") into bwt2_merged_bam - set val(oname), file("${prefix}.mapstat") into all_mapstat + set val(sample), file("${prefix}_bwt2merged.bam") into bwt2_merged_bam + set val(oname), file("${prefix}.mapstat") into all_mapstat script: - sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2)/ - tag = prefix.toString() =~/_R1|_val_1/ ? "R1" : "R2" - oname = prefix.toString() - ~/(\.[0-9]+)$/ + sample = prefix.toString() - ~/(_R1$|_R2$|_val_1$|_val_2$|_1$|_2$)/ + tag = prefix.toString() =~/_R1$|_val_1$|_1$/ ? "R1" : "R2" + oname = prefix.toString() - ~/(\.[0-9]+)$/ - """ - samtools merge -@ ${task.cpus} \\ - -f ${prefix}_bwt2merged.bam \\ - ${bam1} ${bam2} + """ + samtools merge -@ ${task.cpus} \\ + -f ${prefix}_bwt2merged.bam \\ + ${bam1} ${bam2} - samtools sort -@ ${task.cpus} -m 800M \\ + samtools sort -@ ${task.cpus} -m 800M \\ -n -T /tmp/ \\ -o ${prefix}_bwt2merged.sorted.bam \\ ${prefix}_bwt2merged.bam - mv ${prefix}_bwt2merged.sorted.bam ${prefix}_bwt2merged.bam - - echo "## ${prefix}" > ${prefix}.mapstat - echo -n "total_${tag}\t" >> ${prefix}.mapstat - samtools view -c ${prefix}_bwt2merged.bam >> ${prefix}.mapstat - echo -n "mapped_${tag}\t" >> ${prefix}.mapstat - samtools view -c -F 4 ${prefix}_bwt2merged.bam >> ${prefix}.mapstat - echo -n "global_${tag}\t" >> ${prefix}.mapstat - samtools view -c -F 4 ${bam1} >> ${prefix}.mapstat - echo -n "local_${tag}\t" >> ${prefix}.mapstat - samtools view -c -F 4 ${bam2} >> ${prefix}.mapstat - """ + mv ${prefix}_bwt2merged.sorted.bam ${prefix}_bwt2merged.bam + + echo "## ${prefix}" > ${prefix}.mapstat + echo -n "total_${tag}\t" >> ${prefix}.mapstat + samtools view -c ${prefix}_bwt2merged.bam >> ${prefix}.mapstat + echo -n "mapped_${tag}\t" >> ${prefix}.mapstat + samtools view -c 
-F 4 ${prefix}_bwt2merged.bam >> ${prefix}.mapstat + echo -n "global_${tag}\t" >> ${prefix}.mapstat + samtools view -c -F 4 ${bam1} >> ${prefix}.mapstat + echo -n "local_${tag}\t" >> ${prefix}.mapstat + samtools view -c -F 4 ${bam2} >> ${prefix}.mapstat + """ } }else{ process dnase_mapping_stats{ @@ -528,56 +539,57 @@ if (!params.dnase){ saveAs: { params.saveAlignedIntermediates ? it : null }, mode: 'copy' input: - set val(prefix), file(bam1) from end_to_end_bam + set val(prefix), file(bam1) from end_to_end_bam output: - set val(sample), file(bam1) into bwt2_merged_bam - set val(oname), file("${prefix}.mapstat") into all_mapstat + set val(sample), file(bam1) into bwt2_merged_bam + set val(oname), file("${prefix}.mapstat") into all_mapstat script: - sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2)/ - tag = prefix.toString() =~/_R1|_val_1/ ? "R1" : "R2" - oname = prefix.toString() - ~/(\.[0-9]+)$/ - - """ - echo "## ${prefix}" > ${prefix}.mapstat - echo -n "total_${tag}\t" >> ${prefix}.mapstat - samtools view -c ${bam1} >> ${prefix}.mapstat - echo -n "mapped_${tag}\t" >> ${prefix}.mapstat - samtools view -c -F 4 ${bam1} >> ${prefix}.mapstat - echo -n "global_${tag}\t" >> ${prefix}.mapstat - samtools view -c -F 4 ${bam1} >> ${prefix}.mapstat - echo -n "local_${tag}\t0" >> ${prefix}.mapstat - """ + sample = prefix.toString() - ~/(_R1$|_R2$|_val_1$|_val_2$|_1$|_2$)/ + tag = prefix.toString() =~/_R1$|_val_1$|_1$/ ? 
"R1" : "R2" + oname = prefix.toString() - ~/(\.[0-9]+)$/ + + """ + echo "## ${prefix}" > ${prefix}.mapstat + echo -n "total_${tag}\t" >> ${prefix}.mapstat + samtools view -c ${bam1} >> ${prefix}.mapstat + echo -n "mapped_${tag}\t" >> ${prefix}.mapstat + samtools view -c -F 4 ${bam1} >> ${prefix}.mapstat + echo -n "global_${tag}\t" >> ${prefix}.mapstat + samtools view -c -F 4 ${bam1} >> ${prefix}.mapstat + echo -n "local_${tag}\t0" >> ${prefix}.mapstat + """ } } + process combine_mapped_files{ tag "$sample = $r1_prefix + $r2_prefix" publishDir "${params.outdir}/mapping", mode: 'copy', saveAs: {filename -> filename.indexOf(".pairstat") > 0 ? "stats/$filename" : "$filename"} input: - set val(sample), file(aligned_bam) from bwt2_merged_bam.groupTuple() + set val(sample), file(aligned_bam) from bwt2_merged_bam.groupTuple() output: - set val(sample), file("${sample}_bwt2pairs.bam") into paired_bam - set val(oname), file("*.pairstat") into all_pairstat + set val(sample), file("${sample}_bwt2pairs.bam") into paired_bam + set val(oname), file("*.pairstat") into all_pairstat script: - r1_bam = aligned_bam[0] - r1_prefix = r1_bam.toString() - ~/_bwt2merged.bam$/ - r2_bam = aligned_bam[1] - r2_prefix = r2_bam.toString() - ~/_bwt2merged.bam$/ - oname = sample.toString() - ~/(\.[0-9]+)$/ - - def opts = "-t" - opts = params.rm_singleton ? "${opts}" : "--single ${opts}" - opts = params.rm_multi ? "${opts}" : "--multi ${opts}" - if ("$params.min_mapq".isInteger()) opts="${opts} -q ${params.min_mapq}" - """ - mergeSAM.py -f ${r1_bam} -r ${r2_bam} -o ${sample}_bwt2pairs.bam ${opts} - """ + r1_bam = aligned_bam[0] + r1_prefix = r1_bam.toString() - ~/_bwt2merged.bam$/ + r2_bam = aligned_bam[1] + r2_prefix = r2_bam.toString() - ~/_bwt2merged.bam$/ + oname = sample.toString() - ~/(\.[0-9]+)$/ + + def opts = "-t" + opts = params.rm_singleton ? "${opts}" : "--single ${opts}" + opts = params.rm_multi ? 
"${opts}" : "--multi ${opts}" + if ("$params.min_mapq".isInteger()) opts="${opts} -q ${params.min_mapq}" + """ + mergeSAM.py -f ${r1_bam} -r ${r2_bam} -o ${sample}_bwt2pairs.bam ${opts} + """ } @@ -592,29 +604,33 @@ if (!params.dnase){ saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$filename" : "$filename"} input: - set val(sample), file(pe_bam) from paired_bam - file frag_file from res_frag_file.collect() + set val(sample), file(pe_bam) from paired_bam + file frag_file from res_frag_file.collect() output: - set val(sample), file("*.validPairs") into valid_pairs - set val(sample), file("*.validPairs") into valid_pairs_4cool - set val(sample), file("*RSstat") into all_rsstat + set val(sample), file("*.validPairs") into valid_pairs + set val(sample), file("*.validPairs") into valid_pairs_4cool + set val(sample), file("*.DEPairs") into de_pairs + set val(sample), file("*.SCPairs") into sc_pairs + set val(sample), file("*.REPairs") into re_pairs + set val(sample), file("*.FiltPairs") into filt_pairs + set val(sample), file("*RSstat") into all_rsstat script: - if (params.splitFastq){ - sample = sample.toString() - ~/(\.[0-9]+)$/ - } - - def opts = "" - if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}" - if ("$params.min_insert_size".isInteger()) opts="${opts} -s ${params.min_insert_size}" - if ("$params.max_insert_size".isInteger()) opts="${opts} -l ${params.max_insert_size}" - if ("$params.min_restriction_fragment_size".isInteger()) opts="${opts} -t ${params.min_restriction_fragment_size}" - if ("$params.max_restriction_fragment_size".isInteger()) opts="${opts} -m ${params.max_restriction_fragment_size}" - - """ - mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} ${opts} - """ + if (params.splitFastq){ + sample = sample.toString() - ~/(\.[0-9]+)$/ + } + + def opts = "" + if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}" + if ("$params.min_insert_size".isInteger()) opts="${opts} -s 
${params.min_insert_size}" + if ("$params.max_insert_size".isInteger()) opts="${opts} -l ${params.max_insert_size}" + if ("$params.min_restriction_fragment_size".isInteger()) opts="${opts} -t ${params.min_restriction_fragment_size}" + if ("$params.max_restriction_fragment_size".isInteger()) opts="${opts} -m ${params.max_restriction_fragment_size}" + if (params.saveInteractionBAM) opts="${opts} --sam" + """ + mapped_2hic_fragments.py -f ${frag_file} -r ${pe_bam} --all ${opts} + """ } } else{ @@ -624,23 +640,23 @@ else{ saveAs: {filename -> filename.indexOf("*stat") > 0 ? "stats/$filename" : "$filename"} input: - set val(sample), file(pe_bam) from paired_bam + set val(sample), file(pe_bam) from paired_bam output: - set val(sample), file("*.validPairs") into valid_pairs - set val(sample), file("*.validPairs") into valid_pairs_4cool - set val(sample), file("*RSstat") into all_rsstat + set val(sample), file("*.validPairs") into valid_pairs + set val(sample), file("*.validPairs") into valid_pairs_4cool + set val(sample), file("*RSstat") into all_rsstat script: - if (params.splitFastq){ - sample = sample.toString() - ~/(\.[0-9]+)$/ - } - - def opts = "" - if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}" - """ - mapped_2hic_dnase.py -r ${pe_bam} ${opts} - """ + if (params.splitFastq){ + sample = sample.toString() - ~/(\.[0-9]+)$/ + } + + def opts = "" + if ("$params.min_cis_dist".isInteger()) opts="${opts} -d ${params.min_cis_dist}" + """ + mapped_2hic_dnase.py -r ${pe_bam} ${opts} + """ } } @@ -655,12 +671,12 @@ process remove_duplicates { saveAs: {filename -> filename.indexOf("*stat") > 0 ? 
"stats/$sample/$filename" : "$filename"} input: - set val(sample), file(vpairs) from valid_pairs.groupTuple() + set val(sample), file(vpairs) from valid_pairs.groupTuple() output: - set val(sample), file("*.allValidPairs") into all_valid_pairs - set val(sample), file("*.allValidPairs") into all_valid_pairs_4cool - file("stats/") into all_mergestat + set val(sample), file("*.allValidPairs") into all_valid_pairs + set val(sample), file("*.allValidPairs") into all_valid_pairs_4cool + file("stats/") into all_mergestat script: if ( params.rm_dup ){ @@ -700,21 +716,21 @@ process merge_sample { publishDir "${params.outdir}/hic_results/stats/${sample}", mode: 'copy' input: - set val(prefix), file(fstat) from all_mapstat.groupTuple().concat(all_pairstat.groupTuple(), all_rsstat.groupTuple()) + set val(prefix), file(fstat) from all_mapstat.groupTuple().concat(all_pairstat.groupTuple(), all_rsstat.groupTuple()) - output: - file("mstats/") into all_mstats + output: + file("mstats/") into all_mstats - script: - sample = prefix.toString() - ~/(_R1|_R2|_val_1|_val_2)/ - if ( (fstat =~ /.mapstat/) ){ ext = "mmapstat" } - if ( (fstat =~ /.pairstat/) ){ ext = "mpairstat" } - if ( (fstat =~ /.RSstat/) ){ ext = "mRSstat" } + script: + sample = prefix.toString() - ~/(_R1$|_R2$|_val_1$|_val_2$|_1$|_2$)/ + if ( (fstat =~ /.mapstat/) ){ ext = "mmapstat" } + if ( (fstat =~ /.pairstat/) ){ ext = "mpairstat" } + if ( (fstat =~ /.RSstat/) ){ ext = "mRSstat" } - """ - mkdir -p mstats/${sample} - merge_statfiles.py -f ${fstat} > mstats/${sample}/${prefix}.${ext} - """ + """ + mkdir -p mstats/${sample} + merge_statfiles.py -f ${fstat} > mstats/${sample}/${prefix}.${ext} + """ } @@ -723,15 +739,15 @@ process build_contact_maps{ publishDir "${params.outdir}/hic_results/matrix/raw", mode: 'copy' when: - !params.skip_maps + !params.skipMaps input: - set val(sample), file(vpairs), val(mres) from all_valid_pairs.combine(map_res) - file chrsize from chromosome_size.collect() + set val(sample), 
file(vpairs), val(mres) from all_valid_pairs.combine(map_res) + file chrsize from chromosome_size.collect() output: - file("*.matrix") into raw_maps - file "*.bed" + file("*.matrix") into raw_maps + file "*.bed" script: """ @@ -748,14 +764,14 @@ process run_ice{ publishDir "${params.outdir}/hic_results/matrix/iced", mode: 'copy' when: - !params.skip_maps && !params.skip_ice + !params.skipMaps && !params.skipIce input: - file(rmaps) from raw_maps - file "*.biases" + file(rmaps) from raw_maps + file "*.biases" output: - file("*iced.matrix") into iced_maps + file("*iced.matrix") into iced_maps script: prefix = rmaps.toString() - ~/(\.matrix)?$/ @@ -776,14 +792,14 @@ process generate_cool{ publishDir "${params.outdir}/export/cool", mode: 'copy' when: - !params.skip_cool + !params.skipCool input: - set val(sample), file(vpairs) from all_valid_pairs_4cool - file chrsize from chromosome_size_cool.collect() + set val(sample), file(vpairs) from all_valid_pairs_4cool + file chrsize from chromosome_size_cool.collect() output: - file("*mcool") into cool_maps + file("*mcool") into cool_maps script: """ @@ -793,51 +809,50 @@ process generate_cool{ /* - * STEP 5 - MultiQC + * STEP 6 - MultiQC */ process multiqc { - publishDir "${params.outdir}/MultiQC", mode: 'copy' + publishDir "${params.outdir}/MultiQC", mode: 'copy' - when: - !params.skip_multiqc - - input: - file multiqc_config from ch_multiqc_config - file ('input_*/*') from all_mstats.concat(all_mergestat).collect() - file ('software_versions/*') from software_versions_yaml - file workflow_summary from create_workflow_summary(summary) + when: + !params.skipMultiQC - output: - file "*multiqc_report.html" into multiqc_report - file "*_data" + input: + file multiqc_config from ch_multiqc_config + file ('input_*/*') from all_mstats.concat(all_mergestat).collect() + file ('software_versions/*') from software_versions_yaml + file workflow_summary from create_workflow_summary(summary) - script: - rtitle = custom_runName ? 
"--title \"$custom_runName\"" : '' - rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' + output: + file "*multiqc_report.html" into multiqc_report + file "*_data" - """ - multiqc -f $rtitle $rfilename --config $multiqc_config . - """ + script: + rtitle = custom_runName ? "--title \"$custom_runName\"" : '' + rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' + """ + multiqc -f $rtitle $rfilename --config $multiqc_config . + """ } /* - * STEP 3 - Output Description HTML + * STEP 7 - Output Description HTML */ process output_documentation { - publishDir "${params.outdir}/pipeline_info", mode: 'copy' + publishDir "${params.outdir}/pipeline_info", mode: 'copy' - input: - file output_docs from ch_output_docs + input: + file output_docs from ch_output_docs - output: - file "results_description.html" + output: + file "results_description.html" - script: - """ - markdown_to_html.r $output_docs results_description.html - """ + script: + """ + markdown_to_html.r $output_docs results_description.html + """ } @@ -938,10 +953,10 @@ workflow.onComplete { c_green = params.monochrome_logs ? '' : "\033[0;32m"; c_red = params.monochrome_logs ? 
'' : "\033[0;31m"; - if (workflow.stats.ignoredCountFmt > 0 && workflow.success) { + if (workflow.stats.ignoredCount > 0 && workflow.success) { log.info "${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}" - log.info "${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCountFmt} ${c_reset}" - log.info "${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCountFmt} ${c_reset}" + log.info "${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}" + log.info "${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}" } if(workflow.success){ @@ -966,14 +981,14 @@ def nfcoreHeader(){ c_cyan = params.monochrome_logs ? '' : "\033[0;36m"; c_white = params.monochrome_logs ? '' : "\033[0;37m"; - return """ ${c_dim}----------------------------------------------------${c_reset} + return """ -${c_dim}--------------------------------------------------${c_reset}- ${c_green},--.${c_black}/${c_green},-.${c_reset} ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} ${c_green}`._,._,\'${c_reset} - ${c_purple} nf-core/hic v${workflow.manifest.version}${c_reset} - ${c_dim}----------------------------------------------------${c_reset} + ${c_purple} nf-core/atacseq v${workflow.manifest.version}${c_reset} + -${c_dim}--------------------------------------------------${c_reset}- """.stripIndent() } diff --git a/nextflow.config b/nextflow.config index 356f200..a521f3f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,10 +16,10 @@ params { readPaths = false chromosome_size = false restriction_fragments = false - skip_maps = false - skip_ice = false - skip_cool = false - skip_multiqc = false + skipMaps = false + skipIce = false + skipCool = false + skipMultiQC = false dnase = false // Boilerplate options @@ 
-45,7 +45,7 @@ params { // Container slug. Stable releases should specify release tag! // Developmental code should specify :dev -process.container = 'nfcore/hic:1.0.0' +process.container = 'nfcore/hic:1.1.0' // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -101,8 +101,8 @@ manifest { homePage = 'https://github.com/nf-core/hic' description = 'Analysis of Chromosome Conformation Capture data (Hi-C)' mainScript = 'main.nf' - nextflowVersion = '>=0.32.0' - version = '1.0.0' + nextflowVersion = '>=19.04.0' + version = '1.1.0' } // Function to ensure that resource requirements don't go beyond