Merge pull request #5 from seanome/olgabot/multisearch

Add multisearch with probability of overlap calculation
seanome · Oct 18, 2024 · 6c8940c · 6c8940c
2 parents 863be75 + 4aaafce
commit 6c8940c
Show file tree

Hide file tree

Showing 18 changed files with 357 additions and 19 deletions.
diff --git a/Makefile b/Makefile
@@ -7,6 +7,11 @@ test_conda: clean
 test_docker: clean
 	nextflow run -profile docker,test --outdir ./results .
 
+# Re-run the docker test profile with -resume; unlike test_docker, this does not depend on clean
+.PHONY: resume_docker
+resume_docker:
+	nextflow run -profile docker,test -resume --outdir ./results .
+
 debug_conda: clean
 	nextflow run . -profile debug,test,conda --outdir ./results
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -35,6 +35,21 @@ process {
         ext.args = "--by-size ${params.split2_size} --extension '.gz'"
     }
 
+    withName: '.*SOURMASH.*SKETCH'{ // regex selector for the sourmash sketch process(es)
+        publishDir = [
+            path: { "${params.outdir}/sourmash/sigs" }, // publish under <outdir>/sourmash/sigs
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename } // null for versions.yml, so that file is not published
+        ]
+    }
+    withName: 'SOURMASH_MULTISEARCH'{ // selector for the multisearch process
+        publishDir = [
+            path: { "${params.outdir}/sourmash/multisearch" }, // publish under <outdir>/sourmash/multisearch
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename } // null for versions.yml, so that file is not published
+        ]
+    }
+
 
 
 }
diff --git a/conf/test.config b/conf/test.config
@@ -25,9 +25,11 @@ params {
     input  = params.pipelines_testdata_base_path + 'kmerseek/samplesheet/snap25-samplesheet.csv'
 
     // Decrease this value to double check that seqkit split2 actually works
-    split2_size = 1000
+    split2_size = 100000
+
+    alphabet = "dayhoff"
 
     // Proteome references
-    fasta = params.pipelines_testdata_base_path + 'kmerseek/reference/uniprotkb_Synaptosomal_associated_2024_05_11.fasta.gz'
+    fasta = params.pipelines_testdata_base_path + 'kmerseek/reference/snap25_isoforms_human_P60880.fasta'
 }
 
diff --git a/docs/output.md b/docs/output.md
@@ -14,6 +14,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
 
 - [Seqkit Split2](#seqkit-split2) - split up a protein fasta file into smaller files for better performance with Sourmash
 - [Sourmash](#sourmash) - Convert protein sequence into k-mer signatures
+  - [Sketch](#sourmash-sketch)
+  - [Multisearch](#sourmash-multisearch-branchwater-plugin)
 - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
@@ -33,15 +35,30 @@ SeqKit is a cross-platform and ultrafast toolkit for FASTA/Q file manipulation.
 
 ### Sourmash
 
+#### Sourmash Sketch
+
 <details markdown="1">
 <summary>Output files</summary>
 
-- `sourmash/`
-  - `*.sig`: FastQC report containing quality metrics.
+- `sourmash/sigs/`
+  - `*.sig`: K-mer signature generated from protein sequence
 
 </details>
 
-[`Sourmash`](https://sourmash.readthedocs.io/) is a tool for genome analysis using k-mers. We specifically use the protein k-mer functionality to identify sequences with potentially related functions across large evolutionary distances. For further reading and documentation, see the [Sourmash Tutorials and Examples](https://sourmash.readthedocs.io/en/latest/sidebar.html)
+[`Sourmash`](https://sourmash.readthedocs.io/) is a tool for genome analysis using k-mers. We specifically use the protein k-mer functionality to identify sequences with potentially related functions across large evolutionary distances. For further reading and documentation, see the [Sourmash Tutorials and Examples](https://sourmash.readthedocs.io/en/latest/sidebar.html). Specifically, we use [`manysketch`](https://github.com/sourmash-bio/sourmash_plugin_branchwater/blob/main/doc/README.md#running-manysketch) from the [branchwater](https://github.com/sourmash-bio/sourmash_plugin_branchwater/) plugin for Sourmash to perform fast, parallelized sketch computation.
+
+The default arguments are: `"--singleton --param-string '$alphabet,scaled=1,k=$ksize,abund'"`, where the alphabet (or moltype) is a protein alphabet, and the k-size (k-mer size) is an integer.
+
+#### Sourmash Multisearch (Branchwater Plugin)
+
+<details markdown="1">
+<summary>Output files</summary>
+
+- `sourmash/multisearch/`
+  - `*.multisearch.csv`: Results from searching the query sketches against the 'against' sketches
+  </details>
+
+We use the [branchwater](https://github.com/sourmash-bio/sourmash_plugin_branchwater/) plugin for [`Sourmash`](https://sourmash.readthedocs.io/) to perform fast, parallelized search using Rust-optimized Python code. Specifically, we use the [`multisearch`](https://github.com/sourmash-bio/sourmash_plugin_branchwater/blob/main/doc/README.md#Running-multisearch-and-pairwise) plugin, which loads all the 'against' (aka 'target' or 'database') sketches into memory, and computes the [probability of overlap](https://github.com/sourmash-bio/sourmash_plugin_branchwater/pull/458) between the query and against sketches, useful for ranking matches that are less likely to be by chance.
 
 ### MultiQC
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -43,7 +43,7 @@ The typical command for running the pipeline is as follows:
 nextflow run nf-core/kmerseek --input ./samplesheet.csv --outdir ./results --fasta uniref50.fasta -profile docker
 ```
 
-This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.
+This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. `--fasta` is the reference file against which all the samples in `samplesheet.csv` are compared.
 
 Note that the pipeline will create the following files in your working directory:
 

diff --git a/modules/local/sourmash/manysketch/Dockerfile b/modules/local/sourmash/manysketch/Dockerfile
@@ -1,17 +1,44 @@
-FROM nfcore/base:2.1
+# Miniconda3 uses a faster solver than the one available for nf-core/base:2.1
+FROM continuumio/miniconda3
 LABEL \
     author="Olga Botvinnik" \
-    description="sourmash branchwater image for nf-core pipelines" \
+    description="sourmash branchwater image with latest commit for nf-core pipelines" \
     maintainer="olga.botvinnik@gmail.com"
 
 # Install the conda environment
-COPY environment.yml /
-RUN conda env create -f /environment.yml && conda clean -a
 
-ARG SOURMASH_BRANCHWATER_VERSION=0.9.3
+ADD https://api.github.com/repos/sourmash-bio/sourmash_plugin_branchwater/git/refs/heads/main version.json
+RUN git clone --depth 1 https://github.com/sourmash-bio/sourmash_plugin_branchwater/
+RUN cd sourmash_plugin_branchwater \
+    && conda env create -y -n sourmash-branchwater -f environment.yml
+
+# COPY environment.yml /
+# RUN conda env create -f /environment.yml && conda clean -a
+
+# ARG SOURMASH_BRANCHWATER_VERSION=0.9.3
 
 # Add conda installation dir to PATH (instead of doing 'conda activate')
-ENV PATH /opt/conda/envs/nf-core-sourmash-branchwater-${SOURMASH_BRANCHWATER_VERSION}/bin:$PATH
+ENV PATH=/opt/conda/envs/sourmash-branchwater/bin:$PATH
+
+
+RUN which -a conda
+RUN which -a pip
+RUN which -a maturin
+
+# Need to install clang libraries (LIBCLANG_PATH is pointed at them below); apt-get is the script-stable CLI
+RUN apt-get update && apt-get install -y clang
+
+RUN which -a clang
+
+RUN cd sourmash_plugin_branchwater \
+    && which -a pip \
+    && which -a clang \
+    && which -a maturin \
+    && export LIBCLANG_PATH=/usr/lib/x86_64-linux-gnu/ \
+    && export CONDA_PREFIX=/opt/conda \
+    && pip install -e . \
+    && maturin develop
+
 
 # Dump the details of the installed packages to a file for posterity
-RUN conda env export --name nf-core-snpeff-${SOURMASH_BRANCHWATER_VERSION} > nf-core-snpeff-${SOURMASH_BRANCHWATER_VERSION}.yml
+RUN conda env export --name sourmash-branchwater > nf-core-sourmash-branchwater.yml
diff --git a/modules/local/sourmash/manysketch/Makefile b/modules/local/sourmash/manysketch/Makefile
@@ -1,5 +1,6 @@
 all: build push
 
+# TODO: Switch to podman for building and quay.io for hosting images at some point
 build:
 	docker build -t olgabot/sourmash_branchwater .
 

diff --git a/modules/local/sourmash/manysketch/main.nf b/modules/local/sourmash/manysketch/main.nf
@@ -1,13 +1,14 @@
 process SOURMASH_MANYSKETCH {
-    tag "${meta.id}_k${ksize}"
+    tag "${meta.id}_${alphabet}_k${ksize}"
 
     conda "${moduleDir}/environment.yml"
-    container "docker.io/olgabot/sourmash_branchwater"
+    container "docker.io/olgabot/sourmash_branchwater:latest"
 
     input:
     tuple val(meta), path(sequence)
     val(alphabet)
     each ksize
+    val(query_or_against)
 
     output:
     tuple val(meta), path("*.sig.zip"), emit: signatures
@@ -34,7 +35,7 @@ process SOURMASH_MANYSKETCH {
         --debug \\
         -c $task.cpus \\
         $args \\
-        --output '${prefix}.${alphabet}.k${ksize}.sig.zip' \\
+        --output '${query_or_against}.${prefix}.${alphabet}.k${ksize}.sig.zip' \\
         ${meta.id}__manysketch.csv
 
     cat <<-END_VERSIONS > versions.yml

diff --git a/modules/local/sourmash/multisearch/Dockerfile b/modules/local/sourmash/multisearch/Dockerfile
@@ -0,0 +1,44 @@
+# Miniconda3 uses a faster solver than the one available for nf-core/base:2.1
+FROM continuumio/miniconda3
+LABEL \
+    author="Olga Botvinnik" \
+    description="sourmash branchwater image with specialized probability of overlap calculation for multisearch for nf-core pipelines" \
+    maintainer="olga.botvinnik@gmail.com"
+
+# Install the conda environment
+# NOTE(review): the ADD of the branch head ref presumably invalidates the layer cache when the branch moves -- confirm
+ADD https://api.github.com/repos/sourmash-bio/sourmash_plugin_branchwater/git/refs/heads/olgabot/multisearch-evalue version.json
+RUN git clone --depth 1 --branch olgabot/multisearch-evalue https://github.com/sourmash-bio/sourmash_plugin_branchwater/
+RUN cd sourmash_plugin_branchwater \
+    && conda env create -y -n sourmash-branchwater-multisearch-prob-overlap -f environment.yml
+
+# COPY environment.yml /
+# RUN conda env create -f /environment.yml && conda clean -a
+
+# ARG SOURMASH_BRANCHWATER_VERSION=0.9.3
+
+# Add conda installation dir to PATH (instead of doing 'conda activate')
+ENV PATH=/opt/conda/envs/sourmash-branchwater-multisearch-prob-overlap/bin:$PATH
+
+
+RUN which -a conda
+RUN which -a pip
+RUN which -a maturin
+
+# Need to install clang libraries (LIBCLANG_PATH is pointed at them below); apt-get is the script-stable CLI
+RUN apt-get update && apt-get install -y clang
+
+RUN which -a clang
+
+RUN cd sourmash_plugin_branchwater \
+    && which -a pip \
+    && which -a clang \
+    && which -a maturin \
+    && export LIBCLANG_PATH=/usr/lib/x86_64-linux-gnu/ \
+    && export CONDA_PREFIX=/opt/conda \
+    && pip install -e . \
+    && maturin develop
+
+
+# Dump the details of the installed packages to a file for posterity (no version ARG is defined, so use a fixed name)
+RUN conda env export --name sourmash-branchwater-multisearch-prob-overlap > nf-core-sourmash-branchwater-multisearch-prob-overlap.yml
diff --git a/modules/local/sourmash/multisearch/Makefile b/modules/local/sourmash/multisearch/Makefile
@@ -0,0 +1,17 @@
+# Name of the Docker image that is built and pushed by the targets below
+NAME := olgabot/sourmash_branchwater_multisearch_prob_overlap
+
+# All targets are commands; none of them produces a file with the target's name
+.PHONY: all build login push
+
+all: build login push
+
+# TODO: Switch to podman for building and quay.io for hosting images at some point
+build:
+	docker build -t ${NAME} .
+
+login:
+	docker login
+
+push:
+	docker push ${NAME}
diff --git a/modules/local/sourmash/multisearch/environment.yml b/modules/local/sourmash/multisearch/environment.yml
@@ -0,0 +1,8 @@
+name: nf-core-sourmash-branchwater-0.9.3
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - conda-forge::sourmash_plugin_branchwater=0.9.3
+  - conda-forge::sourmash-minimal=4.8.8
diff --git a/modules/local/sourmash/multisearch/main.nf b/modules/local/sourmash/multisearch/main.nf
@@ -0,0 +1,77 @@
+// Search query signatures against 'against' signatures with
+// `sourmash scripts multisearch` and write the matches to a CSV file.
+// `meta` supplies the alphabet (moltype) and ksize used for the search;
+// `query_meta.id` and `against_meta.id` are used to name the output file.
+process SOURMASH_MULTISEARCH {
+    tag "${meta.alphabet}_k${meta.ksize}"
+    label "process_medium"
+
+    conda "${moduleDir}/environment.yml"
+    container "docker.io/olgabot/sourmash_branchwater_multisearch_prob_overlap:latest"
+
+    input:
+    tuple val(meta), val(query_meta), path(query_sig), val(against_meta), path(against_sig)
+
+    output:
+    tuple val(meta), path("*.csv"), emit: csv
+    path "versions.yml"           , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // $ sourmash scripts multisearch --help
+
+    // == This is sourmash version 4.8.11. ==
+    // == Please cite Irber et. al (2024), doi:10.21105/joss.06830. ==
+
+    // usage:  multisearch [-h] [-q] [-d] -o OUTPUT [-t THRESHOLD] [-k KSIZE] [-s SCALED] [-m {DNA,protein,dayhoff,hp}] [-c CORES] [-a] query_paths against_paths
+
+    // massively parallel in-memory sketch search
+
+    // positional arguments:
+    // query_paths           input file of sketches
+    // against_paths         input file of sketches
+
+    // options:
+    // -h, --help            show this help message and exit
+    // -q, --quiet           suppress non-error output
+    // -d, --debug           provide debugging output
+    // -o OUTPUT, --output OUTPUT
+    //                         CSV output file for matches
+    // -t THRESHOLD, --threshold THRESHOLD
+    //                         containment threshold for reporting matches (default: 0.01)
+    // -k KSIZE, --ksize KSIZE
+    //                         k-mer size at which to select sketches
+    // -s SCALED, --scaled SCALED
+    //                         scaled factor at which to do comparisons
+    // -m {DNA,protein,dayhoff,hp}, --moltype {DNA,protein,dayhoff,hp}
+    //                         molecule type (DNA, protein, dayhoff, or hp; default DNA)
+    // -c CORES, --cores CORES
+    //                         number of cores to use (default is all available)
+    // -a, --ani             estimate ANI from containment
+    //
+    // Example run:
+    // sourmash scripts multisearch query.sig.gz database.zip -o results.csv
+
+    // Defaults required for the tool to run; setting `ext.args` for this process
+    // replaces them entirely, so an override must supply its own ksize/moltype.
+    def args = task.ext.args ?: "--ksize ${meta.ksize} --moltype ${meta.alphabet} --threshold 0 --scaled 1"
+    def prefix = task.ext.prefix ?: "${query_meta.id}--in--${against_meta.id}.${meta.alphabet}.${meta.ksize}"
+    def BRANCHWATER_VERSION = '0.9.3' // Version not available using command line
+    """
+    sourmash scripts multisearch \\
+        --debug \\
+        -c $task.cpus \\
+        $args \\
+        --output '${prefix}.multisearch.csv' \\
+        ${query_sig} \\
+        ${against_sig}
+
+    cat <<-END_VERSIONS > versions.yml
+"${task.process}":
+    sourmash: \$(echo \$(sourmash --version 2>&1) | sed 's/^sourmash //' )
+    sourmash_plugin_branchwater: $BRANCHWATER_VERSION
+END_VERSIONS
+    """
+}
diff --git a/modules/local/sourmash/multisearch/meta.yml b/modules/local/sourmash/multisearch/meta.yml
@@ -0,0 +1,59 @@
+name: sourmash_multisearch
+description: Search query sketches against a collection of 'against' sketches in memory with the sourmash branchwater multisearch command
+keywords:
+  - hash sketch
+  - sourmash
+  - multisearch
+  - genomics
+  - metagenomics
+  - taxonomic classification
+  - taxonomic profiling
+  - kmer
+tools:
+  - sourmash:
+      description: Compute and compare FracMinHash signatures for DNA and protein data sets.
+      homepage: https://sourmash.readthedocs.io/
+      documentation: https://sourmash.readthedocs.io/
+      tool_dev_url: https://github.com/sourmash-bio/sourmash
+      doi: "10.21105/joss.00027"
+      licence: ["BSD-3-clause"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing the search parameters
+        e.g. [ alphabet:'dayhoff', ksize:7 ]
+  - query_meta:
+      type: map
+      description: |
+        Groovy Map containing query sample information
+        e.g. [ id:'query' ]
+  - query_sig:
+      type: file
+      description: Sourmash signature file of the query sequences
+  - against_meta:
+      type: map
+      description: |
+        Groovy Map containing information about the 'against' (database) sequences
+        e.g. [ id:'against' ]
+  - against_sig:
+      type: file
+      description: Sourmash signature file of the 'against' (database) sequences
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing the search parameters
+        e.g. [ alphabet:'dayhoff', ksize:7 ]
+  - csv:
+      type: file
+      description: CSV file of multisearch matches between the query and 'against' signatures
+      pattern: "*.csv"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@Midnighter,@olgabot"
+maintainers:
+  - "@olgabot"
diff --git a/nextflow.config b/nextflow.config
@@ -29,7 +29,7 @@ params {
     ksizes                     = '5,6,7,8,9'
 
     // Split up the input databases into 100,000 sequences at a time
-    split2_size                 = 100000
+    split2_size                 = 100000000
 
     // Boilerplate options
     outdir                       = null