seanome · olgabot · May 18, 2024 · May 11, 2024 · May 11, 2024 · May 11, 2024
diff --git a/.editorconfig b/.editorconfig
@@ -8,6 +8,10 @@ trim_trailing_whitespace = true
 indent_size = 4
 indent_style = space
 
+[Makefile]
+indent_style = tab
+indent_size = 1
+
 [*.{md,yml,yaml,html,css,scss,js}]
 indent_size = 2
 

diff --git a/.gitignore b/.gitignore
@@ -6,3 +6,4 @@ results/
 testing/
 testing*
 *.pyc
+null/
diff --git a/Makefile b/Makefile
@@ -0,0 +1,29 @@
+# Convenience targets for running, debugging, cleaning and linting the pipeline.
+# None of these targets produces a file named after itself, so all of them are
+# declared phony; otherwise a stray file called e.g. `clean` would disable them.
+.PHONY: test_conda test_docker debug_conda debug_docker clean lint
+
+# Run the pipeline with the conda and test profiles, after cleaning.
+test_conda: clean
+	nextflow run -profile conda,test --outdir ./results .
+
+# Run the pipeline with the docker and test profiles, after cleaning.
+test_docker: clean
+	nextflow run -profile docker,test --outdir ./results .
+
+# Same as test_conda, with the debug profile added.
+debug_conda: clean
+	nextflow run . -profile debug,test,conda --outdir ./results
+
+# Same as test_docker, with the debug profile added.
+debug_docker: clean
+	nextflow run . -profile debug,test,docker --outdir ./results
+
+# Remove everything matching .nextflow*, plus the results and work directories.
+clean:
+	rm -rf .nextflow* results work
+
+# Run the pre-commit hooks on all files, then the nf-core linter.
+lint:
+	pre-commit run --all-files
+	nf-core lint
diff --git a/README.md b/README.md
@@ -5,21 +5,21 @@
   </picture>
 </h1>
 
-[![GitHub Actions CI Status](https://github.com/nf-core/kmerseek/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/kmerseek/actions/workflows/ci.yml)
-[![GitHub Actions Linting Status](https://github.com/nf-core/kmerseek/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/kmerseek/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/kmerseek/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)
+[![GitHub Actions CI Status](https://github.com/olgabot/nf-core-kmerseek/actions/workflows/ci.yml/badge.svg)](https://github.com/olgabot/nf-core-kmerseek/actions/workflows/ci.yml)
+[![GitHub Actions Linting Status](https://github.com/olgabot/nf-core-kmerseek/actions/workflows/linting.yml/badge.svg)](https://github.com/olgabot/nf-core-kmerseek/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/kmerseek/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)
 [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)
 
 [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/)
 [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)
 [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)
 [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)
-[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/kmerseek)
+[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/olgabot/nf-core-kmerseek)
 
 [![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23kmerseek-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/kmerseek)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)
 
 ## Introduction
 
-**nf-core/kmerseek** is a bioinformatics pipeline that ...
+**nf-core/kmerseek** is a bioinformatics pipeline that, like BLAST, searches for similar sequences using a query in a database, but unlike BLAST, uses protein k-mers. The goal is to find proteins of similar function.
 
 <!-- TODO nf-core:
    Complete this sentence with a 2-3 sentence summary of what types of data the pipeline ingests, a brief overview of the
@@ -31,8 +31,11 @@
      workflows use the "tube map" design for that. See https://nf-co.re/docs/contributing/design_guidelines#examples for examples.   -->
 <!-- TODO nf-core: Fill in short bullet-pointed list of the default steps in the pipeline -->
 
-1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/))
-2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
+1. Converts protein sequences into k-mer signatures ([`Sourmash`](https://sourmash.readthedocs.io/))
+2. [TODO] Searches query signature in target ([`Sourmash`](https://sourmash.readthedocs.io/))
+3. [TODO] Uses custom code to identify statistically significant matches
+4. [TODO] Uses custom code to show matching k-mer sequences
+5. Present QC for raw reads ([`MultiQC`](http://multiqc.info/))
 
 ## Usage
 
@@ -63,6 +66,7 @@ Now, you can run the pipeline using:
 nextflow run nf-core/kmerseek \
    -profile <docker/singularity/.../institute> \
    --input samplesheet.csv \
+   --fasta uniref50.fa.gz \
    --outdir <OUTDIR>
 ```
 

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,3 +1,3 @@
-sample,fastq_1,fastq_2
-SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz
-SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz,
+sample,fasta
+SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fasta
+SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fasta.gz
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -13,21 +13,14 @@
                 "errorMessage": "Sample name must be provided and cannot contain spaces",
                 "meta": ["id"]
             },
-            "fastq_1": {
+            "fasta": {
                 "type": "string",
                 "format": "file-path",
                 "exists": true,
-                "pattern": "^\\S+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
-            },
-            "fastq_2": {
-                "type": "string",
-                "format": "file-path",
-                "exists": true,
-                "pattern": "^\\S+\\.f(ast)?q\\.gz$",
-                "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'"
+                "pattern": "^\\S+\\.f(ast|a)?a(\\.gz)?$",
+                "errorMessage": "FastA file must be provided, cannot contain spaces and must have extension '.fa' or '.faa' or '.fasta' or '.fa.gz' or '.faa.gz' or '.fasta.gz'"
             }
         },
-        "required": ["sample", "fastq_1"]
+        "required": ["sample", "fasta"]
     }
 }
diff --git a/conf/base.config b/conf/base.config
@@ -59,4 +59,11 @@ process {
         errorStrategy = 'retry'
         maxRetries    = 2
     }
+    withName:SOURMASH_MANYSKETCH {
+        // Pick resources from the input size: 1_000_000 = 1 megabyte. Most query sequences are less than 1MB in size,
+        // while the database is usually MUCH larger than 1MB, like multiple GBs. Every value scales with task.attempt.
+        memory = { sequence.size() < 1_000_000 ? check_max( 1.GB * task.attempt, 'memory' ) : check_max( 200.GB * task.attempt, 'memory' ) }
+        cpus = { sequence.size() < 1_000_000 ? check_max( 1 * task.attempt, 'cpus' ) : check_max( 16 * task.attempt, 'cpus' ) }
+        time = { sequence.size() < 1_000_000 ? check_max( 1.h * task.attempt, 'time' ) : check_max( 16.h * task.attempt, 'time' ) }
+    }
 }
diff --git a/conf/modules.config b/conf/modules.config
@@ -31,4 +31,12 @@ process {
         ]
     }
 
+    withName: SOURMASH_SKETCH {
+        ext.args = {
+            // Any alphabet other than "protein" is passed as an extra --<alphabet> flag
+            def alphabet_flag = params.alphabet == "protein" ? "" : " --$params.alphabet"
+            "protein --singleton${alphabet_flag} --param-string 'scaled=1,$params.ksizes,abund'"
+        }
+    }
+
 }
diff --git a/conf/test.config b/conf/test.config
@@ -22,8 +22,8 @@ params {
     // Input data
     // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets
     // TODO nf-core: Give any required params for the test so that command line flags are not needed
-    input  = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv'
+    input  = params.pipelines_testdata_base_path + 'kmerseek/samplesheet/snap25-samplesheet.csv'
 
-    // Genome references
-    genome = 'R64-1-1'
+    // Proteome references
+    fasta = params.pipelines_testdata_base_path + 'kmerseek/reference/uniprotkb_Synaptosomal_associated_2024_05_11.fasta.gz'
 }
diff --git a/docs/output.md b/docs/output.md
@@ -12,32 +12,21 @@ The directories listed below will be created in the results directory after the
 
 The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
 
-- [FastQC](#fastqc) - Raw read QC
+- [Sourmash](#sourmash) - Convert protein sequence into k-mer signatures
 - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline
 - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
 
-### FastQC
+### Sourmash
 
 <details markdown="1">
 <summary>Output files</summary>
 
-- `fastqc/`
-  - `*_fastqc.html`: FastQC report containing quality metrics.
-  - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images.
+- `sourmash/`
+  - `*.sig`: Sourmash k-mer signature of the input protein sequences.
 
 </details>
 
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
-
-![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png)
-
-![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png)
-
-![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png)
-
-:::note
-The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality.
-:::
+[`Sourmash`](https://sourmash.readthedocs.io/) is a tool for genome analysis using k-mers. We specifically use the protein k-mer functionality to identify sequences with potentially related functions across large evolutionary distances. For further reading and documentation, see the [Sourmash Tutorials and Examples](https://sourmash.readthedocs.io/en/latest/sidebar.html).
 
 ### MultiQC
 

diff --git a/docs/usage.md b/docs/usage.md
@@ -10,45 +10,28 @@
 
 ## Samplesheet input
 
-You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below.
+You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 2 columns, and a header row as shown in the examples below.
 
 ```bash
 --input '[path to samplesheet file]'
 ```
 
-### Multiple runs of the same sample
-
-The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes:
-
-```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz
-CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz
-```
-
 ### Full samplesheet
 
 The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below.
 
 A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice.
 
 ```csv title="samplesheet.csv"
-sample,fastq_1,fastq_2
-CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz
-CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz
-CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz
-TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz,
-TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz,
-TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz,
+sample,fasta
+snap25a,snap25a_mxe_exon.fa
+snap25b,snap25b_mxe_exon.fa.gz
 ```
 
-| Column    | Description                                                                                                                                                                            |
-| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `sample`  | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
-| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
-| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz".                                                             |
+| Column   | Description                                                                                                                                                                            |
+| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). |
+| `fasta`  | Full path to FastA file containing protein sequences. File doesn't need to be gzipped, but needs to end with `.fa`, `.faa` or `.fasta`, optionally followed by `.gz`.                 |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
@@ -57,7 +40,7 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p
 The typical command for running the pipeline is as follows:
 
 ```bash
-nextflow run nf-core/kmerseek --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker
+nextflow run nf-core/kmerseek --input ./samplesheet.csv --outdir ./results --fasta uniref50.fasta -profile docker
 ```
 
 This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles.

diff --git a/modules.json b/modules.json
@@ -14,6 +14,11 @@
                         "branch": "master",
                         "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a",
                         "installed_by": ["modules"]
+                    },
+                    "sourmash/sketch": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
                     }
                 }
             },

diff --git a/modules/local/sourmash/manysketch/Dockerfile b/modules/local/sourmash/manysketch/Dockerfile
@@ -0,0 +1,17 @@
+FROM nfcore/base:2.1
+LABEL \
+    author="Olga Botvinnik" \
+    description="sourmash branchwater image for nf-core pipelines" \
+    maintainer="olga.botvinnik@gmail.com"
+
+# Create the conda environment described by environment.yml, then clean conda's caches
+COPY environment.yml /
+RUN conda env create -f /environment.yml && conda clean -a
+
+ARG SOURMASH_BRANCHWATER_VERSION=0.9.3
+
+# Add conda installation dir to PATH (instead of doing 'conda activate'); the env name must match environment.yml
+ENV PATH=/opt/conda/envs/nf-core-sourmash-branchwater-${SOURMASH_BRANCHWATER_VERSION}/bin:$PATH
+
+# Dump the details of the installed packages to a file for posterity (export the env created above, not a snpeff one)
+RUN conda env export --name nf-core-sourmash-branchwater-${SOURMASH_BRANCHWATER_VERSION} > nf-core-sourmash-branchwater-${SOURMASH_BRANCHWATER_VERSION}.yml
diff --git a/modules/local/sourmash/manysketch/Makefile b/modules/local/sourmash/manysketch/Makefile
@@ -0,0 +1,15 @@
+# Build and publish the olgabot/sourmash_branchwater docker image from this directory.
+# The targets are commands, not files, so they are declared phony.
+.PHONY: all build push
+
+# Default goal: build the image, then push it.
+all: build push
+
+# Build the image using the current directory as the docker build context.
+build:
+	docker build -t olgabot/sourmash_branchwater .
+
+# Publish the image. Depends on build so that `make -j` cannot start the push
+# before the build has finished.
+push: build
+	docker push olgabot/sourmash_branchwater
diff --git a/modules/local/sourmash/manysketch/environment.yml b/modules/local/sourmash/manysketch/environment.yml
@@ -0,0 +1,8 @@
+name: nf-core-sourmash-branchwater-0.9.3
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - conda-forge::sourmash_plugin_branchwater=0.9.3
+  - conda-forge::sourmash-minimal=4.8.8
diff --git a/modules/local/sourmash/manysketch/main.nf b/modules/local/sourmash/manysketch/main.nf
@@ -0,0 +1,42 @@
+process SOURMASH_MANYSKETCH { // Sketch one sequence file into a zipped signature with `sourmash scripts manysketch`
+    tag "${meta.id}_k${ksize}"
+    label 'process_low'
+
+    conda "${moduleDir}/environment.yml"
+    container "docker.io/olgabot/sourmash_branchwater"
+
+    input:
+    tuple val(meta), path(sequence) // meta.id names the task, the CSV and (by default) the output; sequence is the file to sketch
+    val(alphabet) // interpolated into --param-string and into the output file name
+    each ksize // `each` input: the process is run once per ksize value
+
+    output:
+    tuple val(meta), path("*.sig.zip"), emit: signatures
+    path "versions.yml"               , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // sketching options are assembled from the alphabet and ksize inputs; task.ext.args is not read here
+    def args = "--singleton --param-string '$alphabet,scaled=1,k=$ksize,abund'"
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def BRANCHWATER_VERSION = '0.9.3' // Version not available using command line, so hard-coded; matches the pin in environment.yml
+    """
+    # manysketch only accepts CSV files (can't use fastas directly),
+    # so create a CSV file with the fasta sequence name
+    echo "name,genome_filename,protein_filename\n${meta.id},,${sequence}" > ${meta.id}__manysketch.csv
+    sourmash scripts manysketch \\
+        --debug \\
+        -c $task.cpus \\
+        $args \\
+        --output '${prefix}.${alphabet}.k${ksize}.sig.zip' \\
+        ${meta.id}__manysketch.csv
+
+    cat <<-END_VERSIONS > versions.yml
+"${task.process}":
+    sourmash: \$(echo \$(sourmash --version 2>&1) | sed 's/^sourmash //' )
+    sourmash_plugin_branchwater: $BRANCHWATER_VERSION
+END_VERSIONS
+    """
+}
diff --git a/modules/local/sourmash/manysketch/meta.yml b/modules/local/sourmash/manysketch/meta.yml
@@ -0,0 +1,51 @@
+name: sourmash_manysketch
+description: Create a signature (a group of FracMinHash sketches) of a sequence using sourmash
+keywords:
+  - hash sketch
+  - sourmash
+  - genomics
+  - metagenomics
+  - taxonomic classification
+  - taxonomic profiling
+  - kmer
+tools:
+  - sourmash:
+      description: Compute and compare FracMinHash signatures for DNA and protein data sets.
+      homepage: https://sourmash.readthedocs.io/
+      documentation: https://sourmash.readthedocs.io/
+      tool_dev_url: https://github.com/sourmash-bio/sourmash
+      doi: "10.21105/joss.00027"
+      licence: ["BSD-3-clause"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - sequence:
+      type: file
+      description: FASTA or FASTQ file containing (genomic, transcriptomic, or proteomic) sequence data
+      pattern: "*.{fna,fa,fasta,fastq,fq,faa}.gz"
+  - ksize:
+      type: int
+      description: |
+        The k-mer size to use for the MinHash sketches.
+        e.g.: 31
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - signatures:
+      type: file
+      description: FracMinHash signature of the given sequence
+      pattern: "*.sig.zip"
+authors:
+  - "@Midnighter,@olgabot"
+maintainers:
+  - "@olgabot"
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,3 +6,4 @@ results/ @@
     testing/
     testing*
     *.pyc
+    null/