Merge pull request #158 from rhpvorderman/release_0.9.0

Release 0.9.0
rhpvorderman · May 21, 2024 · c953a71 · c953a71
2 parents a39ceee + ed93b86
commit c953a71
Show file tree

Hide file tree

Showing 15 changed files with 406 additions and 51 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -7,6 +7,25 @@ Changelog
 .. This document is user facing. Please word the changes in such a way
 .. that users understand how the changes affect the new version.
 
+version 0.9.0
+-----------------
++ MultiQC support since MultiQC version 1.22
++ Sort modules for paired end reports in the same order as single end reports.
+  For example, the sequence length distributions for read 1 and read 2 are now
+  right after each other.
++ Add common human genome repeats and Illumina poly-G dark cycles to the
+  overrepresented sequences database.
++ Illumina adapter trimming sequences were added to the contaminants database
+  as these were missing from the UniVec database.
++ Sequence identity, rather than kmers matched, is shown as a metric for
+  similarity in the overrepresented sequences table.
++ Overrepresented sequence classification now uses stable sorting to ensure
+  the classification results are the same on each rerun.
++ Overrepresented sequences are now classified using Smith-Waterman alignment
+  and sequence identity.
++ Fix an off by one error in the insert size metrics that was triggered for
+  insert sizes larger than 300 bp.
+
 version 0.8.0
 -----------------
 + A citation file was added to the repository.
@@ -67,7 +86,7 @@ version 0.5.0
   understand. There were some inconsistencies in the documentation about this
   that are now fixed.
 + Add a new `meta` section to the JSON report to allow integration with
-  `MultiQC <https://github.com/multiqc/MultiQC>`_.
+  `MultiQC <https://multiqc.info>`_.
 + Add all nanopore barcode sequences and native adapters to the contaminants.
 + Add native adapters to the adapter search.
 

diff --git a/README.rst b/README.rst
@@ -39,6 +39,7 @@ Sequence quality metrics for FASTQ and uBAM files.
 
 Features:
 
++ `MultiQC <https://multiqc.info>`_ support since MultiQC version 1.22.
 + Low memory footprint, small install size and fast execution times.
 + Informative graphs that allow for judging the quality of a sequence at
   a quick glance.

diff --git a/scripts/benchmark_sequence_identity.py b/scripts/benchmark_sequence_identity.py
@@ -0,0 +1,14 @@
+import json
+import sys
+
+from sequali.sequence_identification import identify_sequence_builtin
+
+
+if __name__ == "__main__":
+    json_file = sys.argv[1]
+    with open(json_file, "rb") as f:
+        data = json.load(f)
+    sequence_dicts = data["overrepresented_sequences"]["overrepresented_sequences"]
+    for seqdict in sequence_dicts:
+        identify_sequence_builtin(seqdict["sequence"])
+
diff --git a/scripts/check_presence_in_fasta.py b/scripts/check_presence_in_fasta.py
@@ -0,0 +1,21 @@
+import argparse
+
+from sequali.util import fasta_parser
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("fasta")
+    parser.add_argument("sequence")
+    args = parser.parse_args()
+
+    total = 0
+    sequence = args.sequence.upper()
+    for name, contig in fasta_parser(args.fasta):
+        contig = contig.upper()
+        total += contig.count(sequence)
+    print(total)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/setup.py b/setup.py
@@ -19,6 +19,7 @@
 
 setup(
     ext_modules=[
-        Extension("sequali._qc", ["src/sequali/_qcmodule.c"])
+        Extension("sequali._qc", ["src/sequali/_qcmodule.c"]),
+        Extension("sequali._seqident", ["src/sequali/_seqidentmodule.c"])
     ],
 )
diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
@@ -4629,7 +4629,7 @@ InsertSizeMetrics__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs)
     size_t hash_table_bits = (size_t)(log2(max_adapters * 1.5) + 1);
 
     self->max_adapters = max_adapters;
-    self->max_insert_size = 300;
+    self->max_insert_size = 0;
     self->hash_table_read1_entries = 0;
     self->hash_table_read2_entries = 0;
     self->hash_table_size = 1 << hash_table_bits; 
@@ -4664,7 +4664,7 @@ InsertSizeMetrics_resize(InsertSizeMetrics *self, size_t new_size)
         PyErr_NoMemory();
         return -1;
     }
-    memset(tmp + old_size, 0, (new_size - old_size) * sizeof(uint64_t));
+    memset(tmp + old_size + 1, 0, (new_size - old_size) * sizeof(uint64_t));
     self->max_insert_size = new_size;
     self->insert_sizes = tmp;
     return 0;

diff --git a/src/sequali/_seqident.pyi b/src/sequali/_seqident.pyi
@@ -0,0 +1,3 @@
+def sequence_identity(target: str, query: str, match_score: int = 1,
+                      mismatch_penalty: int = -1, deletion_penalty: int = -1,
+                      insertion_penalty: int = -1) -> float: ...
diff --git a/src/sequali/_seqidentmodule.c b/src/sequali/_seqidentmodule.c
@@ -0,0 +1,165 @@
+/*
+Copyright (C) 2023 Leiden University Medical Center
+This file is part of Sequali
+
+Sequali is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as
+published by the Free Software Foundation, either version 3 of the
+License, or (at your option) any later version.
+
+Sequali is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with Sequali.  If not, see <https://www.gnu.org/licenses/>.
+*/
+
+#include "Python.h"
+
+struct Entry {  /* One cell of an alignment matrix column. */
+    Py_ssize_t score;  /* Alignment score stored for this cell. */
+    Py_ssize_t query_matches;  /* Match count carried along with the score. */
+};
+
+/* Docstring exposed to Python as sequence_identity.__doc__. */
+PyDoc_STRVAR(sequence_identity__doc__, 
+"Calculate sequence identity based on a smith-waterman matrix. Only keep\n"
+"two columns in memory as no walk-back is needed.\n"
+"Identity is given as (query_length - errors) / query_length.\n"
+);
+
+#define sequence_identity_method METH_VARARGS | METH_KEYWORDS
+
+/**
+ * Python entry point: sequence_identity(target, query, match_score=1,
+ *     mismatch_penalty=-1, deletion_penalty=-1, insertion_penalty=-1)
+ *
+ * Fills a Smith-Waterman matrix one column per target character, keeping
+ * only the previous and the current column. Every cell stores its score and
+ * the number of query matches on the path leading to it. The match count of
+ * the highest scoring cell (ties resolved in favour of more matches) is
+ * divided by the query length and returned as a Python float.
+ *
+ * Raises ValueError when target or query is not an ASCII string, or when the
+ * query is longer than 31 characters. Returns NULL with an exception set on
+ * failure.
+ *
+ * NOTE(review): an empty query is not special-cased, so the final division
+ * is 0.0 / 0.0 in that case -- confirm callers never pass an empty query.
+ */
+static PyObject *
+sequence_identity(PyObject *module, PyObject *args, PyObject *kwargs)
+{
+    /* The name after ':' is the function name shown in parsing errors. */
+    static char *format = "UU|nnnn:sequence_identity";
+    /* Keyword names must match the signature published in _seqident.pyi. */
+    static char *kwnames[] = {
+        "target", "query", "match_score", "mismatch_penalty",
+        "deletion_penalty", "insertion_penalty", NULL
+    };
+    PyObject *target_obj = NULL;
+    PyObject *query_obj = NULL;
+    Py_ssize_t match_score = 1;
+    Py_ssize_t mismatch_penalty = -1;
+    Py_ssize_t deletion_penalty = -1;
+    Py_ssize_t insertion_penalty = -1;
+    if (!PyArg_ParseTupleAndKeywords(
+        args, kwargs, format, kwnames,
+        &target_obj, &query_obj, &match_score, &mismatch_penalty,
+        &deletion_penalty, &insertion_penalty)
+    ) {
+        return NULL;
+    }
+    if (!PyUnicode_IS_COMPACT_ASCII(target_obj)) {
+        PyErr_Format(
+            PyExc_ValueError,
+            "Only ascii strings are allowed. Got %R",
+            target_obj
+        );
+        return NULL;
+    }
+    if (!PyUnicode_IS_COMPACT_ASCII(query_obj)) {
+        /* Report the offending query, not the (valid) target. */
+        PyErr_Format(
+            PyExc_ValueError,
+            "Only ascii strings are allowed. Got %R",
+            query_obj
+        );
+        return NULL;
+    }
+    const uint8_t *target = PyUnicode_DATA(target_obj);
+    const uint8_t *query = PyUnicode_DATA(query_obj);
+    Py_ssize_t target_length = PyUnicode_GET_LENGTH(target_obj);
+    Py_ssize_t query_length = PyUnicode_GET_LENGTH(query_obj);
+    /* The columns below hold 32 entries: index 0 is the empty-prefix row and
+       indexes 1..31 map to query characters, hence the limit of 31. */
+    if (query_length > 31) {
+        PyErr_Format(
+            PyExc_ValueError,
+            "Only query with lengths less than 32 are supported. Got %zd",
+            query_length
+        );
+        return NULL;
+    }
+    Py_ssize_t highest_score = 0;
+    Py_ssize_t most_matches = 0;
+    struct Entry prev_column[32];
+    struct Entry new_column[32];
+    memset(prev_column, 0, 32 * sizeof(struct Entry));
+    memset(new_column, 0, 32 * sizeof(struct Entry));
+    for (Py_ssize_t i=0; i < target_length; i++) {
+        for (Py_ssize_t j=1; j < query_length + 1; j++) {
+            uint8_t target_char = target[i];
+            uint8_t query_char = query[j - 1];
+            /* Diagonal predecessor: both sequences advance one character. */
+            struct Entry prev_entry = prev_column[j-1];
+            Py_ssize_t linear_score;
+            Py_ssize_t linear_matches;
+            if (target_char == query_char) {
+                linear_score = prev_entry.score + match_score;
+                linear_matches = prev_entry.query_matches + 1;
+            } else {
+                linear_score = prev_entry.score + mismatch_penalty;
+                linear_matches = prev_entry.query_matches;
+            }
+            struct Entry prev_ins_entry = prev_column[j];
+            struct Entry prev_del_entry = new_column[j - 1];
+            Py_ssize_t insertion_score = prev_ins_entry.score + insertion_penalty;
+            Py_ssize_t deletion_score = prev_del_entry.score + deletion_penalty;
+            Py_ssize_t score;
+            Py_ssize_t matches;
+            if (linear_score >= insertion_score && linear_score >= deletion_score) {
+                score = linear_score;
+                matches = linear_matches;
+            } else if (insertion_score >= deletion_score) {
+                /* When an insertion happens in the query in theory we can
+                   match all query characters still. So deduct one as a penalty. */
+                score = insertion_score;
+                matches = prev_ins_entry.query_matches - 1;
+            } else {
+                /* When a deletion happens in the query, that character cannot
+                   match anything anymore. So no need to deduct a penalty. */
+                score = deletion_score;
+                matches = prev_del_entry.query_matches;
+            }
+            /* Local alignment: a negative score restarts the alignment. */
+            if (score < 0) {
+                score = 0;
+                matches = 0;
+            }
+            new_column[j].score = score;
+            new_column[j].query_matches = matches;
+            if (score == highest_score && matches > most_matches) {
+                most_matches = matches;
+            } else if (score > highest_score) {
+                highest_score = score;
+                most_matches = matches;
+            }
+        }
+        /* The finished column becomes the previous column for target[i+1]. */
+        memcpy(prev_column, new_column, sizeof(prev_column));
+    }
+    double identity = (double)most_matches / (double)query_length;
+    return PyFloat_FromDouble(identity);
+}
+
+static PyMethodDef _seqident_methods[] = {  /* Functions exported by the module. */
+    {"sequence_identity", (PyCFunction)sequence_identity, 
+     sequence_identity_method, sequence_identity__doc__},
+    {NULL},  /* Sentinel entry terminating the table. */
+};
+
+static struct PyModuleDef _seqident_module = {  /* Definition passed to PyModule_Create. */
+    PyModuleDef_HEAD_INIT,
+    "_seqident",  /* m_name */
+    NULL, /* Module documentation*/
+    -1,  /* m_size */
+    _seqident_methods,  /* m_methods */
+    .m_slots = NULL,
+};
+
+PyMODINIT_FUNC
+PyInit__seqident(void)
+{
+    /* Module initialisation: the result of PyModule_Create (the new module,
+       or NULL on failure) is returned to the caller unchanged. */
+    return PyModule_Create(&_seqident_module);
+}
diff --git a/src/sequali/contaminants/Illumina.fasta b/src/sequali/contaminants/Illumina.fasta
@@ -0,0 +1,10 @@
+>Illumina TruSeq RNA DNA sequence for adapter trimming, read1
+AGATCGGAAGAGCACACGTCTGAACTCCAGTCA
+>Illumina TruSeq RNA DNA sequence for adapter trimming, read2
+AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+>Illumina TruSeq Small RNA sequence for adapter trimming
+TGGAATTCTCGGGTGCCAAGG
+>Illumina Nextera/Ampliseq sequence for adapter trimming
+CTGTCTCTTATACACATCT
+>Poly-G. Illumina dark cycle pattern.
+GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG
diff --git a/src/sequali/contaminants/README b/src/sequali/contaminants/README
@@ -17,3 +17,17 @@ a lot of those that are used in Oxford nanopore sequencing. The
 oxford_nanopore.fasta file provides the sequences as represented in the
 technical documentation. All possible barcoding sequencing adapters for
 nanopore have not yet been added.
+
+Oxford Nanopore adapter and barcode sequences as listed in the
+technical documentation are included in oxford_nanopore.fasta and
+oxford_nanopore_barcodes.fasta.
+
+Illumina adapter trimming sequences are included in
+Illumina.fasta. These sequences are common among
+a variety of commercially provided adapters. To prevent a false positive ID
+on the commercially provided adapters when only a part of the common sequence
+is found, these sequences had to be added.
+
+
+Common human genome repeats were added on the basis of the tandem repeat
+database: https://tandem-test.bu.edu/cgi-bin/trdb/trdby.exe.
diff --git a/src/sequali/contaminants/common_human_repeats.fasta b/src/sequali/contaminants/common_human_repeats.fasta
@@ -0,0 +1,8 @@
+>Poly-A/T repeat. Common pattern in Human Genome.
+AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
+>Poly-AC/TG repeat. Common pattern in Human Genome.
+ACACACACACACACACACACACACACACACAC
+>Poly-CA/GT repeat. Common pattern in Human Genome.
+CACACACACACACACACACACACACACACACA
+>Poly-AT/TA repeat. Common pattern in Human Genome
+ATATATATATATATATATATATATATATATAT