C fastq (#86)

This adds a C interface for interacting with fastq files, plus the accompanying Python files. With this, you can use dnadesign to parse fastq files from Python. This builds dnadesign `0.1.4` for PyPI. It also cleans up some linter problems introduced in golangci-lint 1.60.
Koeng101 · Aug 16, 2024 · bb29c87 · bb29c87
1 parent 446e0ea
commit bb29c87
Show file tree

Hide file tree

Showing 20 changed files with 275 additions and 51 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -55,9 +55,9 @@ jobs:
       working-directory: ./py
       run: |
         if [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.arch }}" = "arm64" ]; then
-          CC="zig cc -target aarch64-linux-gnu" GOOS=linux GOARCH=arm64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
+          CC="zig cc -target aarch64-linux-gnu" CGO_ENABLED=1 GOOS=linux GOARCH=arm64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
         elif [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.arch }}" = "amd64" ]; then
-          CC="zig cc -target x86_64-linux-gnu" GOOS=linux GOARCH=amd64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
+          CC="zig cc -target x86_64-linux-gnu" CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
         elif [ "${{ runner.os }}" = "macOS" ] && [ "${{ matrix.arch }}" = "arm64" ]; then
           CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 go build -o dnadesign/libdnadesign.dylib -buildmode=c-shared lib.go
         elif [ "${{ runner.os }}" = "macOS" ] && [ "${{ matrix.arch }}" = "amd64" ]; then

diff --git a/.golangci.yml b/.golangci.yml
@@ -1,8 +1,8 @@
 run:
   timeout: 1m
-  skip-dirs:
+issues:
+  exclude-dirs:
     - data
-    - api/gen
     - lib/bio/slow5/svb
 linters:
   enable:

diff --git a/README.md b/README.md
@@ -75,6 +75,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
 ## [Unreleased]
+- Added fastq parsing to the Python package and released version 0.1.4 of the dnadesign Python package. [#86](https://github.com/Koeng101/dnadesign/pull/86)
 - Integrated errgroup into source tree [#84](https://github.com/Koeng101/dnadesign/pull/84)
 - Added kmer detection for ligation events in cloning and removed enzyme manager [#83](https://github.com/Koeng101/dnadesign/pull/83)
 - Added option for linear ligations [#82](https://github.com/Koeng101/dnadesign/pull/82)

diff --git a/lib/bio/fastq/fastq.go b/lib/bio/fastq/fastq.go
@@ -178,6 +178,9 @@ func (parser *Parser) Next() (Read, error) {
 	} else {
 		quality = string(line[:len(line)-1])
 	}
+	if len(sequence) != len(quality) {
+		return Read{}, fmt.Errorf("Got different lengths for sequence(%d) and quality(%d)", len(sequence), len(quality))
+	}
 
 	// Parsing ended. Check for inconsistencies.
 	if lookingForIdentifier {

diff --git a/lib/bio/fastq/fastq_test.go b/lib/bio/fastq/fastq_test.go
@@ -47,3 +47,13 @@ $$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0
 		t.Errorf("Optionals not parsed properly")
 	}
 }
+
+// TestSequenceQualityLength feeds the parser a record whose quality string
+// (2 characters) is shorter than its sequence (4 characters) and expects
+// Next to return an error.
+func TestSequenceQualityLength(t *testing.T) {
+	const maxLineSize = 2 * 32 * 1024
+	input := strings.NewReader("@test\nATCG\n+\nII\n")
+	if _, err := NewParser(input, maxLineSize).Next(); err == nil {
+		t.Errorf("Should have gotten error on quality vs sequence lengths")
+	}
+}
diff --git a/lib/bio/sam/sam.go b/lib/bio/sam/sam.go
@@ -305,8 +305,8 @@ func (alignment *Alignment) Validate() error {
 		return errors.New("Invalid RNAME: must match " + rnameRegex)
 	}
 
-	// 4. Validate POS
-	if alignment.POS < 0 || alignment.POS > 2147483647 { // 2^31 - 1
+	// 4. Validate POS.
+	if alignment.POS < 0 {
 		return errors.New("Invalid POS: must be in range [0, 2147483647]")
 	}
 
@@ -325,7 +325,7 @@ func (alignment *Alignment) Validate() error {
 	}
 
 	// 8. Validate PNEXT
-	if alignment.PNEXT < 0 || alignment.PNEXT > 2147483647 { // 2^31 - 1
+	if alignment.PNEXT < 0 { // 2^31 - 1
 		return errors.New("Invalid PNEXT: must be in range [0, 2147483647]")
 	}
 

diff --git a/lib/bio/slow5/slow5_test.go b/lib/bio/slow5/slow5_test.go
@@ -101,7 +101,7 @@ func testParseReadsHelper(t *testing.T, fileTarget string, errorMessage string)
 		}
 	}
 	if len(targetErr) == 0 {
-		t.Errorf(errorMessage)
+		t.Errorf("%s", errorMessage)
 	}
 }
 

diff --git a/lib/fold/mfe/checks/checks.go b/lib/fold/mfe/checks/checks.go
@@ -26,7 +26,7 @@ func checkRegexpMatchesFullString(str, regex, errMsg string) (bool, error) {
 	}
 
 	if !doCheckRegexpMatchesFullString(str, regexp) {
-		return false, fmt.Errorf(errMsg)
+		return false, fmt.Errorf("%s", errMsg)
 	}
 	return true, nil
 }

diff --git a/lib/seqhash/seqhash_test.go b/lib/seqhash/seqhash_test.go
@@ -36,33 +36,33 @@ func TestHash2(t *testing.T) {
 	// Test circular double stranded hashing
 	seqhash, _ := EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, true))
 	if seqhash != "A_6VAbBfXD8BSZh2HJZqgGgR" {
-		t.Errorf("Circular double stranded hashing failed. Expected A_6VAbBfXD8BSZh2HJZqgGgR, got: " + seqhash)
+		t.Errorf("Circular double stranded hashing failed. Expected A_6VAbBfXD8BSZh2HJZqgGgR, got: %s", seqhash)
 	}
 	// Test circular single stranded hashing
 	seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, false))
 	if seqhash != "B_5xKbuHELJCCQWJwQi7W1ak" {
-		t.Errorf("Circular single stranded hashing failed. Expected B_5xKbuHELJCCQWJwQi7W1ak, got: " + seqhash)
+		t.Errorf("Circular single stranded hashing failed. Expected B_5xKbuHELJCCQWJwQi7W1ak, got: %s", seqhash)
 	}
 	// Test linear double stranded hashing
 	seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, true))
 	if seqhash != "C_5Z2pHCXbxWUPYiZj6J1Nag" {
-		t.Errorf("Linear double stranded hashing failed. Expected C_5Z2pHCXbxWUPYiZj6J1Nag, got: " + seqhash)
+		t.Errorf("Linear double stranded hashing failed. Expected C_5Z2pHCXbxWUPYiZj6J1Nag, got: %s", seqhash)
 	}
 	// Test linear single stranded hashing
 	seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, false))
 	if seqhash != "D_4yT7etihWZHHNXUpbM5tUf" {
-		t.Errorf("Linear single stranded hashing failed. Expected D_4yT7etihWZHHNXUpbM5tUf, got: " + seqhash)
+		t.Errorf("Linear single stranded hashing failed. Expected D_4yT7etihWZHHNXUpbM5tUf, got: %s", seqhash)
 	}
 
 	// Test RNA Seqhash
 	seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "RNA", false, false))
 	if seqhash != "H_56cWv4dacvRJxUUcXYsdP5" {
-		t.Errorf("Linear single stranded hashing failed. Expected H_56cWv4dacvRJxUUcXYsdP5, got: " + seqhash)
+		t.Errorf("Linear single stranded hashing failed. Expected H_56cWv4dacvRJxUUcXYsdP5, got: %s", seqhash)
 	}
 	// Test Protein Seqhash
 	seqhash, _ = EncodeHash2(Hash2("MGC*", "PROTEIN", false, false))
 	if seqhash != "I_5DQsEyDHLh2r4njCcupAuF" {
-		t.Errorf("Linear single stranded hashing failed. Expected I_5DQsEyDHLh2r4njCcupAuF, got: " + seqhash)
+		t.Errorf("Linear single stranded hashing failed. Expected I_5DQsEyDHLh2r4njCcupAuF, got: %s", seqhash)
 	}
 }
 

diff --git a/lib/synthesis/fix/synthesis_test.go b/lib/synthesis/fix/synthesis_test.go
@@ -48,11 +48,11 @@ func BenchmarkCds(b *testing.B) {
 		for _, cutSite := range []string{"GAAGAC", "GGTCTC", "GCGATG", "CGTCTC", "GCTCTTC", "CACCTGC"} {
 			if strings.Contains(optimizedSeq, cutSite) {
 				fmt.Println(changes)
-				b.Errorf("phusion" + " contains " + cutSite)
+				b.Errorf("phusion contains %s", cutSite)
 			}
 			if strings.Contains(transform.ReverseComplement(optimizedSeq), cutSite) {
 				fmt.Println(changes)
-				b.Errorf("phusion" + " reverse complement contains " + cutSite)
+				b.Errorf("phusion reverse complement contains %s", cutSite)
 			}
 		}
 	}
@@ -84,10 +84,10 @@ func TestCds(t *testing.T) {
 
 	for _, cutSite := range []string{"GAAGAC", "GGTCTC", "GCGATG", "CGTCTC", "GCTCTTC", "CACCTGC"} {
 		if strings.Contains(optimizedSeq, cutSite) {
-			t.Errorf("phusion" + " contains " + cutSite)
+			t.Errorf("phusion contains %s", cutSite)
 		}
 		if strings.Contains(transform.ReverseComplement(optimizedSeq), cutSite) {
-			t.Errorf("phusion" + " reverse complement contains " + cutSite)
+			t.Errorf("phusion reverse complement contains %s", cutSite)
 		}
 	}
 

diff --git a/lib/synthesis/fragment/fragment_test.go b/lib/synthesis/fragment/fragment_test.go
@@ -9,7 +9,7 @@ func TestFragment(t *testing.T) {
 
 	_, _, err := Fragment(gene, 90, 110, []string{})
 	if err != nil {
-		t.Errorf(err.Error())
+		t.Error(err.Error())
 	}
 }
 
@@ -51,7 +51,7 @@ func TestLongFragment(t *testing.T) {
 	gene := "GGAGGGTCTCAATGCTGGACGATCGCAAATTCAGCGAACAGGAGCTGGTCCGTCGCAACAAATACAAAACGCTGGTCGAGCAAAACAAAGACCCGTACAAGATTACGAACTGGAAACGCAATACCACCCTGCTGAAACTGAATGAGAAATACAAAGACTATAGCAAGGAGGACCTGTTGAACCTGAATCAAGAACTGGTCGTTGTTGCAGGTCGTATCAAACTGTATCGTGAAGCCGGTAAAAAAGCTGCCTTTGTGAACATTGATGATCAAGACTCCTCTATTCAGTTGTACGTGCGCCTGGATGAGATCGGTGATCAGAGCTTCGAGGATTTCCGCAATTTCGACCTGGGTGACATCATTGGTGTTAAAGGTATCATGATGCGCACCGACCACGGCGAGTTGAGCATCCGTTGTAAGGAAGTCGTGCTGCTGAGCAAGGCCCTGCGTCCGCTGCCGGATAAACACGCGGGCATTCAGGATATTGAGGAAAAGTACCGCCGTCGCTATGTGGACCTGATTATGAATCACGACGTGCGCAAGACGTTCCAGGCGCGTACCAAGATCATTCGTACCTTGCAAAACTTTCTGGATAATAAGGGTTACATGGAGGTCGAAACCCCGATCCTGCATAGCCTGAAGGGTGGCGCGAGCGCGAAACCGTTTATTACCCACTACAATGTGCTGAATACGGATGTGTATCTGCGTATCGCGACCGAGCTGCACCTGAAACGCCTGATTGTTGGCGGTTTCGAGGGTGTGTATGAGATCGGTCGCATCTTTCGCAATGAAGGTATGTCCACGCGTCACAATCCGGAATTCACGTCTATCGAACTGTATGTCGCCTATGAGGACATGTTCTTTTTGATGGATCTGACCGAAGAGATTTTTCGCGTTTGTAATGCCGCAGTCAACAGCTCCAGCATCATTGAGTATAACAACGTGAAAATTGACCTGAGCAAGCCGTTTAAGCGCCTGCATATGGTTGACGGTATTAAACAGGTGACCGGCGTCGACTTCTGGCAGGAGATGACGGTCCAACAGGCTCTGGAGCTGGCCAAAAAGCATAAAGTGCACGTTGAAAAACATCAAGAGTCTGTTGGTCACATTATCAATTTGTTCTATGAGGAGTTCGTGGAGTCCACGATTGTTGAGCCGACGTTCGTGTACGGTCACCCGAAGGAAATCTCTCCGCTGGCTAAGAGCAATCCGTCTGACCCGCGTTTCACGGACCGTTTCGAGCTGTTCATTCTGGGTCGTGAGTATGCGAATGCGTTTAGCGAGCTGAATGACCCGATTGACCAGTACGAACGCTTCAAGGCTCAGATTGAGGAGGAAAGCAAGGGCAACGATGAAGCCAACGACATGGACATTGATTTCATCGAGGCTCTGGAACACGCCATGCCGCCGACCGCGGGTATTGGTATCGGCATTGATCGCTTGGTTATGCTGCTGACGAATAGCGAATCCATCAAAGACGTGCTGTTGTTCCCGCAAATGAAGCCGCGCGAATGAAGAGCTTAGAGACCCGCT"
 	frags, _, err := Fragment(gene, 79, 94, []string{})
 	if err != nil {
-		t.Errorf(err.Error())
+		t.Error(err.Error())
 	}
 	for _, frag := range frags {
 		if len(frag) > 94 {
@@ -92,6 +92,6 @@ func TestFragmentWithOverhangs(t *testing.T) {
 
 	_, _, err := FragmentWithOverhangs(gene, 90, 110, []string{}, defaultOverhangs)
 	if err != nil {
-		t.Errorf(err.Error())
+		t.Error(err.Error())
 	}
 }
diff --git a/py/README.md b/py/README.md
@@ -5,3 +5,8 @@ This is a work-in-progress. Right now, we have only ported the fasta parser.
 
 ### Other platforms
 If you have interest in other platforms, like openbsd or freebsd, please add an issue! I'd be happy to add automatic packaging for these alternative platforms if I know someone will use them.
+
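+### Testing
+Before testing, build the shared library into the `dnadesign` package directory. Run the command from the `py` directory; this example targets Linux amd64:
+```
+CC="zig cc -target x86_64-linux-gnu" CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
+```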
diff --git a/py/dnadesign/definitions.h b/py/dnadesign/definitions.h
@@ -2,16 +2,40 @@ typedef struct FILE FILE;
 FILE* fopen(const char* path, const char* mode);
 int fclose(FILE* fp);
 
+// FASTA definitions
 typedef struct {
     char* identifier;
     char* sequence;
 } FastaRecord;
 
 typedef struct {
     FastaRecord* records;
-    GoInt numRecords;
+    int numRecords;
     char* error;
 } FastaResult;
 
 FastaResult ParseFastaFromCFile(void* cfile);
 FastaResult ParseFastaFromCString(char* cstring);
+
+// FASTQ definitions
+// One key/value pair attached to a FASTQ record.
+typedef struct {
+    char* key;
+    char* value;
+} FastqOptional;
+
+// A single FASTQ record. The Python side reads `optionals_count` entries
+// from `optionals` and decodes every char* as a NUL-terminated string.
+typedef struct {
+    char* identifier;
+    FastqOptional* optionals;
+    int optionals_count;
+    char* sequence;
+    char* quality;
+} FastqRecord;
+
+// Result of a FASTQ parse call. The Python side treats a non-NULL `error`
+// as failure; otherwise it reads `numRecords` entries from `records`.
+// NOTE(review): nothing visible here frees these buffers - confirm who owns
+// the memory returned by the parse functions.
+typedef struct {
+    FastqRecord* records;
+    int numRecords;
+    char* error;
+} FastqResult;
+
+// Parse FASTQ data from an open FILE* handle (passed as void*).
+FastqResult ParseFastqFromCFile(void* cfile);
+// Parse FASTQ data held in a C string.
+FastqResult ParseFastqFromCString(char* cstring);
diff --git a/py/dnadesign/fasta_parser.py b/py/dnadesign/fasta_parser.py
diff --git a/py/dnadesign/parsers.py b/py/dnadesign/parsers.py
@@ -0,0 +1,80 @@
+from typing import List, Optional, Dict
+from .cffi_bindings import ffi, lib
+import os
+
+class FastaRecord:
+    def __init__(self, identifier: str, sequence: str):
+        self.identifier = identifier
+        self.sequence = sequence
+
+class FastqRecord:
+    def __init__(self, identifier: str, sequence: str, quality: str, optionals: Dict[str, str]):
+        self.identifier = identifier
+        self.sequence = sequence
+        self.quality = quality
+        self.optionals = optionals
+
+def _safe_open_file(file_path: str):
+    if not os.path.exists(file_path):
+        raise FileNotFoundError(f"The file {file_path} does not exist.")
+    cfile = lib.fopen(file_path.encode('utf-8'), "r".encode('utf-8'))
+    if cfile == ffi.NULL:
+        raise IOError(f"Failed to open the file {file_path}.")
+    return cfile
+
+def parse_fasta_from_c_file(file_path: str) -> List[FastaRecord]:
+    try:
+        cfile = _safe_open_file(file_path)
+        result = lib.ParseFastaFromCFile(cfile)
+        return _process_fasta_result(result)
+    finally:
+        if 'cfile' in locals() and cfile != ffi.NULL:
+            lib.fclose(cfile)
+
+def parse_fasta_from_c_string(cstring: str) -> List[FastaRecord]:
+    result = lib.ParseFastaFromCString(cstring.encode('utf-8'))
+    return _process_fasta_result(result)
+
+def _process_fasta_result(result) -> List[FastaRecord]:
+    if result.error != ffi.NULL:
+        error_str = ffi.string(result.error).decode('utf-8')
+        raise Exception("Error parsing FASTA: " + error_str)
+    num_records = result.numRecords
+    records = ffi.cast("FastaRecord*", result.records)
+    return [FastaRecord(ffi.string(records[i].identifier).decode('utf-8'),
+                        ffi.string(records[i].sequence).decode('utf-8'))
+            for i in range(num_records)]
+
+def parse_fastq_from_c_file(file_path: str) -> List[FastqRecord]:
+    try:
+        cfile = _safe_open_file(file_path)
+        result = lib.ParseFastqFromCFile(cfile)
+        return _process_fastq_result(result)
+    finally:
+        if 'cfile' in locals() and cfile != ffi.NULL:
+            lib.fclose(cfile)
+
+def parse_fastq_from_c_string(cstring: str) -> List[FastqRecord]:
+    result = lib.ParseFastqFromCString(cstring.encode('utf-8'))
+    return _process_fastq_result(result)
+
+def _process_fastq_result(result) -> List[FastqRecord]:
+    if result.error != ffi.NULL:
+        error_str = ffi.string(result.error).decode('utf-8')
+        raise Exception("Error parsing FASTQ: " + error_str)
+    num_records = result.numRecords
+    records = ffi.cast("FastqRecord*", result.records)
+    fastq_records = []
+    for i in range(num_records):
+        optionals = {}
+        for j in range(records[i].optionals_count):
+            key = ffi.string(records[i].optionals[j].key).decode('utf-8')
+            value = ffi.string(records[i].optionals[j].value).decode('utf-8')
+            optionals[key] = value
+        fastq_records.append(FastqRecord(
+            ffi.string(records[i].identifier).decode('utf-8'),
+            ffi.string(records[i].sequence).decode('utf-8'),
+            ffi.string(records[i].quality).decode('utf-8'),
+            optionals
+        ))
+    return fastq_records