Skip to content

Commit

Permalink
C fastq (#86)
Browse files Browse the repository at this point in the history
This adds a C interface for interacting with FASTQ files, plus the accompanying Python files. With this, you can use dnadesign to parse FASTQ files in Python. This builds dnadesign `0.1.4` for PyPI.

Also cleaned up some linter problems introduced in golangci-lint 1.60
  • Loading branch information
Koeng101 authored Aug 16, 2024
1 parent 446e0ea commit bb29c87
Show file tree
Hide file tree
Showing 20 changed files with 275 additions and 51 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ jobs:
working-directory: ./py
run: |
if [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.arch }}" = "arm64" ]; then
CC="zig cc -target aarch64-linux-gnu" GOOS=linux GOARCH=arm64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
CC="zig cc -target aarch64-linux-gnu" CGO_ENABLED=1 GOOS=linux GOARCH=arm64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
elif [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.arch }}" = "amd64" ]; then
CC="zig cc -target x86_64-linux-gnu" GOOS=linux GOARCH=amd64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
CC="zig cc -target x86_64-linux-gnu" CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
elif [ "${{ runner.os }}" = "macOS" ] && [ "${{ matrix.arch }}" = "arm64" ]; then
CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 go build -o dnadesign/libdnadesign.dylib -buildmode=c-shared lib.go
elif [ "${{ runner.os }}" = "macOS" ] && [ "${{ matrix.arch }}" = "amd64" ]; then
Expand Down
4 changes: 2 additions & 2 deletions .golangci.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
run:
timeout: 1m
skip-dirs:
issues:
exclude-dirs:
- data
- api/gen
- lib/bio/slow5/svb
linters:
enable:
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
- Adds fastq parsing to python package. Releases version 0.1.4 of dnadesign python. [#86](https://github.com/Koeng101/dnadesign/pull/86)
- Integrated errgroup into source tree [#84](https://github.com/Koeng101/dnadesign/pull/84)
- Added kmer detection for ligation events in cloning and removed enzyme manager [#83](https://github.com/Koeng101/dnadesign/pull/83)
- Added option for linear ligations [#82](https://github.com/Koeng101/dnadesign/pull/82)
Expand Down
3 changes: 3 additions & 0 deletions lib/bio/fastq/fastq.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,9 @@ func (parser *Parser) Next() (Read, error) {
} else {
quality = string(line[:len(line)-1])
}
if len(sequence) != len(quality) {
return Read{}, fmt.Errorf("Got different lengths for sequence(%d) and quality(%d)", len(sequence), len(quality))
}

// Parsing ended. Check for inconsistencies.
if lookingForIdentifier {
Expand Down
10 changes: 10 additions & 0 deletions lib/bio/fastq/fastq_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,13 @@ $$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0
t.Errorf("Optionals not parsed properly")
}
}

// TestSequenceQualityLength feeds the parser a single record whose sequence
// has four characters but whose quality string has only two, and expects
// Next to report an error.
func TestSequenceQualityLength(t *testing.T) {
	const maxLineSize = 2 * 32 * 1024

	input := strings.NewReader("@test\nATCG\n+\nII\n")
	p := NewParser(input, maxLineSize)

	// Only the error matters here; the returned read is discarded.
	if _, err := p.Next(); err == nil {
		t.Errorf("Should have gotten error on quality vs sequence lengths")
	}
}
6 changes: 3 additions & 3 deletions lib/bio/sam/sam.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,8 +305,8 @@ func (alignment *Alignment) Validate() error {
return errors.New("Invalid RNAME: must match " + rnameRegex)
}

// 4. Validate POS
if alignment.POS < 0 || alignment.POS > 2147483647 { // 2^31 - 1
// 4. Validate POS.
if alignment.POS < 0 {
return errors.New("Invalid POS: must be in range [0, 2147483647]")
}

Expand All @@ -325,7 +325,7 @@ func (alignment *Alignment) Validate() error {
}

// 8. Validate PNEXT
if alignment.PNEXT < 0 || alignment.PNEXT > 2147483647 { // 2^31 - 1
if alignment.PNEXT < 0 { // 2^31 - 1
return errors.New("Invalid PNEXT: must be in range [0, 2147483647]")
}

Expand Down
2 changes: 1 addition & 1 deletion lib/bio/slow5/slow5_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ func testParseReadsHelper(t *testing.T, fileTarget string, errorMessage string)
}
}
if len(targetErr) == 0 {
t.Errorf(errorMessage)
t.Errorf("%s", errorMessage)
}
}

Expand Down
2 changes: 1 addition & 1 deletion lib/fold/mfe/checks/checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ func checkRegexpMatchesFullString(str, regex, errMsg string) (bool, error) {
}

if !doCheckRegexpMatchesFullString(str, regexp) {
return false, fmt.Errorf(errMsg)
return false, fmt.Errorf("%s", errMsg)
}
return true, nil
}
Expand Down
12 changes: 6 additions & 6 deletions lib/seqhash/seqhash_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,33 +36,33 @@ func TestHash2(t *testing.T) {
// Test circular double stranded hashing
seqhash, _ := EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, true))
if seqhash != "A_6VAbBfXD8BSZh2HJZqgGgR" {
t.Errorf("Circular double stranded hashing failed. Expected A_6VAbBfXD8BSZh2HJZqgGgR, got: " + seqhash)
t.Errorf("Circular double stranded hashing failed. Expected A_6VAbBfXD8BSZh2HJZqgGgR, got: %s", seqhash)
}
// Test circular single stranded hashing
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, false))
if seqhash != "B_5xKbuHELJCCQWJwQi7W1ak" {
t.Errorf("Circular single stranded hashing failed. Expected B_5xKbuHELJCCQWJwQi7W1ak, got: " + seqhash)
t.Errorf("Circular single stranded hashing failed. Expected B_5xKbuHELJCCQWJwQi7W1ak, got: %s", seqhash)
}
// Test linear double stranded hashing
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, true))
if seqhash != "C_5Z2pHCXbxWUPYiZj6J1Nag" {
t.Errorf("Linear double stranded hashing failed. Expected C_5Z2pHCXbxWUPYiZj6J1Nag, got: " + seqhash)
t.Errorf("Linear double stranded hashing failed. Expected C_5Z2pHCXbxWUPYiZj6J1Nag, got: %s", seqhash)
}
// Test linear single stranded hashing
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, false))
if seqhash != "D_4yT7etihWZHHNXUpbM5tUf" {
t.Errorf("Linear single stranded hashing failed. Expected D_4yT7etihWZHHNXUpbM5tUf, got: " + seqhash)
t.Errorf("Linear single stranded hashing failed. Expected D_4yT7etihWZHHNXUpbM5tUf, got: %s", seqhash)
}

// Test RNA Seqhash
seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "RNA", false, false))
if seqhash != "H_56cWv4dacvRJxUUcXYsdP5" {
t.Errorf("Linear single stranded hashing failed. Expected H_56cWv4dacvRJxUUcXYsdP5, got: " + seqhash)
t.Errorf("Linear single stranded hashing failed. Expected H_56cWv4dacvRJxUUcXYsdP5, got: %s", seqhash)
}
// Test Protein Seqhash
seqhash, _ = EncodeHash2(Hash2("MGC*", "PROTEIN", false, false))
if seqhash != "I_5DQsEyDHLh2r4njCcupAuF" {
t.Errorf("Linear single stranded hashing failed. Expected I_5DQsEyDHLh2r4njCcupAuF, got: " + seqhash)
t.Errorf("Linear single stranded hashing failed. Expected I_5DQsEyDHLh2r4njCcupAuF, got: %s", seqhash)
}
}

Expand Down
8 changes: 4 additions & 4 deletions lib/synthesis/fix/synthesis_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,11 @@ func BenchmarkCds(b *testing.B) {
for _, cutSite := range []string{"GAAGAC", "GGTCTC", "GCGATG", "CGTCTC", "GCTCTTC", "CACCTGC"} {
if strings.Contains(optimizedSeq, cutSite) {
fmt.Println(changes)
b.Errorf("phusion" + " contains " + cutSite)
b.Errorf("phusion contains %s", cutSite)
}
if strings.Contains(transform.ReverseComplement(optimizedSeq), cutSite) {
fmt.Println(changes)
b.Errorf("phusion" + " reverse complement contains " + cutSite)
b.Errorf("phusion reverse complement contains %s", cutSite)
}
}
}
Expand Down Expand Up @@ -84,10 +84,10 @@ func TestCds(t *testing.T) {

for _, cutSite := range []string{"GAAGAC", "GGTCTC", "GCGATG", "CGTCTC", "GCTCTTC", "CACCTGC"} {
if strings.Contains(optimizedSeq, cutSite) {
t.Errorf("phusion" + " contains " + cutSite)
t.Errorf("phusion contains %s", cutSite)
}
if strings.Contains(transform.ReverseComplement(optimizedSeq), cutSite) {
t.Errorf("phusion" + " reverse complement contains " + cutSite)
t.Errorf("phusion reverse complement contains %s", cutSite)
}
}

Expand Down
6 changes: 3 additions & 3 deletions lib/synthesis/fragment/fragment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ func TestFragment(t *testing.T) {

_, _, err := Fragment(gene, 90, 110, []string{})
if err != nil {
t.Errorf(err.Error())
t.Error(err.Error())
}
}

Expand Down Expand Up @@ -51,7 +51,7 @@ func TestLongFragment(t *testing.T) {
gene := "GGAGGGTCTCAATGCTGGACGATCGCAAATTCAGCGAACAGGAGCTGGTCCGTCGCAACAAATACAAAACGCTGGTCGAGCAAAACAAAGACCCGTACAAGATTACGAACTGGAAACGCAATACCACCCTGCTGAAACTGAATGAGAAATACAAAGACTATAGCAAGGAGGACCTGTTGAACCTGAATCAAGAACTGGTCGTTGTTGCAGGTCGTATCAAACTGTATCGTGAAGCCGGTAAAAAAGCTGCCTTTGTGAACATTGATGATCAAGACTCCTCTATTCAGTTGTACGTGCGCCTGGATGAGATCGGTGATCAGAGCTTCGAGGATTTCCGCAATTTCGACCTGGGTGACATCATTGGTGTTAAAGGTATCATGATGCGCACCGACCACGGCGAGTTGAGCATCCGTTGTAAGGAAGTCGTGCTGCTGAGCAAGGCCCTGCGTCCGCTGCCGGATAAACACGCGGGCATTCAGGATATTGAGGAAAAGTACCGCCGTCGCTATGTGGACCTGATTATGAATCACGACGTGCGCAAGACGTTCCAGGCGCGTACCAAGATCATTCGTACCTTGCAAAACTTTCTGGATAATAAGGGTTACATGGAGGTCGAAACCCCGATCCTGCATAGCCTGAAGGGTGGCGCGAGCGCGAAACCGTTTATTACCCACTACAATGTGCTGAATACGGATGTGTATCTGCGTATCGCGACCGAGCTGCACCTGAAACGCCTGATTGTTGGCGGTTTCGAGGGTGTGTATGAGATCGGTCGCATCTTTCGCAATGAAGGTATGTCCACGCGTCACAATCCGGAATTCACGTCTATCGAACTGTATGTCGCCTATGAGGACATGTTCTTTTTGATGGATCTGACCGAAGAGATTTTTCGCGTTTGTAATGCCGCAGTCAACAGCTCCAGCATCATTGAGTATAACAACGTGAAAATTGACCTGAGCAAGCCGTTTAAGCGCCTGCATATGGTTGACGGTATTAAACAGGTGACCGGCGTCGACTTCTGGCAGGAGATGACGGTCCAACAGGCTCTGGAGCTGGCCAAAAAGCATAAAGTGCACGTTGAAAAACATCAAGAGTCTGTTGGTCACATTATCAATTTGTTCTATGAGGAGTTCGTGGAGTCCACGATTGTTGAGCCGACGTTCGTGTACGGTCACCCGAAGGAAATCTCTCCGCTGGCTAAGAGCAATCCGTCTGACCCGCGTTTCACGGACCGTTTCGAGCTGTTCATTCTGGGTCGTGAGTATGCGAATGCGTTTAGCGAGCTGAATGACCCGATTGACCAGTACGAACGCTTCAAGGCTCAGATTGAGGAGGAAAGCAAGGGCAACGATGAAGCCAACGACATGGACATTGATTTCATCGAGGCTCTGGAACACGCCATGCCGCCGACCGCGGGTATTGGTATCGGCATTGATCGCTTGGTTATGCTGCTGACGAATAGCGAATCCATCAAAGACGTGCTGTTGTTCCCGCAAATGAAGCCGCGCGAATGAAGAGCTTAGAGACCCGCT"
frags, _, err := Fragment(gene, 79, 94, []string{})
if err != nil {
t.Errorf(err.Error())
t.Error(err.Error())
}
for _, frag := range frags {
if len(frag) > 94 {
Expand Down Expand Up @@ -92,6 +92,6 @@ func TestFragmentWithOverhangs(t *testing.T) {

_, _, err := FragmentWithOverhangs(gene, 90, 110, []string{}, defaultOverhangs)
if err != nil {
t.Errorf(err.Error())
t.Error(err.Error())
}
}
5 changes: 5 additions & 0 deletions py/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,8 @@ This is a work-in-progress. Right now, we have only ported the fasta parser.

### Other platforms
If you are interested in other platforms, such as OpenBSD or FreeBSD, please open an issue! I'd be happy to add automatic packaging for these alternative platforms if I know someone will use them.

### Testing
```
CC="zig cc -target x86_64-linux-gnu" CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go
```
26 changes: 25 additions & 1 deletion py/dnadesign/definitions.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,40 @@ typedef struct FILE FILE;
FILE* fopen(const char* path, const char* mode);
int fclose(FILE* fp);

// FASTA definitions
typedef struct {
char* identifier;
char* sequence;
} FastaRecord;

typedef struct {
FastaRecord* records;
GoInt numRecords;
int numRecords;
char* error;
} FastaResult;

FastaResult ParseFastaFromCFile(void* cfile);
FastaResult ParseFastaFromCString(char* cstring);

// FASTQ definitions
// One optional key/value annotation attached to a FASTQ record.
// The Python bindings decode both fields as UTF-8 C strings and collect
// them into a dict keyed by `key`.
typedef struct {
char* key;
char* value;
} FastqOptional;

// A single parsed FASTQ read.
//   identifier      - read identifier as a C string
//   optionals       - array of key/value annotations
//   optionals_count - number of entries readable from `optionals`
//   sequence        - sequence as a C string
//   quality         - quality string as a C string
// NOTE(review): presumably all strings are NUL-terminated and allocated by
// the library; ownership/freeing is not visible here - confirm.
typedef struct {
char* identifier;
FastqOptional* optionals;
int optionals_count;
char* sequence;
char* quality;
} FastqRecord;

// Return value of the FASTQ parsing entry points.
//   records    - array of parsed reads
//   numRecords - number of entries readable from `records`
//   error      - error message; the Python bindings treat a non-NULL value
//                as failure and do not read `records` in that case
typedef struct {
FastqRecord* records;
int numRecords;
char* error;
} FastqResult;

// Parse FASTQ records from an open C stream. The Python bindings pass the
// FILE* returned by fopen() and close it themselves after the call.
FastqResult ParseFastqFromCFile(void* cfile);
// Parse FASTQ records from an in-memory C string. The Python bindings pass
// UTF-8 encoded bytes.
FastqResult ParseFastqFromCString(char* cstring);
26 changes: 0 additions & 26 deletions py/dnadesign/fasta_parser.py

This file was deleted.

80 changes: 80 additions & 0 deletions py/dnadesign/parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
from typing import List, Optional, Dict
from .cffi_bindings import ffi, lib
import os

class FastaRecord:
    """A single FASTA entry: an identifier and its sequence."""

    def __init__(self, identifier: str, sequence: str):
        # Both fields are stored as given; no validation happens here.
        self.identifier, self.sequence = identifier, sequence

class FastqRecord:
    """A single FASTQ read.

    Attributes are stored exactly as passed in:
    identifier, sequence, quality, and a dict of optional key/value
    annotations.
    """

    def __init__(self, identifier: str, sequence: str, quality: str, optionals: Dict[str, str]):
        self.identifier, self.sequence = identifier, sequence
        self.quality, self.optionals = quality, optionals

def _safe_open_file(file_path: str):
if not os.path.exists(file_path):
raise FileNotFoundError(f"The file {file_path} does not exist.")
cfile = lib.fopen(file_path.encode('utf-8'), "r".encode('utf-8'))
if cfile == ffi.NULL:
raise IOError(f"Failed to open the file {file_path}.")
return cfile

def parse_fasta_from_c_file(file_path: str) -> List[FastaRecord]:
    """Parse the FASTA file at ``file_path`` and return its records.

    The file is opened with the C library's ``fopen`` and handed to
    ``lib.ParseFastaFromCFile``; the handle is always closed afterwards.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
        IOError: if the file cannot be opened.
        Exception: if the parser reports an error.
    """
    # Acquire the handle before entering ``try``: _safe_open_file raises
    # rather than returning NULL, so ``finally`` only runs with a valid
    # handle and no ``locals()`` probing or NULL re-check is needed.
    cfile = _safe_open_file(file_path)
    try:
        result = lib.ParseFastaFromCFile(cfile)
        return _process_fasta_result(result)
    finally:
        lib.fclose(cfile)

def parse_fasta_from_c_string(cstring: str) -> List[FastaRecord]:
    """Parse FASTA text held in memory and return its records.

    ``cstring`` is UTF-8 encoded before being passed to
    ``lib.ParseFastaFromCString``.

    Raises:
        Exception: if the parser reports an error.
    """
    encoded_text = cstring.encode('utf-8')
    return _process_fasta_result(lib.ParseFastaFromCString(encoded_text))

def _process_fasta_result(result) -> List[FastaRecord]:
    """Convert a C ``FastaResult`` into a list of ``FastaRecord`` objects.

    Raises:
        Exception: if ``result.error`` is not NULL; the message is
            prefixed with "Error parsing FASTA: ".

    NOTE(review): nothing here releases ``result.records`` or its strings;
    presumably they are allocated on the C side - confirm ownership.
    """
    if result.error != ffi.NULL:
        message = ffi.string(result.error).decode('utf-8')
        raise Exception("Error parsing FASTA: " + message)

    count = result.numRecords
    entries = ffi.cast("FastaRecord*", result.records)

    parsed = []
    for idx in range(count):
        entry = entries[idx]
        # Decode identifier first, then sequence, for each record.
        name = ffi.string(entry.identifier).decode('utf-8')
        seq = ffi.string(entry.sequence).decode('utf-8')
        parsed.append(FastaRecord(name, seq))
    return parsed

def parse_fastq_from_c_file(file_path: str) -> List[FastqRecord]:
    """Parse the FASTQ file at ``file_path`` and return its records.

    The file is opened with the C library's ``fopen`` and handed to
    ``lib.ParseFastqFromCFile``; the handle is always closed afterwards.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
        IOError: if the file cannot be opened.
        Exception: if the parser reports an error.
    """
    # Acquire the handle before entering ``try``: _safe_open_file raises
    # rather than returning NULL, so ``finally`` only runs with a valid
    # handle and no ``locals()`` probing or NULL re-check is needed.
    cfile = _safe_open_file(file_path)
    try:
        result = lib.ParseFastqFromCFile(cfile)
        return _process_fastq_result(result)
    finally:
        lib.fclose(cfile)

def parse_fastq_from_c_string(cstring: str) -> List[FastqRecord]:
    """Parse FASTQ text held in memory and return its records.

    ``cstring`` is UTF-8 encoded before being passed to
    ``lib.ParseFastqFromCString``.

    Raises:
        Exception: if the parser reports an error.
    """
    encoded_text = cstring.encode('utf-8')
    return _process_fastq_result(lib.ParseFastqFromCString(encoded_text))

def _process_fastq_result(result) -> List[FastqRecord]:
    """Convert a C ``FastqResult`` into a list of ``FastqRecord`` objects.

    Each record's optional key/value pairs are gathered into a dict; if a
    key occurs more than once, the last value wins.

    Raises:
        Exception: if ``result.error`` is not NULL; the message is
            prefixed with "Error parsing FASTQ: ".

    NOTE(review): nothing here releases ``result.records`` or its strings;
    presumably they are allocated on the C side - confirm ownership.
    """
    if result.error != ffi.NULL:
        message = ffi.string(result.error).decode('utf-8')
        raise Exception("Error parsing FASTQ: " + message)

    def _text(cdata) -> str:
        # Read a C string and decode it as UTF-8.
        return ffi.string(cdata).decode('utf-8')

    count = result.numRecords
    entries = ffi.cast("FastqRecord*", result.records)

    parsed = []
    for idx in range(count):
        entry = entries[idx]

        # Optionals are read before the main fields, key before value.
        tags = {}
        for pos in range(entry.optionals_count):
            pair = entry.optionals[pos]
            tag_key = _text(pair.key)
            tags[tag_key] = _text(pair.value)

        parsed.append(FastqRecord(
            _text(entry.identifier),
            _text(entry.sequence),
            _text(entry.quality),
            tags,
        ))
    return parsed
Loading

0 comments on commit bb29c87

Please sign in to comment.