From bb29c8710067a0638d469f4912cf1b9f9f1f698a Mon Sep 17 00:00:00 2001 From: Koeng101 Date: Fri, 16 Aug 2024 13:47:33 -0700 Subject: [PATCH] C fastq (#86) This adds a C interface for interacting with fastq files, plus the complementing python files. With this, you are able to use dnadesign to parse fastq files in python. This builds dnadesign `0.1.4` for pypi. Also cleaned up some linter problems introduced in golangci-lint 1.60 --- .github/workflows/build.yml | 4 +- .golangci.yml | 4 +- README.md | 1 + lib/bio/fastq/fastq.go | 3 + lib/bio/fastq/fastq_test.go | 10 ++++ lib/bio/sam/sam.go | 6 +- lib/bio/slow5/slow5_test.go | 2 +- lib/fold/mfe/checks/checks.go | 2 +- lib/seqhash/seqhash_test.go | 12 ++-- lib/synthesis/fix/synthesis_test.go | 8 +-- lib/synthesis/fragment/fragment_test.go | 6 +- py/README.md | 5 ++ py/dnadesign/definitions.h | 26 +++++++- py/dnadesign/fasta_parser.py | 26 -------- py/dnadesign/parsers.py | 80 +++++++++++++++++++++++++ py/lib.go | 67 +++++++++++++++++++++ py/setup.py | 2 +- py/tests/data/example.fastq | 16 +++++ py/tests/test_fasta_parser.py | 2 +- py/tests/test_fastq_parser.py | 44 ++++++++++++++ 20 files changed, 275 insertions(+), 51 deletions(-) delete mode 100644 py/dnadesign/fasta_parser.py create mode 100644 py/dnadesign/parsers.py create mode 100644 py/tests/data/example.fastq create mode 100644 py/tests/test_fastq_parser.py diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0e14caf2..a59fd881 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -55,9 +55,9 @@ jobs: working-directory: ./py run: | if [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.arch }}" = "arm64" ]; then - CC="zig cc -target aarch64-linux-gnu" GOOS=linux GOARCH=arm64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go + CC="zig cc -target aarch64-linux-gnu" CGO_ENABLED=1 GOOS=linux GOARCH=arm64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go elif [ "${{ runner.os }}" = "Linux" ] && [ "${{ matrix.arch }}" = "amd64" ]; then - CC="zig cc -target x86_64-linux-gnu" GOOS=linux GOARCH=amd64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go + CC="zig cc -target x86_64-linux-gnu" CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go elif [ "${{ runner.os }}" = "macOS" ] && [ "${{ matrix.arch }}" = "arm64" ]; then CGO_ENABLED=1 GOOS=darwin GOARCH=arm64 go build -o dnadesign/libdnadesign.dylib -buildmode=c-shared lib.go elif [ "${{ runner.os }}" = "macOS" ] && [ "${{ matrix.arch }}" = "amd64" ]; then diff --git a/.golangci.yml b/.golangci.yml index cdd1de3c..46d6c511 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,8 +1,8 @@ run: timeout: 1m - skip-dirs: +issues: + exclude-dirs: - data - - api/gen - lib/bio/slow5/svb linters: enable: diff --git a/README.md b/README.md index 4d5f456a..2d60916f 100644 --- a/README.md +++ b/README.md @@ -75,6 +75,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Adds fastq parsing to python package. Releases version 0.1.4 of dnadesign python. [#86](https://github.com/Koeng101/dnadesign/pull/86) - Integrated errgroup into source tree [#84](https://github.com/Koeng101/dnadesign/pull/84) - Added kmer detection for ligation events in cloning and removed enzyme manager [#83](https://github.com/Koeng101/dnadesign/pull/83) - Added option for linear ligations [#82](https://github.com/Koeng101/dnadesign/pull/82) diff --git a/lib/bio/fastq/fastq.go b/lib/bio/fastq/fastq.go index 160cc064..c06685fc 100644 --- a/lib/bio/fastq/fastq.go +++ b/lib/bio/fastq/fastq.go @@ -178,6 +178,9 @@ func (parser *Parser) Next() (Read, error) { } else { quality = string(line[:len(line)-1]) } + if len(sequence) != len(quality) { + return Read{}, fmt.Errorf("Got different lengths for sequence(%d) and quality(%d)", len(sequence), len(quality)) + } // Parsing ended. Check for inconsistencies. if lookingForIdentifier { diff --git a/lib/bio/fastq/fastq_test.go b/lib/bio/fastq/fastq_test.go index 223c7be0..75f749d0 100644 --- a/lib/bio/fastq/fastq_test.go +++ b/lib/bio/fastq/fastq_test.go @@ -47,3 +47,13 @@ $$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0 t.Errorf("Optionals not parsed properly") } } + +func TestSequenceQualityLength(t *testing.T) { + file := strings.NewReader("@test\nATCG\n+\nII\n") + const maxLineSize = 2 * 32 * 1024 + parser := NewParser(file, maxLineSize) + _, err := parser.Next() + if err == nil { + t.Errorf("Should have gotten error on quality vs sequence lengths") + } +} diff --git a/lib/bio/sam/sam.go b/lib/bio/sam/sam.go index bf6ce1ab..ea810137 100644 --- a/lib/bio/sam/sam.go +++ b/lib/bio/sam/sam.go @@ -305,8 +305,8 @@ func (alignment *Alignment) Validate() error { return errors.New("Invalid RNAME: must match " + rnameRegex) } - // 4. Validate POS - if alignment.POS < 0 || alignment.POS > 2147483647 { // 2^31 - 1 + // 4. Validate POS. + if alignment.POS < 0 { return errors.New("Invalid POS: must be in range [0, 2147483647]") } @@ -325,7 +325,7 @@ func (alignment *Alignment) Validate() error { } // 8. Validate PNEXT - if alignment.PNEXT < 0 || alignment.PNEXT > 2147483647 { // 2^31 - 1 + if alignment.PNEXT < 0 { // 2^31 - 1 return errors.New("Invalid PNEXT: must be in range [0, 2147483647]") } diff --git a/lib/bio/slow5/slow5_test.go b/lib/bio/slow5/slow5_test.go index 1030546a..c97dc9c1 100644 --- a/lib/bio/slow5/slow5_test.go +++ b/lib/bio/slow5/slow5_test.go @@ -101,7 +101,7 @@ func testParseReadsHelper(t *testing.T, fileTarget string, errorMessage string) } } if len(targetErr) == 0 { - t.Errorf(errorMessage) + t.Errorf("%s", errorMessage) } } diff --git a/lib/fold/mfe/checks/checks.go b/lib/fold/mfe/checks/checks.go index 05abb037..5f48bf3a 100644 --- a/lib/fold/mfe/checks/checks.go +++ b/lib/fold/mfe/checks/checks.go @@ -26,7 +26,7 @@ func checkRegexpMatchesFullString(str, regex, errMsg string) (bool, error) { } if !doCheckRegexpMatchesFullString(str, regexp) { - return false, fmt.Errorf(errMsg) + return false, fmt.Errorf("%s", errMsg) } return true, nil } diff --git a/lib/seqhash/seqhash_test.go b/lib/seqhash/seqhash_test.go index 915df7a1..7742da9f 100644 --- a/lib/seqhash/seqhash_test.go +++ b/lib/seqhash/seqhash_test.go @@ -36,33 +36,33 @@ func TestHash2(t *testing.T) { // Test circular double stranded hashing seqhash, _ := EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, true)) if seqhash != "A_6VAbBfXD8BSZh2HJZqgGgR" { - t.Errorf("Circular double stranded hashing failed. Expected A_6VAbBfXD8BSZh2HJZqgGgR, got: " + seqhash) + t.Errorf("Circular double stranded hashing failed. Expected A_6VAbBfXD8BSZh2HJZqgGgR, got: %s", seqhash) } // Test circular single stranded hashing seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", true, false)) if seqhash != "B_5xKbuHELJCCQWJwQi7W1ak" { - t.Errorf("Circular single stranded hashing failed. Expected B_5xKbuHELJCCQWJwQi7W1ak, got: " + seqhash) + t.Errorf("Circular single stranded hashing failed. Expected B_5xKbuHELJCCQWJwQi7W1ak, got: %s", seqhash) } // Test linear double stranded hashing seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, true)) if seqhash != "C_5Z2pHCXbxWUPYiZj6J1Nag" { - t.Errorf("Linear double stranded hashing failed. Expected C_5Z2pHCXbxWUPYiZj6J1Nag, got: " + seqhash) + t.Errorf("Linear double stranded hashing failed. Expected C_5Z2pHCXbxWUPYiZj6J1Nag, got: %s", seqhash) } // Test linear single stranded hashing seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "DNA", false, false)) if seqhash != "D_4yT7etihWZHHNXUpbM5tUf" { - t.Errorf("Linear single stranded hashing failed. Expected D_4yT7etihWZHHNXUpbM5tUf, got: " + seqhash) + t.Errorf("Linear single stranded hashing failed. Expected D_4yT7etihWZHHNXUpbM5tUf, got: %s", seqhash) } // Test RNA Seqhash seqhash, _ = EncodeHash2(Hash2("TTAGCCCAT", "RNA", false, false)) if seqhash != "H_56cWv4dacvRJxUUcXYsdP5" { - t.Errorf("Linear single stranded hashing failed. Expected H_56cWv4dacvRJxUUcXYsdP5, got: " + seqhash) + t.Errorf("Linear single stranded hashing failed. Expected H_56cWv4dacvRJxUUcXYsdP5, got: %s", seqhash) } // Test Protein Seqhash seqhash, _ = EncodeHash2(Hash2("MGC*", "PROTEIN", false, false)) if seqhash != "I_5DQsEyDHLh2r4njCcupAuF" { - t.Errorf("Linear single stranded hashing failed. Expected I_5DQsEyDHLh2r4njCcupAuF, got: " + seqhash) + t.Errorf("Linear single stranded hashing failed. Expected I_5DQsEyDHLh2r4njCcupAuF, got: %s", seqhash) } } diff --git a/lib/synthesis/fix/synthesis_test.go b/lib/synthesis/fix/synthesis_test.go index 3bcfb46b..20aa5e40 100644 --- a/lib/synthesis/fix/synthesis_test.go +++ b/lib/synthesis/fix/synthesis_test.go @@ -48,11 +48,11 @@ func BenchmarkCds(b *testing.B) { for _, cutSite := range []string{"GAAGAC", "GGTCTC", "GCGATG", "CGTCTC", "GCTCTTC", "CACCTGC"} { if strings.Contains(optimizedSeq, cutSite) { fmt.Println(changes) - b.Errorf("phusion" + " contains " + cutSite) + b.Errorf("phusion contains %s", cutSite) } if strings.Contains(transform.ReverseComplement(optimizedSeq), cutSite) { fmt.Println(changes) - b.Errorf("phusion" + " reverse complement contains " + cutSite) + b.Errorf("phusion reverse complement contains %s", cutSite) } } } @@ -84,10 +84,10 @@ func TestCds(t *testing.T) { for _, cutSite := range []string{"GAAGAC", "GGTCTC", "GCGATG", "CGTCTC", "GCTCTTC", "CACCTGC"} { if strings.Contains(optimizedSeq, cutSite) { - t.Errorf("phusion" + " contains " + cutSite) + t.Errorf("phusion contains %s", cutSite) } if strings.Contains(transform.ReverseComplement(optimizedSeq), cutSite) { - t.Errorf("phusion" + " reverse complement contains " + cutSite) + t.Errorf("phusion reverse complement contains %s", cutSite) } } diff --git a/lib/synthesis/fragment/fragment_test.go b/lib/synthesis/fragment/fragment_test.go index ab3f0c15..97f7b9b0 100644 --- a/lib/synthesis/fragment/fragment_test.go +++ b/lib/synthesis/fragment/fragment_test.go @@ -9,7 +9,7 @@ func TestFragment(t *testing.T) { _, _, err := Fragment(gene, 90, 110, []string{}) if err != nil { - t.Errorf(err.Error()) + t.Error(err.Error()) } } @@ -51,7 +51,7 @@ func TestLongFragment(t *testing.T) { gene := "GGAGGGTCTCAATGCTGGACGATCGCAAATTCAGCGAACAGGAGCTGGTCCGTCGCAACAAATACAAAACGCTGGTCGAGCAAAACAAAGACCCGTACAAGATTACGAACTGGAAACGCAATACCACCCTGCTGAAACTGAATGAGAAATACAAAGACTATAGCAAGGAGGACCTGTTGAACCTGAATCAAGAACTGGTCGTTGTTGCAGGTCGTATCAAACTGTATCGTGAAGCCGGTAAAAAAGCTGCCTTTGTGAACATTGATGATCAAGACTCCTCTATTCAGTTGTACGTGCGCCTGGATGAGATCGGTGATCAGAGCTTCGAGGATTTCCGCAATTTCGACCTGGGTGACATCATTGGTGTTAAAGGTATCATGATGCGCACCGACCACGGCGAGTTGAGCATCCGTTGTAAGGAAGTCGTGCTGCTGAGCAAGGCCCTGCGTCCGCTGCCGGATAAACACGCGGGCATTCAGGATATTGAGGAAAAGTACCGCCGTCGCTATGTGGACCTGATTATGAATCACGACGTGCGCAAGACGTTCCAGGCGCGTACCAAGATCATTCGTACCTTGCAAAACTTTCTGGATAATAAGGGTTACATGGAGGTCGAAACCCCGATCCTGCATAGCCTGAAGGGTGGCGCGAGCGCGAAACCGTTTATTACCCACTACAATGTGCTGAATACGGATGTGTATCTGCGTATCGCGACCGAGCTGCACCTGAAACGCCTGATTGTTGGCGGTTTCGAGGGTGTGTATGAGATCGGTCGCATCTTTCGCAATGAAGGTATGTCCACGCGTCACAATCCGGAATTCACGTCTATCGAACTGTATGTCGCCTATGAGGACATGTTCTTTTTGATGGATCTGACCGAAGAGATTTTTCGCGTTTGTAATGCCGCAGTCAACAGCTCCAGCATCATTGAGTATAACAACGTGAAAATTGACCTGAGCAAGCCGTTTAAGCGCCTGCATATGGTTGACGGTATTAAACAGGTGACCGGCGTCGACTTCTGGCAGGAGATGACGGTCCAACAGGCTCTGGAGCTGGCCAAAAAGCATAAAGTGCACGTTGAAAAACATCAAGAGTCTGTTGGTCACATTATCAATTTGTTCTATGAGGAGTTCGTGGAGTCCACGATTGTTGAGCCGACGTTCGTGTACGGTCACCCGAAGGAAATCTCTCCGCTGGCTAAGAGCAATCCGTCTGACCCGCGTTTCACGGACCGTTTCGAGCTGTTCATTCTGGGTCGTGAGTATGCGAATGCGTTTAGCGAGCTGAATGACCCGATTGACCAGTACGAACGCTTCAAGGCTCAGATTGAGGAGGAAAGCAAGGGCAACGATGAAGCCAACGACATGGACATTGATTTCATCGAGGCTCTGGAACACGCCATGCCGCCGACCGCGGGTATTGGTATCGGCATTGATCGCTTGGTTATGCTGCTGACGAATAGCGAATCCATCAAAGACGTGCTGTTGTTCCCGCAAATGAAGCCGCGCGAATGAAGAGCTTAGAGACCCGCT" frags, _, err := Fragment(gene, 79, 94, []string{}) if err != nil { - t.Errorf(err.Error()) + t.Error(err.Error()) } for _, frag := range frags { if len(frag) > 94 { @@ -92,6 +92,6 @@ func TestFragmentWithOverhangs(t *testing.T) { _, _, err := FragmentWithOverhangs(gene, 90, 110, []string{}, defaultOverhangs) if err != nil { - t.Errorf(err.Error()) + t.Error(err.Error()) } } diff --git a/py/README.md b/py/README.md index 63b4ad96..0b7b365f 100644 --- a/py/README.md +++ b/py/README.md @@ -5,3 +5,8 @@ This is a work-in-progress. Right now, we have only ported the fasta parser. ### Other platforms If you have interest in other platforms, like openbsd or freebsd, please add an issue! I'd be happy to add automatic packaging for these alternative platforms if I know someone will use them. + +### Testing +``` +CC="zig cc -target x86_64-linux-gnu" CGO_ENABLED=1 GOOS=linux GOARCH=amd64 go build -o dnadesign/libdnadesign.so -buildmode=c-shared lib.go +``` diff --git a/py/dnadesign/definitions.h b/py/dnadesign/definitions.h index f66ca240..f1b56126 100644 --- a/py/dnadesign/definitions.h +++ b/py/dnadesign/definitions.h @@ -2,6 +2,7 @@ typedef struct FILE FILE; FILE* fopen(const char* path, const char* mode); int fclose(FILE* fp); +// FASTA definitions typedef struct { char* identifier; char* sequence; @@ -9,9 +10,32 @@ typedef struct { typedef struct { FastaRecord* records; - GoInt numRecords; + int numRecords; char* error; } FastaResult; FastaResult ParseFastaFromCFile(void* cfile); FastaResult ParseFastaFromCString(char* cstring); + +// FASTQ definitions +typedef struct { + char* key; + char* value; +} FastqOptional; + +typedef struct { + char* identifier; + FastqOptional* optionals; + int optionals_count; + char* sequence; + char* quality; +} FastqRecord; + +typedef struct { + FastqRecord* records; + int numRecords; + char* error; +} FastqResult; + +FastqResult ParseFastqFromCFile(void* cfile); +FastqResult ParseFastqFromCString(char* cstring); diff --git a/py/dnadesign/fasta_parser.py b/py/dnadesign/fasta_parser.py deleted file mode 100644 index 3b7cf1f1..00000000 --- a/py/dnadesign/fasta_parser.py +++ /dev/null @@ -1,26 +0,0 @@ -from typing import List, Optional -from .cffi_bindings import ffi, lib - -class FastaRecord: - def __init__(self, identifier: str, sequence: str): - self.identifier = identifier - self.sequence = sequence - -def parse_fasta_from_c_file(file_path: str) -> List[FastaRecord]: - cfile = lib.fopen(file_path.encode('utf-8'), "r".encode('utf-8')) - result = lib.ParseFastaFromCFile(cfile) - return _process_result(result) - -def parse_fasta_from_c_string(cstring: str) -> List[FastaRecord]: - result = lib.ParseFastaFromCString(cstring.encode('utf-8')) - return _process_result(result) - -def _process_result(result) -> List[FastaRecord]: - if result.error != ffi.NULL: - error_str = ffi.string(result.error).decode('utf-8') - raise Exception("Error parsing FASTA: " + error_str) - num_records = result.numRecords - records = ffi.cast("FastaRecord*", result.records) - return [FastaRecord(ffi.string(records[i].identifier).decode('utf-8'), - ffi.string(records[i].sequence).decode('utf-8')) - for i in range(num_records)] diff --git a/py/dnadesign/parsers.py b/py/dnadesign/parsers.py new file mode 100644 index 00000000..590fd71e --- /dev/null +++ b/py/dnadesign/parsers.py @@ -0,0 +1,80 @@ +from typing import List, Optional, Dict +from .cffi_bindings import ffi, lib +import os + +class FastaRecord: + def __init__(self, identifier: str, sequence: str): + self.identifier = identifier + self.sequence = sequence + +class FastqRecord: + def __init__(self, identifier: str, sequence: str, quality: str, optionals: Dict[str, str]): + self.identifier = identifier + self.sequence = sequence + self.quality = quality + self.optionals = optionals + +def _safe_open_file(file_path: str): + if not os.path.exists(file_path): + raise FileNotFoundError(f"The file {file_path} does not exist.") + cfile = lib.fopen(file_path.encode('utf-8'), "r".encode('utf-8')) + if cfile == ffi.NULL: + raise IOError(f"Failed to open the file {file_path}.") + return cfile + +def parse_fasta_from_c_file(file_path: str) -> List[FastaRecord]: + try: + cfile = _safe_open_file(file_path) + result = lib.ParseFastaFromCFile(cfile) + return _process_fasta_result(result) + finally: + if 'cfile' in locals() and cfile != ffi.NULL: + lib.fclose(cfile) + +def parse_fasta_from_c_string(cstring: str) -> List[FastaRecord]: + result = lib.ParseFastaFromCString(cstring.encode('utf-8')) + return _process_fasta_result(result) + +def _process_fasta_result(result) -> List[FastaRecord]: + if result.error != ffi.NULL: + error_str = ffi.string(result.error).decode('utf-8') + raise Exception("Error parsing FASTA: " + error_str) + num_records = result.numRecords + records = ffi.cast("FastaRecord*", result.records) + return [FastaRecord(ffi.string(records[i].identifier).decode('utf-8'), + ffi.string(records[i].sequence).decode('utf-8')) + for i in range(num_records)] + +def parse_fastq_from_c_file(file_path: str) -> List[FastqRecord]: + try: + cfile = _safe_open_file(file_path) + result = lib.ParseFastqFromCFile(cfile) + return _process_fastq_result(result) + finally: + if 'cfile' in locals() and cfile != ffi.NULL: + lib.fclose(cfile) + +def parse_fastq_from_c_string(cstring: str) -> List[FastqRecord]: + result = lib.ParseFastqFromCString(cstring.encode('utf-8')) + return _process_fastq_result(result) + +def _process_fastq_result(result) -> List[FastqRecord]: + if result.error != ffi.NULL: + error_str = ffi.string(result.error).decode('utf-8') + raise Exception("Error parsing FASTQ: " + error_str) + num_records = result.numRecords + records = ffi.cast("FastqRecord*", result.records) + fastq_records = [] + for i in range(num_records): + optionals = {} + for j in range(records[i].optionals_count): + key = ffi.string(records[i].optionals[j].key).decode('utf-8') + value = ffi.string(records[i].optionals[j].value).decode('utf-8') + optionals[key] = value + fastq_records.append(FastqRecord( + ffi.string(records[i].identifier).decode('utf-8'), + ffi.string(records[i].sequence).decode('utf-8'), + ffi.string(records[i].quality).decode('utf-8'), + optionals + )) + return fastq_records diff --git a/py/lib.go b/py/lib.go index 1081a2bb..35982aa2 100644 --- a/py/lib.go +++ b/py/lib.go @@ -9,6 +9,21 @@ typedef struct { char* identifier; char* sequence; } FastaRecord; + +// FastqOptional +typedef struct { + char* key; + char* value; +} FastqOptional; + +// FastqRecord +typedef struct { + char* identifier; + FastqOptional* optionals; + int optionals_count; + char* sequence; + char* quality; +} FastqRecord; */ import "C" import ( @@ -86,6 +101,58 @@ func ParseFastaFromCString(cstring *C.char) (*C.FastaRecord, int, *C.char) { return goFastaToCFasta(reader) } +/****************************************************************************** +Aug 16, 2024 + +Fastq + +******************************************************************************/ + +// goFastqToCFastq converts an io.Reader to a C.FastqRecord +func goFastqToCFastq(reader io.Reader) (*C.FastqRecord, int, *C.char) { + parser := bio.NewFastqParser(reader) + records, err := parser.Parse() + if err != nil { + return nil, 0, C.CString(err.Error()) + } + cRecords := (*C.FastqRecord)(C.malloc(C.size_t(len(records)) * C.size_t(unsafe.Sizeof(C.FastqRecord{})))) + slice := (*[1<<30 - 1]C.FastqRecord)(unsafe.Pointer(cRecords))[:len(records):len(records)] + + for i, read := range records { + slice[i].identifier = C.CString(read.Identifier) + slice[i].sequence = C.CString(read.Sequence) + slice[i].quality = C.CString(read.Quality) + + optionalsCount := len(read.Optionals) + slice[i].optionals_count = C.int(optionalsCount) + if optionalsCount > 0 { + slice[i].optionals = (*C.FastqOptional)(C.malloc(C.size_t(optionalsCount) * C.size_t(unsafe.Sizeof(C.FastqOptional{})))) + optionalsSlice := (*[1<<30 - 1]C.FastqOptional)(unsafe.Pointer(slice[i].optionals))[:optionalsCount:optionalsCount] + + j := 0 + for key, value := range read.Optionals { + optionalsSlice[j].key = C.CString(key) + optionalsSlice[j].value = C.CString(value) + j++ + } + } + } + + return cRecords, len(records), nil +} + +//export ParseFastqFromCFile +func ParseFastqFromCFile(cfile *C.FILE) (*C.FastqRecord, int, *C.char) { + reader := readerFromCFile(cfile) + return goFastqToCFastq(reader) +} + +//export ParseFastqFromCString +func ParseFastqFromCString(cstring *C.char) (*C.FastqRecord, int, *C.char) { + reader := strings.NewReader(C.GoString(cstring)) + return goFastqToCFastq(reader) +} + /****************************************************************************** main.go diff --git a/py/setup.py b/py/setup.py index c2010f00..c065e4dd 100644 --- a/py/setup.py +++ b/py/setup.py @@ -13,7 +13,7 @@ def get_shared_lib_ext(): setup( name='dnadesign', - version='0.1.3', + version='0.1.4', packages=find_packages(), package_data={'dnadesign': ['definitions.h', 'libdnadesign.h', "libdnadesign" + get_shared_lib_ext()]}, install_requires=[ diff --git a/py/tests/data/example.fastq b/py/tests/data/example.fastq new file mode 100644 index 00000000..c1d451c9 --- /dev/null +++ b/py/tests/data/example.fastq @@ -0,0 +1,16 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ +%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GGTATACTTCGTTCAGTTACGTATTGCTCAAGACGGAGTTGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGATAGTGCCACGGTTTCCTGTTTTCGCACACATTGTTACCAGAACCTAAGTGGGTAGTCTTGTAGTGCGTTGTTCGTTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTGCTGAATTTCATCTAAGCGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAACGAAATGCGCCCCGCGTGCGTTTGTTGGTTCCCTCAGATTCGCTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAGCATCGGCCCCAAGGTTTGCCAATAATACTGCGTCTTGGTTCCACCGCTCTCACTCAACATGGCAAGGAAGACCTCGTTAGTAGTAGCAATACGTAACC ++ +%&&%$&(()')23-+.:)''.10355=DEAE@E;--&+:<115924-0CJ:<>DE?=6,.+,.//0*123,*7//&'&'%#&$$)*631-/0&%($),&%)(+/0-/29;88=;8EGFHFJFEFFFB===C@?;((426?=<5&':;<8&8()%76:?5.'2-,'()/&20.-3>8+$#'&.1186EBA@B>C;:-/)+...0-+%1-/3*&.)$'(&'$'1&,466663)5+<6)++,.7999;;92;9:977$+61)-124.5970<,8=:-.1--,+'*++(-***,,12@??9:2/61-)&## diff --git a/py/tests/test_fasta_parser.py b/py/tests/test_fasta_parser.py index 12ec3c94..e0f16cb6 100644 --- a/py/tests/test_fasta_parser.py +++ b/py/tests/test_fasta_parser.py @@ -1,6 +1,6 @@ import pytest import os -from dnadesign.fasta_parser import parse_fasta_from_c_file, parse_fasta_from_c_string, FastaRecord +from dnadesign.parsers import parse_fasta_from_c_file, parse_fasta_from_c_string, FastaRecord def test_parse_fasta_from_c_file(): current_dir = os.path.dirname(__file__) diff --git a/py/tests/test_fastq_parser.py b/py/tests/test_fastq_parser.py new file mode 100644 index 00000000..3cf77b49 --- /dev/null +++ b/py/tests/test_fastq_parser.py @@ -0,0 +1,44 @@ +import pytest +import os +from dnadesign.parsers import parse_fastq_from_c_file, parse_fastq_from_c_string, FastqRecord + +def test_parse_fastq_from_c_file(): + current_dir = os.path.dirname(__file__) + example_path = os.path.join(current_dir, 'data/example.fastq') + records = parse_fastq_from_c_file(example_path) + assert len(records) > 0 + assert all(isinstance(r, FastqRecord) for r in records) + +def test_parse_fastq_from_c_string(): + fastq_data = "@test\nATCG\n+\nIIII\n" + records = parse_fastq_from_c_string(fastq_data) + assert len(records) == 1 + assert records[0].identifier == "test" + assert records[0].sequence == "ATCG" + assert records[0].quality == "IIII" + assert records[0].optionals == {} + +def test_parse_fastq_with_optionals(): + fastq_data = "@test read=1 ch=2\nATCG\n+\nIIII\n" + records = parse_fastq_from_c_string(fastq_data) + assert len(records) == 1 + assert records[0].identifier == "test" + assert records[0].sequence == "ATCG" + assert records[0].quality == "IIII" + assert records[0].optionals == {"read": "1", "ch": "2"} + +def test_multiple_fastq_records(): + fastq_data = "@seq1\nACGT\n+\nHHHH\n@seq2\nTGCA\n+\nIIII\n" + records = parse_fastq_from_c_string(fastq_data) + assert len(records) == 2 + assert records[0].identifier == "seq1" + assert records[0].sequence == "ACGT" + assert records[0].quality == "HHHH" + assert records[1].identifier == "seq2" + assert records[1].sequence == "TGCA" + assert records[1].quality == "IIII" + +def test_invalid_fastq(): + invalid_fastq = "@test\nATCG\n+\nII\n" # Quality string too short + with pytest.raises(Exception): + parse_fastq_from_c_string(invalid_fastq)