diff --git a/lib/bio/bio.go b/lib/bio/bio.go index a50a60a..8ebbee9 100644 --- a/lib/bio/bio.go +++ b/lib/bio/bio.go @@ -21,6 +21,7 @@ import ( "github.com/koeng101/dnadesign/lib/bio/genbank" "github.com/koeng101/dnadesign/lib/bio/pileup" "github.com/koeng101/dnadesign/lib/bio/slow5" + "github.com/koeng101/dnadesign/lib/bio/uniprot" "golang.org/x/sync/errgroup" ) @@ -143,6 +144,16 @@ func NewPileupParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[* return &Parser[*pileup.Line, *pileup.Header]{parserInterface: pileup.NewParser(r, maxLineLength)}, nil } +// NewUniprotParser initiates a new Uniprot parser from an io.Reader. No +// maxLineLength is necessary. The parser should be reading a gzipped xml file. +func NewUniprotParser(r io.Reader) (*Parser[*uniprot.Entry, *uniprot.Header], error) { + parser, err := uniprot.NewParser(r) + if err != nil { + return &Parser[*uniprot.Entry, *uniprot.Header]{}, err + } + return &Parser[*uniprot.Entry, *uniprot.Header]{parserInterface: parser}, nil +} + /****************************************************************************** Parser higher-level functions diff --git a/lib/bio/example_test.go b/lib/bio/example_test.go index a6b241b..fa3f16b 100644 --- a/lib/bio/example_test.go +++ b/lib/bio/example_test.go @@ -10,6 +10,7 @@ import ( "github.com/koeng101/dnadesign/lib/bio" "github.com/koeng101/dnadesign/lib/bio/fasta" + "github.com/koeng101/dnadesign/lib/bio/uniprot" ) // Example_read shows an example of reading a file from disk. @@ -288,3 +289,108 @@ seq1 279 C 23 A..T,,.,.,...,,,.,..... 75&<<<<<<<<<=<<<9<<:<<<`) fmt.Println(lines[1].Quality) // Output: <<<;<<<<<<<<<3<=<<<;<<+ } + +func ExampleNewUniprotParser() { + // The following is a real entry in Swiss-Prot. We're going to gzip it and + // put the gzipped text as an io.Reader to mock a file. You can edit the + // text here to see how the parser works. + uniprotEntryText := ` + P0C9F0 + 1001R_ASFK5 + + + Protein MGF 100-1R + + + + Ken-018 + + + African swine fever virus (isolate Pig/Kenya/KEN-50/1950) + ASFV + + + Viruses + Varidnaviria + Bamfordvirae + Nucleocytoviricota + Pokkesviricetes + Asfuvirales + Asfarviridae + Asfivirus + + + + Ornithodoros + relapsing fever ticks + + + + Phacochoerus aethiopicus + Warthog + + + + Phacochoerus africanus + Warthog + + + + Potamochoerus larvatus + Bushpig + + + + Sus scrofa + Pig + + + + + African swine fever virus genomes. + + + + + + NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA] + + + Plays a role in virus cell tropism, and may be required for efficient virus replication in macrophages. + + + Belongs to the asfivirus MGF 100 family. + + + + + + + + + + + + + + + + + + MVRLFYNPIKYLFYRRSCKKRLRKALKKLNFYHPPKECCQIYRLLENAPGGTYFITENMTNELIMIAKDPVDKKIKSVKLYLTGNYIKINQHYYINIYMYLMRYNQIYKYPLICFSKYSKIL +` + // Encode the string into an gzip io.Reader + var buf bytes.Buffer + gz := gzip.NewWriter(&buf) + _, _ = gz.Write([]byte(uniprotEntryText)) + _ = gz.Close() + + r := bytes.NewReader(buf.Bytes()) + + // Now we load the parser, and get the first entry out. + parser, _ := uniprot.NewParser(r) + entry, _ := parser.Next() + + fmt.Println(entry.Accession[0]) + // Output: P0C9F0 +} diff --git a/lib/bio/uniprot/example_test.go b/lib/bio/uniprot/example_test.go index da6c306..8b08790 100644 --- a/lib/bio/uniprot/example_test.go +++ b/lib/bio/uniprot/example_test.go @@ -2,6 +2,7 @@ package uniprot_test import ( "fmt" + "os" "github.com/koeng101/dnadesign/lib/bio/uniprot" ) @@ -10,12 +11,11 @@ import ( // into a list. Directly using the channel without converting to an array // should be used for the Trembl data dump func Example_basic() { - entries, _, _ := uniprot.Read("data/uniprot_sprot_mini.xml.gz") + uniprotFile, _ := os.Open("data/uniprot_sprot_mini.xml.gz") + defer uniprotFile.Close() + parser, _ := uniprot.NewParser(uniprotFile) + entry, _ := parser.Next() - var entry uniprot.Entry - for singleEntry := range entries { - entry = singleEntry - } fmt.Println(entry.Accession[0]) - // Output: O55723 + // Output: P0C9F0 } diff --git a/lib/bio/uniprot/uniprot.go b/lib/bio/uniprot/uniprot.go index 996a1d4..83c6306 100644 --- a/lib/bio/uniprot/uniprot.go +++ b/lib/bio/uniprot/uniprot.go @@ -17,91 +17,131 @@ Each protein in Uniprot is known as an "Entry" (as defined in xml.go). The function Parse stream-reads Uniprot into an Entry channel, from which you can use the entries however you want. Read simplifies reading gzipped files from a disk into an Entry channel. + +(1) Opinion of Keoni Gandall as of May 18, 2021 +(2) https://www.uniprot.org/downloads +(3) https://www.uniprot.org/docs/uniprot.xsd */ package uniprot import ( "compress/gzip" + "encoding/json" "encoding/xml" - "os" + "io" ) -/****************************************************************************** -May 18, 2021 - -Uniprot is comprehensive, high-quality and freely accessible resource of protein -sequence and functional information. It is the best(1) protein database out there. - -Uniprot database dumps are available as gzipped FASTA files or gzipped XML files. -The XML files have significantly more information than the FASTA files, and this -parser specifically works on the gzipped XML files from Uniprot. - -Uniprot provides an XML schema of their data dumps(3), which is useful for -autogeneration of Golang structs. I used xsdgen(4) to automatically generate -xml.go from uniprot.xsd - -Each protein in Uniprot is known as an "Entry" (as defined in xml.go). - -The function Parse stream-reads Uniprot into an Entry channel, from which -you can use the entries however you want. Read simplifies reading gzipped -files from a disk into an Entry channel, essentially just preparing the reader for -Parse. - -Cheers, -Keoni - -(1) Opinion of Keoni Gandall as of May 18, 2021 -(2) https://www.uniprot.org/downloads -(3) https://www.uniprot.org/docs/uniprot.xsd - -******************************************************************************/ - // Decoder decodes XML elements2 type Decoder interface { DecodeElement(v interface{}, start *xml.StartElement) error Token() (xml.Token, error) } -// Read reads a gzipped Uniprot XML dump. Failing to open the XML dump -// gives a single error, while errors encountered while decoding the XML dump -// are added to the errors channel. -func Read(path string) (chan Entry, chan error, error) { - entries := make(chan Entry, 100) // if you don't have a buffered channel, nothing will be read in loops on the channel. - decoderErrors := make(chan error, 100) - xmlFile, err := os.Open(path) +// Header is a blank struct, needed for compatibility with bio parsers. It contains nothing. +type Header struct{} + +// Header_WriteTo is a blank function, needed for compatibility with bio parsers. It doesn't do anything. +func (header *Header) WriteTo(w io.Writer) (int64, error) { + return 0, nil +} + +// Header returns nil,nil. +func (p *Parser) Header() (*Header, error) { + return &Header{}, nil +} + +// Entry_WriteTo writes an entry to an io.Writer. It specifically writes a JSON +// representation, NOT an XML representation, of the uniprot data. +func (entry *Entry) WriteTo(w io.Writer) (int64, error) { + b, err := json.Marshal(entry) if err != nil { - return entries, decoderErrors, err + return 0, err } - unzippedBytes, err := gzip.NewReader(xmlFile) + n, err := w.Write(b) + return int64(n), err +} + +// Parser implements a bio parser with Next(). +type Parser struct { + decoder Decoder +} + +// NewParser returns a Parser that uses r as the source +// from which to parse fasta formatted sequences. It expects a gzipped file, +// as the default uniprot dump is xml.gz +func NewParser(r io.Reader) (*Parser, error) { + unzippedBytes, err := gzip.NewReader(r) if err != nil { - return entries, decoderErrors, err + return &Parser{}, err } decoder := xml.NewDecoder(unzippedBytes) - go Parse(decoder, entries, decoderErrors) - return entries, decoderErrors, nil + return &Parser{decoder: decoder}, nil } -// Parse parses Uniprot entries into a channel. -func Parse(decoder Decoder, entries chan<- Entry, errors chan<- error) { - for { - decoderToken, err := decoder.Token() +func (p *Parser) Next() (*Entry, error) { + decoderToken, err := p.decoder.Token() - if err != nil { - if err.Error() == "EOF" { - break - } - errors <- err + // Check decoding + if err != nil { + // If we are the end of the file, return io.EOF + if err.Error() == "EOF" { + return &Entry{}, io.EOF } - startElement, ok := decoderToken.(xml.StartElement) - if ok && startElement.Name.Local == "entry" { - var e Entry - err = decoder.DecodeElement(&e, &startElement) - if err != nil { - errors <- err - } - entries <- e + } + + // Actual parsing + startElement, ok := decoderToken.(xml.StartElement) + if ok && startElement.Name.Local == "entry" { + var e Entry + err = p.decoder.DecodeElement(&e, &startElement) + if err != nil { + return &Entry{}, err } + return &e, nil } - close(entries) - close(errors) + return p.Next() } + +//// Read reads a gzipped Uniprot XML dump. Failing to open the XML dump +//// gives a single error, while errors encountered while decoding the XML dump +//// are added to the errors channel. +//func Read(path string) (chan Entry, chan error, error) { +// entries := make(chan Entry, 100) // if you don't have a buffered channel, nothing will be read in loops on the channel. +// decoderErrors := make(chan error, 100) +// xmlFile, err := os.Open(path) +// if err != nil { +// return entries, decoderErrors, err +// } +// unzippedBytes, err := gzip.NewReader(xmlFile) +// if err != nil { +// return entries, decoderErrors, err +// } +// decoder := xml.NewDecoder(unzippedBytes) +// go Parse(decoder, entries, decoderErrors) +// return entries, decoderErrors, nil +//} +// +//// Parse parses Uniprot entries into a channel. +//func Parse(decoder Decoder, entries chan<- Entry, errors chan<- error) { +// for { +// decoderToken, err := decoder.Token() +// +// if err != nil { +// if err.Error() == "EOF" { +// break +// } +// errors <- err +// } +// startElement, ok := decoderToken.(xml.StartElement) +// if ok && startElement.Name.Local == "entry" { +// var e Entry +// err = decoder.DecodeElement(&e, &startElement) +// if err != nil { +// errors <- err +// } +// entries <- e +// } +// } +// close(entries) +// close(errors) +//} diff --git a/lib/bio/uniprot/uniprot_test.go b/lib/bio/uniprot/uniprot_test.go index ff0b1fc..c316d8a 100644 --- a/lib/bio/uniprot/uniprot_test.go +++ b/lib/bio/uniprot/uniprot_test.go @@ -1,123 +1,66 @@ -package uniprot +package uniprot_test import ( - "compress/gzip" - "encoding/xml" - "errors" - "fmt" + "io" + "io/ioutil" "os" "testing" - "github.com/stretchr/testify/assert" + "github.com/koeng101/dnadesign/lib/bio/uniprot" ) -func ExampleRead() { - entries, _, _ := Read("data/uniprot_sprot_mini.xml.gz") - - var entry Entry - for singleEntry := range entries { - entry = singleEntry - } - fmt.Println(entry.Accession[0]) - // Output: O55723 -} - -func ExampleParse() { - xmlFile, _ := os.Open("data/uniprot_sprot_mini.xml.gz") - unzippedBytes, _ := gzip.NewReader(xmlFile) - - entries := make(chan Entry, 100) // if you don't have a buffered channel, nothing will be read in loops on the channel. - decoderErrors := make(chan error, 100) - decoder := xml.NewDecoder(unzippedBytes) - go Parse(decoder, entries, decoderErrors) - - var entry Entry - for singleEntry := range entries { - entry = singleEntry - } - fmt.Println(entry.Accession[0]) - // Output: O55723 -} - func TestRead(t *testing.T) { - _, _, err := Read("data/test") + testFile, err := os.Open("data/test") + if err != nil { + t.Errorf("Should open file properly") + } + defer testFile.Close() + _, err = uniprot.NewParser(testFile) if err == nil { t.Errorf("Failed to fail on non-gzipped file") } - _, _, err = Read("data/FAKE") + _, err = os.Open("data/FAKE") if err == nil { t.Errorf("Failed to fail on empty file") } - _, errors, err := Read("data/uniprot_sprot_mini.xml.gz") + uniprotFile, err := os.Open("data/uniprot_sprot_mini.xml.gz") if err != nil { - t.Errorf("Failed on real file with error: %v", err) + t.Errorf("Should open file properly") } - - for err := range errors { + defer uniprotFile.Close() + parser, err := uniprot.NewParser(uniprotFile) + if err != nil { + t.Errorf("Parser should succeed. Got err: %s", err) + } + for { + _, err := parser.Next() if err != nil { - t.Errorf("Failed during parsing with error: %v", err) + if err == io.EOF { + break + } else { + t.Errorf("Failed to parse uniprot test file with err: %s", err) + break + } } } } -func TestParse(t *testing.T) { - t.Run("error getting a token", func(t *testing.T) { - entries := make(chan Entry, 100) - decoderErrors := make(chan error, 100) - tokenErr := errors.New("token error") - firstRun := true - decoder := &mockDecoder{ - TokenFn: func() (xml.Token, error) { - if firstRun { - firstRun = false - return nil, tokenErr - } - return nil, errors.New("EOF") - }, - } - Parse(decoder, entries, decoderErrors) - assert.EqualError(t, <-decoderErrors, tokenErr.Error()) - }) - - t.Run("error decoding after getting a token", func(t *testing.T) { - entries := make(chan Entry, 100) - decoderErrors := make(chan error, 100) - decodeErr := errors.New("decode error") - startElement := xml.StartElement{ - Name: xml.Name{ - Local: "entry", - }, - Attr: nil, - } - firstRun := true - decoder := &mockDecoder{ - DecodeElementFn: func(v interface{}, start *xml.StartElement) error { - return decodeErr - }, - TokenFn: func() (xml.Token, error) { - if firstRun { - firstRun = false - return startElement, nil - } - return nil, errors.New("EOF") - }, - } - Parse(decoder, entries, decoderErrors) - assert.EqualError(t, <-decoderErrors, decodeErr.Error()) - }) -} - -type mockDecoder struct { - DecodeElementFn func(v interface{}, start *xml.StartElement) error - TokenFn func() (xml.Token, error) -} - -func (d *mockDecoder) DecodeElement(v interface{}, start *xml.StartElement) error { - return d.DecodeElementFn(v, start) +func TestHeader(t *testing.T) { + var writer = ioutil.Discard + header := uniprot.Header{} + _, err := header.WriteTo(writer) + if err != nil { + t.Errorf("should always be nil") + } } -func (d *mockDecoder) Token() (xml.Token, error) { - return d.TokenFn() +func TestEntry(t *testing.T) { + var writer = ioutil.Discard + entry := uniprot.Entry{} + _, err := entry.WriteTo(writer) + if err != nil { + t.Errorf("should always be nil") + } }