Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updated uniprot to be a part of the generic parsers #22

Merged
merged 3 commits into from
Dec 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions lib/bio/bio.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/koeng101/dnadesign/lib/bio/genbank"
"github.com/koeng101/dnadesign/lib/bio/pileup"
"github.com/koeng101/dnadesign/lib/bio/slow5"
"github.com/koeng101/dnadesign/lib/bio/uniprot"
"golang.org/x/sync/errgroup"
)

Expand Down Expand Up @@ -143,6 +144,16 @@ func NewPileupParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*
return &Parser[*pileup.Line, *pileup.Header]{parserInterface: pileup.NewParser(r, maxLineLength)}, nil
}

// NewUniprotParser builds a generic Parser over Uniprot entries read from r.
// No maxLineLength is needed. The reader is expected to supply a gzipped xml
// file. If the underlying uniprot parser cannot be created, the error is
// returned together with an empty Parser.
func NewUniprotParser(r io.Reader) (*Parser[*uniprot.Entry, *uniprot.Header], error) {
	wrapped := &Parser[*uniprot.Entry, *uniprot.Header]{}
	uniprotParser, err := uniprot.NewParser(r)
	if err != nil {
		return wrapped, err
	}
	wrapped.parserInterface = uniprotParser
	return wrapped, nil
}

/******************************************************************************

Parser higher-level functions
Expand Down
106 changes: 106 additions & 0 deletions lib/bio/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (

"github.com/koeng101/dnadesign/lib/bio"
"github.com/koeng101/dnadesign/lib/bio/fasta"
"github.com/koeng101/dnadesign/lib/bio/uniprot"
)

// Example_read shows an example of reading a file from disk.
Expand Down Expand Up @@ -288,3 +289,108 @@ seq1 279 C 23 A..T,,.,.,...,,,.,..... 75&<<<<<<<<<=<<<9<<:<<<`)
fmt.Println(lines[1].Quality)
// Output: <<<;<<<<<<<<<3<=<<<;<<+
}

// ExampleNewUniprotParser gzips a single Swiss-Prot XML entry in memory,
// parses it, and prints the first accession of the parsed entry.
func ExampleNewUniprotParser() {
// The following is a real entry in Swiss-Prot. We're going to gzip it and
// put the gzipped text as an io.Reader to mock a file. You can edit the
// text here to see how the parser works.
uniprotEntryText := `<entry dataset="Swiss-Prot" created="2009-05-05" modified="2020-08-12" version="9" xmlns="http://uniprot.org/uniprot">
<accession>P0C9F0</accession>
<name>1001R_ASFK5</name>
<protein>
<recommendedName>
<fullName>Protein MGF 100-1R</fullName>
</recommendedName>
</protein>
<gene>
<name type="ordered locus">Ken-018</name>
</gene>
<organism>
<name type="scientific">African swine fever virus (isolate Pig/Kenya/KEN-50/1950)</name>
<name type="common">ASFV</name>
<dbReference type="NCBI Taxonomy" id="561445"/>
<lineage>
<taxon>Viruses</taxon>
<taxon>Varidnaviria</taxon>
<taxon>Bamfordvirae</taxon>
<taxon>Nucleocytoviricota</taxon>
<taxon>Pokkesviricetes</taxon>
<taxon>Asfuvirales</taxon>
<taxon>Asfarviridae</taxon>
<taxon>Asfivirus</taxon>
</lineage>
</organism>
<organismHost>
<name type="scientific">Ornithodoros</name>
<name type="common">relapsing fever ticks</name>
<dbReference type="NCBI Taxonomy" id="6937"/>
</organismHost>
<organismHost>
<name type="scientific">Phacochoerus aethiopicus</name>
<name type="common">Warthog</name>
<dbReference type="NCBI Taxonomy" id="85517"/>
</organismHost>
<organismHost>
<name type="scientific">Phacochoerus africanus</name>
<name type="common">Warthog</name>
<dbReference type="NCBI Taxonomy" id="41426"/>
</organismHost>
<organismHost>
<name type="scientific">Potamochoerus larvatus</name>
<name type="common">Bushpig</name>
<dbReference type="NCBI Taxonomy" id="273792"/>
</organismHost>
<organismHost>
<name type="scientific">Sus scrofa</name>
<name type="common">Pig</name>
<dbReference type="NCBI Taxonomy" id="9823"/>
</organismHost>
<reference key="1">
<citation type="submission" date="2003-03" db="EMBL/GenBank/DDBJ databases">
<title>African swine fever virus genomes.</title>
<authorList>
<person name="Kutish G.F."/>
<person name="Rock D.L."/>
</authorList>
</citation>
<scope>NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA]</scope>
</reference>
<comment type="function">
<text evidence="1">Plays a role in virus cell tropism, and may be required for efficient virus replication in macrophages.</text>
</comment>
<comment type="similarity">
<text evidence="2">Belongs to the asfivirus MGF 100 family.</text>
</comment>
<dbReference type="EMBL" id="AY261360">
<property type="status" value="NOT_ANNOTATED_CDS"/>
<property type="molecule type" value="Genomic_DNA"/>
</dbReference>
<dbReference type="Proteomes" id="UP000000861">
<property type="component" value="Genome"/>
</dbReference>
<proteinExistence type="inferred from homology"/>
<feature type="chain" id="PRO_0000373170" description="Protein MGF 100-1R">
<location>
<begin position="1"/>
<end position="122"/>
</location>
</feature>
<evidence type="ECO:0000250" key="1"/>
<evidence type="ECO:0000305" key="2"/>
<sequence length="122" mass="14969" checksum="C5E63C34B941711C" modified="2009-05-05" version="1">MVRLFYNPIKYLFYRRSCKKRLRKALKKLNFYHPPKECCQIYRLLENAPGGTYFITENMTNELIMIAKDPVDKKIKSVKLYLTGNYIKINQHYYINIYMYLMRYNQIYKYPLICFSKYSKIL</sequence>
</entry>`
// Encode the string into a gzip io.Reader. The write and close errors are
// discarded to keep the example short.
var buf bytes.Buffer
gz := gzip.NewWriter(&buf)
_, _ = gz.Write([]byte(uniprotEntryText))
_ = gz.Close()

// Wrap the compressed bytes in a reader, standing in for an opened file.
r := bytes.NewReader(buf.Bytes())

// Now we load the parser, and get the first entry out.
// NOTE(review): this example is named for NewUniprotParser, but the parser
// is built with uniprot.NewParser directly — confirm whether
// bio.NewUniprotParser was meant to be exercised here.
parser, _ := uniprot.NewParser(r)
entry, _ := parser.Next()

fmt.Println(entry.Accession[0])
// Output: P0C9F0
}
12 changes: 6 additions & 6 deletions lib/bio/uniprot/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package uniprot_test

import (
"fmt"
"os"

"github.com/koeng101/dnadesign/lib/bio/uniprot"
)
Expand All @@ -10,12 +11,11 @@ import (
// into a list. Directly using the channel without converting to an array
// should be used for the Trembl data dump
func Example_basic() {
entries, _, _ := uniprot.Read("data/uniprot_sprot_mini.xml.gz")
uniprotFile, _ := os.Open("data/uniprot_sprot_mini.xml.gz")
defer uniprotFile.Close()
parser, _ := uniprot.NewParser(uniprotFile)
entry, _ := parser.Next()

var entry uniprot.Entry
for singleEntry := range entries {
entry = singleEntry
}
fmt.Println(entry.Accession[0])
// Output: O55723
// Output: P0C9F0
}
164 changes: 102 additions & 62 deletions lib/bio/uniprot/uniprot.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,91 +17,131 @@ Each protein in Uniprot is known as an "Entry" (as defined in xml.go).
NewParser wraps a reader over a gzipped Uniprot XML dump in a Parser, and
Parser.Next stream-reads the dump one Entry at a time, so you can use the
entries however you want.

(1) Opinion of Keoni Gandall as of May 18, 2021
(2) https://www.uniprot.org/downloads
(3) https://www.uniprot.org/docs/uniprot.xsd
*/
package uniprot

import (
"compress/gzip"
"encoding/json"
"encoding/xml"
"os"
"io"
)

/******************************************************************************
May 18, 2021

Uniprot is a comprehensive, high-quality, and freely accessible resource of protein
sequence and functional information. It is the best(1) protein database out there.

Uniprot database dumps are available as gzipped FASTA files or gzipped XML files.
The XML files have significantly more information than the FASTA files, and this
parser specifically works on the gzipped XML files from Uniprot.

Uniprot provides an XML schema of their data dumps(3), which is useful for
autogeneration of Golang structs. I used xsdgen(4) to automatically generate
xml.go from uniprot.xsd

Each protein in Uniprot is known as an "Entry" (as defined in xml.go).

The function Parse stream-reads Uniprot into an Entry channel, from which
you can use the entries however you want. Read simplifies reading gzipped
files from a disk into an Entry channel, essentially just preparing the reader for
Parse.

Cheers,
Keoni

(1) Opinion of Keoni Gandall as of May 18, 2021
(2) https://www.uniprot.org/downloads
(3) https://www.uniprot.org/docs/uniprot.xsd

******************************************************************************/

// Decoder is the XML decoding behaviour the Uniprot parser relies on: pulling
// the next token from the stream and decoding a whole element that begins at
// a given start tag. *xml.Decoder satisfies it.
type Decoder interface {
	// Token returns the next XML token from the input.
	Token() (xml.Token, error)
	// DecodeElement decodes the element opened by start into v.
	DecodeElement(v any, start *xml.StartElement) error
}

// Read reads a gzipped Uniprot XML dump. Failing to open the XML dump
// gives a single error, while errors encountered while decoding the XML dump
// are added to the errors channel.
func Read(path string) (chan Entry, chan error, error) {
entries := make(chan Entry, 100) // if you don't have a buffered channel, nothing will be read in loops on the channel.
decoderErrors := make(chan error, 100)
xmlFile, err := os.Open(path)
// Header is an empty struct that exists only for compatibility with the bio
// parsers. It carries no data.
type Header struct{}

// Compile-time check that *Header satisfies io.WriterTo.
var _ io.WriterTo = (*Header)(nil)

// WriteTo is a no-op kept for compatibility with the bio parsers: it writes
// nothing to w and always reports zero bytes written and a nil error.
func (h *Header) WriteTo(w io.Writer) (int64, error) {
	return 0, nil
}

// Header returns a pointer to a new, empty Header and a nil error. The
// returned Header contains no data.
func (p *Parser) Header() (*Header, error) {
	return new(Header), nil
}

// WriteTo writes an entry to an io.Writer. It specifically writes a JSON
// representation, NOT an XML representation, of the uniprot data.
func (entry *Entry) WriteTo(w io.Writer) (int64, error) {
b, err := json.Marshal(entry)
if err != nil {
return entries, decoderErrors, err
return 0, err
}
unzippedBytes, err := gzip.NewReader(xmlFile)
n, err := w.Write(b)
return int64(n), err
}

// Parser implements a bio parser with Next(). It reads Uniprot entries from
// the XML decoder it wraps.
type Parser struct {
// decoder supplies the XML tokens and the element decoding used by Next.
decoder Decoder
}

// NewParser returns a Parser that uses r as the source
// from which to parse Uniprot XML entries. It expects a gzipped file,
// as the default uniprot dump is xml.gz
func NewParser(r io.Reader) (*Parser, error) {
unzippedBytes, err := gzip.NewReader(r)
if err != nil {
return entries, decoderErrors, err
return &Parser{}, err
}
decoder := xml.NewDecoder(unzippedBytes)
go Parse(decoder, entries, decoderErrors)
return entries, decoderErrors, nil
return &Parser{decoder: decoder}, nil
}

// Parse parses Uniprot entries into a channel.
func Parse(decoder Decoder, entries chan<- Entry, errors chan<- error) {
for {
decoderToken, err := decoder.Token()
func (p *Parser) Next() (*Entry, error) {
decoderToken, err := p.decoder.Token()

if err != nil {
if err.Error() == "EOF" {
break
}
errors <- err
// Check decoding
if err != nil {
// If we are at the end of the file, return io.EOF
if err.Error() == "EOF" {
return &Entry{}, io.EOF
}
startElement, ok := decoderToken.(xml.StartElement)
if ok && startElement.Name.Local == "entry" {
var e Entry
err = decoder.DecodeElement(&e, &startElement)
if err != nil {
errors <- err
}
entries <- e
}

// Actual parsing
startElement, ok := decoderToken.(xml.StartElement)
if ok && startElement.Name.Local == "entry" {
var e Entry
err = p.decoder.DecodeElement(&e, &startElement)
if err != nil {
return &Entry{}, err
}
return &e, nil
}
close(entries)
close(errors)
return p.Next()
}

//// Read reads a gzipped Uniprot XML dump. Failing to open the XML dump
//// gives a single error, while errors encountered while decoding the XML dump
//// are added to the errors channel.
//func Read(path string) (chan Entry, chan error, error) {
// entries := make(chan Entry, 100) // if you don't have a buffered channel, nothing will be read in loops on the channel.
// decoderErrors := make(chan error, 100)
// xmlFile, err := os.Open(path)
// if err != nil {
// return entries, decoderErrors, err
// }
// unzippedBytes, err := gzip.NewReader(xmlFile)
// if err != nil {
// return entries, decoderErrors, err
// }
// decoder := xml.NewDecoder(unzippedBytes)
// go Parse(decoder, entries, decoderErrors)
// return entries, decoderErrors, nil
//}
//
//// Parse parses Uniprot entries into a channel.
//func Parse(decoder Decoder, entries chan<- Entry, errors chan<- error) {
// for {
// decoderToken, err := decoder.Token()
//
// if err != nil {
// if err.Error() == "EOF" {
// break
// }
// errors <- err
// }
// startElement, ok := decoderToken.(xml.StartElement)
// if ok && startElement.Name.Local == "entry" {
// var e Entry
// err = decoder.DecodeElement(&e, &startElement)
// if err != nil {
// errors <- err
// }
// entries <- e
// }
// }
// close(entries)
// close(errors)
//}
Loading
Loading