Koeng101 · Koeng101 · Dec 12, 2023 · Dec 12, 2023 · Dec 12, 2023 · Dec 12, 2023
diff --git a/lib/bio/bio.go b/lib/bio/bio.go
@@ -21,6 +21,7 @@ import (
 	"github.com/koeng101/dnadesign/lib/bio/genbank"
 	"github.com/koeng101/dnadesign/lib/bio/pileup"
 	"github.com/koeng101/dnadesign/lib/bio/slow5"
+	"github.com/koeng101/dnadesign/lib/bio/uniprot"
 	"golang.org/x/sync/errgroup"
 )
 
@@ -143,6 +144,16 @@ func NewPileupParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*
 	return &Parser[*pileup.Line, *pileup.Header]{parserInterface: pileup.NewParser(r, maxLineLength)}, nil
 }
 
+// NewUniprotParser creates a new bio Parser for Uniprot entries by wrapping
+// uniprot.NewParser(r). No maxLineLength is necessary. r should supply a gzipped xml file.
+func NewUniprotParser(r io.Reader) (*Parser[*uniprot.Entry, *uniprot.Header], error) {
+	parser, err := uniprot.NewParser(r)
+	if err != nil {
+		return &Parser[*uniprot.Entry, *uniprot.Header]{}, err
+	}
+	return &Parser[*uniprot.Entry, *uniprot.Header]{parserInterface: parser}, nil
+}
+
 /******************************************************************************
 
 Parser higher-level functions

diff --git a/lib/bio/example_test.go b/lib/bio/example_test.go
@@ -10,6 +10,7 @@ import (
 
 	"github.com/koeng101/dnadesign/lib/bio"
 	"github.com/koeng101/dnadesign/lib/bio/fasta"
+	"github.com/koeng101/dnadesign/lib/bio/uniprot"
 )
 
 // Example_read shows an example of reading a file from disk.
@@ -288,3 +289,108 @@ seq1 	279 	C 	23 	A..T,,.,.,...,,,.,..... 	75&<<<<<<<<<=<<<9<<:<<<`)
 	fmt.Println(lines[1].Quality)
 	// Output: <<<;<<<<<<<<<3<=<<<;<<+
 }
+
+func ExampleNewUniprotParser() {
+	// The following is a real entry in Swiss-Prot. We're going to gzip it and
+	// put the gzipped text as an io.Reader to mock a file. You can edit the
+	// text here to see how the parser works.
+	uniprotEntryText := `<entry dataset="Swiss-Prot" created="2009-05-05" modified="2020-08-12" version="9" xmlns="http://uniprot.org/uniprot">
+  <accession>P0C9F0</accession>
+  <name>1001R_ASFK5</name>
+  <protein>
+    <recommendedName>
+      <fullName>Protein MGF 100-1R</fullName>
+    </recommendedName>
+  </protein>
+  <gene>
+    <name type="ordered locus">Ken-018</name>
+  </gene>
+  <organism>
+    <name type="scientific">African swine fever virus (isolate Pig/Kenya/KEN-50/1950)</name>
+    <name type="common">ASFV</name>
+    <dbReference type="NCBI Taxonomy" id="561445"/>
+    <lineage>
+      <taxon>Viruses</taxon>
+      <taxon>Varidnaviria</taxon>
+      <taxon>Bamfordvirae</taxon>
+      <taxon>Nucleocytoviricota</taxon>
+      <taxon>Pokkesviricetes</taxon>
+      <taxon>Asfuvirales</taxon>
+      <taxon>Asfarviridae</taxon>
+      <taxon>Asfivirus</taxon>
+    </lineage>
+  </organism>
+  <organismHost>
+    <name type="scientific">Ornithodoros</name>
+    <name type="common">relapsing fever ticks</name>
+    <dbReference type="NCBI Taxonomy" id="6937"/>
+  </organismHost>
+  <organismHost>
+    <name type="scientific">Phacochoerus aethiopicus</name>
+    <name type="common">Warthog</name>
+    <dbReference type="NCBI Taxonomy" id="85517"/>
+  </organismHost>
+  <organismHost>
+    <name type="scientific">Phacochoerus africanus</name>
+    <name type="common">Warthog</name>
+    <dbReference type="NCBI Taxonomy" id="41426"/>
+  </organismHost>
+  <organismHost>
+    <name type="scientific">Potamochoerus larvatus</name>
+    <name type="common">Bushpig</name>
+    <dbReference type="NCBI Taxonomy" id="273792"/>
+  </organismHost>
+  <organismHost>
+    <name type="scientific">Sus scrofa</name>
+    <name type="common">Pig</name>
+    <dbReference type="NCBI Taxonomy" id="9823"/>
+  </organismHost>
+  <reference key="1">
+    <citation type="submission" date="2003-03" db="EMBL/GenBank/DDBJ databases">
+      <title>African swine fever virus genomes.</title>
+      <authorList>
+        <person name="Kutish G.F."/>
+        <person name="Rock D.L."/>
+      </authorList>
+    </citation>
+    <scope>NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA]</scope>
+  </reference>
+  <comment type="function">
+    <text evidence="1">Plays a role in virus cell tropism, and may be required for efficient virus replication in macrophages.</text>
+  </comment>
+  <comment type="similarity">
+    <text evidence="2">Belongs to the asfivirus MGF 100 family.</text>
+  </comment>
+  <dbReference type="EMBL" id="AY261360">
+    <property type="status" value="NOT_ANNOTATED_CDS"/>
+    <property type="molecule type" value="Genomic_DNA"/>
+  </dbReference>
+  <dbReference type="Proteomes" id="UP000000861">
+    <property type="component" value="Genome"/>
+  </dbReference>
+  <proteinExistence type="inferred from homology"/>
+  <feature type="chain" id="PRO_0000373170" description="Protein MGF 100-1R">
+    <location>
+      <begin position="1"/>
+      <end position="122"/>
+    </location>
+  </feature>
+  <evidence type="ECO:0000250" key="1"/>
+  <evidence type="ECO:0000305" key="2"/>
+  <sequence length="122" mass="14969" checksum="C5E63C34B941711C" modified="2009-05-05" version="1">MVRLFYNPIKYLFYRRSCKKRLRKALKKLNFYHPPKECCQIYRLLENAPGGTYFITENMTNELIMIAKDPVDKKIKSVKLYLTGNYIKINQHYYINIYMYLMRYNQIYKYPLICFSKYSKIL</sequence>
+</entry>`
+	// Encode the string into a gzip stream held in memory, then expose it as an io.Reader.
+	var buf bytes.Buffer
+	gz := gzip.NewWriter(&buf)
+	_, _ = gz.Write([]byte(uniprotEntryText))
+	_ = gz.Close()
+
+	r := bytes.NewReader(buf.Bytes())
+
+	// Now we load the parser and get the first entry out. NOTE(review): this calls uniprot.NewParser directly; bio.NewUniprotParser(r) is the function this example is named for.
+	parser, _ := uniprot.NewParser(r)
+	entry, _ := parser.Next()
+
+	fmt.Println(entry.Accession[0])
+	// Output: P0C9F0
+}
diff --git a/lib/bio/uniprot/example_test.go b/lib/bio/uniprot/example_test.go
@@ -2,6 +2,7 @@ package uniprot_test
 
 import (
 	"fmt"
+	"os"
 
 	"github.com/koeng101/dnadesign/lib/bio/uniprot"
 )
@@ -10,12 +11,11 @@ import (
 // into a list. Directly using the channel without converting to an array
 // should be used for the Trembl data dump
 func Example_basic() {
-	entries, _, _ := uniprot.Read("data/uniprot_sprot_mini.xml.gz")
+	uniprotFile, _ := os.Open("data/uniprot_sprot_mini.xml.gz")
+	defer uniprotFile.Close()
+	parser, _ := uniprot.NewParser(uniprotFile)
+	entry, _ := parser.Next() // a single Next call: only the first entry is read
 
-	var entry uniprot.Entry
-	for singleEntry := range entries {
-		entry = singleEntry
-	}
 	fmt.Println(entry.Accession[0])
-	// Output: O55723
+	// Output: P0C9F0
 }
diff --git a/lib/bio/uniprot/uniprot.go b/lib/bio/uniprot/uniprot.go
@@ -17,91 +17,131 @@ Each protein in Uniprot is known as an "Entry" (as defined in xml.go).
 The function Parse stream-reads Uniprot into an Entry channel, from which you
 can use the entries however you want. Read simplifies reading gzipped files
 from a disk into an Entry channel.
+
+(1) Opinion of Keoni Gandall as of May 18, 2021
+(2) https://www.uniprot.org/downloads
+(3) https://www.uniprot.org/docs/uniprot.xsd
 */
 package uniprot
 
 import (
 	"compress/gzip"
+	"encoding/json"
 	"encoding/xml"
-	"os"
+	"io"
 )
 
-/******************************************************************************
-May 18, 2021
-
-Uniprot is comprehensive, high-quality and freely accessible resource of protein
-sequence and functional information. It is the best(1) protein database out there.
-
-Uniprot database dumps are available as gzipped FASTA files or gzipped XML files.
-The XML files have significantly more information than the FASTA files, and this
-parser specifically works on the gzipped XML files from Uniprot.
-
-Uniprot provides an XML schema of their data dumps(3), which is useful for
-autogeneration of Golang structs. I used xsdgen(4) to automatically generate
-xml.go from uniprot.xsd
-
-Each protein in Uniprot is known as an "Entry" (as defined in xml.go).
-
-The function Parse stream-reads Uniprot into an Entry channel, from which
-you can use the entries however you want. Read simplifies reading gzipped
-files from a disk into an Entry channel, essentially just preparing the reader for
-Parse.
-
-Cheers,
-Keoni
-
-(1) Opinion of Keoni Gandall as of May 18, 2021
-(2) https://www.uniprot.org/downloads
-(3) https://www.uniprot.org/docs/uniprot.xsd
-
-******************************************************************************/
-
 // Decoder decodes XML elements2
 type Decoder interface {
 	DecodeElement(v interface{}, start *xml.StartElement) error
 	Token() (xml.Token, error)
 }
 
-// Read reads a gzipped Uniprot XML dump. Failing to open the XML dump
-// gives a single error, while errors encountered while decoding the XML dump
-// are added to the errors channel.
-func Read(path string) (chan Entry, chan error, error) {
-	entries := make(chan Entry, 100) // if you don't have a buffered channel, nothing will be read in loops on the channel.
-	decoderErrors := make(chan error, 100)
-	xmlFile, err := os.Open(path)
+// Header is an empty placeholder struct, needed for compatibility with bio parsers. It has no fields.
+type Header struct{}
+
+// WriteTo is a no-op, needed for compatibility with bio parsers. It writes nothing to w and always returns 0, nil.
+func (header *Header) WriteTo(w io.Writer) (int64, error) {
+	return 0, nil
+}
+
+// Header returns a pointer to an empty Header and a nil error; it never fails.
+func (p *Parser) Header() (*Header, error) {
+	return &Header{}, nil
+}
+
+// WriteTo marshals the entry with json.Marshal and writes the bytes to w, returning the count written.
+// It specifically writes a JSON representation, NOT an XML representation, of the uniprot data.
+func (entry *Entry) WriteTo(w io.Writer) (int64, error) {
+	b, err := json.Marshal(entry)
 	if err != nil {
-		return entries, decoderErrors, err
+		return 0, err
 	}
-	unzippedBytes, err := gzip.NewReader(xmlFile)
+	n, err := w.Write(b)
+	return int64(n), err
+}
+
+// Parser implements a bio parser with Next(). It reads XML tokens from the Decoder it wraps.
+type Parser struct {
+	decoder Decoder
+}
+
+// NewParser returns a Parser that uses r as the source from which to parse
+// Uniprot XML entries. It expects gzipped input, as the default uniprot dump
+// is xml.gz; an error from gzip.NewReader is returned with an empty Parser.
+func NewParser(r io.Reader) (*Parser, error) {
+	unzippedBytes, err := gzip.NewReader(r)
 	if err != nil {
-		return entries, decoderErrors, err
+		return &Parser{}, err
 	}
 	decoder := xml.NewDecoder(unzippedBytes)
-	go Parse(decoder, entries, decoderErrors)
-	return entries, decoderErrors, nil
+	return &Parser{decoder: decoder}, nil
 }
 
-// Parse parses Uniprot entries into a channel.
-func Parse(decoder Decoder, entries chan<- Entry, errors chan<- error) {
-	for {
-		decoderToken, err := decoder.Token()
+// Next scans forward to the next <entry> element and returns it decoded. It returns
+// io.EOF at end of input; any other token or decode error is returned as-is.
+func (p *Parser) Next() (*Entry, error) {
+	for {
+		decoderToken, err := p.decoder.Token()
-
 		if err != nil {
+			// Normalize end of input to io.EOF; never retry after a decoder error.
 			if err.Error() == "EOF" {
-				break
+				return &Entry{}, io.EOF
 			}
-			errors <- err
+			return &Entry{}, err
 		}
 		startElement, ok := decoderToken.(xml.StartElement)
-		if ok && startElement.Name.Local == "entry" {
-			var e Entry
-			err = decoder.DecodeElement(&e, &startElement)
-			if err != nil {
-				errors <- err
-			}
-			entries <- e
+		if !ok || startElement.Name.Local != "entry" {
+			continue // not the start of an <entry>; keep scanning
+		}
+		var e Entry
+		if err := p.decoder.DecodeElement(&e, &startElement); err != nil {
+			return &Entry{}, err
 		}
+		return &e, nil
 	}
-	close(entries)
-	close(errors)
 }
+
+//// Read reads a gzipped Uniprot XML dump. Failing to open the XML dump
+//// gives a single error, while errors encountered while decoding the XML dump
+//// are added to the errors channel.
+//func Read(path string) (chan Entry, chan error, error) {
+//	entries := make(chan Entry, 100) // if you don't have a buffered channel, nothing will be read in loops on the channel.
+//	decoderErrors := make(chan error, 100)
+//	xmlFile, err := os.Open(path)
+//	if err != nil {
+//		return entries, decoderErrors, err
+//	}
+//	unzippedBytes, err := gzip.NewReader(xmlFile)
+//	if err != nil {
+//		return entries, decoderErrors, err
+//	}
+//	decoder := xml.NewDecoder(unzippedBytes)
+//	go Parse(decoder, entries, decoderErrors)
+//	return entries, decoderErrors, nil
+//}
+//
+//// Parse parses Uniprot entries into a channel.
+//func Parse(decoder Decoder, entries chan<- Entry, errors chan<- error) {
+//	for {
+//		decoderToken, err := decoder.Token()
+//
+//		if err != nil {
+//			if err.Error() == "EOF" {
+//				break
+//			}
+//			errors <- err
+//		}
+//		startElement, ok := decoderToken.(xml.StartElement)
+//		if ok && startElement.Name.Local == "entry" {
+//			var e Entry
+//			err = decoder.DecodeElement(&e, &startElement)
+//			if err != nil {
+//				errors <- err
+//			}
+//			entries <- e
+//		}
+//	}
+//	close(entries)
+//	close(errors)
+//}