Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add svb compression to slow5 #328

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.18

require (
github.com/google/go-cmp v0.5.8
github.com/koeng101/svb v0.0.0-20230815034912-d6737f9ed8b8
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this dependency has assembly in it?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. It make it go zoom!

github.com/lunny/log v0.0.0-20160921050905-7887c61bf0de
github.com/mitchellh/go-wordwrap v1.0.1
github.com/mroth/weightedrand v0.4.1
Expand All @@ -15,6 +16,7 @@ require (

require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/intel-go/cpuid v0.0.0-20181003105527-1a4a6f06a1c6 // indirect
github.com/mattn/go-sqlite3 v1.14.13 // indirect
)

Expand Down
24 changes: 24 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,12 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg=
github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/intel-go/cpuid v0.0.0-20181003105527-1a4a6f06a1c6 h1:XboatR7lasl05yel5hNXF7kQBw2oFUGdMztcgisfhNU=
github.com/intel-go/cpuid v0.0.0-20181003105527-1a4a6f06a1c6/go.mod h1:RmeVYf9XrPRbRc3XIx0gLYA8qOFvNoPOfaEZduRlEp4=
github.com/klauspost/cpuid v1.3.1 h1:5JNjFYYQrZeKRJ0734q51WCEEn2huer72Dc7K+R/b6s=
github.com/klauspost/cpuid v1.3.1/go.mod h1:bYW4mA6ZgKPob1/Dlai2LviZJO7KGI3uoWLd42rAQw4=
github.com/koeng101/svb v0.0.0-20230815034912-d6737f9ed8b8 h1:oaPrvMY8VJEYfJ/r/VTKh2zshX/SxYHcImJRPZ++JPI=
github.com/koeng101/svb v0.0.0-20230815034912-d6737f9ed8b8/go.mod h1:/zIPMIRhcEjka8JxY3mo7jexMl4ncHLRUnv9RIiAS9E=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
Expand All @@ -16,6 +20,7 @@ github.com/mattn/go-sqlite3 v1.14.13 h1:1tj15ngiFfcZzii7yd82foL+ks+ouQcj8j/TPq3f
github.com/mattn/go-sqlite3 v1.14.13/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0=
github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0=
github.com/mmcloughlin/avo v0.0.0-20200504053806-fa88270b07e4/go.mod h1:wqKykBG2QzQDJEzvRkcS8x6MiSJkF52hXZsXcjaB3ls=
github.com/mroth/weightedrand v0.4.1 h1:rHcbUBopmi/3x4nnrvwGJBhX9d0vk+KgoLUZeDP6YyI=
github.com/mroth/weightedrand v0.4.1/go.mod h1:3p2SIcC8al1YMzGhAIoXD+r9olo/g/cdJgAD905gyNE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
Expand All @@ -25,8 +30,26 @@ github.com/sergi/go-diff v1.2.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNX
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/arch v0.0.0-20190909030613-46d78d1859ac/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0 h1:LGJsf5LRplCck6jUCH3dBL2dmycNruWNF5xugkSlfXw=
golang.org/x/exp v0.0.0-20230310171629-522b1b587ee0/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc=
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20200425043458-8463f397d07c/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
Expand All @@ -36,3 +59,4 @@ gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
lukechampine.com/blake3 v1.1.5 h1:hsACfxWvLdGmjYbWGrumQIphOvO+ZruZehWtgd2fxoM=
lukechampine.com/blake3 v1.1.5/go.mod h1:hE8RpzdO8ttZ7446CXEwDP1eu2V4z7stv0Urj1El20g=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
1 change: 1 addition & 0 deletions io/slow5/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ func ExampleNewParser() {
// run where I was testing using nanopore for doing COVID testing. It
// contains real nanopore data.
file, _ := os.Open("data/example.slow5")
defer file.Close()
// Set maxLineSize to 64kb. If you expect longer reads,
// make maxLineSize longer!
const maxLineSize = 2 * 32 * 1024
Expand Down
53 changes: 52 additions & 1 deletion io/slow5/slow5.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ import (
"sort"
"strconv"
"strings"

"github.com/koeng101/svb"
)

/******************************************************************************
Expand Down Expand Up @@ -196,7 +198,9 @@ func NewParser(r io.Reader, maxLineSize int) (*Parser, []Header, error) {
return parser, headers, nil
}

// ParseNext parses the next read from a parser.
// ParseNext parses the next read from a parser. Note: check Read.Error for
// errors that happen within an individual read, and the returned error for
// errors that happen in the parser itself.
Koeng101 marked this conversation as resolved.
Show resolved Hide resolved
func (parser *Parser) ParseNext() (Read, error) {
lineBytes, err := parser.reader.ReadSlice('\n')
if err != nil {
Expand Down Expand Up @@ -437,3 +441,50 @@ func Write(headers []Header, reads <-chan Read, output io.Writer) error {
}
return nil
}

/******************************************************************************
Aug 15, 2023

StreamVByte (svb) compression of raw signal strength is used to decrease the
overall size of records in blow5 files. In my tests using a SQLite database
that contained both slow5 and fastq files, switching from raw signal TEXT to
svb compressed BLOBs brought the total database size from 12GB to 7.1GB, a
~40% reduction in size.

This is the primary method for decreasing the size of slow5 records stored in
datastores other than blow5 (SQL databases, etc).

svb is an integer compression algorithm specialized for compressing integer
arrays, which makes it a perfect fit for raw signal strength data. When
converting from slow5 to blow5, files are additionally compressed with zstd or
zlib on top of raw signal compression with svb. Still, svb compression is where
most of the data size savings come from.

Stream VByte paper: https://doi.org/10.48550/arXiv.1709.08990

Keoni

******************************************************************************/

// SvbCompressRawSignal takes a read and converts its raw signal field to two
// arrays: a mask array and a data array. Both are needed for decompression.
func SvbCompressRawSignal(rawSignal []int16) (mask, data []byte) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we have these functions be methods of Read/another type? As it stands it's not clear to me how a client is supposed to take advantage of them.

Maybe use struct inheritance to define a new CompressedRead class and have a method that goes from CompressedRead -> Read and vice versa.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm using these functions for taking the rawSignal in and out of an SQL database, so having it be a method doesn't work as well as a raw function.

CompressedRead I think would just be a blow5 read rather than slow5 read, which we SHOULD implement at some point. (there is a whole binary format specifically for doing those compressed reads well, with some real performance improvements)

rawSignalUint32 := make([]uint32, len(rawSignal))
for idx := range rawSignal {
rawSignalUint32[idx] = uint32(rawSignal[idx])
}
return svb.Uint32Encode(rawSignalUint32)
}

// SvbDecompressRawSignal decompresses raw signal back to a []int16. It
// requires not only the mask array and data array returned by
// SvbCompressRawSignal, but also the length of the original raw signal.
//
// A lenRawSignal of zero or less yields an empty signal without touching mask
// or data. Otherwise mask and data are handed to the svb decoder unchecked,
// so they must be the pair that SvbCompressRawSignal produced for a signal of
// exactly lenRawSignal samples.
func SvbDecompressRawSignal(lenRawSignal int, mask, data []byte) []int16 {
	// A negative length would panic in make, and there is nothing to decode
	// for an empty signal.
	if lenRawSignal <= 0 {
		return []int16{}
	}
	decoded := make([]uint32, lenRawSignal)
	svb.Uint32Decode32(mask, data, decoded)
	rawSignal := make([]int16, lenRawSignal)
	for idx, value := range decoded {
		// Only the low 16 bits carry the sample; narrowing restores the sign
		// of negative samples.
		rawSignal[idx] = int16(value)
	}
	return rawSignal
}
29 changes: 29 additions & 0 deletions io/slow5/slow5_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ func TestParse(t *testing.T) {
if err != nil {
t.Errorf("Failed to open example.slow5: %s", err)
}
defer file.Close()
parser, headers, err := NewParser(file, maxLineSize)
if err != nil {
t.Errorf("Failed to parse headers of file: %s", err)
Expand Down Expand Up @@ -204,3 +205,31 @@ func TestWrite(t *testing.T) {
t.Errorf("Example and test write are different")
}
}

func TestSvb(t *testing.T) {
file, _ := os.Open("data/example.slow5")
defer file.Close()
const maxLineSize = 2 * 32 * 1024
parser, _, _ := NewParser(file, maxLineSize)
Koeng101 marked this conversation as resolved.
Show resolved Hide resolved
var outputReads []Read
for {
read, err := parser.ParseNext()
if err != nil {
Koeng101 marked this conversation as resolved.
Show resolved Hide resolved
// Break at EOF
break
}
outputReads = append(outputReads, read)
}

for readNum, read := range outputReads {
rawSignal := read.RawSignal
mask, data := SvbCompressRawSignal(rawSignal)
rawSignalDecompressed := SvbDecompressRawSignal(len(rawSignal), mask, data)
for idx := range rawSignal {
if rawSignal[idx] != rawSignalDecompressed[idx] {
t.Errorf("Read signal at readNum %d idx %d didn't match decompressed signal %d", readNum, rawSignal[idx], rawSignalDecompressed[idx])
}
}
}

}
Loading