Skip to content

Commit

Permalink
Run length bwt (#440)
Browse files Browse the repository at this point in the history
* run length bwt

* better clarify mapping from original sequnce space to run space

---------

Co-authored-by: Timothy Stiles <tim@stiles.io>
  • Loading branch information
TwFlem and TimothyStiles authored Jan 24, 2024
1 parent 3248776 commit 05169d2
Show file tree
Hide file tree
Showing 5 changed files with 409 additions and 44 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added
- Basic BWT for sub-sequence count and offset for sequence alignment. Only supports exact matches for now.
- Moved `BWT`, `align`, and `mash` packages to new `search` sub-directory.
- Implemented Run-Length Burrows Wheeler Transform.


## [0.30.0] - 2023-12-18
Expand Down
239 changes: 223 additions & 16 deletions search/bwt/bwt.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,13 +186,38 @@ type BWT struct {
// represented as a list of skipEntries because the first column of
// the BWT is always lexicographically ordered. This saves time and memory.
firstColumnSkipList []skipEntry
// Column last column of the BWT- the actual textual representation
// of the BWT.
lastColumn waveletTree
// suffixArray an array that allows us to map a position in the first
// column to a position in the original sequence. This is needed to be
// able to extract text from the BWT.
suffixArray []int
// runLengthCompressedBWT is the compressed version of the BWT. The compression
// is for each run. For Example:
// the sequence "banana" has BWT "annb$aa"
// the run length compression of "annb$aa" is "anb$a"
// This helps us save a lot of memory while still having a search index we can
// use to align the original sequence. This allows us to understand how many
// runs of a certain character there are and where a run of a certain rank exists.
runBWTCompression waveletTree
// runStartPositions are the starting position of each run in the original sequence
// For example:
// "annb$aa" will have the runStartPositions [0, 1, 3, 4, 5]
// This helps us map our search range from "uncompressed BWT Space" to its
// "compressed BWT Run Space". With this, we can understand which runs we need
// to consider during LF mapping.
runStartPositions runInfo
// runCumulativeCounts is the cumulative count of characters for each run.
// This helps us efficiently lookup the number of occurrences of a given
// character before a given offset in "uncompressed BWT Space"
// For Example:
// "annb$aa" will have the runCumulativeCounts:
// "a": [0, 1, 3],
// "n": [0, 2],
// "b": [0, 1],
// "$": [0, 1],
runCumulativeCounts map[string]runInfo

// flag for turning on BWT debugging
debug bool
}

// Count represents the number of times the provided pattern
Expand Down Expand Up @@ -269,7 +294,34 @@ func (bwt BWT) Len() int {

// GetTransform returns the last column of the BWT transform of the original sequence.
func (bwt BWT) GetTransform() string {
return bwt.lastColumn.reconstruct()
lastColumn := strings.Builder{}
lastColumn.Grow(bwt.getLenOfOriginalStringWithNullChar())
for i := 0; i < bwt.runBWTCompression.length; i++ {
currChar := bwt.runBWTCompression.Access(i)
var currCharEnd int
if i+1 >= len(bwt.runStartPositions) {
currCharEnd = bwt.getLenOfOriginalStringWithNullChar()
} else {
currCharEnd = bwt.runStartPositions[i+1]
}
for lastColumn.Len() < currCharEnd {
lastColumn.WriteByte(currChar)
}
}
return lastColumn.String()
}

//lint:ignore U1000 Ignore unused function. This is valuable for future debugging
func (bwt BWT) getFirstColumnStr() string {
firstColumn := strings.Builder{}
firstColumn.Grow(bwt.getLenOfOriginalStringWithNullChar())
for i := 0; i < len(bwt.firstColumnSkipList); i++ {
e := bwt.firstColumnSkipList[i]
for j := e.openEndedInterval.start; j < e.openEndedInterval.end; j++ {
firstColumn.WriteByte(e.char)
}
}
return firstColumn.String()
}

// getFCharPosFromOriginalSequenceCharPos looks up mapping from the original position
Expand All @@ -291,21 +343,55 @@ func (bwt BWT) getFCharPosFromOriginalSequenceCharPos(originalPos int) int {
func (bwt BWT) lfSearch(pattern string) interval {
searchRange := interval{start: 0, end: bwt.getLenOfOriginalStringWithNullChar()}
for i := 0; i < len(pattern); i++ {
if bwt.debug {
printLFDebug(bwt, searchRange, i)
}
if searchRange.end-searchRange.start <= 0 {
return interval{}
}

c := pattern[len(pattern)-1-i]
skip, ok := bwt.lookupSkipByChar(c)
if !ok {
return interval{}
}
searchRange.start = skip.openEndedInterval.start + bwt.lastColumn.Rank(c, searchRange.start)
searchRange.end = skip.openEndedInterval.start + bwt.lastColumn.Rank(c, searchRange.end)
nextStart := bwt.getNextLfSearchOffset(c, searchRange.start)
nextEnd := bwt.getNextLfSearchOffset(c, searchRange.end)
searchRange.start = nextStart
searchRange.end = nextEnd
}
return searchRange
}

func (bwt BWT) getNextLfSearchOffset(c byte, offset int) int {
nearestRunStart := bwt.runStartPositions.FindNearestRunStartPosition(offset + 1)
maxRunInCompressedSpace := bwt.runBWTCompression.Rank(c, nearestRunStart)

skip, ok := bwt.lookupSkipByChar(c)
if !ok {
return 0
}

cumulativeCounts, ok := bwt.runCumulativeCounts[string(c)]
if !ok {
return 0
}

cumulativeCountBeforeMaxRun := cumulativeCounts[maxRunInCompressedSpace]

currRunStart := bwt.runStartPositions.FindNearestRunStartPosition(offset)
currentRunChar := string(bwt.runBWTCompression.Access(currRunStart))
extraOffset := 0
// It is possible that an offset currently lies within a run of the same
// character we are inspecting. In this case, cumulativeCountBeforeMaxRun
// is not enough since the Max Run in this case does not include the run
// the offset is currently in. To adjust for this, we must count the number
// of character occurrences since the beginning of the run that the offset
// is currently in.
if c == currentRunChar[0] {
o := bwt.runStartPositions[nearestRunStart]
extraOffset += offset - o
}

return skip.openEndedInterval.start + cumulativeCountBeforeMaxRun + extraOffset
}

// lookupSkipByChar looks up a skipEntry by its character in the First Column
func (bwt BWT) lookupSkipByChar(c byte) (entry skipEntry, ok bool) {
for i := range bwt.firstColumnSkipList {
Expand Down Expand Up @@ -372,30 +458,64 @@ func New(sequence string) (BWT, error) {
sortPrefixArray(prefixArray)

suffixArray := make([]int, len(sequence))
lastColBuilder := strings.Builder{}
charCount := 0
runBWTCompressionBuilder := strings.Builder{}
var runStartPositions runInfo
runCumulativeCounts := make(map[string]runInfo)

var prevChar *byte
for i := 0; i < len(prefixArray); i++ {
currChar := sequence[getBWTIndex(len(sequence), len(prefixArray[i]))]
lastColBuilder.WriteByte(currChar)
if prevChar == nil {
prevChar = &currChar
}

if currChar != *prevChar {
runBWTCompressionBuilder.WriteByte(*prevChar)
runStartPositions = append(runStartPositions, i-charCount)
addRunCumulativeCountEntry(runCumulativeCounts, *prevChar, charCount)

charCount = 0
prevChar = &currChar
}

charCount++
suffixArray[i] = len(sequence) - len(prefixArray[i])
}
runBWTCompressionBuilder.WriteByte(*prevChar)
runStartPositions = append(runStartPositions, len(prefixArray)-charCount)
addRunCumulativeCountEntry(runCumulativeCounts, *prevChar, charCount)

fb := strings.Builder{}
for i := 0; i < len(prefixArray); i++ {
fb.WriteByte(prefixArray[i][0])
}

wt, err := newWaveletTreeFromString(lastColBuilder.String())
skipList := buildSkipList(prefixArray)

wt, err := newWaveletTreeFromString(runBWTCompressionBuilder.String())
if err != nil {
return BWT{}, err
}

return BWT{
firstColumnSkipList: buildSkipList(prefixArray),
lastColumn: wt,
firstColumnSkipList: skipList,
suffixArray: suffixArray,
runBWTCompression: wt,
runStartPositions: runStartPositions,
runCumulativeCounts: runCumulativeCounts,
}, nil
}

func addRunCumulativeCountEntry(rumCumulativeCounts map[string]runInfo, char byte, charCount int) {
cumulativeCountsOfChar, ok := rumCumulativeCounts[string(char)]
if ok {
cumulativeCountsOfChar = append(cumulativeCountsOfChar, charCount+cumulativeCountsOfChar[len(cumulativeCountsOfChar)-1])
} else {
cumulativeCountsOfChar = runInfo{0, charCount}
}
rumCumulativeCounts[string(char)] = cumulativeCountsOfChar
}

// buildSkipList compressed the First Column of the BWT into a skip list
func buildSkipList(prefixArray []string) []skipEntry {
prevChar := prefixArray[0][0]
Expand Down Expand Up @@ -457,6 +577,38 @@ func bwtRecovery(operation string, err *error) {
}
}

// runInfo each element of runInfo should represent an offset i where i
// corresponds to the start of a run in a given sequence. For example,
// aaaabbccc would have the run info [0, 4, 6]
type runInfo []int

// FindNearestRunStartPosition given some offset, find the nearest starting position for the.
// beginning of a run. Another way of saying this is give me the max i where runStartPositions[i] <= offset.
// This is needed so we can understand which run an offset is a part of.
func (r runInfo) FindNearestRunStartPosition(offset int) int {
start := 0
end := len(r) - 1
for start < end {
mid := start + (end-start)/2
if r[mid] < offset {
start = mid + 1
continue
}
if r[mid] > offset {
end = mid - 1
continue
}

return mid
}

if r[start] > offset {
return start - 1
}

return start
}

func isValidPattern(s string) (err error) {
if len(s) == 0 {
return errors.New("Pattern can not be empty")
Expand All @@ -480,3 +632,58 @@ func validateSequenceBeforeTransforming(sequence *string) (err error) {
}
return nil
}

// printLFDebug this will print the first column and last column of the BWT along with some ascii visualizations.
// This is very helpful for debugging the LF mapping. For example, lets say you're in the middle of making some changes to the LF
// mapping and the test for counting starts to fail. To understand where the LF search is going wrong, you
// can do something like the below to outline which parts of the BWT are being searched some given iteration.
//
// For Example, if you had the BWT of:
// "rowrowrowyourboat"
// and wanted to Count the number of occurrences of "row"
// Then the iterations of the LF search would look something like:
//
// BWT Debug Begin Iteration: 0
// torbyrrru$wwaoooow
// $abooooorrrrtuwwwy
// ^^^^^^^^^^^^^^^^^^X
//
// BWT Debug Begin Iteration: 1
// torbyrrru$wwaoooow
// $abooooorrrrtuwwwy
// ______________^^^X
//
// BWT Debug Begin Iteration: 2
// torbyrrru$wwaoooow
// $abooooorrrrtuwwwy
// _____^^^X
//
// Where:
// * '^' denotes the active search range
// * 'X' denotes one character after the end of the active search searchRange
// * '_' is visual padding to help align the active search range
//
// NOTE: It can also be helpful to include the other auxiliary data structures. For example, it can be very helpful to include
// a similar visualization for the run length compression to help debug and understand which run were used to compute the active
// search window during each iteration.
func printLFDebug(bwt BWT, searchRange interval, iteration int) {
first := bwt.getFirstColumnStr()
last := bwt.GetTransform()
lastRunCompression := bwt.runBWTCompression.reconstruct()

fullASCIIRange := strings.Builder{}
fullASCIIRange.Grow(searchRange.end + 1)
for i := 0; i < searchRange.start; i++ {
fullASCIIRange.WriteRune('_')
}
for i := searchRange.start; i < searchRange.end; i++ {
fullASCIIRange.WriteRune('^')
}
fullASCIIRange.WriteRune('X')

fmt.Println("BWT Debug Begin Iteration:", iteration)
fmt.Println(last)
fmt.Println(first)
fmt.Println(fullASCIIRange.String())
fmt.Println(lastRunCompression)
}
Loading

0 comments on commit 05169d2

Please sign in to comment.