Run length bwt (#440)

* run length bwt * better clarify mapping from original sequnce space to run space --------- Co-authored-by: Timothy Stiles <tim@stiles.io>
bebop · Jan 24, 2024 · 05169d2 · 05169d2
1 parent 3248776
commit 05169d2
Show file tree

Hide file tree

Showing 5 changed files with 409 additions and 44 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Basic BWT for sub-sequence count and offset for sequence alignment. Only supports exact matches for now.
 - Moved `BWT`, `align`, and `mash` packages to new `search` sub-directory.
+- Implemented Run-Length Burrows Wheeler Transform.
 
 
 ## [0.30.0] - 2023-12-18

diff --git a/search/bwt/bwt.go b/search/bwt/bwt.go
@@ -186,13 +186,38 @@ type BWT struct {
 	// represented as a list of skipEntries because the first column of
 	// the BWT is always lexicographically ordered. This saves time and memory.
 	firstColumnSkipList []skipEntry
-	// Column last column of the BWT- the actual textual representation
-	// of the BWT.
-	lastColumn waveletTree
 	// suffixArray an array that allows us to map a position in the first
 	// column to a position in the original sequence. This is needed to be
 	// able to extract text from the BWT.
 	suffixArray []int
+	// runLengthCompressedBWT is the compressed version of the BWT. The compression
+	// is for each run. For Example:
+	// the sequence "banana" has BWT "annb$aa"
+	// the run length compression of "annb$aa" is "anb$a"
+	// This helps us save a lot of memory while still having a search index we can
+	// use to align the original sequence. This allows us to understand how many
+	// runs of a certain character there are and where a run of a certain rank exists.
+	runBWTCompression waveletTree
+	// runStartPositions are the starting position of each run in the original sequence
+	// For example:
+	// "annb$aa" will have the runStartPositions [0, 1, 3, 4, 5]
+	// This helps us map our search range from "uncompressed BWT Space" to its
+	// "compressed BWT Run Space". With this, we can understand which runs we need
+	// to consider during LF mapping.
+	runStartPositions runInfo
+	// runCumulativeCounts is the cumulative count of characters for each run.
+	// This helps us efficiently lookup the number of occurrences of a given
+	// character before a given offset in "uncompressed BWT Space"
+	// For Example:
+	// "annb$aa" will have the runCumulativeCounts:
+	//   "a": [0, 1, 3],
+	//   "n": [0, 2],
+	//   "b": [0, 1],
+	//   "$": [0, 1],
+	runCumulativeCounts map[string]runInfo
+
+	// flag for turning on BWT debugging
+	debug bool
 }
 
 // Count represents the number of times the provided pattern
@@ -269,7 +294,34 @@ func (bwt BWT) Len() int {
 
 // GetTransform returns the last column of the BWT transform of the original sequence.
 func (bwt BWT) GetTransform() string {
-	return bwt.lastColumn.reconstruct()
+	lastColumn := strings.Builder{}
+	lastColumn.Grow(bwt.getLenOfOriginalStringWithNullChar())
+	for i := 0; i < bwt.runBWTCompression.length; i++ {
+		currChar := bwt.runBWTCompression.Access(i)
+		var currCharEnd int
+		if i+1 >= len(bwt.runStartPositions) {
+			currCharEnd = bwt.getLenOfOriginalStringWithNullChar()
+		} else {
+			currCharEnd = bwt.runStartPositions[i+1]
+		}
+		for lastColumn.Len() < currCharEnd {
+			lastColumn.WriteByte(currChar)
+		}
+	}
+	return lastColumn.String()
+}
+
+//lint:ignore U1000 Ignore unused function. This is valuable for future debugging
+func (bwt BWT) getFirstColumnStr() string {
+	firstColumn := strings.Builder{}
+	firstColumn.Grow(bwt.getLenOfOriginalStringWithNullChar())
+	for i := 0; i < len(bwt.firstColumnSkipList); i++ {
+		e := bwt.firstColumnSkipList[i]
+		for j := e.openEndedInterval.start; j < e.openEndedInterval.end; j++ {
+			firstColumn.WriteByte(e.char)
+		}
+	}
+	return firstColumn.String()
 }
 
 // getFCharPosFromOriginalSequenceCharPos looks up mapping from the original position
@@ -291,21 +343,55 @@ func (bwt BWT) getFCharPosFromOriginalSequenceCharPos(originalPos int) int {
 func (bwt BWT) lfSearch(pattern string) interval {
 	searchRange := interval{start: 0, end: bwt.getLenOfOriginalStringWithNullChar()}
 	for i := 0; i < len(pattern); i++ {
+		if bwt.debug {
+			printLFDebug(bwt, searchRange, i)
+		}
 		if searchRange.end-searchRange.start <= 0 {
 			return interval{}
 		}
 
 		c := pattern[len(pattern)-1-i]
-		skip, ok := bwt.lookupSkipByChar(c)
-		if !ok {
-			return interval{}
-		}
-		searchRange.start = skip.openEndedInterval.start + bwt.lastColumn.Rank(c, searchRange.start)
-		searchRange.end = skip.openEndedInterval.start + bwt.lastColumn.Rank(c, searchRange.end)
+		nextStart := bwt.getNextLfSearchOffset(c, searchRange.start)
+		nextEnd := bwt.getNextLfSearchOffset(c, searchRange.end)
+		searchRange.start = nextStart
+		searchRange.end = nextEnd
 	}
 	return searchRange
 }
 
+func (bwt BWT) getNextLfSearchOffset(c byte, offset int) int {
+	nearestRunStart := bwt.runStartPositions.FindNearestRunStartPosition(offset + 1)
+	maxRunInCompressedSpace := bwt.runBWTCompression.Rank(c, nearestRunStart)
+
+	skip, ok := bwt.lookupSkipByChar(c)
+	if !ok {
+		return 0
+	}
+
+	cumulativeCounts, ok := bwt.runCumulativeCounts[string(c)]
+	if !ok {
+		return 0
+	}
+
+	cumulativeCountBeforeMaxRun := cumulativeCounts[maxRunInCompressedSpace]
+
+	currRunStart := bwt.runStartPositions.FindNearestRunStartPosition(offset)
+	currentRunChar := string(bwt.runBWTCompression.Access(currRunStart))
+	extraOffset := 0
+	// It is possible that an offset currently lies within a run of the same
+	// character we are inspecting. In this case, cumulativeCountBeforeMaxRun
+	// is not enough since the Max Run in this case does not include the run
+	// the offset is currently in. To adjust for this, we must count the number
+	// of character occurrences since the beginning of the run that the offset
+	// is currently in.
+	if c == currentRunChar[0] {
+		o := bwt.runStartPositions[nearestRunStart]
+		extraOffset += offset - o
+	}
+
+	return skip.openEndedInterval.start + cumulativeCountBeforeMaxRun + extraOffset
+}
+
 // lookupSkipByChar looks up a skipEntry by its character in the First Column
 func (bwt BWT) lookupSkipByChar(c byte) (entry skipEntry, ok bool) {
 	for i := range bwt.firstColumnSkipList {
@@ -372,30 +458,64 @@ func New(sequence string) (BWT, error) {
 	sortPrefixArray(prefixArray)
 
 	suffixArray := make([]int, len(sequence))
-	lastColBuilder := strings.Builder{}
+	charCount := 0
+	runBWTCompressionBuilder := strings.Builder{}
+	var runStartPositions runInfo
+	runCumulativeCounts := make(map[string]runInfo)
+
+	var prevChar *byte
 	for i := 0; i < len(prefixArray); i++ {
 		currChar := sequence[getBWTIndex(len(sequence), len(prefixArray[i]))]
-		lastColBuilder.WriteByte(currChar)
+		if prevChar == nil {
+			prevChar = &currChar
+		}
+
+		if currChar != *prevChar {
+			runBWTCompressionBuilder.WriteByte(*prevChar)
+			runStartPositions = append(runStartPositions, i-charCount)
+			addRunCumulativeCountEntry(runCumulativeCounts, *prevChar, charCount)
 
+			charCount = 0
+			prevChar = &currChar
+		}
+
+		charCount++
 		suffixArray[i] = len(sequence) - len(prefixArray[i])
 	}
+	runBWTCompressionBuilder.WriteByte(*prevChar)
+	runStartPositions = append(runStartPositions, len(prefixArray)-charCount)
+	addRunCumulativeCountEntry(runCumulativeCounts, *prevChar, charCount)
+
 	fb := strings.Builder{}
 	for i := 0; i < len(prefixArray); i++ {
 		fb.WriteByte(prefixArray[i][0])
 	}
 
-	wt, err := newWaveletTreeFromString(lastColBuilder.String())
+	skipList := buildSkipList(prefixArray)
+
+	wt, err := newWaveletTreeFromString(runBWTCompressionBuilder.String())
 	if err != nil {
 		return BWT{}, err
 	}
-
 	return BWT{
-		firstColumnSkipList: buildSkipList(prefixArray),
-		lastColumn:          wt,
+		firstColumnSkipList: skipList,
 		suffixArray:         suffixArray,
+		runBWTCompression:   wt,
+		runStartPositions:   runStartPositions,
+		runCumulativeCounts: runCumulativeCounts,
 	}, nil
 }
 
+func addRunCumulativeCountEntry(rumCumulativeCounts map[string]runInfo, char byte, charCount int) {
+	cumulativeCountsOfChar, ok := rumCumulativeCounts[string(char)]
+	if ok {
+		cumulativeCountsOfChar = append(cumulativeCountsOfChar, charCount+cumulativeCountsOfChar[len(cumulativeCountsOfChar)-1])
+	} else {
+		cumulativeCountsOfChar = runInfo{0, charCount}
+	}
+	rumCumulativeCounts[string(char)] = cumulativeCountsOfChar
+}
+
 // buildSkipList compressed the First Column of the BWT into a skip list
 func buildSkipList(prefixArray []string) []skipEntry {
 	prevChar := prefixArray[0][0]
@@ -457,6 +577,38 @@ func bwtRecovery(operation string, err *error) {
 	}
 }
 
+// runInfo each element of runInfo should represent an offset i where i
+// corresponds to the start of a run in a given sequence. For example,
+// aaaabbccc would have the run info [0, 4, 6]
+type runInfo []int
+
+// FindNearestRunStartPosition given some offset, find the nearest starting position for the.
+// beginning of a run. Another way of saying this is give me the max i where runStartPositions[i] <= offset.
+// This is needed so we can understand which run an offset is a part of.
+func (r runInfo) FindNearestRunStartPosition(offset int) int {
+	start := 0
+	end := len(r) - 1
+	for start < end {
+		mid := start + (end-start)/2
+		if r[mid] < offset {
+			start = mid + 1
+			continue
+		}
+		if r[mid] > offset {
+			end = mid - 1
+			continue
+		}
+
+		return mid
+	}
+
+	if r[start] > offset {
+		return start - 1
+	}
+
+	return start
+}
+
 func isValidPattern(s string) (err error) {
 	if len(s) == 0 {
 		return errors.New("Pattern can not be empty")
@@ -480,3 +632,58 @@ func validateSequenceBeforeTransforming(sequence *string) (err error) {
 	}
 	return nil
 }
+
+// printLFDebug this will print the first column and last column of the BWT along with some ascii visualizations.
+// This is very helpful for debugging the LF mapping. For example, lets say you're in the middle of making some changes to the LF
+// mapping and the test for counting starts to fail. To understand where the LF search is going wrong, you
+// can do something like the below to outline which parts of the BWT are being searched some given iteration.
+//
+// For Example, if you had the BWT of:
+// "rowrowrowyourboat"
+// and wanted to Count the number of occurrences of "row"
+// Then the iterations of the LF search would look something like:
+//
+// BWT Debug Begin Iteration: 0
+// torbyrrru$wwaoooow
+// $abooooorrrrtuwwwy
+// ^^^^^^^^^^^^^^^^^^X
+//
+// BWT Debug Begin Iteration: 1
+// torbyrrru$wwaoooow
+// $abooooorrrrtuwwwy
+// ______________^^^X
+//
+// BWT Debug Begin Iteration: 2
+// torbyrrru$wwaoooow
+// $abooooorrrrtuwwwy
+// _____^^^X
+//
+// Where:
+// * '^' denotes the active search range
+// * 'X' denotes one character after the end of the active search searchRange
+// * '_' is visual padding to help align the active search range
+//
+// NOTE: It can also be helpful to include the other auxiliary data structures. For example, it can be very helpful to include
+// a similar visualization for the run length compression to help debug and understand which run were used to compute the active
+// search window during each iteration.
+func printLFDebug(bwt BWT, searchRange interval, iteration int) {
+	first := bwt.getFirstColumnStr()
+	last := bwt.GetTransform()
+	lastRunCompression := bwt.runBWTCompression.reconstruct()
+
+	fullASCIIRange := strings.Builder{}
+	fullASCIIRange.Grow(searchRange.end + 1)
+	for i := 0; i < searchRange.start; i++ {
+		fullASCIIRange.WriteRune('_')
+	}
+	for i := searchRange.start; i < searchRange.end; i++ {
+		fullASCIIRange.WriteRune('^')
+	}
+	fullASCIIRange.WriteRune('X')
+
+	fmt.Println("BWT Debug Begin Iteration:", iteration)
+	fmt.Println(last)
+	fmt.Println(first)
+	fmt.Println(fullASCIIRange.String())
+	fmt.Println(lastRunCompression)
+}