Skip to content

Commit

Permalink
Import the source code
Browse files Browse the repository at this point in the history
  • Loading branch information
Tavian Barnes committed Jun 10, 2019
1 parent 7b48449 commit 0f404df
Show file tree
Hide file tree
Showing 14 changed files with 1,611 additions and 9 deletions.
40 changes: 31 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,36 @@
bistring
========

# Contributing
The bistring library provides non-destructive versions of common string processing operations like normalization, case folding, and find/replace.
Each bistring remembers the original string, and how its substrings map to substrings of the modified version.

This project welcomes contributions and suggestions. Most contributions require you to agree to a
Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
the rights to use your contribution. For details, visit https://cla.microsoft.com.
For example:

When you submit a pull request, a CLA-bot will automatically determine whether you need to provide
a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions
provided by the bot. You will only need to do this once across all repos using our CLA.
```python
>>> from bistring import bistr
>>> s = bistr('𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 𝖉𝖔𝖌')
>>> s = s.normalize('NFKD') # Unicode normalization
>>> s = s.casefold() # Case-insensitivity
>>> s = s.sub(r'[^a-z ]+', '') # Strip everything but letters and spaces
>>> s = s[:19] # Extract a substring
>>> s.modified # The modified substring, after changes
'the quick brown fox'
>>> s.original # The original substring, before changes
'𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 𝖋𝖔𝖝'
```

This allows you to perform very aggressive text processing while still being able to map any part of the result back to the original text.


Contributing
------------

This project welcomes contributions and suggestions.
Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution.
For details, visit https://cla.microsoft.com.

When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment).
Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
7 changes: 7 additions & 0 deletions python/bistring/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

from ._alignment import *
from ._bistr import *
from ._builder import *
from ._token import *
250 changes: 250 additions & 0 deletions python/bistring/_alignment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT license.

# Names exported by ``from ._alignment import *`` (as done in the package's
# __init__.py): only the Alignment class is public.
__all__ = ["Alignment"]

import bisect
from typing import Iterable, List, Optional, Tuple, cast, overload

from ._typing import Bounds, Range


class Alignment:
    """
    An alignment between two related sequences.

    The alignment is stored as two parallel lists of positions of equal
    length: ``_original[n]`` in the original sequence corresponds to
    ``_modified[n]`` in the modified sequence.  The public constructor
    enforces that both lists are non-decreasing and non-empty.
    """

    __slots__ = ("_original", "_modified")

    # Parallel position lists; see the class docstring.
    _original: List[int]
    _modified: List[int]

    def __init__(self, values: "Iterable[Bounds]"):
        """
        Build an alignment from an iterable of ``(original, modified)`` pairs.

        Consecutive duplicate pairs are collapsed into one.

        :raises ValueError: if either position moves backwards, or if
            ``values`` yields no pairs at all.
        """
        self._original = []
        self._modified = []
        for i, j in values:
            if self._original:
                if i < self._original[-1]:
                    raise ValueError("Original sequence position moved backwards")
                elif j < self._modified[-1]:
                    raise ValueError("Modified sequence position moved backwards")
                elif i == self._original[-1] and j == self._modified[-1]:
                    # Exact repeat of the previous pair: nothing new to record.
                    continue

            self._original.append(i)
            self._modified.append(j)

        if not self._original:
            raise ValueError("No sequence positions to align")

    @classmethod
    def _create(cls, original: List[int], modified: List[int]) -> "Alignment":
        """
        Wrap two already-built position lists without validating or copying
        them (``__init__`` is bypassed).
        """
        result = super().__new__(cls)
        result._original = original
        result._modified = modified
        return result

    def __str__(self):
        """
        Return a compact human-readable form.

        Contiguous alignments are shown as ``[i:j⇋k:l]``; anything else is
        shown as a list of ``original⇋modified`` pairs.
        """
        # Separator between the original and modified sides.  Written as an
        # escape so that the two numbers can never run together (without it,
        # "[1:3" + "1:3]" would read as the ambiguous "[1:31:3]").
        sep = "\u21cb"  # ⇋

        o_first, o_last = self._original[0], self._original[-1]
        m_first, m_last = self._modified[0], self._modified[-1]
        if (
            self._original == list(range(o_first, o_last + 1))
            and self._modified == list(range(m_first, m_last + 1))
        ):
            return f"[{o_first}:{o_last}{sep}{m_first}:{m_last}]"
        else:
            return "[" + ", ".join(f"{i}{sep}{j}" for i, j in self) + "]"

    def __repr__(self):
        """
        Return an expression-like representation: ``Alignment.identity(i, j)``
        when both sides are the same contiguous run, else the full pair list.
        """
        first, last = self._original[0], self._original[-1]
        contiguous = list(range(first, last + 1))
        if self._original == contiguous and self._modified == contiguous:
            return f"Alignment.identity({first}, {last})"
        else:
            return "Alignment([" + ", ".join(map(repr, self)) + "])"

    def __eq__(self, other):
        """
        Two alignments are equal when both position lists are equal.

        Note that defining ``__eq__`` without ``__hash__`` makes instances
        unhashable, per the Python data model.
        """
        if isinstance(other, Alignment):
            return (self._original, self._modified) == (other._original, other._modified)
        else:
            return NotImplemented

    @classmethod
    def _parse_args(cls, args: Tuple) -> "Bounds":
        """
        Normalize the flexible ``*args`` accepted by the public methods into a
        ``(start, stop)`` pair.

        Accepted forms: no arguments (returns ``(None, None)``), a single
        ``range``, a single ``slice`` with both bounds given, a single tuple
        (returned as-is), a single other value ``n`` (returns ``(0, n)``), or
        two values ``start, stop``.

        :raises ValueError: for a slice with a missing start or stop.
        :raises TypeError: for more than two arguments.
        """
        count = len(args)
        if count == 0:
            return None, None
        elif count == 1:
            arg = args[0]
            if isinstance(arg, range):
                return arg.start, arg.stop
            elif isinstance(arg, slice):
                if arg.start is None or arg.stop is None:
                    raise ValueError("slice with unspecified bounds")
                return arg.start, arg.stop
            elif isinstance(arg, tuple):
                return cast("Bounds", arg)
            else:
                return 0, arg
        elif count == 2:
            return cast("Bounds", args)
        else:
            raise TypeError("Too many arguments")

    @overload
    @classmethod
    def identity(cls, length: int) -> "Alignment":
        ...

    @overload
    @classmethod
    def identity(cls, start: int, stop: int) -> "Alignment":
        ...

    @overload
    @classmethod
    def identity(cls, bounds: "Range") -> "Alignment":
        ...

    @classmethod
    def identity(cls, *args):
        """
        Create an alignment that maps every position in ``[start, stop]`` to
        itself.  Arguments are interpreted by ``_parse_args``.

        :raises TypeError: if called with no arguments (there are no bounds
            to build the alignment from).
        """
        if not args:
            # Previously this surfaced as an opaque "NoneType + int" TypeError.
            raise TypeError("identity() requires a length or explicit bounds")
        start, stop = cls._parse_args(args)
        # stop is inclusive here: the alignment has stop - start + 1 positions.
        values = list(range(start, stop + 1))
        return cls._create(values, values)

    def __iter__(self):
        """Iterate over ``(original, modified)`` position pairs."""
        return zip(self._original, self._modified)

    def __len__(self):
        """Return the number of aligned position pairs."""
        return len(self._original)

    def __getitem__(self, index):
        """
        Return the pair at an integer ``index``, or a new alignment for a
        unit-stride slice.

        :raises ValueError: for a slice whose stride is not 1.
        """
        if isinstance(index, slice):
            _, _, stride = index.indices(len(self))
            if stride != 1:
                raise ValueError("Non-unit strides not supported")
            return self._create(self._original[index], self._modified[index])
        else:
            return (self._original[index], self._modified[index])

    def shift(self, delta_o, delta_m):
        """
        Return a new alignment with ``delta_o`` added to every original
        position and ``delta_m`` added to every modified position.
        """
        return self._create(
            [o + delta_o for o in self._original],
            [m + delta_m for m in self._modified],
        )

    def _search(self, source: List[int], start: int, stop: int) -> "Bounds":
        """
        Find the indices into ``source`` that enclose ``[start, stop]``.

        Returns ``(first, last)`` where ``first`` is the rightmost index with
        ``source[first] <= start`` and ``last`` is the leftmost index at or
        after ``first`` with ``source[last] >= stop``.

        :raises IndexError: if ``start`` is below every value in ``source`` or
            ``stop`` is above every value in ``source``.
        """
        first = bisect.bisect_right(source, start)
        if first == 0:
            raise IndexError("range start too small")
        first -= 1

        last = bisect.bisect_left(source, stop, first)
        if last == len(source):
            raise IndexError("range end too big")

        return first, last

    def _bounds(self, source: List[int], target: List[int], args: Tuple) -> "Bounds":
        """
        Map a ``[start, stop]`` range of ``source`` positions to the
        corresponding bounds in ``target``.  With no arguments, the bounds of
        the whole ``target`` list are returned.
        """
        start, stop = self._parse_args(args)
        if start is None:
            i, j = 0, -1
        else:
            i, j = self._search(source, start, stop)
        return (target[i], target[j])

    def original_bounds(self, *args) -> "Bounds":
        """Return the original-sequence bounds for a modified-sequence range."""
        return self._bounds(self._modified, self._original, args)

    def original_range(self, *args) -> range:
        """Like ``original_bounds``, but returned as a ``range``."""
        return range(*self.original_bounds(*args))

    def original_slice(self, *args) -> slice:
        """Like ``original_bounds``, but returned as a ``slice``."""
        return slice(*self.original_bounds(*args))

    def modified_bounds(self, *args) -> "Bounds":
        """Return the modified-sequence bounds for an original-sequence range."""
        return self._bounds(self._original, self._modified, args)

    def modified_range(self, *args) -> range:
        """Like ``modified_bounds``, but returned as a ``range``."""
        return range(*self.modified_bounds(*args))

    def modified_slice(self, *args) -> slice:
        """Like ``modified_bounds``, but returned as a ``slice``."""
        return slice(*self.modified_bounds(*args))

    def slice_by_original(self, *args) -> "Alignment":
        """
        Return the part of this alignment that covers a range of the original
        sequence, with original positions clamped into that range.

        With no arguments the whole alignment is returned, mirroring how
        ``modified_bounds()`` treats a missing range.
        """
        if not args:
            return self._create(list(self._original), list(self._modified))

        start, stop = self._parse_args(args)
        first, last = self._search(self._original, start, stop)
        original = self._original[first:last + 1]
        # The enclosing entries may lie outside [start, stop]; clamp them.
        original = [min(max(i, start), stop) for i in original]
        modified = self._modified[first:last + 1]
        return self._create(original, modified)

    def slice_by_modified(self, *args) -> "Alignment":
        """
        Return the part of this alignment that covers a range of the modified
        sequence, with modified positions clamped into that range.

        With no arguments the whole alignment is returned, mirroring how
        ``original_bounds()`` treats a missing range.
        """
        if not args:
            return self._create(list(self._original), list(self._modified))

        start, stop = self._parse_args(args)
        first, last = self._search(self._modified, start, stop)
        original = self._original[first:last + 1]
        modified = self._modified[first:last + 1]
        # The enclosing entries may lie outside [start, stop]; clamp them.
        modified = [min(max(i, start), stop) for i in modified]
        return self._create(original, modified)

    def __add__(self, other):
        """
        Concatenate two alignments.

        If ``other`` starts with exactly the pair this alignment ends with,
        the duplicate pair is dropped.

        :raises ValueError: if ``other`` starts before this alignment ends in
            either sequence.
        """

        if not isinstance(other, Alignment):
            return NotImplemented

        o_orig = other._original
        o_mod = other._modified

        if o_orig[0] < self._original[-1]:
            raise ValueError("Original sequence position moved backwards")
        elif o_mod[0] < self._modified[-1]:
            raise ValueError("Modified sequence position moved backwards")
        elif o_orig[0] == self._original[-1] and o_mod[0] == self._modified[-1]:
            o_orig = o_orig[1:]
            o_mod = o_mod[1:]

        return self._create(self._original + o_orig, self._modified + o_mod)

    def compose(self, other: "Alignment") -> "Alignment":
        """
        Return a new alignment equivalent to applying this one first, then the
        other.

        :raises ValueError: if this alignment's modified bounds differ from
            ``other``'s original bounds.
        """

        if self.modified_bounds() != other.original_bounds():
            raise ValueError("Incompatible alignments")

        original = []
        modified = []
        i, i_max = 0, len(self)
        j, j_max = 0, len(other)

        while i < i_max:
            # Map self._original[i] to its lower bound in other
            while self._modified[i] > other._original[j]:
                j += 1
            while self._modified[i] < other._original[j] and self._modified[i + 1] <= other._original[j]:
                i += 1
            original.append(self._original[i])
            modified.append(other._modified[j])

            # Map self._original[i] to its upper bound in other (if it's different)
            while i + 1 < i_max and self._original[i] == self._original[i + 1]:
                i += 1

            needs_upper = False
            while j + 1 < j_max and self._modified[i] >= other._original[j + 1]:
                needs_upper = True
                j += 1
            if needs_upper:
                original.append(self._original[i])
                modified.append(other._modified[j])

            i += 1

        return self._create(original, modified)

    def inverse(self) -> "Alignment":
        """
        The inverse of this alignment, from the modified to the original sequence.
        """
        return self._create(self._modified, self._original)
Loading

0 comments on commit 0f404df

Please sign in to comment.