diff --git a/README.md b/README.md index 72f1506..383b97f 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,36 @@ +bistring +======== -# Contributing +The bistring library provides non-destructive versions of common string processing operations like normalization, case folding, and find/replace. +Each bistring remembers the original string, and how its substrings map to substrings of the modified version. -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.microsoft.com. +For example: -When you submit a pull request, a CLA-bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. +```python +>>> from bistring import bistr +>>> s = bistr('๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— ๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ') +>>> s = s.normalize('NFKD') # Unicode normalization +>>> s = s.casefold() # Case-insensitivity +>>> s = s.sub(r'[^a-z ]+', '') # Strip everything but letters and spaces +>>> s = s[:19] # Extract a substring +>>> s.modified # The modified substring, after changes +'the quick brown fox' +>>> s.original # The original substring, before changes +'๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐–' +``` + +This allows you to perform very aggressive text processing completely invisibly. + + +Contributing +------------ + +This project welcomes contributions and suggestions. +Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. 
class Alignment:
    """
    A monotonic alignment between two related sequences.

    Stored as two parallel sorted position lists: the pair
    (_original[k], _modified[k]) asserts that the sequences correspond at
    those offsets.
    """

    __slots__ = ("_original", "_modified")

    _original: List[int]
    _modified: List[int]

    def __init__(self, values: Iterable[Tuple[int, int]]):
        """
        Create an alignment from an iterable of (original, modified) pairs.

        Positions must be non-decreasing on both sides; exact duplicate
        pairs are collapsed.

        Raises:
            ValueError: If a position moves backwards, or no pairs are given.
        """
        originals: List[int] = []
        modifieds: List[int] = []

        for o_pos, m_pos in values:
            if originals:
                if o_pos < originals[-1]:
                    raise ValueError("Original sequence position moved backwards")
                elif m_pos < modifieds[-1]:
                    raise ValueError("Modified sequence position moved backwards")
                elif o_pos == originals[-1] and m_pos == modifieds[-1]:
                    continue
            originals.append(o_pos)
            modifieds.append(m_pos)

        if not originals:
            raise ValueError("No sequence positions to align")

        self._original = originals
        self._modified = modifieds

    @classmethod
    def _create(cls, original: List[int], modified: List[int]) -> "Alignment":
        # Internal fast path: trusts that the lists are already validated.
        result = super().__new__(cls)
        result._original = original
        result._modified = modified
        return result

    def __str__(self):
        o_first, o_last = self._original[0], self._original[-1]
        m_first, m_last = self._modified[0], self._modified[-1]
        if (self._original == list(range(o_first, o_last + 1))
                and self._modified == list(range(m_first, m_last + 1))):
            # Both sides are contiguous: show a compact range form.
            return f"[{o_first}:{o_last}⇋{m_first}:{m_last}]"
        return "[" + ", ".join(f"{i}⇋{j}" for i, j in self) + "]"

    def __repr__(self):
        o_first, o_last = self._original[0], self._original[-1]
        identity = list(range(o_first, o_last + 1))
        if self._original == identity and self._modified == identity:
            return f"Alignment.identity({o_first}, {o_last})"
        return "Alignment([" + ", ".join(map(repr, self)) + "])"

    def __eq__(self, other):
        if not isinstance(other, Alignment):
            return NotImplemented
        return (self._original, self._modified) == (other._original, other._modified)

    @classmethod
    def _parse_args(cls, args: Tuple) -> Tuple[Optional[int], Optional[int]]:
        """
        Normalize the flexible *args accepted by the bounds/slice methods
        into a (start, stop) pair; (None, None) means "the whole sequence".
        """
        count = len(args)
        if count == 0:
            return None, None
        if count == 2:
            return cast(Tuple[int, int], args)
        if count > 2:
            raise TypeError("Too many arguments")

        arg = args[0]
        if isinstance(arg, range):
            return arg.start, arg.stop
        if isinstance(arg, slice):
            if arg.start is None or arg.stop is None:
                raise ValueError("slice with unspecified bounds")
            return arg.start, arg.stop
        if isinstance(arg, tuple):
            return cast(Tuple[int, int], arg)
        # A single int is a length: bounds are [0, length].
        return 0, arg

    @overload
    @classmethod
    def identity(cls, length: int) -> "Alignment":
        ...

    @overload
    @classmethod
    def identity(cls, start: int, stop: int) -> "Alignment":
        ...

    @overload
    @classmethod
    def identity(cls, bounds: Union[range, slice, Tuple[int, int]]) -> "Alignment":
        ...

    @classmethod
    def identity(cls, *args):
        """The identity alignment over the given bounds."""
        start, stop = cls._parse_args(args)
        points = list(range(start, stop + 1))
        return cls._create(points, points)

    def __iter__(self):
        return zip(self._original, self._modified)

    def __len__(self):
        return len(self._original)

    def __getitem__(self, index):
        if not isinstance(index, slice):
            return (self._original[index], self._modified[index])
        _, _, stride = index.indices(len(self))
        if stride != 1:
            raise ValueError("Non-unit strides not supported")
        return self._create(self._original[index], self._modified[index])

    def shift(self, delta_o, delta_m):
        """A copy of this alignment with both sides offset by the deltas."""
        return self._create(
            [o + delta_o for o in self._original],
            [m + delta_m for m in self._modified],
        )

    def _search(self, source: List[int], start: int, stop: int) -> Tuple[int, int]:
        # Find the widest pair of indices whose positions enclose [start, stop].
        first = bisect.bisect_right(source, start)
        if first == 0:
            raise IndexError("range start too small")
        first -= 1

        last = bisect.bisect_left(source, stop, first)
        if last == len(source):
            raise IndexError("range end too big")

        return first, last

    def _bounds(self, source: List[int], target: List[int], args: Tuple) -> Tuple[int, int]:
        start, stop = self._parse_args(args)
        if start is None:
            # No arguments: the whole extent.
            i, j = 0, -1
        else:
            i, j = self._search(source, start, stop)
        return (target[i], target[j])

    def original_bounds(self, *args) -> Tuple[int, int]:
        """Map a modified-side span to the enclosing original-side bounds."""
        return self._bounds(self._modified, self._original, args)

    def original_range(self, *args) -> range:
        return range(*self.original_bounds(*args))

    def original_slice(self, *args) -> slice:
        return slice(*self.original_bounds(*args))

    def modified_bounds(self, *args) -> Tuple[int, int]:
        """Map an original-side span to the enclosing modified-side bounds."""
        return self._bounds(self._original, self._modified, args)

    def modified_range(self, *args) -> range:
        return range(*self.modified_bounds(*args))

    def modified_slice(self, *args) -> slice:
        return slice(*self.modified_bounds(*args))

    def slice_by_original(self, *args) -> "Alignment":
        """The sub-alignment covering an original-side span, clamped to it."""
        start, stop = self._parse_args(args)
        first, last = self._search(self._original, start, stop)
        clamped = [min(max(o, start), stop) for o in self._original[first:last+1]]
        return self._create(clamped, self._modified[first:last+1])

    def slice_by_modified(self, *args) -> "Alignment":
        """The sub-alignment covering a modified-side span, clamped to it."""
        start, stop = self._parse_args(args)
        first, last = self._search(self._modified, start, stop)
        clamped = [min(max(m, start), stop) for m in self._modified[first:last+1]]
        return self._create(self._original[first:last+1], clamped)

    def __add__(self, other):
        """
        Concatenate two alignments.
        """
        if not isinstance(other, Alignment):
            return NotImplemented

        tail_orig = other._original
        tail_mod = other._modified

        if tail_orig[0] < self._original[-1]:
            raise ValueError("Original sequence position moved backwards")
        elif tail_mod[0] < self._modified[-1]:
            raise ValueError("Modified sequence position moved backwards")
        elif tail_orig[0] == self._original[-1] and tail_mod[0] == self._modified[-1]:
            # Drop the duplicated join point.
            tail_orig = tail_orig[1:]
            tail_mod = tail_mod[1:]

        return self._create(self._original + tail_orig, self._modified + tail_mod)

    def compose(self, other: "Alignment") -> "Alignment":
        """
        Return a new alignment equivalent to applying this one first, then
        the other.
        """
        if self.modified_bounds() != other.original_bounds():
            raise ValueError("Incompatible alignments")

        original = []
        modified = []
        i, i_max = 0, len(self)
        j, j_max = 0, len(other)

        while i < i_max:
            # Map self._original[i] to its lower bound in other
            while self._modified[i] > other._original[j]:
                j += 1
            while self._modified[i] < other._original[j] and self._modified[i + 1] <= other._original[j]:
                i += 1
            original.append(self._original[i])
            modified.append(other._modified[j])

            # Map self._original[i] to its upper bound in other (if it's different)
            while i + 1 < i_max and self._original[i] == self._original[i + 1]:
                i += 1

            needs_upper = False
            while j + 1 < j_max and self._modified[i] >= other._original[j + 1]:
                needs_upper = True
                j += 1
            if needs_upper:
                original.append(self._original[i])
                modified.append(other._modified[j])

            i += 1

        return self._create(original, modified)

    def inverse(self) -> "Alignment":
        """
        The inverse of this alignment, from the modified to the original
        sequence.
        """
        return self._create(self._modified, self._original)
+ """ + + if isinstance(original, bistr): + if modified is not None or alignment is not None: + raise ValueError("bistr copy constructor invoked with extra arguments") + return original + + if alignment is None: + if modified is None: + alignment = Alignment.identity(len(original)) + else: + alignment = Alignment([(0, 0), (len(original), len(modified))]) + + if modified is None: + modified = original + + if alignment.original_bounds() != (0, len(original)): + raise ValueError("Alignment incompatible with original string") + elif alignment.modified_bounds() != (0, len(modified)): + raise ValueError("Alignment incompatible with modified string") + + result = super().__new__(cls) + super().__setattr__(result, "original", original) + super().__setattr__(result, "modified", modified) + super().__setattr__(result, "alignment", alignment) + return result + + def __str__(self): + if self.original == self.modified: + return f"โฎŽ{self.original!r}โฎŒ" + else: + return f"({self.original!r} โ‡‹ {self.modified!r})" + + def __repr__(self): + if self.original == self.modified and self.alignment == Alignment.identity(len(self.original)): + return f"bistr({self.original!r})" + elif self.alignment == Alignment([(0, 0), (len(self.original), len(self.modified))]): + return f"bistr({self.original!r}, {self.modified!r})" + else: + return f"bistr({self.original!r}, {self.modified!r}, {self.alignment!r})" + + def __len__(self): + return len(self.modified) + + def __eq__(self, other): + if isinstance(other, bistr): + return (self.original, self.modified, self.alignment) == (other.original, other.modified, other.alignment) + else: + return NotImplemented + + def __add__(self, other): + if isinstance(other, bistr): + original = other.original + modified = other.modified + alignment = other.alignment + elif isinstance(other, str): + original = other + modified = other + alignment = Alignment.identity(len(other)) + else: + return NotImplemented + + alignment = 
alignment.shift(len(self.original), len(self.modified)) + return bistr(self.original + original, self.modified + modified, self.alignment + alignment) + + def __radd__(self, other): + if isinstance(other, str): + length = len(other) + return bistr( + other + self.original, + other + self.modified, + Alignment.identity(length) + self.alignment.shift(length, length), + ) + else: + return NotImplemented + + def __getitem__(self, index): + if isinstance(index, slice): + start, stop, stride = index.indices(len(self)) + if stride != 1: + raise ValueError("Non-unit strides not supported") + + modified = self.modified[start:stop] + original = self.original[self.alignment.original_slice(start, stop)] + alignment = self.alignment.slice_by_modified(start, stop) + alignment = alignment.shift(-alignment[0][0], -alignment[0][1]) + + return bistr(original, modified, alignment) + else: + return self.modified[index] + + def __setattr__(self, name, value): + raise AttributeError("bistr is immutable") + + def __delattr__(self, name): + raise AttributeError("bistr is immutable") + + def inverse(self) -> "bistr": + """ + The inverse of this string, swapping the original and modified strings. + """ + return bistr(self.modified, self.original, self.alignment.inverse()) + + def chunks(self) -> Iterable["bistr"]: + """ + All the chunks of associated text in this string. 
+ """ + + i, k = 0, 0 + for j, l in self.alignment[1:]: + yield bistr(self.original[i:j], self.modified[k:l]) + i, k = j, l + + def _builder(self): + from ._builder import BistrBuilder + return BistrBuilder(self) + + def casefold(self) -> "bistr": + from ._icu import casefold + return casefold(self) + + def lower(self, locale: Optional[str] = None) -> "bistr": + from ._icu import lower + return lower(self, locale) + + def upper(self, locale: Optional[str] = None) -> "bistr": + from ._icu import upper + return upper(self, locale) + + def title(self, locale: Optional[str] = None) -> "bistr": + from ._icu import title + return title(self, locale) + + def expandtabs(self, tabsize=8) -> "bistr": + return self.replace("\t", " " * tabsize) + + def replace(self, old: str, new: str, count: Optional[int] = None) -> "bistr": + builder = self._builder() + + pos = 0 + n = 0 + while count is None or n < count: + index = self.modified.find(old, pos) + if index < 0: + break + + builder.skip(index - pos) + builder.replace(len(old), new) + + pos = index + len(old) + n += 1 + + builder.skip_rest() + return builder.build() + + def sub(self, regex: Regex, repl: str) -> "bistr": + builder = self._builder() + builder.replace_all(regex, repl) + return builder.build() + + def _stripper(self, chars: Optional[str]): + if chars is None: + return lambda c: c.isspace() + else: + return lambda c: c in chars + + def strip(self, chars: Optional[str] = None) -> "bistr": + should_strip = self._stripper(chars) + + length = len(self) + pre = 0 + while pre < length and should_strip(self.modified[pre]): + pre += 1 + + post = length + while post > pre and should_strip(self.modified[post - 1]): + post -= 1 + + builder = self._builder() + builder.discard(pre) + builder.skip(post - pre) + builder.discard_rest() + return builder.build() + + def lstrip(self, chars: Optional[str] = None) -> "bistr": + should_strip = self._stripper(chars) + + length = len(self) + pre = 0 + while pre < length and 
should_strip(self.modified[pre]): + pre += 1 + + builder = self._builder() + builder.discard(pre) + builder.skip_rest() + return builder.build() + + def rstrip(self, chars: Optional[str] = None) -> "bistr": + should_strip = self._stripper(chars) + + length = len(self) + post = length + while post > 0 and should_strip(self.modified[post - 1]): + post -= 1 + + builder = self._builder() + builder.skip(post) + builder.discard_rest() + return builder.build() + + def normalize(self, form: str): + from ._icu import normalize + return normalize(self, form) diff --git a/python/bistring/_builder.py b/python/bistring/_builder.py new file mode 100644 index 0000000..9b95608 --- /dev/null +++ b/python/bistring/_builder.py @@ -0,0 +1,225 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +__all__ = ["BistrBuilder"] + +import re +from typing import Iterable, List, Match, Optional, Pattern, Tuple + +from ._alignment import Alignment +from ._bistr import bistr +from ._typing import Bounds, Regex, String + + +class BistrBuilder: + """ + Bidirectionally transformed string builer. + """ + + _original: bistr + _modified: List[str] + _alignment: List[Bounds] + _opos: int + _mpos: int + + def __init__(self, original: String): + self._original = bistr(original) + self._modified = [] + self._alignment = [(0, 0)] + self._opos = 0 + self._mpos = 0 + + @property + def original(self) -> str: + """ + The original string being modified. + """ + return self._original.original + + @property + def current(self) -> str: + """ + The current string before modifications. + """ + return self._original.modified + + @property + def modified(self) -> str: + """ + The modified string as built so far. + """ + return "".join(self._modified) + + @property + def alignment(self) -> Alignment: + """ + The alignment built so far from self.current to self.modified. 
+ """ + return Alignment(self._alignment) + + @property + def position(self) -> int: + """ + The position of the builder in self.current. + """ + return self._opos + + @property + def remaining(self) -> int: + """ + The number of characters of the current string left to process. + """ + return len(self.current) - self._opos + + @property + def is_complete(self) -> bool: + """ + Whether we've completely processed the string. + """ + return self.remaining == 0 + + def peek(self, n: int): + """ + Peek at the next n characters of the original string. + """ + return self.current[self._opos:self._opos+n] + + def _advance(self, ocount, mcount): + self._opos += ocount + self._mpos += mcount + if ocount > 0 or mcount > 0: + self._alignment.append((self._opos, self._mpos)) + + def skip(self, n: int): + """ + Skip the next n characters, copying them unchanged. + """ + if n > 0: + self._modified.append(self.peek(n)) + for i in range(n): + self._advance(1, 1) + + def skip_rest(self): + """ + Skip the rest of the string, copying it unchanged. + """ + self.skip(self.remaining) + + def insert(self, string: str): + """ + Insert a substring into the string. + """ + self.replace(0, string) + + def discard(self, n: int): + """ + Discard a portion of the original string. + """ + self.replace(n, "") + + def discard_rest(self): + """ + Discard the rest of the original string. + """ + self.discard(self.remaining) + + def replace(self, n: int, repl: str): + """ + Replace the next n characters with a new string. + """ + if len(repl) > 0: + self._modified.append(repl) + self._advance(n, len(repl)) + + def append(self, bs: bistr): + """ + Append a bistr. The original value of the bistr must match the current + string being processed. 
+ """ + if bs.original != self.peek(len(bs.original)): + raise ValueError("bistr doesn't match the current string") + for x, y in zip(bs.alignment, bs.alignment[1:]): + self._advance(y[0] - x[0], y[1] - x[1]) + + def _match(self, regex: Regex) -> Optional[Match]: + pattern = re.compile(regex) + return pattern.match(self.current, pos=self._opos) + + def _search(self, regex: Regex) -> Optional[Match]: + pattern = re.compile(regex) + return pattern.search(self.current, pos=self._opos) + + def _finditer(self, regex: Regex) -> Iterable[Match]: + pattern = re.compile(regex) + return pattern.finditer(self.current, pos=self._opos) + + def skip_match(self, regex: Regex) -> bool: + """ + Skip a substring matching a regex, copying it unchanged. + """ + match = self._match(regex) + if match: + self.skip(match.end() - match.start()) + return True + else: + return False + + def discard_match(self, regex: Regex) -> bool: + """ + Discard a substring that matches a regex. + """ + match = self._match(regex) + if match: + self.discard(match.end() - match.start()) + return True + else: + return False + + def replace_match(self, regex: Regex, repl: str) -> bool: + """ + Replace a substring that matches a regex. + """ + match = self._match(regex) + if match: + self.replace(match.end() - match.start(), match.expand(repl)) + return True + else: + return False + + def replace_next(self, regex: Regex, repl: str) -> bool: + """ + Replace the next occurence of a regex. + """ + match = self._search(regex) + if match: + self.skip(match.start() - self._opos) + self.replace(match.end() - match.start(), match.expand(repl)) + return True + else: + return False + + def replace_all(self, regex: Regex, repl: str): + """ + Replace all occurences of a regex. + """ + for match in self._finditer(regex): + self.skip(match.start() - self._opos) + self.replace(match.end() - match.start(), match.expand(repl)) + self.skip_rest() + + def build(self): + """ + Build the bistr. 
+ """ + alignment = self._original.alignment.compose(self.alignment) + return bistr(self.original, self.modified, alignment) + + def rewind(self): + """ + Reset this builder to apply another transformation. + """ + self._original = self.build() + self._modified = [] + self._alignment = [(0, 0)] + self._opos = 0 + self._mpos = 0 diff --git a/python/bistring/_icu.py b/python/bistring/_icu.py new file mode 100644 index 0000000..94e597d --- /dev/null +++ b/python/bistring/_icu.py @@ -0,0 +1,87 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +import icu +from typing import Callable, Optional + +from ._bistr import bistr +from ._builder import BistrBuilder + + +def _edit(bs: bistr, op: Callable, locale: Optional[str] = None) -> bistr: + builder = BistrBuilder(bs) + edits = icu.Edits() + ucur = icu.UnicodeString(builder.current) + + if locale is None: + umod = icu.UnicodeString(op(ucur, edits)) + else: + umod = icu.UnicodeString(op(icu.Locale(locale), ucur, edits)) + + for is_change, old_len, new_len, old_i, new_i, _ in edits.getFineIterator(): + old_len = ucur.countChar32(old_i, old_len) + if is_change: + repl = str(umod[new_i:new_i+new_len]) + builder.replace(old_len, repl) + else: + builder.skip(old_len) + + return builder.build() + + +def casefold(bs: bistr) -> bistr: + return _edit(bs, icu.CaseMap.fold) + + +def lower(bs: bistr, locale: Optional[str]) -> bistr: + return _edit(bs, icu.CaseMap.toLower, locale) + + +def upper(bs: bistr, locale: Optional[str]) -> bistr: + return _edit(bs, icu.CaseMap.toUpper, locale) + + +def title(bs: bistr, locale: Optional[str]) -> bistr: + return _edit(bs, icu.CaseMap.toTitle, locale) + + +def _normalize(bs: bistr, normalizer: icu.Normalizer2) -> bistr: + builder = BistrBuilder(bs) + us = icu.UnicodeString(bs.modified) + offset = 0 + while not builder.is_complete: + i = normalizer.spanQuickCheckYes(us) + builder.skip(us.countChar32(0, i)) + if builder.is_complete: + break + us = 
us[i:] + + i = 0 + while i < len(us): + if us.charAt(i) & 0xFC00 == 0xD800: + i += 1 + i += 1 + if normalizer.hasBoundaryBefore(chr(us.char32At(i))): + break + + chunk = us[:i] + normalized = str(normalizer.normalize(chunk)) + builder.replace(chunk.countChar32(), normalized) + us = us[i:] + + return builder.build() + + +_NORMALIZERS = { + "NFC": icu.Normalizer2.getNFCInstance, + "NFKC": icu.Normalizer2.getNFKCInstance, + "NFD": icu.Normalizer2.getNFDInstance, + "NFKD": icu.Normalizer2.getNFKDInstance, +} + +def normalize(bs: bistr, form: str) -> bistr: + factory = _NORMALIZERS.get(form) + if factory: + return _normalize(bs, factory()) + else: + raise ValueError("invalid normalization form") diff --git a/python/bistring/_token.py b/python/bistring/_token.py new file mode 100644 index 0000000..dabb01c --- /dev/null +++ b/python/bistring/_token.py @@ -0,0 +1,289 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +__all__ = [ + "Token", + "Tokenization", + "RegexTokenizer", + "SplittingTokenizer", + "CharacterTokenizer", + "WordTokenizer", + "SentenceTokenizer", +] + +from dataclasses import dataclass +import icu +import re +import threading +from typing import Callable, Iterable + +from ._alignment import Alignment +from ._bistr import bistr +from ._typing import Bounds, Regex, String + + +@dataclass(frozen=True) +class Token: + """ + A token extracted from a string. + """ + + text: bistr + start: int + end: int + + @property + def original(self) -> str: + """ + The original value of this token. + """ + return self.text.original + + @property + def modified(self) -> str: + """ + The modified value of this token. + """ + return self.text.modified + + @classmethod + def slice(cls, text: bistr, start: int, end: int) -> "Token": + """ + Create a Token from a slice of a bistr. 
+ """ + return cls(text[start:end], start, end) + + def __str__(self): + return f"[{self.start}:{self.end}]={self.text}" + + def __repr__(self): + return f"Token({self.text!r}, start={self.start}, end={self.end})" + + +@dataclass(frozen=True) +class Tokenization: + """ + A string and its tokenization. + """ + + text: bistr + _tokens: Iterable[Token] + alignment: Alignment + + def __init__(self, text: bistr, tokens: Iterable[Token]): + """ + Create a Tokenization. + """ + tokens = tuple(tokens) + + alignment = [] + for i, token in enumerate(tokens): + alignment.append((token.start, i)) + alignment.append((token.end, i + 1)) + + self._init(text, tokens, Alignment(alignment)) + + def _init(self, text: bistr, tokens: Iterable[Token], alignment: Alignment): + super().__setattr__("text", text) + super().__setattr__("_tokens", tokens) + super().__setattr__("alignment", Alignment(alignment)) + + @classmethod + def _create(cls, text: bistr, tokens: Iterable[Token], alignment: Alignment): + result = cls.__new__(cls) + result._init(text, tokens, alignment) + return result + + def __iter__(self): + return iter(self._tokens) + + def __len__(self): + return len(self._tokens) + + def __getitem__(self, index): + if isinstance(index, slice): + start, stop, stride = index.indices(len(self)) + if stride != 1: + raise ValueError("Non-unit strides not supported") + text_slice = slice(*self.text_bounds(start, stop)) + return self._create(self.text[text_slice], self._tokens[index], self.alignment[index]) + else: + return self._tokens[index] + + def __str__(self): + tokens = ", ".join(map(str, self)) + return f"Tokenization({self.text}, [{tokens}])" + + def __repr__(self): + return f"Tokenization({self.text!r}, {self._tokens!r})" + + def text_bounds(self, *args) -> Bounds: + """ + Map a span of tokens to the bounds of the corresponding text. 
+ """ + return self.alignment.original_bounds(*args) + + def original_bounds(self, *args) -> Bounds: + """ + Map a span of tokens to the bounds of the corresponding original text. + """ + return self.text.alignment.original_bounds(self.text_bounds(*args)) + + def bounds_for_text(self, *args) -> Bounds: + """ + Map a span of text to the bounds of the corresponding span of tokens. + """ + return self.alignment.modified_bounds(*args) + + def bounds_for_original(self, *args) -> Bounds: + """ + Map a span of original text to the bounds of the corresponding span of + tokens. + """ + text_bounds = self.text.alignment.modified_bounds(*args) + return self.alignment.modified_bounds(text_bounds) + + def slice_by_text(self, *args) -> Iterable[Token]: + """ + Map a span of text to the corresponding span of tokens. + """ + i, j = self.bounds_for_text(*args) + return self[i:j] + + def slice_by_original(self, *args) -> Iterable[Token]: + """ + Map a span of the original text to the corresponding span of tokens. + """ + i, j = self.bounds_for_original(*args) + return self[i:j] + + def align_text_bounds(self, *args) -> Bounds: + """ + Expand a span of text to align it with token boundaries. + """ + return self.text_bounds(self.bounds_for_text(*args)) + + def align_original_bounds(self, *args) -> Bounds: + """ + Expand a span of original text to align it with token boundaries. + """ + return self.original_bounds(self.bounds_for_original(*args)) + + +class RegexTokenizer: + """ + Breaks text into tokens based on a regex. + """ + + def __init__(self, regex: Regex): + self._pattern = re.compile(regex) + + def tokenize(self, text: String) -> Tokenization: + text = bistr(text) + tokens = [] + for match in self._pattern.finditer(text.modified): + tokens.append(Token.slice(text, match.start(), match.end())) + return Tokenization(text, tokens) + + +class SplittingTokenizer: + """ + Splits text into tokens based on a regex. 
+ """ + + def __init__(self, regex: Regex): + self._pattern = re.compile(regex) + + def tokenize(self, text: String) -> Tokenization: + text = bistr(text) + tokens = [] + + last = 0 + for match in self._pattern.finditer(text.modified): + start = match.start() + if start > last: + tokens.append(Token.slice(text, last, start)) + last = match.end() + + end = len(text.modified) + if end > last: + tokens.append(Token.slice(text, last, end)) + + return Tokenization(text, tokens) + + +class _IcuTokenizer: + """ + Base class for ICU BreakIterator-based tokenizers. + """ + + def __init__(self, locale: str, constructor: Callable): + # BreakIterator is not a thread-safe API, so store a cache of + # thread-local iterators + self._locale = icu.Locale(locale) + self._constructor = constructor + self._local = threading.local() + + # Eagerly construct one on this thread as an optimization, and to check + # for errors + self._break_iterator() + + def _break_iterator(self) -> icu.BreakIterator: + if not hasattr(self._local, "bi"): + self._local.bi = self._constructor(self._locale) + return self._local.bi + + def tokenize(self, text: String) -> Tokenization: + text = bistr(text) + tokens = [] + + bi = self._break_iterator() + + utext = icu.UnicodeString(text.modified) + bi.setText(utext) + + ui = bi.first() + uj = bi.nextBoundary() + i = 0 + while uj != icu.BreakIterator.DONE: + j = i + utext.countChar32(ui, uj - ui) + if self._check_token(bi.getRuleStatus()): + tokens.append(Token.slice(text, i, j)) + ui = uj + uj = bi.nextBoundary() + i = j + + return Tokenization(text, tokens) + + def _check_token(self, tag: int) -> bool: + return True + + +class CharacterTokenizer(_IcuTokenizer): + """ + Splits text into user-perceived characters/grapheme clusters. + """ + + def __init__(self, locale: str): + super().__init__(locale, icu.BreakIterator.createCharacterInstance) + + +class WordTokenizer(_IcuTokenizer): + """ + Splits text into words based on Unicode rules. 
+ """ + + def __init__(self, locale: str): + super().__init__(locale, icu.BreakIterator.createWordInstance) + + def _check_token(self, tag: int) -> bool: + return tag >= 100 # UBRK_WORD_NONE_LIMIT + + +class SentenceTokenizer(_IcuTokenizer): + """ + Splits text into sentences based on Unicode rules. + """ + + def __init__(self, locale: str): + super().__init__(locale, icu.BreakIterator.createSentenceInstance) diff --git a/python/bistring/_typing.py b/python/bistring/_typing.py new file mode 100644 index 0000000..3133730 --- /dev/null +++ b/python/bistring/_typing.py @@ -0,0 +1,13 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +from typing import Pattern, Tuple, Union + + +Bounds = Tuple[int, int] + +Range = Union[range, slice, Bounds] + +Regex = Union[str, Pattern] + +String = Union[str, "bistr"] diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 0000000..ccd6c3a --- /dev/null +++ b/python/setup.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 + +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +from setuptools import setup + + +setup( + name="bistring", + version="0.0", + author="Microsoft Research Montreal", + author_email="msrmtle@microsoft.com", + description="Bidirectionally transformed strings", + url="https://dev.azure.com/maluuba/Isentrope", + packages=[ + "bistring", + ], + test_suite="tests", + setup_requires=[ + "pytest-runner >= 4.2", + ], + install_requires=[ + "pyicu >= 2.1", + ], + tests_require=[ + "pytest >= 3.8.2", + ], +) diff --git a/python/tests/__init__.py b/python/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/tests/test_alignment.py b/python/tests/test_alignment.py new file mode 100644 index 0000000..7903220 --- /dev/null +++ b/python/tests/test_alignment.py @@ -0,0 +1,107 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. 
+ +from bistring import Alignment +import pytest + + +def test_empty(): + pytest.raises(ValueError, Alignment, []) + + alignment = Alignment.identity(0) + assert list(alignment) == [(0, 0)] + + assert alignment.original_bounds() == (0, 0) + assert alignment.modified_bounds() == (0, 0) + + assert alignment.original_bounds(0, 0) == (0, 0) + assert alignment.modified_bounds(0, 0) == (0, 0) + + +def test_identity(): + alignment = Alignment.identity(1, 16) + + assert alignment == Alignment((i, i) for i in range(1, 17)) + assert list(alignment) == [(i, i) for i in range(1, 17)] + + assert alignment.original_bounds() == (1, 16) + assert alignment.modified_bounds() == (1, 16) + + assert alignment.original_bounds(4, 7) == (4, 7) + assert alignment.modified_bounds(4, 7) == (4, 7) + + +def test_aligning(): + alignment = Alignment([(0, 0), (1, 2), (2, 4), (3, 6)]) + + assert alignment.original_bounds() == (0, 3) + assert alignment.modified_bounds() == (0, 6) + + assert alignment.original_bounds(0, 0) == (0, 0) + assert alignment.original_bounds(0, 1) == (0, 1) + assert alignment.original_bounds(0, 2) == (0, 1) + assert alignment.original_bounds(0, 3) == (0, 2) + assert alignment.original_bounds(1, 1) == (0, 1) + assert alignment.original_bounds(1, 3) == (0, 2) + assert alignment.original_bounds(1, 4) == (0, 2) + assert alignment.original_bounds(2, 2) == (1, 1) + assert alignment.original_bounds(2, 4) == (1, 2) + assert alignment.original_bounds(2, 5) == (1, 3) + assert alignment.original_bounds(3, 3) == (1, 2) + + assert alignment.modified_bounds(0, 0) == (0, 0) + assert alignment.modified_bounds(0, 1) == (0, 2) + assert alignment.modified_bounds(0, 2) == (0, 4) + assert alignment.modified_bounds(0, 3) == (0, 6) + assert alignment.modified_bounds(1, 1) == (2, 2) + assert alignment.modified_bounds(2, 2) == (4, 4) + + +def test_canonicalization(): + assert Alignment([(0, 0), (1, 2), (1, 2), (2, 4)]) == Alignment([(0, 0), (1, 2), (2, 4)]) + + assert Alignment([(0, 0), (1, 2)]) + 
Alignment([(1, 2), (2, 4)]) == Alignment([(0, 0), (1, 2), (2, 4)]) + + +def _test_composition(first, second): + composed = first.compose(second) + original_range = composed.original_range() + modified_range = composed.modified_range() + + assert original_range == first.original_range() + assert modified_range == second.modified_range() + + for i in original_range: + for j in original_range[i:]: + assert composed.modified_bounds(i, j) == second.modified_bounds(first.modified_bounds(i, j)) + + for i in modified_range: + for j in modified_range[i:]: + assert composed.original_bounds(i, j) == first.original_bounds(second.original_bounds(i, j)) + + +def test_compose(): + first = Alignment((i, 2 * i) for i in range(4)) + second = Alignment((i, 2 * i) for i in range(7)) + _test_composition(first, second) + + +def _test_identity_composition(alignment): + _test_composition(alignment, Alignment.identity(alignment.modified_range())) + _test_composition(Alignment.identity(alignment.original_range()), alignment) + + +def test_compose_identity(): + alignment = Alignment([ + (0, 2), + (2, 2), + (4, 4), + (6, 6), + (8, 6), + ]) + + # Modified sequence is smaller + _test_identity_composition(alignment) + + # Original sequence is smaller + _test_identity_composition(alignment.inverse()) diff --git a/python/tests/test_bistr.py b/python/tests/test_bistr.py new file mode 100644 index 0000000..8b6dd81 --- /dev/null +++ b/python/tests/test_bistr.py @@ -0,0 +1,135 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +import unicodedata + +from bistring import Alignment, bistr + + +def test_concat(): + bs = bistr(" ", "") + bs += "Hello" + bs += bistr(" ", " ") + bs += "world!" + bs += bistr(" ", "") + + assert bs.original == " Hello world! " + assert bs.modified == "Hello world!" 
+ + bs = bs[4:7] + assert bs.original == "o w" + assert bs.modified == "o w" + + bs = bs[1:2] + assert bs.original == " " + assert bs.modified == " " + + +def test_strip(): + bs = bistr(" Hello world! ") + assert bs.original == " Hello world! " + assert bs.modified == " Hello world! " + + bs = bs.strip() + assert bs.original == " Hello world! " + assert bs.modified == "Hello world!" + + bs = bistr(" ").strip() + assert bs.modified == "" + assert bs.original == " " + + +def test_casefold(): + # "Hรญ๏ฌƒ" + # รญ has a combining acute accent, ๏ฌƒ is a ligature + bs = bistr("Hi\u0301\uFB03").casefold() + assert bs.original == "Hi\u0301\uFB03" + assert bs.modified == "hi\u0301ffi" + assert bs.modified == bs.original.casefold() + + assert bs[:3].original == "Hi\u0301" + assert bs[:3].modified == "hi\u0301" + + assert bs[4:5].original == "\uFB03" + assert bs[4:5].modified == "f" + + +def test_lower(): + bs = bistr("DฤฐYARBAKIR").lower("en_US") + assert bs.original == "DฤฐYARBAKIR" + assert bs.modified == "diฬ‡yarbakir" + + bs = bistr("DฤฐYARBAKIR").lower("tr_TR") + assert bs.original == "DฤฐYARBAKIR" + assert bs.modified == "diyarbakฤฑr" + + +def test_upper(): + bs = bistr("straรŸe").upper("de_DE") + assert bs.original == "straรŸe" + assert bs.modified == "STRASSE" + assert bs[4:6].original == "รŸ" + assert bs[4:6].modified == "SS" + + bs = bistr("Diyarbakฤฑr").upper("tr_TR") + assert bs.original == "Diyarbakฤฑr" + assert bs.modified == "DฤฐYARBAKIR" + +def test_title(): + bs = bistr("istanbul").title("en_US") + assert bs.original == "istanbul" + assert bs.modified == "Istanbul" + + bs = bistr("istanbul").title("tr_TR") + assert bs.original == "istanbul" + assert bs.modified == "ฤฐstanbul" + + +def test_normalize(): + # รฉ is composed but oฬˆ has a combining diaeresis + bs = bistr("H\u00E9llo\u0308") + + bs = bs.normalize("NFC") + assert bs.original == "H\u00E9llo\u0308" + assert bs.modified == "H\u00E9ll\u00F6" + assert bs.modified == unicodedata.normalize("NFC", 
bs.original) + assert bs[4:5].original == "o\u0308" + assert bs[4:5].modified == "\u00F6" + + bs = bs.normalize("NFD") + assert bs.original == "H\u00E9llo\u0308" + assert bs.modified == "He\u0301llo\u0308" + assert bs.modified == unicodedata.normalize("NFD", bs.original) + assert bs[1:3].original == "\u00E9" + assert bs[1:3].modified == "e\u0301" + + +def test_readme(): + bs = bistr("๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— ๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ") + bs = bs.normalize("NFKD") + bs = bs.casefold() + bs = bs.sub(r"[^a-z ]+", "") + bs = bs[:19] + assert bs.modified == "the quick brown fox" + assert bs.original == "๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐–" + + +def test_equality(): + bs1 = bistr(" Hello world ").strip().casefold() + bs2 = bistr(" Hello world ", "hello world", Alignment([ + (0, 0), + (2, 0), + (3, 1), + (4, 2), + (5, 3), + (6, 4), + (7, 5), + (8, 6), + (9, 7), + (10, 8), + (11, 9), + (12, 10), + (13, 11), + (15, 11), + ])) + assert bs1 == bs2 diff --git a/python/tests/test_builder.py b/python/tests/test_builder.py new file mode 100644 index 0000000..d74e20b --- /dev/null +++ b/python/tests/test_builder.py @@ -0,0 +1,115 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. 
+ +from bistring import bistr, BistrBuilder + + +def test_chunk_words(): + builder = BistrBuilder(" the quick brown fox ") + builder.discard(2) + builder.replace(3, "the") + builder.skip(1) + builder.replace(5, "quick") + builder.replace(2, " ") + builder.replace(5, "brown") + builder.skip(1) + builder.replace(3, "fox") + builder.discard(1) + bs = builder.build() + + assert bs.original == " the quick brown fox " + assert bs.modified == "the quick brown fox" + + assert bs[0:1].original == "the" + assert bs[1:2].original == "the" + assert bs[2:3].original == "the" + + assert bs[0:3].original == "the" + assert bs[1:3].original == "the" + + assert bs[0:4].original == "the " + assert bs[1:4].original == "the " + + assert bs[3:4].original == " " + assert bs[9:10].original == " " + + assert bs[4:15].original == "quick brown" + assert bs[5:14].original == "quick brown" + + assert bs[0:0].original == "" + assert bs[10:10].original == "" + + +def test_chunk_chars(): + builder = BistrBuilder(" the quick brown fox ") + builder.discard_match(r"\s+") + while not builder.is_complete: + builder.skip_match(r"\S+") + builder.replace_match(r"\s+(?=\S)", " ") + builder.discard_match(r"\s+$") + + bs = builder.build() + + assert bs.original == " the quick brown fox " + assert bs.modified == "the quick brown fox" + + assert bs[0:1].original == "t" + assert bs[1:2].original == "h" + assert bs[2:3].original == "e" + + assert bs[0:3].original == "the" + assert bs[1:3].original == "he" + + assert bs[0:4].original == "the " + assert bs[1:4].original == "he " + + assert bs[3:4].original == " " + assert bs[9:10].original == " " + + assert bs[4:15].original == "quick brown" + assert bs[5:14].original == "uick brow" + + assert bs[0:0].original == "" + assert bs[10:10].original == "" + + +def test_empty_string(): + builder = BistrBuilder("") + bs = builder.build() + assert bs.original == "" + assert bs.modified == "" + assert bs[0:0].original == "" + + +def test_iterative(): + builder = 
BistrBuilder("I wish I wouldn't've spent one thousand dollars.") + builder.skip_match(r"[^.]*") + builder.discard_rest() + builder.rewind() + builder.skip_match(r"I wish I "); + builder.replace_match(r"wouldn't've", "would not have"); + builder.skip_match(r" spent "); + builder.replace_match(r"one thousand dollars", "$1,000"); + + bs = builder.build() + assert bs.original == "I wish I wouldn't've spent one thousand dollars." + assert bs.modified == "I wish I would not have spent $1,000" + + +def test_replace_matches(): + builder = BistrBuilder("the cheese that the mouse that the cat that the dog chased played with ate") + builder.replace_next(r"that", "which") + builder.replace_all(r"that", "whom") + + bs = builder.build() + assert bs.original == "the cheese that the mouse that the cat that the dog chased played with ate" + assert bs.modified == "the cheese which the mouse whom the cat whom the dog chased played with ate" + + +def test_replace_backreference(): + builder = BistrBuilder("it doesn't work and stuff doesn't get replaced") + builder.replace_all(r"\bdoesn't (\S+)", r"\1s") + + bs = builder.build() + assert bs.original == "it doesn't work and stuff doesn't get replaced" + assert bs.modified == "it works and stuff gets replaced" diff --git a/python/tests/test_token.py b/python/tests/test_token.py new file mode 100644 index 0000000..02f7085 --- /dev/null +++ b/python/tests/test_token.py @@ -0,0 +1,85 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. 
+ +from bistring import bistr + + +def test_regex_tokenizer(): + from bistring import RegexTokenizer + + text = bistr(" ๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— ๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ ") + text = text.normalize("NFKD") + text = text.casefold() + + tokenizer = RegexTokenizer(r"\w+") + + tokens = tokenizer.tokenize(text) + assert tokens.text == text + assert len(tokens) == 9 + assert tokens.text_bounds(0, 2) == (1, 10) + assert tokens[0:2].text == text[1:10] + assert len(tokens.slice_by_text(5, 10)) == 1 + assert len(tokens.slice_by_text(5, 11)) == 1 + assert len(tokens.slice_by_text(3, 13)) == 3 + + +def test_splitting_tokenizer(): + from bistring import SplittingTokenizer + + text = bistr(" ๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— ๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ ") + text = text.normalize("NFKD") + text = text.casefold() + + tokenizer = SplittingTokenizer(r"\s+") + + tokens = tokenizer.tokenize(text) + assert tokens.text == text + assert len(tokens) == 9 + assert tokens.text_bounds(0, 2) == (1, 11) + assert tokens[0:2].text == text[1:11] + assert len(tokens.slice_by_text(5, 10)) == 1 + assert len(tokens.slice_by_text(5, 11)) == 1 + assert len(tokens.slice_by_text(3, 13)) == 3 + + +def test_character_tokenizer(): + from bistring import CharacterTokenizer + + text = bistr(" ๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— ๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ ") + + tokenizer = CharacterTokenizer("en_US") + + tokens = tokenizer.tokenize(text) + assert tokens.text == text + assert all(token.text == text[i:i+1] for i, token in enumerate(tokens)) + + +def test_word_tokenizer(): + from bistring import WordTokenizer + + text = bistr(" ๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— 
๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ ") + + tokenizer = WordTokenizer("en_US") + + tokens = tokenizer.tokenize(text) + assert tokens.text == text + assert len(tokens) == 9 + assert tokens.text_bounds(0, 2) == (1, 10) + assert tokens[0:2].text == text[1:10] + assert len(tokens.slice_by_text(5, 10)) == 1 + assert len(tokens.slice_by_text(5, 11)) == 1 + assert len(tokens.slice_by_text(3, 13)) == 3 + + +def test_sentence_tokenizer(): + from bistring import SentenceTokenizer + + text = bistr("The following sentence is true. The preceeding sentence, surprisingly, is false.") + + tokenizer = SentenceTokenizer("en_US") + + tokens = tokenizer.tokenize(text) + assert tokens.text == text + assert len(tokens) == 2 + assert tokens[0].text == text[:33] + assert tokens[1].text == text[33:]