diff --git a/README.md b/README.md index 72f1506..383b97f 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,36 @@ +bistring +======== -# Contributing +The bistring library provides non-destructive versions of common string processing operations like normalization, case folding, and find/replace. +Each bistring remembers the original string, and how its substrings map to substrings of the modified version. -This project welcomes contributions and suggestions. Most contributions require you to agree to a -Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us -the rights to use your contribution. For details, visit https://cla.microsoft.com. +For example: -When you submit a pull request, a CLA-bot will automatically determine whether you need to provide -a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions -provided by the bot. You will only need to do this once across all repos using our CLA. +```python +>>> from bistring import bistr +>>> s = bistr('๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— ๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ') +>>> s = s.normalize('NFKD') # Unicode normalization +>>> s = s.casefold() # Case-insensitivity +>>> s = s.sub(r'[^a-z ]+', '') # Strip everything but letters and spaces +>>> s = s[:19] # Extract a substring +>>> s.modified # The modified substring, after changes +'the quick brown fox' +>>> s.original # The original substring, before changes +'๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐–' +``` + +This allows you to perform very aggressive text processing completely invisibly. + + +Contributing +------------ + +This project welcomes contributions and suggestions. +Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. 
class Alignment:
    """
    A monotonic alignment between two related sequences.

    Stored as two parallel sorted position lists: the pair
    (_original[k], _modified[k]) asserts that the sequences correspond at
    those offsets.
    """

    __slots__ = ("_original", "_modified")

    _original: List[int]
    _modified: List[int]

    def __init__(self, values: Iterable[Tuple[int, int]]):
        """
        Create an alignment from an iterable of (original, modified) pairs.

        Positions must be non-decreasing on both sides; exact duplicate
        pairs are collapsed.

        Raises:
            ValueError: If a position moves backwards, or no pairs are given.
        """
        originals: List[int] = []
        modifieds: List[int] = []

        for o_pos, m_pos in values:
            if originals:
                if o_pos < originals[-1]:
                    raise ValueError("Original sequence position moved backwards")
                elif m_pos < modifieds[-1]:
                    raise ValueError("Modified sequence position moved backwards")
                elif o_pos == originals[-1] and m_pos == modifieds[-1]:
                    continue
            originals.append(o_pos)
            modifieds.append(m_pos)

        if not originals:
            raise ValueError("No sequence positions to align")

        self._original = originals
        self._modified = modifieds

    @classmethod
    def _create(cls, original: List[int], modified: List[int]) -> "Alignment":
        # Internal fast path: trusts that the lists are already validated.
        result = super().__new__(cls)
        result._original = original
        result._modified = modified
        return result

    def __str__(self):
        o_first, o_last = self._original[0], self._original[-1]
        m_first, m_last = self._modified[0], self._modified[-1]
        if (self._original == list(range(o_first, o_last + 1))
                and self._modified == list(range(m_first, m_last + 1))):
            # Both sides are contiguous: show a compact range form.
            return f"[{o_first}:{o_last}⇋{m_first}:{m_last}]"
        return "[" + ", ".join(f"{i}⇋{j}" for i, j in self) + "]"

    def __repr__(self):
        o_first, o_last = self._original[0], self._original[-1]
        identity = list(range(o_first, o_last + 1))
        if self._original == identity and self._modified == identity:
            return f"Alignment.identity({o_first}, {o_last})"
        return "Alignment([" + ", ".join(map(repr, self)) + "])"

    def __eq__(self, other):
        if not isinstance(other, Alignment):
            return NotImplemented
        return (self._original, self._modified) == (other._original, other._modified)

    @classmethod
    def _parse_args(cls, args: Tuple) -> Tuple[Optional[int], Optional[int]]:
        """
        Normalize the flexible *args accepted by the bounds/slice methods
        into a (start, stop) pair; (None, None) means "the whole sequence".
        """
        count = len(args)
        if count == 0:
            return None, None
        if count == 2:
            return cast(Tuple[int, int], args)
        if count > 2:
            raise TypeError("Too many arguments")

        arg = args[0]
        if isinstance(arg, range):
            return arg.start, arg.stop
        if isinstance(arg, slice):
            if arg.start is None or arg.stop is None:
                raise ValueError("slice with unspecified bounds")
            return arg.start, arg.stop
        if isinstance(arg, tuple):
            return cast(Tuple[int, int], arg)
        # A single int is a length: bounds are [0, length].
        return 0, arg

    @overload
    @classmethod
    def identity(cls, length: int) -> "Alignment":
        ...

    @overload
    @classmethod
    def identity(cls, start: int, stop: int) -> "Alignment":
        ...

    @overload
    @classmethod
    def identity(cls, bounds: Union[range, slice, Tuple[int, int]]) -> "Alignment":
        ...

    @classmethod
    def identity(cls, *args):
        """The identity alignment over the given bounds."""
        start, stop = cls._parse_args(args)
        points = list(range(start, stop + 1))
        return cls._create(points, points)

    def __iter__(self):
        return zip(self._original, self._modified)

    def __len__(self):
        return len(self._original)

    def __getitem__(self, index):
        if not isinstance(index, slice):
            return (self._original[index], self._modified[index])
        _, _, stride = index.indices(len(self))
        if stride != 1:
            raise ValueError("Non-unit strides not supported")
        return self._create(self._original[index], self._modified[index])

    def shift(self, delta_o, delta_m):
        """A copy of this alignment with both sides offset by the deltas."""
        return self._create(
            [o + delta_o for o in self._original],
            [m + delta_m for m in self._modified],
        )

    def _search(self, source: List[int], start: int, stop: int) -> Tuple[int, int]:
        # Find the widest pair of indices whose positions enclose [start, stop].
        first = bisect.bisect_right(source, start)
        if first == 0:
            raise IndexError("range start too small")
        first -= 1

        last = bisect.bisect_left(source, stop, first)
        if last == len(source):
            raise IndexError("range end too big")

        return first, last

    def _bounds(self, source: List[int], target: List[int], args: Tuple) -> Tuple[int, int]:
        start, stop = self._parse_args(args)
        if start is None:
            # No arguments: the whole extent.
            i, j = 0, -1
        else:
            i, j = self._search(source, start, stop)
        return (target[i], target[j])

    def original_bounds(self, *args) -> Tuple[int, int]:
        """Map a modified-side span to the enclosing original-side bounds."""
        return self._bounds(self._modified, self._original, args)

    def original_range(self, *args) -> range:
        return range(*self.original_bounds(*args))

    def original_slice(self, *args) -> slice:
        return slice(*self.original_bounds(*args))

    def modified_bounds(self, *args) -> Tuple[int, int]:
        """Map an original-side span to the enclosing modified-side bounds."""
        return self._bounds(self._original, self._modified, args)

    def modified_range(self, *args) -> range:
        return range(*self.modified_bounds(*args))

    def modified_slice(self, *args) -> slice:
        return slice(*self.modified_bounds(*args))

    def slice_by_original(self, *args) -> "Alignment":
        """The sub-alignment covering an original-side span, clamped to it."""
        start, stop = self._parse_args(args)
        first, last = self._search(self._original, start, stop)
        clamped = [min(max(o, start), stop) for o in self._original[first:last+1]]
        return self._create(clamped, self._modified[first:last+1])

    def slice_by_modified(self, *args) -> "Alignment":
        """The sub-alignment covering a modified-side span, clamped to it."""
        start, stop = self._parse_args(args)
        first, last = self._search(self._modified, start, stop)
        clamped = [min(max(m, start), stop) for m in self._modified[first:last+1]]
        return self._create(self._original[first:last+1], clamped)

    def __add__(self, other):
        """
        Concatenate two alignments.
        """
        if not isinstance(other, Alignment):
            return NotImplemented

        tail_orig = other._original
        tail_mod = other._modified

        if tail_orig[0] < self._original[-1]:
            raise ValueError("Original sequence position moved backwards")
        elif tail_mod[0] < self._modified[-1]:
            raise ValueError("Modified sequence position moved backwards")
        elif tail_orig[0] == self._original[-1] and tail_mod[0] == self._modified[-1]:
            # Drop the duplicated join point.
            tail_orig = tail_orig[1:]
            tail_mod = tail_mod[1:]

        return self._create(self._original + tail_orig, self._modified + tail_mod)

    def compose(self, other: "Alignment") -> "Alignment":
        """
        Return a new alignment equivalent to applying this one first, then
        the other.
        """
        if self.modified_bounds() != other.original_bounds():
            raise ValueError("Incompatible alignments")

        original = []
        modified = []
        i, i_max = 0, len(self)
        j, j_max = 0, len(other)

        while i < i_max:
            # Map self._original[i] to its lower bound in other
            while self._modified[i] > other._original[j]:
                j += 1
            while self._modified[i] < other._original[j] and self._modified[i + 1] <= other._original[j]:
                i += 1
            original.append(self._original[i])
            modified.append(other._modified[j])

            # Map self._original[i] to its upper bound in other (if it's different)
            while i + 1 < i_max and self._original[i] == self._original[i + 1]:
                i += 1

            needs_upper = False
            while j + 1 < j_max and self._modified[i] >= other._original[j + 1]:
                needs_upper = True
                j += 1
            if needs_upper:
                original.append(self._original[i])
                modified.append(other._modified[j])

            i += 1

        return self._create(original, modified)

    def inverse(self) -> "Alignment":
        """
        The inverse of this alignment, from the modified to the original
        sequence.
        """
        return self._create(self._modified, self._original)
+ """ + + if isinstance(original, bistr): + if modified is not None or alignment is not None: + raise ValueError("bistr copy constructor invoked with extra arguments") + return original + + if alignment is None: + if modified is None: + alignment = Alignment.identity(len(original)) + else: + alignment = Alignment([(0, 0), (len(original), len(modified))]) + + if modified is None: + modified = original + + if alignment.original_bounds() != (0, len(original)): + raise ValueError("Alignment incompatible with original string") + elif alignment.modified_bounds() != (0, len(modified)): + raise ValueError("Alignment incompatible with modified string") + + result = super().__new__(cls) + super().__setattr__(result, "original", original) + super().__setattr__(result, "modified", modified) + super().__setattr__(result, "alignment", alignment) + return result + + def __str__(self): + if self.original == self.modified: + return f"โฎŽ{self.original!r}โฎŒ" + else: + return f"({self.original!r} โ‡‹ {self.modified!r})" + + def __repr__(self): + if self.original == self.modified and self.alignment == Alignment.identity(len(self.original)): + return f"bistr({self.original!r})" + elif self.alignment == Alignment([(0, 0), (len(self.original), len(self.modified))]): + return f"bistr({self.original!r}, {self.modified!r})" + else: + return f"bistr({self.original!r}, {self.modified!r}, {self.alignment!r})" + + def __len__(self): + return len(self.modified) + + def __eq__(self, other): + if isinstance(other, bistr): + return (self.original, self.modified, self.alignment) == (other.original, other.modified, other.alignment) + else: + return NotImplemented + + def __add__(self, other): + if isinstance(other, bistr): + original = other.original + modified = other.modified + alignment = other.alignment + elif isinstance(other, str): + original = other + modified = other + alignment = Alignment.identity(len(other)) + else: + return NotImplemented + + alignment = 
alignment.shift(len(self.original), len(self.modified)) + return bistr(self.original + original, self.modified + modified, self.alignment + alignment) + + def __radd__(self, other): + if isinstance(other, str): + length = len(other) + return bistr( + other + self.original, + other + self.modified, + Alignment.identity(length) + self.alignment.shift(length, length), + ) + else: + return NotImplemented + + def __getitem__(self, index): + if isinstance(index, slice): + start, stop, stride = index.indices(len(self)) + if stride != 1: + raise ValueError("Non-unit strides not supported") + + modified = self.modified[start:stop] + original = self.original[self.alignment.original_slice(start, stop)] + alignment = self.alignment.slice_by_modified(start, stop) + alignment = alignment.shift(-alignment[0][0], -alignment[0][1]) + + return bistr(original, modified, alignment) + else: + return self.modified[index] + + def __setattr__(self, name, value): + raise AttributeError("bistr is immutable") + + def __delattr__(self, name): + raise AttributeError("bistr is immutable") + + def inverse(self) -> "bistr": + """ + The inverse of this string, swapping the original and modified strings. + """ + return bistr(self.modified, self.original, self.alignment.inverse()) + + def chunks(self) -> Iterable["bistr"]: + """ + All the chunks of associated text in this string. 
+ """ + + i, k = 0, 0 + for j, l in self.alignment[1:]: + yield bistr(self.original[i:j], self.modified[k:l]) + i, k = j, l + + def _builder(self): + from ._builder import BistrBuilder + return BistrBuilder(self) + + def casefold(self) -> "bistr": + from ._icu import casefold + return casefold(self) + + def lower(self, locale: Optional[str] = None) -> "bistr": + from ._icu import lower + return lower(self, locale) + + def upper(self, locale: Optional[str] = None) -> "bistr": + from ._icu import upper + return upper(self, locale) + + def title(self, locale: Optional[str] = None) -> "bistr": + from ._icu import title + return title(self, locale) + + def expandtabs(self, tabsize=8) -> "bistr": + return self.replace("\t", " " * tabsize) + + def replace(self, old: str, new: str, count: Optional[int] = None) -> "bistr": + builder = self._builder() + + pos = 0 + n = 0 + while count is None or n < count: + index = self.modified.find(old, pos) + if index < 0: + break + + builder.skip(index - pos) + builder.replace(len(old), new) + + pos = index + len(old) + n += 1 + + builder.skip_rest() + return builder.build() + + def sub(self, regex: Regex, repl: str) -> "bistr": + builder = self._builder() + builder.replace_all(regex, repl) + return builder.build() + + def _stripper(self, chars: Optional[str]): + if chars is None: + return lambda c: c.isspace() + else: + return lambda c: c in chars + + def strip(self, chars: Optional[str] = None) -> "bistr": + should_strip = self._stripper(chars) + + length = len(self) + pre = 0 + while pre < length and should_strip(self.modified[pre]): + pre += 1 + + post = length + while post > pre and should_strip(self.modified[post - 1]): + post -= 1 + + builder = self._builder() + builder.discard(pre) + builder.skip(post - pre) + builder.discard_rest() + return builder.build() + + def lstrip(self, chars: Optional[str] = None) -> "bistr": + should_strip = self._stripper(chars) + + length = len(self) + pre = 0 + while pre < length and 
should_strip(self.modified[pre]): + pre += 1 + + builder = self._builder() + builder.discard(pre) + builder.skip_rest() + return builder.build() + + def rstrip(self, chars: Optional[str] = None) -> "bistr": + should_strip = self._stripper(chars) + + length = len(self) + post = length + while post > 0 and should_strip(self.modified[post - 1]): + post -= 1 + + builder = self._builder() + builder.skip(post) + builder.discard_rest() + return builder.build() + + def normalize(self, form: str): + from ._icu import normalize + return normalize(self, form) diff --git a/python/bistring/_builder.py b/python/bistring/_builder.py new file mode 100644 index 0000000..9b95608 --- /dev/null +++ b/python/bistring/_builder.py @@ -0,0 +1,225 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +__all__ = ["BistrBuilder"] + +import re +from typing import Iterable, List, Match, Optional, Pattern, Tuple + +from ._alignment import Alignment +from ._bistr import bistr +from ._typing import Bounds, Regex, String + + +class BistrBuilder: + """ + Bidirectionally transformed string builer. + """ + + _original: bistr + _modified: List[str] + _alignment: List[Bounds] + _opos: int + _mpos: int + + def __init__(self, original: String): + self._original = bistr(original) + self._modified = [] + self._alignment = [(0, 0)] + self._opos = 0 + self._mpos = 0 + + @property + def original(self) -> str: + """ + The original string being modified. + """ + return self._original.original + + @property + def current(self) -> str: + """ + The current string before modifications. + """ + return self._original.modified + + @property + def modified(self) -> str: + """ + The modified string as built so far. + """ + return "".join(self._modified) + + @property + def alignment(self) -> Alignment: + """ + The alignment built so far from self.current to self.modified. 
+ """ + return Alignment(self._alignment) + + @property + def position(self) -> int: + """ + The position of the builder in self.current. + """ + return self._opos + + @property + def remaining(self) -> int: + """ + The number of characters of the current string left to process. + """ + return len(self.current) - self._opos + + @property + def is_complete(self) -> bool: + """ + Whether we've completely processed the string. + """ + return self.remaining == 0 + + def peek(self, n: int): + """ + Peek at the next n characters of the original string. + """ + return self.current[self._opos:self._opos+n] + + def _advance(self, ocount, mcount): + self._opos += ocount + self._mpos += mcount + if ocount > 0 or mcount > 0: + self._alignment.append((self._opos, self._mpos)) + + def skip(self, n: int): + """ + Skip the next n characters, copying them unchanged. + """ + if n > 0: + self._modified.append(self.peek(n)) + for i in range(n): + self._advance(1, 1) + + def skip_rest(self): + """ + Skip the rest of the string, copying it unchanged. + """ + self.skip(self.remaining) + + def insert(self, string: str): + """ + Insert a substring into the string. + """ + self.replace(0, string) + + def discard(self, n: int): + """ + Discard a portion of the original string. + """ + self.replace(n, "") + + def discard_rest(self): + """ + Discard the rest of the original string. + """ + self.discard(self.remaining) + + def replace(self, n: int, repl: str): + """ + Replace the next n characters with a new string. + """ + if len(repl) > 0: + self._modified.append(repl) + self._advance(n, len(repl)) + + def append(self, bs: bistr): + """ + Append a bistr. The original value of the bistr must match the current + string being processed. 
+ """ + if bs.original != self.peek(len(bs.original)): + raise ValueError("bistr doesn't match the current string") + for x, y in zip(bs.alignment, bs.alignment[1:]): + self._advance(y[0] - x[0], y[1] - x[1]) + + def _match(self, regex: Regex) -> Optional[Match]: + pattern = re.compile(regex) + return pattern.match(self.current, pos=self._opos) + + def _search(self, regex: Regex) -> Optional[Match]: + pattern = re.compile(regex) + return pattern.search(self.current, pos=self._opos) + + def _finditer(self, regex: Regex) -> Iterable[Match]: + pattern = re.compile(regex) + return pattern.finditer(self.current, pos=self._opos) + + def skip_match(self, regex: Regex) -> bool: + """ + Skip a substring matching a regex, copying it unchanged. + """ + match = self._match(regex) + if match: + self.skip(match.end() - match.start()) + return True + else: + return False + + def discard_match(self, regex: Regex) -> bool: + """ + Discard a substring that matches a regex. + """ + match = self._match(regex) + if match: + self.discard(match.end() - match.start()) + return True + else: + return False + + def replace_match(self, regex: Regex, repl: str) -> bool: + """ + Replace a substring that matches a regex. + """ + match = self._match(regex) + if match: + self.replace(match.end() - match.start(), match.expand(repl)) + return True + else: + return False + + def replace_next(self, regex: Regex, repl: str) -> bool: + """ + Replace the next occurence of a regex. + """ + match = self._search(regex) + if match: + self.skip(match.start() - self._opos) + self.replace(match.end() - match.start(), match.expand(repl)) + return True + else: + return False + + def replace_all(self, regex: Regex, repl: str): + """ + Replace all occurences of a regex. + """ + for match in self._finditer(regex): + self.skip(match.start() - self._opos) + self.replace(match.end() - match.start(), match.expand(repl)) + self.skip_rest() + + def build(self): + """ + Build the bistr. 
+ """ + alignment = self._original.alignment.compose(self.alignment) + return bistr(self.original, self.modified, alignment) + + def rewind(self): + """ + Reset this builder to apply another transformation. + """ + self._original = self.build() + self._modified = [] + self._alignment = [(0, 0)] + self._opos = 0 + self._mpos = 0 diff --git a/python/bistring/_icu.py b/python/bistring/_icu.py new file mode 100644 index 0000000..94e597d --- /dev/null +++ b/python/bistring/_icu.py @@ -0,0 +1,87 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +import icu +from typing import Callable, Optional + +from ._bistr import bistr +from ._builder import BistrBuilder + + +def _edit(bs: bistr, op: Callable, locale: Optional[str] = None) -> bistr: + builder = BistrBuilder(bs) + edits = icu.Edits() + ucur = icu.UnicodeString(builder.current) + + if locale is None: + umod = icu.UnicodeString(op(ucur, edits)) + else: + umod = icu.UnicodeString(op(icu.Locale(locale), ucur, edits)) + + for is_change, old_len, new_len, old_i, new_i, _ in edits.getFineIterator(): + old_len = ucur.countChar32(old_i, old_len) + if is_change: + repl = str(umod[new_i:new_i+new_len]) + builder.replace(old_len, repl) + else: + builder.skip(old_len) + + return builder.build() + + +def casefold(bs: bistr) -> bistr: + return _edit(bs, icu.CaseMap.fold) + + +def lower(bs: bistr, locale: Optional[str]) -> bistr: + return _edit(bs, icu.CaseMap.toLower, locale) + + +def upper(bs: bistr, locale: Optional[str]) -> bistr: + return _edit(bs, icu.CaseMap.toUpper, locale) + + +def title(bs: bistr, locale: Optional[str]) -> bistr: + return _edit(bs, icu.CaseMap.toTitle, locale) + + +def _normalize(bs: bistr, normalizer: icu.Normalizer2) -> bistr: + builder = BistrBuilder(bs) + us = icu.UnicodeString(bs.modified) + offset = 0 + while not builder.is_complete: + i = normalizer.spanQuickCheckYes(us) + builder.skip(us.countChar32(0, i)) + if builder.is_complete: + break + us = 
us[i:] + + i = 0 + while i < len(us): + if us.charAt(i) & 0xFC00 == 0xD800: + i += 1 + i += 1 + if normalizer.hasBoundaryBefore(chr(us.char32At(i))): + break + + chunk = us[:i] + normalized = str(normalizer.normalize(chunk)) + builder.replace(chunk.countChar32(), normalized) + us = us[i:] + + return builder.build() + + +_NORMALIZERS = { + "NFC": icu.Normalizer2.getNFCInstance, + "NFKC": icu.Normalizer2.getNFKCInstance, + "NFD": icu.Normalizer2.getNFDInstance, + "NFKD": icu.Normalizer2.getNFKDInstance, +} + +def normalize(bs: bistr, form: str) -> bistr: + factory = _NORMALIZERS.get(form) + if factory: + return _normalize(bs, factory()) + else: + raise ValueError("invalid normalization form") diff --git a/python/bistring/_token.py b/python/bistring/_token.py new file mode 100644 index 0000000..dabb01c --- /dev/null +++ b/python/bistring/_token.py @@ -0,0 +1,289 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +__all__ = [ + "Token", + "Tokenization", + "RegexTokenizer", + "SplittingTokenizer", + "CharacterTokenizer", + "WordTokenizer", + "SentenceTokenizer", +] + +from dataclasses import dataclass +import icu +import re +import threading +from typing import Callable, Iterable + +from ._alignment import Alignment +from ._bistr import bistr +from ._typing import Bounds, Regex, String + + +@dataclass(frozen=True) +class Token: + """ + A token extracted from a string. + """ + + text: bistr + start: int + end: int + + @property + def original(self) -> str: + """ + The original value of this token. + """ + return self.text.original + + @property + def modified(self) -> str: + """ + The modified value of this token. + """ + return self.text.modified + + @classmethod + def slice(cls, text: bistr, start: int, end: int) -> "Token": + """ + Create a Token from a slice of a bistr. 
+ """ + return cls(text[start:end], start, end) + + def __str__(self): + return f"[{self.start}:{self.end}]={self.text}" + + def __repr__(self): + return f"Token({self.text!r}, start={self.start}, end={self.end})" + + +@dataclass(frozen=True) +class Tokenization: + """ + A string and its tokenization. + """ + + text: bistr + _tokens: Iterable[Token] + alignment: Alignment + + def __init__(self, text: bistr, tokens: Iterable[Token]): + """ + Create a Tokenization. + """ + tokens = tuple(tokens) + + alignment = [] + for i, token in enumerate(tokens): + alignment.append((token.start, i)) + alignment.append((token.end, i + 1)) + + self._init(text, tokens, Alignment(alignment)) + + def _init(self, text: bistr, tokens: Iterable[Token], alignment: Alignment): + super().__setattr__("text", text) + super().__setattr__("_tokens", tokens) + super().__setattr__("alignment", Alignment(alignment)) + + @classmethod + def _create(cls, text: bistr, tokens: Iterable[Token], alignment: Alignment): + result = cls.__new__(cls) + result._init(text, tokens, alignment) + return result + + def __iter__(self): + return iter(self._tokens) + + def __len__(self): + return len(self._tokens) + + def __getitem__(self, index): + if isinstance(index, slice): + start, stop, stride = index.indices(len(self)) + if stride != 1: + raise ValueError("Non-unit strides not supported") + text_slice = slice(*self.text_bounds(start, stop)) + return self._create(self.text[text_slice], self._tokens[index], self.alignment[index]) + else: + return self._tokens[index] + + def __str__(self): + tokens = ", ".join(map(str, self)) + return f"Tokenization({self.text}, [{tokens}])" + + def __repr__(self): + return f"Tokenization({self.text!r}, {self._tokens!r})" + + def text_bounds(self, *args) -> Bounds: + """ + Map a span of tokens to the bounds of the corresponding text. 
+ """ + return self.alignment.original_bounds(*args) + + def original_bounds(self, *args) -> Bounds: + """ + Map a span of tokens to the bounds of the corresponding original text. + """ + return self.text.alignment.original_bounds(self.text_bounds(*args)) + + def bounds_for_text(self, *args) -> Bounds: + """ + Map a span of text to the bounds of the corresponding span of tokens. + """ + return self.alignment.modified_bounds(*args) + + def bounds_for_original(self, *args) -> Bounds: + """ + Map a span of original text to the bounds of the corresponding span of + tokens. + """ + text_bounds = self.text.alignment.modified_bounds(*args) + return self.alignment.modified_bounds(text_bounds) + + def slice_by_text(self, *args) -> Iterable[Token]: + """ + Map a span of text to the corresponding span of tokens. + """ + i, j = self.bounds_for_text(*args) + return self[i:j] + + def slice_by_original(self, *args) -> Iterable[Token]: + """ + Map a span of the original text to the corresponding span of tokens. + """ + i, j = self.bounds_for_original(*args) + return self[i:j] + + def align_text_bounds(self, *args) -> Bounds: + """ + Expand a span of text to align it with token boundaries. + """ + return self.text_bounds(self.bounds_for_text(*args)) + + def align_original_bounds(self, *args) -> Bounds: + """ + Expand a span of original text to align it with token boundaries. + """ + return self.original_bounds(self.bounds_for_original(*args)) + + +class RegexTokenizer: + """ + Breaks text into tokens based on a regex. + """ + + def __init__(self, regex: Regex): + self._pattern = re.compile(regex) + + def tokenize(self, text: String) -> Tokenization: + text = bistr(text) + tokens = [] + for match in self._pattern.finditer(text.modified): + tokens.append(Token.slice(text, match.start(), match.end())) + return Tokenization(text, tokens) + + +class SplittingTokenizer: + """ + Splits text into tokens based on a regex. 
+ """ + + def __init__(self, regex: Regex): + self._pattern = re.compile(regex) + + def tokenize(self, text: String) -> Tokenization: + text = bistr(text) + tokens = [] + + last = 0 + for match in self._pattern.finditer(text.modified): + start = match.start() + if start > last: + tokens.append(Token.slice(text, last, start)) + last = match.end() + + end = len(text.modified) + if end > last: + tokens.append(Token.slice(text, last, end)) + + return Tokenization(text, tokens) + + +class _IcuTokenizer: + """ + Base class for ICU BreakIterator-based tokenizers. + """ + + def __init__(self, locale: str, constructor: Callable): + # BreakIterator is not a thread-safe API, so store a cache of + # thread-local iterators + self._locale = icu.Locale(locale) + self._constructor = constructor + self._local = threading.local() + + # Eagerly construct one on this thread as an optimization, and to check + # for errors + self._break_iterator() + + def _break_iterator(self) -> icu.BreakIterator: + if not hasattr(self._local, "bi"): + self._local.bi = self._constructor(self._locale) + return self._local.bi + + def tokenize(self, text: String) -> Tokenization: + text = bistr(text) + tokens = [] + + bi = self._break_iterator() + + utext = icu.UnicodeString(text.modified) + bi.setText(utext) + + ui = bi.first() + uj = bi.nextBoundary() + i = 0 + while uj != icu.BreakIterator.DONE: + j = i + utext.countChar32(ui, uj - ui) + if self._check_token(bi.getRuleStatus()): + tokens.append(Token.slice(text, i, j)) + ui = uj + uj = bi.nextBoundary() + i = j + + return Tokenization(text, tokens) + + def _check_token(self, tag: int) -> bool: + return True + + +class CharacterTokenizer(_IcuTokenizer): + """ + Splits text into user-perceived characters/grapheme clusters. + """ + + def __init__(self, locale: str): + super().__init__(locale, icu.BreakIterator.createCharacterInstance) + + +class WordTokenizer(_IcuTokenizer): + """ + Splits text into words based on Unicode rules. 
+ """ + + def __init__(self, locale: str): + super().__init__(locale, icu.BreakIterator.createWordInstance) + + def _check_token(self, tag: int) -> bool: + return tag >= 100 # UBRK_WORD_NONE_LIMIT + + +class SentenceTokenizer(_IcuTokenizer): + """ + Splits text into sentences based on Unicode rules. + """ + + def __init__(self, locale: str): + super().__init__(locale, icu.BreakIterator.createSentenceInstance) diff --git a/python/bistring/_typing.py b/python/bistring/_typing.py new file mode 100644 index 0000000..3133730 --- /dev/null +++ b/python/bistring/_typing.py @@ -0,0 +1,13 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +from typing import Pattern, Tuple, Union + + +Bounds = Tuple[int, int] + +Range = Union[range, slice, Bounds] + +Regex = Union[str, Pattern] + +String = Union[str, "bistr"] diff --git a/python/setup.py b/python/setup.py new file mode 100644 index 0000000..ccd6c3a --- /dev/null +++ b/python/setup.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 + +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +from setuptools import setup + + +setup( + name="bistring", + version="0.0", + author="Microsoft Research Montreal", + author_email="msrmtle@microsoft.com", + description="Bidirectionally transformed strings", + url="https://dev.azure.com/maluuba/Isentrope", + packages=[ + "bistring", + ], + test_suite="tests", + setup_requires=[ + "pytest-runner >= 4.2", + ], + install_requires=[ + "pyicu >= 2.1", + ], + tests_require=[ + "pytest >= 3.8.2", + ], +) diff --git a/python/tests/__init__.py b/python/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/tests/test_alignment.py b/python/tests/test_alignment.py new file mode 100644 index 0000000..7903220 --- /dev/null +++ b/python/tests/test_alignment.py @@ -0,0 +1,107 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. 
+ +from bistring import Alignment +import pytest + + +def test_empty(): + pytest.raises(ValueError, Alignment, []) + + alignment = Alignment.identity(0) + assert list(alignment) == [(0, 0)] + + assert alignment.original_bounds() == (0, 0) + assert alignment.modified_bounds() == (0, 0) + + assert alignment.original_bounds(0, 0) == (0, 0) + assert alignment.modified_bounds(0, 0) == (0, 0) + + +def test_identity(): + alignment = Alignment.identity(1, 16) + + assert alignment == Alignment((i, i) for i in range(1, 17)) + assert list(alignment) == [(i, i) for i in range(1, 17)] + + assert alignment.original_bounds() == (1, 16) + assert alignment.modified_bounds() == (1, 16) + + assert alignment.original_bounds(4, 7) == (4, 7) + assert alignment.modified_bounds(4, 7) == (4, 7) + + +def test_aligning(): + alignment = Alignment([(0, 0), (1, 2), (2, 4), (3, 6)]) + + assert alignment.original_bounds() == (0, 3) + assert alignment.modified_bounds() == (0, 6) + + assert alignment.original_bounds(0, 0) == (0, 0) + assert alignment.original_bounds(0, 1) == (0, 1) + assert alignment.original_bounds(0, 2) == (0, 1) + assert alignment.original_bounds(0, 3) == (0, 2) + assert alignment.original_bounds(1, 1) == (0, 1) + assert alignment.original_bounds(1, 3) == (0, 2) + assert alignment.original_bounds(1, 4) == (0, 2) + assert alignment.original_bounds(2, 2) == (1, 1) + assert alignment.original_bounds(2, 4) == (1, 2) + assert alignment.original_bounds(2, 5) == (1, 3) + assert alignment.original_bounds(3, 3) == (1, 2) + + assert alignment.modified_bounds(0, 0) == (0, 0) + assert alignment.modified_bounds(0, 1) == (0, 2) + assert alignment.modified_bounds(0, 2) == (0, 4) + assert alignment.modified_bounds(0, 3) == (0, 6) + assert alignment.modified_bounds(1, 1) == (2, 2) + assert alignment.modified_bounds(2, 2) == (4, 4) + + +def test_canonicalization(): + assert Alignment([(0, 0), (1, 2), (1, 2), (2, 4)]) == Alignment([(0, 0), (1, 2), (2, 4)]) + + assert Alignment([(0, 0), (1, 2)]) + 
Alignment([(1, 2), (2, 4)]) == Alignment([(0, 0), (1, 2), (2, 4)]) + + +def _test_composition(first, second): + composed = first.compose(second) + original_range = composed.original_range() + modified_range = composed.modified_range() + + assert original_range == first.original_range() + assert modified_range == second.modified_range() + + for i in original_range: + for j in original_range[i:]: + assert composed.modified_bounds(i, j) == second.modified_bounds(first.modified_bounds(i, j)) + + for i in modified_range: + for j in modified_range[i:]: + assert composed.original_bounds(i, j) == first.original_bounds(second.original_bounds(i, j)) + + +def test_compose(): + first = Alignment((i, 2 * i) for i in range(4)) + second = Alignment((i, 2 * i) for i in range(7)) + _test_composition(first, second) + + +def _test_identity_composition(alignment): + _test_composition(alignment, Alignment.identity(alignment.modified_range())) + _test_composition(Alignment.identity(alignment.original_range()), alignment) + + +def test_compose_identity(): + alignment = Alignment([ + (0, 2), + (2, 2), + (4, 4), + (6, 6), + (8, 6), + ]) + + # Modified sequence is smaller + _test_identity_composition(alignment) + + # Original sequence is smaller + _test_identity_composition(alignment.inverse()) diff --git a/python/tests/test_bistr.py b/python/tests/test_bistr.py new file mode 100644 index 0000000..8b6dd81 --- /dev/null +++ b/python/tests/test_bistr.py @@ -0,0 +1,135 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. + +import unicodedata + +from bistring import Alignment, bistr + + +def test_concat(): + bs = bistr(" ", "") + bs += "Hello" + bs += bistr(" ", " ") + bs += "world!" + bs += bistr(" ", "") + + assert bs.original == " Hello world! " + assert bs.modified == "Hello world!" 
+ + bs = bs[4:7] + assert bs.original == "o w" + assert bs.modified == "o w" + + bs = bs[1:2] + assert bs.original == " " + assert bs.modified == " " + + +def test_strip(): + bs = bistr(" Hello world! ") + assert bs.original == " Hello world! " + assert bs.modified == " Hello world! " + + bs = bs.strip() + assert bs.original == " Hello world! " + assert bs.modified == "Hello world!" + + bs = bistr(" ").strip() + assert bs.modified == "" + assert bs.original == " " + + +def test_casefold(): + # "Hรญ๏ฌƒ" + # รญ has a combining acute accent, ๏ฌƒ is a ligature + bs = bistr("Hi\u0301\uFB03").casefold() + assert bs.original == "Hi\u0301\uFB03" + assert bs.modified == "hi\u0301ffi" + assert bs.modified == bs.original.casefold() + + assert bs[:3].original == "Hi\u0301" + assert bs[:3].modified == "hi\u0301" + + assert bs[4:5].original == "\uFB03" + assert bs[4:5].modified == "f" + + +def test_lower(): + bs = bistr("DฤฐYARBAKIR").lower("en_US") + assert bs.original == "DฤฐYARBAKIR" + assert bs.modified == "diฬ‡yarbakir" + + bs = bistr("DฤฐYARBAKIR").lower("tr_TR") + assert bs.original == "DฤฐYARBAKIR" + assert bs.modified == "diyarbakฤฑr" + + +def test_upper(): + bs = bistr("straรŸe").upper("de_DE") + assert bs.original == "straรŸe" + assert bs.modified == "STRASSE" + assert bs[4:6].original == "รŸ" + assert bs[4:6].modified == "SS" + + bs = bistr("Diyarbakฤฑr").upper("tr_TR") + assert bs.original == "Diyarbakฤฑr" + assert bs.modified == "DฤฐYARBAKIR" + +def test_title(): + bs = bistr("istanbul").title("en_US") + assert bs.original == "istanbul" + assert bs.modified == "Istanbul" + + bs = bistr("istanbul").title("tr_TR") + assert bs.original == "istanbul" + assert bs.modified == "ฤฐstanbul" + + +def test_normalize(): + # รฉ is composed but oฬˆ has a combining diaeresis + bs = bistr("H\u00E9llo\u0308") + + bs = bs.normalize("NFC") + assert bs.original == "H\u00E9llo\u0308" + assert bs.modified == "H\u00E9ll\u00F6" + assert bs.modified == unicodedata.normalize("NFC", 
bs.original) + assert bs[4:5].original == "o\u0308" + assert bs[4:5].modified == "\u00F6" + + bs = bs.normalize("NFD") + assert bs.original == "H\u00E9llo\u0308" + assert bs.modified == "He\u0301llo\u0308" + assert bs.modified == unicodedata.normalize("NFD", bs.original) + assert bs[1:3].original == "\u00E9" + assert bs[1:3].modified == "e\u0301" + + +def test_readme(): + bs = bistr("๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— ๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ") + bs = bs.normalize("NFKD") + bs = bs.casefold() + bs = bs.sub(r"[^a-z ]+", "") + bs = bs[:19] + assert bs.modified == "the quick brown fox" + assert bs.original == "๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐–" + + +def test_equality(): + bs1 = bistr(" Hello world ").strip().casefold() + bs2 = bistr(" Hello world ", "hello world", Alignment([ + (0, 0), + (2, 0), + (3, 1), + (4, 2), + (5, 3), + (6, 4), + (7, 5), + (8, 6), + (9, 7), + (10, 8), + (11, 9), + (12, 10), + (13, 11), + (15, 11), + ])) + assert bs1 == bs2 diff --git a/python/tests/test_builder.py b/python/tests/test_builder.py new file mode 100644 index 0000000..d74e20b --- /dev/null +++ b/python/tests/test_builder.py @@ -0,0 +1,115 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. 
+ +from bistring import bistr, BistrBuilder + + +def test_chunk_words(): + builder = BistrBuilder(" the quick brown fox ") + builder.discard(2) + builder.replace(3, "the") + builder.skip(1) + builder.replace(5, "quick") + builder.replace(2, " ") + builder.replace(5, "brown") + builder.skip(1) + builder.replace(3, "fox") + builder.discard(1) + bs = builder.build() + + assert bs.original == " the quick brown fox " + assert bs.modified == "the quick brown fox" + + assert bs[0:1].original == "the" + assert bs[1:2].original == "the" + assert bs[2:3].original == "the" + + assert bs[0:3].original == "the" + assert bs[1:3].original == "the" + + assert bs[0:4].original == "the " + assert bs[1:4].original == "the " + + assert bs[3:4].original == " " + assert bs[9:10].original == " " + + assert bs[4:15].original == "quick brown" + assert bs[5:14].original == "quick brown" + + assert bs[0:0].original == "" + assert bs[10:10].original == "" + + +def test_chunk_chars(): + builder = BistrBuilder(" the quick brown fox ") + builder.discard_match(r"\s+") + while not builder.is_complete: + builder.skip_match(r"\S+") + builder.replace_match(r"\s+(?=\S)", " ") + builder.discard_match(r"\s+$") + + bs = builder.build() + + assert bs.original == " the quick brown fox " + assert bs.modified == "the quick brown fox" + + assert bs[0:1].original == "t" + assert bs[1:2].original == "h" + assert bs[2:3].original == "e" + + assert bs[0:3].original == "the" + assert bs[1:3].original == "he" + + assert bs[0:4].original == "the " + assert bs[1:4].original == "he " + + assert bs[3:4].original == " " + assert bs[9:10].original == " " + + assert bs[4:15].original == "quick brown" + assert bs[5:14].original == "uick brow" + + assert bs[0:0].original == "" + assert bs[10:10].original == "" + + +def test_empty_string(): + builder = BistrBuilder("") + bs = builder.build() + assert bs.original == "" + assert bs.modified == "" + assert bs[0:0].original == "" + + +def test_iterative(): + builder = 
BistrBuilder("I wish I wouldn't've spent one thousand dollars.") + builder.skip_match(r"[^.]*") + builder.discard_rest() + builder.rewind() + builder.skip_match(r"I wish I "); + builder.replace_match(r"wouldn't've", "would not have"); + builder.skip_match(r" spent "); + builder.replace_match(r"one thousand dollars", "$1,000"); + + bs = builder.build() + assert bs.original == "I wish I wouldn't've spent one thousand dollars." + assert bs.modified == "I wish I would not have spent $1,000" + + +def test_replace_matches(): + builder = BistrBuilder("the cheese that the mouse that the cat that the dog chased played with ate") + builder.replace_next(r"that", "which") + builder.replace_all(r"that", "whom") + + bs = builder.build() + assert bs.original == "the cheese that the mouse that the cat that the dog chased played with ate" + assert bs.modified == "the cheese which the mouse whom the cat whom the dog chased played with ate" + + +def test_replace_backreference(): + builder = BistrBuilder("it doesn't work and stuff doesn't get replaced") + builder.replace_all(r"\bdoesn't (\S+)", r"\1s") + + bs = builder.build() + assert bs.original == "it doesn't work and stuff doesn't get replaced" + assert bs.modified == "it works and stuff gets replaced" diff --git a/python/tests/test_token.py b/python/tests/test_token.py new file mode 100644 index 0000000..02f7085 --- /dev/null +++ b/python/tests/test_token.py @@ -0,0 +1,85 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT license. 
+ +from bistring import bistr + + +def test_regex_tokenizer(): + from bistring import RegexTokenizer + + text = bistr(" ๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— ๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ ") + text = text.normalize("NFKD") + text = text.casefold() + + tokenizer = RegexTokenizer(r"\w+") + + tokens = tokenizer.tokenize(text) + assert tokens.text == text + assert len(tokens) == 9 + assert tokens.text_bounds(0, 2) == (1, 10) + assert tokens[0:2].text == text[1:10] + assert len(tokens.slice_by_text(5, 10)) == 1 + assert len(tokens.slice_by_text(5, 11)) == 1 + assert len(tokens.slice_by_text(3, 13)) == 3 + + +def test_splitting_tokenizer(): + from bistring import SplittingTokenizer + + text = bistr(" ๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— ๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ ") + text = text.normalize("NFKD") + text = text.casefold() + + tokenizer = SplittingTokenizer(r"\s+") + + tokens = tokenizer.tokenize(text) + assert tokens.text == text + assert len(tokens) == 9 + assert tokens.text_bounds(0, 2) == (1, 11) + assert tokens[0:2].text == text[1:11] + assert len(tokens.slice_by_text(5, 10)) == 1 + assert len(tokens.slice_by_text(5, 11)) == 1 + assert len(tokens.slice_by_text(3, 13)) == 3 + + +def test_character_tokenizer(): + from bistring import CharacterTokenizer + + text = bistr(" ๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— ๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ ") + + tokenizer = CharacterTokenizer("en_US") + + tokens = tokenizer.tokenize(text) + assert tokens.text == text + assert all(token.text == text[i:i+1] for i, token in enumerate(tokens)) + + +def test_word_tokenizer(): + from bistring import WordTokenizer + + text = bistr(" ๐•ฟ๐–๐–Š ๐––๐–š๐–Ž๐–ˆ๐–, ๐–‡๐–—๐–”๐–œ๐–“ ๐–‹๐–”๐– ๐–๐–š๐–’๐–•๐–˜ ๐–”๐–›๐–Š๐–— 
๐–™๐–๐–Š ๐–‘๐–†๐–Ÿ๐–ž ๐–‰๐–”๐–Œ ") + + tokenizer = WordTokenizer("en_US") + + tokens = tokenizer.tokenize(text) + assert tokens.text == text + assert len(tokens) == 9 + assert tokens.text_bounds(0, 2) == (1, 10) + assert tokens[0:2].text == text[1:10] + assert len(tokens.slice_by_text(5, 10)) == 1 + assert len(tokens.slice_by_text(5, 11)) == 1 + assert len(tokens.slice_by_text(3, 13)) == 3 + + +def test_sentence_tokenizer(): + from bistring import SentenceTokenizer + + text = bistr("The following sentence is true. The preceeding sentence, surprisingly, is false.") + + tokenizer = SentenceTokenizer("en_US") + + tokens = tokenizer.tokenize(text) + assert tokens.text == text + assert len(tokens) == 2 + assert tokens[0].text == text[:33] + assert tokens[1].text == text[33:]