Skip to content

Commit

Permalink
Merge pull request #70 from hyunwoongko/6.0.0
Browse files Browse the repository at this point in the history
6.0.0
  • Loading branch information
hyunwoongko authored Apr 27, 2024
2 parents 79b3956 + a3b3e07 commit 91355e2
Show file tree
Hide file tree
Showing 168 changed files with 261,887 additions and 906 deletions.
Binary file added .DS_Store
Binary file not shown.
24 changes: 24 additions & 0 deletions .github/workflows/test_macos.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
name: macos-latest
on: push

jobs:
test:
runs-on: macos-latest
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.10.11

- name: Install kss locally
run: |
pip3 install -e .
- name: Install pytest
run: |
python3 -m pip install pytest
- name: Run the test suite
run: |
cd tests && pytest -v
24 changes: 24 additions & 0 deletions .github/workflows/test_ubuntu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
name: ubuntu-latest
on: push

jobs:
test:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.10.11

- name: Install kss locally
run: |
pip3 install -e .
- name: Install pytest
run: |
python3 -m pip install pytest
- name: Run the test suite
run: |
cd tests && pytest -v
24 changes: 24 additions & 0 deletions .github/workflows/test_windows.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
name: windows-latest
on: push

jobs:
test:
runs-on: windows-latest
steps:
- name: Checkout repository
uses: actions/checkout@v2

- name: Setup Python
uses: actions/setup-python@v2
with:
python-version: 3.10.11

- name: Install kss locally
run: |
pip3 install -e .
- name: Install pytest
run: |
python3 -m pip install pytest
- name: Run the test suite
run: |
cd tests && pytest -v
4 changes: 4 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,8 @@ include csrc/kss_cython.pyx
include csrc/sentence_splitter.cpp
include csrc/sentence_splitter.h
include csrc/__init__.py
include kss/_modules/g2p/assets/rules.txt
include kss/_modules/g2p/assets/idiom.txt
include kss/_modules/g2p/assets/table.csv
include kss/_modules/augmentation/assets/wordnet.json
include setup.py
2,013 changes: 1,190 additions & 823 deletions README.md

Large diffs are not rendered by default.

Binary file added bench/.DS_Store
Binary file not shown.
165 changes: 165 additions & 0 deletions bench/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
UNICODE_TO_REMOVE = {
"\u0000": "", # Null
"\u0001": "", # Start of Heading
"\u0002": "", # Start of Text
"\u0003": "", # End of Text
"\u0004": "", # End of Transmission
"\u0005": "", # Enquiry
"\u0006": "", # Acknowledge
"\u0007": "", # Bell
"\u0008": "", # Backspace
"\u0009": "", # Horizontal Tab
"\u000B": "", # Vertical Tab
"\u000C": "", # Form Feed
"\u000D": "", # Carriage Return
"\u000E": "", # Shift Out
"\u000F": "", # Shift In
"\u0010": "", # Data Link Escape
"\u0011": "", # Device Control 1
"\u0012": "", # Device Control 2
"\u0013": "", # Device Control 3
"\u0014": "", # Device Control 4
"\u0015": "", # Negative Acknowledge
"\u0016": "", # Synchronous Idle
"\u0017": "", # End of Transmission Block
"\u0018": "", # Cancel
"\u0019": "", # End of Medium
"\u001A": "", # Substitute
"\u001B": "", # Escape
"\u001C": "", # File Separator
"\u001D": "", # Group Separator
"\u001E": "", # Record Separator
"\u001F": "", # Unit Separator
"\u007F": "", # Delete
"\u0080": "", # Padding Character
"\u0081": "", # High Octet Preset
"\u0082": "", # Break Permitted Here
"\u0083": "", # No Break Here
"\u0084": "", # Index
"\u0085": "", # Next Line
"\u0086": "", # Start of Selected Area
"\u0087": "", # End of Selected Area
"\u0088": "", # Character Tabulation Set
"\u0089": "", # Character Tabulation with Justification
"\u008A": "", # Line Tabulation Set
"\u008B": "", # Partial Line Forward
"\u008C": "", # Partial Line Backward
"\u008D": "", # Reverse Line Feed
"\u008E": "", # Single-Shift Two
"\u008F": "", # Single-Shift Three
"\u0090": "", # Device Control String
"\u0091": "", # Private Use 1
"\u0092": "", # Private Use 2
"\u0093": "", # Set Transmit State
"\u0094": "", # Cancel Character
"\u0095": "", # Message Waiting
"\u0096": "", # Start of Guarded Area
"\u0097": "", # End of Guarded Area
"\u0098": "", # Start of String
"\u0099": "", # Single Graphic Character Introducer
"\u009A": "", # Single Character Introducer
"\u009B": "", # Control Sequence Introducer
"\u009C": "", # String Terminator
"\u009D": "", # Operating System Command
"\u009E": "", # Privacy Message
"\u009F": "", # Application Program Command
"\u00A0": "", # No-Break Space
"\u00AD": "", # Soft Hyphen
"\u061C": "", # Arabic Letter Mark
"\u115f": "", # Hangul Choseong Filler
"\u1160": "", # Hangul Jungseong Filler
"\u1680": "", # Ogham Space Mark
"\u17B4": "", # Khmer Vowel Inherent AQ
"\u17B5": "", # Khmer Vowel Inherent AA
"\u180B": "", # Mongolian Free Variation Selector One
"\u180C": "", # Mongolian Free Variation Selector Two
"\u180D": "", # Mongolian Free Variation Selector Three
"\u180E": "", # Mongolian Vowel Separator
"\u2000": "", # En Quad
"\u2001": "", # Em Quad
"\u2002": "", # En Space
"\u2003": "", # Em Space
"\u2004": "", # Three-Per-Em Space
"\u2005": "", # Four-Per-Em Space
"\u2006": "", # Six-Per-Em Space
"\u2007": "", # Figure Space
"\u2008": "", # Punctuation Space
"\u2009": "", # Thin Space
"\u200A": "", # Hair Space
"\u200B": "", # Zero Width Space
"\u200C": "", # Zero Width Non-Joiner
"\u200D": "", # Zero Width Joiner
"\u200E": "", # Left-to-Right Mark
"\u200F": "", # Right-to-Left Mark
"\u202A": "", # Left-to-Right Embedding
"\u202B": "", # Right-to-Left Embedding
"\u202C": "", # Pop Directional Formatting
"\u202D": "", # Left-to-Right Override
"\u202E": "", # Right-to-Left Override
"\u202F": "", # Narrow No-Break Space
"\u2060": "", # Word Joiner
"\u2061": "", # Function Application
"\u2062": "", # Invisible Times
"\u2063": "", # Invisible Separator
"\u2064": "", # Invisible Plus
"\u2066": "", # Left-to-Right Isolate
"\u2067": "", # Right-to-Left Isolate
"\u2068": "", # First Strong Isolate
"\u2069": "", # Pop Directional Isolate
"\u206A": "", # Inhibit Symmetric Swapping
"\u206B": "", # Activate Symmetric Swapping
"\u206C": "", # Inhibit Arabic Form Shaping
"\u206D": "", # Activate Arabic Form Shaping
"\u206E": "", # National Digit Shapes
"\u206F": "", # Nominal Digit Shapes
"\u3164": "", # Hangul Filler
"\uFEFF": "", # Zero Width No-Break Space
"\uFFA0": "", # Halfwidth Hangul Filler
"\uFFFC": "", # Object Replacement Character
"\uFFFE": "", # Byte Order Mark
"\uFFFF": "", # Non character
"\U0001307B": "", # Egyptian Hieroglyph Z015B
"\U0001BCA0": "", # Shorthand Format Letter Overlap
}
UNICODE_TO_REMOVE.update({chr(char): "" for char in range(0xE0000, 0xF8FF + 1)}) # Tag + PUA characters
UNICODE_TO_REMOVE.update({chr(char): "" for char in range(0xF0000, 0xFFFFF + 1)}) # PUA characters
UNICODE_TO_REMOVE.update({chr(char): "" for char in range(0x100000, 0x10FFFF + 1)}) # PUA characters
UNICODE_TO_REMOVE.update({chr(char): "" for char in range(0x1D100, 0x1D1FF + 1)}) # Musical symbols

TO_REPLACE = {
"\u0020": " ", # Space
"\u0009": "\t", # Horizontal Tab
"\u000A": "\n", # Line Feed
"\u034F": "\u034F", # Combining Grapheme Joiner
"\u2028": "\n", # Line Separator
"\u2029": "\n\n", # Paragraph Separator
"\u2000": " ", # En Quad
"\u2001": " ", # Em Quad
"\u2002": " ", # En Space
"\u2003": " ", # Em Space
"\u2004": " ", # Three-Per-Em Space
"\u2005": " ", # Four-Per-Em Space
"\u2006": " ", # Six-Per-Em Space
"\u2007": " ", # Figure Space
"\u2008": " ", # Punctuation Space
"\u2009": " ", # Thin Space
"\u200A": " ", # Hair Space
"\u205F": " ", # Medium Mathematical Space
"\u3000": " ", # Ideographic Space
"\u2800": "\u2800", # Braille Pattern Blank
"\u200D": "\u200D", # Zero Width Joiner
}
# TO_REPLACE.update({chr(char): chr(char) for char in range(0x02B0, 0x02FF + 1)}) # Modifier letters
# TO_REPLACE.update({chr(char): chr(char) for char in range(0x0300, 0x036F + 1)}) # Combining Diacritical Marks
# TO_REPLACE.update({chr(char): chr(char) for char in range(0xFE00, 0xFE0F + 1)}) # Variation Selectors
# TO_REPLACE.update({chr(char): chr(char) for char in range(0x1E0100, 0x1E01EF + 1)}) # Variation Selectors

if __name__ == '__main__':
text = "string\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007\u0008\u0009\u000B\u000C\u000D\u000E\u000F\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F\u007F\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087\u0088\u0089\u008A\u008B\u008C\u008D\u008E\u008F\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097\u0098\u0099\u009A\u009B\u009C\u009D\u009E\u009F\u00A0\u00AD\u061C\u115f\u1160\u1680\u17B4\u17B5\u180B\u180C\u180D\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u200C\u200D\u200E\u200F\u202A\u202B\u202C\u202D\u202E\u202F\u2060\u2061\u2062\u2063\u2064\u2066\u2067\u2068\u2069\u206A\u206B\u206C\u206D\u206E\u206F\u3164\uFEFF\uFFA0\uFFFC\uFFFE\uFFFF\U0001307B\U0001BCA0"
text += "".join(chr(char) for char in range(0xE0000, 0xF8FF + 1)) # Tag + PUA characters
text += "".join(chr(char) for char in range(0xF0000, 0xFFFFF + 1)) # PUA characters
text += "".join(chr(char) for char in range(0x100000, 0x10FFFF + 1)) # PUA characters
text += "".join(chr(char) for char in range(0x1D100, 0x1D1FF + 1)) # Musical symbols
text += "\u0020\u0009\u000A\u034F\u2028\u2029\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u205F\u3000\u2800\u200D"
print(len(text))
print(text)
Binary file added bench/preprocessing/.DS_Store
Binary file not shown.
Loading

0 comments on commit 91355e2

Please sign in to comment.