toktok.py
#!/usr/bin/env python3
# By Jon Dehdari, 2017
""" Command-line interface to all of NLTK's tokenizers. """
from __future__ import print_function
import sys
import argparse
import re
import nltk.tokenize as tok


def load_tokenizer(cmd_args):
    """ Selects the appropriate tokenizer, from command-line arguments. """
    tokr = tok
    if cmd_args.tok == 'casual':
        tokr = tok.casual.TweetTokenizer()
    #elif cmd_args.tok == 'mwe':
    #    tokr = tok.mwe.MWETokenizer()
    #elif cmd_args.tok == 'punkt':
    #    tokr = tok.punkt.PunktSentenceTokenizer()
    #elif cmd_args.tok == 'regexp':
    #    tokr = tok.regexp.RegexpTokenizer()
    #elif cmd_args.tok == 'repp':
    #    tokr = tok.repp.ReppTokenizer()
    #elif cmd_args.tok == 'sexpr':
    #    tokr = tok.sexpr.SExprTokenizer()
    elif cmd_args.tok == 'stanford':
        tokr = tok.stanford.StanfordTokenizer()
    elif cmd_args.tok == 'texttiling':
        tokr = tok.texttiling.TextTilingTokenizer()
    elif cmd_args.tok == 'treebank':
        tokr = tok.treebank.TreebankWordTokenizer()
    elif cmd_args.tok == 'moses':
        tokr = tok.moses.MosesTokenizer(lang=cmd_args.lang)
    else:
        # Thanks to Liling Tan for implementing Tok-tok's regexes into NLTK!
        tokr = tok.toktok.ToktokTokenizer()
    return tokr


def tok_stdin(cmd_args, tokr):
""" Tokenizes each line, and prints it out. """
for line in sys.stdin:
# Skip empty lines
if cmd_args.no_empty and line == '\n':
continue
# Don't tokenize comments
if cmd_args.skip_comments and line[0] == '#':
print(line, end='')
continue
line = ' '.join(tokr.tokenize(line))
if cmd_args.digit is not None:
line = re.sub('\d', cmd_args.digit, line)
if cmd_args.lc:
line = line.lower()
print(line)
def main():
""" Parse command-line arguments and tokenize STDIN. """
parser = argparse.ArgumentParser(
description="Command-line interface to all of NLTK's tokenizers.")
parser.add_argument('-d', '--digit', type=str,
help='Conflate all digits. For example "3.14" -> "5.55"')
parser.add_argument('-l', '--lang', type=str, default='en',
help='Specify language code for moses tokenizer (default: %(default)s)')
parser.add_argument('--lc', '--lower', action='store_true',
help='Lowercase text')
parser.add_argument('--no_empty', action='store_true',
help='Remove empty lines')
parser.add_argument('--skip_comments', action='store_true',
help="Don't tokenize lines starting with '#'")
parser.add_argument('-t', '--tok', type=str, default='toktok',
help='Specify tokenizer submodule {casual, moses,\
stanford, toktok, treebank} (default: %(default)s)')
cmd_args = parser.parse_args()
# Load tokenizer
tokr = load_tokenizer(cmd_args)
# Tokenize stdin
tok_stdin(cmd_args, tokr)
if __name__ == '__main__':
main()