"""Sentence-split plain text with NLTK's Punkt tokenizer.

Loads the pre-trained English Punkt model, registers extra abbreviations so
strings such as "Dr." or "i.e." do not end a sentence, and exposes a
tokenize() helper that returns a list of sentences.
"""
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# Disabled manual setup, kept for reference:
# punkt_param = PunktParameters()
# sentence_splitter = PunktSentenceTokenizer(punkt_param)
# text = "is THAT what you mean, Mrs. Hussey?"
# sentences = sentence_splitter.tokenize(text)
# Output files for the disabled chunked-output code inside tokenize():
# outputList1.txt through outputList25.txt, all truncated when the module is imported.
outputFiles = [open('outputList%d.txt' % i, 'w') for i in range(1, 26)]
# Module-level tokenizer: the pre-trained English Punkt model with extra
# abbreviations registered so titles such as "Dr." and "Mr." do not end a
# sentence. (tokenize() below builds its own detector with a longer list.)
extra_abbreviations = ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'i.e']
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)
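# Illustration only (the example text is an assumed input, not part of the
# pipeline): with 'mr' and 'dr' registered as abbreviations, the periods after
# the titles are treated as abbreviation marks, so this should come back as
# two sentences rather than four:
# sentence_tokenizer.tokenize("Mr. Smith met Dr. Jones. They talked at length.")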
def tokenize(fileName):
    """Split text into sentences, paragraph by paragraph.

    Despite the name, ``fileName`` is the text itself (not a path): it is
    split on newlines into paragraphs, and each paragraph is run through the
    Punkt sentence tokenizer with extra abbreviations registered.
    """
    extra_abbreviations = ['dr', 'vs', 'mr', 'mrs', 'prof', 'inc', 'e.g', 'pp', 'lett', 'phys', 'fig', 'vol', 'appl', 'al', 'etc', 'i.e']
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sent_detector._params.abbrev_types.update(extra_abbreviations)
    # Disabled variant:
    # sentences = []
    # for paragraph in fileName.split('\n'):
    #     sentences.extend(punkt.tokenize(paragraph))
    textList = []
    #textList = sent_detector.tokenize(fileName.strip())
    for paragraph in fileName.split('\n'):
        textList.extend(sent_detector.tokenize(paragraph))
    # Disabled chunked output: split the sentence list into blocks of 100 and
    # write each block to its own file in outputFiles (at most 25 blocks;
    # Python 2 style writes, hence the .encode('utf-8')).
    # numOfFilesNeeded = (len(textList) // 100) + 1
    # for i in range(numOfFilesNeeded):
    #     chunk = textList[i * 100:(i + 1) * 100]
    #     outputFiles[i].write(" \n".join(chunk).encode('utf-8'))
    #tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
    #tokenizedText = ' \n'.join(sent_detector.tokenize(fileName.strip()))
    #textList = ' \n'.join(textList)
    #print('\n-----\n'.join(tokenizedText.tokenize(text.strip())))
    #print("outputList1")
    #return .join(textList)
    #return tokenizedText
    return textList
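
# Minimal usage sketch, assuming the module is run directly on a plain-text
# file; 'input.txt' is a hypothetical path, not something this script provides.
if __name__ == '__main__':
    with open('input.txt') as f:
        for sentence in tokenize(f.read()):
            print(sentence)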