-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathantonyms.py
71 lines (52 loc) · 2.04 KB
/
antonyms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
# coding: utf8
from parse import parse_block
from scrapy.selector import Selector
import zipfile
# Location of the EPUB archive that get_htmls() opens with zipfile.ZipFile.
# The path is relative, so it is resolved against the current working directory.
EPUB_PATH = 'books/TAMOP-4_2_5-09_Ellentetes_jelentesu_szavak_adatbazisa.epub'
class Word(object):
    """A dictionary entry together with the entries registered as its antonyms.

    Attributes:
        word: the entry text.
        category: category label of the entry; together with ``word`` it is
            the identity used by ``add_antonym`` to detect duplicates.
        type: type label, stored exactly as passed in.
        comment: optional comment, ``None`` when not given.
        antonyms: list of ``Word`` objects added through ``add_antonym``,
            in insertion order.
    """

    def __init__(self, word, category, type, comment=None):
        # ``type`` shadows the builtin inside this method only; the name is
        # part of the public signature, so it is kept for compatibility.
        self.word = word
        self.category = category
        self.type = type
        self.comment = comment
        self.antonyms = []

    def __repr__(self):
        return '{}({!r}, {!r}, {!r}, {!r})'.format(
            self.__class__.__name__,
            self.word,
            self.category,
            self.type,
            self.comment,
        )

    def add_antonym(self, other):
        """Append ``other`` to ``self.antonyms`` unless it is already present.

        Two entries count as the same antonym when both their ``word`` and
        ``category`` match; ``type`` and ``comment`` are not compared.  The
        relation is not made symmetric here: callers that want both
        directions must call this on each side.
        """
        key = (other.word, other.category)
        if any((known.word, known.category) == key for known in self.antonyms):
            return
        self.antonyms.append(other)
def get_htmls(path=None):
    """Yield the raw contents of the content documents stored in an EPUB.

    Only archive members whose name starts with ``OEBPS/text/content`` and
    sorts at or after ``OEBPS/text/content0006.xhtml`` are returned, in
    sorted name order.  A ``processing <name>`` line is printed for each one.

    Args:
        path: path of the EPUB (zip) archive; defaults to ``EPUB_PATH``.

    Yields:
        The undecoded bytes of each selected member, as returned by
        ``ZipFile.read``.
    """
    if path is None:
        path = EPUB_PATH
    # The archive stays open while the generator is being consumed and is
    # closed once it is exhausted or closed.
    with zipfile.ZipFile(path, 'r') as archive:
        for name in sorted(archive.namelist()):
            if not name.startswith('OEBPS/text/content'):
                continue
            # NOTE(review): members before content0006 are skipped; presumably
            # they hold front matter rather than entries - confirm.
            if name < 'OEBPS/text/content0006.xhtml':
                continue
            # Single-argument form prints the same text on Python 2 and 3.
            print('processing %s' % name)
            yield archive.read(name)
# Every Word seen so far, keyed by (word, category).  Both headwords and the
# antonyms listed under them are stored here, so an entry that appears in
# several places is represented by a single shared Word object.
words = {}
for html in get_htmls():
    # Most recent headword of this document; antonyms found in a paragraph
    # are attached to it.
    # NOTE(review): the antonym loop is assumed to run for every paragraph
    # (not only those that introduce a headword), which is why `current` is
    # carried across paragraphs - confirm against the original layout.  If a
    # paragraph yields antonyms before any headword was seen, the attribute
    # access on None below raises AttributeError.
    current = None
    for paragraph in Selector(text=html).xpath('//p'):
        (word, category, word_type, comment), antonyms = parse_block(
            paragraph.extract())
        if word:
            key = (word, category)
            current = words.get(key)
            if current is None:
                current = words[key] = Word(word, category, word_type, comment)
        for ant_word, _ant_category, ant_type, ant_comment in antonyms:
            # Antonyms are filed under the headword's category; the category
            # parsed for the antonym itself is not used.
            ant_key = (ant_word, current.category)
            antonym = words.get(ant_key)
            if antonym is None:
                antonym = words[ant_key] = Word(
                    ant_word, current.category, ant_type, ant_comment)
            # Record the relation in both directions.
            current.add_antonym(antonym)
            antonym.add_antonym(current)
# Dump the collected entries: one "[word]" header per entry whose category is
# 'mn', followed by one antonym per line.  Entries are written in sorted
# (word, category) order; text is encoded as UTF-8 before being written.
# NOTE(review): 'mn' presumably marks adjectives (Hungarian "melleknev") -
# confirm against the category labels produced by parse_block.
with open('words/antonyms.txt', 'w') as f:
    for key in sorted(words.keys()):
        entry = words[key]
        if entry.category != 'mn':
            continue
        header = u'\n[{}]\n'.format(entry.word)
        f.write(header.encode('utf8'))
        for antonym in entry.antonyms:
            line = u'{}\n'.format(antonym.word)
            f.write(line.encode('utf8'))
# Single-argument form prints the same text as the Python 2 print statement.
print('{} words'.format(len(words)))