# -*- coding: utf-8 -*-
"""@package part_of_speech
Documentation about the part-of-speech tagging process.
"""

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer

import common_functions
import tweaks

__author__ = "Romain Claret"
__maintainer__ = "Romain Claret"
__copyright__ = "Copyright 2015, Romain Claret"
__credits__ = ["Romain Claret"]

# Copyright (C) Romain Claret, All Rights Reserved
# Unauthorized copying of this file, via any medium, is strictly prohibited
__license__ = "Proprietary and confidential"
__version__ = "1.0.0"
__email__ = "romain.claret@rocla.ch"
__status__ = "Prototype"  # Prototype, Development, Production
__date__ = "08.11.2015"
def get_word_tag(treebank_tag):
    """
    Map a Penn Treebank tag to its WordNet part of speech, keeping only the
    Adjective, Verb, Noun, and Adverb categories.

    :param treebank_tag: tag produced by the tokenization/tagging step
    :return: the WordNet symbol for the word, or an empty string if the tag
             is not in one of the wanted categories
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return ''
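
# A minimal behavior sketch (the tags below are illustrative Penn Treebank
# tags, not taken from a specific corpus):
#   get_word_tag('JJ')  -> wordnet.ADJ   ('a')
#   get_word_tag('VBD') -> wordnet.VERB  ('v')
#   get_word_tag('NNS') -> wordnet.NOUN  ('n')
#   get_word_tag('RB')  -> wordnet.ADV   ('r')
#   get_word_tag('DT')  -> ''            (determiners are discarded)
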
def get_words_frequency(string, top_values):
    """
    Compute the word frequencies in a corpus.

    :param string: the corpus as a single string
    :param top_values: maximum number of sorted entries to return; 0 returns
                       all entries unsorted
    :return: list of (synset name, frequency) pairs for the words in their
             synset form
    """
    # import stop words from the nltk corpus
    stop_words_en_nltk = list(stopwords.words('english'))
    # additional custom stop words for punctuation and other noise
    stop_words_en_custom = ['.', ',', '\'', '!', '(', ')', ':', ';', '?', '--', '*', '[', ']', '``', "''",
                            '&', '\'ll', '\'ve', '\'s', '\'re', 'a', 'b', 'c',
                            'i', '\'i', 'this', 'n\'t', 'could', 'should', 'would', 'can', 'will', 'shall',
                            'there', 'it', 'also', 'in', 'the', 'many', 'by', 'an',
                            '1990s', '+', '-', '...', '=', '%', '#', '[hide]', '[edit]', '.jpg', '/',
                            'be.v.01', 'have.v.01', 'use.v.01', 'besides.r.02', 'analysis.n.01', 'categorization.n.03',
                            'vitamin_e.n.01', 'vitamin_c.n.01', 'include.v.01', 'such.s.01', 'many.a.01', 'order.n.01',
                            'episode.n.01', 'show.n.01', 'not.r.01', 'standard.n.01', 'survey.n.01', 'factor.n.01',
                            'first.a.01']
    # stop words for the numbers 0..300, kept as strings
    until_number = 300
    stop_words_en_custom_numbers = [str(i) for i in range(until_number + 1)]
    # merge the three stop-word lists
    stop_words_en = stop_words_en_nltk + stop_words_en_custom + stop_words_en_custom_numbers
    words_list_tmp = word_tokenize(string.lower())
    words_list = []
    lemmatizer = WordNetLemmatizer()
    for word in nltk.pos_tag(words_list_tmp):
        tag = get_word_tag(word[1])
        if tag != '':
            try:
                # lemmatize the word, then keep the name of its most common synset
                synset_word = wordnet.synsets(lemmatizer.lemmatize(word[0], pos=tag), pos=tag)[0]
                words_list.append(synset_word.name())
            except IndexError:
                # the lemma has no synset for this part of speech; skip it
                pass
    processed_word_list = [word for word in words_list if word not in stop_words_en]
    text_obj = nltk.Text(processed_word_list)
    fd = FreqDist(text_obj)
    result = list(fd.items())
    if top_values != 0:
        result.sort(key=lambda x: x[1], reverse=True)
        result = result[:top_values]
    return result
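
# A hedged usage sketch (the sentence is made up for illustration; the exact
# synset names depend on the installed WordNet data):
#   get_words_frequency("The cats were chasing the small dog.", 2)
# tokenizes and POS-tags the sentence, lemmatizes each kept word, counts the
# synset names, and returns the two most frequent entries, e.g. something
# like [('cat.n.01', 1), ('chase.v.01', 1)].
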
if __name__ == "__main__":
    # Run when this file is executed directly. It will:
    #   - open the first corpus in alphabetical order,
    #   - compute the word frequencies,
    #   - print each synset with its frequency.
    files = common_functions.getListFolders(tweaks.textFilesDirectory)
    with open(tweaks.textFilesDirectory + files[0], encoding="utf-8") as f:
        text = f.read()
    # strip non-ASCII characters
    processed_string = "".join(i for i in text if ord(i) < 128)
    frequency = get_words_frequency(processed_string, 0)
    for word in frequency:
        print(word)
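
# Each printed item is a (synset_name, count) tuple; for example, a
# hypothetical corpus mentioning cats three times would print ('cat.n.01', 3).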