Preprocessing.py
import os
from collections import Counter

import numpy as np


class Preprocessing:
    def __init__(self):
        self.__train_dir = None
        self.__test_dir = None

    def set_directory(self, dir):
        self.__train_dir = dir
    def get_emails(self):
        """
        Lists the training directory set via set_directory.
        :return: list of full file paths, one for every email
        """
        emails = [os.path.join(self.__train_dir, f) for f in os.listdir(self.__train_dir)]
        return emails

    def get_emails_size(self):
        return len(self.get_emails())
    def words_count(self):
        """
        :return: dictionary (Counter) of the occurrence count of every word across all emails
        """
        counts = {}
        # for every email:
        for mail in self.get_emails():
            # open the file
            with open(mail) as m:
                for i, line in enumerate(m):
                    # the email body is on the third line:
                    if i == 2:
                        # get all words and count them:
                        words = line.split()
                        for word in words:
                            counts[word] = counts.get(word, 0) + 1
        return self.__clean_dict(Counter(counts))
    def __clean_dict(self, dict_counts):
        """
        Responsible for removing non-words and absurd single characters, which are irrelevant.
        :param dict_counts: dictionary of every word's occurrence count across all emails
        :return: cleaned and filtered dictionary
        """
        to_be_removed = list(dict_counts.keys())
        for item in to_be_removed:
            # delete all non-alphabetic entries:
            if not item.isalpha():
                del dict_counts[item]
            # delete all single letters:
            elif len(item) == 1:
                del dict_counts[item]
        return dict_counts
    def build_sparse_feat_matrix(self):
        """
        This function is responsible for the feature-extraction process.
        The feature space is the 1000 most common words, giving a matrix of shape
        [emails_size, features_size].
        :return: feature matrix, a bag-of-words model representation
        """
        # build the vocabulary once (the 1000 most common words) and map each word to a column index
        vocabulary = [word for word, _ in self.words_count().most_common(1000)]
        word_idx = {word: idx for idx, word in enumerate(vocabulary)}
        emails_n = self.get_emails_size()
        matrix = np.zeros((emails_n, len(vocabulary)))
        for doc_idx, email in enumerate(self.get_emails()):
            # open the email file
            with open(email) as mail:
                for i, line in enumerate(mail):
                    # get the email body only (third line)
                    if i == 2:
                        words = line.split()
                        for word in words:
                            # only words in the vocabulary become features
                            if word in word_idx:
                                matrix[doc_idx, word_idx[word]] = words.count(word)
        return matrix
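

# A minimal usage sketch, assuming a directory of plain-text email files whose body sits on
# the third line of each file (as the class above expects). The directory name "train-mails"
# is a hypothetical example, not part of the original module.
if __name__ == "__main__":
    preprocessing = Preprocessing()
    preprocessing.set_directory("train-mails")  # hypothetical path to the training emails
    print("number of emails:", preprocessing.get_emails_size())
    word_counts = preprocessing.words_count()
    print("10 most common words:", word_counts.most_common(10))
    feature_matrix = preprocessing.build_sparse_feat_matrix()
    print("feature matrix shape:", feature_matrix.shape)  # (emails_size, features_size)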