# ch13_naive_bayes.py
# Naive Bayes spam classifier: tokenizing, word counting, smoothed word
# probabilities, and a train/test evaluation script.
from __future__ import division
from collections import Counter, defaultdict
from machine_learning import split_data
import math, random, re, glob
def tokenize(message):
    """Return the set of distinct lowercase words found in `message`.

    A word is any run of the characters a-z, 0-9 or apostrophe after the
    message has been lowercased; everything else acts as a separator.
    """
    # building a set drops repeated words, so each word counts once per message
    return set(re.findall("[a-z0-9']+", message.lower()))
def count_words(training_set):
    """Tally, per word, how many spam and non-spam messages contain it.

    `training_set` is an iterable of (message, is_spam) pairs.  The result
    maps each word to a two-element list [spam_count, non_spam_count].
    """
    tallies = defaultdict(lambda: [0, 0])
    for text, spam_flag in training_set:
        # slot 0 holds the spam count, slot 1 the non-spam count
        slot = 0 if spam_flag else 1
        for token in tokenize(text):
            tallies[token][slot] += 1
    return tallies
def word_probabilities(counts, total_spams, total_non_spams, k=0.5):
    """Turn word counts into a list of (w, p(w | spam), p(w | ~spam)) triplets.

    counts          -- mapping of word -> (spam_count, non_spam_count)
    total_spams     -- number of spam messages the counts were taken from
    total_non_spams -- number of non-spam messages the counts were taken from
    k               -- smoothing pseudocount added to every count, so a word
                       never seen in one class still gets a nonzero probability

    Each probability is (count + k) / (total + 2 * k).
    """
    # .items() exists on both Python 2 and Python 3 dicts; .iteritems() is
    # Python 2 only and raises AttributeError on Python 3.
    return [(w,
             (spam + k) / (total_spams + 2 * k),
             (non_spam + k) / (total_non_spams + 2 * k))
            for w, (spam, non_spam) in counts.items()]
def spam_probability(word_probs, message):
    """Return the estimated probability that `message` is spam.

    word_probs -- iterable of (word, p(word | spam), p(word | ~spam)) triplets
    message    -- the text to score; it is split into words with tokenize()

    Every known word contributes to both likelihoods: its probability when it
    appears in the message, one minus that probability when it does not.  The
    result is likelihood_spam / (likelihood_spam + likelihood_not_spam), which
    weights the two classes equally.  With no known words the result is 0.5.

    Raises ValueError (from math.log) if any probability is exactly 0 or 1.
    """
    message_words = tokenize(message)
    log_prob_if_spam = log_prob_if_not_spam = 0.0

    for word, prob_if_spam, prob_if_not_spam in word_probs:
        if word in message_words:
            # word is in the message: add the log probability of seeing it
            log_prob_if_spam += math.log(prob_if_spam)
            log_prob_if_not_spam += math.log(prob_if_not_spam)
        else:
            # word is absent: add the log probability of _not_ seeing it
            log_prob_if_spam += math.log(1.0 - prob_if_spam)
            log_prob_if_not_spam += math.log(1.0 - prob_if_not_spam)

    # With many words both sums become so negative that exp() underflows to
    # 0.0 for each, and the ratio below would divide by zero.  Subtracting the
    # larger sum from both leaves the ratio unchanged and guarantees that one
    # of the two terms is exp(0) == 1.0.
    shift = max(log_prob_if_spam, log_prob_if_not_spam)
    prob_if_spam = math.exp(log_prob_if_spam - shift)
    prob_if_not_spam = math.exp(log_prob_if_not_spam - shift)
    return prob_if_spam / (prob_if_spam + prob_if_not_spam)
class NaiveBayesClassifier:
    """Spam classifier built on the word-counting helpers in this module.

    `k` is the smoothing pseudocount handed to word_probabilities();
    `word_probs` holds the trained (word, p(word | spam), p(word | ~spam))
    triplets and is empty until train() has been called.
    """

    def __init__(self, k=0.5):
        self.k = k
        self.word_probs = []

    def train(self, training_set):
        """Fit the model to `training_set`, a sized collection of
        (message, is_spam) pairs, replacing any earlier word_probs."""
        # how many messages fall in each class
        spam_total = sum(1 for _, flagged in training_set if flagged)
        ham_total = len(training_set) - spam_total

        # counts -> smoothed per-word probabilities
        self.word_probs = word_probabilities(
            count_words(training_set), spam_total, ham_total, self.k)

    def classify(self, message):
        """Return the spam probability of `message` under the trained model."""
        return spam_probability(self.word_probs, message)
def get_subject_data(path):
    """Collect (subject, is_spam) pairs from the files matching `path`.

    path -- a glob pattern; every matching filename is opened and read as text

    A file is labelled non-spam when the substring "ham" occurs anywhere in
    its matched filename (directory part included) and spam otherwise.  Every
    line that starts with "Subject:" yields one pair, so a file containing
    several such lines contributes several entries.  The subject is the rest
    of the line with surrounding whitespace removed.
    """
    prefix = "Subject:"
    data = []

    # glob.glob returns every filename that matches the wildcarded path
    for fn in glob.glob(path):
        is_spam = "ham" not in fn

        with open(fn, 'r') as message_file:
            for line in message_file:
                if line.startswith(prefix):
                    # Slice the prefix off rather than using a regex that
                    # requires whitespace after the colon: "Subject:Foo" must
                    # yield "Foo", not keep the header name in the subject.
                    subject = line[len(prefix):].strip()
                    data.append((subject, is_spam))

    return data
def p_spam_given_word(word_prob):
    """Return p(spam | word) for one (word, p(word | spam), p(word | ~spam))
    triplet, giving both classes equal weight."""
    _, given_spam, given_ham = word_prob
    total = given_spam + given_ham
    return given_spam / total
def train_and_test_model(path):
    """Train a classifier on the subjects found under `path` and print a report.

    path -- glob pattern passed to get_subject_data()

    Prints, in order: a Counter keyed by (actual is_spam, predicted is_spam)
    where the prediction is "spam probability above 0.5"; the five non-spam
    test subjects with the highest spam probability; the five spam test
    subjects with the lowest; and the five words with the highest and the
    lowest p_spam_given_word.  Returns None.
    """
    data = get_subject_data(path)
    random.seed(0)  # fixed seed so repeated runs give the same answers
    # presumably 0.75 is the fraction of data used for training -- confirm
    # against machine_learning.split_data
    train_data, test_data = split_data(data, 0.75)

    classifier = NaiveBayesClassifier()
    classifier.train(train_data)

    classified = [(subject, is_spam, classifier.classify(subject))
                  for subject, is_spam in test_data]

    # (actual, predicted); the score is deliberately not named
    # spam_probability so it cannot be confused with the module-level function
    counts = Counter((is_spam, score > 0.5)
                     for _, is_spam, score in classified)
    # print(single_value) produces the same output as a print statement on
    # Python 2 and is also valid on Python 3
    print(counts)

    # ascending by spam probability, so the tail is the spammiest
    classified.sort(key=lambda row: row[2])
    # list comprehensions instead of filter(): a Python 3 filter object
    # cannot be sliced
    spammiest_hams = [row for row in classified if not row[1]][-5:]
    hammiest_spams = [row for row in classified if row[1]][:5]
    print("spammiest_hams %s" % (spammiest_hams,))
    print("hammiest_spams %s" % (hammiest_spams,))

    words = sorted(classifier.word_probs, key=p_spam_given_word)
    spammiest_words = words[-5:]
    hammiest_words = words[:5]
    print("spammiest_words %s" % (spammiest_words,))
    print("hammiest_words %s" % (hammiest_words,))
if __name__ == "__main__":
    # Wildcard pattern handed to glob: every entry one directory level below
    # c:\spam.  Adjust to wherever the message files live on this machine.
    corpus_glob = r"c:\spam\*\*"
    train_and_test_model(corpus_glob)