-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnltkNGram.py
102 lines (75 loc) · 2.86 KB
/
nltkNGram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import argparse
import nltk.lm
import numpy as np
from itertools import tee
# Command-line options: the file with the MRR query lines and the directory
# holding the train/test corpus the n-gram model is built from.
parser = argparse.ArgumentParser(description='nltk 2-Gram calculate MRR score')
parser.add_argument(
    '--TestLines',
    type=str,
    default='./logs/MRRLines_20102019.txt',
    help='Test lines',
)
parser.add_argument(
    '--data',
    type=str,
    default='./data/achemenet_data_20102019',
    help='location of the data to evaluate - create ngram from',
)
args = parser.parse_args()
# Collect data
# Training corpus, flattened into a single list of whitespace-separated tokens.
with open(args.data + '/train.txt', 'r') as file:
    train_file = file.read().split()
# MRR queries are kept line by line: each line is scored as one query below.
with open(args.TestLines) as f:
    test_lines = f.readlines()
# Held-out corpus, tokenised the same way as the training corpus.
with open(args.data + '/test.txt', 'r') as f:
    test_file = f.read().split()
# Init model and parameters
# n-gram order. It drives both the counter (which counts 1..depth-grams) and
# the model, so the two cannot drift apart when it is changed.
depth = 2
counter = nltk.lm.NgramCounter([nltk.ngrams(tuple(train_file), i) for i in range(1, depth + 1)])
languageModel = nltk.lm.models.KneserNeyInterpolated(order=depth, vocabulary=nltk.lm.Vocabulary(train_file), counter=counter)
# Alternative model (unsmoothed maximum likelihood):
# languageModel = nltk.lm.MLE(order=depth, vocabulary=nltk.lm.Vocabulary(train_file), counter=counter)
# Running sum of reciprocal ranks over all evaluated queries.
scores = 0
# Number of leading tokens of a test line that form a query; the token at
# index removeIdx - 1 is the one the model has to predict.
removeIdx = 4
# Rank of the correct token for every evaluated query (-1 when it was not proposed).
ranks = []
# MRR score
# For every query line: take the token(s) preceding the target position as
# context, rank all continuations seen after that context in training by their
# count, and record the rank of the true token (-1 if it was never seen there).
evaluated = 0
skipped = 0
for line in test_lines:
    line = line.split()
    if len(line) < removeIdx:
        # Blank or truncated line: there is no token at the target position,
        # so it cannot be scored (indexing it would raise IndexError).
        skipped += 1
        continue
    evaluated += 1
    target = line[removeIdx - 1]
    context = line[removeIdx - depth: removeIdx - 1]
    # Candidate continuations for this context, most frequent first.
    predictions = sorted(languageModel.context_counts(tuple(context)).items(),
                         reverse=True, key=lambda x: x[1])
    print(f"Query: {line[:removeIdx]}")
    print("Proposed Results: ", end="")
    for location, prediction in enumerate(predictions[:20]):
        print(f"{str(location)}) {prediction}", end=", ")
    print("...")
    rank = -1
    for location, prediction in enumerate(predictions):
        if prediction[0] == target:
            rank = location + 1
            # Each candidate token occurs once, so the first hit is the only hit.
            break
    print(f"Correct response: {target} "
          f"Rank: {rank} "
          f"Reciprocal rank: 1/{rank} = {1 / rank if rank > 0 else 0}\n")
    if rank > 0:
        scores += 1 / rank
    ranks.append(rank)
if skipped:
    print(f"Skipped {skipped} test line(s) with fewer than {removeIdx} tokens")
# Average over the queries that were actually scored (the same population the
# Hit@k figures are computed on); guard against an empty test set.
print(f"Mean reciprocal rank: {scores}/{evaluated} = {scores / evaluated if evaluated else 0.0}")
# Top hit
# Fraction of queries whose correct token was ranked within the first k
# proposals. Queries whose token was never proposed (rank -1) are moved to a
# very large rank so they can never count as a hit.
ranks = np.array(ranks)
ranks[ranks == -1] = 2000000
for hit_label, cutoff in (('Hit@1 = ', 1), ('Hit@5 = ', 5), ('Hit@10 = ', 10)):
    print(hit_label, (ranks <= cutoff).mean())
# Cross entropy and perplexity
def pairwise(iterable):
    """Return an iterator over consecutive overlapping pairs of *iterable*.

    ``s0, s1, s2, ...`` becomes ``(s0, s1), (s1, s2), ...``. An input with
    fewer than two items produces no pairs. Works on any iterable, including
    one-shot iterators, because the input is duplicated with ``tee``.
    """
    leading, trailing = tee(iterable)
    # Advance the second copy by one item so zip pairs each item with its successor.
    next(trailing, None)
    return zip(leading, trailing)
# Report cross-entropy and perplexity of the model on the bigrams of each corpus.
for corpus_label, corpus_tokens in (("Train", train_file), ("Test", test_file)):
    bigrams = list(pairwise(corpus_tokens))
    # Every bigram is scored; no filtering is applied, so the coverage ratio
    # printed last is always 1.0 (kept so the output format stays unchanged).
    scored_bigrams = bigrams
    # Evaluate the model once: perplexity is 2 ** cross-entropy, which is
    # how NLTK itself defines LanguageModel.perplexity.
    cross_entropy = languageModel.entropy(scored_bigrams)
    print(f"{corpus_label} Perplexity: {pow(2.0, cross_entropy)}")
    print(f"{corpus_label} Entropy: {cross_entropy}")
    print(len(bigrams), len(scored_bigrams), len(scored_bigrams) * 1. / len(bigrams))