-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathJaccard_Coefficient.py
139 lines (104 loc) · 4.63 KB
/
Jaccard_Coefficient.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import Utility
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import nltk.data
import French_To_English
import English_To_French

# Fetch the NLTK resources this module needs at import time:
#  - 'stopwords' is used to filter function words before computing Jaccard;
#  - 'punkt' is required because jaccard_coeff() loads the
#    'tokenizers/punkt/<lang>.pickle' sentence tokenizers — without this
#    download a fresh environment raises LookupError.
nltk.download('stopwords')
nltk.download('punkt')
def _split_sentences(lines, tokenizer):
    """Flatten a list of text lines into a flat list of sentences using *tokenizer*."""
    sentences = []
    for line in lines:
        sentences.extend(tokenizer.tokenize(line))
    return sentences


def _sentence_jaccard(sentence_a, sentence_b, stopwords_set):
    """Return the Jaccard coefficient of two sentences.

    Both sentences are lowercased, word-tokenized, and stripped of
    stopwords before |A ∩ B| / |A ∪ B| is computed.
    """
    set_a = {w for w in word_tokenize(sentence_a.lower()) if w not in stopwords_set}
    set_b = {w for w in word_tokenize(sentence_b.lower()) if w not in stopwords_set}
    union = set_a | set_b
    if not union:
        # Both sentences reduce to nothing after stopword removal; treat
        # them as identical (the original code crashed with
        # ZeroDivisionError in this case).
        return 1.0
    return len(set_a & set_b) / len(union)


def jaccard_coeff(string1):
    """Print per-sentence and average Jaccard coefficients between
    Dataset/output.txt (translation output) and Dataset/actual.txt
    (reference text).

    Parameters
    ----------
    string1 : int
        Language selector: 1 for French, 2 for English.  Any other value
        is a no-op (as in the original code, which had no else branch).

    Side effects: reads the two dataset files and prints results; returns
    None.

    Fixes vs. the original duplicated implementation:
      * the French/English branches shared ~45 copy-pasted lines — now
        table-driven;
      * the average was divided by an inconsistent count
        (len(output_sentence) for French, len(actual_sentence) for
        English) that also counted sentences skipped by IndexError,
        skewing the average downward — now only actually-compared pairs
        are averaged.
    """
    # Per-language configuration.  NOTE(review): the original paired the
    # French branch with English_To_French.sen_tokenizer and the English
    # branch with French_To_English.sen_tokenizer — that mapping is
    # preserved here; confirm it is intentional.
    configs = {
        1: ('tokenizers/punkt/french.pickle', 'french',
            English_To_French.sen_tokenizer),
        2: ('tokenizers/punkt/english.pickle', 'english',
            French_To_English.sen_tokenizer),
    }
    if string1 not in configs:
        return
    pickle_path, stop_lang, line_tokenizer = configs[string1]

    tokenizer = nltk.data.load(pickle_path)
    with open("Dataset/output.txt") as f:
        output = f.readlines()
    with open("Dataset/actual.txt") as g:
        actual = g.readlines()

    output_sentence = _split_sentences(line_tokenizer(output), tokenizer)
    actual_sentence = _split_sentences(line_tokenizer(actual), tokenizer)

    # set() gives O(1) stopword membership tests inside the tokenize loop.
    stopwords_set = set(stopwords.words(stop_lang))

    # Compare sentences pairwise; zip stops at the shorter list, which
    # replaces the original's try/except-IndexError/continue pattern.
    pairs = list(zip(output_sentence, actual_sentence))
    average_jaccard = 0.0
    for out_sent, act_sent in pairs:
        jaccard_coefficient = _sentence_jaccard(out_sent, act_sent, stopwords_set)
        # Average over pairs actually compared (len(pairs) is non-zero
        # whenever this loop body runs).
        average_jaccard += jaccard_coefficient / len(pairs)
        print("\nJaccard Coefficient : ", jaccard_coefficient)
    print("\nAverage Jaccard coefficient is ", average_jaccard, "\n")