-
Notifications
You must be signed in to change notification settings - Fork 1
/
stemmer.py
69 lines (61 loc) · 2.58 KB
/
stemmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import re
# Maps each word read from the data file to the reference stem listed for it.
allUrduAffixes = {}
# Maps each word whose predicted stem differed from the reference stem to a
# {"realStem": ..., "predictedStem": ...} record.
wrongGuessedStem = {}
# Number of word entries read from the data file.
totalWords = 0
# Number of words whose predicted stem equalled the reference stem.
totalCorrectGuessed = 0
# Affixes are tried in list order and the first match wins, so order matters.
# Every entry appears once: a repeated entry can never match where its first
# occurrence did not.
urduPrefixes = ['بے', 'بد', 'لا', 'ے', 'نا', 'با', 'کم', 'ان', 'اہل']
urduSuffixes = ['دار', 'وں', 'یاں', 'یں', 'ات', 'گوار', 'ور', 'پسند']
def remove(string):
    """Return ``string`` with every ASCII space character (" ") removed.

    Only the plain space character is dropped; tabs, newlines and other
    whitespace are left untouched.
    """
    # Splitting on the single-space separator and re-joining the pieces with
    # nothing in between drops each space.
    pieces = string.split(" ")
    return "".join(pieces)
# Load every word and its reference stem from the data file. Each non-blank
# line is split on a double tab: the first field is the word, the second is
# its stem.
with open("urdu-affixes.txt", "r", encoding="utf-8") as urduFile:
    # The `with` block closes the file even if a line fails to parse.
    for lineNumber, rawLine in enumerate(urduFile, start=1):
        pieces = rawLine.splitlines()
        # Skip blank lines (for example a trailing empty line at the end of
        # the file) instead of failing on the missing stem field.
        if not pieces or not pieces[0].strip():
            continue
        fields = pieces[0].split('\t\t')
        if len(fields) < 2:
            raise ValueError(
                f"urdu-affixes.txt line {lineNumber}: expected "
                f"'<word>\\t\\t<stem>', got {pieces[0]!r}"
            )
        totalWords = totalWords + 1
        # Adding the real word and its real stem to the allUrduAffixes dictionary
        allUrduAffixes[fields[0]] = fields[1]
def _predictStem(word):
    """Guess the stem of ``word`` by stripping a single known affix.

    Prefixes from ``urduPrefixes`` are tried first, in list order; suffixes
    from ``urduSuffixes`` are tried only when no prefix matches. The first
    matching affix wins and only that one affix is removed.

    Returns the remaining text, or ``None`` when no affix matches.
    """
    # Plain string matching: the affixes are literal text, so no regular
    # expression (and no escaping of special characters) is needed.
    for prefix in urduPrefixes:
        if word.startswith(prefix):
            return word[len(prefix):]
    for suffix in urduSuffixes:
        if word.endswith(suffix):
            # Explicit end index so an empty suffix would leave the word whole.
            return word[:len(word) - len(suffix)]
    return None


# Score the affix-stripping guess for every word in allUrduAffixes against
# the reference stem stored for it.
for urduWord, listedStem in allUrduAffixes.items():
    predictedStem = _predictStem(urduWord)
    if predictedStem is None:
        # NOTE(review): a word that matches no affix is neither counted as
        # correct nor recorded in wrongGuessedStem, yet it is still part of
        # totalWords. Confirm whether such words should be scored against
        # the unchanged word instead.
        continue
    # Spaces are ignored on both sides of the comparison.
    realStem = remove(listedStem)
    predictedStem = remove(predictedStem)
    if predictedStem == realStem:
        totalCorrectGuessed = totalCorrectGuessed + 1
    else:
        wrongGuessedStem[urduWord] = {
            "realStem": realStem,
            "predictedStem": predictedStem,
        }
# Print the evaluation summary: totals, the mispredicted words, and the
# percentage of words whose stem was predicted correctly.
print("Total num of words: ", totalWords)
print("Total num of words correctly predicted: ", totalCorrectGuessed)
print("Wrong Words List: ", wrongGuessedStem)
# With no words loaded the ratio is undefined; report 0.0 rather than
# failing with ZeroDivisionError.
fitnessPercentage = totalCorrectGuessed / totalWords * 100 if totalWords else 0.0
print("Fitness Percentage: ", fitnessPercentage)