wordcount.py
import datetime
import os
import re
import sys
import time
from collections import defaultdict
from multiprocessing import Pool

from bs4 import BeautifulSoup

# Logging is disabled since it is not available
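
# Usage (inferred from the __main__ block below; not documented in the source):
#   python wordcount.py <entries_dir>
# where <entries_dir> is the root of a tree of SEP article directories, each
# holding an index.html file.
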
def extract_article_body(filename):
    """
    Extracts the article body from the SEP article at the given filename. Some
    error handling is done to guarantee that this function always returns a
    string, falling back to the empty string on failure.
    """
    # Progress output
    print(filename)
    with open(filename, encoding='utf-8') as f:
        # Some files are not properly encoded in UTF-8; skip them
        try:
            doc = f.read()
        except UnicodeDecodeError:
            return ''

    # convertEntities is not available in BeautifulSoup 4, so it has been removed
    soup = BeautifulSoup(doc, 'html.parser')

    # Rip out the bibliography: take the last "Bibliography" heading and remove
    # it together with everything that follows it. (BeautifulSoup 4 returns the
    # <h2> tag itself here, so no findParent call is needed.)
    headings = soup.find_all('h2', string='Bibliography')
    if headings:
        biblio_root = headings[-1]
        biblio = [biblio_root]
        biblio.extend(biblio_root.find_next_siblings())
        for elm in biblio:
            elm.extract()
    # else:
    #     logging.error('Could not extract bibliography from %s' % filename)

    # Grab the modified body
    body = soup.find("div", id="aueditable")
    if body is not None:
        # Remove any HTML-escaped entities left in the text
        return re.sub(r"&\w+;", "", body.text)
    else:
        # logging.error('Could not extract text from %s' % filename)
        return ''
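
# Hypothetical usage sketch (the path below is illustrative, not from the source):
#   text = extract_article_body('entries/abduction/index.html')
# On success, text holds the article body with the bibliography removed; on a
# decoding or parsing failure it is the empty string.
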
def wordcount(string):
    """
    Takes a string as input and returns a defaultdict containing each word
    in the string associated with the number of times it occurs in the string.
    """
    words = string.split()
    count = defaultdict(int)
    for word in words:
        count[word] += 1
    return count
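
# A minimal sketch of the expected behavior (the input string is made up):
#   wordcount("the cat and the hat")
# returns a defaultdict equivalent to {'the': 2, 'cat': 1, 'and': 1, 'hat': 1}.
# Splitting is purely whitespace-based, so case and punctuation are preserved.
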
def reduce(dictlist):
    """
    Takes a list of defaultdicts as input and returns a single defaultdict
    containing each word in the entirety of the corpus associated with the
    number of times it occurs in the corpus.
    """
    count = defaultdict(int)
    for d in dictlist:
        for key, value in d.items():
            count[key] += value
    return count
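
# A small illustration with hypothetical per-article counts:
#   reduce([{'a': 2, 'b': 1}, {'a': 1, 'c': 4}])
# returns a defaultdict equivalent to {'a': 3, 'b': 1, 'c': 4}.
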
# Notes (possible replacements using the inpho library):
# 1. inpho.corpus.sep.extract_article_body can get just the text
# 2. actually getting the list of all articles: inpho.corpus.sep.get_titles()

if __name__ == '__main__':
    entriesDir = sys.argv[-1]

    # Start timing (time.clock() was removed in Python 3.8)
    startTime = time.perf_counter()

    # Collect the text of every article under the entries directory
    articles = []
    for path, dirs, files in os.walk(entriesDir):
        for f in files:
            if f == "index.html":
                filePath = os.path.join(path, f)
                data = extract_article_body(filePath)
                articles.append(data)

    # Limit the Pool to 4 processes to prevent excess memory usage
    with Pool(processes=4) as pool:
        results = pool.map(wordcount, articles)
    finalDict = reduce(results)

    # Finish timing
    endTime = time.perf_counter()
    timestamp = str(datetime.datetime.now()) + "\n"
    elapsedTime = str(endTime - startTime) + " seconds\n"

    with open('wc_output.txt', 'a+') as f:
        f.write(timestamp + elapsedTime)
        f.write("---------------------------\n\n")
        for key, value in sorted(finalDict.items(), key=lambda x: x[1], reverse=True):
            line = key + " , " + str(value) + "\n"
            f.write(line)
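
# Output sketch: each run appends to wc_output.txt a timestamp line, an
# elapsed-time line, a dashed separator, and then one "word , count" line per
# word, sorted by descending count.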