Word_count_from_web.py
import urllib.request, urllib.parse, urllib.error
print("Opening the file connection...")
# The following example reads the Project Gutenberg EBook of Pride and Prejudice
fhand = urllib.request.urlopen('http://www.gutenberg.org/files/1342/1342-0.txt')
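# Optional robustness sketch (an assumption, not part of the original script): the
# urlopen call above could be wrapped in a try/except so that a bad URL or a network
# failure exits cleanly instead of raising a traceback, e.g.
#     try:
#         fhand = urllib.request.urlopen(url)
#     except urllib.error.URLError as e:
#         sys.exit(f"Could not open the URL: {e}")
# (this would require importing sys and assigning the URL to a variable such as url).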
txt_dump = ''
line_count=0
word_count=0
# Iterate over the lines in the file handle, append each line to the text string,
# and increment the line and word counts
for line in fhand:
    # Use the decode method to convert the UTF-8 bytes to a Unicode string
    txt_dump += line.decode()
    line_count += 1
    # Count the words in the line and add them to the running count
    word_count += len(line.decode().split(' '))
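# Note (an aside, not in the original logic): splitting on a single space also counts
# empty strings produced by consecutive spaces and keeps the trailing newline attached
# to the last word. A sketch of a stricter count would use split() with no argument,
# which splits on any run of whitespace and drops empty strings:
#     word_count += len(line.decode().split())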
# Determine the first newline character
firstblank = txt_dump.find('\n')
# Print the first line of the text data (just before the newline char position)
print('\n'+txt_dump[:firstblank])
# Print basic information about the text data
print("\nPrinting some info on the text dump\n"+"-"*60)
print("Total characters:",len(txt_dump))
print("Total words:",word_count)
print(f"Total lines: {line_count}")
# Function for counting top N words
def print_top_N_words(text, top_N=10):
    """
    Accepts text data (as a string) and prints the top N most frequent words.
    N is supplied by the caller; if it is not, the default of 10 is used.
    """
    # Initialize a dictionary of word counts
    counts = dict()
    words = text.split()
    for word in words:
        counts[word.lower()] = counts.get(word.lower(), 0) + 1
    # Read the stop words (one per line) from a local file
    common_words = set(line.strip() for line in open('stopwords.txt'))
    tmplst = []
    for key, val in counts.items():
        if key not in common_words:
            # Store the value first and then the key (later used for sorting)
            newtup = (val, key)
            tmplst.append(newtup)
    # Sort the list by 'value', i.e. word count, in descending order
    tmplst = sorted(tmplst, reverse=True)
    print(f"Top {top_N} words in this file are shown below\n" + "-" * 55)
    for val, key in tmplst[:top_N]:
        print(f"{key}:{val}")
# List of common (stop) words. Note that print_top_N_words above reads its stop
# words from 'stopwords.txt', so this inline list is not used by the function.
common_words = ['i', 'in', 'the', 'a', 'an', 'to', 'is', 'are', 'on', 'yes', 'no',
                'not', 'he', 'she', 'you', 'I', 'my', 'me', 'at', 'of', 'so', 'but',
                'it', 'as', 'if', 'for', 'oh', 'him', 'her', 'them', 'and', 'all',
                'this', 'that', 'those', 'these', 'his', 'was', 'had', 'get',
                'have', 'has', 'with', 'up', 'down', 'out', 'when', 'where', 'how',
                'then', 'now', 'we', 'they', 'got', 'there', 'by', 'or', 'would',
                'will', 'shall', 'what', 'be', 'do', 'done', 'did']
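# A possible way to tie the two together (a sketch, assuming 'stopwords.txt' may be
# missing): inside the function, the file read could fall back to the inline list
# above, e.g.
#     try:
#         common_words = set(line.strip() for line in open('stopwords.txt'))
#     except FileNotFoundError:
#         common_words = set(common_word_list)  # hypothetical name for the list above
# The original script does not do this; it assumes 'stopwords.txt' is present.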
# Now print the N most frequent words. Accepts input from the user on the terminal.
print("\n")
top_N = int(input("How many most frequently used words do you want to display: "))
print_top_N_words(txt_dump, top_N)
print("\nNow closing the file connection...")
fhand.close()