"""Gigablast Web Search Python Script.
Author: Gilberto Diaz
Application Description:
This application utilizes the Gigablast Search Engine to
search the Internet and download web site's content in
HTML format. The application will save the web page
content to the local hard drive in a text file.
"""
import re

import requests
from bs4 import BeautifulSoup
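# Third-party dependencies: 'requests' and 'beautifulsoup4', both
# installable with pip; 're' is part of the standard library.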


def main():
    # Read the list of query topics, one per line.
    with open('query.csv', 'r') as file_csv:
        words = file_csv.read().splitlines()

    # Base Gigablast search URL
    base_url = 'https://www.gigablast.com/search?c=main&index=search&sc=1&hacr=1'

    # Build the Gigablast query parameters for each topic.
    for topics in words:
        term = topics.replace(',', ' ')
        my_params = {'userid': '', 'code': '', 'format': 'xml',
                     'q': term, 'n': '10', 'dr': '1', 'filetype': 'html',
                     'pss': '10', 'ddu': '1', 'sortby': '0', 'qlang': 'en'}
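        # Parameter meanings here are assumptions based on how the script
        # uses them: 'q' carries the query string, 'format' requests an XML
        # response, and 'n' caps the result count at ten. The remaining
        # fields are passed through unchanged from the original configuration.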
        # Send the GET request to Gigablast.
        reqst = requests.get(base_url, params=my_params)
        resp = reqst.text

        # Parse the XML response; '.text' on a tag already excludes the
        # markup, so no manual tag stripping is needed when extracting URLs.
        xml = BeautifulSoup(resp, 'html.parser')

        # Collect the result URLs, dropping the scheme so it can be
        # re-added uniformly below.
        urls = xml.find_all('url')
        urls_list = []
        for url in urls:
            url = url.text.replace('https://', '').replace('http://', '')
            urls_list.append(url)
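        # Note: 'html.parser' lowercases tag names, so find_all('url')
        # matches the <url> elements of the XML response; a dedicated XML
        # parser (e.g. 'lxml-xml', if lxml is installed) would work as well.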
        # Keep only the URLs that answer with status code 200.
        clean_urls = []
        for link in urls_list:
            print(link)
            try:
                page = requests.get('http://' + link, timeout=5)
                stat_code = str(page.status_code)
                print('Status code = ' + stat_code + '\n')
                if page.status_code == 200:
                    clean_urls.append('http://' + link)
            except requests.exceptions.Timeout:
                print('TIMEOUT ERROR: Web page did not respond within 5 seconds.')
            except requests.exceptions.ConnectionError:
                print('CONNECTION ERROR: Could not connect to URL (max retries exceeded).')
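        # Only the first ten entries of 'clean_urls' are fetched below,
        # mirroring the ten results requested per query.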
        # Fetch the pages in clean_urls and save the top ten as text files.
        count = 1
        for link in clean_urls[:10]:
            try:
                request_page = requests.get(link, timeout=5)
                page = request_page.text
                soup = BeautifulSoup(page, 'html.parser')
                # Remove JavaScript and CSS blocks.
                scripts = soup.find_all(['script', 'style'])
                for match in scripts:
                    match.decompose()
                file_content = soup.get_text()
                # Replace non-ASCII characters with spaces.
                content = re.sub(r'[^\x00-\x7f]', r' ', file_content)
                # Write the page text to '<query>_<count>.txt'.
                with open(my_params['q'] + '_' + str(count) + '.txt', 'w') as webpage_out:
                    webpage_out.write(content)
                print('The file ' + my_params['q'] + '_' + str(count) + '.txt has been created successfully.')
                count += 1
            except requests.exceptions.Timeout:
                print('TIMEOUT ERROR: Web page did not respond within 5 seconds.')


if __name__ == '__main__':
    main()
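# Usage sketch (assuming 'query.csv' sits next to this script):
#   $ python text_mining.py
# Each run writes up to ten '<query>_<n>.txt' files per topic into the
# current working directory.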