scrape_company_links.py
import requests
import time
from bs4 import BeautifulSoup
# About 2 requests per second
delay = 0.5
list_url = 'http://mim.maltaenterprise.com/default.asp?page=list&show=0&disp={}'
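# The disp parameter is a page offset: it starts at 11 and advances by 10 per page.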
start_number = 11
current_number = start_number
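# Fetch one listing page and return all <a> tags from the third 'content' cell's inner table.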
def get_a_tags(number):
    response = requests.get(url=list_url.format(number))
    soup = BeautifulSoup(response.text, 'lxml')
    content_td = soup.find_all('td', {'class': 'content'})[2]
    content_table = content_td.find('table', {'width': '100%'})
    return content_table.find_all('a')
a_tags = get_a_tags(current_number)
# if there is nothing in the tbody, then there are no companies listed
while len(a_tags) > 0:
    print('Scraping Page: {}'.format(current_number // 10))
    # get all hrefs from the a tags
    hrefs = [a_tag['href'] for a_tag in a_tags]
    # write the links to a file
    with open('links.txt', 'a', encoding='utf-8') as links_file:
        for href in hrefs:
            links_file.write('{}{}\n'.format('http://mim.maltaenterprise.com/', href))
    # add on 10 to the current number and get the next page
    current_number += 10
    a_tags = get_a_tags(current_number)
    time.sleep(delay)
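If the first empty page no longer renders the listing table at all, get_a_tags above would raise an IndexError or AttributeError before the loop can see an empty result. A minimal, more defensive variant is sketched below; the name get_a_tags_safe and the early returns are illustrative additions, not part of the original script.

def get_a_tags_safe(number):
    # Same page fetch as get_a_tags, but return an empty list when the
    # expected layout is missing so the pagination loop stops cleanly.
    response = requests.get(url=list_url.format(number))
    soup = BeautifulSoup(response.text, 'lxml')
    content_tds = soup.find_all('td', {'class': 'content'})
    if len(content_tds) < 3:
        return []
    content_table = content_tds[2].find('table', {'width': '100%'})
    if content_table is None:
        return []
    return content_table.find_all('a')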