# acquire0.py
import os
import random
import string
from datetime import datetime
from time import sleep

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from requests import get

def make_soup(url):
    '''
    This helper function takes in a url and uses the requests module to
    parse the HTML from the page, returning a soup object. We can then use
    the soup object to call various methods to get the parts of the page
    that we need, like job titles and links to job postings. UPDATE:
    implemented a random string generator to get past the CAPTCHA when scraping.
    '''
    # Use a random alphanumeric User-Agent so repeated requests look distinct.
    rand_string = ''.join(random.SystemRandom().choice(string.ascii_letters + string.digits) for _ in range(10))
    print(f'User: {rand_string}')
    headers = {'User-Agent': rand_string}
    response = get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup
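
# Example usage (a minimal sketch; the query string below is illustrative,
# not one of the urls generated elsewhere in this script):
# soup = make_soup('https://www.indeed.com/jobs?q=data+scientist&l=United+States')
# print(soup.title.text)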

def get_search_urls():
    '''
    This function builds the Indeed search results urls for each of 4
    job titles, covering the first 15 pages of results for each, and
    returns a list of all the urls.
    '''
    # create empty list to hold urls
    urls = []
    # create list of job titles to search for
    jobs = ['data scientist', 'data analyst',
            'data engineer', 'machine learning engineer']
    # loop through each job
    for job in jobs:
        count = 0
        # loop through the first 15 pages of results
        for _ in range(15):
            # build the Indeed search url for this job title and page offset;
            # spaces in the job title are replaced with '+' for the query string
            url = f'https://www.indeed.com/jobs?q={job.replace(" ", "+")}&l=United+States&start={count}'
            # append the url to the urls list
            urls.append(url)
            # add 10 to the start to get the next set of entries (next page)
            count += 10
    return urls
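
# Example usage (a minimal sketch): build the search urls and inspect the first one.
# urls = get_search_urls()
# print(len(urls))   # 4 job titles * 15 pages = 60 urls
# print(urls[0])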

def get_all_cards(urls):
    '''
    This function scrapes the url from each job card within each page of
    the search result urls and returns a complete list of urls for each job.
    UPDATE: added a random delay between requests to get past the CAPTCHA.
    '''
    # create empty list
    job_urls = []
    n = 0
    # loop through each url in urls list
    for url in urls:
        # generate a random number of seconds between 1 and 9
        # to simulate "human"-like activity
        rand_int = random.randint(1, 9)
        print(f'Interval {rand_int}')
        # Make request and soup object using helper function
        soup = make_soup(url)
        # delay rand_int seconds between fetches
        sleep(rand_int)
        n = n + 1
        print(f"Scraping loop number {n}")
        # Create a list of the divider elements that hold the job cards.
        card_list = soup.find_all('div', class_='jobsearch-SerpJobCard')
        # Use a set comprehension to keep only unique urls.
        card_set = {'https://www.indeed.com' + card.h2.a.get('href') for card in card_list}
        # Convert the set back to a list of urls.
        card_set = list(card_set)
        # extend job_urls with the unique urls from this page
        job_urls.extend(card_set)
    return job_urls
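
# Example usage (a minimal sketch, chaining the two functions above):
# search_urls = get_search_urls()
# job_urls = get_all_cards(search_urls)
# print(f'Collected {len(job_urls)} job posting urls')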

def get_job_content(urls, i, cached=False):
    '''
    This function takes in a list of job urls and a cached parameter
    (default False). When cached is False, it scrapes the job_title,
    company, location, remote, salary, post_date, access_date, and
    job_description for each url, creates a list of dictionaries with
    those features for each job, converts the list to a df, and returns
    the df. If cached is True, the function instead returns a df read
    from a json file.
    Try/except statements are in place in case a field is missing from
    the page; we fall back to an empty string (or 0 for remote).
    '''
    if cached:
        df = pd.read_json(f'indeed-data-jobs{i}.json')
    # cached == False completes a fresh scrape for df
    else:
        # Create an empty list to hold dictionaries
        records = []
        n = 0
        # Loop through each url in our list of urls
        for url in urls:
            # generate a random number of seconds between 1 and 5
            # to simulate "human"-like activity
            rand_int = random.randint(1, 5)
            # Make request and soup object using helper
            soup = make_soup(url)
            sleep(rand_int)
            n = n + 1
            print(f"Loop number: {n}")
            print(f'Interval: {rand_int} seconds \n')
            # access the job title
            try:
                job_title = soup.find('h1', 'icl-u-xs-mb--xs icl-u-xs-mt--none jobsearch-JobInfoHeader-title').text.strip()
            except AttributeError:
                job_title = ''
            # access the company
            try:
                company = soup.find('div', 'icl-u-lg-mr--sm icl-u-xs-mr--xs').text.strip()
            except AttributeError:
                company = ''
            # access the location
            try:
                location = soup.find('div', 'icl-u-xs-mt--xs icl-u-textColor--secondary jobsearch-JobInfoHeader-subtitle jobsearch-DesktopStickyContainer-subtitle').contents[1].text
            except AttributeError:
                location = ''
            # is the position remote? default to 0 so remote is always
            # defined, even when the subtitle element is missing
            remote = 0
            try:
                if soup.find('div', 'icl-u-xs-mt--xs icl-u-textColor--secondary jobsearch-JobInfoHeader-subtitle jobsearch-DesktopStickyContainer-subtitle').contents[2].text is not None:
                    remote = 1
            except (IndexError, AttributeError):
                remote = 0
            # access salary
            try:
                salary = soup.find('span', 'icl-u-xs-mr--xs').text
            except AttributeError:
                salary = ''
            # access the post date from the metadata footer; default to an
            # empty string so post_date is defined even if neither footer
            # entry starts with a digit
            post_date = ''
            try:
                if (soup.find('div', 'jobsearch-JobMetadataFooter').contents[1].text)[0].isdigit():
                    post_date = soup.find('div', 'jobsearch-JobMetadataFooter').contents[1].text
                elif (soup.find('div', 'jobsearch-JobMetadataFooter').contents[0].text)[0].isdigit():
                    post_date = soup.find('div', 'jobsearch-JobMetadataFooter').contents[0].text
            except AttributeError:
                post_date = ''
            # today's date
            today = datetime.today().strftime('%Y-%m-%d')
            # access full job description text
            try:
                job_description = soup.find('div', {'id': 'jobDescriptionText', 'class': 'jobsearch-jobDescriptionText'}).text.strip().replace('\n', ' ')
            except AttributeError:
                job_description = ''
                print('^ NO JOB DESC, Check Record. \n')
            # Create a dictionary holding the variables for each job
            job = {'job_title': job_title, 'company': company, 'location': location,
                   'is_remote': remote, 'salary': salary, 'post_date': post_date,
                   'date_accessed': today, 'job_description': job_description}
            # Add each dictionary to the records list of dictionaries
            records.append(job)
        # convert our list of dictionaries to a df
        df = pd.DataFrame(records)
        # Write df to a json file for faster access
        # "i" is a number to indicate a file with a different set of records
        df.to_json(f'indeed-data-jobs{i}.json')
    return df
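
# A minimal end-to-end sketch of how the functions above fit together.
# The batch size of 100 urls per file is an illustrative choice, not
# something fixed by the functions themselves.
if __name__ == '__main__':
    search_urls = get_search_urls()
    job_urls = get_all_cards(search_urls)
    # scrape the job pages in batches, writing each batch to its own json file
    batch_size = 100
    for i, start in enumerate(range(0, len(job_urls), batch_size)):
        batch = job_urls[start:start + batch_size]
        df = get_job_content(batch, i)
        print(f'Saved {len(df)} records to indeed-data-jobs{i}.json')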