workday_scrape.py
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException, WebDriverException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
import csv
import time
from colorama import Fore, Back, Style, init
import argparse
from datetime import datetime, timezone
# Initialize colorama
init(autoreset=True)

def debug_print(message, color=Fore.BLUE):
    print(f"{color}{message}{Style.RESET_ALL}")

def setup_driver():
    debug_print("Setting up the Chrome driver...", Fore.CYAN)
    chrome_options = Options()
    # Remove the headless option
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("--window-size=1920,1080")
    try:
        service = Service(ChromeDriverManager().install())
        driver = webdriver.Chrome(service=service, options=chrome_options)
        debug_print("Chrome driver set up successfully.", Fore.GREEN)
        return driver
    except Exception as e:
        debug_print(f"Error setting up Chrome driver: {str(e)}", Fore.RED)
        raise
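
# Note: headless mode is intentionally commented out above so the Workday UI
# renders in a visible browser window. For unattended runs, re-enabling it is
# one option (a hedged sketch, not part of the original script):
#
#     chrome_options.add_argument("--headless=new")
#
# Some Workday widgets may behave differently without a visible window, so the
# filter and pagination steps would need re-testing in headless mode.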

def wait_for_element(driver, by, value, timeout=30):
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((by, value))
        )
        debug_print(f"Element found: {value}", Fore.GREEN)
        return element
    except TimeoutException:
        debug_print(f"Timeout waiting for element: {value}", Fore.RED)
        return None

def safe_find_element(driver, by, value):
    try:
        return WebDriverWait(driver, 10).until(EC.presence_of_element_located((by, value)))
    except (NoSuchElementException, StaleElementReferenceException):
        return None
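
# safe_find_element is defined but not called elsewhere in this script. It is a
# convenience wrapper that returns None instead of raising; a hypothetical use
# (names and selector borrowed from elsewhere in this file) would look like:
#
#     count_el = safe_find_element(driver, By.CSS_SELECTOR,
#                                  "[data-automation-id='jobFoundText']")
#     if count_el is None:
#         debug_print("Job count element not found", Fore.RED)
#
# Note that a TimeoutException from WebDriverWait is not in the except tuple
# above, so it would still propagate to the caller.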

def filter_us_jobs(driver):
    debug_print("Filtering for US jobs...", Fore.YELLOW)
    try:
        # Click on the Location filter
        location_filter = wait_for_element(driver, By.CSS_SELECTOR, "button[data-automation-id='distanceLocation']")
        if not location_filter:
            debug_print("Location filter not found. Page might not have loaded correctly.", Fore.RED)
            return False
        driver.execute_script("arguments[0].click();", location_filter)
        # Wait for the filter menu to appear
        filter_menu = wait_for_element(driver, By.CSS_SELECTOR, "div[data-automation-id='filterMenu']")
        if not filter_menu:
            debug_print("Filter menu not found. Filter options might not have loaded.", Fore.RED)
            return False
        # Find the Locations section
        locations_section = wait_for_element(filter_menu, By.XPATH, ".//fieldset[.//span[text()='Locations']]")
        if not locations_section:
            debug_print("Locations section not found in the filter menu.", Fore.RED)
            return False
        # Find the United States checkbox
        us_checkbox = wait_for_element(locations_section, By.XPATH, ".//input[@id='2fcb99c455831013ea52fb338f2932d8']")
        if not us_checkbox:
            debug_print("United States checkbox not found.", Fore.RED)
            return False
        # Check if the US checkbox is already selected
        if not us_checkbox.is_selected():
            driver.execute_script("arguments[0].click();", us_checkbox)
        # Find and click the View Jobs button
        view_jobs_button = wait_for_element(driver, By.CSS_SELECTOR, "button[data-automation-id='viewAllJobsButton']")
        if not view_jobs_button:
            debug_print("View Jobs button not found.", Fore.RED)
            return False
        driver.execute_script("arguments[0].click();", view_jobs_button)
        # Wait for the page to update
        time.sleep(3)
        debug_print("US jobs filter applied successfully.", Fore.GREEN)
        return True
    except Exception as e:
        debug_print(f"Error applying US jobs filter: {str(e)}", Fore.RED)
        return False
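
# The United States checkbox above is located via a hard-coded Workday facet id
# ('2fcb99c455831013ea52fb338f2932d8'). That id is specific to this career site
# and could change; if it stops matching, a label-driven lookup is one possible
# fallback (a rough sketch only -- the exact DOM structure of the filter menu
# may differ):
#
#     us_label = wait_for_element(locations_section, By.XPATH,
#                                 ".//label[contains(normalize-space(.), 'United States')]")
#     if us_label:
#         us_checkbox = safe_find_element(
#             locations_section, By.ID, us_label.get_attribute("for"))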

def scrape_workday_jobs(url, max_pages=None, max_retries=3):
    driver = None
    jobs = []  # Initialized up front so the final return is safe even if every attempt fails early
    for attempt in range(max_retries):
        try:
            driver = setup_driver()
            debug_print(f"Navigating to URL: {url}", Fore.YELLOW)
            driver.get(url)
            if not filter_us_jobs(driver):
                debug_print("Failed to apply US jobs filter. Retrying...", Fore.YELLOW)
                continue
            job_count_element = wait_for_element(driver, By.CSS_SELECTOR, "[data-automation-id='jobFoundText']", timeout=30)
            if not job_count_element:
                debug_print("Job count not found. Page might not have loaded correctly. Retrying...", Fore.YELLOW)
                continue
            job_count = int(job_count_element.text.split()[0])
            total_pages = (job_count - 1) // 20 + 1  # Assuming 20 jobs per page
            debug_print(f"Total jobs: {job_count}, Total pages to scrape: {total_pages}", Fore.MAGENTA)
            jobs = []
            processed_job_ids = set()
            scrape_timestamp = datetime.now(timezone.utc).isoformat()
            for page in range(1, total_pages + 1):
                debug_print(f"Scraping page {page} of {total_pages}...", Fore.MAGENTA)
                # Wait for job listings to load
                WebDriverWait(driver, 3).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "li.css-1q2dra3"))
                )
                # Use JavaScript to get all job elements
                job_elements = driver.execute_script("""
                    return Array.from(document.querySelectorAll('li.css-1q2dra3')).map(el => ({
                        title: el.querySelector('a[data-automation-id="jobTitle"]')?.textContent,
                        url: el.querySelector('a[data-automation-id="jobTitle"]')?.href,
                        location: el.querySelector('dd.css-129m7dg')?.textContent,
                        job_id: el.querySelector('li.css-h2nt8k')?.textContent,
                        scrape_timestamp: '""" + scrape_timestamp + """'
                    }));
                """)
                debug_print(f"Found {len(job_elements)} job listings on page {page}.", Fore.GREEN)
                new_jobs_on_page = 0
                for job in job_elements:
                    if job['job_id'] not in processed_job_ids:
                        jobs.append(job)
                        processed_job_ids.add(job['job_id'])
                        new_jobs_on_page += 1
                debug_print(f"Added {new_jobs_on_page} new jobs from page {page}", Fore.CYAN)
                if new_jobs_on_page == 0 and page > 1:
                    debug_print("No new jobs on this page. Ending scrape.", Fore.YELLOW)
                    break
                if max_pages and page >= max_pages:
                    debug_print(f"Reached specified maximum of {max_pages} pages. Ending scrape.", Fore.YELLOW)
                    break
                if page < total_pages:
                    try:
                        debug_print(f"Attempting to navigate to page {page + 1}...", Fore.YELLOW)
                        # Scroll to the bottom of the page
                        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                        time.sleep(2)  # Wait for any lazy-loaded elements
                        debug_print("Scrolled to bottom of page", Fore.CYAN)
                        # Retry mechanism for finding and clicking the next button
                        # (renamed so it no longer shadows the max_retries argument)
                        max_click_retries = 3
                        for retry in range(max_click_retries):
                            try:
                                debug_print(f"Attempt {retry + 1} to find and click 'next' button", Fore.YELLOW)
                                # Wait for the next button with a longer timeout
                                next_button = WebDriverWait(driver, 20).until(
                                    EC.element_to_be_clickable((By.CSS_SELECTOR, "button[aria-label='next']"))
                                )
                                debug_print("'Next' button found", Fore.GREEN)
                                driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                                time.sleep(1)  # Wait for the button to be fully in view
                                # Try to click using JavaScript
                                driver.execute_script("arguments[0].click();", next_button)
                                debug_print("Clicked 'next' button", Fore.GREEN)
                                time.sleep(3)  # Wait for the page to load
                                # Check if new jobs are loaded
                                new_job_elements = driver.execute_script("""
                                    return Array.from(document.querySelectorAll('li.css-1q2dra3')).map(el => ({
                                        title: el.querySelector('a[data-automation-id="jobTitle"]')?.textContent,
                                        url: el.querySelector('a[data-automation-id="jobTitle"]')?.href,
                                        location: el.querySelector('dd.css-129m7dg')?.textContent,
                                        job_id: el.querySelector('li.css-h2nt8k')?.textContent
                                    }));
                                """)
                                if len(new_job_elements) > 0:
                                    debug_print(f"Successfully navigated to page {page + 1}", Fore.GREEN)
                                    break
                                else:
                                    debug_print("No new jobs loaded, retrying...", Fore.YELLOW)
                            except Exception as e:
                                debug_print(f"Error during pagination attempt {retry + 1}: {str(e)}", Fore.RED)
                                if retry == max_click_retries - 1:
                                    raise  # Re-raise the exception if all retries failed
                                time.sleep(2)  # Wait before retrying
                    except TimeoutException:
                        debug_print("Timeout: Couldn't find next button or it's not clickable. Ending scrape.", Fore.YELLOW)
                        break
                    except Exception as e:
                        debug_print(f"Unhandled error during pagination: {str(e)}", Fore.RED)
                        break
            return jobs
        except WebDriverException as e:
            debug_print(f"WebDriver error: {str(e)}", Fore.RED)
        except Exception as e:
            debug_print(f"An unexpected error occurred: {str(e)}", Fore.RED)
        finally:
            if driver:
                debug_print("Closing the browser...", Fore.CYAN)
                driver.quit()
            if attempt < max_retries - 1:
                debug_print(f"Retrying... Attempt {attempt + 2} of {max_retries}", Fore.YELLOW)
            else:
                debug_print("Max retries reached. Scraping failed.", Fore.RED)
    return jobs
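
# The class-based selectors used above (li.css-1q2dra3, dd.css-129m7dg,
# li.css-h2nt8k) look like auto-generated Workday style hashes, so they are
# likely to break when the career site is restyled. The data-automation-id
# attributes (jobTitle, jobFoundText, etc.) tend to be more stable hooks to
# fall back on if these selectors stop returning results.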

def save_to_csv(jobs, filename='nvidia_us_jobs.csv'):
    debug_print(f"Saving {len(jobs)} jobs to {filename}...", Fore.YELLOW)
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=['title', 'location', 'job_id', 'url', 'scrape_timestamp'])
        writer.writeheader()
        for job in jobs:
            writer.writerow(job)
    debug_print(f"Jobs saved successfully to {filename}", Fore.GREEN)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape job listings from NVIDIA's Workday site.")
    parser.add_argument("-p", "--pages", type=int, help="Maximum number of pages to scrape")
    args = parser.parse_args()
    url = "https://nvidia.wd5.myworkdayjobs.com/NVIDIAExternalCareerSite"
    debug_print("Starting the job scraping process...", Fore.CYAN)
    jobs = scrape_workday_jobs(url, max_pages=args.pages)
    if jobs:
        debug_print(f"Found {len(jobs)} job listings.", Fore.GREEN)
        save_to_csv(jobs)
    else:
        debug_print("No job listings found.", Fore.RED)
    debug_print("Script execution completed.", Fore.CYAN)