-
Notifications
You must be signed in to change notification settings - Fork 0
/
so.py
42 lines (35 loc) · 1.3 KB
/
so.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import requests
from bs4 import BeautifulSoup
# Base search endpoint: Stack Overflow Jobs filtered to Python listings.
# NOTE(review): Stack Overflow Jobs has since been discontinued — verify this URL still resolves.
URL="https://stackoverflow.com/jobs?q=python"
def get_last_pages():
    """Return the number of the last pagination page for the job search.

    Fetches the first results page and reads the pagination bar; the
    second-to-last anchor holds the highest page number (the last anchor
    is the "next" link).

    Returns:
        int: the last page number, or 1 if no pagination bar is present
        (single page of results or a changed page layout).

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    # timeout prevents the scraper from hanging forever on a stalled connection
    result = requests.get(URL, timeout=10)
    result.raise_for_status()
    soup = BeautifulSoup(result.text, "html.parser")
    pagination = soup.find("div", {"class": "s-pagination"})
    if pagination is None:
        # Previously this raised AttributeError; treat a missing bar as one page.
        return 1
    pages = pagination.find_all("a")
    last_page = pages[-2].get_text(strip=True)
    return int(last_page)
def extract_job(html):
    """Extract one job posting from a single job-card ``div`` Tag.

    Args:
        html: a BeautifulSoup Tag for one job card; must carry a
            ``data-jobid`` attribute.

    Returns:
        dict with keys ``title``, ``company``, ``location`` and
        ``apply_link``. Any field whose markup is missing is ``None``.

    Raises:
        KeyError: if the card has no ``data-jobid`` attribute.
    """
    title = None
    title_tag = html.find("h2", {"class": "mb4 fc-black-800 fs-body3"})
    if title_tag is not None:
        title = title_tag.find("a")["title"]
    # Bug fix: company/location were previously unbound when the company row
    # was missing, so the return statement raised NameError.
    company = None
    location = None
    company_row = html.find("h3", {"class": "fc-black-700 fs-body1 mb4"})
    if company_row is not None:
        # The row holds exactly two direct <span> children: company, location.
        company_tag, location_tag = company_row.find_all("span", recursive=False)
        company = company_tag.get_text(strip=True)
        location = location_tag.get_text(strip=True)
    job_id = html["data-jobid"]
    return {"title": title, "company":company, "location":location, "apply_link":f"https://stackoverflow.com/jobs/{job_id}"}
def extract_jobs(last_page):
    """Scrape every results page and return a list of job dicts.

    Args:
        last_page (int): number of result pages to fetch (pages are
            requested as ``pg=1 .. pg=last_page``).

    Returns:
        list[dict]: one dict per job card, as produced by ``extract_job``.
    """
    jobs = []
    for page in range(last_page):
        # Fixed typo in the progress message ("Scrapping" -> "Scraping").
        print(f"Scraping Stack Overflow: page {page}")
        # Renamed from `result` so the response is not shadowed by the
        # per-card loop variable below; timeout avoids hanging forever.
        response = requests.get(f"{URL}&pg={page+1}", timeout=10)
        soup = BeautifulSoup(response.text, "html.parser")
        for card in soup.find_all("div", {"class": "-job"}):
            jobs.append(extract_job(card))
    return jobs
def get_jobs():
    """Discover the page count, then scrape and return all job postings."""
    return extract_jobs(get_last_pages())