job-board-scraper.sh
#!/bin/zsh
# Author : Ryan Tipps
# License : MIT
# USER INPUTS: locations and search terms to query against the job board.
declare -a locations=(
  'Remote'
  'Austin, TX'
  # 'San Antonio, TX'
)
declare -a search_terms=(
  'Java'
  'Spring'
  'PHP'
  'Laravel'
  'Rails'
  'Django'
  'Flask'
  'Node'
  'Angular'
  'React'
  'Vue'
)
normalize_string () {
  # Replaces spaces and periods in the input string with a hyphen and strips
  # commas and apostrophes.
  # I created this to avoid using the non-portable sed command.
  replace_with='-'
  A=${1// /$replace_with}
  B=${A//,}
  C=${B//./$replace_with}
  D=${C//\'/}
  echo "$D"
}
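# A quick illustration of the transformation, using the locations above:
#   normalize_string 'Austin, TX'       ->  Austin-TX
#   normalize_string 'San Antonio, TX'  ->  San-Antonio-TX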
scrape_Indeed () {
  # Scrapes the reported number of matching job postings from an individual query on the Indeed.com job board.
  # Build URL based on search parameters:
  # NOTE: Indeed URL-encodes spaces (%20) but NOT commas. This is non-standard. Therefore we have to
  # URL-encode manually rather than using curl's built-in URL encoding.
  location=$1
  radius=50    # leave hard-coded for now
  search_term=$2
  from_age=14  # leave hard-coded for now
  scrape_url="https://www.indeed.com/jobs"
  scrape_url+="?"
  scrape_url+="q=${search_term//' '/%20}"
  scrape_url+='&'
  scrape_url+="l=${location//' '/%20}"
  if [[ ${location:l} != 'remote' ]]
  then
    scrape_url+="&"
    scrape_url+="radius=$radius"
  fi
  scrape_url+="&"
  scrape_url+="fromage=$from_age"
  # echo $scrape_url
  # Typical resulting URL: https://www.indeed.com/jobs?q=Java&l=remote&fromage=14
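  # For a non-remote location the radius parameter is included, e.g. for
  # location='Austin, TX' and search_term='Java':
  #   https://www.indeed.com/jobs?q=Java&l=Austin,%20TX&radius=50&fromage=14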
  # Fetch page from Indeed and scrape the reported result:
  # This scrape is very site-specific to Indeed.com and prone to breaking. Needs improvement!
  curl -s "$scrape_url" | grep -E "[0-9]+ jobs</div>" | cut -d' ' -f 24
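  # One possible hardening (an untested sketch; the HTML pattern here is an
  # assumption and may need adjusting): match the count directly instead of
  # relying on a fixed field position:
  #   curl -s "$scrape_url" | grep -oE '[0-9,]+ jobs' | head -n 1 | cut -d' ' -f 1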
}
run_report () {
  # Aggregates data and prints report to stdout
  # Scrape the data asynchronously
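  # Each location/term pair runs in a backgrounded subshell that prints one
  # "index value" line; the process substitution feeds those lines into the
  # while-read loop below, which fills the associative array.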
  declare -A data
  while read -r AA BB
  do
    data[$AA]=$BB
  done < <(
    for T in "${search_terms[@]}"
    do
      for L in "${locations[@]}"
      do
        (
          index="L-$(normalize_string "$L")-T-$(normalize_string "$T")"
          value="$(scrape_Indeed "$L" "$T")"
          echo "$index" "$value"
        ) &
      done
    done
    wait
  )
  # Print output:
  printf "%s\n" 'Indeed.com job postings in last 14 days by location and keyword mentions:'
  printf '\n'
  (
    # Print column headers
    printf "\n%s" "SEARCH TERM"
    for L in "${locations[@]}"
    do
      printf '_'
      printf '%s' "${L:u}"
    done
    printf '\n'
    # Print data rows
    for T in "${search_terms[@]}"
    do
      printf '%s' "$T"
      for L in "${locations[@]}"
      do
        printf '_'
        index="L-$(normalize_string "$L")-T-$(normalize_string "$T")"
        printf '%s' "${data[$index]}"
      done
      printf '\n'
    done
  ) | column -t -s '_'
}
# Execute !!
run_report
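
# Example invocation (the file carries the executable bit in the repo):
#   ./job-board-scraper.sh
# The report is a column-aligned table: one row per search term, one column
# per location, each cell holding the scraped posting count.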