-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathditpontren-scraper.do
144 lines (125 loc) · 4.52 KB
/
ditpontren-scraper.do
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
* Session setup: disable paging of long output, then start from an empty
* dataset and an empty matrix store.
set more off
clear
clear matrix

* Root output folder. Downloaded profile pages go into its listponpes
* subfolder, which is created here if it does not exist yet (errors from
* mkdir, such as the folder already existing, are captured).
global out "C:\Users\Akirawisnu\Dropbox\COVID-Response\pondok-pesantren"
capture mkdir "$out/listponpes"

* Base URL of a profile page; the numeric Ponpes ID is appended to it later.
local link "https://ditpdpontren.kemenag.go.id/pdpp/profil/"
/*
*****************************************************************
_____ __ .__ .__
/ _ \ | | _|__|___________ __ _ _|__| ______ ____ __ __
/ /_\ \| |/ / \_ __ \__ \\ \/ \/ / |/ ___// \| | \
/ | \ <| || | \// __ \\ /| |\___ \| | \ | /
\____|__ /__|_ \__||__| (____ /\/\_/ |__/____ >___| /____/
\/ \/ \/ \/ \/
*****************************************************************
Scraper for Pondok Pesantren (Ponpes) profile pages in Indonesia
*/
* Main job. For every candidate Ponpes ID: download the profile page, read
* the HTML as one text column, keep the lines holding the fields of interest,
* collapse them into a single observation and store it in a tempfile.
* Afterwards all tempfiles are appended, the address is split into street,
* kabupaten and province, and the result is saved.
* Everything runs quietly; progress and failure messages are forced out
* with noisily.
qui{
    forval i=1/27741{
        noi: di in green "Scraping for Ponpes ID: " `i'
        * Both steps are captured; only the return code of the import decides
        * whether this ID is processed further.
        cap copy "`link'`i'" "$out/listponpes/ponpes-`i'.html", replace
        * NOTE(review): the delimiter presumably never occurs in the page, so
        * each HTML line lands whole in v1 - confirm.
        cap import delimited using "$out/listponpes/ponpes-`i'.html", case(lower) delim("|X|", collapse) clear
        if _rc!=0{
            * noisily is required here, otherwise the surrounding quietly
            * block swallows the message.
            noi di "Can't scrape ID: `i', not an available Ponpes"
        }
        else{
            keep v1
            * Flag the HTML lines that carry a field of interest: name, NSPP,
            * kyai icon, location icon, and the lines one and three rows
            * after the Profil Singkat heading.
            gen important=.
            replace important=1 if regexm(v1,`"class="nama-pondok"')
            replace important=1 if regexm(v1,`"class="nspp-pondok""')
            replace important=1 if regexm(v1,`"alt="icon kyai"')
            replace important=1 if regexm(v1,`"alt="icon lokasi"')
            replace important=1 if regexm(v1[_n-1],`"<h1>Profil Singkat</h1>"')
            replace important=1 if regexm(v1[_n-3],`"<h1>Profil Singkat</h1>"')
            keep if important==1
            * A page without any of the markers cannot be parsed; skip it
            * instead of aborting the whole run on an empty dataset.
            if _N==0{
                noi di "Skipping ID: `i', no Ponpes profile markup found"
                continue
            }
            * Fields are picked by row position among the kept lines.
            * Conditions on _n are used instead of in-ranges so that a page
            * with fewer kept lines yields empty fields rather than an
            * out-of-range error.
            gen name_pontren=subinstr(v1,`"<h3 class="nama-pondok color-green mgTop-0">"',"",.) if _n==1
            replace name_pontren=subinstr(name_pontren,`"</h3>"',"",.)
            gen nspp_pontren=subinstr(v1,`"<div class="nspp-pondok">NSPP"',"",.) if _n==2
            replace nspp_pontren=subinstr(nspp_pontren,`"</div>"',"",.)
            gen hm_pontren=subinstr(v1,`" <img src="https://ditpdpontren.kemenag.go.id/pdpp/umum/images/icon-kyai.png" alt="icon kyai">"',"",.) if _n==3
            gen loc_pontren=v1 if _n==5
            gen detail_pontren=v1 if _n==6
            drop v1 important
            * Spread each value to neighbouring rows so that the first row
            * ends up holding every field, then keep only that row.
            ds *
            foreach j in `r(varlist)'{
                forval k=1/3{
                    replace `j'=`j'[_n-`k'] if `j'==""
                    replace `j'=`j'[_n+`k'] if `j'==""
                }
            }
            keep in 1
            gen kode="`i'"
            compress
            ds *
            foreach j in `r(varlist)'{
                replace `j'=trim(`j')
            }
            ** get important varlist
            * Split the location sentence into founding year and address.
            if loc_pontren!=""{
                split loc_pontren, parse("berdiri pada" "beralamat di")
            }
            * Split the detail sentence into male students, female students
            * and teaching staff.
            if detail_pontren!=""{
                split detail_pontren, parse("jumlah santri pria berjumlah" "dan santri perempuan berjumlah" ", dengan tenaga pengajar berjumlah")
            }
            * split only creates as many pieces as it finds, so every piece
            * that is renamed below is checked on its own and created empty
            * when it is absent.
            foreach v in loc_pontren2 loc_pontren3 detail_pontren2 detail_pontren3 detail_pontren4{
                cap confirm variable `v', exact
                if _rc!=0{
                    gen `v'=""
                }
            }
            ren loc_pontren2 built_yr_pontren
            ren loc_pontren3 address_pontren
            ren detail_pontren2 male_stud_pontren
            ren detail_pontren3 female_stud_pontren
            ren detail_pontren4 lecturer_pontren
            keep kode name_pontren nspp_pontren hm_pontren built_yr_pontren address_pontren male_stud_pontren female_stud_pontren lecturer_pontren
            order kode name_pontren nspp_pontren hm_pontren built_yr_pontren address_pontren male_stud_pontren female_stud_pontren lecturer_pontren
            compress
            * Convert the counts to numbers, ignoring the listed characters.
            * force turns any remaining non-numeric text into missing;
            * without it destring creates no new variable and the rename
            * below would abort the run.
            foreach j in male_stud_pontren female_stud_pontren lecturer_pontren{
                destring `j' , gen(`j'x) i(o r a n g .) force
                drop `j'
                ren `j'x `j'
            }
            compress
            * One tempfile per successfully parsed ID, appended after the loop.
            tempfile snsd`i'
            saveold `snsd`i'', replace
            *copy "$out/ponpes-`i'.html" "$out/listponpes/ponpes-`i'.html", replace
            *rm "$out/ponpes-`i'.html"
        }
    }
    noi: di ""
    noi: di "Starting to Append ALL"
    noi: di ""
    cap drop _all
    * IDs that were skipped have no tempfile; their append fails and is
    * captured.
    forval i=1/27741{
        cap append using `snsd`i''
    }
    split address_pontren, parse("Kabupaten " "Propinsi ")
    * Make sure all four pieces referenced below exist, even when no address
    * in the data contains that many parts.
    forval p=1/4{
        cap confirm variable address_pontren`p', exact
        if _rc!=0{
            gen address_pontren`p'=""
        }
    }
    * When a fourth piece exists, shift pieces three and four down by one so
    * that kabupaten and province end up in pieces two and three.
    replace address_pontren2=address_pontren3 if address_pontren4!=""
    replace address_pontren3=address_pontren4 if address_pontren4!=""
    drop address_pontren4
    ren address_pontren1 jalan_pontren
    ren address_pontren2 kab_pontren
    ren address_pontren3 prov_pontren
    replace kab_pontren = subinstr(kab_pontren ,",","",.)
    replace prov_pontren = subinstr(prov_pontren ,".","",.)
    compress
    saveold "$out/dbase-ponpes-may2020.dta", replace
    noi: di "SAVED"
}
exit