-
Notifications
You must be signed in to change notification settings - Fork 0
/
siteScraper.py
81 lines (65 loc) · 2.92 KB
/
siteScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Feb 20 15:45:21 2020
@author: chasebrown
"""
import time
import urllib.request

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
def download_file(name, download_url):
    """Download ``download_url`` and save the response body as ``<name>.pdf``.

    Args:
        name: Output path without extension; ``".pdf"`` is appended to it.
        download_url: URL handed to ``urllib.request.urlopen``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        OSError: If the output file cannot be created or written.
    """
    # Read the whole body before touching the filesystem so a failed download
    # does not leave an empty or truncated ``.pdf`` behind.
    with urllib.request.urlopen(download_url) as response:
        payload = response.read()
    # The context manager closes the file even if the write raises.
    with open(name + ".pdf", "wb") as pdf_file:
        pdf_file.write(payload)
    print("Completed")
WINDOW_SIZE = "1720,1080"

# Options for the window that drives the grade-report form. It keeps Chrome's
# default PDF handling on purpose: the script later reads this window's
# ``current_url`` after submitting the form, so the report has to open as a
# page here rather than be saved to disk.
chrome_options = Options()
chrome_options.add_argument("--window-size=%s" % WINDOW_SIZE)

download_dir = "Path you want this downloaded to"

# Options for the second window, which only receives report URLs and should
# save them into ``download_dir`` instead of rendering them.
options = webdriver.ChromeOptions()
profile = {
    # Legacy switch for disabling Chrome's PDF viewer; kept for old Chrome builds.
    "plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
    # Preferences only take effect through "prefs"; the original passed this
    # entry to add_argument(), where it is not treated as a preference.
    "plugins.always_open_pdf_externally": True,
    "download.default_directory": download_dir,
    "download.extensions_to_open": "applications/pdf",
}
options.add_experimental_option("prefs", profile)

# ``options=`` replaces the deprecated ``chrome_options=`` keyword.
# NOTE(review): ``executable_path`` is itself gone in Selenium >= 4.10, where a
# ``Service`` object is required instead; confirm the Selenium version in use.
driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
browser = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
browser.get("https://web-as.tamu.edu/gradereport/")

# These handles are not used again in the visible script; the lookups still
# fail fast with NoSuchElementException when the form has not rendered.
# ``find_element(By.ID, ...)`` replaces ``find_element_by_id``, which was
# removed in Selenium 4.3.
yearElement = browser.find_element(By.ID, "ctl00_plcMain_lstGradYear")
semesterElement = browser.find_element(By.ID, "ctl00_plcMain_lstGradTerm")
collegeElement = browser.find_element(By.ID, "ctl00_plcMain_lstGradCollege")

# Record each year option's label together with a positional XPath so the
# option can be located again later. XPath positions are 1-based, hence
# ``start=1``.
listOfYears = [
    {
        'text': option.text,
        'xpath': '//*[@id="ctl00_plcMain_lstGradYear"]/option[' + str(position) + ']',
    }
    for position, option in enumerate(
        browser.find_elements(By.XPATH, '//*[@id="ctl00_plcMain_lstGradYear"]/option'),
        start=1,
    )
]
def _collect_options(page, select_id):
    """Return a ``{'text', 'xpath'}`` record for every ``<option>`` of a select.

    Args:
        page: WebDriver instance showing the form.
        select_id: ``id`` attribute of the ``<select>`` element.

    Returns:
        A list of dicts in document order; each ``'xpath'`` is a 1-based
        positional XPath that re-locates the option on a later lookup.
    """
    options_xpath = '//*[@id="' + select_id + '"]/option'
    return [
        {'text': option.text, 'xpath': options_xpath + '[' + str(position) + ']'}
        for position, option in enumerate(
            page.find_elements(By.XPATH, options_xpath), start=1
        )
    ]


# Never populated in the visible script; kept so the module-level name survives.
listOfPDFs = []

# NOTE(review): the loop nesting below is reconstructed from a source whose
# indentation was lost (year -> term -> college); confirm against the original.
try:
    for year in listOfYears:
        browser.find_element(By.XPATH, year['xpath']).click()
        time.sleep(.5)  # give the page time to refresh the term list
        listOfTerms = _collect_options(browser, "ctl00_plcMain_lstGradTerm")
        for term in listOfTerms:
            browser.find_element(By.XPATH, term['xpath']).click()
            time.sleep(.5)  # give the page time to refresh the college list
            listOfColleges = _collect_options(browser, "ctl00_plcMain_lstGradCollege")
            for college in listOfColleges:
                browser.find_element(By.XPATH, college['xpath']).click()
                browser.find_element(By.XPATH, '//*[@id="ctl00_plcMain_btnGrade"]').click()
                time.sleep(.5)
                # Hand the resulting URL to the second window; presumably its
                # download-oriented profile saves the report to disk.
                driver.get(browser.current_url)
                time.sleep(.5)
                # Return to the form for the next combination.
                browser.back()
finally:
    # quit() ends the whole WebDriver session (all windows plus the
    # chromedriver process); close() only closes the current window. Running
    # it in ``finally`` keeps a mid-scrape failure from leaking browsers.
    try:
        browser.quit()
    finally:
        driver.quit()