############################################################################################
#### AWS re:Invent 2017 - Session Information Downloader
# Provides a quick-and-dirty way to export AWS re:Invent session content from the event website.
# Requirements:
# 1. Update your event website credentials in the USERNAME and PASSWORD vars.
# 2. Download the Chrome web driver (https://sites.google.com/a/chromium.org/chromedriver/downloads).
# 3. Change the CHROME_DRIVER var to point to the driver location.
#
# @author Matt Adorjan
# @email matt.adorjan@gmail.com
############################################################################################
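# Usage: python reinvent.py
# Writes a pipe-delimited sessions.txt file to the current working directory.
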
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
import requests
from time import sleep
from bs4 import BeautifulSoup
import re

# Venue codes: Venetian, Encore, Aria, MGM, Mirage, Bellagio, Vdara
VENUE_CODES = [
    33659,
    33660,
    728,
    33661,
    33662,
    33663,
]
# Set username and password for reinvent event website
USERNAME = 'USERNAME'
PASSWORD = 'PASSWORD'
# Chrome web driver path
CHROME_DRIVER = './chromedriver'
# Set to False to ignore SSL certificate validation in Requests package
REQ_VERIFY = True
# Initialize headless Chrome
chrome_options = Options()
chrome_options.add_argument("--headless")

content_to_parse = ''

driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROME_DRIVER)
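# Note: chrome_options= and executable_path= are the Selenium 3 style of
# arguments; Selenium 4 replaced them with options= and a Service object
# (service=Service(CHROME_DRIVER)).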


def login(chrome_driver, username, password):
    '''
    Handle user login to the re:Invent session catalog.
    Uses headless Chrome, passing in the username and password.
    '''
    chrome_driver.get("https://www.portal.reinvent.awsevents.com/connect/login.ww")
    # Accept the cookie banner before interacting with the login form
    cookie_button = chrome_driver.find_element_by_id("cookieAgreementAcceptButton")
    cookie_button.click()
    # Fill in the credentials and submit the login form
    username_field = chrome_driver.find_element_by_id("loginUsername")
    username_field.send_keys(username)
    password_field = chrome_driver.find_element_by_id("loginPassword")
    password_field.send_keys(password)
    login_button = chrome_driver.find_element_by_id("loginButton")
    login_button.click()


def get_session_time(session_id):
    '''
    Calls the API on the re:Invent event website which returns session times.
    Returns a dict with time and room information for a specific session.
    '''
    url = 'https://www.portal.reinvent.awsevents.com/connect/dwr/call/plaincall/ConnectAjax.getSchedulingJSON.dwr'
    data = {
        "callCount": 1,
        "windowName": "",
        "c0-scriptName": "ConnectAjax",
        "c0-methodName": "getSchedulingJSON",
        "c0-id": 0,
        "c0-param0": "number:" + session_id,
        "c0-param1": "false",
        "batchId": 5,
        "instanceId": 0,
        "page": "%2Fconnect%2Fsearch.ww",
        "scriptSessionId": "1234567"
    }
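    # The payload above mirrors the DWR (Direct Web Remoting) remote call that
    # the catalog page itself issues; scriptSessionId appears to be an
    # arbitrary placeholder value.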
    headers = {'Content-Type': 'text/plain'}
    r = requests.post(url, headers=headers, data=data, verify=REQ_VERIFY)
    returned = r.content.decode('utf8')
    returned = returned.replace("\\", '')
    # The response is a DWR JavaScript payload rather than plain JSON, so
    # strip out the relevant fields with regexes.
    m = re.search(r"startTime\":(\".*?\")", returned, re.DOTALL | re.MULTILINE)
    if m:
        start_time = m.group(1)
    else:
        start_time = ""
        print(returned)  # dump the raw response to help debug the missing field
    m = re.search(r"endTime\":(\".*?\")", returned, re.DOTALL | re.MULTILINE)
    if m:
        end_time = m.group(1)
    else:
        end_time = ""
        print(returned)
    m = re.search(r"room\":(\".*?\")", returned, re.DOTALL | re.MULTILINE)
    if m:
        room = m.group(1)
    else:
        room = ""
        print(returned)
    start_time = start_time.replace('"', '')
    time_information = {
        "start_time": start_time,
        "end_time": end_time.replace('"', ''),
        "room": room.replace('"', ''),
        # The day is the text before the first comma in the start time
        "day": start_time[:start_time.find(',')]
    }
    return time_information
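
# get_session_time returns a dict shaped like this (illustrative, hypothetical values):
#   {"start_time": "Monday, Nov 27, 10:45 AM", "end_time": "Monday, Nov 27, 11:45 AM",
#    "room": "Venetian, Level 2, Titian 2205", "day": "Monday"}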

# Log in to the re:Invent website
login(driver, USERNAME, PASSWORD)

# Getting content by venue, instead of the entire set, because sometimes the
# Get More Results link stops working on the full list. Haven't had issues
# looking at the lists venue by venue.
for venue in VENUE_CODES:
    # Old per-day search URL, kept for reference:
    #driver.get("https://www.portal.reinvent.awsevents.com/connect/search.ww#loadSearch-searchPhrase=&searchType=session&tc=0&sortBy=daytime&dayID="+str(day))
    driver.get("https://www.portal.reinvent.awsevents.com/connect/search.ww#loadSearch-searchPhrase=&searchType=session&tc=0&sortBy=abbreviationSort&p=&i(728)="+str(venue))
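    # The i(728)=<code> fragment appears to be the catalog's venue filter;
    # the values in VENUE_CODES presumably match the site's venue facet IDs.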
    sleep(3)
    print("Getting Content for Venue Code: " + str(venue))
    more_results = True
    # Click through all of the session results pages for a specific venue.
    # The goal is to get the full list for the venue loaded.
    while more_results:
        try:
            # Scroll to the bottom, then find the Get More Results link and
            # click it to load the next page of sessions
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            get_results_btn = driver.find_element_by_link_text("Get More Results")
            get_results_btn.click()
            sleep(3)
        except NoSuchElementException:
            # No Get More Results link left: the full list has loaded
            more_results = False
    # Once all sessions for the venue have been loaded by the headless browser,
    # append the page source to a variable for parsing with BeautifulSoup.
    content_to_parse = content_to_parse + driver.page_source

driver.close()

# Start the process of pulling out relevant session information and writing it to a file
#soup = BeautifulSoup(content_to_parse, "html5lib")
soup = BeautifulSoup(content_to_parse, "html.parser")
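# html.parser ships with the standard library; the commented-out html5lib
# parser is more lenient with malformed markup but requires the extra
# html5lib package.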

# In some event titles, there are audio options available inside of an 'i' tag.
# Strip out all 'i' tags to make this easier on BeautifulSoup.
# Hopefully there is no other italicized text being removed.
for i in soup.find_all('i'):
    i.extract()

# Grab all of the sessionRows from the final set of HTML and work only with those
sessions = soup.find_all("div", class_="sessionRow")

# Open a blank text file to write sessions to
file = open("sessions.txt", "w")

# Create a header row for the file. Note the PIPE (|) DELIMITER.
file.write("Session Number|Session Title|Session Interest|Start Time|End Time|Room and Building|Day\n")

# For each session, pull out the relevant fields and write them to the sessions.txt file.
for session in sessions:
    session_soup = BeautifulSoup(str(session), "html.parser")
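    # Each sessionRow div carries an id of the form "<prefix>_<sessionId>";
    # everything after the underscore is the numeric session ID expected by
    # the scheduling endpoint in get_session_time.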
    session_id = session_soup.find("div", class_="sessionRow")
    session_id = session_id['id']
    session_id = session_id[session_id.find("_")+1:]
    session_timing = get_session_time(session_id)
    session_number = session_soup.find("span", class_="abbreviation")
    session_number = session_number.string.replace(" - ", "")
    session_title = session_soup.find("span", class_="title")
    session_title = session_title.string.rstrip()
    session_abstract = session_soup.find("span", class_="abstract")  # fetched but not currently written out
    # Treat the presence of an 'interested' link as the interest flag
    session_interest = session_soup.find("a", class_="interested")
    if session_interest is None:
        session_interest = False
    else:
        session_interest = True
    write_contents = str(session_number) + "|" + session_title + "|" + str(session_interest) + "|" + str(session_timing['start_time']) + "|" + str(session_timing['end_time']) + "|" + str(session_timing['room']) + "|" + str(session_timing['day'])
    file.write(write_contents.strip() + "\n")
    # Print the session title for each session written to the file
    print(session_title.strip())

file.close()