Fixed for 2018 schedule #5

Open · wants to merge 1 commit into master
4 changes: 3 additions & 1 deletion .gitignore
@@ -1,2 +1,4 @@
chromedriver*
sessions.txt
sessions.csv
*.txt
7 changes: 6 additions & 1 deletion README.md
@@ -1,4 +1,9 @@
# AWS re:Invent 2017 Schedule Extract Tool
# AWS re:Invent 2018 Schedule Extract Tool

Modified to work with the 2018 schedule.
Original codebase: https://github.com/mda590/reinvent_schedule_extract

-------------------------------------

This tool is meant to make it super easy to export your re:Invent schedule into a text file and then import it into whatever tool you prefer to work with.

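A typical run looks something like this (a sketch, assuming the default settings in `reinvent.py` and a chromedriver binary next to the script):

```
pip install selenium requests beautifulsoup4
python reinvent.py
```

With `downloadDataFromWeb = True` the script saves one `<venueid>.txt` page dump per venue, then writes the parsed schedule to `sessions.txt` as comma-separated values.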
197 changes: 115 additions & 82 deletions reinvent.py
@@ -1,12 +1,12 @@
############################################################################################
#### AWS re:Invent 2017 - Session Information Downloader
#### AWS re:Invent 2018 - Session Information Downloader
# Provides a quick and dirty way to export AWS re:Invent session content from the event website.
# Requirements:
# 1. Update your event website credentials in the USERNAME and PASSWORD vars.
# 2. Download the Chrome web driver (https://sites.google.com/a/chromium.org/chromedriver/downloads).
# 3. Change the CHROME_DRIVER var to point to the driver location.
#
# @author Matt Adorjan
# @author Originally written by Matt Adorjan
# @email matt.adorjan@gmail.com
############################################################################################

@@ -15,29 +15,32 @@
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import os
import requests
from time import sleep
from bs4 import BeautifulSoup
import re

#Venetian, Encore, Aria, MGM, Mirage, LINQ
VENUE_CODES = [22188,728,22191,22190,22583,22584]
# Set username and password for reinvent event website
USERNAME = 'USERNAME'
PASSWORD = 'PASSWORD'
USERNAME = 'YOUR USERNAME HERE'
PASSWORD = 'YOUR PASSWORD HERE'

# Set to True to download the data from the web, or False to use a pre-downloaded set of data
# (useful if you want to tweak the parsing without re-fetching everything).
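# A typical (assumed) workflow: run once with downloadDataFromWeb = True to save the raw
# <venueid>.txt page dumps, then flip it back to False while iterating on the parsing below.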
downloadDataFromWeb = False

# Chrome web driver path
CHROME_DRIVER = './chromedriver'

# Set to False to ignore SSL certificate validation in Requests package
REQ_VERIFY = True

# Venetian, Encore, Aria, MGM, Mirage, Bellagio, Vdara
VENUE_CODES = [22188,728,22191,22190,22583,22584,24372]

# Initialize headless chrome
chrome_options = Options()
chrome_options.add_argument("--headless")
content_to_parse = ''

driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROME_DRIVER)
# Uncomment this line to run chromedriver in headless mode (hides the browser window)
#chrome_options.add_argument("--headless")

content_to_parse = ''

def login(chrome_driver, username, password):
    '''
@@ -49,74 +52,68 @@ def login(chrome_driver, username, password):
    username_field.send_keys(username)
    password_field = chrome_driver.find_element_by_id("loginPassword")
    password_field.send_keys(password)
    cookieAccept = chrome_driver.find_element_by_id("cookieAgreementAcceptButton")
    cookieAccept.click()
    login_button = chrome_driver.find_element_by_id("loginButton")
    login_button.click()

def get_session_time(session_id):
    '''
    Calls the API on the reinvent event website which returns session times.
    Outputs a JSON object with time and room information for a specific session.
    '''
    url = 'https://www.portal.reinvent.awsevents.com/connect/dwr/call/plaincall/ConnectAjax.getSchedulingJSON.dwr'
    data = {
        "callCount": 1,
        "windowName": "",
        "c0-scriptName": "ConnectAjax",
        "c0-methodName": "getSchedulingJSON",
        "c0-id": 0,
        "c0-param0": "number:" + session_id,
        "batchId": 5,
        "instanceId": 0,
        "page": "%2Fconnect%2Fsearch.ww",
        "scriptSessionId": "1234567"
    }
    headers = {'Content-Type': 'text/plain'}
    r = requests.post(url, headers=headers, data=data, verify=REQ_VERIFY)
    # Use the decoded response text (not raw bytes) so the string replace works under Python 3
    returned = r.text.replace("\\", '')

    # Returns in XHR format. Strip out the relevant information.
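    # A (hypothetical) fragment of the DWR payload might look like:
    #   ...{"startTime":"Monday, Nov 26, 1:00 PM","endTime":"2:00 PM","room":"Venetian, Level 2, Titian 2205"}...
    # so each field can be pulled out with the non-greedy quoted-string matches below.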
    start_time = re.search(r"startTime\":(\".*?\")", returned, re.DOTALL | re.MULTILINE).group(1)
    end_time = re.search(r"endTime\":(\".*?\")", returned, re.DOTALL | re.MULTILINE).group(1)
    room = re.search(r"room\":(\".*?\")", returned, re.DOTALL | re.MULTILINE).group(1)

    time_information = {
        "start_time": start_time.replace('"', ''),
        "end_time": end_time.replace('"', ''),
        "room": room.replace('"', ''),
        "day": start_time.replace('"', '')[:start_time.replace('"', '').find(',')]
    }

    return time_information

# Login to the reinvent website
login(driver, USERNAME, PASSWORD)

# Getting content by day, instead of the entire set, because sometimes the
# Get More Results link stops working on the full list. Haven't had issues
# looking at the lists day by day.
for venue in VENUE_CODES:
    #driver.get("https://www.portal.reinvent.awsevents.com/connect/search.ww#loadSearch-searchPhrase=&searchType=session&tc=0&sortBy=daytime&dayID="+str(day))
    driver.get("https://www.portal.reinvent.awsevents.com/connect/search.ww#loadSearch-searchPhrase=&searchType=session&tc=0&sortBy=abbreviationSort&p=&i(728)="+str(venue))
    sleep(3)
    print("Getting Content for Venue Code: " + str(venue))
    more_results = True
    # Click through all of the session results pages for a specific day.
    # The goal is to get the full list for a day loaded.
    while more_results:
        try:
            # Find the Get More Results link and click it to load next sessions
            get_results_btn = driver.find_element_by_link_text("Get More Results")
            get_results_btn.click()
            sleep(3)
        except NoSuchElementException:
            more_results = False

    # Once all sessions for the day have been loaded by the headless browser,
    # append to a variable for use in BS.
    content_to_parse = content_to_parse + driver.page_source

driver.close()
def loadSessionContentsFromURL():
    global content_to_parse

    driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROME_DRIVER)

    # Login to the reinvent website
    login(driver, USERNAME, PASSWORD)

    # Getting content by venue, instead of the entire set, because sometimes the
    # Get More Results link stops working on the full list. Haven't had issues
    # looking at the lists venue by venue.
    for venue in VENUE_CODES:
        driver.get("https://www.portal.reinvent.awsevents.com/connect/search.ww#loadSearch-searchPhrase=&searchType=session&tc=0&sortBy=abbreviationSort&p=&i(728)="+str(venue))
        sleep(3)
        print("Getting Content for Venue Code: " + str(venue))
        more_results = True
        # Click through all of the session results pages for a specific venue.
        # The goal is to get the full list for a venue loaded.
        while more_results:
            try:
                # Find the Get More Results link and click it to load next sessions
                get_results_btn = driver.find_element_by_link_text("Get More Results")
                get_results_btn.click()
                sleep(3)
            except NoSuchElementException:
                more_results = False

        # Go through all the links and expand the scheduling options.
        # This loads the schedule times inline (and means we don't need to mess with
        # the AJAX call, which needs extra session state).
        sessions = driver.find_element_by_id('searchResult')
        sessionTimes = sessions.find_elements_by_xpath("//*[contains(@onclick,'showAvailSessions')]")
        for link in sessionTimes:
            link.click()
            sleep(0.250)

        # Write the fully expanded page source to <venueid>.txt
        with open("{}.txt".format(venue), "w") as out:
            out.write(driver.page_source)

    driver.close()

def loadSessionContentsFromFile():
    global content_to_parse
    # Read back the page dump previously saved for each venue and append it
    # to the content to be parsed.
    for venue in VENUE_CODES:
        with open("{}.txt".format(venue), "r") as infile:
            data = infile.read()
            content_to_parse = content_to_parse + data


if downloadDataFromWeb:
    loadSessionContentsFromURL()
else:
    loadSessionContentsFromFile()

# Start the process of pulling out the relevant session information and writing it to a file
#soup = BeautifulSoup(content_to_parse, "html5lib")
@@ -133,19 +130,49 @@ def get_session_time(session_id):

# Open a blank text file to write sessions to
file = open("sessions.txt","w")

# Create a header row for the file. (The old format used a PIPE (|) delimiter; the new one is comma-separated.)
file.write("Session Number|Session Title|Session Interest|Start Time|End Time|Room and Building\n")
file.write("Session Number,Session Title,Session Level,Session Interest,Day,Start Time,End Time,Building,Room\n")
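# An example data row (illustrative values only) would look like:
# ARC201,"Example Session Title",201,False,Nov 26,10:00 AM,11:00 AM,Venetian,Level 2 - Titian 2205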

# For each session, pull out the relevant fields and write them to the sessions.txt file.
unableToGet = []
for session in sessions:
    session_soup = BeautifulSoup(str(session), "html.parser")
    session_id = session_soup.find("div", class_="sessionRow")
    session_id = session_id['id']
    session_id = session_id[session_id.find("_")+1:]
    session_timing = get_session_time(session_id)

    # Grab the schedule timings from the expanded "Available Sessions" list
    text = session_soup.find("ul", class_="availableSessions").text
    # Drop the first 37 characters (presumably the list's fixed heading text)
    text = text[37:]
    #print("{} - [{}]".format(session_id, text))
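    # The timing text is assumed to look roughly like this (hypothetical example):
    #   "Wednesday, Nov 28, 1:00 PM - 2:00 PM. Venetian, Level 3, Murano 3201"
    # The regex below splits it into weekday, day, start/end time, building, level, and room.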

    match = re.search("([^,]*), ([^,]*), ([^-]*)- ([^-–]*). ([^,]*), ([^,]*), (.*)", text, re.DOTALL | re.MULTILINE)
    if match is None:
        unableToGet.append(session_id)
        session_timing = {
            "start_time": "Unknown",
            "end_time": "Unknown",
            "building": "Unknown",
            "room": "Unknown",
            "day": "Unknown",
        }
    else:
        groups = match.groups()

        session_timing = {
            "start_time": groups[2],
            "end_time": groups[3],
            "building": groups[4],
            "room": "{} - {}".format(groups[5], groups[6].replace(",", " - ")),
            "day": "{}".format(groups[1])
        }

    session_number = session_soup.find("span", class_="abbreviation")
    session_number = session_number.string.replace(" - ", "")
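    # Assumes codes like "ARC201": a three-letter track prefix followed by the session
    # number, whose leading digit is the level (e.g. 2xx = intermediate) - sliced out below.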

    level = session_number[3:6]

    session_title = session_soup.find("span", class_="title")
    session_title = session_title.string.encode('utf-8').rstrip()
    session_title = session_title.decode('utf-8')
@@ -159,9 +186,15 @@ def get_session_time(session_id):
    else:
        session_interest = True

    write_contents = str(session_number) + "|" + session_title + "|" + str(session_interest) + "|" + str(session_timing['start_time']) + "|" + str(session_timing['end_time']) + "|" + str(session_timing['room'] + "|" + str(session_timing['day']))
    file.write(write_contents.encode('utf-8').strip() + "\n")
    write_contents = "{},\"{}\",{},{},{},{},{},{},{}".format(session_number, session_title, level, session_interest, session_timing['day'], session_timing['start_time'], session_timing['end_time'], session_timing['building'], session_timing['room'])
    file.write(write_contents.strip() + "\n")

    # Print the session title for each session written to the file
    print(session_title.encode('utf-8').strip())
    print(session_title.strip())

file.close()

print( "------------")
print( "Unable to get details for the following sessions:")
for session in unableToGet:
print( " {}".format( session ) )