From 11513d6406cf6c2edb09abefc54da6811503ef68 Mon Sep 17 00:00:00 2001 From: AndyQ Date: Mon, 1 Oct 2018 20:34:37 +0100 Subject: [PATCH] Fixed for 2018 schedule --- .gitignore | 4 +- README.md | 7 +- reinvent.py | 197 ++++++++++++++++++++++++++++++---------------------- 3 files changed, 124 insertions(+), 84 deletions(-) diff --git a/.gitignore b/.gitignore index 3664c2e..4ac9ba6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ chromedriver* -sessions.txt \ No newline at end of file +sessions.txt +sessions.csv +*.txt diff --git a/README.md b/README.md index c0995e0..7b6c83a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,9 @@ -# AWS re:Invent 2017 Schedule Extract Tool +# AWS re:Invent 2018 Schedule Extract Tool + +Modified to work with the 2018 schedule +Original codebase - https://github.com/mda590/reinvent_schedule_extract + +------------------------------------- This tool is meant to make it super easy to export your re:invent schedule into a text file and then import it into whatever tool makes it easier for you to work with. diff --git a/reinvent.py b/reinvent.py index 623ac2f..29497b5 100644 --- a/reinvent.py +++ b/reinvent.py @@ -1,12 +1,12 @@ ############################################################################################ -#### AWS re:Invent 2017 - Session Information Downloader +#### AWS re:Invent 2018 - Session Information Downloader # Provides a quick dirty way to export AWS re:Invent session content from the event website. # Requirements: # 1. Update your event website credentials in the USERNAME and PASSWORD vars. # 2. Download the Chrome web driver (https://sites.google.com/a/chromium.org/chromedriver/downloads). # 3. Change the CHROME_DRIVER var to point to the driver location. 
# -# @author Matt Adorjan +# @author written by Matt Adorjan # @email matt.adorjan@gmail.com ############################################################################################ @@ -15,29 +15,32 @@ from selenium.webdriver.common.keys import Keys from selenium.webdriver.chrome.options import Options import os -import requests from time import sleep from bs4 import BeautifulSoup import re -#Venetian, Encore, Aria, MGM, Mirage, LINQ -VENUE_CODES = [22188,728,22191,22190,22583,22584] # Set username and password for reinvent event website -USERNAME = 'USERNAME' -PASSWORD = 'PASSWORD' +USERNAME = 'YOUR USERNAME HERE' +PASSWORD = 'YOUR PASSWORD HERE' + +# Set to True to download the data from the web OR False to use a pre-downloaded set of data +# useful if you want to change the parsed datasets +downloadDataFromWeb = False # Chrome web driver path CHROME_DRIVER = './chromedriver' -# Set to False to ignore SSL certificate validation in Requests package -REQ_VERIFY = True + +# Venetian, Encore, Aria, MGM, Mirage, Bellagio, Vdara +VENUE_CODES = [22188,728,22191,22190,22583,22584,24372] # Initialize headless chrome chrome_options = Options() -chrome_options.add_argument("--headless") -content_to_parse = '' -driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROME_DRIVER) +# Uncomment the next line to run the chromedriver in headless mode (hides the display) +#chrome_options.add_argument("--headless") + +content_to_parse = '' def login(chrome_driver, username, password): ''' @@ -49,74 +52,68 @@ def login(chrome_driver, username, password): username_field.send_keys(username) password_field = chrome_driver.find_element_by_id("loginPassword") password_field.send_keys(password) + cookieAccept = chrome_driver.find_element_by_id( "cookieAgreementAcceptButton" ) + cookieAccept.click() login_button = chrome_driver.find_element_by_id("loginButton") login_button.click() -def get_session_time(session_id): - ''' - Calls the API on the reinvent event website 
which returns session times. - Outputs a JSON object with time and room information for a specific session. - ''' - url = 'https://www.portal.reinvent.awsevents.com/connect/dwr/call/plaincall/ConnectAjax.getSchedulingJSON.dwr' - data = { - "callCount": 1, - "windowName": "", - "c0-scriptName": "ConnectAjax", - "c0-methodName": "getSchedulingJSON", - "c0-id": 0, - "c0-param0": "number:" + session_id, - "batchId": 5, - "instanceId": 0, - "page": "%2Fconnect%2Fsearch.ww", - "scriptSessionId": "1234567" - } - headers = {'Content-Type': 'text/plain'} - r = requests.post(url, headers=headers, data=data, verify=REQ_VERIFY) - returned = r.content - returned = returned.replace("\\", '') - - # Returns in XHR format. Strip out the relevant information. - start_time = re.search(r"startTime\":(\".*?\")", returned, re.DOTALL | re.MULTILINE).group(1) - end_time = re.search(r"endTime\":(\".*?\")", returned, re.DOTALL | re.MULTILINE).group(1) - room = re.search(r"room\":(\".*?\")", returned, re.DOTALL | re.MULTILINE).group(1) - - time_information = { - "start_time": start_time.replace('"', ''), - "end_time": end_time.replace('"', ''), - "room": room.replace('"', ''), - "day": start_time.replace('"', '')[:start_time.replace('"', '').find(',')] - } - - return time_information - -# Login to the reinvent website -login(driver, USERNAME, PASSWORD) - -# Getting content by day, instead of the entire set, because sometimes the -# Get More Results link stops working on the full list. Haven't had issues -# looking at the lists day by day. 
-for venue in VENUE_CODES: - #driver.get("https://www.portal.reinvent.awsevents.com/connect/search.ww#loadSearch-searchPhrase=&searchType=session&tc=0&sortBy=daytime&dayID="+str(day)) - driver.get("https://www.portal.reinvent.awsevents.com/connect/search.ww#loadSearch-searchPhrase=&searchType=session&tc=0&sortBy=abbreviationSort&p=&i(728)="+str(venue)) - sleep(3) - print ("Getting Content for Venue Code: " + str(venue)) - more_results = True - # Click through all of the session results pages for a specific day. - # The goal is to get the full list for a day loaded. - while(more_results): - try: - # Find the Get More Results link and click it to load next sessions - get_results_btn = driver.find_element_by_link_text("Get More Results") - get_results_btn.click() - sleep(3) - except NoSuchElementException as e: - more_results = False - - # Once all sessions for the day have been loaded by the headless browser, - # append to a variable for use in BS. - content_to_parse = content_to_parse + driver.page_source - -driver.close() +def loadSessonContentsFromURL(): + global content_to_parse + + driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROME_DRIVER) + + # Login to the reinvent website + login(driver, USERNAME, PASSWORD) + + # Getting content by day, instead of the entire set, because sometimes the + # Get More Results link stops working on the full list. Haven't had issues + # looking at the lists day by day. + for venue in VENUE_CODES: + driver.get("https://www.portal.reinvent.awsevents.com/connect/search.ww#loadSearch-searchPhrase=&searchType=session&tc=0&sortBy=abbreviationSort&p=&i(728)="+str(venue)) + sleep(3) + print ("Getting Content for Venue Code: " + str(venue)) + more_results = True + # Click through all of the session results pages for a specific day. + # The goal is to get the full list for a day loaded. 
+ while(more_results): + try: + # Find the Get More Results link and click it to load next sessions + get_results_btn = driver.find_element_by_link_text("Get More Results") + get_results_btn.click() + sleep(3) + except NoSuchElementException as e: + more_results = False + + # Go through all the links and expand the scheduling options + # this loads the schedule times (and means we don't need to mess with the AJAX call which + # requires sessions and stuff) + sessions = driver.find_element_by_id('searchResult') + sessionTimes = sessions.find_elements_by_xpath( "//*[contains(@onclick,'showAvailSessions')]") + for link in sessionTimes: + link.click() + sleep(0.250) + + # write to .txt + with open( "{}.txt".format(venue), "w") as out: + out.write( driver.page_source ) + + driver.close() + +def loadSessonContentsFromFile( ): + global content_to_parse + # Getting content by day, instead of the entire set, because sometimes the + # Get More Results link stops working on the full list. Haven't had issues + # looking at the lists day by day. + for venue in VENUE_CODES: + with open( "{}.txt".format(venue), "r") as input: + data = input.read() + content_to_parse = content_to_parse + data + + +if downloadDataFromWeb == True: + loadSessonContentsFromURL() +else: + loadSessonContentsFromFile() # Start the process of grabbing out relevant session information and writing to a file #soup = BeautifulSoup(content_to_parse, "html5lib") @@ -133,19 +130,49 @@ def get_session_time(session_id): # Open a blank text file to write sessions to file = open("sessions.txt","w") + # Create a header row for the file. Note the PIPE (|) DELIMITER. -file.write("Session Number|Session Title|Session Interest|Start Time|End Time|Room and Building\n") +file.write("Session Number,Session Title,Session Level,Session Interest,Day,Start Time,End Time,Building,Room\n") # For each session, pull out the relevant fields and write them to the sessions.txt file. 
+unableToGet = [] for session in sessions: session_soup = BeautifulSoup(str(session), "html.parser") session_id = session_soup.find("div", class_="sessionRow") session_id = session_id['id'] session_id = session_id[session_id.find("_")+1:] - session_timing = get_session_time(session_id) + + # Grab the schedule timings + text = session_soup.find( "ul", class_="availableSessions").text + text = text[37:] + #print( "{} - [{}]".format( session_id, text ) ) + + match = re.search("([^,]*), ([^,]*), ([^-]*)- ([^-–]*). ([^,]*), ([^,]*), (.*)", text, re.DOTALL | re.MULTILINE) + if match == None: + unableToGet.append( session_id ) + session_timing = { + "start_time": "Unknown", + "end_time": "Unknown", + "building": "Unknown", + "room": "Unknown", + "day": "Unknown", + } + else: + groups = match.groups() + + session_timing = { + "start_time": groups[2], + "end_time": groups[3], + "building": groups[4], + "room": "{} - {}".format(groups[5], groups[6].replace( ",", " - ")), + "day": "{}".format(groups[1]) + } + session_number = session_soup.find("span", class_="abbreviation") session_number = session_number.string.replace(" - ", "") + level = session_number[3:6] + session_title = session_soup.find("span", class_="title") session_title = session_title.string.encode('utf-8').rstrip() session_title = session_title.decode('utf-8') @@ -159,9 +186,15 @@ def get_session_time(session_id): else: session_interest = True - write_contents = str(session_number) + "|" + session_title + "|" + str(session_interest) + "|" + str(session_timing['start_time']) + "|" + str(session_timing['end_time']) + "|" + str(session_timing['room'] + "|" + str(session_timing['day'])) - file.write(write_contents.encode('utf-8').strip() + "\n") + write_contents = "{},\"{}\",{},{},{},{},{},{},{}".format(session_number, session_title, level, session_interest, session_timing['day'], session_timing['start_time'], session_timing['end_time'], session_timing['building'], session_timing['room']) + 
file.write(write_contents.strip() + "\n") + # Print the session title for each session written to the file - print (session_title.encode('utf-8').strip()) + print (session_title.strip()) file.close() + +print( "------------") +print( "Unable to get details for the following sessions:") +for session in unableToGet: + print( " {}".format( session ) ) \ No newline at end of file