Fixed for 2018 schedule #5

Open · wants to merge 1 commit into master
4 changes: 3 additions & 1 deletion .gitignore
@@ -1,2 +1,4 @@
chromedriver*
sessions.txt
sessions.csv
*.txt
7 changes: 6 additions & 1 deletion README.md
@@ -1,4 +1,9 @@
# AWS re:Invent 2017 Schedule Extract Tool
# AWS re:Invent 2018 Schedule Extract Tool

Modified to work with the 2018 schedule.
Original codebase: https://github.com/mda590/reinvent_schedule_extract

-------------------------------------

This tool is meant to make it super easy to export your re:Invent schedule into a text file and then import it into whatever tool you prefer to work with.

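A typical run looks something like this (a sketch, assuming the default settings in `reinvent.py` and a chromedriver binary next to the script):

```
pip install selenium requests beautifulsoup4
python reinvent.py
```

With `downloadDataFromWeb = True` the script saves one `<venueid>.txt` page dump per venue, then writes the parsed schedule to `sessions.txt` as comma-separated values.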
197 changes: 115 additions & 82 deletions reinvent.py
@@ -1,12 +1,12 @@
############################################################################################
#### AWS re:Invent 2017 - Session Information Downloader
#### AWS re:Invent 2018 - Session Information Downloader
# Provides a quick and dirty way to export AWS re:Invent session content from the event website.
# Requirements:
# 1. Update your event website credentials in the USERNAME and PASSWORD vars.
# 2. Download the Chrome web driver (https://sites.google.com/a/chromium.org/chromedriver/downloads).
# 3. Change the CHROME_DRIVER var to point to the driver location.
#
# @author Matt Adorjan
# @author Originally written by Matt Adorjan
# @email matt.adorjan@gmail.com
############################################################################################

@@ -15,29 +15,32 @@
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
import os
import requests
from time import sleep
from bs4 import BeautifulSoup
import re

#Venetian, Encore, Aria, MGM, Mirage, LINQ
VENUE_CODES = [22188,728,22191,22190,22583,22584]
# Set username and password for reinvent event website
USERNAME = 'USERNAME'
PASSWORD = 'PASSWORD'
USERNAME = 'YOUR USERNAME HERE'
PASSWORD = 'YOUR PASSWORD HERE'

# Set to True to download the data from the web, or False to use a pre-downloaded set of data
# (useful if you want to tweak the parsing without re-fetching everything).
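# A typical (assumed) workflow: run once with downloadDataFromWeb = True to save the raw
# <venueid>.txt page dumps, then flip it back to False while iterating on the parsing below.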
downloadDataFromWeb = False

# Chrome web driver path
CHROME_DRIVER = './chromedriver'

# Set to False to ignore SSL certificate validation in Requests package
REQ_VERIFY = True

# Venetian, Encore, Aria, MGM, Mirage, Bellagio, Vdara
VENUE_CODES = [22188,728,22191,22190,22583,22584,24372]

# Initialize headless chrome
chrome_options = Options()
chrome_options.add_argument("--headless")
content_to_parse = ''

driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROME_DRIVER)
# Uncomment this line to run chromedriver in headless mode (hides the browser window)
#chrome_options.add_argument("--headless")

content_to_parse = ''

def login(chrome_driver, username, password):
    '''
@@ -49,74 +52,68 @@ def login(chrome_driver, username, password):
    username_field.send_keys(username)
    password_field = chrome_driver.find_element_by_id("loginPassword")
    password_field.send_keys(password)
    cookieAccept = chrome_driver.find_element_by_id("cookieAgreementAcceptButton")
    cookieAccept.click()
    login_button = chrome_driver.find_element_by_id("loginButton")
    login_button.click()

def get_session_time(session_id):
    '''
    Calls the API on the reinvent event website which returns session times.
    Outputs a JSON object with time and room information for a specific session.
    '''
    url = 'https://www.portal.reinvent.awsevents.com/connect/dwr/call/plaincall/ConnectAjax.getSchedulingJSON.dwr'
    data = {
        "callCount": 1,
        "windowName": "",
        "c0-scriptName": "ConnectAjax",
        "c0-methodName": "getSchedulingJSON",
        "c0-id": 0,
        "c0-param0": "number:" + session_id,
        "batchId": 5,
        "instanceId": 0,
        "page": "%2Fconnect%2Fsearch.ww",
        "scriptSessionId": "1234567"
    }
    headers = {'Content-Type': 'text/plain'}
    r = requests.post(url, headers=headers, data=data, verify=REQ_VERIFY)
    # Use the decoded response text (not raw bytes) so the string replace works under Python 3
    returned = r.text.replace("\\", '')

    # Returns in XHR format. Strip out the relevant information.
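    # A (hypothetical) fragment of the DWR payload might look like:
    #   ...{"startTime":"Monday, Nov 26, 1:00 PM","endTime":"2:00 PM","room":"Venetian, Level 2, Titian 2205"}...
    # so each field can be pulled out with the non-greedy quoted-string matches below.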
    start_time = re.search(r"startTime\":(\".*?\")", returned, re.DOTALL | re.MULTILINE).group(1)
    end_time = re.search(r"endTime\":(\".*?\")", returned, re.DOTALL | re.MULTILINE).group(1)
    room = re.search(r"room\":(\".*?\")", returned, re.DOTALL | re.MULTILINE).group(1)

    time_information = {
        "start_time": start_time.replace('"', ''),
        "end_time": end_time.replace('"', ''),
        "room": room.replace('"', ''),
        "day": start_time.replace('"', '')[:start_time.replace('"', '').find(',')]
    }

    return time_information

# Login to the reinvent website
login(driver, USERNAME, PASSWORD)

# Getting content by day, instead of the entire set, because sometimes the
# Get More Results link stops working on the full list. Haven't had issues
# looking at the lists day by day.
for venue in VENUE_CODES:
    #driver.get("https://www.portal.reinvent.awsevents.com/connect/search.ww#loadSearch-searchPhrase=&searchType=session&tc=0&sortBy=daytime&dayID="+str(day))
    driver.get("https://www.portal.reinvent.awsevents.com/connect/search.ww#loadSearch-searchPhrase=&searchType=session&tc=0&sortBy=abbreviationSort&p=&i(728)="+str(venue))
    sleep(3)
    print("Getting Content for Venue Code: " + str(venue))
    more_results = True
    # Click through all of the session results pages for a specific day.
    # The goal is to get the full list for a day loaded.
    while more_results:
        try:
            # Find the Get More Results link and click it to load next sessions
            get_results_btn = driver.find_element_by_link_text("Get More Results")
            get_results_btn.click()
            sleep(3)
        except NoSuchElementException:
            more_results = False

    # Once all sessions for the day have been loaded by the headless browser,
    # append to a variable for use in BS.
    content_to_parse = content_to_parse + driver.page_source

driver.close()
def loadSessionContentsFromURL():
    global content_to_parse

    driver = webdriver.Chrome(chrome_options=chrome_options, executable_path=CHROME_DRIVER)

    # Login to the reinvent website
    login(driver, USERNAME, PASSWORD)

    # Getting content by venue, instead of the entire set, because sometimes the
    # Get More Results link stops working on the full list. Haven't had issues
    # looking at the lists venue by venue.
    for venue in VENUE_CODES:
        driver.get("https://www.portal.reinvent.awsevents.com/connect/search.ww#loadSearch-searchPhrase=&searchType=session&tc=0&sortBy=abbreviationSort&p=&i(728)="+str(venue))
        sleep(3)
        print("Getting Content for Venue Code: " + str(venue))
        more_results = True
        # Click through all of the session results pages for a specific venue.
        # The goal is to get the full list for a venue loaded.
        while more_results:
            try:
                # Find the Get More Results link and click it to load next sessions
                get_results_btn = driver.find_element_by_link_text("Get More Results")
                get_results_btn.click()
                sleep(3)
            except NoSuchElementException:
                more_results = False

        # Go through all the links and expand the scheduling options.
        # This loads the schedule times inline (and means we don't need to mess with
        # the AJAX call, which needs extra session state).
        sessions = driver.find_element_by_id('searchResult')
        sessionTimes = sessions.find_elements_by_xpath("//*[contains(@onclick,'showAvailSessions')]")
        for link in sessionTimes:
            link.click()
            sleep(0.250)

        # Write the fully expanded page source to <venueid>.txt
        with open("{}.txt".format(venue), "w") as out:
            out.write(driver.page_source)

    driver.close()

def loadSessionContentsFromFile():
    global content_to_parse
    # Read back the page dump previously saved for each venue and append it
    # to the content to be parsed.
    for venue in VENUE_CODES:
        with open("{}.txt".format(venue), "r") as infile:
            data = infile.read()
            content_to_parse = content_to_parse + data


if downloadDataFromWeb:
    loadSessionContentsFromURL()
else:
    loadSessionContentsFromFile()

# Start the process of pulling out the relevant session information and writing it to a file
#soup = BeautifulSoup(content_to_parse, "html5lib")
@@ -133,19 +130,49 @@ def get_session_time(session_id):

# Open a blank text file to write sessions to
file = open("sessions.txt","w")

# Create a header row for the file. (The old format used a PIPE (|) delimiter; the new one is comma-separated.)
file.write("Session Number|Session Title|Session Interest|Start Time|End Time|Room and Building\n")
file.write("Session Number,Session Title,Session Level,Session Interest,Day,Start Time,End Time,Building,Room\n")
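# An example data row (illustrative values only) would look like:
# ARC201,"Example Session Title",201,False,Nov 26,10:00 AM,11:00 AM,Venetian,Level 2 - Titian 2205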

# For each session, pull out the relevant fields and write them to the sessions.txt file.
unableToGet = []
for session in sessions:
    session_soup = BeautifulSoup(str(session), "html.parser")
    session_id = session_soup.find("div", class_="sessionRow")
    session_id = session_id['id']
    session_id = session_id[session_id.find("_")+1:]
    session_timing = get_session_time(session_id)

    # Grab the schedule timings from the expanded "Available Sessions" list
    text = session_soup.find("ul", class_="availableSessions").text
    # Drop the first 37 characters (presumably the list's fixed heading text)
    text = text[37:]
    #print("{} - [{}]".format(session_id, text))
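    # The timing text is assumed to look roughly like this (hypothetical example):
    #   "Wednesday, Nov 28, 1:00 PM - 2:00 PM. Venetian, Level 3, Murano 3201"
    # The regex below splits it into weekday, day, start/end time, building, level, and room.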

    match = re.search("([^,]*), ([^,]*), ([^-]*)- ([^-–]*). ([^,]*), ([^,]*), (.*)", text, re.DOTALL | re.MULTILINE)
    if match is None:
        unableToGet.append(session_id)
        session_timing = {
            "start_time": "Unknown",
            "end_time": "Unknown",
            "building": "Unknown",
            "room": "Unknown",
            "day": "Unknown",
        }
    else:
        groups = match.groups()

        session_timing = {
            "start_time": groups[2],
            "end_time": groups[3],
            "building": groups[4],
            "room": "{} - {}".format(groups[5], groups[6].replace(",", " - ")),
            "day": "{}".format(groups[1])
        }

    session_number = session_soup.find("span", class_="abbreviation")
    session_number = session_number.string.replace(" - ", "")
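    # Assumes codes like "ARC201": a three-letter track prefix followed by the session
    # number, whose leading digit is the level (e.g. 2xx = intermediate) - sliced out below.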

    level = session_number[3:6]

    session_title = session_soup.find("span", class_="title")
    session_title = session_title.string.encode('utf-8').rstrip()
    session_title = session_title.decode('utf-8')
@@ -159,9 +186,15 @@ def get_session_time(session_id):
    else:
        session_interest = True

    write_contents = str(session_number) + "|" + session_title + "|" + str(session_interest) + "|" + str(session_timing['start_time']) + "|" + str(session_timing['end_time']) + "|" + str(session_timing['room'] + "|" + str(session_timing['day']))
    file.write(write_contents.encode('utf-8').strip() + "\n")
    write_contents = "{},\"{}\",{},{},{},{},{},{},{}".format(session_number, session_title, level, session_interest, session_timing['day'], session_timing['start_time'], session_timing['end_time'], session_timing['building'], session_timing['room'])
    file.write(write_contents.strip() + "\n")

    # Print the session title for each session written to the file
    print(session_title.encode('utf-8').strip())
    print(session_title.strip())

file.close()

print( "------------")
print( "Unable to get details for the following sessions:")
for session in unableToGet:
print( " {}".format( session ) )