#!/usr/bin/env python3
# scraper.py
# web scraper for magicformulainvesting.com
# pulls company information from the site to save the time that would be spent typing it out manually
# Gavin Inglis
# January 2019
import datetime
import getpass
import re
import time
import zipfile

import gspread
import requests
from oauth2client.service_account import ServiceAccountCredentials
from selenium import webdriver
# Get the latest chromedriver zip file for macOS and extract it into this folder
try:
    version = requests.get('https://chromedriver.storage.googleapis.com/LATEST_RELEASE').text
    url = 'https://chromedriver.storage.googleapis.com/{0}/{1}'.format(version, 'chromedriver_mac64.zip')
    r = requests.get(url, allow_redirects=True)
    with open('chromedriver.zip', 'wb') as f:
        f.write(r.content)
    with zipfile.ZipFile("chromedriver.zip", "r") as zip_ref:
        zip_ref.extractall()
except Exception:
    pass  # fall back to any existing ./chromedriver if the download fails
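# Note: for Chrome 115+ the LATEST_RELEASE file above is no longer updated;
# newer chromedriver builds are published through the "Chrome for Testing"
# endpoints instead. A sketch, assuming that endpoint layout:
#   version = requests.get('https://googlechromelabs.github.io/chrome-for-testing/LATEST_RELEASE_STABLE').text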
'''Globals'''

GOOGLE_URL = 'http://www.google.com/search'

# scope of access for the Sheets/Drive API
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

# credentials file generated by the Google developer console when creating the Sheets API
credentials = ServiceAccountCredentials.from_json_keyfile_name('PATH TO YOUR CREDENTIALS', scope)
gc = gspread.authorize(credentials)
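# Note: the target spreadsheet must be shared with the service account's
# client_email (found in the JSON keyfile), or open_by_url() below will fail
# with a permissions error.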
# login url for the site
url = 'https://www.magicformulainvesting.com/Account/LogOn'

options = webdriver.ChromeOptions()
options.add_argument('headless')

# declare driver as a headless Chrome instance
driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
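# Note: the executable_path argument was removed in Selenium 4; the equivalent
# there is roughly (a sketch, assuming selenium>=4):
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service("./chromedriver"), options=options)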
'''Functions'''

def scrapeSite():
    print("Scraping stock info...")  # terminal progress update
    # find all rows of the screening table, then pull the name and ticker cells from each
    trs = driver.find_elements_by_xpath('//table[@class="divheight screeningdata"]/tbody/tr')
    names = []
    tikrs = []
    for tr in trs:
        td = tr.find_elements_by_xpath(".//td")
        company_name = td[0].get_attribute("innerHTML")
        company_tikr = td[1].get_attribute("innerHTML")
        names.append(company_name)
        tikrs.append(company_tikr)
    return names, tikrs
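# Note: the find_elements_by_* helpers used above were removed in Selenium 4;
# the same lookup there would be roughly (a sketch, assuming selenium>=4):
#   from selenium.webdriver.common.by import By
#   trs = driver.find_elements(By.XPATH, '//table[@class="divheight screeningdata"]/tbody/tr')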
def writeSheet(names, tikrs):
    print("Writing to sheet...")  # terminal progress update
    # access the sheet by url; get_worksheet() takes a zero-based index, so 1 is the second worksheet
    wks = gc.open_by_url("YOUR URL HERE").get_worksheet(1)
    # wks.append_row([' '], table_range='A1')  # append a blank line before tickers as requested by OC
    date = datetime.datetime.today().strftime('%Y-%m-%d')  # current date
    wks.append_row([date], table_range='A1')  # append the date, starting in the first column
    for i in range(len(names)):
        price = '=GOOGLEFINANCE("' + tikrs[i] + '","price")'
        query = names[i]
        url = getUrl(query)
        # USER_ENTERED makes Sheets evaluate the GOOGLEFINANCE formula instead of storing it as text
        wks.append_row([names[i], tikrs[i], price, url], table_range='A1', value_input_option="USER_ENTERED")
def getUrl(companyName):
    # search Google for the company name and return the first result link
    url = GOOGLE_URL + '?q=' + companyName
    result = requests.get(url)
    # fancy regex courtesy of pbui: Google wraps result links as /url?q=<target>&...
    urls = re.findall(r'/url\?q=([^&]*)', result.text)
    return urls[0]
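# Note: Google serves different markup to scripted clients and may throttle or
# block them; sending a browser-like User-Agent header, e.g.
#   requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
# tends to be more reliable (an assumption, not something the original relies on).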
'''Main Execution'''

# go to the login page
driver.get(url)

# find the input elements for logging in
username = driver.find_element_by_name("Email")
password = driver.find_element_by_name("Password")

# prompt for email and password; getpass hides the password (i.e. no plaintext echo)
your_email = input("Please enter your email for magicformulainvesting.com: ")
your_password = getpass.getpass("Please enter your password for magicformulainvesting.com: ")
username.send_keys(your_email)
password.send_keys(your_password)

# enter email and password (for hard-coding only)
# username.send_keys("EMAIL")
# password.send_keys("PASSWORD")

# click the login button and give the page a moment to load
button = driver.find_element_by_name("login")
button.click()
time.sleep(1)  # seconds

# use xpath to find the radio button element for 50 stocks and click it
radio = driver.find_element_by_xpath('//input[@value="false" and contains(@name,"Select30")]')
radio.click()
button2 = driver.find_element_by_name("stocks")
button2.click()
time.sleep(.5)

names, tikrs = scrapeSite()
driver.quit()
writeSheet(names, tikrs)
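# Usage sketch (assumes a chromedriver matching the installed Chrome version
# sits in this directory, and the credentials path and sheet URL placeholders
# above have been filled in):
#   $ python3 scraper.py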