#!/usr/bin/env python3
# scraper.py
# web scraper for magicformulainvesting.com
# pulls company information from the site to save the time that would be spent typing it out manually
# Gavin Inglis
# January 2019
import datetime
import getpass
import re
import time
import zipfile

import gspread
import requests
from oauth2client.service_account import ServiceAccountCredentials
from selenium import webdriver
# Get the latest chromedriver zip file for macOS and extract it into this folder
try:
    version = requests.get('https://chromedriver.storage.googleapis.com/LATEST_RELEASE').text
    url = 'https://chromedriver.storage.googleapis.com/{0}/{1}'.format(version, 'chromedriver_mac64.zip')
    r = requests.get(url, allow_redirects=True)
    with open('chromedriver.zip', 'wb') as f:
        f.write(r.content)
    with zipfile.ZipFile("chromedriver.zip", "r") as zip_ref:
        zip_ref.extractall()
except Exception:
    pass  # fall back to any existing ./chromedriver if the download fails
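# Note: for Chrome 115+ the LATEST_RELEASE file above is no longer updated;
# newer chromedriver builds are published through the "Chrome for Testing"
# endpoints instead. A sketch, assuming that endpoint layout:
#   version = requests.get('https://googlechromelabs.github.io/chrome-for-testing/LATEST_RELEASE_STABLE').text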
'''Globals'''

GOOGLE_URL = 'http://www.google.com/search'

# scope of access for the Sheets/Drive API
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']

# credentials file generated by the Google developer console when creating the Sheets API
credentials = ServiceAccountCredentials.from_json_keyfile_name('PATH TO YOUR CREDENTIALS', scope)
gc = gspread.authorize(credentials)
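# Note: the target spreadsheet must be shared with the service account's
# client_email (found in the JSON keyfile), or open_by_url() below will fail
# with a permissions error.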
# login url for the site
url = 'https://www.magicformulainvesting.com/Account/LogOn'

options = webdriver.ChromeOptions()
options.add_argument('headless')

# declare driver as a headless Chrome instance
driver = webdriver.Chrome(executable_path="./chromedriver", options=options)
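# Note: the executable_path argument was removed in Selenium 4; the equivalent
# there is roughly (a sketch, assuming selenium>=4):
#   from selenium.webdriver.chrome.service import Service
#   driver = webdriver.Chrome(service=Service("./chromedriver"), options=options)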
'''Functions'''

def scrapeSite():
    print("Scraping stock info...")  # terminal progress update
    # find all rows of the screening table, then pull the name and ticker cells from each
    trs = driver.find_elements_by_xpath('//table[@class="divheight screeningdata"]/tbody/tr')
    names = []
    tikrs = []
    for tr in trs:
        td = tr.find_elements_by_xpath(".//td")
        company_name = td[0].get_attribute("innerHTML")
        company_tikr = td[1].get_attribute("innerHTML")
        names.append(company_name)
        tikrs.append(company_tikr)
    return names, tikrs
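# Note: the find_elements_by_* helpers used above were removed in Selenium 4;
# the same lookup there would be roughly (a sketch, assuming selenium>=4):
#   from selenium.webdriver.common.by import By
#   trs = driver.find_elements(By.XPATH, '//table[@class="divheight screeningdata"]/tbody/tr')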
def writeSheet(names, tikrs):
    print("Writing to sheet...")  # terminal progress update
    # access the sheet by url; get_worksheet() takes a zero-based index, so 1 is the second worksheet
    wks = gc.open_by_url("YOUR URL HERE").get_worksheet(1)
    # wks.append_row([' '], table_range='A1')  # append a blank line before tickers as requested by OC
    date = datetime.datetime.today().strftime('%Y-%m-%d')  # current date
    wks.append_row([date], table_range='A1')  # append the date, starting in the first column
    for i in range(len(names)):
        price = '=GOOGLEFINANCE("' + tikrs[i] + '","price")'
        query = names[i]
        url = getUrl(query)
        # USER_ENTERED makes Sheets evaluate the GOOGLEFINANCE formula instead of storing it as text
        wks.append_row([names[i], tikrs[i], price, url], table_range='A1', value_input_option="USER_ENTERED")
def getUrl(companyName):
    # search Google for the company name and return the first result link
    url = GOOGLE_URL + '?q=' + companyName
    result = requests.get(url)
    # fancy regex courtesy of pbui: Google wraps result links as /url?q=<target>&...
    urls = re.findall(r'/url\?q=([^&]*)', result.text)
    return urls[0]
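# Note: Google serves different markup to scripted clients and may throttle or
# block them; sending a browser-like User-Agent header, e.g.
#   requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
# tends to be more reliable (an assumption, not something the original relies on).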
'''Main Execution'''

# go to the login page
driver.get(url)

# find the input elements for logging in
username = driver.find_element_by_name("Email")
password = driver.find_element_by_name("Password")

# prompt for email and password; getpass hides the password (i.e. no plaintext echo)
your_email = input("Please enter your email for magicformulainvesting.com: ")
your_password = getpass.getpass("Please enter your password for magicformulainvesting.com: ")
username.send_keys(your_email)
password.send_keys(your_password)

# enter email and password (for hard-coding only)
# username.send_keys("EMAIL")
# password.send_keys("PASSWORD")

# click the login button and give the page a moment to load
button = driver.find_element_by_name("login")
button.click()
time.sleep(1)  # seconds

# use xpath to find the radio button element for 50 stocks and click it
radio = driver.find_element_by_xpath('//input[@value="false" and contains(@name,"Select30")]')
radio.click()
button2 = driver.find_element_by_name("stocks")
button2.click()
time.sleep(.5)

names, tikrs = scrapeSite()
driver.quit()
writeSheet(names, tikrs)
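# Usage sketch (assumes a chromedriver matching the installed Chrome version
# sits in this directory, and the credentials path and sheet URL placeholders
# above have been filled in):
#   $ python3 scraper.py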