ticker_scraper.py
import requests
from bs4 import BeautifulSoup as BS


def convert_notation(notation):
    # Convert an abbreviated value, e.g. 1.04B -> 1040000000
    if notation[-1] == 'T':
        notation = int(float(notation[0:-1]) * 1_000_000_000_000)
    elif notation[-1] == 'B':
        notation = int(float(notation[0:-1]) * 1_000_000_000)
    elif notation[-1] == 'M':
        notation = int(float(notation[0:-1]) * 1_000_000)
    elif notation[-1] == 'K':
        notation = int(float(notation[0:-1]) * 1_000)
    else:
        try:
            # If the value is small, it might not have a denomination, e.g. 1000
            notation = int(notation)
        except ValueError:
            # Account for a denomination not covered above (T, B, M, K):
            # strip the trailing character and try to convert what is left.
            try:
                notation = int(float(notation[0:-1]))
            except (ValueError, IndexError):
                # Return None so we can assume that if the value is None, it
                # is missing or wasn't parsed correctly. We check that a value
                # exists before calculations, so None is excluded rather than
                # interrupting with an error.
                return None
    return notation
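
# Quick sanity check of convert_notation (illustrative values only; these are
# example inputs, not figures taken from MarketWatch):
#   convert_notation('1.04B')  -> 1040000000
#   convert_notation('59.43M') -> 59430000
#   convert_notation('1000')   -> 1000
#   convert_notation('N/A')    -> None
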
def parse_elements(stock_data, price_data):
    """From the two lists generated from the html elements, parse the items of
    interest into values of the specified type to create a Stock object.

    This uses absolute references, but the references should be the same for each
    ticker scraped. A null value is stored as None, which can be checked before
    attempting a calculation.

    Most values scraped are strings with text surrounding the numerical value we
    are interested in, e.g. opening price -> "Open $123.40"; replace() swaps the
    leading text for an empty string, leaving only the numerical value. Other
    functions that remove special characters such as percent signs, letter
    denominations, commas, etc. are detailed below.

    Args:
        stock_data (list): List generated by the scraper containing the left
            elements of MarketWatch financial data.
        price_data (list): List generated by the scraper containing the right
            elements of MarketWatch historical price data.

    Returns:
        tuple: ticker, day_open, day_range_low, day_range_high, fiftytwo_low,
            fiftytwo_high, market_cap, shares_outstanding, public_float, beta,
            rev_per_emp, pe_ratio, eps, stock_yield, dividend, ex_dividend,
            short_interest, percent_float, avg_volume, five_day, one_month,
            three_month, ytd, one_year
    """
    ticker = stock_data[0]
    day_open = make_float(stock_data[1].replace('Open $', ''))
    day_range = stock_data[2].replace('Day Range ', '').split()
    day_range_low = make_float(day_range[0])
    day_range_high = make_float(day_range[2])
    fiftytwo_week_range = stock_data[3].replace('52 Week Range ', '').split()
    fiftytwo_low = make_float(fiftytwo_week_range[0])
    fiftytwo_high = make_float(fiftytwo_week_range[2])
    market_cap = convert_notation(stock_data[4].replace('Market Cap $', ''))
    shares_outstanding = convert_notation(
        stock_data[5].replace('Shares Outstanding ', ''))
    public_float = convert_notation(stock_data[6].replace('Public Float ', ''))
    beta = make_float(stock_data[7].replace('Beta ', ''))
    rev_per_emp = convert_notation(
        stock_data[8].replace('Rev. per Employee $', ''))
    pe_ratio = make_float(stock_data[9].replace('P/E Ratio ', ''))
    eps = make_float(stock_data[10].replace('EPS $', ''))
    stock_yield = remove_percent(stock_data[11].replace('Yield ', ''))
    dividend = make_float(stock_data[12].replace('Dividend $', ''))
    ex_dividend = stock_data[13].replace('Ex-Dividend Date ', '')
    short_interest_str = stock_data[14].replace('Short Interest ', '').split()
    short_interest = convert_notation(short_interest_str[0])
    # short_interest_date = short_interest_str[1] - no current use for a short interest date
    percent_float = remove_percent(
        stock_data[15].replace('% of Float Shorted ', ''))
    avg_volume = convert_notation(
        stock_data[16].replace('Average Volume ', ''))
    five_day = remove_percent(price_data[43])
    one_month = remove_percent(price_data[45])
    three_month = remove_percent(price_data[47])
    ytd = remove_percent(price_data[49])
    one_year = remove_percent(price_data[51])
    return (ticker, day_open, day_range_low, day_range_high, fiftytwo_low,
            fiftytwo_high, market_cap, shares_outstanding, public_float, beta,
            rev_per_emp, pe_ratio, eps, stock_yield, dividend, ex_dividend,
            short_interest, percent_float, avg_volume, five_day, one_month,
            three_month, ytd, one_year)

def scrape(ticker):
    # Scrape financial and price history data from the MarketWatch.com pages and
    # store the html elements in two lists from which we will parse the data
    stock_data = []
    price_data = []
    url = f"https://www.marketwatch.com/investing/stock/{ticker}?mod=quote_search"
    print(f"\nRequesting info for {ticker}...")
    page = requests.get(url)
    # Right now this just prints a message so the user can see it is working;
    # once the list exports to JSON, this can be used to confirm the page was
    # retrieved successfully for debugging purposes.
    if page.status_code == 200:
        print("OK")
    soup = BS(page.content, 'html.parser')
    print("Creating soup...")
    # Left table in the "overview" section
    elements_left = soup.find_all('li', class_='kv__item')
    # Right table in the overview section
    elements_right = soup.find_all('td', class_='table__cell')
    # Store the ticker as the first item in the list
    stock_data.append(ticker)
    for li in elements_left:
        # Format the elements and append each to the list after the ticker
        element = li.get_text().replace('\n', ' ')
        stock_data.append(element.strip())
    for tr in elements_right:
        # Format the elements and append each to the list
        element = tr.get_text()
        price_data.append(element.strip())
    # Call the function to parse the raw text items into useful data and types
    parsed_data = parse_elements(stock_data, price_data)
    print(f"Parsing {ticker} data...\n")
    # Return a tuple containing the parsed data
    return parsed_data

def remove_percent(percentage):
    # Remove the % sign at the end of a value, e.g. 7.08% -> 7.08
    percentage = percentage[0:-1]
    try:
        return float(percentage)
    except ValueError:
        return None


def make_float(data):
    # For values > 999, remove the "," character and convert the string
    # to a float, e.g. 1,234.56 -> 1234.56
    try:
        return float(data.replace(',', ''))
    except (ValueError, AttributeError):
        return None
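

# Minimal usage sketch (an illustrative addition, not part of the scraper
# itself): scrape a couple of tickers and print the parsed tuples. The ticker
# symbols below are just example inputs.
if __name__ == "__main__":
    for symbol in ("aapl", "msft"):
        print(scrape(symbol))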