-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_tests4.py
130 lines (110 loc) · 5.42 KB
/
scrape_tests4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from sentiment_analysis import (get_undervalued_stocks, get_ticker_news_sentiment, generate_csv_and_json)
from metric_graphs import get_sp500_stocks
import requests
from bs4 import BeautifulSoup
from pprint import pprint
import json
import os
# Prompt the user for the stock ticker to scrape (e.g. "AAPL").
# NOTE: this runs at import time — importing this module blocks on stdin.
ticker_symbol = input("Enter the ticker symbol you would like to scrape: ")
# Construct the URL with the provided ticker symbol
url = f"https://finance.yahoo.com/quote/{ticker_symbol}/key-statistics?p={ticker_symbol}"
# Maps the human-readable metric labels as they appear on the Yahoo Finance
# key-statistics page to snake_case keys for the flattened output dict.
# Many labels carry trailing footnote digits copied from the page markup
# (e.g. "Float 8", "% Held by Insiders 1") — they must match the scraped
# text exactly, so do not "clean them up".
metric_aliases = {
    'Market Cap': 'market_cap',
    'Beta (5Y Monthly)': 'beta',
    '52 Week High 3': '52_week_high',
    '52 Week Low 3': '52_week_low',
    '50-Day Moving Average 3': '50_day_ma',
    '200-Day Moving Average 3': '200_day_ma',
    'Avg Vol (3 month) 3': 'avg_vol_3m',
    'Avg Vol (10 day) 3': 'avg_vol_10d',
    'Shares Outstanding 5': 'shares_outstanding',
    'Float 8': 'float',
    '% Held by Insiders 1': 'held_by_insiders',
    '% Held by Institutions 1': 'held_by_institutions',
    # NOTE(review): this label embeds a literal date from the page snapshot;
    # it will stop matching once Yahoo updates the date — confirm.
    'Short Ratio (Jan 30, 2023) 4': 'short_ratio',
    'Payout Ratio 4': 'payout_ratio',
    'Profit Margin': 'profit_margin',
    'Operating Margin (ttm)': 'operating_margin',
    'Return on Assets (ttm)': 'return_on_assets',
    'Return on Equity (ttm)': 'return_on_equity',
    'Revenue (ttm)': 'revenue',
    'Revenue Per Share (ttm)': 'revenue_per_share',
    'Gross Profit (ttm)': 'gross_profit',
    'EBITDA': 'ebitda',
    'Net Income Avi to Common (ttm)': 'net_income',
    'Diluted EPS (ttm)': 'eps',
    'Total Cash (mrq)': 'total_cash',
    'Total Cash Per Share (mrq)': 'cash_per_share',
    'Total Debt (mrq)': 'total_debt',
    'Total Debt/Equity (mrq)': 'debt_to_equity',
    'Current Ratio (mrq)': 'current_ratio',
    'Book Value Per Share (mrq)': 'book_value_per_share',
    'Operating Cash Flow (ttm)': 'operating_cash_flow',
    'Levered Free Cash Flow (ttm)': 'levered_free_cash_flow'
}
def get_headers():
    """Return HTTP request headers impersonating a desktop Chrome browser.

    Yahoo Finance serves different (or no) markup to clients without a
    browser-like User-Agent, so every request goes out with this header.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/81.0.4044.122 Safari/537.36"
    )
    return {"user-agent": user_agent}
def _scrape_card_section(section, dest):
    """Copy every card's label/value table from *section* into *dest*.

    Each card is stored under its <h3> title as a {label: value} dict.
    Cards or rows missing the expected elements are skipped instead of
    raising AttributeError (Yahoo's markup changes frequently).
    """
    for card in section.find_all('section', class_='card small tw-p-0 yf-13ievhf sticky'):
        title_el = card.find('h3', class_='title font-condensed yf-13ievhf clip')
        if title_el is None:
            continue
        card_data = {}
        for row in card.find_all('tr'):
            label_el = row.find('td', class_='label yf-vaowmx')
            value_el = row.find('td', class_='value yf-vaowmx')
            if label_el is not None and value_el is not None:
                card_data[label_el.text.strip()] = value_el.text.strip()
        dest[title_el.text.strip()] = card_data


def scrape_data(url):
    """Scrape the Yahoo Finance key-statistics page at *url*.

    Collects the Financial Highlights, Trading Information and Valuation
    Measures sections, then flattens them through ``traverse_data`` so the
    result is a flat dict keyed by the snake_case names in
    ``metric_aliases``.

    Raises requests.HTTPError on a non-2xx response and
    requests.Timeout if Yahoo does not answer within 30 seconds.
    """
    page = requests.get(url, headers=get_headers(), timeout=30)
    page.raise_for_status()  # fail fast instead of parsing an error page
    soup = BeautifulSoup(page.content, 'html.parser')
    data_pre = {'financial_highlights': {}, 'trading_information': {}, 'valuation_measures': {}}

    # Financial Highlights is the first section with this class,
    # Trading Information the second; guard against markup changes
    # (the original code crashed with AttributeError/IndexError here).
    stat_sections = soup.find_all('section', class_='yf-14j5zka')
    if len(stat_sections) > 0:
        _scrape_card_section(stat_sections[0], data_pre['financial_highlights'])
    if len(stat_sections) > 1:
        _scrape_card_section(stat_sections[1], data_pre['trading_information'])

    # Valuation Measures: a plain metric/value table identified by test id.
    valuation_section = soup.find('section', {'data-testid': 'qsp-statistics'})
    if valuation_section:
        for row in valuation_section.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) >= 2:
                metric = cols[0].text.strip()
                value = cols[1].text.strip()
                data_pre['valuation_measures'][metric] = value

    # Flatten the nested sections into one alias-keyed dict.
    return traverse_data(data_pre, metric_aliases)
# Kept for backward compatibility: earlier revisions accumulated scrape
# results into this module-level dict. traverse_data no longer touches it.
data = {}


def traverse_data(data_pre, metric_aliases, _out=None):
    """Flatten the nested scraped dict into {alias: value}.

    Walks *data_pre* recursively; whenever a leaf's key (or value) equals
    either side of a ``metric_aliases`` entry, the leaf value is stored
    under that entry's snake_case alias.

    _out: internal recursion accumulator — callers should omit it.

    Fix over the original: results were collected in a module-level global,
    so a second call returned stale entries from the first. The accumulator
    is now local, and the recursive call's result is no longer discarded.
    """
    out = {} if _out is None else _out
    for key, value in data_pre.items():
        if isinstance(value, dict):
            traverse_data(value, metric_aliases, out)
        else:
            for alias_key, alias_value in metric_aliases.items():
                if key == alias_key or key == alias_value:
                    out[alias_value] = value
                elif value == alias_key or value == alias_value:
                    # NOTE(review): matching on the *value* looks suspect but
                    # is preserved from the original logic — confirm intent.
                    out[alias_value] = value
    return out
def save_to_json(data):
    """Persist the scraped metrics dict to ``out/scraped_data.json``.

    Creates the ``out`` directory on first use; overwrites any previous
    snapshot. Output is pretty-printed with a 4-space indent.
    """
    os.makedirs('out', exist_ok=True)
    with open('out/scraped_data.json', 'w') as fh:
        json.dump(data, fh, indent=4)
if __name__ == '__main__':
    # Scrape the key-statistics page for the ticker entered at startup,
    # show the flattened metrics, and write them to out/scraped_data.json.
    data = scrape_data(url)
    pprint(data)
    save_to_json(data)
    # Downstream analyses for the same ticker (project-local modules).
    generate_csv_and_json(ticker_symbol) # from sentiment_analysis.py
    get_sp500_stocks(ticker_symbol) # from metric_graphs.py