-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathapp2.py
426 lines (379 loc) · 22 KB
/
app2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
from dash import Dash, html, dcc, Input, Output, State, callback, no_update
from dash_dangerously_set_inner_html import DangerouslySetInnerHTML
import dash_mantine_components as dmc
import plotly.express as px
import pymongo
import pandas as pd
import statsmodels.api as sm
import xgboost as xgb
from sklearn.feature_selection import SelectFromModel, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
app = Dash(__name__)
db_params={'db_uri': 'mongodb://localhost:27017/',
'db_name': 'stocks_db',
'db_collection_name': 'daily'}
db_uri = db_params['db_uri']
db_name = db_params['db_name']
db_collection_name = db_params['db_collection_name']
# Connect to MongoDB
client = pymongo.MongoClient(db_uri)
db = client[db_name]
db_collection = db[db_collection_name] # Get/create collection for specific type of stocks data (daily, monthly, etc.)
def get_all_index_symbols(db_collection):
"""
Get list of stock index symbols stored in the database
"""
index_symbols = [elem.replace('Index_','^') for elem in list(db_collection.find_one().keys()) if elem.startswith('Index_')]
# Database query to extract pairs {'Symbol', 'Name'} for each Index Symbol in the database
cursor = db_collection.aggregate([{"$match": {"Symbol": {"$in": index_symbols}
}
},
{ '$group': {'_id' : '$Symbol',
'Symbol': { '$first': '$Symbol' },
'Name' : { '$first': '$Name' }
}
},
{ '$project': {'_id': 0,
'Symbol': 1,
'Name': 1,
}
}
])
query_result = list(cursor)
# Transform pairs {'Symbol', 'Name'} into full strying, ex. convert {^NDX, NASDAQ 100} into '^NDX - NASDAQ 100'
index_symbols_names = [f'{value1} - {value2}' for value1, value2 in [list(dict_.values()) for dict_ in query_result]]
return index_symbols_names
def extract_index_symbol(selected_index_symbol_name: str):
"""
Extract Index Symbol from string combination 'Index_Symbol - Index_Name' (ex. get ^NDX from '^NDX - NASDAQ 100')
"""
selected_index_symbol = selected_index_symbol_name.split(' - ', 1)[0]
return selected_index_symbol
def extract_stock_symbols(selected_stock_symbols_names: list):
"""
Extract Symbols from list of strings combinations ['Stock_Symbol_1 - Stock_Name_1', 'Stock_Symbol_2 - Stock_Name_2', ...]
ex. get ['ZM', 'ZS', 'XEL', ...] from ['ZM - Zoom Video Communications, Inc.', 'ZS - Zscaler, Inc.', 'XEL - Xcel Energy Inc.', ...]
"""
selected_stock_symbols = [selected_stock_symbol_name.split(' - ', 1)[0] for selected_stock_symbol_name in selected_stock_symbols_names]
return selected_stock_symbols
def get_all_stocks_for_index(selected_index_symbol):
"""
Get list of all stocks (stored in the database) for a specific stock index symbol
"""
selected_index_symbol = selected_index_symbol.replace('^', '')
# Query database to get all stocks contained in an index
cursor = db_collection.aggregate([{ '$match' : { f'Index_{selected_index_symbol}': True}
},
{ '$group': {'_id' : '$Symbol',
'Symbol': { '$first': '$Symbol' },
'Name' : { '$first': '$Name' }
}
},
{ '$project': {'_id': 0,
'Symbol': 1,
'Name': 1,
}
}
])
query_result = list(cursor)
return query_result
def get_stock_index_data(selected_index_symbol):
"""
Get data (from the database) for a specific stock index symbol
"""
# Query database to get closing prices for selected stock index:
selected_index_cursor = db_collection.find({ 'Symbol': selected_index_symbol },
{'_id': 0,
'Date': 1,
'Symbol': 1,
'Close': 1
}
)
query_result_selected_index = list(selected_index_cursor)
# Build pandas dataframe with stock index data got from database.
# ex.
# Date (df index) ^GSPC (stock index symbol)
# 2023-07-25 04:00:00 4567.459961
# 2023-07-26 04:00:00 4566.750000
# 2023-07-27 04:00:00 4537.410156
# 2023-07-28 04:00:00 4582.229980
# 2023-07-31 04:00:00 4588.959961
# 2023-08-01 04:00:00 4577.919922
#
data_stock_index = pd.DataFrame(query_result_selected_index)
data_stock_index = data_stock_index.rename(columns={'Close': data_stock_index['Symbol'].iloc[0]})
data_stock_index.drop(columns=['Symbol'], inplace=True)
return data_stock_index
def get_stocks_data(selected_stock_symbols):
"""
Get data (from the database) for a list of stock symbols
"""
# Query database to get closing prices for selected stocks:
selected_stocks_cursor = db_collection.find({ 'Symbol': {"$in": selected_stock_symbols}},
{'_id': 0,
'Date': 1,
'Symbol': 1,
'Close': 1
}
)
query_result_selected_stocks = list(selected_stocks_cursor)
# Build pandas dataframe with stocks data got from database (structure it to have name of column containing closing price for specific stock, equal to the symbol of the stock)
# ex.
# Date (df index) XEL ZM ZS => stock symbols
# 2023-07-25 04:00:00 64.959999 69.330002 155.210007
# 2023-07-26 04:00:00 65.050003 71.099998 156.619995
# 2023-07-27 04:00:00 62.869999 72.389999 155.250000
# 2023-07-28 04:00:00 62.889999 73.080002 157.490005
# 2023-07-31 04:00:00 62.730000 73.349998 160.380005
# 2023-08-01 04:00:00 62.930000 72.485001 163.940002
#
data_stocks_query = pd.DataFrame(query_result_selected_stocks)
gb = data_stocks_query.groupby('Symbol')
data_stocks = pd.DataFrame()
for x in gb.groups:
if data_stocks.empty:
df_temp = gb.get_group(x)
df_temp = df_temp.rename(columns={'Close': df_temp['Symbol'].iloc[0]})
df_temp.drop(columns=['Symbol'], inplace=True)
data_stocks = df_temp
del df_temp
else:
df_temp = gb.get_group(x)
df_temp = df_temp.rename(columns={'Close': df_temp['Symbol'].iloc[0]})
df_temp.drop(columns=['Symbol'], inplace=True)
data_stocks = pd.merge(data_stocks, df_temp, on='Date')
del df_temp
return data_stocks
app.layout = dmc.MantineProvider(
theme={
#"fontFamily": "'Inter', sans-serif",
"primaryColor": "indigo",
"components": {
"Button": {"styles": {"root": {"fontWeight": 400}}},
"Alert": {"styles": {"title": {"fontWeight": 500}}},
"AvatarGroup": {"styles": {"truncated": {"fontWeight": 500}}},
},
},
inherit=True,
withGlobalStyles=True,
withNormalizeCSS=True,
children=[
dmc.Space(h=20),
dmc.Grid([
dmc.Col([dmc.Title(f"Multi-Variate Index Regression", order=4),
dmc.Select(label='Select Stock Index',
placeholder='You can select only 1 stock index',
id='stock-indices-dropdown',
data=get_all_index_symbols(db_collection=db_collection),
value='^NDX - NASDAQ 100',
style={"width": 300}
),
dmc.MultiSelect(label='Select Stocks',
description='You can select between 1 and 10 stocks',
id='stocks-dropdown',
#value=['ZM - Zoom Video Communications, Inc.'],
value=[],
maxSelectedValues=10,
style={"width": 300},
clearable=True,
searchable=True),
dmc.Space(h=10),
dmc.Alert("Unfortunately there was a problem in \nestimating the regression model with the \nselected stocks probably for lack of data \nfor some symbols. Please remove the last \nselected stock(s) from the box or try a different group of stocks.",
id='regression-model-alert',
title="ALERT!",
color="red",
duration=5000,
hide=True,
style={"width": 350,
"white-space": "pre"}),
dmc.Space(h=30),
dmc.Text('Press the below button to automatically select the \nstocks (10 or less) that best explain the index \n(selected above):',
align="left",
size="sm",
style={"width": 350,
"white-space": "pre"}),
dmc.Button("Select Best Stocks", id='button-best-stocks'),
dmc.Space(h=10),
dmc.Alert("I'm Selecting the Stocks! Please wait.",
id='computing-stock-selection-alert',
title="INFO",
color="yellow",
hide=True),
dmc.Space(h=10),
dmc.Alert("Unfortunately there was a problem in \nselecting the stocks that best explain \nthe index. \nPlease select them manually in the box.",
id='best-stocks-alert',
title="ALERT!",
color="red",
duration=5000,
hide=True,
style={"width": 350,
"white-space": "pre"}),
], span=3, offset=0.25),
dmc.Col([dcc.Graph(figure={}, id="scatter-plot"),
], span=5, offset=-0.75),
dmc.Col([html.Div(id='ols-summary-table'),
], span=4),
]),
])
@app.callback(Output('stocks-dropdown', 'data'),
Output('ols-summary-table', 'children'),
Output('scatter-plot', 'figure'),
Input('stock-indices-dropdown', 'value'),
prevent_initial_call=True)
def get_all_stocks_for_index_callback(selected_index_symbol_name):
selected_index_symbol = extract_index_symbol(selected_index_symbol_name)
query_result = get_all_stocks_for_index(selected_index_symbol=selected_index_symbol)
# Convert list of stock symbols into list of strings in the format 'Stock_Symbol - Company_Name' (ex. 'ZM - Zoom Video Communications, Inc.' for symbol 'ZM') and drop Index Symbol from list
symbols_names = [value for value in [f'{value1} - {value2}' for value1, value2 in [list(dict_.values()) for dict_ in query_result]] if value != selected_index_symbol_name]
# Reset OLS regression model results table
ols_table_children = []
# Reset scatterplot figure
figure = {}
return symbols_names, ols_table_children, figure
@app.callback(Output("stocks-dropdown", "error"),
Input("stocks-dropdown", "value"))
def select_value(value):
return "Select at least 1 stock" if len(value) < 1 else ""
@app.callback(Output('ols-summary-table', 'children', allow_duplicate=True),
Output('scatter-plot', 'figure', allow_duplicate=True),
Output("regression-model-alert", "hide"),
State('stock-indices-dropdown', 'value'),
Input('stocks-dropdown', 'value'),
State("regression-model-alert", "hide"),
prevent_initial_call=True)
def multiple_linear_regression(selected_index_symbol_name, selected_stock_symbols_names, hide):
# Make sure the list of symbols is not empty
if not selected_stock_symbols_names:
return DangerouslySetInnerHTML('''<br>'''), {}, hide
# Query database for data
# Query for stock index data
selected_index_symbol = extract_index_symbol(selected_index_symbol_name)
data_stock_index = get_stock_index_data(selected_index_symbol=selected_index_symbol)
# Query for stocks data
selected_stock_symbols = extract_stock_symbols(selected_stock_symbols_names)
data_stocks = get_stocks_data(selected_stock_symbols=selected_stock_symbols)
# Fill NANs by forward filling
data_stock_index.ffill(inplace=True)
data_stocks.ffill(inplace=True)
# Make sure the dataframes for the stock index and the single stocks have the same samples to avoid problems in the estimation of the regression model
data_stock_index = pd.merge(data_stock_index, data_stocks[['Date']], how='inner', on='Date')
data_stocks = pd.merge(data_stock_index[['Date']], data_stocks, how='inner', on='Date')
# Set 'Date' as index on both dataframes
data_stock_index.set_index('Date', inplace=True)
data_stocks.set_index('Date', inplace=True)
try:
y = data_stock_index
# define predictor variables
x = data_stocks
# add constant to predictor variables
x = sm.add_constant(x)
# fit linear regression model (I drop missing values)
model = sm.OLS(y, x, missing='drop').fit()
predictions = model.predict(x).to_frame()
predictions.rename(columns={predictions.columns[0]:'model_prediction'}, inplace=True)
df = pd.merge(y, predictions, on='Date')
fig = px.scatter(df, x="model_prediction", y=selected_index_symbol)
result = model.summary().as_html()
result = DangerouslySetInnerHTML(result)
return result, fig, hide
except:
hide = False
return no_update, no_update, hide
@app.callback(Output("computing-stock-selection-alert", "hide", allow_duplicate=True),
Input("button-best-stocks", "n_clicks"),
State("computing-stock-selection-alert", "hide"),
prevent_initial_call=True)
def button_pressed_callback(n_clicks_button, hide_computing_stock_selection_alert):
hide_computing_stock_selection_alert = False
return hide_computing_stock_selection_alert
@app.callback(Output('stocks-dropdown', 'value', allow_duplicate=True),
Output("best-stocks-alert", "hide"),
Output("computing-stock-selection-alert", "hide"),
State('stock-indices-dropdown', 'value'),
Input("button-best-stocks", "n_clicks"),
State("best-stocks-alert", "hide"),
State("computing-stock-selection-alert", "hide"),
prevent_initial_call=True)
def select_10_best_stocks(selected_index_symbol_name, n_clicks, hide_best_stocks_alert, hide_computing_stock_selection_alert):
# Query database for data
selected_index_symbol = extract_index_symbol(selected_index_symbol_name)
data_stock_index = get_stock_index_data(selected_index_symbol=selected_index_symbol)
# Query for stocks data:
# Get all stock symbols for selected stock index.
# Query result will contain a list like:
# [{'Symbol': 'ZM', 'Name': 'Zoom Video Communications, Inc.'},
# {'Symbol': 'ZS', 'Name': 'Zscaler, Inc.'},
# {'Symbol': '^NDX', 'Name': 'NASDAQ 100'},
# {'Symbol': 'XEL', 'Name': 'Xcel Energy Inc.'},
# ...]
query_result = get_all_stocks_for_index(selected_index_symbol=selected_index_symbol)
stock_index_symbol_name_list = get_all_index_symbols(db_collection=db_collection)
stock_index_symbols_list = [extract_index_symbol(stock_index_symbol_name) for stock_index_symbol_name in stock_index_symbol_name_list]
# Make sure to remove stock index symbol from list of stocks
selected_stock_symbols = [symbol_name_dict['Symbol'] for symbol_name_dict in query_result if symbol_name_dict['Symbol'] not in stock_index_symbols_list]
data_stocks = get_stocks_data(selected_stock_symbols=selected_stock_symbols)
# Fill NANs by forward filling
data_stock_index.ffill(inplace=True)
data_stocks.ffill(inplace=True)
# Make sure the dataframes for the stock index and the single stocks have the same samples to avoid problems in the estimation of the regression model
data_stock_index = pd.merge(data_stock_index, data_stocks[['Date']], how='inner', on='Date')
data_stocks = pd.merge(data_stock_index[['Date']], data_stocks, how='inner', on='Date')
# Set 'Date' as index on both dataframes
data_stock_index.set_index('Date', inplace=True)
data_stocks.set_index('Date', inplace=True)
try:
feature_selection = 'SelectFromModel'
#feature_selection = 'SequentialFeatureSelector'
# Select the best 10 stocks/features (or less)
if feature_selection == 'SelectFromModel':
# Feature/stock selection done using an XGBoost model and the SelectFromModel function from scikit-learn (it uses the feature importance from the model passed as parameter)
# I use some preset hyperparameters' values for the XGBoost but a grid search could be applied to optimize these hyperparameters for the specific problem (e.g., by using bayesian hyperparameter
# optimization, random grid search or a full grid search)
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html
model = xgb.XGBRegressor(objective='reg:squarederror',
booster='gbtree',
n_estimators=100,
random_state=42,
n_jobs=-1)
sfm_selector = SelectFromModel(estimator=model,
max_features=10, # we want to select max 10 features
prefit=False)
sfm_selector.fit(X=data_stocks,
y=data_stock_index.values.ravel())
best_stocks = sfm_selector.get_feature_names_out()
elif feature_selection == 'SequentialFeatureSelector':
# Feature/stock selection done with the SequentialFeatureSelector. This method adds (forward selection) or removes (backward selection) features to form a feature subset in a greedy fashion.
# It should give better results than the SelectFromModel approach but it takes longer to execute, in particular if the number of stocks/features is of decent size.
# I use a linear regression model to speed up the computation.
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html
model = LinearRegression()
# We want to select 10 features/stocks if possible but if there are less in the dataset we set the 'n_features_to_select' variable to 'auto'
if data_stocks.shape[1] < 10:
n_features_to_select = 'auto'
else:
n_features_to_select = 10
sfs_selector = SequentialFeatureSelector(estimator=model,
n_features_to_select=n_features_to_select,
tol=None,
direction='forward',
#direction='backward',
scoring='r2',
n_jobs=-1)
sfs_selector.fit(X=data_stocks,
y=data_stock_index.values.ravel())
best_stocks = sfs_selector.get_feature_names_out()
# Once we have selected the best stocks as list (e.g., ['ZM', 'ZS', 'XEL', ...]) we build back the list containing the stock names as well
# (e.g., ['ZM - Zoom Video Communications, Inc.', 'ZS - Zscaler, Inc.', 'XEL - Xcel Energy Inc.', ...]) and pass this to the Dash component with id 'stocks-dropdown'
# which will trigger the callback to estimate the linear regression model.
# Get stock names linked to stock symbols
best_stocks_symbols_names_list_of_dict = [symbol_name_dict for symbol_name_dict in query_result if symbol_name_dict['Symbol'] in best_stocks]
# Convert list of stock symbols into list of strings in the format 'Stock_Symbol - Company_Name' (ex. 'ZM - Zoom Video Communications, Inc.' for symbol 'ZM') and drop Index Symbol from list
best_stocks_symbols_names_list = [value for value in [f'{value1} - {value2}' for value1, value2 in [list(dict_.values()) for dict_ in best_stocks_symbols_names_list_of_dict]]]
hide_computing_stock_selection_alert = True
return best_stocks_symbols_names_list, hide_best_stocks_alert, hide_computing_stock_selection_alert
except:
hide_computing_stock_selection_alert = True
hide_best_stocks_alert = False
return no_update, hide_best_stocks_alert, hide_computing_stock_selection_alert
if __name__ == '__main__':
app.run_server(debug=False)