-
Notifications
You must be signed in to change notification settings - Fork 0
/
ml.py
193 lines (163 loc) · 7.49 KB
/
ml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import numpy as np
import pandas as pd
from datetime import datetime, time
from auxillary import ticker_scroll, weekdays
from functions import get, baseline, tweets, split, mape, rf, linear, linearSVR, kNearest, regressor
# Dates skipped as holidays when building the per-weekday tweet table.
# Half days are included as a holiday too.
holiday_list = [
    '2018-11-22',  # Thanksgiving
    '2018-11-23',  # Black Friday (closes 13:00)
    '2018-12-05',  # GHWB death
    '2018-12-24',  # Xmas eve (closes 13:00)
    '2018-12-25',  # Xmas
    '2019-01-01',  # New Year's Day
    '2019-01-21',  # MLK birthday
    '2019-02-18',  # Presidents' Day
]

# Pandas frequency strings: width of one tweet-count bin, and the width of
# the window that groups those bins into one row.
tweet_bin_length = '60min'
tweet_window_length = '1440min'  # 24 hours

# Inclusive date range handed to `weekdays(...)`.
start_date = datetime(year=2018, month=10, day=29).date()
end_date = datetime(year=2019, month=3, day=25).date()

# Directory prefix for every CSV / text file the script reads.
DATA_PATH = "Data/"
# ---------------------------------------------------------------------------
# Interactive command loop.
#
# The user types a command word, optionally followed by whitespace-separated
# arguments, e.g. "rf 0.2 500" or "pearson AAPL". Helpers below keep the loop
# itself down to a plain dispatch.
# ---------------------------------------------------------------------------

_MENU = (
    "\n\n\nOPTIONS\n-------\n"
    "Data: 'get', 'baseline' & 'tweets'\n\n"
    "Correlation:'pearson', 'coors' \n\n"
    "Regression: 'rf', 'linear', 'linearSVR' or 'kNearest'\n"
    "(optional: add the n_estimators, tolerance or n_neighbors after test_size)\n\n"
    "Other: 'exit'\n\n\n"
    "What would you like to do?\n"
)


def _parse_test_size(args):
    """Return the test-set fraction given as the first argument.

    Falls back to 0.1 (with a message) when the argument is missing, is not
    a number, or lies outside the closed interval [0, 1].
    """
    try:
        test_size = float(args[0])
    except (IndexError, ValueError):
        print("No test size detected. Default of 0.1 set.")
        return 0.1
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 <= test_size <= 1:
        print("Incorrect test size detected. Default of 0.1 set")
        return 0.1
    return test_size


def _parse_option(args, convert, is_valid, default, missing_msg, invalid_msg):
    """Return the optional second argument, converted and validated.

    args        -- argument tokens following the command word
    convert     -- callable turning the token into a value (e.g. int, float)
    is_valid    -- predicate the converted value must satisfy
    default     -- value returned when the token is absent or rejected
    missing_msg -- printed when the token is absent or cannot be converted
    invalid_msg -- printed when the converted value fails `is_valid`
    """
    try:
        value = convert(args[1])
    except (IndexError, ValueError):
        print(missing_msg)
        return default
    if not is_valid(value):
        print(invalid_msg)
        return default
    return value


def _pearson_report(ticker):
    """Print and plot the per-bin Pearson correlation for one ticker.

    Reads "<ticker> tdb.csv" and "<ticker> stocks ftlbl.csv" from DATA_PATH,
    builds one row per non-holiday weekday holding the tweet counts of each
    bin in that day's window, and correlates each of the 24 bin columns with
    the 'target' column. If the CSV files cannot be loaded, a hint is printed
    and the function returns so the caller can show the menu again.
    """
    import matplotlib.pyplot as plt
    from scipy import stats as st

    print("Processing pearson correaltion for", ticker + "...")
    try:
        # engine needed to be changed, as dates were causing issues.
        # NOTE(review): `date_parser` is deprecated in pandas 2.0; kept here
        # so that parsing behaviour is unchanged.
        tdb = pd.read_csv(DATA_PATH + ticker + " tdb.csv", index_col=0,
                          parse_dates=['timestamp'],
                          date_parser=pd.to_datetime, engine='python')
        sdbtargeted = pd.read_csv(DATA_PATH + ticker + " stocks ftlbl.csv",
                                  index_col=0, parse_dates=[0])
    except (OSError, ValueError):
        print("You must first run the 'get' and 'baseline' commands")
        return

    # Count entries per bin; only the 'likes' count column is used below.
    countdb = tdb.drop(
        ['fullname', 'replies', 'retweets', 'text', 'user'], axis=1)
    countdb = countdb.groupby(
        pd.Grouper(key='timestamp', freq=tweet_bin_length)).count()

    # One row per window, one column per bin inside that window.
    per_window = countdb.groupby(
        pd.Grouper(freq=tweet_window_length))['likes'].agg(list)
    per_window = pd.DataFrame(per_window.tolist(), index=per_window.index)

    # Keep only the weekdays that are not listed as holidays.
    weekdaydf = pd.DataFrame(columns=list(per_window))
    for day in weekdays(start_date, end_date):
        if str(day.date()) not in holiday_list:
            weekdaydf.loc[day] = per_window.loc[day]

    merged = pd.concat([weekdaydf, sdbtargeted], axis=1, sort=False)
    # if targets column is empty (i.e. the API failed for that day), then
    # delete the day, as the data is useless.
    merged = merged.dropna(subset=['target'])

    # pearsonr is given 1-D inputs so that it yields scalar results; the
    # feature columns are cast because `weekdaydf` started out as an empty
    # frame and its columns may therefore not have a numeric dtype.
    rhos = []
    pvals = []
    for hour in range(24):
        rho, pval = st.pearsonr(merged[hour].astype(float), merged['target'])
        rhos.append(float(rho))
        pvals.append(float(pval))

    print('=======\n\nPearson:\n')
    for hour in range(24):
        print('time = {}, rho = {}, pval = {}'.format(
            hour, rhos[hour], pvals[hour]))

    # Bar opacity is 1 - p-value, so low p-values are drawn more solidly.
    colours = [(0, 0.3399, 0.6601, 1 - pval) for pval in pvals]
    y_pos = np.arange(24)
    plt.bar(y_pos, rhos, color=colours)
    plt.xticks(y_pos, tuple(range(24)))
    plt.show()


def _rank_tickers_by_correlation():
    """Print the tickers sorted by absolute mean Pearson coefficient.

    Ticker symbols are read one per line from "tickers.txt" in DATA_PATH.
    For each ticker, "<ticker> tweets ftlbl.csv" is loaded and its columns
    '0'..'23' are each correlated with the 'target' column.
    """
    from scipy import stats as st

    with open(DATA_PATH + 'tickers.txt', 'r') as ticker_file:
        tickerlist = [line.strip() for line in ticker_file if line.strip()]

    coors = dict.fromkeys(tickerlist)
    for ticker in coors:
        df = pd.read_csv(DATA_PATH + ticker + " tweets ftlbl.csv",
                         index_col=0, parse_dates=[0])
        pear = []
        pval = []
        for hour in range(24):
            rho, p_value = st.pearsonr(df[str(hour)], df['target'])
            pear.append(float(rho))
            pval.append(float(p_value))
        # Average over all 24 coefficients, not just the last one.
        coors[ticker] = {'pear': pear, 'pval': pval,
                         'avg_pear': np.mean(pear)}
    print(sorted(coors, key=lambda k: abs(coors[k]['avg_pear'])))


while True:
    ans = input(_MENU)
    # Split on any whitespace: token 0 is the command, the rest its arguments.
    tokens = ans.split()
    if not tokens:
        continue
    command, args = tokens[0], tokens[1:]

    if command == "get":
        ticker_scroll(get)
    elif command == 'baseline':
        ticker_scroll(baseline)
    elif command == 'tweets':
        ticker_scroll(tweets)
    elif command == "rf":
        test_size = _parse_test_size(args)
        n_estimators = _parse_option(
            args, int, lambda value: value >= 2, 1000,
            "No number of estimators detected. Default of 1000 set.",
            "Weird number of estimators detected. Default of 1000 set")
        regressor(rf, test_size, 'MAPE', n_estimators)
    elif command == "linear":
        test_size = _parse_test_size(args)
        regressor(linear, test_size, 'MAPE')
    elif command == "linearSVR":
        test_size = _parse_test_size(args)
        tol = _parse_option(
            args, float, lambda value: 0 <= value <= 1, 1e-5,
            "No tolerance detected. Default of 1e-5 set.",
            "Incorrect teolerance detected. Default of 1e-5 set")
        regressor(linearSVR, test_size, 'MAPE', tol)
    elif command == "kNearest":
        test_size = _parse_test_size(args)
        neigh = _parse_option(
            args, int, lambda value: value >= 2, 3,
            "No number of neighbors detected. Default of 3 set.",
            "Incorrect number of neighbors detected. Default of 3 set")
        regressor(kNearest, test_size, 'MAPE', neigh)
    elif command == "pearson":
        if args:
            ticker = args[0]
        else:
            print("No ticker detected. Default of AAPL set.")
            ticker = "AAPL"
        _pearson_report(ticker)
    elif command == "coors":
        _rank_tickers_by_correlation()
    elif command == "exit":
        break