#!/usr/bin/env python3
import argparse
from datetime import datetime
import time
from create_tables import create_database, main_tables, data_tables, \
    events_tables
from extractor import QuandlCodeExtract, QuandlDataExtraction, \
    GoogleFinanceDataExtraction, YahooFinanceDataExtraction, \
    CSIDataExtractor, NASDAQSectorIndustryExtractor
from load_aux_tables import LoadTables
from build_symbology import create_symbology
from cross_validator import CrossValidate
from utilities.database_queries import query_all_active_tsids
from utilities.user_dir import user_dir
from utilities.database_check import postgres_test
__author__ = 'Josh Schertz'
__copyright__ = 'Copyright (C) 2018 Josh Schertz'
__description__ = 'An automated system to store and maintain financial data.'
__email__ = 'josh[AT]joshschertz[DOT]com'
__license__ = 'GNU AGPLv3'
__maintainer__ = 'Josh Schertz'
__status__ = 'Development'
__url__ = 'https://joshschertz.com/'
__version__ = '1.5.0'
'''
    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as
    published by the Free Software Foundation, either version 3 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
''' pySecMaster.py

This manages the pysecmaster database. It should be run no more than once
per day.

Database maintenance tasks:
    Creates the tables in the database.
    Loads auxiliary tables from included CSV files.
    Downloads all available Quandl Codes for the Quandl databases selected.
    Downloads the specified CSI Data factsheet (stocks, commodities).
    Creates the symbology table, which establishes a unique code for every
        item in the database and translates the different sources' codes.

Database data download tasks:
    Downloads Quandl data based on the download selection criteria, using
        either the official Quandl Codes or implied codes from CSI Data.
    Downloads Google Finance minute stock data.
    Can either append only the new data or replace part of the existing data.
'''
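
# Typical invocations (illustrative only; the flags are defined in the
#   argparse section near the bottom of this file, and the 'quandl.<selection>'
#   form matches the Quandl download templates in test_download_list):
#   python pySecMaster.py --daily-downloads quandl.wiki yahoo google
#   python pySecMaster.py --minute-downloads google --threads 8 --verbose
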
###############################################################################
# Database maintenance options:

userdir = user_dir()

csidata_type = 'stock'      # stock, commodity

# Don't change these unless you know what you are doing
database_url = ['https://www.quandl.com/api/v2/datasets.csv?query=*&'
                'source_code=', '&per_page=300&page=']
# http://www.csidata.com/factsheets.php?type=stock&format=html
csidata_url = 'http://www.csidata.com/factsheets.php?'
tables_to_load = ['data_vendor', 'exchanges']
nasdaq_sector_industry_url = 'http://www.nasdaq.com/screening/' \
                             'companies-by-industry.aspx?'
nasdaq_sector_industry_extractor_exchanges = ['NASDAQ', 'NYSE', 'AMEX']
nasdaq_sector_industry_redownload_time = 7
###############################################################################
# Database data download options:

today = datetime.today()

# Don't change these variables unless you know what you are doing!
quandl_data_url = ['https://www.quandl.com/api/v1/datasets/', '.csv']

google_fin_url = {'root': 'http://www.google.com/finance/getprices?',
                  'ticker': 'q=',
                  'exchange': 'x=',
                  'interval': 'i=',     # 60; 60 seconds is the shortest interval
                  # 'sessions': 'sessions=ext_hours',
                  'period': 'p=',       # 20d; 15d is the longest period for minute data
                  'fields': 'f=d,c,v,o,h,l'}    # The field order doesn't change anything

yahoo_end_date = ('d=%s&e=%s&f=%s' %
                  (str(int(today.month - 1)).zfill(2), today.day, today.year))
yahoo_fin_url = {'root': 'http://real-chart.finance.yahoo.com/table.csv?',
                 'ticker': 's=',    # The exchange is added after the ticker and a '.'
                 'interval': 'g=',  # d, w, m, v: daily, weekly, monthly, dividends
                 'start_date': 'a=00&b=1&c=1900',   # The entire price history
                 'end_date': yahoo_end_date,    # Today's date (MM (zero-indexed); D; YYYY)
                 'csv': 'ignore=.csv'}  # Returns a CSV file
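
# For reference, a sketch of how the Yahoo fragments are presumably joined
#   (the actual URL assembly happens in the extractor classes, so treat this
#   as illustrative only):
#   http://real-chart.finance.yahoo.com/table.csv?s=AAPL&g=d&a=00&b=1&c=1900&d=11&e=31&f=2017&ignore=.csv
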
###############################################################################


def maintenance(database_options, quandl_key, quandl_ticker_source,
                database_list, threads, quandl_update_range,
                csidata_update_range, symbology_sources):
    """
    :param database_options: Dictionary of the postgres database options
    :param quandl_key: Optional string of the Quandl API key
    :param quandl_ticker_source: String of which source the Quandl data should
        use when determining which codes to download (csidata, quandl)
    :param database_list: List of strings indicating which Quandl databases
        should have their codes downloaded (WIKI, GOOG, YAHOO)
    :param threads: Integer of the threads to run when downloading Quandl codes
    :param quandl_update_range: Integer of the number of days before the
        Quandl codes should be updated
    :param csidata_update_range: Integer of the number of days before the CSI
        Data factsheet should be updated
    :param symbology_sources: List of strings of which symbology sources
        should be created (csi_data, tsid, quandl_wiki)
    """

    print('Starting Security Master table maintenance function. This can take '
          'some time to finish if large databases are used. If this fails, '
          'rerun it after a few minutes.')

    # Create the SQL tables if they don't already exist
    create_database(admin_user=database_options['admin_user'],
                    admin_password=database_options['admin_password'],
                    database=database_options['database'],
                    user=database_options['user'])
    main_tables(database=database_options['database'],
                user=database_options['user'],
                password=database_options['password'],
                host=database_options['host'],
                port=database_options['port'])
    data_tables(database=database_options['database'],
                user=database_options['user'],
                password=database_options['password'],
                host=database_options['host'],
                port=database_options['port'])
    events_tables(database=database_options['database'],
                  user=database_options['user'],
                  password=database_options['password'],
                  host=database_options['host'],
                  port=database_options['port'])

    LoadTables(database=database_options['database'],
               user=database_options['user'],
               password=database_options['password'],
               host=database_options['host'],
               port=database_options['port'],
               tables_to_load=tables_to_load,
               load_tables=userdir['load_tables'])

    # Always extract CSI values, as they are used for the symbology table
    CSIDataExtractor(database=database_options['database'],
                     user=database_options['user'],
                     password=database_options['password'],
                     host=database_options['host'],
                     port=database_options['port'],
                     db_url=csidata_url,
                     data_type=csidata_type,
                     redownload_time=csidata_update_range)

    if quandl_ticker_source == 'quandl':
        QuandlCodeExtract(database=database_options['database'],
                          user=database_options['user'],
                          password=database_options['password'],
                          host=database_options['host'],
                          port=database_options['port'],
                          quandl_token=quandl_key,
                          database_list=database_list,
                          database_url=database_url,
                          update_range=quandl_update_range,
                          threads=threads)

    create_symbology(database=database_options['database'],
                     user=database_options['user'],
                     password=database_options['password'],
                     host=database_options['host'],
                     port=database_options['port'],
                     source_list=symbology_sources)

    NASDAQSectorIndustryExtractor(
        database=database_options['database'],
        user=database_options['user'],
        password=database_options['password'],
        host=database_options['host'],
        port=database_options['port'],
        db_url=nasdaq_sector_industry_url,
        exchange_list=nasdaq_sector_industry_extractor_exchanges,
        redownload_time=nasdaq_sector_industry_redownload_time)


def data_download(database_options, quandl_key, download_list, threads=4,
                  verbose=False):
    """ Loops through all provided data sources in download_list, and runs
    the associated data extractor using the provided source variables.

    :param database_options: Dictionary of the postgres database options
    :param quandl_key: Optional string of the Quandl API key
    :param download_list: List of dictionaries, with each dictionary containing
        all of the relevant variables for the specific source
    :param threads: Integer indicating how many threads should be used to
        concurrently download data
    :param verbose: Boolean of whether debugging prints should occur.
    """

    for source in download_list:
        if source['interval'] == 'daily':
            table = 'daily_prices'
            if source['source'] == 'google':
                google_fin_url['interval'] = 'i=' + str(60 * 60 * 24)
            elif source['source'] == 'yahoo':
                yahoo_fin_url['interval'] = 'g=d'
        elif source['interval'] == 'minute':
            table = 'minute_prices'
            if source['source'] == 'google':
                google_fin_url['interval'] = 'i=' + str(60)
            elif source['source'] == 'yahoo':
                raise SystemError('Yahoo Finance does not provide minute '
                                  'data.')
        else:
            raise SystemError('An unsupported interval (%s) was provided to '
                              'data_download in pySecMaster.py' %
                              source['interval'])

        if source['source'] == 'quandl':
            if quandl_key:
                # Download data for selected Quandl codes
                print('\nDownloading all Quandl fields for: %s'
                      '\nNew data will %s the prior %s days\' data' %
                      (source['selection'], source['data_process'],
                       source['replace_days_back']))
                # NOTE: Quandl only allows a single concurrent download with
                #   their free account
                QuandlDataExtraction(
                    database=database_options['database'],
                    user=database_options['user'],
                    password=database_options['password'],
                    host=database_options['host'],
                    port=database_options['port'],
                    quandl_token=quandl_key,
                    db_url=quandl_data_url,
                    download_selection=source['selection'],
                    redownload_time=source['redownload_time'],
                    data_process=source['data_process'],
                    days_back=source['replace_days_back'],
                    threads=2,
                    table=table,
                    load_tables=userdir['load_tables'],
                    verbose=verbose)
            else:
                print('\nNot able to download Quandl data for %s because '
                      'there was no Quandl API key provided.' %
                      (source['selection'],))

        elif source['source'] == 'google':
            # Download data for selected Google Finance codes
            print('\nDownloading all Google Finance fields for: %s'
                  '\nNew data will %s the prior %s days\' data' %
                  (source['selection'], source['data_process'],
                   source['replace_days_back']))
            google_fin_url['period'] = 'p=' + str(source['period']) + 'd'
            GoogleFinanceDataExtraction(
                database=database_options['database'],
                user=database_options['user'],
                password=database_options['password'],
                host=database_options['host'],
                port=database_options['port'],
                db_url=google_fin_url,
                download_selection=source['selection'],
                redownload_time=source['redownload_time'],
                data_process=source['data_process'],
                days_back=source['replace_days_back'],
                threads=threads,
                table=table,
                load_tables=userdir['load_tables'],
                verbose=verbose)

        elif source['source'] == 'yahoo':
            # Download data for selected Yahoo Finance codes
            print('\nDownloading all Yahoo Finance fields for: %s'
                  '\nNew data will %s the prior %s days\' data' %
                  (source['selection'], source['data_process'],
                   source['replace_days_back']))
            YahooFinanceDataExtraction(
                database=database_options['database'],
                user=database_options['user'],
                password=database_options['password'],
                host=database_options['host'],
                port=database_options['port'],
                db_url=yahoo_fin_url,
                download_selection=source['selection'],
                redownload_time=source['redownload_time'],
                data_process=source['data_process'],
                days_back=source['replace_days_back'],
                threads=threads,
                table=table,
                load_tables=userdir['load_tables'],
                verbose=verbose)

        else:
            print('The %s source is currently not implemented. Skipping it.' %
                  source['source'])

    print('All available data values have been downloaded for: %s' %
          download_list)


def post_download_maintenance(database_options, download_list, period=None,
                              verbose=False):
    """ Perform tasks that require all data to be downloaded first, such as
    the source cross validator function.

    :param database_options: Dictionary of the postgres database options
    :param download_list: List of dictionaries, with each dictionary containing
        all of the relevant variables for the specific source
    :param period: Optional integer indicating the prior number of days whose
        values should be cross validated. If None is provided, then the
        entire set of values will be validated.
    :param verbose: Boolean of whether debugging prints should occur.
    """

    intervals = {}
    for source in download_list:
        if source['interval'] == 'daily':
            intervals['daily_prices'] = True
        elif source['interval'] == 'minute':
            print('The cross validator for minute prices is disabled because '
                  'there is currently only one source (Google Finance), so '
                  'it does not make sense to run it. When you add a second '
                  'source, you can re-enable it by uncommenting the line '
                  'below this message in post_download_maintenance within '
                  'pySecMaster.py')
            # intervals['minute_prices'] = True
        else:
            raise SystemError('An unsupported interval (%s) was provided to '
                              'post_download_maintenance in pySecMaster.py' %
                              source['interval'])

    for key, value in intervals.items():
        # The key is the table name to process
        table = key
        if verbose:
            print('Starting cross validator for %s' % table)

        tsids_df = query_all_active_tsids(
            database=database_options['database'],
            user=database_options['user'],
            password=database_options['password'],
            host=database_options['host'],
            port=database_options['port'],
            table=table,
            period=period)
        tsid_list = tsids_df['tsid'].values

        CrossValidate(
            database=database_options['database'],
            user=database_options['user'],
            password=database_options['password'],
            host=database_options['host'],
            port=database_options['port'],
            table=table, tsid_list=tsid_list, period=period, verbose=verbose)


if __name__ == '__main__':

    # Establish the argument parser
    parser = argparse.ArgumentParser(
        description='The pySecMaster is an automated financial securities '
                    'master framework. Currently, NYSE and NASDAQ securities '
                    'will have their daily prices loaded into the database. '
                    'Additional data can be integrated with the system, '
                    'including minute prices, option chain values, tick '
                    'values, corporate events (splits, dividends, IPOs), '
                    'and fundamental data.'
    )

    # Optional arguments
    parser.add_argument('--csidata-update-range', type=int, default=7,
                        help='Number of days before the CSI Data factsheet '
                             'will be refreshed.')
    parser.add_argument('--daily-downloads', type=str, nargs='*',
                        default=['quandl', 'yahoo', 'google'],
                        help='Sources whose daily prices will be downloaded. '
                             'By default, quandl, yahoo and google daily '
                             'prices will be downloaded.')
    parser.add_argument('--database-list', type=str, nargs='+',
                        default=['WIKI'],
                        help='The Quandl databases that will have their codes '
                             'downloaded if the quandl-ticker-source argument '
                             'is set to quandl. Provide selections one after '
                             'the other without quotes. Options include: '
                             'WIKI, GOOG, YAHOO, SEC, EIA, JODI, CURRFX, '
                             'FINRA.')
    parser.add_argument('--minute-downloads', type=str, nargs='*',
                        help='Sources whose minute prices will be downloaded. '
                             'Only google is implemented right now. By '
                             'default, no minute prices are downloaded.')
    parser.add_argument('--quandl-update-range', type=int, default=30,
                        help='Number of days before the ticker tables will be '
                             'refreshed. If the database was not completely '
                             'downloaded within this range, the remainder of '
                             'the codes will attempt to be downloaded.')
    parser.add_argument('--quandl-ticker-source', type=str,
                        choices=['quandl', 'csidata'], default='csidata',
                        help='Determines where the extractor should get the '
                             'ticker codes from when downloading Quandl data. '
                             'Options include using the official list from '
                             'Quandl (quandl), or making implied codes from '
                             'the CSI Data stock factsheet (csidata), which '
                             'is more accurate but tries more non-existent '
                             'tickers.')
    parser.add_argument('--symbology-sources', type=str, nargs='+',
                        default=['csi_data', 'tsid', 'quandl_wiki',
                                 'quandl_eod', 'quandl_goog', 'seeking_alpha',
                                 'yahoo'],
                        help='Sources that will be integrated into the '
                             'symbology table, including having source '
                             'specific identifiers created and linked with '
                             'other sources\' identifiers for the same '
                             'underlying item. The following sources are used '
                             'by default: csi_data, tsid, quandl_wiki, '
                             'quandl_eod, quandl_goog, seeking_alpha, yahoo. '
                             'Omitting key sources may break the system. '
                             'Provide selections one after the other without '
                             'quotes.')
    parser.add_argument('-t', '--threads', type=int,
                        help='Number of threads to allocate to the system. '
                             'All system cores are used by default.')
    parser.add_argument('--validator-period', type=int,
                        help='Prior number of days whose values should be '
                             'cross validated, with 30 being a good option. '
                             'If no value is provided, the entire period will '
                             'be validated.')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Print out the status of the system.')

    # Retrieve the arguments
    args = parser.parse_args()
    ############################################################################
    # Database maintenance options:

    test_database_options = {
        'admin_user': userdir['postgresql']['main_user'],
        'admin_password': userdir['postgresql']['main_password'],
        'database': userdir['postgresql']['pysecmaster_db'],
        'user': userdir['postgresql']['pysecmaster_user'],
        'password': userdir['postgresql']['pysecmaster_password'],
        'host': userdir['postgresql']['pysecmaster_host'],
        'port': userdir['postgresql']['pysecmaster_port'],
    }
    test_quandl_key = userdir['quandl']['quandl_token']

    ############################################################################
    # Database data download options:

    # TODO: Move these parameters to a config file

    # Example download list: should be a list of dictionaries, with the
    #   dictionaries containing all relevant variables for the specific source
    test_download_list = [
        # Quandl WIKI daily data with the wiki selection
        {'source': 'quandl', 'selection': 'wiki', 'interval': 'daily',
         'redownload_time': 60 * 60 * 12, 'data_process': 'replace',
         'replace_days_back': 60},
        # Quandl EOD daily data with the eod selection
        {'source': 'quandl', 'selection': 'eod', 'interval': 'daily',
         'redownload_time': 60 * 60 * 12, 'data_process': 'replace',
         'replace_days_back': 60},
        # Yahoo Finance daily data with us_main - 9300 seconds (2.58 hours)
        {'source': 'yahoo', 'selection': 'us_main', 'interval': 'daily',
         'redownload_time': 60 * 60 * 12, 'data_process': 'replace',
         'replace_days_back': 60},
        # Google daily data with us_main_no_end_date (max of 50 days' prices)
        {'source': 'google', 'selection': 'us_main_no_end_date',
         'interval': 'daily', 'period': 60, 'redownload_time': 60 * 60 * 12,
         'data_process': 'replace', 'replace_days_back': 10},
        # Google minute data with us_main (max of 15 days' prices)
        {'source': 'google', 'selection': 'us_main', 'interval': 'minute',
         'period': 20, 'redownload_time': 60 * 60 * 12,
         'data_process': 'replace', 'replace_days_back': 10}
    ]
    # source: String of which data provider should have their data downloaded
    # selection: String of which data from the source should be downloaded. To
    #   understand what is actually being downloaded, go to either the
    #   query_q_codes function or query_codes function in
    #   utilities/database_queries.py and view the SQL queries.
    #   (Quandl: 'wiki', 'eod', 'goog', 'goog_us_main',
    #   'goog_us_main_no_end_date', 'goog_us_canada_london', 'goog_etf';
    #   Google: 'all', 'us_main', 'us_main_no_end_date', 'us_canada_london')
    # interval: String of what interval the data should be in (daily or
    #   minute).
    # period: Integer of how many days' data should be downloaded (Google
    #   Finance only). Minute data only goes back 15 days, and daily data
    #   only goes back 50 days.
    # redownload_time: Integer representing the time in seconds before the
    #   data is allowed to be re-downloaded. Allows the system to be restarted
    #   without downloading the same data again.
    # data_process: String of how the new data will interact with the existing
    #   data ('replace': replace the prior x days of data (replace_days_back);
    #   'append': append the latest data to the existing data (ignores the
    #   replace_days_back variable)). 'append' requires fewer system resources
    #   since it only adds new values, instead of deleting overlapping values.
    # replace_days_back: Integer of the number of days whose existing data
    #   should be replaced by new data (50000 replaces all existing data). Due
    #   to weekends, the days replaced may differ depending on what day this
    #   function is run.
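
    # For instance, a hypothetical append-only variant of the Quandl WIKI
    #   template above would look like this (replace_days_back is ignored
    #   when data_process is 'append'):
    #   {'source': 'quandl', 'selection': 'wiki', 'interval': 'daily',
    #    'redownload_time': 60 * 60 * 12, 'data_process': 'append',
    #    'replace_days_back': 60}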
    ############################################################################

    # Build the download list from the argparse arguments provided. There is
    #   probably a much better way to do this
    download_list = []
    if args.daily_downloads or args.minute_downloads:
        daily_d = args.daily_downloads
        minute_d = args.minute_downloads

        # Cycle through all of the download templates specified above
        for source in test_download_list:
            if daily_d and source['interval'] == 'daily':
                for cur_d in daily_d:
                    # Need to do a string to string comparison, so that a
                    #   'quandl' template matches arguments like 'quandl.wiki'
                    if source['source'] in cur_d:
                        # For quandl, ensure the template's selection exactly
                        #   matches the argument text after the period
                        if source['source'] == 'quandl' and \
                                source['selection'] == \
                                cur_d[cur_d.find('.') + 1:]:
                            download_list.append(source)
                        elif source['source'] != 'quandl':
                            download_list.append(source)
            if minute_d and source['source'] in minute_d and \
                    source['interval'] == 'minute':
                download_list.append(source)
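
    # For example, '--daily-downloads quandl.wiki yahoo' would select the
    #   Quandl template whose selection is 'wiki' plus the Yahoo daily
    #   template from test_download_list above.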
    if args.threads:
        threads = args.threads
    else:
        import multiprocessing
        threads = multiprocessing.cpu_count()

    # Try connecting to the postgres database
    while True:
        db_available = postgres_test(database_options=test_database_options)
        if db_available:
            print('%s database is available' %
                  test_database_options['database'])
            break
        else:
            print('%s database is unavailable' %
                  test_database_options['database'])
            time.sleep(1)
    maintenance(database_options=test_database_options,
                quandl_key=test_quandl_key,
                quandl_ticker_source=args.quandl_ticker_source,
                database_list=args.database_list,
                threads=threads,
                quandl_update_range=args.quandl_update_range,
                csidata_update_range=args.csidata_update_range,
                symbology_sources=args.symbology_sources)

    if download_list:
        data_download(database_options=test_database_options,
                      quandl_key=test_quandl_key,
                      download_list=download_list,
                      threads=threads,
                      verbose=args.verbose)

        # 15 hours for a complete build; adds ~6 GB
        post_download_maintenance(database_options=test_database_options,
                                  download_list=download_list,
                                  period=args.validator_period,
                                  verbose=args.verbose)
    else:
        print('No download sources were specified for either the daily data '
              'or the minute data, therefore no prices will be downloaded, '
              'nor will the post download maintenance functions be run.')

    print(datetime.now())