-
Notifications
You must be signed in to change notification settings - Fork 0
/
load_datasets.py
53 lines (43 loc) · 1.58 KB
/
load_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
'''
Kaggle Dataset Loader
To avoid a giant repository, datasets are not synced with the repository.
Datasets are stored in a 'Data' folder within each project. If these
datasets are from Kaggle, then a 'kaggle.dat' file should be present in
the project folder (next to 'Data') with the official name of the
contest. This name is passed to the Kaggle-cli ('kg') to download the
dataset into 'Data'; any .zip or .gz files found there are then
extracted with 7z.
Inputs
Username: Kaggle Username
Password: Kaggle Password
Eg. python3 load_datasets.py username password
'''
import os
import subprocess
import sys
#File extensions (without the dot) that are handed to 7z for extraction
ARCHIVE_EXTENSIONS = ('zip', 'gz')


def _is_archive(filename):
    '''Return True when filename has a .zip or .gz extension (any letter case).'''
    return os.path.splitext(filename)[1][1:].lower() in ARCHIVE_EXTENSIONS


def _read_contest(kaggle_config):
    '''Return the contest name stored in kaggle_config without surrounding whitespace.'''
    with open(kaggle_config, 'r') as config_file:
        return config_file.read().strip()


def _load_project(project, username, password):
    '''
    Download and extract the Kaggle dataset of a single project folder.

    The project is skipped (with a message) when it has no 'Data' folder,
    no 'kaggle.dat' file, or an empty 'kaggle.dat'. Failures of the external
    'kg' and '7z' commands are reported but do not stop the run, so the
    remaining archives and projects are still processed.

    project: name of the project folder inside the current directory
    username: Kaggle username passed to 'kg download -u'
    password: Kaggle password passed to 'kg download -p'
    '''
    #Looking for Data folder
    data_path = os.path.join('.', project, 'Data')
    print('Data path: ', data_path)
    if not os.path.isdir(data_path):
        print('Could not find Data folder')
        return

    #Looking for Kaggle file
    kaggle_config = os.path.join('.', project, 'kaggle.dat')
    if not os.path.isfile(kaggle_config):
        print('Could not find kaggle.dat')
        return

    #Reading Kaggle file
    print('Found kaggle config file: ', kaggle_config)
    contest = _read_contest(kaggle_config)
    if not contest:
        #An empty name would only make the downloader fail
        print('kaggle.dat is empty, skipping')
        return

    #Loading data
    print('Loading contest data')
    print('Contest: ', contest)
    #NOTE(review): the password is passed as a command-line argument and may
    #be visible to other users through the process list - confirm acceptable
    download = subprocess.run(
        ['kg', 'download', '-u', username, '-p', password, '-c', contest],
        cwd=data_path)
    if download.returncode != 0:
        #Keep going: archives already present in Data can still be extracted
        print('Download failed with exit code: ', download.returncode)

    #Extracting compressed files
    for name in os.listdir(data_path):
        if not _is_archive(name):
            continue
        print('Extracting: ', name)
        extraction = subprocess.run(['7z', 'x', name], cwd=data_path)
        if extraction.returncode != 0:
            print('Extraction failed with exit code: ', extraction.returncode)


def main(argv=None):
    '''
    Load the Kaggle datasets of every project folder in the current directory.

    argv: argument list in sys.argv layout (program name, username,
          password); defaults to sys.argv when omitted.

    Returns the process exit status: 2 when the username or password is
    missing, 0 otherwise.
    '''
    if argv is None:
        argv = sys.argv
    #Validate the credentials up front instead of failing with an IndexError
    #in the middle of the run
    if len(argv) < 3:
        print('Usage: python3 load_datasets.py username password',
              file=sys.stderr)
        return 2
    username, password = argv[1], argv[2]

    for entry in os.listdir():
        #Checking for files and hidden folders
        if entry.startswith('.') or not os.path.isdir(os.path.join('.', entry)):
            continue
        _load_project(entry, username, password)
    return 0


if __name__ == '__main__':
    sys.exit(main())