-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathprocessKeplerData.py
183 lines (136 loc) · 7.63 KB
/
processKeplerData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# Import Generic libraries
import numpy as np
import pandas as pd
# Import Libraries to process the kepler Data
import kplr
import pyfits
from PyKE.kepstitch import kepstitch
from PyKE.kepflatten import kepflatten
from algorithms.dtw import LB_Keogh
# Import Library for parallel computing
import multiprocessing
from joblib import Parallel, delayed
# Log file name handed to the PyKE tools (kepstitch / kepflatten).
LOG_FILENAME = 'processing_kepler_data.log'

# Read the planets list from the csv file.
planets = pd.read_csv('data/candidate_planets.csv')

# Drop the rows labelled 0 and 1 (the baseline light curve chosen below).
# `.ix` has been removed from pandas; on the default integer index it was
# label-based, so `.loc[2:]` selects exactly the same rows.  The explicit
# copy keeps the column assignment below from writing into a slice of the
# frame returned by `read_csv`.
planets = planets.loc[2:].copy()

# Remove the K and transform to float type, allowing the kplr library to
# process the name.
planets['kepoi_name'] = planets['kepoi_name'].str.replace('K', '').astype(float)

# Get some counts of the data, split by disposition.
confirmed = planets.loc[planets['koi_disposition'] == 'CONFIRMED']
false_positive = planets.loc[planets['koi_disposition'] == 'FALSE POSITIVE']
candidate = planets.loc[planets['koi_disposition'] == 'CANDIDATE']

# Single-argument print(...) produces the same output on Python 2 and 3.
print("Number of total Planets: {}, Confirmed: {}, False Positive: {}, Candidates: {}"
      .format(planets.shape[0], confirmed.shape[0], false_positive.shape[0], candidate.shape[0]))

# Column names of the resulting dataframe (one row per Kepler object).
DF_COLUMNS = ('kepoi_name', 'koi_disposition',
              'dtw_sap_flux', 'sap_flux_mean', 'sap_flux_std', 'sap_flux_min', 'sap_flux_max',
              'dtw_sap_bkg', 'sap_bkg_mean', 'sap_bkg_std', 'sap_bkg_min', 'sap_bkg_max',
              'dtw_pd_sap_flux', 'pd_sap_flux_mean', 'pd_sap_flux_std', 'pd_sap_flux_min', 'pd_sap_flux_max',
              'dtw_det_sap_flux', 'det_sap_flux_mean', 'det_sap_flux_std', 'det_sap_flux_min', 'det_sap_flux_max',
              'mom_centr1_mean', 'mom_centr1_std', 'mom_centr1_min', 'mom_centr1_max',
              'mom_centr2_mean', 'mom_centr2_std', 'mom_centr2_min', 'mom_centr2_max',
              'pos_corr1_mean', 'pos_corr1_std', 'pos_corr1_min', 'pos_corr1_max',
              'pos_corr2_mean', 'pos_corr2_std', 'pos_corr2_min', 'pos_corr2_max')

# Create the (empty) holder dataframe.
df = pd.DataFrame(columns=DF_COLUMNS)
print("Holder DataFrame Created")

# Open the baseline curve FITS file.
hdu = pyfits.open("data/detrended/752.01.fits")
try:
    # Grab the table data from the first extension.
    hdu_data = hdu[1].data
    # Create the baseline time series for SAP_FLUX, SAP_BKG, PDCSAP_FLUX,
    # DETSAP_FLUX.  np.array(...) copies each column, so the series stay
    # usable after the file is closed.
    baseline_sap_flux = pd.Series(np.array(hdu_data["SAP_FLUX"]))
    baseline_sap_bkg = pd.Series(np.array(hdu_data["SAP_BKG"]))
    baseline_pd_sap_flux = pd.Series(np.array(hdu_data["PDCSAP_FLUX"]))
    baseline_det_sap_flux = pd.Series(np.array(hdu_data["DETSAP_FLUX"]))
finally:
    # Release the file handle; the original left it open for the whole run.
    hdu.close()
print("Baselines Variables Created")
def _describe_stats(series):
    """Return [mean, std, min, max] of *series* as reported by ``describe()``."""
    desc = series.describe()
    return [desc['mean'], desc['std'], desc['min'], desc['max']]


def processPlanetData(index, planets, df, baseline_sap_flux, baseline_sap_bkg, baseline_pd_sap_flux,
                      baseline_det_sap_flux):
    """Download, stitch, detrend and summarise one Kepler Object of Interest.

    The light curves of the object at position ``index`` of ``planets`` are
    fetched through kplr, stitched with ``kepstitch`` and detrended with
    ``kepflatten``.  The detrended file is then reduced to one feature row:
    an ``LB_Keogh`` distance to each baseline series plus mean/std/min/max of
    every extracted column.

    Args:
        index: Positional row in ``planets`` to process; also used as the
            row label written into ``df``.
        planets: DataFrame providing the 'kepoi_name' and 'koi_disposition'
            columns.
        df: Result DataFrame; the feature row is stored at ``df.loc[index]``.
        baseline_sap_flux: Baseline series compared against SAP_FLUX.
        baseline_sap_bkg: Baseline series compared against SAP_BKG.
        baseline_pd_sap_flux: Baseline series compared against PDCSAP_FLUX.
        baseline_det_sap_flux: Baseline series compared against DETSAP_FLUX.

    Returns:
        The list of values written to ``df.loc[index]``.  Returning it lets
        a caller collect rows even when this function runs in another
        process, where the in-place write to ``df`` is not visible to the
        parent.
    """
    # Get Kepler Object of Interest name and disposition (single row lookup).
    planet = planets.iloc[index]
    kepoi_name = planet['kepoi_name']
    koi_disposition = planet['koi_disposition']

    # File names for the resulting stitched file and detrended file.
    filenameStitch = "data/stitched/{}.fits".format(kepoi_name)
    filenameDetrend = "data/detrended/{}.fits".format(kepoi_name)

    # Initialize kplr API.
    client = kplr.API()
    print("Processing Kepler Object: {} at index: {}".format(kepoi_name, index))

    # Find the Kepler Object of Interest and get its light curve data sets.
    koi = client.koi(kepoi_name)
    lcs = koi.get_light_curves(short_cadence=False, fetch=True, clobber=False)
    print("Got Kepler Object of Interest and Light Curve Files for {}".format(kepoi_name))

    # Comma-separated list of the light curve paths (no trailing comma).
    lc_list_clean = ",".join(str(lc.filename) for lc in lcs)
    print("Got path list of light curves.")

    # Stitch together the light curve quarters.
    kepstitch(lc_list_clean, filenameStitch, True, False, LOG_FILENAME, 0)
    print("Finished Stiching the data for {}".format(kepoi_name))

    # Detrend the light curve.
    kepflatten(filenameStitch, filenameDetrend, "PDCSAP_FLUX", "PDCSAP_FLUX_ERR",
               3.0, 1.0, 3.0, 3, 10, "0,0", False, True, False, LOG_FILENAME, 0, True)
    print("Finished Detrending the data for {}".format(kepoi_name))

    # Open the detrended FITS file and copy out every feature column.
    # np.array(...) copies the data, so the file can be closed right after;
    # the original never closed it and leaked one handle per object.
    hdu = pyfits.open(filenameDetrend)
    try:
        hdu_data = hdu[1].data
        sap_flux = pd.Series(np.array(hdu_data["SAP_FLUX"]))
        sap_bkg = pd.Series(np.array(hdu_data["SAP_BKG"]))
        pd_sap_flux = pd.Series(np.array(hdu_data["PDCSAP_FLUX"]))
        det_sap_flux = pd.Series(np.array(hdu_data["DETSAP_FLUX"]))
        mom_centr1 = pd.Series(np.array(hdu_data["MOM_CENTR1"]))
        mom_centr2 = pd.Series(np.array(hdu_data["MOM_CENTR2"]))
        pos_corr1 = pd.Series(np.array(hdu_data["POS_CORR1"]))
        pos_corr2 = pd.Series(np.array(hdu_data["POS_CORR2"]))
    finally:
        hdu.close()

    # Try to apply DTW with the baseline.  On failure fall back to NaN for all
    # four distances.  A real float NaN (not the string 'nan') keeps the
    # columns numeric, and `except Exception` no longer swallows
    # KeyboardInterrupt / SystemExit like the bare `except:` did.
    try:
        dtw_sap_flux = LB_Keogh(sap_flux, baseline_sap_flux, 10)
        dtw_sap_bkg = LB_Keogh(sap_bkg, baseline_sap_bkg, 10)
        dtw_pd_sap_flux = LB_Keogh(pd_sap_flux, baseline_pd_sap_flux, 10)
        dtw_det_sap_flux = LB_Keogh(det_sap_flux, baseline_det_sap_flux, 10)
    except Exception:
        dtw_sap_flux = dtw_sap_bkg = dtw_pd_sap_flux = dtw_det_sap_flux = np.nan
    print("dtw_sap_flux: {}, dtw_sap_bkg: {}, dtw_pd_sap_flux: {}, dtw_det_sap_flux: {}".format(
        dtw_sap_flux, dtw_sap_bkg, dtw_pd_sap_flux, dtw_det_sap_flux))

    # Describe the features extracted (MEAN, STD, MIN, MAX).
    stats_sap_flux = _describe_stats(sap_flux)
    stats_sap_bkg = _describe_stats(sap_bkg)
    stats_pd_sap_flux = _describe_stats(pd_sap_flux)
    stats_det_sap_flux = _describe_stats(det_sap_flux)
    stats_mom_centr1 = _describe_stats(mom_centr1)
    stats_mom_centr2 = _describe_stats(mom_centr2)
    stats_pos_corr1 = _describe_stats(pos_corr1)
    stats_pos_corr2 = _describe_stats(pos_corr2)
    print("Features Described Correctly.")

    # Assemble the row: name, disposition, then each flux group as
    # [dtw, mean, std, min, max], then the centroid / position-correction
    # groups as [mean, std, min, max].
    row = ([kepoi_name, koi_disposition]
           + [dtw_sap_flux] + stats_sap_flux
           + [dtw_sap_bkg] + stats_sap_bkg
           + [dtw_pd_sap_flux] + stats_pd_sap_flux
           + [dtw_det_sap_flux] + stats_det_sap_flux
           + stats_mom_centr1 + stats_mom_centr2
           + stats_pos_corr1 + stats_pos_corr2)

    # Apply the features to the dataframe.
    df.loc[index] = row
    print("Dataframe Row Added Correctly.")
    return row
# Number of worker processes (set to 1 because not all computers allow
# multiprocessing).
num_cores = 1
# Uncomment if you want to try multiprocessing
# num_cores = multiprocessing.cpu_count()

# Upper bound on the number of objects processed (kept small for notebook
# purposes).
LIMIT = 2

# Never index past the end of the planets table.
indices = list(range(0, min(LIMIT, len(planets))))

results = Parallel(n_jobs=num_cores)(
    delayed(processPlanetData)(
        index, planets, df, baseline_sap_flux, baseline_sap_bkg,
        baseline_pd_sap_flux, baseline_det_sap_flux)
    for index in indices)

# With more than one job the workers write into their own copy of `df`, so
# the parent frame would stay empty.  Store any returned rows here; a None
# result means the worker only wrote in place, and is skipped.
for index, row in zip(indices, results):
    if row is not None:
        df.loc[index] = row

df.to_csv("data/kepler_pre_ml_notebook.csv")
print(df.head())