-
Notifications
You must be signed in to change notification settings - Fork 0
/
database_creator.py
439 lines (345 loc) · 16.4 KB
/
database_creator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
# -*- coding: utf-8 -*-
"""
Created on Mon May 18 14:56:53 2020
@author: Dainean
"""
#Prepare the python system
import pandas as pd
import numpy as np
import fnmatch #For filtering
import os #move around in our OS
from astropy.io import fits #Working with fits
from astropy.cosmology import WMAP9 as cosmo #Cosmology calculators
import itertools as it #iteration / combination trick used
import seaborn as sb
import matplotlib.pyplot as plt
# Working directory control: detect whether we run on the Kapteyn cluster or a
# home machine (Dropbox/Dainean paths) and change into the matching data dir.
# Fixed: the original fetched cwd twice and had two identical home-machine
# branches; raw strings keep the Windows backslashes literal.
cwd = os.getcwd()
print("Initial working directory is:", cwd)
if '/Users/users/verdult/Thesis/thesis' in cwd:
    print("Working at kapteyn, changing to data directory")
    os.chdir('/net/virgo01/data/users/verdult/Thesis')  # Kapteyn data folder
if 'data' in cwd:
    print("Working in kapteyn data folder")
if 'Dropbox' in cwd or 'Dainean' in cwd:
    print("Working at home, changing to onedrive folder")
    os.chdir(r'D:\Onedrive\Thesis')  # raw string: same value, no escape risk
if 'Onedrive' in cwd:
    print("Working in onedrive folder")
cwd = os.getcwd()
print("Current working directory is:", cwd)
#%%
def pandafy3(filename, remake=False):
    """Load a whole FITS table as a DataFrame, using an HDF5 cache if present.

    Parameters
    ----------
    filename : str
        Path of the FITS file to convert.
    remake : bool, optional
        When True, ignore any cached copy and rebuild from the FITS file.
        (The original read a module-level ``remake`` global; the parameter
        keeps the same default behaviour while making the function
        self-contained.)

    Returns
    -------
    pandas.DataFrame
        One column per FITS column, indexed by the table's first field.

    Notes
    -----
    The original fell through to code that used an unbound variable on the
    cache-hit path; here the cached frame is returned directly.
    """
    if not remake:
        try:
            df = pd.read_hdf('support/SupportDB.h5', 'initial_db')  # cached copy
            print("file found")
            return df
        except (FileNotFoundError, KeyError):
            pass  # fall through and rebuild from the FITS file
    print("creating new file")
    with fits.open(filename) as simple:
        data = simple[1].data        # table payload
        cols = simple[1].columns     # column descriptions
    columns = pd.Series(cols.names)  # names as a Series (string-searchable)
    # Seed with a zero column, stack once, then drop the seed: reproduces the
    # original dtype promotion without the O(n^2) per-column restacking.
    padded = [np.zeros(len(data))] + [data.field(name) for name in columns]
    stacked = np.delete(np.column_stack(padded), 0, 1)
    df = pd.DataFrame(stacked, index=data.field(0), columns=columns.values)
    df.to_hdf('support/tester.h5', 'test_db')  # ~195 MB on disk
    return df
#%%
# Main input catalogue: version 4 of the combined DS-Sersic-SA-kCorr set.
filename_1 = 'fits/combined/DS-Sersic-SA-kCorr_m4.fits'  # which file? #222,617 KB
with fits.open(filename_1) as simple:
    data = simple[1].data        # table payload
    hdr = simple[1].header       # FITS header
    cols = simple[1].columns     # column descriptions
coln = cols.names                # plain list of column names
colnpd = pd.Series(coln)         # as a Series so we can string-search the names
def pandafy(data, columns):
    """Convert selected columns of a FITS record array into a DataFrame.

    Parameters
    ----------
    data : FITS record array (or numpy recarray)
        Table rows; ``data.field(name)`` must yield each named column.
    columns : pandas.Series of str
        Names of the columns to extract, in order.

    Returns
    -------
    pandas.DataFrame
        float32 frame with one column per requested name, indexed by the
        table's fifth field (presumably the catalogue ID column — confirm
        against the input catalogue).

    Notes
    -----
    Fixed: removed the unused ``ARG = columns.index`` line and replaced the
    per-column ``np.column_stack`` growth (accidentally O(n^2)) with a single
    stacking call that yields the same array.
    """
    # Seed with a zero column, stack once, drop the seed — same dtype
    # promotion as the original incremental stacking.
    padded = [np.zeros(len(data))] + [data.field(name) for name in columns]
    stacked = np.delete(np.column_stack(padded), 0, 1)
    return pd.DataFrame(stacked, index=data.field(4),
                        columns=columns.values, dtype='float32')
def pandafy2(filename):
    """Load an entire FITS table into a DataFrame.

    Parameters
    ----------
    filename : str
        Path of the FITS file to read (first table extension is used).

    Returns
    -------
    pandas.DataFrame
        Every column of the table, indexed by the table's first field.

    Notes
    -----
    Fixed: file now closed via a context manager; the unused ``hdr`` read was
    dropped; the per-column stacking (O(n^2)) is replaced by a single
    ``np.column_stack`` call producing the same array.
    """
    with fits.open(filename) as simple:
        data = simple[1].data        # table payload
        cols = simple[1].columns     # column descriptions
    columns = pd.Series(cols.names)  # names as a Series (string-searchable)
    # Seed with a zero column, stack once, drop the seed — same dtype
    # promotion as the original incremental stacking.
    padded = [np.zeros(len(data))] + [data.field(name) for name in columns]
    stacked = np.delete(np.column_stack(padded), 0, 1)
    return pd.DataFrame(stacked, index=data.field(0), columns=columns.values)
def fittify(df, filename='ThesisDB_selected.fits'):
    """Write a DataFrame to disk as a binary FITS table.

    Every DataFrame column becomes a FITS column of format 'D' (float64);
    an existing file at *filename* is overwritten.
    """
    fits_columns = [
        fits.Column(name=df.columns.values[idx], format='D', array=df.iloc[:, idx])
        for idx in range(df.columns.values.size)
    ]
    table = fits.BinTableHDU.from_columns(fits.ColDefs(fits_columns))
    table.writeto(filename, overwrite=True)
#%%
# Load (or rebuild) the cached initial dataframe.
remake = False  # set True to force a rebuild even when the cache exists
#remake = True
try:
    if remake:
        print("New file requested")
        # BUGFIX: the original raised a bare Exception here, which the
        # handler tuple below does NOT catch — remake=True crashed the
        # script. FileNotFoundError routes into the rebuild branch.
        raise FileNotFoundError('remake requested')
    dfm = pd.read_hdf('support/InitialDB.h5', 'initial_db')  # cached copy
    print("file found")
except (FileNotFoundError, KeyError, NameError):
    print("creating new file")
    # Columns 2:430 keep only the numeric data (string columns excluded).
    dfm = pandafy(data, colnpd[2:430])
    dfm.to_hdf('support/InitialDB.h5', 'initial_db')  # ~195 MB on disk
    fittify(dfm, "thesis_gama.fits")
def pandafy3(filename, remake=False):
    """Load a whole FITS table as a DataFrame, caching it in SupportDB.h5.

    Parameters
    ----------
    filename : str
        Path of the FITS file to read when no cache is available.
    remake : bool, optional
        When True, skip the cache and rebuild from the FITS file.

    Returns
    -------
    pandas.DataFrame
        float32 frame with every column of the table, indexed by field 0.

    Notes
    -----
    Fixes over the original: the rebuild branch contained a syntax error
    (``dfm = pandafy2])``), and the cache-hit result was discarded because
    the code after the loop always rebuilt from the FITS file. The cached
    frame is now returned directly, and a rebuild refreshes the cache.
    """
    if not remake:
        try:
            dfm = pd.read_hdf('support/SupportDB.h5', 'initial_db')  # cached copy
            print("file found")
            return dfm
        except (FileNotFoundError, KeyError):
            pass  # fall through and rebuild from the FITS file
    print("creating new file")
    with fits.open(filename) as simple:
        data = simple[1].data        # table payload
        hdr = simple[1].header       # FITS header
        cols = simple[1].columns     # column descriptions
    print(hdr)
    columns = pd.Series(cols.names)  # names as a Series (string-searchable)
    # Seed with a zero column, stack once, drop the seed: same dtype
    # promotion as incremental stacking, without the O(n^2) cost.
    padded = [np.zeros(len(data))] + [data.field(name) for name in columns]
    stacked = np.delete(np.column_stack(padded), 0, 1)
    df = pd.DataFrame(stacked, index=data.field(0),
                      columns=columns.values, dtype='float32')
    df.to_hdf('support/SupportDB.h5', 'initial_db')  # refresh the cache
    return df
# Extinction dataframe: Galactic foreground extinction per band, cached in
# SupportDB.h5 under the 'extinction' key.
remake = False  # set True to force a rebuild
#remake = True
try:
    if remake:
        print("New file requested")
        raise FileNotFoundError('remake requested')  # routes into rebuild branch
    extinc = pd.read_hdf('support/SupportDB.h5', 'extinction')  # cached copy
    print("file found")
except Exception:  # narrowed from bare `except:` — no longer eats KeyboardInterrupt
    print("creating new file")
    extinc = pandafy2('fits/GalacticExtinction.fits')
    extinc.to_hdf('support/SupportDB.h5', 'extinction')
# SDSS Sersic-fit dataframe, cached in SupportDB.h5 under 'SersicSDSS'.
remake = False  # set True to force a rebuild
#remake = True
try:
    if remake:
        print("New file requested")
        raise FileNotFoundError('remake requested')  # routes into rebuild branch
    SDSS = pd.read_hdf('support/SupportDB.h5', 'SersicSDSS')  # cached copy
    print("file found")
except Exception:  # narrowed from bare `except:` — no longer eats KeyboardInterrupt
    print("creating new file")
    SDSS = pandafy2('fits/SersicCatSDSS.fits')
    SDSS.to_hdf('support/SupportDB.h5', 'SersicSDSS')
# UKIDSS Sersic-fit dataframe, cached in SupportDB.h5 under 'SersicUKIDSS'.
remake = False  # set True to force a rebuild
try:
    if remake:
        print("New file requested")
        raise FileNotFoundError('remake requested')  # routes into rebuild branch
    UKID = pd.read_hdf('support/SupportDB.h5', 'SersicUKIDSS')  # cached copy
    print("file found")
except Exception:  # narrowed from bare `except:` — no longer eats KeyboardInterrupt
    print("creating new file")
    UKID = pandafy2('fits/SersicCatUKIDSS.fits')
    UKID.to_hdf('support/SupportDB.h5', 'SersicUKIDSS')
#%%
# Re-open the main input catalogue (duplicate of the cell above; kept so this
# cell can be re-run standalone). Version 4 of the combined set.
filename_1 = 'fits/combined/DS-Sersic-SA-kCorr_m4.fits'  # which file? #222,617 KB
with fits.open(filename_1) as simple:
    data = simple[1].data        # table payload
    hdr = simple[1].header       # FITS header
    cols = simple[1].columns     # column descriptions
coln = cols.names                # plain list of column names
colnpd = pd.Series(coln)         # as a Series so we can string-search the names
def pandafy(data, columns):
    """Convert selected columns of a FITS record array into a DataFrame.

    (Redefinition of the earlier ``pandafy``; kept so this cell runs
    standalone.)

    Parameters
    ----------
    data : FITS record array (or numpy recarray)
        Table rows; ``data.field(name)`` must yield each named column.
    columns : pandas.Series of str
        Names of the columns to extract, in order.

    Returns
    -------
    pandas.DataFrame
        float32 frame indexed by the table's fifth field (presumably the
        catalogue ID column — confirm against the input catalogue).

    Notes
    -----
    Fixed: removed the unused ``ARG = columns.index`` line and the O(n^2)
    per-column restacking.
    """
    # Seed with a zero column, stack once, drop the seed — same dtype
    # promotion as the original incremental stacking.
    padded = [np.zeros(len(data))] + [data.field(name) for name in columns]
    stacked = np.delete(np.column_stack(padded), 0, 1)
    return pd.DataFrame(stacked, index=data.field(4),
                        columns=columns.values, dtype='float32')
def pandafy2(filename):
    """Load an entire FITS table into a float32 DataFrame.

    (Redefinition of the earlier ``pandafy2``; this version forces
    dtype='float32', which the first version did not.)

    Parameters
    ----------
    filename : str
        Path of the FITS file to read (first table extension is used).

    Returns
    -------
    pandas.DataFrame
        float32 frame with every column of the table, indexed by field 0.

    Notes
    -----
    Fixed: file now closed via a context manager; the unused ``hdr`` read was
    dropped; the O(n^2) per-column restacking replaced by a single stack.
    """
    with fits.open(filename) as simple:
        data = simple[1].data        # table payload
        cols = simple[1].columns     # column descriptions
    columns = pd.Series(cols.names)  # names as a Series (string-searchable)
    # Seed with a zero column, stack once, drop the seed — same dtype
    # promotion as the original incremental stacking.
    padded = [np.zeros(len(data))] + [data.field(name) for name in columns]
    stacked = np.delete(np.column_stack(padded), 0, 1)
    return pd.DataFrame(stacked, index=data.field(0),
                        columns=columns.values, dtype='float32')
def fittify(df, filename='ThesisDB_selected.fits'):
    """Write a DataFrame to disk as a binary FITS table.

    (Redefinition of the earlier ``fittify``; kept so this cell runs
    standalone.) Each DataFrame column becomes a FITS column of format 'D'
    (float64); an existing file at *filename* is overwritten.
    """
    fits_columns = [
        fits.Column(name=df.columns.values[idx], format='D', array=df.iloc[:, idx])
        for idx in range(df.columns.values.size)
    ]
    table = fits.BinTableHDU.from_columns(fits.ColDefs(fits_columns))
    table.writeto(filename, overwrite=True)
#%%
# Updated database creation: assemble the analysis dataframe (dfm3) from the
# cached initial dataframe plus the extinction table loaded above.
dfm = pd.read_hdf('support/InitialDB.h5', 'initial_db') #Read the initial dataframe
#-------------------------------------------------------------
GALMAG = dfm[dfm.columns[dfm.columns.str.contains("GALMAG_")]] # all apparent-magnitude columns
GALMAG = GALMAG[GALMAG > -9999] # dummy values (<= -9999) become NaN
dis = cosmo.comoving_distance(dfm['Z']) # comoving distance per redshift Z (astropy Quantity; 'Distance (Mpc)' below implies Mpc)
dfm2 = GALMAG # alias only — dfm2 and GALMAG are the SAME object, so the writes below also touch GALMAG
# Six bookkeeping columns appended after the magnitudes (iloc[:,0:6] of the
# final frame once the magnitudes are sliced off below):
dfm2['CATAID'] = dfm['CATAID'] # catalogue ID
dfm2['RA'] = dfm['RA'] # right ascension
dfm2['DEC'] = dfm['DEC'] # declination
dfm2['NQ'] = dfm['NQ'] # redshift quality flag (may be dropped later)
dfm2['Z'] = dfm['Z'] # redshift
dfm2['Distance (Mpc)'] = dis # comoving distance computed above
dfm3 = dfm2.iloc[:,9:] # keep only the appended columns — assumes GALMAG held exactly 9 per-band magnitudes (TODO confirm)
#(easier this way than creating a fresh dataframe)
#-------------------------------------------------------------
# Prepare the size / surface-brightness source columns, masking dummy values:
galr = dfm[dfm.columns[dfm.columns.str.contains("GALR90_")]] # 90%-light radii (arcsec)
galr = galr[galr > -9999] # dummy values (<= -9999) become NaN
galRE = dfm[dfm.columns[dfm.columns.str.contains("GALRE_")]] # effective (50%-light) radii in arcsec
galRE = galRE[galRE > -9999] # dummy values become NaN
galmu = dfm[dfm.columns[dfm.columns.str.startswith("GALMU")]] # surface brightnesses
galmu = galmu[galmu > -9999] # NOTE(review): galmu is prepared here but never read later in this file
bands = "ugrizYJHK" # the nine photometric bands iterated over below
arcsec = (2*np.pi)/(360*3600) # one arcsecond in radians
#kpc2 = (((np.sin(arcsec)*dis)**2)*10**6).value #arcsec^2 converted to kpc^2 for each distance
#-------------------------------------------------------------
# Per-band structural columns: 9 quantities per band over 9 bands = 81
# columns, occupying iloc[:,6:87] of dfm3. (The old comment said "10 bits of
# data ... 90 columns"; only 9 assignments per band are active below.)
minradius = 0 # minimum physical radius in kpc; cannot be negative
maxradius = 10**18 #max radius in kpc, set extremely high as we switched to outlier detection instead
j = 0 # column index into the extinction table, advanced in step with the bands
for i in bands:
    #dfm3['RELMAG_%s'%(i)] = dfm['GALMAG_%s'%(i)] #relative magnitude, can drop this
    # Absolute magnitude: distance modulus plus k-correction and Galactic
    # foreground extinction applied to the apparent magnitude.
    dfm3['ABSMAG%s'%(i)] = 5 + (GALMAG['GALMAG_%s'%(i)] -5*np.log10((dis.value*10**6))) \
        - dfm['KCORR_%s'%(i)] - extinc.loc[:,'A_u':'A_K_UKIDSS'].iloc[:,j]
    # Physical radius (kpc) containing 90% of the galaxy light.
    r = (np.sin(galr['GALR90_%s'%(i)]*arcsec)*dis.value)*10**3
    dfm3['size90%s'%(i)] = r[((r > minradius) & (r < maxradius))] # keep only plausible radii
    # Physical radius (kpc) containing 50% of the light (effective radius);
    # note the upper cut is four times stricter than for the 90% radius.
    r = (np.sin(galRE['GALRE_%s'%(i)]*arcsec)*dis.value)*10**3
    dfm3['sizeRE%s'%(i)] = r[((r > minradius) & (r < (maxradius/4)))]
    # Sersic index, taken as-is.
    dfm3['SersicIndex%s'%(i)] = dfm['GALINDEX_%s'%(i)]
    # ================================================================
    #dfm3['SersicIndexErr%s'%(i)] = dfm['GALINDEXERR_%s'%(i)] #Error on sersic index, added 14-02-2019
    # Idea was that the Sersic-index error traces shape irregularities, but
    # per the reference article other error sources dominate, so it stays off.
    # ================================================================
    # Ellipticity, taken as-is.
    dfm3['Ellipticity%s'%(i)] = dfm['GALELLIP_%s'%(i)]
    # Absolute magnitude within 10 effective radii:
    # Mv = mv + 5 - 5*log10(distance in pc) - kcorr (no extinction term here — TODO confirm intentional).
    dfm3['ABSMAG10RE%s'%(i)] = (dfm['GALMAG10RE_%s'%(i)] +5 -5*np.log10((dis.value*10**6))) - dfm['KCORR_%s'%(i)]
    # Central surface brightness (mag / arcsec^2), unchanged.
    dfm3['MU@0%s'%(i)] = dfm['GALMU0_%s'%(i)]
    # Surface brightness at the effective radius (mag / arcsec^2), unchanged.
    dfm3['MU@E%s'%(i)] = dfm['GALMUE_%s'%(i)]
    # Average surface brightness within the effective radius (mag / arcsec^2).
    dfm3['MUEAVG%s'%(i)] = dfm['GALMUEAVG_%s'%(i)]
    j += 1
#-------------------------------------------------------------
# Colour indices: absolute-magnitude difference for every unordered pair of
# bands, C(9,2) = 36 columns occupying iloc[:,87:123] of dfm3.
b=np.arange(len(bands)) # band indices 0..8, used to form the pairs
combi = pd.Series(list(it.combinations(b,2))) # all 36 index pairs (credit: atomh33ls on Stack Overflow)
for i in combi:
    dfm3['%s-%s'%(bands[i[0]],bands[i[1]])] = (dfm3['ABSMAG%s'%(bands[i[0]])]-dfm3['ABSMAG%s'%(bands[i[1]])])
#-------------------------------------------------------------
# Extract the spectral-line flux information.
equivW = dfm[dfm.columns[dfm.columns.str.endswith("EW")]] # all equivalent-width columns
#-------------------------------------------------------------
# Spectral information: 52 columns occupying iloc[:,123:175] of dfm3.
# 4000 Angstrom break strength:
dfm3['D4000N'] = dfm['D4000N']
# All equivalent widths (measured line flux relative to the continuum):
for i in range(len(equivW.columns)):
    dfm3[equivW.columns[i]] = equivW.iloc[:,i]
# ==================================
# Database construction done; now remove incomplete rows.
df3 = dfm3.dropna() #Drop any rows that have NaN in them. This brings us down to 42289 rows.
# NOTE(review): the row counts in these comments disagree (42289 / 36769 /
# 34981) — they come from different runs of the pipeline; verify on re-run.
df = df3[df3 > -99999].dropna() # some equivalent widths carry dummy values (<= -99999); drop those rows too
# Split off the three column groups (offsets per the section comments above):
phot = df.iloc[:,6:87] # per-band structural columns
colour = df.iloc[:,87:123] # colour indices
spectral = df.iloc[:,123:175] # spectral columns