# import_data.py
import datetime
import glob
import os
import re

import numpy as np
import pandas as pd
import scipy.sparse
import xarray as xr
from dateutil.relativedelta import relativedelta

def lon_shift_360_2_180(ds, lon_name='lon'):
    ''' Shift lon from 0:360 to -180:180 space.
    ds - Input dataset
    lon_name - Name of longitude variable to shift
    '''
    ds[lon_name] = ((ds[lon_name] + 180) % 360) - 180
    return ds
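
def _example_lon_shift_360_2_180():
    # Illustrative usage sketch, not part of the original module; the data
    # here are made up. Note the helper rewrites the coordinate in place and
    # does NOT re-sort it, so a sortby('lon') may be wanted afterwards.
    ds = xr.Dataset(coords={'lon': ('lon', np.array([0.0, 90.0, 180.0, 270.0]))})
    shifted = lon_shift_360_2_180(ds)
    print(shifted.lon.values)  # [   0.   90. -180.  -90.]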

def preprocess_time_monthly(x):
    ''' Preprocess time variables from GFDL format to SIPN2 format.
    Convert time to initialization time and forecast lead time (to fit into orthogonal matrices).
    Input Dims: lat x lon x Time
    Output Dims: lat x lon x init_time x fore_time
    Here fore_time is represented as monthly increments.
    '''
    Nmons = x.average_T1.size
    m_i = np.arange(0, Nmons)
    m_dt = ['month' for _ in m_i]  # list of 'month'

    # Set record dimension 'time' to the beginning of the averaging period 'average_T1'
    x['time'] = x.average_T1

    # Grab forecast times
    xtimes = xr.decode_cf(x).time.values

    # Get initialization time
    x.coords['init_time'] = xtimes[0]  # first time is the initialization
    x.coords['init_time'].attrs['comments'] = 'Initialization time of forecast'

    # Get forecast time (as monthly indices from init_time)
    x = x.rename({'time': 'fore_time'})
    x.coords['fore_time'] = xr.DataArray(m_i, dims='fore_time')

    # Set time offset units for the fore_time index
    x.coords['fore_offset'] = xr.DataArray(m_dt, dims='fore_time', coords={'fore_time': x.fore_time})
    return x

def preprocess_time_monthly_Cansips(x):
    ''' Preprocess time variables from CanSIPS format to SIPN2 format.
    Convert time to initialization time and forecast lead time (to fit into orthogonal matrices).
    Input Dims: lat x lon x Time
    Output Dims: lat x lon x init_time x fore_time
    Here fore_time is represented as monthly increments.
    '''
    Nmons = x.leadtime.size
    m_i = np.arange(0, Nmons)
    m_dt = ['month' for _ in m_i]  # list of 'month'

    # Set init_time
    x.coords['init_time'] = x.reftime.isel(time=0)
    x.coords['init_time'].attrs['comments'] = 'Initialization time of forecast'
    x = x.drop(['reftime', 'leadtime'])

    # Get forecast time (as monthly indices from init_time)
    x = x.rename({'time': 'fore_time'})
    x.coords['fore_time'] = xr.DataArray(m_i, dims='fore_time')

    # Set time offset units for the fore_time index
    x.coords['fore_offset'] = xr.DataArray(m_dt, dims='fore_time', coords={'fore_time': x.fore_time})
    return x

def preprocess_time(x):
    ''' Convert time to initialization time and forecast lead time (to fit into orthogonal matrices).
    Input Dims: lat x lon x Time
    Output Dims: lat x lon x init_time x fore_time
    '''
    # Set record dimension 'time' to the beginning of the averaging period 'average_T1'
    x['time'] = x.average_T1

    # Grab forecast times
    xtimes = xr.decode_cf(x).time.values

    # Get initialization time
    x.coords['init_time'] = xtimes[0]  # first time is the initialization
    x.coords['init_time'].attrs['comments'] = 'Initialization time of forecast'

    # Get forecast time (as timedeltas from init_time)
    x = x.rename({'time': 'fore_time'})
    x.coords['fore_time'] = xtimes - xtimes[0]
    return x

def get_valid_time(ds, init_dim='init_time', fore_dim='fore_time'):
    ''' Given a dataset with init_time and fore_time coords, calculate the valid_time coord.'''
    if 'fore_offset' in ds.coords:
        # Then fore_time is just an index into fore_offset (i.e. monthly data)
        # TODO: remove hard-coded months (get the unit from fore_offset)
        fore_time_offset = np.array([relativedelta(months=+x) for x in ds[fore_dim].values])
        # Cast datetime64[ns] to python datetime objects so they can be added
        # to relativedelta offsets, then convert the result back
        valid_time = (xr.DataArray(ds[init_dim].values.astype('M8[D]').astype('O'), dims=init_dim)
                      + xr.DataArray(fore_time_offset, dims=fore_dim))
        ds.coords['valid_time'] = valid_time.astype('datetime64[ns]')
    else:
        ds.coords['valid_time'] = ds[init_dim] + ds[fore_dim]
    return ds
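
def _example_get_valid_time():
    # Illustrative usage sketch with made-up data: two init dates with three
    # monthly leads yield a 2D (init_time x fore_time) valid_time coord.
    ds = xr.Dataset(coords={
        'init_time': ('init_time', np.array(['2018-01-01', '2018-02-01'], dtype='datetime64[ns]')),
        'fore_time': ('fore_time', np.arange(3))})
    ds.coords['fore_offset'] = xr.DataArray(['month'] * 3, dims='fore_time')
    ds = get_valid_time(ds)
    print(ds.valid_time.shape)  # (2, 3)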

def rename_coords(ds):
    ''' Rename coordinate dimensions to SIPN2 standard names via regex matching.'''
    c_cords = list(ds.coords.dims.keys())
    c_dict = {'.*lat': 'lat', '.*lon': 'lon', '.*forecast_time': 'fore_time',
              '.*initial_time': 'init_time', '.*ensemble': 'ensemble'}
    new_dict = {}
    for key, value in c_dict.items():
        r = re.compile(key)
        newlist = list(filter(r.match, c_cords))
        if len(newlist) > 0:
            new_dict[newlist[0]] = value
    ds = ds.rename(new_dict)
    return ds

def rename_vars(ds=None, var_dict=None):
    ''' Rename data variables to SIPN2 standard names via regex matching.'''
    c_vars = list(ds.data_vars.keys())
    new_dict = {}
    for key, value in var_dict.items():
        r = re.compile(key)
        newlist = list(filter(r.match, c_vars))
        if len(newlist) > 0:
            new_dict[newlist[0]] = value
    ds = ds.rename(new_dict)
    return ds
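
def _example_rename_vars():
    # Illustrative usage sketch: regex keys map raw model variable names to
    # SIPN2 names. 'SICN_raw' is a made-up raw variable name.
    ds = xr.Dataset({'SICN_raw': (('y', 'x'), np.zeros((2, 2)))})
    ds = rename_vars(ds=ds, var_dict={'SICN.*': 'sic'})
    print(list(ds.data_vars))  # ['sic']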

def open_1_member_monthly(cfiles, e):
    ds = xr.open_mfdataset(cfiles, concat_dim='init_time', decode_times=False,
                           preprocess=preprocess_time_monthly,
                           autoclose=True)
    # Sort init_time (if more than one)
    if ds.init_time.size > 1:
        ds = ds.reindex(init_time=sorted(ds.init_time.values))
    # Add ensemble coord
    ds.coords['ensemble'] = e
    return ds
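
def _example_open_1_member_monthly():
    # Illustrative usage sketch: open all monthly files for one ensemble
    # member and tag them with ensemble index 0. The glob pattern and path
    # are hypothetical.
    cfiles = sorted(glob.glob('/path/to/model/monthly/*ens01*.nc'))
    return open_1_member_monthly(cfiles, e=0)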

def open_1_member(cfiles, e):
    ds = xr.open_mfdataset(cfiles, concat_dim='init_time', decode_times=False,
                           preprocess=preprocess_time,
                           autoclose=True)
    # Sort init_time (if more than one)
    if ds.init_time.size > 1:
        ds = ds.reindex(init_time=sorted(ds.init_time.values))
    # Some daily GFDL FLOR forecasts run for 10 years instead of 1; keep only the first year here
    ds = ds.isel(fore_time=slice(0, 365))
    # Add ensemble coord
    ds.coords['ensemble'] = e
    return ds

def readbinfile(f, nx, ny):
    ''' Read a flat binary file of int32 values scaled by 1e-5 (NSIDC grid file convention).'''
    with open(f, 'rb') as fid:
        data_array = np.fromfile(fid, np.int32) * 1e-5
    return data_array.reshape((nx, ny))


def get_stero_N_grid(grid_dir):
    # Get info about the target NSIDC polar stereographic North grid
    flat = os.path.join(grid_dir, 'psn25lats_v3.dat')
    flon = os.path.join(grid_dir, 'psn25lons_v3.dat')
    NY = 304
    NX = 448
    lat = readbinfile(flat, NX, NY).T
    lon = readbinfile(flon, NX, NY).T
    # TODO: Add cell corner lat/lon
    return xr.Dataset({'lat': (['x', 'y'], lat), 'lon': (['x', 'y'], lon)})

def naive_fast(latvar, lonvar, lat0, lon0):
    ''' Find the grid index nearest to (lat0, lon0) by naive squared-degree distance.'''
    latvals = latvar[:]
    lonvals = lonvar[:]
    dist_sq = (latvals - lat0)**2 + (lonvals - lon0)**2
    minindex_flattened = dist_sq.argmin()  # 1D index of min element
    iy_min, ix_min = np.unravel_index(minindex_flattened, latvals.shape)
    return iy_min, ix_min
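
def _example_naive_fast():
    # Illustrative usage sketch on a made-up regular grid: recover the
    # (iy, ix) index of the cell centered nearest to 75N, 120W.
    lat2d, lon2d = np.meshgrid(np.arange(60, 90), np.arange(-180, 180), indexing='ij')
    iy, ix = naive_fast(lat2d, lon2d, lat0=75.0, lon0=-120.0)
    print(lat2d[iy, ix], lon2d[iy, ix])  # 75 -120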

def cell_bounds_to_corners(gridinfo=None, varname=None):
    ''' Some models/obs give the four corner lat/lons for each cell. xesmf needs the bounds on an
    N+1 grid; this converts the former to the latter.'''
    # Add cell bound coords (lat_b and lon_b)
    n_j = gridinfo.grid_dims.values[1]
    n_i = gridinfo.grid_dims.values[0]
    nj_b = np.arange(0, n_j + 1)  # indices of cell corners
    ni_b = np.arange(0, n_i + 1)
    # Grab all corners as arrays
    dim_out = tuple(np.flip(gridinfo.grid_dims.transpose().values, 0))
    ul = gridinfo[varname].isel(grid_corners=0).values.reshape(dim_out)
    ll = gridinfo[varname].isel(grid_corners=1).values.reshape(dim_out)
    lr = gridinfo[varname].isel(grid_corners=2).values.reshape(dim_out)
    ur = gridinfo[varname].isel(grid_corners=3).values.reshape(dim_out)
    # Merge together
    m1 = np.concatenate((ul, ur[:, 0][:, None]), axis=1)  # append ur column at right
    m2 = np.append(ll[-1, :], lr[-1, 0])
    m3 = np.concatenate((m1, m2[:, None].T), axis=0)  # append ll and lr row at bottom
    ds_out = xr.DataArray(m3, dims=('nj_b', 'ni_b'), coords={'nj_b': nj_b, 'ni_b': ni_b})
    ds_out = np.rad2deg(ds_out)  # rad to deg (numpy ufuncs work on DataArrays; xr.ufuncs was removed)
    return ds_out

def cell_bounds_to_corners_GFDL(gridinfo=None, varname=None):
    ''' GFDL provides the four corner lat/lons for each cell. xesmf needs the bounds on an
    N+1 grid; this converts the former to the latter (GFDL corner ordering).'''
    # Add cell bound coords (lat_b and lon_b)
    n_j = gridinfo.grid_dims.values[1]
    n_i = gridinfo.grid_dims.values[0]
    nj_b = np.arange(0, n_j + 1)  # indices of cell corners
    ni_b = np.arange(0, n_i + 1)
    # Grab all corners as arrays
    dim_out = tuple(np.flip(gridinfo.grid_dims.T.values, 0))
    ll = gridinfo[varname].isel(grid_corners=0).values.reshape(dim_out)
    lr = gridinfo[varname].isel(grid_corners=1).values.reshape(dim_out)
    ur = gridinfo[varname].isel(grid_corners=2).values.reshape(dim_out)
    ul = gridinfo[varname].isel(grid_corners=3).values.reshape(dim_out)
    # Merge together
    m1 = np.concatenate((ul, ur[:, -1][:, None]), axis=1)  # append ur column at right
    m2 = np.append(ll[0, :], lr[0, -1])
    m3 = np.concatenate((m2[:, None].T, m1), axis=0)  # prepend ll and lr row at top
    ds_out = xr.DataArray(m3, dims=('nj_b', 'ni_b'), coords={'nj_b': nj_b, 'ni_b': ni_b})
    ds_out = np.rad2deg(ds_out)  # rad to deg
    return ds_out

def add_matrix_NaNs(regridder):
    ''' Mark destination cells that receive no source weights as NaN (instead of 0) in an xESMF regridder.'''
    X = regridder.A
    M = scipy.sparse.csr_matrix(X)
    num_nonzeros = np.diff(M.indptr)
    M[num_nonzeros == 0, 0] = np.nan
    regridder.A = scipy.sparse.coo_matrix(M)
    return regridder
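
def _example_add_matrix_NaNs():
    # Illustrative usage sketch, assuming xESMF is installed and (as in the
    # older xESMF releases this module targets) the regridder exposes its
    # sparse weight matrix as `.A`. The grids here are made up.
    import xesmf as xe
    grid_in = xr.Dataset({'lat': (['lat'], np.arange(60.0, 90.0, 1.0)),
                          'lon': (['lon'], np.arange(0.0, 360.0, 1.0))})
    grid_out = xr.Dataset({'lat': (['lat'], np.arange(60.0, 90.0, 0.5)),
                           'lon': (['lon'], np.arange(0.0, 360.0, 0.5))})
    regridder = xe.Regridder(grid_in, grid_out, 'bilinear')
    return add_matrix_NaNs(regridder)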

def load_grid_info(grid_file=None, model=None):
    grid = xr.open_dataset(grid_file)
    dim_out = tuple(np.flip(grid.grid_dims.T.values, 0))
    n_lat = np.rad2deg(grid.grid_center_lat.values.reshape(dim_out))
    n_lon = np.rad2deg(grid.grid_center_lon.values.reshape(dim_out))
    grid_imask = grid.grid_imask.values.reshape(dim_out)
    nj = xr.DataArray(np.arange(0, n_lat.shape[0], 1), dims=('nj'))  # Make indices
    ni = xr.DataArray(np.arange(0, n_lat.shape[1], 1), dims=('ni'))
    lat = xr.DataArray(n_lat, dims=('nj', 'ni'), coords={'nj': nj, 'ni': ni})
    lon = xr.DataArray(n_lon, dims=('nj', 'ni'), coords={'nj': nj, 'ni': ni})
    imask = xr.DataArray(grid_imask, dims=('nj', 'ni'), coords={'nj': nj, 'ni': ni}).astype('bool')  # int to bool
    if model == 'NSIDC':
        lat_b = cell_bounds_to_corners(gridinfo=grid, varname='grid_corner_lat')
        lon_b = cell_bounds_to_corners(gridinfo=grid, varname='grid_corner_lon')
    elif model == 'GFDL' or model == 'piomas':
        lat_b = cell_bounds_to_corners_GFDL(gridinfo=grid, varname='grid_corner_lat')
        lon_b = cell_bounds_to_corners_GFDL(gridinfo=grid, varname='grid_corner_lon')
    else:
        raise ValueError('model not found.')
    # Combine
    return xr.Dataset({'lat': lat, 'lon': lon, 'lat_b': lat_b, 'lon_b': lon_b, 'imask': imask})

def split_GFDL(ds_in, varnames=None):
    ''' Split the GFDL tripolar grid into "top" and "bottom" sub-grids that can each be regridded.'''
    # GFDL grid split parameters
    j_s = 175
    i_s = 180

    # Subset "top"
    a = ds_in[varnames].isel(nj=slice(j_s, None), ni=slice(None, i_s))
    b = ds_in[varnames].isel(nj=slice(j_s, None), ni=slice(i_s, None))
    b['nj'] = np.flip(a.nj, axis=0) + a.nj.max() - a.nj.min() + 1  # reverse in nj dim (reindexed below)
    b['ni'] = np.flip(a.ni, axis=0)  # flip in ni dim to align with a
    ds_top = xr.concat([a, b.transpose()], dim='nj')  # concat over nj dim
    if not hasattr(ds_top, 'data_vars'):  # convert to a Dataset if not one already
        ds_top = ds_top.to_dataset()
    ds_top = ds_top.reindex({'nj': np.arange(ds_top.nj.min(), ds_top.nj.max() + 1, 1)})  # reindex on nj to "flip" b in nj dim
    c = ds_in['lat_b'].isel(nj_b=slice(j_s, ds_in.nj.size), ni_b=slice(None, i_s + 1))
    d = ds_in['lat_b'].isel(nj_b=slice(j_s, ds_in.nj_b.size), ni_b=slice(i_s, None))
    d['nj_b'] = np.flip(np.arange(c.nj_b.max() + 1, c.nj_b.max() + 2 + c.nj_b.size), axis=0)
    d['ni_b'] = np.flip(c.ni_b, axis=0)
    ds_top.coords['lat_b'] = xr.concat([c, d], dim='nj_b')
    ds_top = ds_top.reindex({'nj_b': np.arange(ds_top.nj_b.min(), ds_top.nj_b.max() + 1, 1)})  # reindex on nj_b to "flip" d in nj_b dim

    # Subset "bottom", adding overlap
    j_s = j_s + 10  # add 10 poleward rows to the "bottom" sub-grid so it overlaps the "top" sub-grid
    ds_bottom = ds_in.isel(nj=slice(None, j_s)).drop(['lat_b', 'lon_b', 'nj_b', 'ni_b'])
    ds_bottom.coords['lat_b'] = ds_in['lat_b'].isel(nj_b=slice(None, j_s + 1))
    return (ds_top, ds_bottom)

def regrid_gfdl_split_domain(ds_all, da_top, da_bottom, regridder_top, regridder_bottom):
    ''' Regrid the "top" and "bottom" GFDL sub-grids separately, then merge them by latitude.'''
    # Regrid
    da_out_top = regridder_top(da_top)
    da_out_bottom = regridder_bottom(da_bottom)

    # Mask by latitude
    lat_split = ds_all.lat.isel(nj=175).min()  # latitude where the model domain was split
    lat_split_2 = ds_all.lat.isel(nj=175 + 5).max()
    da_out_top = da_out_top.where((da_out_top.lat >= lat_split).values)
    da_out_bottom = da_out_bottom.where((da_out_bottom.lat < lat_split_2).values)

    # Add dropped coords
    da_out_top['fore_time'] = ds_all.fore_time
    da_out_bottom['fore_time'] = ds_all.fore_time

    # Merge "top" and "bottom"
    da_all_out = da_out_top.combine_first(da_out_bottom)
    return da_all_out

def split_by_lat(ds, latVal=65.0, want=None):
    ''' Subset a dataset to rows above or below a given latitude, keeping consistent lat_b bounds.'''
    if want == 'above':
        ds_out = ds.drop(['lat_b', 'lon_b', 'nj_b', 'ni_b']).where(ds.lat > latVal, drop=True)
        ds_out.coords['lat_b'] = ds.lat_b.sel(nj_b=np.append(ds_out.nj.values, ds_out.nj.values[-1] + 1))
    elif want == 'below':
        ds_out = ds.drop(['lat_b', 'lon_b', 'nj_b', 'ni_b']).where(ds.lat <= latVal, drop=True)
        ds_out.coords['lat_b'] = ds.lat_b.sel(nj_b=np.append(ds_out.nj.values, ds_out.nj.values[-1] + 1))
    else:
        raise ValueError("Value for want not found. Use 'above' or 'below'.")
    return ds_out
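
def _example_split_by_lat():
    # Illustrative usage sketch: keep only grid rows north of 65N from a
    # grid-info dataset like the one returned by load_grid_info(). The grid
    # file path is a placeholder.
    obs_grid = load_grid_info(grid_file='/path/to/grid_file.nc', model='NSIDC')
    return split_by_lat(obs_grid, latVal=65.0, want='above')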

def read_piomas_scalar_monthly(f):
    xDim = 120
    yDim = 360
    yyyy = f.split('.')[1].split('H')[1]  # Get year (to build dates)
    with open(f, 'rb') as fid:
        arr = np.fromfile(fid, np.float32).reshape(-1, xDim, yDim)
    # Build dates
    time = np.arange(yyyy + '-01', str(int(yyyy) + 1) + '-01', dtype='datetime64[M]')
    return xr.DataArray(arr, dims=('time', 'nj', 'ni'), coords={'time': time})

def read_piomas_scalar_daily(f, varname=None):
    xDim = 120
    yDim = 360
    yyyy = f.split('.')[1].split('H')[1]  # Get year (to build dates)
    with open(f, 'rb') as fid:
        arr = np.fromfile(fid, np.float32).reshape(-1, xDim, yDim)
    # Build dates (slice to the number of records actually present, e.g. partial years)
    time = pd.date_range(yyyy + '-01-01', yyyy + '-12-31')
    da = xr.DataArray(arr, name=varname, dims=('time', 'nj', 'ni'), coords={'time': time[0:arr.shape[0]]})
    return da.to_dataset()  # push to a Dataset so we can add more coords later

def expand_to_sipn_dims(ds):
    ''' Force output datasets to have ensemble, init_time, and fore_time as dimensions
    (adding empty size-1 dimensions where missing).'''
    required_dims = ['ensemble', 'init_time', 'fore_time']
    for d in required_dims:
        if d not in ds.dims:
            ds = ds.expand_dims(d)
    return ds
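
def _example_expand_to_sipn_dims():
    # Illustrative usage sketch: a bare lat/lon field gains size-1 ensemble,
    # init_time, and fore_time dimensions. The variable name is made up.
    ds = xr.Dataset({'sic': (('y', 'x'), np.zeros((2, 2)))})
    ds = expand_to_sipn_dims(ds)
    print(ds.sic.dims)  # now includes ensemble, init_time, and fore_time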

def parse_NSIDC_date(str1):
    ''' Parse the yyyymmdd date out of an NSIDC binary file name.'''
    date1 = str1.split('_')[1]
    yyyy = int(date1[0:4])
    mm = int(date1[4:6])
    dd = int(date1[6:8])
    return datetime.datetime(yyyy, mm, dd)
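
def _example_parse_NSIDC_date():
    # Illustrative usage sketch with an NSIDC 0081-style file name.
    print(parse_NSIDC_date('nt_20180115_f18_nrt_n.bin'))  # 2018-01-15 00:00:00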

def read_NSIDC_binary(cfile, x, y, product=None):
    n_rows = 448
    n_cols = 304
    with open(cfile, 'rb') as fr:
        if product == 'NSIDC_0051' or product == 'NSIDC_0081':
            # http://nsidc.org/data/nsidc-0051
            # http://nsidc.org/data/nsidc-0081
            hdr = fr.read(300)  # skip the 300-byte header
            ice = np.fromfile(fr, dtype=np.uint8)
            ice = ice.reshape(n_rows, n_cols)
            ice_max = 250
            hole_mask = 251
            coast = 253
            land = 254
            missing = 255
        elif product == 'NSIDC_0079':
            ice = np.fromfile(fr, dtype=np.uint16)
            ice = ice.reshape(n_rows, n_cols)
            ice_max = 1000.
            hole_mask = 1100
            coast = 9999
            land = 1200
            missing = 9999
        else:
            raise ValueError('product name not found')
    # Make xarray DataArray
    da_all = xr.DataArray(ice, coords={'x': x, 'y': y}, dims=('y', 'x'))
    # Scale to (0-1) and mask out non-sic flag values
    ds = (da_all / ice_max)
    ds.name = 'sic'
    ds = ds.where(ds <= 1).to_dataset()
    # Add date
    ds.coords['time'] = parse_NSIDC_date(os.path.basename(cfile))
    ds = ds.expand_dims('time')  # expand_dims returns a new object; the result must be assigned
    # Add other masks
    ds.coords['hole_mask'] = da_all == hole_mask
    # ds.coords['coast'] = da_all == coast  # Commented out: makes files too slow to load, and not used.
    # ds.coords['land'] = da_all == land
    # ds.coords['missing'] = da_all == missing
    return ds

def load_1_NSIDC(filein=None, product=None):
    # Define coords (index values)
    x = np.arange(0, 304, 1)
    y = np.arange(0, 448, 1)
    ds_sic = read_NSIDC_binary(filein, x, y, product)
    return ds_sic

def load_NSIDC(all_files=None, product=None):
    # Define coords (index values)
    x = np.arange(0, 304, 1)
    y = np.arange(0, 448, 1)
    # Loop through each binary file and read it into a DataArray
    da_l = []
    for cf in all_files:
        da_l.append(read_NSIDC_binary(cf, x, y, product))
    ds_sic = xr.concat(da_l, dim='time', coords='different')
    return ds_sic
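
def _example_load_NSIDC():
    # Illustrative usage sketch: glob a directory of daily NSIDC binaries
    # and stack them along time. The path and pattern are placeholders.
    files = sorted(glob.glob('/path/to/nsidc_0081/nt_*_n.bin'))
    return load_NSIDC(all_files=files, product='NSIDC_0081')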

def load_1_iceBridgeQL(filein=None, start_pt=0):
    ''' Load IceBridge quick look data into a Dataset.
    Data files from:
    https://n5eil01u.ecs.nsidc.org/ICEBRIDGE_FTP/Evaluation_Products/IceBridge_Sea_Ice_Freeboard_SnowDepth_and_Thickness_QuickLook/
    We use "point" as the record dim, assuming each point is a unique (lat, lon, time) triple,
    which allows the nc files to be concatenated.

    Parameters:
    filein - string
        Full file path
    start_pt - int
        Index of the last point already used (the next point starts at start_pt + 1)
    '''
    # Load into a dataframe
    df = pd.read_csv(filein, na_values=[-99999.0, '*****'])
    # Name index
    df.index.name = 'point'
    # Adjust points by current start_pt
    df.index = df.index + start_pt + 1
    # Select variables of interest
    df = df[['thickness', 'thickness_unc', 'lat', 'lon', 'snow_depth', 'snow_depth_unc', 'date', 'mean_fb', 'fb_unc']]
    # To dataset
    ds = df.to_xarray()
    # Rename to SIPN2 names
    ds = ds.rename({'thickness': 'hi', 'thickness_unc': 'hi_unc', 'snow_depth': 'sd', 'snow_depth_unc': 'sd_unc'})
    # Convert integer dates to datetime64
    # First try to use the date column
    try:
        ds['date'] = xr.DataArray([datetime.datetime.strptime(str(x), "%Y%m%d") for x in ds.date.values],
                                  dims='point', coords={'point': ds.point})
    except ValueError:
        # Fallback: get the date of the flight from the file name instead
        cdate = datetime.datetime.strptime(os.path.basename(filein).split('_')[1].split('.')[0], "%Y%m%d")
        cdate_l = [cdate for n in ds.point.values]  # repeat for each point (assumed same flight date)
        ds['date'] = xr.DataArray(cdate_l, dims='point', coords={'point': ds.point})
    return ds
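
def _example_load_iceBridgeQL():
    # Illustrative usage sketch: concatenate several quick look files along
    # the 'point' dimension, carrying start_pt forward so point indices stay
    # unique. The path and pattern are placeholders.
    ds_l = []
    start_pt = 0
    for f in sorted(glob.glob('/path/to/icebridge/*.txt')):
        ds = load_1_iceBridgeQL(filein=f, start_pt=start_pt)
        start_pt = int(ds.point.values[-1])
        ds_l.append(ds)
    return xr.concat(ds_l, dim='point')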

def _load_MME_by_init_end(E=None, runType=None, variable=None, metric=None, init_range=None):
    ''' Load and concatenate netcdf files of weekly averaged forecasts from multiple models.
    ----------
    Parameters:
    E : Esio data object
        Contains path and metadata of all models
    runType : String
        Type of forecast (used for paths)
    metric : String
        Name of metric (i.e. mean, anomaly, SIP) to select (used for paths)
    init_range : list
        List of two np.datetime64 dates, start and stop of the init period to load
    Returns:
    ds_m : Dask Dataset
        Contains all model forecasts by initialization and forecast time
    '''
    if init_range:
        assert init_range[1] >= init_range[0], "init_range end must be greater than or equal to start"

    # Fixed parameters
    concat_dim_time = 'fore_time'
    drop_coords = ['init_start', 'valid_start', 'valid_end']

    # Paths
    metric_dir = os.path.join(E.model['MME_NEW'][runType]['sipn_nc'], variable, metric)

    # Get list of inits (from directory names)
    init_dirs = sorted([name for name in os.listdir(metric_dir) if os.path.isdir(os.path.join(metric_dir, name))])
    print("    Found", len(init_dirs), "initialization periods.")

    # Subset by requested init_range (optional)
    if init_range:
        init_dates = [x for x in init_dirs if ((np.datetime64(x) >= init_range[0]) & (np.datetime64(x) <= init_range[1]))]
    else:
        init_dates = init_dirs  # Use all found

    ds_init_l = []
    for c_init in init_dates:
        c_init_path = os.path.join(metric_dir, c_init)
        # Get list of models (dirs)
        mod_dirs = sorted([name for name in os.listdir(c_init_path) if os.path.isdir(os.path.join(c_init_path, name))])
        ds_mod_l = []
        for c_mod in mod_dirs:
            # Open files
            allfiles = sorted(glob.glob(os.path.join(metric_dir, c_init, c_mod, '*.nc')))
            if not allfiles:
                continue  # Skip this model
            # Drop these coords here because otherwise the concat below fails
            ds_i = xr.open_mfdataset(allfiles, drop_variables=['xm', 'ym', 'time', 'ensemble'],
                                     concat_dim=concat_dim_time, autoclose=True,
                                     parallel=True)
            ds_mod_l.append(ds_i)
        if len(ds_mod_l) > 0 and mod_dirs != ['Observed']:  # if not empty and not only Observed (we found at least one model)
            ds_all_mods = xr.concat(ds_mod_l, dim='model')
            ds_init_l.append(ds_all_mods)

    # Drop extra coords because of this issue: https://github.com/pydata/xarray/pull/1953
    # This was fixed (?) in Aug 2018; TODO: check it is still needed
    ds_init_l = [x.drop(drop_coords) for x in ds_init_l]

    if ds_init_l:
        ds_m = xr.concat(ds_init_l, dim='init_end')
    else:
        raise ValueError('No init times were found.')

    # Sometimes lat and lon have init_end as a dim (round-off differences in lat between files).
    # If so, set them to be constant except along x and y, using the first file's values.
    if 'init_end' in ds_m.lat.dims:
        ds_m.coords['lat'] = ds_m.sel(model='Observed').isel(init_end=0, fore_time=0).lat.drop([concat_dim_time, 'init_end', 'model'])
        ds_m.coords['lon'] = ds_m.sel(model='Observed').isel(init_end=0, fore_time=0).lon.drop([concat_dim_time, 'init_end', 'model'])

    # Return dask Dataset
    return ds_m

def load_MME_by_init_end(E=None, runType=None, variable=None, metrics=None, init_range=None):
    ''' Load and concatenate netcdf files of weekly averaged forecasts from multiple models.
    ----------
    Parameters:
    E : Esio data object
        Contains path and metadata of all models
    runType : String
        Type of forecast (used for paths)
    variable : String
        Name of variable (i.e. sic) to select (used for paths)
    metrics : List
        List of metrics to load
    init_range : List of two datetime64
        Start and stop of init times (inclusive) to load
    Returns:
    ds_m : Dask Dataset
        Contains all model forecasts by initialization and forecast time,
        for all metrics of a given variable
    '''
    ds_l = []
    for cmetric in metrics:
        print("    Loading", cmetric, "...")
        ds_m = _load_MME_by_init_end(E=E, runType=runType, variable=variable, metric=cmetric, init_range=init_range)
        # Somehow different metrics have lat/lon differences of order 1e-6, so only keep the
        # values from the first metric to allow the metrics to be merged. TODO: find out why.
        if cmetric != metrics[0]:
            ds_m = ds_m.drop(['lat', 'lon'])
        ds_l.append(ds_m)
    ds_out = xr.merge(ds_l)
    return ds_out
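
def _example_load_MME_by_init_end(E):
    # Illustrative usage sketch; `E` is the Esio data object described in the
    # docstring above, and the runType, metrics, and dates are placeholders.
    init_range = [np.datetime64('2018-06-01'), np.datetime64('2018-09-30')]
    return load_MME_by_init_end(E=E, runType='forecast', variable='sic',
                                metrics=['mean', 'SIP'], init_range=init_range)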