Annual statistic on yearly chunks to keep memory low #792

Closed
wants to merge 9 commits
esmvalcore/preprocessor/_time.py (23 changes: 22 additions & 1 deletion)

@@ -447,7 +447,28 @@ def annual_statistics(cube, operator='mean'):

    if not cube.coords('year'):
        iris.coord_categorisation.add_year(cube, 'time')
-    return cube.aggregated_by('year', operator)
+    years = sorted(list(set(cube.coord("year").points)))
@jvegreg (Contributor) commented on Oct 1, 2020:

Why are you not using something like this?

cube = CubeList([year.aggregated_by('year', operator) for year in cube.slices_over('year')]).merge_cube()

Is there a performance reason?
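
For readers skimming the thread, a fuller, self-contained version of this "aggregate each year separately, then combine" idea might look like the sketch below. It is not code from the PR or from this comment: the function name, the iris.Constraint extraction and concatenate_cube are illustrative choices made here, and its memory behaviour has not been profiled.

import iris
import iris.analysis
import iris.coord_categorisation
from iris.cube import CubeList


def annual_statistics_by_merge(cube, operator=iris.analysis.MEAN):
    # Sketch only: aggregate one year at a time, then join the yearly results.
    if not cube.coords('year'):
        iris.coord_categorisation.add_year(cube, 'time')
    years = sorted(set(cube.coord('year').points))
    yearly = CubeList(
        cube.extract(iris.Constraint(year=int(year))).aggregated_by('year', operator)
        for year in years)
    return yearly.concatenate_cube()
    # e.g. annual = annual_statistics_by_merge(cube, iris.analysis.SUM)

The exchange that follows is about exactly this trade-off: compactness versus having to redo the memory profiling.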

The PR author (Contributor) replied:

Not sure. I'd rather not, though: that looks a bit clunky to me, and I don't want to repeat all the memory profiling tests for a rewrite of the same functionality 😁

@jvegreg (Contributor) replied:

I was just curious, since you wrote a lot of code for something that could be a one-liner.

+    no_years = len(years)
+    annual_mean_cube = cube[0:no_years, ...]
+    time_cells = [
+        datetime.datetime(year, 7, 1, 0, 0, 0) for year in years
+    ]
+    annual_mean_cube.coord("time").points = [
+        cube.coord('time').units.date2num(cl)
+        for cl in time_cells]
+    annual_mean_cube.coord('time').bounds = None
+    annual_mean_cube.coord('time').guess_bounds()
+    annual_mean_cube.remove_coord("year")
+    iris.coord_categorisation.add_year(annual_mean_cube, 'time')
+    for idx, year in enumerate(range(no_years)):
+        yearly_cube = cube[12 * year:12 * (year + 1)]
+        annual_mean = yearly_cube.aggregated_by('year', operator)
+        annual_mean_cube.data[idx] = annual_mean.core_data()
+        if da.ma.getmaskarray(annual_mean.core_data()).any():
+            annual_mean_cube.data.mask[idx] = \
+                da.ma.getmaskarray(annual_mean.core_data())
+
+    return annual_mean_cube


def decadal_statistics(cube, operator='mean'):
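
A minimal usage sketch of the chunked implementation above (not part of the PR): the synthetic cube below is an assumption chosen to mirror the unit tests, i.e. two years of monthly ones on a 360-day calendar.

import numpy as np
import iris
from iris.coords import DimCoord
from iris.cube import Cube
from cf_units import Unit

from esmvalcore.preprocessor._time import annual_statistics

# Two years of monthly values, all ones, on a 360-day calendar.
time = DimCoord(np.arange(15., 720., 30.), standard_name='time',
                units=Unit('days since 1950-01-01', calendar='360_day'))
cube = Cube(np.ones(24), dim_coords_and_dims=[(time, 0)])

result = annual_statistics(cube, 'sum')
print(result.data)                   # [12. 12.]
print(result.coord('time').points)   # [180. 540.], i.e. 1 July of each year

The loop in the new code touches one year (twelve time steps) at a time, which is the behaviour the PR title refers to.
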
tests/unit/preprocessor/_time/test_time.py (16 changes: 16 additions & 0 deletions)

@@ -1158,6 +1158,10 @@ def test_annual_average(existing_coord):
    if existing_coord:
        iris.coord_categorisation.add_year(cube, 'time')

+    cube.data = np.ma.array(cube.data,
+                            mask=np.zeros(cube.data.shape, dtype=bool))
+    cube.data.mask[0] = True
+    cube.data.mask[3] = True
    result = annual_statistics(cube)
    expected = np.array([1., 1.])
    assert_array_equal(result.data, expected)
@@ -1172,12 +1176,24 @@ def test_annual_sum(existing_coord):
    if existing_coord:
        iris.coord_categorisation.add_year(cube, 'time')

+    # no mask
    result = annual_statistics(cube, 'sum')
    expected = np.array([12., 12.])
    assert_array_equal(result.data, expected)
    expected_time = np.array([180., 540.])
    assert_array_equal(result.coord('time').points, expected_time)
+
+    # add mask
+    cube.data = np.ma.array(cube.data,
+                            mask=np.zeros(cube.data.shape, dtype=bool))
+    cube.data.mask[0] = True
+    cube.data.mask[3] = True
+    result = annual_statistics(cube, 'sum')
+    expected = np.array([10., 12.])
+    assert_array_equal(result.data, expected)
+    expected_time = np.array([180., 540.])
+    assert_array_equal(result.coord('time').points, expected_time)


@pytest.mark.parametrize('existing_coord', [True, False])
def test_decadal_average(existing_coord):
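
In the masked cases above, both masked points (indices 0 and 3) fall in the first year of monthly data, so the annual mean of the remaining ones stays 1.0 while the annual sum drops from 12 to 10; the second year is untouched, which is what the expected arrays [1., 1.] and [10., 12.] encode.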